Merge branch 'master' into vulkan

This commit is contained in:
Triang3l 2022-06-22 13:15:50 +03:00
commit 0d8bd0e0c6
14 changed files with 213 additions and 185 deletions

View File

@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// Get dynamic rasterizer state. // Get dynamic rasterizer state.
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
render_target_cache_->depth_float24_conversion();
draw_util::ViewportInfo viewport_info; draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo( draw_util::GetHostViewportInfo(
regs, draw_resolution_scale_x, draw_resolution_scale_y, true, regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
normalized_depth_control, normalized_depth_control,
host_render_targets_used && host_render_targets_used &&
(depth_float24_conversion == render_target_cache_->depth_float24_convert_in_pixel_shader(),
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
viewport_info); viewport_info);
draw_util::Scissor scissor; draw_util::Scissor scissor;

View File

@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() {
gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb; gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;
depth_float24_conversion_ = GetConfigDepthFloat24Conversion(); depth_float24_round_ = cvars::depth_float24_round;
depth_float24_convert_in_pixel_shader_ =
cvars::depth_float24_convert_in_pixel_shader;
// Check if 2x MSAA is supported or needs to be emulated with 4x MSAA // Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
// instead. // instead.
@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() {
// Blending is done in linear space directly in shaders. // Blending is done in linear space directly in shaders.
gamma_render_target_as_srgb_ = false; gamma_render_target_as_srgb_ = false;
// Always true float24 depth. // Always true float24 depth rounded to the nearest even.
depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding; depth_float24_round_ = true;
depth_float24_convert_in_pixel_shader_ = true;
// Only ForcedSampleCount, which doesn't support 2x. // Only ForcedSampleCount, which doesn't support 2x.
msaa_2x_supported_ = false; msaa_2x_supported_ = false;
@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget(
bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent( bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const { xenos::DepthRenderTargetFormat format) const {
if (format == xenos::DepthRenderTargetFormat::kD24FS8) { if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy; return !depth_float24_convert_in_pixel_shader_;
} }
return false; return false;
} }
@ -3542,8 +3545,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
} break; } break;
case xenos::DepthRenderTargetFormat::kD24FS8: { case xenos::DepthRenderTargetFormat::kD24FS8: {
// Convert using r1.y as temporary. // Convert using r1.y as temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1, // When converting the depth in pixel shaders, it's always exact,
true); // truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, i, 3, i, 3, 1, 1,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break; } break;
} }
// Merge depth and stencil into r0/r1.x. // Merge depth and stencil into r0/r1.x.
@ -3729,8 +3737,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
} break; } break;
case xenos::DepthRenderTargetFormat::kD24FS8: { case xenos::DepthRenderTargetFormat::kD24FS8: {
// Convert using r1.y as temporary. // Convert using r1.y as temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1, // When converting the depth in pixel shaders, it's always exact,
true); // truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 1, 3, 1, 3, 1, 1,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break; } break;
} }
if (dest_is_color) { if (dest_is_color) {
@ -4105,8 +4118,14 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
dxbc::Src::R(0, dxbc::Src::kYYYY)); dxbc::Src::R(0, dxbc::Src::kYYYY));
} break; } break;
case xenos::DepthRenderTargetFormat::kD24FS8: { case xenos::DepthRenderTargetFormat::kD24FS8: {
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2, // When converting the depth in pixel shaders, it's always
true); // exact, truncating not to insert additional rounding
// instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 0, 1, 0, 0, 0, 2,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break; } break;
} }
a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY), a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
@ -6167,7 +6186,12 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
case xenos::DepthRenderTargetFormat::kD24FS8: case xenos::DepthRenderTargetFormat::kD24FS8:
// Convert to [0, 2) float24 from [0, 1) float32, using r0.x as // Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
// temporary. // temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true); // When converting the depth in pixel shaders, it's always exact,
// truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 1, 0, 1, 0, 0, 0,
!depth_float24_convert_in_pixel_shader() && depth_float24_round(),
true);
break; break;
} }
// Combine 24-bit depth and stencil into r1.x. // Combine 24-bit depth and stencil into r1.x.

View File

@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
!cvars::snorm16_render_target_full_range; !cvars::snorm16_render_target_full_range;
} }
DepthFloat24Conversion depth_float24_conversion() const { bool depth_float24_round() const { return depth_float24_round_; }
return depth_float24_conversion_; bool depth_float24_convert_in_pixel_shader() const {
return depth_float24_convert_in_pixel_shader_;
} }
DXGI_FORMAT GetColorResourceDXGIFormat( DXGI_FORMAT GetColorResourceDXGIFormat(
@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
bool gamma_render_target_as_srgb_ = false; bool gamma_render_target_as_srgb_ = false;
DepthFloat24Conversion depth_float24_conversion_ = bool depth_float24_round_ = false;
DepthFloat24Conversion::kOnCopy; bool depth_float24_convert_in_pixel_shader_ = false;
bool msaa_2x_supported_ = false; bool msaa_2x_supported_ = false;

View File

@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification(
RenderTargetCache::Path::kHostRenderTargets) { RenderTargetCache::Path::kHostRenderTargets) {
using DepthStencilMode = using DepthStencilMode =
DxbcShaderTranslator::Modification::DepthStencilMode; DxbcShaderTranslator::Modification::DepthStencilMode;
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion = if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
render_target_cache_.depth_float24_conversion();
if ((depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
normalized_depth_control.z_enable && normalized_depth_control.z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format == regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
xenos::DepthRenderTargetFormat::kD24FS8) { xenos::DepthRenderTargetFormat::kD24FS8) {
modification.pixel.depth_stencil_mode = modification.pixel.depth_stencil_mode =
depth_float24_conversion == render_target_cache_.depth_float24_round()
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ? DepthStencilMode::kFloat24Rounding
? DepthStencilMode::kFloat24Truncating : DepthStencilMode::kFloat24Truncating;
: DepthStencilMode::kFloat24Rounding;
} else { } else {
if (shader.implicit_early_z_write_allowed() && if (shader.implicit_early_z_write_allowed() &&
(!shader.writes_color_target(0) || (!shader.writes_color_target(0) ||
@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size(); state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
} else { } else {
if ((description.depth_func != xenos::CompareFunction::kAlways || if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
(description.depth_func != xenos::CompareFunction::kAlways ||
description.depth_write) && description.depth_write) &&
description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
switch (render_target_cache_.depth_float24_conversion()) { if (render_target_cache_.depth_float24_round()) {
case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating: state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps; state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps); } else {
break; state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding: state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
break;
default:
break;
} }
} }
} }

View File

@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs,
// interpolated Z instead if conversion can't be done exactly, without // interpolated Z instead if conversion can't be done exactly, without
// modifying clipping bounds by adjusting Z in vertex shaders, as that // modifying clipping bounds by adjusting Z in vertex shaders, as that
// may cause polygons placed explicitly at Z = 0 or Z = W to be clipped. // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min)); // Rounding the bounds to the nearest even regardless of the depth
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max)); // rounding mode not to add even more error by truncating twice.
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
} }
if (full_float24_in_0_to_1) { if (full_float24_in_0_to_1) {
// Remap the full [0...2) float24 range to [0...1) support data round-trip // Remap the full [0...2) float24 range to [0...1) support data round-trip

View File

@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t temp2_temp_component); uint32_t temp2_temp_component);
// Converts the depth value externally clamped to the representable [0, 2) // Converts the depth value externally clamped to the representable [0, 2)
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the // range to 20e4 floating point, with zeros in bits 24:31, rounding to the
// nearest even. Source and destination may be the same, temporary must be // nearest even or towards zero. Source and destination may be the same,
// different than both. If remap_from_0_to_0_5 is true, it's assumed that // temporary must be different than both. If remap_from_0_to_0_5 is true, it's
// 0...1 is pre-remapped to 0...0.5 in the input. // assumed that 0...1 is pre-remapped to 0...0.5 in the input.
static void PreClampedDepthTo20e4( static void PreClampedDepthTo20e4(
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component, dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp, uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
uint32_t temp_temp_component, bool remap_from_0_to_0_5); uint32_t temp_temp_component, bool round_to_nearest_even,
bool remap_from_0_to_0_5);
// Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
// float. Two temporaries must be different, but one can be the same as the // float. Two temporaries must be different, but one can be the same as the
// source. The destination may be anything writable. If remap_to_0_to_0_5 is // source. The destination may be anything writable. If remap_to_0_to_0_5 is

View File

@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
} else { } else {
// Properly convert to 20e4, with rounding to the nearest even (the bias was // Properly convert to 20e4, with rounding to the nearest even (the bias was
// pre-applied by multiplying by 2), then convert back restoring the bias. // pre-applied by multiplying by 2), then convert back restoring the bias.
PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false); PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false);
Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true); Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
} }
@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32(
void DxbcShaderTranslator::PreClampedDepthTo20e4( void DxbcShaderTranslator::PreClampedDepthTo20e4(
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component, dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp, uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
uint32_t temp_temp_component, bool remap_from_0_to_0_5) { uint32_t temp_temp_component, bool round_to_nearest_even,
bool remap_from_0_to_0_5) {
assert_true(temp_temp != f24_temp || assert_true(temp_temp != f24_temp ||
temp_temp_component != f24_temp_component); temp_temp_component != f24_temp_component);
assert_true(temp_temp != f32_temp || assert_true(temp_temp != f32_temp ||
@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4(
// Close the denormal check. // Close the denormal check.
a.OpEndIf(); a.OpEndIf();
// Build the 20e4 number. // Build the 20e4 number.
// temp = (biased_f32 >> 3) & 1 if (round_to_nearest_even) {
a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src); // temp = (biased_f32 >> 3) & 1
// f24 = biased_f32 + 3 a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3)); // f24 = biased_f32 + 3
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
a.OpIAdd(f24_dest, f24_src, temp_src); // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
a.OpIAdd(f24_dest, f24_src, temp_src);
}
// For rounding to the nearest even:
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
// For rounding towards zero:
// f24 = (biased_f32 >> 3) & 0xFFFFFF
a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src); a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
} }
@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
// 20e4 conversion. // 20e4 conversion.
PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp, PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
d32_temp_component, temp_temp, temp_temp_component, d32_temp_component, temp_temp, temp_temp_component,
false); true, false);
} }
a_.OpElse(); a_.OpElse();
{ {

View File

@ -39,42 +39,109 @@ DEFINE_bool(
"reduce bandwidth usage during transfers as the previous depth won't need " "reduce bandwidth usage during transfers as the previous depth won't need "
"to be read.", "to be read.",
"GPU"); "GPU");
// The round trip is done, in particular, in 545407F2. // Lossless round trip: 545407F2.
DEFINE_string( // Lossy round trip with the "greater or equal" test afterwards: 4D530919.
depth_float24_conversion, "", // Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF.
"Method for converting 32-bit Z values to 20e4 floating point when using " DEFINE_bool(
"host depth buffers without native 20e4 support (when not using rasterizer-" depth_float24_round, false,
"ordered views / fragment shader interlocks to perform depth testing " "Whether to round to the nearest even, rather than truncating (rounding "
"manually).\n" "towards zero), the depth when converting it to 24-bit floating-point "
"Use: [any, on_copy, truncate, round]\n" "(20e4) from the host precision (32-bit floating point) when using a host "
" on_copy:\n" "depth buffer.\n"
" Do depth testing at host precision, converting when copying between " "false:\n"
"color and depth buffers (or between depth buffers of different formats) " " Recommended.\n"
"to support reinterpretation, but keeps the last host depth buffer used " " The conversion may move the depth values farther away from the camera.\n"
"for each EDRAM range and reloads the host precision value if it's still " " Without depth_float24_convert_in_pixel_shader:\n"
"up to date after the EDRAM range was used with a different pixel format.\n" " The \"greater or equal\" depth test function continues to work fine if "
" + Highest performance, allows early depth test and writing.\n" "the full host precision depth data is lost, it's still possible to draw "
" + Host MSAA is possible with pixel-rate shading where supported.\n" "another pass of the same geometry with it.\n"
" - EDRAM > RAM > EDRAM depth buffer round trip done in certain games " " (See the description of depth_float24_convert_in_pixel_shader for more "
"destroys precision irreparably, causing artifacts if another rendering " "information about full precision depth data loss.)\n"
"pass is done after the EDRAM reupload.\n" " With depth_float24_convert_in_pixel_shader:\n"
" truncate:\n" " Faster - the pixel shader for hidden surfaces may still be skipped "
" Convert to 20e4 directly in pixel shaders, always rounding down.\n" "(using conservative depth output).\n"
" + Average performance, conservative early depth test is possible.\n" "true:\n"
" + No precision loss when anything changes in the storage of the depth " " Only for special cases of issues caused by minor 32-bit floating-point "
"buffer, EDRAM > RAM > EDRAM copying preserves precision.\n" "rounding errors, for instance, when the game tries to draw something at "
" - Rounding mode is incorrect, sometimes giving results smaller than " "the camera plane by setting Z of the vertex position to W.\n"
"they should be - may cause inaccuracy especially in edge cases when the " " The conversion may move the depth values closer or farther.\n"
"game wants to write an exact value.\n" " Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" " Without depth_float24_convert_in_pixel_shader:\n"
" round:\n" " Not possible to recover from a full host precision depth data loss - in "
" Convert to 20e4 directly in pixel shaders, correctly rounding to the " "subsequent passes of rendering the same geometry, half of the samples "
"nearest even.\n" "will be failing the depth test with the \"greater or equal\" depth test "
" + Highest accuracy.\n" "function.\n"
" - Significantly limited performance, early depth test is not possible.\n" " With depth_float24_convert_in_pixel_shader:\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" " Slower - depth rejection before the pixel shader is not possible.\n"
" Any other value:\n" "When the depth buffer is emulated in software (via the fragment shader "
" Choose what is considered the most optimal (currently \"on_copy\").", "interlock / rasterizer-ordered view), this is ignored, and rounding to "
"the nearest even is always done.",
"GPU");
// With MSAA, when converting the depth in pixel shaders, they must run at
// sample frequency - otherwise, if the depth is the same for the entire pixel,
// intersections of polygons cannot be antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// rounding to the nearest even regardless of whether truncation was requested
// for the values, to reduce the error already caused by truncation rather than
// to amplify it). This ensures that clamping to the viewport bounds, which
// happens after the pixel shader even if it overwrites the resulting depth, is
// never done to a value not representable as float24 (for example, if the
// minimum Z is a number too small to be represented as float24, but not zero,
// it won't be possible to write what should become 0x000000 to the depth
// buffer). Note that this may add some error to the depth values from the
// rasterizer; however, modifying Z in the vertex shader to make interpolated
// depth values would cause clipping to be done to different bounds, which may
// be more undesirable, especially in cases when Z is explicitly set to a value
// like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
// space and disappear).
//
// If false, the depth test is done at the host precision, and the depth is
// converted to 20e4 to support reinterpretation, while both the last color (or
// non-20e4 depth) value (let's call it stored_f24) and the last host depth
// value (stored_host) are tracked for each EDRAM pixel. The last host depth
// value is reloaded if stored_f24 == to_f24(stored_host); otherwise, it was
// overwritten by something else, like clearing, or an actually used color
// buffer. This check is inexact, though: it will incorrectly load pixels that
// were overwritten by something else in the EDRAM, but turned out to have the
// same value on the guest as before - an outdated host-precision value will be
// loaded in these cases instead.
DEFINE_bool(
depth_float24_convert_in_pixel_shader, false,
"Whether to convert the depth values to 24-bit floating-point (20e4) from "
"the host precision (32-bit floating point) directly in the pixel shaders "
"of guest draws when using a host depth buffer.\n"
"This prevents visual artifacts (interleaved stripes of parts of surfaces "
"rendered and parts not rendered, having either the same width in case of "
"the \"greater or equal\" depth test function, or the former being much "
"thinner than the latter with the \"equal\" function) if the full host "
"precision depth data is lost.\n"
"This issue may happen if the game reloads the depth data previously "
"evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region "
"that previously contained that depth buffer was overwritten by another "
"depth buffer, or the game loads it to a different location in the EDRAM "
"than it was previously placed at, thus Xenia is unable to restore the "
"depth data with the original precision, and instead falls back to "
"converting the lower-precision values, so in subsequent rendering passes "
"for the same geometry, the actual depth values of the surfaces don't "
"match those stored in the depth buffer anymore.\n"
"This is a costly option because it makes the GPU unable to use depth "
"buffer compression, and also with MSAA, forces the pixel shader to run "
"for every subpixel sample rather than for the entire pixel, making pixel "
"shading 2 or 4 times heavier depending on the MSAA sample count.\n"
"The rounding direction is controlled by the depth_float24_round "
"configuration variable.\n"
"Note that with depth_float24_round = true, this becomes even more costly "
"because pixel shaders must be executed regardless of whether the surface "
"is behind the previously drawn surfaces. With depth_float24_round = "
"false, conservative depth output is used, however, so depth rejection "
"before the pixel shader may still work.\n"
"If sample-rate shading is not supported by the host GPU, the conversion "
"in the pixel shader is done only when MSAA is not used.\n"
"When the depth buffer is emulated in software (via the fragment shader "
"interlock / rasterizer-ordered view), this is ignored because 24-bit "
"depth is always used directly.",
"GPU"); "GPU");
DEFINE_bool( DEFINE_bool(
draw_resolution_scaled_texture_offsets, true, draw_resolution_scaled_texture_offsets, true,
@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets(
return rts_used; return rts_used;
} }
// Translates the depth_float24_conversion configuration string into the
// conversion mode: "truncate" and "round" select the respective in-shader
// conversion modes, and any other value (including the empty default) selects
// kOnCopy.
RenderTargetCache::DepthFloat24Conversion
RenderTargetCache::GetConfigDepthFloat24Conversion() {
  const auto& requested_mode = cvars::depth_float24_conversion;
  return requested_mode == "truncate"
             ? DepthFloat24Conversion::kOnOutputTruncating
         : requested_mode == "round"
             ? DepthFloat24Conversion::kOnOutputRounding
             : DepthFloat24Conversion::kOnCopy;
}
uint32_t RenderTargetCache::GetRenderTargetHeight( uint32_t RenderTargetCache::GetRenderTargetHeight(
uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const { uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
if (!pitch_tiles_at_32bpp) { if (!pitch_tiles_at_32bpp) {

View File

@ -29,6 +29,8 @@
#include "xenia/gpu/xenos.h" #include "xenia/gpu/xenos.h"
DECLARE_bool(depth_transfer_not_equal_test); DECLARE_bool(depth_transfer_not_equal_test);
DECLARE_bool(depth_float24_round);
DECLARE_bool(depth_float24_convert_in_pixel_shader);
DECLARE_bool(draw_resolution_scaled_texture_offsets); DECLARE_bool(draw_resolution_scaled_texture_offsets);
DECLARE_bool(gamma_render_target_as_srgb); DECLARE_bool(gamma_render_target_as_srgb);
DECLARE_bool(native_2x_msaa); DECLARE_bool(native_2x_msaa);
@ -89,60 +91,6 @@ class RenderTargetCache {
kPixelShaderInterlock, kPixelShaderInterlock,
}; };
// Method of converting 32-bit host depth values to the guest 20e4 (float24)
// format when a host depth buffer without native 20e4 support is used.
enum class DepthFloat24Conversion {
// Doing depth test at the host precision, converting to 20e4 to support
// reinterpretation, but keeping track of both the last color (or non-20e4
// depth) value (let's call it stored_f24) and the last host depth value
// (stored_host) for each EDRAM pixel, reloading the last host depth value
// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
// something else, like clearing, or an actually used color buffer; this is
// inexact though, and will incorrectly load pixels that were overwritten by
// something else in the EDRAM, but turned out to have the same value on the
// guest as before - an outdated host-precision value will be loaded in
// these cases instead).
//
// EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM
// round trip destroys precision beyond repair.
//
// Full host early Z and MSAA with pixel-rate shading are supported.
kOnCopy,
// Converting the depth to the closest host value representable exactly as a
// 20e4 float in pixel shaders, to support invariance in cases when the
// guest reuploads a previously resolved depth buffer to the EDRAM, rounding
// towards zero (which contradicts the rounding used by the Direct3D 9
// reference rasterizer, but allows less-than-or-equal pixel shader depth
// output to be used to preserve most of early Z culling when the game is
// using reversed depth, which is the usual way of doing depth testing on
// the Xbox 360 and of utilizing the advantages of a floating-point
// encoding).
//
// With MSAA, pixel shaders must run at sample frequency - otherwise, if the
// depth is the same for the entire pixel, intersections of polygons cannot
// be antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// correct rounding to the nearest even, to reduce the error already caused
// by truncation rather than to amplify it). This ensures that clamping to
// the viewport bounds, which happens after the pixel shader even if it
// overwrites the resulting depth, is never done to a value not
// representable as float24 (for example, if the minimum Z is a number too
// small to be represented as float24, but not zero, it won't be possible to
// write what should become 0x000000 to the depth buffer). Note that this
// may add some error to the depth values from the rasterizer; however,
// modifying Z in the vertex shader to make interpolated depth values would
// cause clipping to be done to different bounds, which may be more
// undesirable, especially in cases when Z is explicitly set to a value like
// 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
// space and disappear).
kOnOutputTruncating,
// Similar to kOnOutputTruncating, but rounding to the nearest even, which is
// more correct. However, because the resulting depth can be bigger than the
// original host value, early depth testing can't be used at all. The same
// viewport usage rules apply.
kOnOutputRounding,
};
// Useful host-specific values. // Useful host-specific values.
// sRGB conversion from the Direct3D 11.3 functional specification. // sRGB conversion from the Direct3D 11.3 functional specification.
static constexpr float kSrgbToLinearDenominator1 = 12.92f; static constexpr float kSrgbToLinearDenominator1 = 12.92f;
@ -512,8 +460,6 @@ class RenderTargetCache {
} }
}; };
static DepthFloat24Conversion GetConfigDepthFloat24Conversion();
virtual uint32_t GetMaxRenderTargetWidth() const = 0; virtual uint32_t GetMaxRenderTargetWidth() const = 0;
virtual uint32_t GetMaxRenderTargetHeight() const = 0; virtual uint32_t GetMaxRenderTargetHeight() const = 0;

View File

@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z
ult [precise(z)] r0.z, r0.x, l(0x38800000) ult [precise(z)] r0.z, r0.x, l(0x38800000)
iadd [precise(x)] r0.x, r0.x, l(0xc8000000) iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
movc [precise(x)] r0.x, r0.z, r0.y, r0.x movc [precise(x)] r0.x, r0.z, r0.y, r0.x
iadd [precise(y)] r0.y, r0.x, l(3) ubfe [precise(y)] r0.y, l(1), l(3), r0.x
ubfe [precise(x)] r0.x, l(1), l(3), r0.x iadd [precise(x)] r0.x, r0.y, r0.x
iadd [precise(x)] r0.x, r0.x, r0.y iadd [precise(x)] r0.x, r0.x, l(3)
ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
firstbit_hi [precise(w)] r0.w, r0.y firstbit_hi [precise(w)] r0.w, r0.y
iadd [precise(w)] r0.w, r0.w, l(-11) iadd [precise(w)] r0.w, r0.w, l(-11)
@ -76,10 +76,10 @@ ret
const BYTE float24_round_ps[] = const BYTE float24_round_ps[] =
{ {
68, 88, 66, 67, 229, 54, 68, 88, 66, 67, 110, 79,
46, 1, 194, 31, 164, 202, 84, 202, 151, 165, 237, 180,
193, 71, 175, 129, 44, 52, 64, 17, 0, 132, 236, 126,
218, 154, 1, 0, 0, 0, 142, 105, 1, 0, 0, 0,
8, 7, 0, 0, 5, 0, 8, 7, 0, 0, 5, 0,
0, 0, 52, 0, 0, 0, 0, 0, 52, 0, 0, 0,
160, 0, 0, 0, 120, 2, 160, 0, 0, 0, 120, 2,
@ -259,22 +259,22 @@ const BYTE float24_round_ps[] =
0, 0, 0, 0, 26, 0, 0, 0, 0, 0, 26, 0,
16, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0, 10, 0, 16, 0, 0, 0,
0, 0, 30, 0, 16, 7, 0, 0, 138, 0, 16, 9,
34, 0, 16, 0, 0, 0, 34, 0, 16, 0, 0, 0,
0, 0, 10, 0, 16, 0,
0, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
138, 0, 8, 9, 18, 0,
16, 0, 0, 0, 0, 0,
1, 64, 0, 0, 1, 0,
0, 0, 1, 64, 0, 0, 0, 0, 1, 64, 0, 0,
3, 0, 0, 0, 10, 0, 1, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 30, 0, 8, 7,
18, 0, 16, 0, 0, 0,
0, 0, 26, 0, 16, 0,
0, 0, 0, 0, 10, 0,
16, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0,
30, 0, 8, 7, 18, 0, 30, 0, 8, 7, 18, 0,
16, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0, 10, 0, 16, 0, 0, 0,
0, 0, 26, 0, 16, 0, 0, 0, 1, 64, 0, 0,
0, 0, 0, 0, 138, 0, 3, 0, 0, 0, 138, 0,
56, 15, 114, 0, 16, 0, 56, 15, 114, 0, 16, 0,
0, 0, 0, 0, 2, 64, 0, 0, 0, 0, 2, 64,
0, 0, 24, 0, 0, 0, 0, 0, 24, 0, 0, 0,

View File

@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth {
// allow for safe reinterpretation of any 24-bit value to and from float24 // allow for safe reinterpretation of any 24-bit value to and from float24
// depth using depth output without unrestricted depth range. // depth using depth output without unrestricted depth range.
return asfloat(XeFloat20e4To32( return asfloat(XeFloat20e4To32(
XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true)); XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true),
true));
} }

View File

@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) {
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
// We also can't clamp the stored value to 1 as load->store->load must be exact. // We also can't clamp the stored value to 1 as load->store->load must be exact.
// Converts the bit pattern of a 32-bit float (f32u32) to a 20e4 value returned
// in the low 24 bits. When round_to_nearest_even is set, the three dropped low
// bits are rounded to the nearest even; otherwise they are simply truncated.
uint XeFloat32To20e4(uint f32u32) { uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) {
// Keep only positive (high bit set means negative for both float and int) and // Keep only positive (high bit set means negative for both float and int) and
// saturate to the maximum representable value near 2 (also dropping NaNs). // saturate to the maximum representable value near 2 (also dropping NaNs).
f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u); f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
// Denormalized form: set bit 23 on top of the 23 mantissa bits, then shift
// right by (113 - biased float32 exponent), capped at 24 bits.
uint denormalized = uint denormalized =
((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u); ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
// Inputs below 0x38800000 take the denormalized form; all others get the
// exponent rebiased by adding 0xC8000000 (wrapping in 32 bits).
uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u); uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; if (round_to_nearest_even) {
// Bias by 3 plus the lowest kept bit (bit 3) so that the shift below rounds to
// nearest, with exact ties going to the even result.
f24u32 += 3u + ((f24u32 >> 3u) & 1u);
}
// Drop the three extra low bits and keep the 24-bit result.
return (f24u32 >> 3u) & 0xFFFFFFu;
}
uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) { uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {

View File

@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) {
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
uint32_t Float32To20e4(float f32) { uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
if (!(f32 > 0.0f)) { if (!(f32 > 0.0f)) {
// Positive only, and not -0 or NaN. // Positive only, and not -0 or NaN.
return 0; return 0;
@ -145,7 +145,10 @@ uint32_t Float32To20e4(float f32) {
// Rebias the exponent to represent the value as a normalized 20e4. // Rebias the exponent to represent the value as a normalized 20e4.
f32u32 += 0xC8000000u; f32u32 += 0xC8000000u;
} }
return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF; if (round_to_nearest_even) {
// Add only the rounding bias: 3 plus the lowest kept bit (bit 3), so that the
// shift by 3 below rounds to nearest with exact ties going to even.
f32u32 += 3 + ((f32u32 >> 3) & 1);
}
return (f32u32 >> 3) & 0xFFFFFF;
} }
float Float20e4To32(uint32_t f24) { float Float20e4To32(uint32_t f24) {

View File

@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10);
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
// floating-point number. // floating-point number.
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
// depth, rounding to the nearest even. // depth, rounding to the nearest even or towards zero.
uint32_t Float32To20e4(float f32); uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
// IEEE-754 32-bit floating-point number. // IEEE-754 32-bit floating-point number.
float Float20e4To32(uint32_t f24); float Float20e4To32(uint32_t f24);