diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 81f7c4c28..46bfb1452 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Get dynamic rasterizer state. uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); - RenderTargetCache::DepthFloat24Conversion depth_float24_conversion = - render_target_cache_->depth_float24_conversion(); draw_util::ViewportInfo viewport_info; draw_util::GetHostViewportInfo( regs, draw_resolution_scale_x, draw_resolution_scale_y, true, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, normalized_depth_control, host_render_targets_used && - (depth_float24_conversion == - RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating || - depth_float24_conversion == - RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding), + render_target_cache_->depth_float24_convert_in_pixel_shader(), host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), viewport_info); draw_util::Scissor scissor; diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 34c3654ed..b42cf9aba 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() { gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb; - depth_float24_conversion_ = GetConfigDepthFloat24Conversion(); + depth_float24_round_ = cvars::depth_float24_round; + depth_float24_convert_in_pixel_shader_ = + cvars::depth_float24_convert_in_pixel_shader; // Check if 2x MSAA is supported or needs to be emulated with 4x MSAA // instead. 
@@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() { // Blending is done in linear space directly in shaders. gamma_render_target_as_srgb_ = false; - // Always true float24 depth. - depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding; + // Always true float24 depth rounded to the nearest even. + depth_float24_round_ = true; + depth_float24_convert_in_pixel_shader_ = true; // Only ForcedSampleCount, which doesn't support 2x. msaa_2x_supported_ = false; @@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget( bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent( xenos::DepthRenderTargetFormat format) const { if (format == xenos::DepthRenderTargetFormat::kD24FS8) { - return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy; + return !depth_float24_convert_in_pixel_shader_; } return false; } @@ -3542,8 +3545,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { } break; case xenos::DepthRenderTargetFormat::kD24FS8: { // Convert using r1.y as temporary. - DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1, - true); + // When converting the depth in pixel shaders, it's always exact, + // truncating not to insert additional rounding instructions. + DxbcShaderTranslator::PreClampedDepthTo20e4( + a, i, 3, i, 3, 1, 1, + !depth_float24_convert_in_pixel_shader() && + depth_float24_round(), + true); } break; } // Merge depth and stencil into r0/r1.x. @@ -3729,8 +3737,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { } break; case xenos::DepthRenderTargetFormat::kD24FS8: { // Convert using r1.y as temporary. - DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1, - true); + // When converting the depth in pixel shaders, it's always exact, + // truncating not to insert additional rounding instructions. 
+ DxbcShaderTranslator::PreClampedDepthTo20e4( + a, 1, 3, 1, 3, 1, 1, + !depth_float24_convert_in_pixel_shader() && + depth_float24_round(), + true); } break; } if (dest_is_color) { @@ -4105,8 +4118,14 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { dxbc::Src::R(0, dxbc::Src::kYYYY)); } break; case xenos::DepthRenderTargetFormat::kD24FS8: { - DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2, - true); + // When converting the depth in pixel shaders, it's always + // exact, truncating not to insert additional rounding + // instructions. + DxbcShaderTranslator::PreClampedDepthTo20e4( + a, 0, 1, 0, 0, 0, 2, + !depth_float24_convert_in_pixel_shader() && + depth_float24_round(), + true); } break; } a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY), @@ -6167,7 +6186,12 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( case xenos::DepthRenderTargetFormat::kD24FS8: // Convert to [0, 2) float24 from [0, 1) float32, using r0.x as // temporary. - DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true); + // When converting the depth in pixel shaders, it's always exact, + // truncating not to insert additional rounding instructions. + DxbcShaderTranslator::PreClampedDepthTo20e4( + a, 1, 0, 1, 0, 0, 0, + !depth_float24_convert_in_pixel_shader() && depth_float24_round(), + true); break; } // Combine 24-bit depth and stencil into r1.x. 
diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index 4ad7d4b15..6f823d34e 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache { !cvars::snorm16_render_target_full_range; } - DepthFloat24Conversion depth_float24_conversion() const { - return depth_float24_conversion_; + bool depth_float24_round() const { return depth_float24_round_; } + bool depth_float24_convert_in_pixel_shader() const { + return depth_float24_convert_in_pixel_shader_; } DXGI_FORMAT GetColorResourceDXGIFormat( @@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache { bool gamma_render_target_as_srgb_ = false; - DepthFloat24Conversion depth_float24_conversion_ = - DepthFloat24Conversion::kOnCopy; + bool depth_float24_round_ = false; + bool depth_float24_convert_in_pixel_shader_ = false; bool msaa_2x_supported_ = false; diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index a42c4e9fb..699ba235e 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification( RenderTargetCache::Path::kHostRenderTargets) { using DepthStencilMode = DxbcShaderTranslator::Modification::DepthStencilMode; - RenderTargetCache::DepthFloat24Conversion depth_float24_conversion = - render_target_cache_.depth_float24_conversion(); - if ((depth_float24_conversion == - RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating || - depth_float24_conversion == - RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) && + if (render_target_cache_.depth_float24_convert_in_pixel_shader() && normalized_depth_control.z_enable && regs.Get().depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { modification.pixel.depth_stencil_mode = - 
depth_float24_conversion == - RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating - ? DepthStencilMode::kFloat24Truncating - : DepthStencilMode::kFloat24Rounding; + render_target_cache_.depth_float24_round() + ? DepthStencilMode::kFloat24Rounding + : DepthStencilMode::kFloat24Truncating; } else { if (shader.implicit_early_z_write_allowed() && (!shader.writes_color_target(0) || @@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size(); } else { - if ((description.depth_func != xenos::CompareFunction::kAlways || + if (render_target_cache_.depth_float24_convert_in_pixel_shader() && + (description.depth_func != xenos::CompareFunction::kAlways || description.depth_write) && description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { - switch (render_target_cache_.depth_float24_conversion()) { - case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating: - state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps; - state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps); - break; - case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding: - state_desc.PS.pShaderBytecode = shaders::float24_round_ps; - state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps); - break; - default: - break; + if (render_target_cache_.depth_float24_round()) { + state_desc.PS.pShaderBytecode = shaders::float24_round_ps; + state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps); + } else { + state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps; + state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps); } } } diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index e894a925d..803709a17 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs, // 
interpolated Z instead if conversion can't be done exactly, without // modifying clipping bounds by adjusting Z in vertex shaders, as that // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped. - z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min)); - z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max)); + // Rounding the bounds to the nearest even regardless of the depth + // rounding mode not to add even more error by truncating twice. + z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); + z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); } if (full_float24_in_0_to_1) { // Remap the full [0...2) float24 range to [0...1) support data round-trip diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 6b78310e8..b2e0a4ab4 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t temp2_temp_component); // Converts the depth value externally clamped to the representable [0, 2) // range to 20e4 floating point, with zeros in bits 24:31, rounding to the - // nearest even. Source and destination may be the same, temporary must be - // different than both. If remap_from_0_to_0_5 is true, it's assumed that - // 0...1 is pre-remapped to 0...0.5 in the input. + // nearest even or towards zero. Source and destination may be the same, + // temporary must be different than both. If remap_from_0_to_0_5 is true, it's + // assumed that 0...1 is pre-remapped to 0...0.5 in the input. 
static void PreClampedDepthTo20e4( dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component, uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp, - uint32_t temp_temp_component, bool remap_from_0_to_0_5); + uint32_t temp_temp_component, bool round_to_nearest_even, + bool remap_from_0_to_0_5); // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit // float. Two temporaries must be different, but one can be the same as the // source. The destination may be anything writable. If remap_to_0_to_0_5 is diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index df800b9c2..7c42c9044 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { } else { // Properly convert to 20e4, with rounding to the nearest even (the bias was // pre-applied by multiplying by 2), then convert back restoring the bias. - PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false); + PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false); Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true); } @@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32( void DxbcShaderTranslator::PreClampedDepthTo20e4( dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component, uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp, - uint32_t temp_temp_component, bool remap_from_0_to_0_5) { + uint32_t temp_temp_component, bool round_to_nearest_even, + bool remap_from_0_to_0_5) { assert_true(temp_temp != f24_temp || temp_temp_component != f24_temp_component); assert_true(temp_temp != f32_temp || @@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4( // Close the denormal check. a.OpEndIf(); // Build the 20e4 number. 
- // temp = (biased_f32 >> 3) & 1 - a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src); - // f24 = biased_f32 + 3 - a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3)); - // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) - a.OpIAdd(f24_dest, f24_src, temp_src); + if (round_to_nearest_even) { + // temp = (biased_f32 >> 3) & 1 + a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src); + // f24 = biased_f32 + 3 + a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3)); + // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) + a.OpIAdd(f24_dest, f24_src, temp_src); + } + // For rounding to the nearest even: // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + // For rounding towards zero: + // f24 = (biased_f32 >> 3) & 0xFFFFFF a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src); } @@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, // 20e4 conversion. PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp, d32_temp_component, temp_temp, temp_temp_component, - false); + true, false); } a_.OpElse(); { diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2f7a3951c..3a73d314e 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -39,42 +39,109 @@ DEFINE_bool( "reduce bandwidth usage during transfers as the previous depth won't need " "to be read.", "GPU"); -// The round trip is done, in particular, in 545407F2. 
-DEFINE_string( - depth_float24_conversion, "", - "Method for converting 32-bit Z values to 20e4 floating point when using " - "host depth buffers without native 20e4 support (when not using rasterizer-" - "ordered views / fragment shader interlocks to perform depth testing " - "manually).\n" - "Use: [any, on_copy, truncate, round]\n" - " on_copy:\n" - " Do depth testing at host precision, converting when copying between " - "color and depth buffers (or between depth buffers of different formats) " - "to support reinterpretation, but keeps the last host depth buffer used " - "for each EDRAM range and reloads the host precision value if it's still " - "up to date after the EDRAM range was used with a different pixel format.\n" - " + Highest performance, allows early depth test and writing.\n" - " + Host MSAA is possible with pixel-rate shading where supported.\n" - " - EDRAM > RAM > EDRAM depth buffer round trip done in certain games " - "destroys precision irreparably, causing artifacts if another rendering " - "pass is done after the EDRAM reupload.\n" - " truncate:\n" - " Convert to 20e4 directly in pixel shaders, always rounding down.\n" - " + Average performance, conservative early depth test is possible.\n" - " + No precision loss when anything changes in the storage of the depth " - "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n" - " - Rounding mode is incorrect, sometimes giving results smaller than " - "they should be - may cause inaccuracy especially in edge cases when the " - "game wants to write an exact value.\n" - " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" - " round:\n" - " Convert to 20e4 directly in pixel shaders, correctly rounding to the " - "nearest even.\n" - " + Highest accuracy.\n" - " - Significantly limited performance, early depth test is not possible.\n" - " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" - " Any other value:\n" - " Choose what is considered the most 
optimal (currently \"on_copy\").", +// Lossless round trip: 545407F2. +// Lossy round trip with the "greater or equal" test afterwards: 4D530919. +// Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF. +DEFINE_bool( + depth_float24_round, false, + "Whether to round to the nearest even, rather than truncating (rounding " + "towards zero), the depth when converting it to 24-bit floating-point " + "(20e4) from the host precision (32-bit floating point) when using a host " + "depth buffer.\n" + "false:\n" + " Recommended.\n" + " The conversion may move the depth values farther away from the camera.\n" + " Without depth_float24_convert_in_pixel_shader:\n" + " The \"greater or equal\" depth test function continues to work fine if " + "the full host precision depth data is lost, it's still possible to draw " + "another pass of the same geometry with it.\n" + " (See the description of depth_float24_convert_in_pixel_shader for more " + "information about full precision depth data loss.)\n" + " With depth_float24_convert_in_pixel_shader:\n" + " Faster - the pixel shader for hidden surfaces may still be skipped " + "(using conservative depth output).\n" + "true:\n" + " Only for special cases of issues caused by minor 32-bit floating-point " + "rounding errors, for instance, when the game tries to draw something at " + "the camera plane by setting Z of the vertex position to W.\n" + " The conversion may move the depth values closer or farther.\n" + " Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n" + " Without depth_float24_convert_in_pixel_shader:\n" + " Not possible to recover from a full host precision depth data loss - in " + "subsequent passes of rendering the same geometry, half of the samples " + "will be failing the depth test with the \"greater or equal\" depth test " + "function.\n" + " With depth_float24_convert_in_pixel_shader:\n" + " Slower - depth rejection before the pixel shader is not possible.\n" + "When the depth 
buffer is emulated in software (via the fragment shader " + "interlock / rasterizer-ordered view), this is ignored, and rounding to " + "the nearest even is always done.", + "GPU"); +// With MSAA, when converting the depth in pixel shaders, they must run at +// sample frequency - otherwise, if the depth is the same for the entire pixel, +// intersections of polygons cannot be antialiased. +// +// Important usage note: When using this mode, bounds of the fixed-function +// viewport must be converted to and back from float24 too (preferably using +// rounding to the nearest even regardless of whether truncation was requested +// for the values, to reduce the error already caused by truncation rather than +// to amplify it). This ensures that clamping to the viewport bounds, which +// happens after the pixel shader even if it overwrites the resulting depth, is +// never done to a value not representable as float24 (for example, if the +// minimum Z is a number too small to be represented as float24, but not zero, +// it won't be possible to write what should become 0x000000 to the depth +// buffer). Note that this may add some error to the depth values from the +// rasterizer; however, modifying Z in the vertex shader to make interpolated +// depth values would cause clipping to be done to different bounds, which may +// be more undesirable, especially in cases when Z is explicitly set to a value +// like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip +// space and disappear). 
+// +// If false, doing the depth test at the host precision, converting to 20e4 to +// support reinterpretation, but keeping track of both the last color (or +// non-20e4 depth) value (let's call it stored_f24) and the last host depth +// value (stored_host) for each EDRAM pixel, reloading the last host depth value +// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by +// something else, like clearing, or an actually used color buffer; this is +// inexact though, and will incorrectly load pixels that were overwritten by +// something else in the EDRAM, but turned out to have the same value on the +// guest as before - an outdated host-precision value will be loaded in these +// cases instead). +DEFINE_bool( + depth_float24_convert_in_pixel_shader, false, + "Whether to convert the depth values to 24-bit floating-point (20e4) from " + "the host precision (32-bit floating point) directly in the pixel shaders " + "of guest draws when using a host depth buffer.\n" + "This prevents visual artifacts (interleaved stripes of parts of surfaces " + "rendered and parts not rendered, having either the same width in case of " + "the \"greater or equal\" depth test function, or the former being much " + "thinner than the latter with the \"equal\" function) if the full host " + "precision depth data is lost.\n" + "This issue may happen if the game reloads the depth data previously " + "evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region " + "that previously contained that depth buffer was overwritten by another " + "depth buffer, or the game loads it to a different location in the EDRAM " + "than it was previously placed at, thus Xenia is unable to restore the " + "depth data with the original precision, and instead falls back to " + "converting the lower-precision values, so in subsequent rendering passes " + "for the same geometry, the actual depth values of the surfaces don't " + "match those stored in the depth buffer anymore.\n" + 
"This is a costly option because it makes the GPU unable to use depth " + "buffer compression, and also with MSAA, forces the pixel shader to run " + "for every subpixel sample rather than for the entire pixel, making pixel " + "shading 2 or 4 times heavier depending on the MSAA sample count.\n" + "The rounding direction is controlled by the depth_float24_round " + "configuration variable.\n" + "Note that with depth_float24_round = true, this becomes even more costly " + "because pixel shaders must be executed regardless of whether the surface " + "is behind the previously drawn surfaces. With depth_float24_round = " + "false, conservative depth output is used, however, so depth rejection " + "before the pixel shader may still work.\n" + "If sample-rate shading is not supported by the host GPU, the conversion " + "in the pixel shader is done only when MSAA is not used.\n" + "When the depth buffer is emulated in software (via the fragment shader " + "interlock / rasterizer-ordered view), this is ignored because 24-bit " + "depth is always used directly.", "GPU"); DEFINE_bool( draw_resolution_scaled_texture_offsets, true, @@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets( return rts_used; } -RenderTargetCache::DepthFloat24Conversion -RenderTargetCache::GetConfigDepthFloat24Conversion() { - if (cvars::depth_float24_conversion == "truncate") { - return DepthFloat24Conversion::kOnOutputTruncating; - } - if (cvars::depth_float24_conversion == "round") { - return DepthFloat24Conversion::kOnOutputRounding; - } - return DepthFloat24Conversion::kOnCopy; -} - uint32_t RenderTargetCache::GetRenderTargetHeight( uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const { if (!pitch_tiles_at_32bpp) { diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index a8cab45d6..d794f66e7 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -29,6 +29,8 @@ #include 
"xenia/gpu/xenos.h" DECLARE_bool(depth_transfer_not_equal_test); +DECLARE_bool(depth_float24_round); +DECLARE_bool(depth_float24_convert_in_pixel_shader); DECLARE_bool(draw_resolution_scaled_texture_offsets); DECLARE_bool(gamma_render_target_as_srgb); DECLARE_bool(native_2x_msaa); @@ -89,60 +91,6 @@ class RenderTargetCache { kPixelShaderInterlock, }; - enum class DepthFloat24Conversion { - // Doing depth test at the host precision, converting to 20e4 to support - // reinterpretation, but keeping track of both the last color (or non-20e4 - // depth) value (let's call it stored_f24) and the last host depth value - // (stored_host) for each EDRAM pixel, reloading the last host depth value - // if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by - // something else, like clearing, or an actually used color buffer; this is - // inexact though, and will incorrectly load pixels that were overwritten by - // something else in the EDRAM, but turned out to have the same value on the - // guest as before - an outdated host-precision value will be loaded in - // these cases instead). - // - // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM - // round trip destroys precision beyond repair. - // - // Full host early Z and MSAA with pixel-rate shading are supported. - kOnCopy, - // Converting the depth to the closest host value representable exactly as a - // 20e4 float in pixel shaders, to support invariance in cases when the - // guest reuploads a previously resolved depth buffer to the EDRAM, rounding - // towards zero (which contradicts the rounding used by the Direct3D 9 - // reference rasterizer, but allows less-than-or-equal pixel shader depth - // output to be used to preserve most of early Z culling when the game is - // using reversed depth, which is the usual way of doing depth testing on - // the Xbox 360 and of utilizing the advantages of a floating-point - // encoding). 
- // - // With MSAA, pixel shaders must run at sample frequency - otherwise, if the - // depth is the same for the entire pixel, intersections of polygons cannot - // be antialiased. - // - // Important usage note: When using this mode, bounds of the fixed-function - // viewport must be converted to and back from float24 too (preferably using - // correct rounding to the nearest even, to reduce the error already caused - // by truncation rather than to amplify it). This ensures that clamping to - // the viewport bounds, which happens after the pixel shader even if it - // overwrites the resulting depth, is never done to a value not - // representable as float24 (for example, if the minimum Z is a number too - // small to be represented as float24, but not zero, it won't be possible to - // write what should become 0x000000 to the depth buffer). Note that this - // may add some error to the depth values from the rasterizer; however, - // modifying Z in the vertex shader to make interpolated depth values would - // cause clipping to be done to different bounds, which may be more - // undesirable, especially in cases when Z is explicitly set to a value like - // 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip - // space and disappear). - kOnOutputTruncating, - // Similar to kOnOutputTruncating, but rounding to the nearest even, more - // correctly, however, because the resulting depth can be bigger than the - // original host value, early depth testing can't be used at all. Same - // viewport usage rules apply. - kOnOutputRounding, - }; - // Useful host-specific values. // sRGB conversion from the Direct3D 11.3 functional specification. 
static constexpr float kSrgbToLinearDenominator1 = 12.92f; @@ -512,8 +460,6 @@ class RenderTargetCache { } }; - static DepthFloat24Conversion GetConfigDepthFloat24Conversion(); - virtual uint32_t GetMaxRenderTargetWidth() const = 0; virtual uint32_t GetMaxRenderTargetHeight() const = 0; diff --git a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h index 8a82eb0e7..7ccf5cb3a 100644 --- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h +++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h @@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z ult [precise(z)] r0.z, r0.x, l(0x38800000) iadd [precise(x)] r0.x, r0.x, l(0xc8000000) movc [precise(x)] r0.x, r0.z, r0.y, r0.x -iadd [precise(y)] r0.y, r0.x, l(3) -ubfe [precise(x)] r0.x, l(1), l(3), r0.x -iadd [precise(x)] r0.x, r0.x, r0.y +ubfe [precise(y)] r0.y, l(1), l(3), r0.x +iadd [precise(x)] r0.x, r0.y, r0.x +iadd [precise(x)] r0.x, r0.x, l(3) ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx firstbit_hi [precise(w)] r0.w, r0.y iadd [precise(w)] r0.w, r0.w, l(-11) @@ -76,10 +76,10 @@ ret const BYTE float24_round_ps[] = { - 68, 88, 66, 67, 229, 54, - 46, 1, 194, 31, 164, 202, - 193, 71, 175, 129, 44, 52, - 218, 154, 1, 0, 0, 0, + 68, 88, 66, 67, 110, 79, + 84, 202, 151, 165, 237, 180, + 64, 17, 0, 132, 236, 126, + 142, 105, 1, 0, 0, 0, 8, 7, 0, 0, 5, 0, 0, 0, 52, 0, 0, 0, 160, 0, 0, 0, 120, 2, @@ -259,22 +259,22 @@ const BYTE float24_round_ps[] = 0, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, - 0, 0, 30, 0, 16, 7, + 0, 0, 138, 0, 16, 9, 34, 0, 16, 0, 0, 0, - 0, 0, 10, 0, 16, 0, - 0, 0, 0, 0, 1, 64, - 0, 0, 3, 0, 0, 0, - 138, 0, 8, 9, 18, 0, - 16, 0, 0, 0, 0, 0, - 1, 64, 0, 0, 1, 0, 0, 0, 1, 64, 0, 0, - 3, 0, 0, 0, 10, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 3, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 30, 0, 8, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 30, 0, 8, 
7, 18, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, - 0, 0, 26, 0, 16, 0, - 0, 0, 0, 0, 138, 0, + 0, 0, 1, 64, 0, 0, + 3, 0, 0, 0, 138, 0, 56, 15, 114, 0, 16, 0, 0, 0, 0, 0, 2, 64, 0, 0, 24, 0, 0, 0, diff --git a/src/xenia/gpu/shaders/float24_round.ps.hlsl b/src/xenia/gpu/shaders/float24_round.ps.hlsl index 87f0ae56a..e142b1c5c 100644 --- a/src/xenia/gpu/shaders/float24_round.ps.hlsl +++ b/src/xenia/gpu/shaders/float24_round.ps.hlsl @@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth { // allow for safe reinterpretation of any 24-bit value to and from float24 // depth using depth output without unrestricted depth range. return asfloat(XeFloat20e4To32( - XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true)); + XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true), + true)); } diff --git a/src/xenia/gpu/shaders/pixel_formats.xesli b/src/xenia/gpu/shaders/pixel_formats.xesli index 3492739ce..4328ad876 100644 --- a/src/xenia/gpu/shaders/pixel_formats.xesli +++ b/src/xenia/gpu/shaders/pixel_formats.xesli @@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) { // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). // We also can't clamp the stored value to 1 as load->store->load must be exact. -uint XeFloat32To20e4(uint f32u32) { +uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) { // Keep only positive (high bit set means negative for both float and int) and // saturate to the maximum representable value near 2 (also dropping NaNs). f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u); uint denormalized = ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u); uint f24u32 = (f32u32 < 0x38800000u) ? 
denormalized : (f32u32 + 0xC8000000u); - return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; + if (round_to_nearest_even) { + f24u32 += 3u + ((f24u32 >> 3u) & 1u); + } + return (f24u32 >> 3u) & 0xFFFFFFu; } uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) { diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index 397ba3b24..66e3a4fe2 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) { // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). -uint32_t Float32To20e4(float f32) { +uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { if (!(f32 > 0.0f)) { // Positive only, and not -0 or NaN. return 0; @@ -145,7 +145,11 @@ uint32_t Float32To20e4(float f32) { // Rebias the exponent to represent the value as a normalized 20e4. f32u32 += 0xC8000000u; } - return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF; + if (round_to_nearest_even) { + // Bias by 3 plus the lowest kept bit so ties round to the nearest even. + f32u32 += 3 + ((f32u32 >> 3) & 1); + } + return (f32u32 >> 3) & 0xFFFFFF; } float Float20e4To32(uint32_t f24) { diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index c4e0870d6..95c104a01 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10); // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit // floating-point number. // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point -// depth, rounding to the nearest even. -uint32_t Float32To20e4(float f32); +// depth, rounding to the nearest even or towards zero. +uint32_t Float32To20e4(float f32, bool round_to_nearest_even); // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. float Float20e4To32(uint32_t f24);