[D3D12] Truncate depth to float24 in EDRAM range ownership transfers and resolves by default

Doesn't ruin the "greater or equal" depth test in subsequent rendering passes if precision is lost, unlike rounding to the nearest
2022-06-22 12:53:09 +03:00 · 2022-06-22 12:53:09 +03:00 · 7869b080d3
parent e2f632f8fa
commit 7869b080d3
14 changed files with 193 additions and 185 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  // Get dynamic rasterizer state.
  uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
  uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
-  RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
-      render_target_cache_->depth_float24_conversion();
  draw_util::ViewportInfo viewport_info;
  draw_util::GetHostViewportInfo(
      regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
      D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
      normalized_depth_control,
      host_render_targets_used &&
-          (depth_float24_conversion ==
-               RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
-           depth_float24_conversion ==
-               RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
+          render_target_cache_->depth_float24_convert_in_pixel_shader(),
      host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
      viewport_info);
  draw_util::Scissor scissor;
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() {

    gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;

-    depth_float24_conversion_ = GetConfigDepthFloat24Conversion();
+    depth_float24_round_ = cvars::depth_float24_round;
+    depth_float24_convert_in_pixel_shader_ =
+        cvars::depth_float24_convert_in_pixel_shader;

    // Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
    // instead.
@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() {
    // Blending is done in linear space directly in shaders.
    gamma_render_target_as_srgb_ = false;

-    // Always true float24 depth.
-    depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding;
+    // Always true float24 depth rounded to the nearest even.
+    depth_float24_round_ = true;
+    depth_float24_convert_in_pixel_shader_ = true;

    // Only ForcedSampleCount, which doesn't support 2x.
    msaa_2x_supported_ = false;
@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget(
 bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
    xenos::DepthRenderTargetFormat format) const {
  if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
-    return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy;
+    return !depth_float24_convert_in_pixel_shader_;
  }
  return false;
 }
@ -3542,8 +3545,8 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
          } break;
          case xenos::DepthRenderTargetFormat::kD24FS8: {
            // Convert using r1.y as temporary.
-            DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1,
-                                                        true);
+            DxbcShaderTranslator::PreClampedDepthTo20e4(
+                a, i, 3, i, 3, 1, 1, depth_float24_round(), true);
          } break;
        }
        // Merge depth and stencil into r0/r1.x.
@ -3729,8 +3732,8 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
          } break;
          case xenos::DepthRenderTargetFormat::kD24FS8: {
            // Convert using r1.y as temporary.
-            DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1,
-                                                        true);
+            DxbcShaderTranslator::PreClampedDepthTo20e4(
+                a, 1, 3, 1, 3, 1, 1, depth_float24_round(), true);
          } break;
        }
        if (dest_is_color) {
@ -4105,8 +4108,8 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
                         dxbc::Src::R(0, dxbc::Src::kYYYY));
              } break;
              case xenos::DepthRenderTargetFormat::kD24FS8: {
-                DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2,
-                                                            true);
+                DxbcShaderTranslator::PreClampedDepthTo20e4(
+                    a, 0, 1, 0, 0, 0, 2, depth_float24_round(), true);
              } break;
            }
            a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
@ -6167,7 +6170,8 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
      case xenos::DepthRenderTargetFormat::kD24FS8:
        // Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
        // temporary.
-        DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true);
+        DxbcShaderTranslator::PreClampedDepthTo20e4(
+            a, 1, 0, 1, 0, 0, 0, depth_float24_round(), true);
        break;
    }
    // Combine 24-bit depth and stencil into r1.x.
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
           !cvars::snorm16_render_target_full_range;
  }

-  DepthFloat24Conversion depth_float24_conversion() const {
-    return depth_float24_conversion_;
+  bool depth_float24_round() const { return depth_float24_round_; }
+  bool depth_float24_convert_in_pixel_shader() const {
+    return depth_float24_convert_in_pixel_shader_;
  }

  DXGI_FORMAT GetColorResourceDXGIFormat(
@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {

  bool gamma_render_target_as_srgb_ = false;

-  DepthFloat24Conversion depth_float24_conversion_ =
-      DepthFloat24Conversion::kOnCopy;
+  bool depth_float24_round_ = false;
+  bool depth_float24_convert_in_pixel_shader_ = false;

  bool msaa_2x_supported_ = false;

--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification(
      RenderTargetCache::Path::kHostRenderTargets) {
    using DepthStencilMode =
        DxbcShaderTranslator::Modification::DepthStencilMode;
-    RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
-        render_target_cache_.depth_float24_conversion();
-    if ((depth_float24_conversion ==
-             RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
-         depth_float24_conversion ==
-             RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
+    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
        normalized_depth_control.z_enable &&
        regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
            xenos::DepthRenderTargetFormat::kD24FS8) {
      modification.pixel.depth_stencil_mode =
-          depth_float24_conversion ==
-                  RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating
-              ? DepthStencilMode::kFloat24Truncating
-              : DepthStencilMode::kFloat24Rounding;
+          render_target_cache_.depth_float24_round()
+              ? DepthStencilMode::kFloat24Rounding
+              : DepthStencilMode::kFloat24Truncating;
    } else {
      if (shader.implicit_early_z_write_allowed() &&
          (!shader.writes_color_target(0) ||
@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
    state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
    state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
  } else {
-    if ((description.depth_func != xenos::CompareFunction::kAlways ||
+    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
+        (description.depth_func != xenos::CompareFunction::kAlways ||
         description.depth_write) &&
        description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
-      switch (render_target_cache_.depth_float24_conversion()) {
-        case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating:
-          state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
-          state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
-          break;
-        case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding:
-          state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
-          state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
-          break;
-        default:
-          break;
+      if (render_target_cache_.depth_float24_round()) {
+        state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
+        state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
+      } else {
+        state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
+        state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
      }
    }
  }
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs,
      // interpolated Z instead if conversion can't be done exactly, without
      // modifying clipping bounds by adjusting Z in vertex shaders, as that
      // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
-      z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
-      z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
+      // Rounding the bounds to the nearest even regardless of the depth
+      // rounding mode not to add even more error by truncating twice.
+      z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
+      z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
    }
    if (full_float24_in_0_to_1) {
      // Remap the full [0...2) float24 range to [0...1) support data round-trip
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
                           uint32_t temp2_temp_component);
  // Converts the depth value externally clamped to the representable [0, 2)
  // range to 20e4 floating point, with zeros in bits 24:31, rounding to the
-  // nearest even. Source and destination may be the same, temporary must be
-  // different than both. If remap_from_0_to_0_5 is true, it's assumed that
-  // 0...1 is pre-remapped to 0...0.5 in the input.
+  // nearest even or towards zero. Source and destination may be the same,
+  // temporary must be different than both. If remap_from_0_to_0_5 is true, it's
+  // assumed that 0...1 is pre-remapped to 0...0.5 in the input.
  static void PreClampedDepthTo20e4(
      dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
      uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
-      uint32_t temp_temp_component, bool remap_from_0_to_0_5);
+      uint32_t temp_temp_component, bool round_to_nearest_even,
+      bool remap_from_0_to_0_5);
  // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
  // float. Two temporaries must be different, but one can be the same as the
  // source. The destination may be anything writable. If remap_to_0_to_0_5 is
--- a/src/xenia/gpu/dxbc_shader_translator_om.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_om.cc
@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
  } else {
    // Properly convert to 20e4, with rounding to the nearest even (the bias was
    // pre-applied by multiplying by 2), then convert back restoring the bias.
-    PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false);
+    PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false);
    Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
  }

@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32(
 void DxbcShaderTranslator::PreClampedDepthTo20e4(
    dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
    uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
-    uint32_t temp_temp_component, bool remap_from_0_to_0_5) {
+    uint32_t temp_temp_component, bool round_to_nearest_even,
+    bool remap_from_0_to_0_5) {
  assert_true(temp_temp != f24_temp ||
              temp_temp_component != f24_temp_component);
  assert_true(temp_temp != f32_temp ||
@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4(
  // Close the denormal check.
  a.OpEndIf();
  // Build the 20e4 number.
-  // temp = (biased_f32 >> 3) & 1
-  a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
-  // f24 = biased_f32 + 3
-  a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
-  // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
-  a.OpIAdd(f24_dest, f24_src, temp_src);
+  if (round_to_nearest_even) {
+    // temp = (biased_f32 >> 3) & 1
+    a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
+    // f24 = biased_f32 + 3
+    a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
+    // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
+    a.OpIAdd(f24_dest, f24_src, temp_src);
+  }
+  // For rounding to the nearest even:
  // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
+  // For rounding towards zero:
+  // f24 = (biased_f32 >> 3) & 0xFFFFFF
  a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
 }

@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
    // 20e4 conversion.
    PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
                          d32_temp_component, temp_temp, temp_temp_component,
-                          false);
+                          true, false);
  }
  a_.OpElse();
  {
--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
@ -39,42 +39,109 @@ DEFINE_bool(
    "reduce bandwidth usage during transfers as the previous depth won't need "
    "to be read.",
    "GPU");
-// The round trip is done, in particular, in 545407F2.
-DEFINE_string(
-    depth_float24_conversion, "",
-    "Method for converting 32-bit Z values to 20e4 floating point when using "
-    "host depth buffers without native 20e4 support (when not using rasterizer-"
-    "ordered views / fragment shader interlocks to perform depth testing "
-    "manually).\n"
-    "Use: [any, on_copy, truncate, round]\n"
-    " on_copy:\n"
-    "  Do depth testing at host precision, converting when copying between "
-    "color and depth buffers (or between depth buffers of different formats) "
-    "to support reinterpretation, but keeps the last host depth buffer used "
-    "for each EDRAM range and reloads the host precision value if it's still "
-    "up to date after the EDRAM range was used with a different pixel format.\n"
-    "  + Highest performance, allows early depth test and writing.\n"
-    "  + Host MSAA is possible with pixel-rate shading where supported.\n"
-    "  - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
-    "destroys precision irreparably, causing artifacts if another rendering "
-    "pass is done after the EDRAM reupload.\n"
-    " truncate:\n"
-    "  Convert to 20e4 directly in pixel shaders, always rounding down.\n"
-    "  + Average performance, conservative early depth test is possible.\n"
-    "  + No precision loss when anything changes in the storage of the depth "
-    "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
-    "  - Rounding mode is incorrect, sometimes giving results smaller than "
-    "they should be - may cause inaccuracy especially in edge cases when the "
-    "game wants to write an exact value.\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
-    " round:\n"
-    "  Convert to 20e4 directly in pixel shaders, correctly rounding to the "
-    "nearest even.\n"
-    "  + Highest accuracy.\n"
-    "  - Significantly limited performance, early depth test is not possible.\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
-    " Any other value:\n"
-    "  Choose what is considered the most optimal (currently \"on_copy\").",
+// Lossless round trip: 545407F2.
+// Lossy round trip with the "greater or equal" test afterwards: 4D530919.
+// Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF.
+DEFINE_bool(
+    depth_float24_round, false,
+    "Whether to round to the nearest even, rather than truncating (rounding "
+    "towards zero), the depth when converting it to 24-bit floating-point "
+    "(20e4) from the host precision (32-bit floating point) when using a host "
+    "depth buffer.\n"
+    "false:\n"
+    " Recommended.\n"
+    " The conversion may move the depth values farther away from the camera.\n"
+    " Without depth_float24_convert_in_pixel_shader:\n"
+    "  The \"greater or equal\" depth test function continues to work fine if "
+    "the full host precision depth data is lost, it's still possible to draw "
+    "another pass of the same geometry with it.\n"
+    "  (See the description of depth_float24_convert_in_pixel_shader for more "
+    "information about full precision depth data loss.)\n"
+    " With depth_float24_convert_in_pixel_shader:\n"
+    "  Faster - the pixel shader for hidden surfaces may still be skipped "
+    "(using conservative depth output).\n"
+    "true:\n"
+    " Only for special cases of issues caused by minor 32-bit floating-point "
+    "rounding errors, for instance, when the game tries to draw something at "
+    "the camera plane by setting Z of the vertex position to W.\n"
+    " The conversion may move the depth values closer or farther.\n"
+    " Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n"
+    " Without depth_float24_convert_in_pixel_shader:\n"
+    "  Not possible to recover from a full host precision depth data loss - in "
+    "subsequent passes of rendering the same geometry, half of the samples "
+    "will be failing the depth test with the \"greater or equal\" depth test "
+    "function.\n"
+    " With depth_float24_convert_in_pixel_shader:\n"
+    "  Slower - depth rejection before the pixel shader is not possible.\n"
+    "When the depth buffer is emulated in software (via the fragment shader "
+    "interlock / rasterizer-ordered view), this is ignored, and rounding to "
+    "the nearest even is always done.",
+    "GPU");
+// With MSAA, when converting the depth in pixel shaders, they must run at
+// sample frequency - otherwise, if the depth is the same for the entire pixel,
+// intersections of polygons cannot be antialiased.
+//
+// Important usage note: When using this mode, bounds of the fixed-function
+// viewport must be converted to and back from float24 too (preferably using
+// rounding to the nearest even regardless of whether truncation was requested
+// for the values, to reduce the error already caused by truncation rather than
+// to amplify it). This ensures that clamping to the viewport bounds, which
+// happens after the pixel shader even if it overwrites the resulting depth, is
+// never done to a value not representable as float24 (for example, if the
+// minimum Z is a number too small to be represented as float24, but not zero,
+// it won't be possible to write what should become 0x000000 to the depth
+// buffer). Note that this may add some error to the depth values from the
+// rasterizer; however, modifying Z in the vertex shader to make interpolated
+// depth values would cause clipping to be done to different bounds, which may
+// be more undesirable, especially in cases when Z is explicitly set to a value
+// like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
+// space and disappear).
+//
+// If false, the depth test is done at the host precision, and the depth is
+// converted to 20e4 to support reinterpretation. Both the last color (or
+// non-20e4 depth) value (let's call it stored_f24) and the last host depth
+// value (stored_host) are tracked for each EDRAM pixel, and the last host depth
+// value is reloaded if stored_f24 == to_f24(stored_host) (otherwise it was
+// overwritten by something else, like clearing, or an actually used color
+// buffer). This is inexact though, and will incorrectly load pixels that were
+// overwritten by something else in the EDRAM, but turned out to have the same
+// value on the guest as before - an outdated host-precision value will be
+// loaded in these cases instead.
+DEFINE_bool(
+    depth_float24_convert_in_pixel_shader, false,
+    "Whether to convert the depth values to 24-bit floating-point (20e4) from "
+    "the host precision (32-bit floating point) directly in the pixel shaders "
+    "of guest draws when using a host depth buffer.\n"
+    "This prevents visual artifacts (interleaved stripes of parts of surfaces "
+    "rendered and parts not rendered, having either the same width in case of "
+    "the \"greater or equal\" depth test function, or the former being much "
+    "thinner than the latter with the \"equal\" function) if the full host "
+    "precision depth data is lost.\n"
+    "This issue may happen if the game reloads the depth data previously "
+    "evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region "
+    "that previously contained that depth buffer was overwritten by another "
+    "depth buffer, or the game loads it to a different location in the EDRAM "
+    "than it was previously placed at, thus Xenia is unable to restore the "
+    "depth data with the original precision, and instead falls back to "
+    "converting the lower-precision values, so in subsequent rendering passes "
+    "for the same geometry, the actual depth values of the surfaces don't "
+    "match those stored in the depth buffer anymore.\n"
+    "This is a costly option because it makes the GPU unable to use depth "
+    "buffer compression, and also with MSAA, forces the pixel shader to run "
+    "for every subpixel sample rather than for the entire pixel, making pixel "
+    "shading 2 or 4 times heavier depending on the MSAA sample count.\n"
+    "The rounding direction is controlled by the depth_float24_round "
+    "configuration variable.\n"
+    "Note that with depth_float24_round = true, this becomes even more costly "
+    "because pixel shaders must be executed regardless of whether the surface "
+    "is behind the previously drawn surfaces. With depth_float24_round = "
+    "false, conservative depth output is used, however, so depth rejection "
+    "before the pixel shader may still work.\n"
+    "If sample-rate shading is not supported by the host GPU, the conversion "
+    "in the pixel shader is done only when MSAA is not used.\n"
+    "When the depth buffer is emulated in software (via the fragment shader "
+    "interlock / rasterizer-ordered view), this is ignored because 24-bit "
+    "depth is always used directly.",
    "GPU");
 DEFINE_bool(
    draw_resolution_scaled_texture_offsets, true,
@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets(
  return rts_used;
 }

-RenderTargetCache::DepthFloat24Conversion
-RenderTargetCache::GetConfigDepthFloat24Conversion() {
-  if (cvars::depth_float24_conversion == "truncate") {
-    return DepthFloat24Conversion::kOnOutputTruncating;
-  }
-  if (cvars::depth_float24_conversion == "round") {
-    return DepthFloat24Conversion::kOnOutputRounding;
-  }
-  return DepthFloat24Conversion::kOnCopy;
-}
-
 uint32_t RenderTargetCache::GetRenderTargetHeight(
    uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
  if (!pitch_tiles_at_32bpp) {
--- a/src/xenia/gpu/render_target_cache.h
+++ b/src/xenia/gpu/render_target_cache.h
@ -29,6 +29,8 @@
 #include "xenia/gpu/xenos.h"

 DECLARE_bool(depth_transfer_not_equal_test);
+DECLARE_bool(depth_float24_round);
+DECLARE_bool(depth_float24_convert_in_pixel_shader);
 DECLARE_bool(draw_resolution_scaled_texture_offsets);
 DECLARE_bool(gamma_render_target_as_srgb);
 DECLARE_bool(native_2x_msaa);
@ -89,60 +91,6 @@ class RenderTargetCache {
    kPixelShaderInterlock,
  };

-  enum class DepthFloat24Conversion {
-    // Doing depth test at the host precision, converting to 20e4 to support
-    // reinterpretation, but keeping track of both the last color (or non-20e4
-    // depth) value (let's call it stored_f24) and the last host depth value
-    // (stored_host) for each EDRAM pixel, reloading the last host depth value
-    // if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
-    // something else, like clearing, or an actually used color buffer; this is
-    // inexact though, and will incorrectly load pixels that were overwritten by
-    // something else in the EDRAM, but turned out to have the same value on the
-    // guest as before - an outdated host-precision value will be loaded in
-    // these cases instead).
-    //
-    // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM
-    // round trip destroys precision beyond repair.
-    //
-    // Full host early Z and MSAA with pixel-rate shading are supported.
-    kOnCopy,
-    // Converting the depth to the closest host value representable exactly as a
-    // 20e4 float in pixel shaders, to support invariance in cases when the
-    // guest reuploads a previously resolved depth buffer to the EDRAM, rounding
-    // towards zero (which contradicts the rounding used by the Direct3D 9
-    // reference rasterizer, but allows less-than-or-equal pixel shader depth
-    // output to be used to preserve most of early Z culling when the game is
-    // using reversed depth, which is the usual way of doing depth testing on
-    // the Xbox 360 and of utilizing the advantages of a floating-point
-    // encoding).
-    //
-    // With MSAA, pixel shaders must run at sample frequency - otherwise, if the
-    // depth is the same for the entire pixel, intersections of polygons cannot
-    // be antialiased.
-    //
-    // Important usage note: When using this mode, bounds of the fixed-function
-    // viewport must be converted to and back from float24 too (preferably using
-    // correct rounding to the nearest even, to reduce the error already caused
-    // by truncation rather than to amplify it). This ensures that clamping to
-    // the viewport bounds, which happens after the pixel shader even if it
-    // overwrites the resulting depth, is never done to a value not
-    // representable as float24 (for example, if the minimum Z is a number too
-    // small to be represented as float24, but not zero, it won't be possible to
-    // write what should become 0x000000 to the depth buffer). Note that this
-    // may add some error to the depth values from the rasterizer; however,
-    // modifying Z in the vertex shader to make interpolated depth values would
-    // cause clipping to be done to different bounds, which may be more
-    // undesirable, especially in cases when Z is explicitly set to a value like
-    // 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
-    // space and disappear).
-    kOnOutputTruncating,
-    // Similar to kOnOutputTruncating, but rounding to the nearest even, more
-    // correctly, however, because the resulting depth can be bigger than the
-    // original host value, early depth testing can't be used at all. Same
-    // viewport usage rules apply.
-    kOnOutputRounding,
-  };
-
  // Useful host-specific values.
  // sRGB conversion from the Direct3D 11.3 functional specification.
  static constexpr float kSrgbToLinearDenominator1 = 12.92f;
@ -512,8 +460,6 @@ class RenderTargetCache {
    }
  };

-  static DepthFloat24Conversion GetConfigDepthFloat24Conversion();
-
  virtual uint32_t GetMaxRenderTargetWidth() const = 0;
  virtual uint32_t GetMaxRenderTargetHeight() const = 0;

--- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h
+++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h
@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z
 ult [precise(z)] r0.z, r0.x, l(0x38800000)
 iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
 movc [precise(x)] r0.x, r0.z, r0.y, r0.x
-iadd [precise(y)] r0.y, r0.x, l(3)
-ubfe [precise(x)] r0.x, l(1), l(3), r0.x
-iadd [precise(x)] r0.x, r0.x, r0.y
+ubfe [precise(y)] r0.y, l(1), l(3), r0.x
+iadd [precise(x)] r0.x, r0.y, r0.x
+iadd [precise(x)] r0.x, r0.x, l(3)
 ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
 firstbit_hi [precise(w)] r0.w, r0.y
 iadd [precise(w)] r0.w, r0.w, l(-11)
@ -76,10 +76,10 @@ ret

 const BYTE float24_round_ps[] =
 {
-     68,  88,  66,  67, 229,  54, 
-     46,   1, 194,  31, 164, 202, 
-    193,  71, 175, 129,  44,  52, 
-    218, 154,   1,   0,   0,   0, 
+     68,  88,  66,  67, 110,  79, 
+     84, 202, 151, 165, 237, 180, 
+     64,  17,   0, 132, 236, 126, 
+    142, 105,   1,   0,   0,   0, 
      8,   7,   0,   0,   5,   0, 
      0,   0,  52,   0,   0,   0, 
    160,   0,   0,   0, 120,   2, 
@ -259,22 +259,22 @@ const BYTE float24_round_ps[] =
      0,   0,   0,   0,  26,   0, 
     16,   0,   0,   0,   0,   0, 
     10,   0,  16,   0,   0,   0, 
-      0,   0,  30,   0,  16,   7, 
+      0,   0, 138,   0,  16,   9, 
     34,   0,  16,   0,   0,   0, 
-      0,   0,  10,   0,  16,   0, 
-      0,   0,   0,   0,   1,  64, 
-      0,   0,   3,   0,   0,   0, 
-    138,   0,   8,   9,  18,   0, 
-     16,   0,   0,   0,   0,   0, 
-      1,  64,   0,   0,   1,   0, 
      0,   0,   1,  64,   0,   0, 
-      3,   0,   0,   0,  10,   0, 
+      1,   0,   0,   0,   1,  64, 
+      0,   0,   3,   0,   0,   0, 
+     10,   0,  16,   0,   0,   0, 
+      0,   0,  30,   0,   8,   7, 
+     18,   0,  16,   0,   0,   0, 
+      0,   0,  26,   0,  16,   0, 
+      0,   0,   0,   0,  10,   0, 
     16,   0,   0,   0,   0,   0, 
     30,   0,   8,   7,  18,   0, 
     16,   0,   0,   0,   0,   0, 
     10,   0,  16,   0,   0,   0, 
-      0,   0,  26,   0,  16,   0, 
-      0,   0,   0,   0, 138,   0, 
+      0,   0,   1,  64,   0,   0, 
+      3,   0,   0,   0, 138,   0, 
     56,  15, 114,   0,  16,   0, 
      0,   0,   0,   0,   2,  64, 
      0,   0,  24,   0,   0,   0, 
--- a/src/xenia/gpu/shaders/float24_round.ps.hlsl
+++ b/src/xenia/gpu/shaders/float24_round.ps.hlsl
@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth {
  // allow for safe reinterpretation of any 24-bit value to and from float24
  // depth using depth output without unrestricted depth range.
  return asfloat(XeFloat20e4To32(
-      XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true));
+      XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true),
+      true));
 }
--- a/src/xenia/gpu/shaders/pixel_formats.xesli
+++ b/src/xenia/gpu/shaders/pixel_formats.xesli
@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) {
 // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
 // We also can't clamp the stored value to 1 as load->store->load must be exact.

-uint XeFloat32To20e4(uint f32u32) {
+uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) {
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to the maximum representable value near 2 (also dropping NaNs).
  f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
  uint denormalized =
      ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
  uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
-  return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
+  if (round_to_nearest_even) {
+    f24u32 += 3u + ((f24u32 >> 3u) & 1u);
+  }
+  return (f24u32 >> 3u) & 0xFFFFFFu;
 }

 uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {
--- a/src/xenia/gpu/xenos.cc
+++ b/src/xenia/gpu/xenos.cc
@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) {
 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
 // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).

-uint32_t Float32To20e4(float f32) {
+uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
  if (!(f32 > 0.0f)) {
    // Positive only, and not -0 or NaN.
    return 0;
@ -145,7 +145,10 @@ uint32_t Float32To20e4(float f32) {
    // Rebias the exponent to represent the value as a normalized 20e4.
    f32u32 += 0xC8000000u;
  }
-  return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF;
+  if (round_to_nearest_even) {
+    f32u32 += 3 + ((f32u32 >> 3) & 1);
+  }
+  return (f32u32 >> 3) & 0xFFFFFF;
 }

 float Float20e4To32(uint32_t f24) {
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10);
 // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
 // floating-point number.
 // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
-// depth, rounding to the nearest even.
-uint32_t Float32To20e4(float f32);
+// depth, rounding to the nearest even or towards zero.
+uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
 // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
 // IEEE-754 32-bit floating-point number.
 float Float20e4To32(uint32_t f24);