Merge branch 'master' into vulkan

2022-06-22 13:15:50 +03:00 · 2022-06-22 13:15:50 +03:00 · 0d8bd0e0c6
parent c0703e64db cbf0476d42
commit 0d8bd0e0c6
14 changed files with 213 additions and 185 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  // Get dynamic rasterizer state.
  uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
  uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
  RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
      render_target_cache_->depth_float24_conversion();
  draw_util::ViewportInfo viewport_info;
  draw_util::GetHostViewportInfo(
      regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
      D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
      normalized_depth_control,
      host_render_targets_used &&
-          (depth_float24_conversion ==
+          render_target_cache_->depth_float24_convert_in_pixel_shader(),
               RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
           depth_float24_conversion ==
               RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
      host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
      viewport_info);
  draw_util::Scissor scissor;
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() {
    gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;
-    depth_float24_conversion_ = GetConfigDepthFloat24Conversion();
+    depth_float24_round_ = cvars::depth_float24_round;
    depth_float24_convert_in_pixel_shader_ =
        cvars::depth_float24_convert_in_pixel_shader;
    // Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
    // instead.
@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() {
    // Blending is done in linear space directly in shaders.
    gamma_render_target_as_srgb_ = false;
-    // Always true float24 depth.
+    // Always true float24 depth rounded to the nearest even.
-    depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding;
+    depth_float24_round_ = true;
    depth_float24_convert_in_pixel_shader_ = true;
    // Only ForcedSampleCount, which doesn't support 2x.
    msaa_2x_supported_ = false;
@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget(
 bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
    xenos::DepthRenderTargetFormat format) const {
  if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
-    return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy;
+    return !depth_float24_convert_in_pixel_shader_;
  }
  return false;
 }
@ -3542,8 +3545,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
          } break;
          case xenos::DepthRenderTargetFormat::kD24FS8: {
            // Convert using r1.y as temporary.
-            DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1,
+            // When converting the depth in pixel shaders, it's always exact,
-                                                        true);
+            // truncating not to insert additional rounding instructions.
            DxbcShaderTranslator::PreClampedDepthTo20e4(
                a, i, 3, i, 3, 1, 1,
                !depth_float24_convert_in_pixel_shader() &&
                    depth_float24_round(),
                true);
          } break;
        }
        // Merge depth and stencil into r0/r1.x.
@ -3729,8 +3737,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
          } break;
          case xenos::DepthRenderTargetFormat::kD24FS8: {
            // Convert using r1.y as temporary.
-            DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1,
+            // When converting the depth in pixel shaders, it's always exact,
-                                                        true);
+            // truncating not to insert additional rounding instructions.
            DxbcShaderTranslator::PreClampedDepthTo20e4(
                a, 1, 3, 1, 3, 1, 1,
                !depth_float24_convert_in_pixel_shader() &&
                    depth_float24_round(),
                true);
          } break;
        }
        if (dest_is_color) {
@ -4105,8 +4118,14 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
                         dxbc::Src::R(0, dxbc::Src::kYYYY));
              } break;
              case xenos::DepthRenderTargetFormat::kD24FS8: {
-                DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2,
+                // When converting the depth in pixel shaders, it's always
-                                                            true);
+                // exact, truncating not to insert additional rounding
                // instructions.
                DxbcShaderTranslator::PreClampedDepthTo20e4(
                    a, 0, 1, 0, 0, 0, 2,
                    !depth_float24_convert_in_pixel_shader() &&
                        depth_float24_round(),
                    true);
              } break;
            }
            a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
@ -6167,7 +6186,12 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
      case xenos::DepthRenderTargetFormat::kD24FS8:
        // Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
        // temporary.
-        DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true);
+        // When converting the depth in pixel shaders, it's always exact,
        // truncating not to insert additional rounding instructions.
        DxbcShaderTranslator::PreClampedDepthTo20e4(
            a, 1, 0, 1, 0, 0, 0,
            !depth_float24_convert_in_pixel_shader() && depth_float24_round(),
            true);
        break;
    }
    // Combine 24-bit depth and stencil into r1.x.
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
           !cvars::snorm16_render_target_full_range;
  }
-  DepthFloat24Conversion depth_float24_conversion() const {
+  bool depth_float24_round() const { return depth_float24_round_; }
-    return depth_float24_conversion_;
+  bool depth_float24_convert_in_pixel_shader() const {
    return depth_float24_convert_in_pixel_shader_;
  }
  DXGI_FORMAT GetColorResourceDXGIFormat(
@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
  bool gamma_render_target_as_srgb_ = false;
-  DepthFloat24Conversion depth_float24_conversion_ =
+  bool depth_float24_round_ = false;
-      DepthFloat24Conversion::kOnCopy;
+  bool depth_float24_convert_in_pixel_shader_ = false;
  bool msaa_2x_supported_ = false;
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification(
      RenderTargetCache::Path::kHostRenderTargets) {
    using DepthStencilMode =
        DxbcShaderTranslator::Modification::DepthStencilMode;
-    RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
+    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
        render_target_cache_.depth_float24_conversion();
    if ((depth_float24_conversion ==
             RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
         depth_float24_conversion ==
             RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
        normalized_depth_control.z_enable &&
        regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
            xenos::DepthRenderTargetFormat::kD24FS8) {
      modification.pixel.depth_stencil_mode =
-          depth_float24_conversion ==
+          render_target_cache_.depth_float24_round()
-                  RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating
+              ? DepthStencilMode::kFloat24Rounding
-              ? DepthStencilMode::kFloat24Truncating
+              : DepthStencilMode::kFloat24Truncating;
              : DepthStencilMode::kFloat24Rounding;
    } else {
      if (shader.implicit_early_z_write_allowed() &&
          (!shader.writes_color_target(0) ||
@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
    state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
    state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
  } else {
-    if ((description.depth_func != xenos::CompareFunction::kAlways ||
+    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
        (description.depth_func != xenos::CompareFunction::kAlways ||
         description.depth_write) &&
        description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
-      switch (render_target_cache_.depth_float24_conversion()) {
+      if (render_target_cache_.depth_float24_round()) {
-        case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating:
+        state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
-          state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
+        state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
-          state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
+      } else {
-          break;
+        state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
-        case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding:
+        state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
          state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
          state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
          break;
        default:
          break;
      }
    }
  }
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs,
      // interpolated Z instead if conversion can't be done exactly, without
      // modifying clipping bounds by adjusting Z in vertex shaders, as that
      // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
-      z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
+      // Rounding the bounds to the nearest even regardless of the depth
-      z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
+      // rounding mode not to add even more error by truncating twice.
      z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
      z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
    }
    if (full_float24_in_0_to_1) {
      // Remap the full [0...2) float24 range to [0...1) support data round-trip
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
                           uint32_t temp2_temp_component);
  // Converts the depth value externally clamped to the representable [0, 2)
  // range to 20e4 floating point, with zeros in bits 24:31, rounding to the
-  // nearest even. Source and destination may be the same, temporary must be
+  // nearest even or towards zero. Source and destination may be the same,
-  // different than both. If remap_from_0_to_0_5 is true, it's assumed that
+  // temporary must be different than both. If remap_from_0_to_0_5 is true, it's
-  // 0...1 is pre-remapped to 0...0.5 in the input.
+  // assumed that 0...1 is pre-remapped to 0...0.5 in the input.
  static void PreClampedDepthTo20e4(
      dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
      uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
-      uint32_t temp_temp_component, bool remap_from_0_to_0_5);
+      uint32_t temp_temp_component, bool round_to_nearest_even,
      bool remap_from_0_to_0_5);
  // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
  // float. Two temporaries must be different, but one can be the same as the
  // source. The destination may be anything writable. If remap_to_0_to_0_5 is
--- a/src/xenia/gpu/dxbc_shader_translator_om.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_om.cc
@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
  } else {
    // Properly convert to 20e4, with rounding to the nearest even (the bias was
    // pre-applied by multiplying by 2), then convert back restoring the bias.
-    PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false);
+    PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false);
    Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
  }
@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32(
 void DxbcShaderTranslator::PreClampedDepthTo20e4(
    dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
    uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
-    uint32_t temp_temp_component, bool remap_from_0_to_0_5) {
+    uint32_t temp_temp_component, bool round_to_nearest_even,
    bool remap_from_0_to_0_5) {
  assert_true(temp_temp != f24_temp ||
              temp_temp_component != f24_temp_component);
  assert_true(temp_temp != f32_temp ||
@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4(
  // Close the denormal check.
  a.OpEndIf();
  // Build the 20e4 number.
-  // temp = (biased_f32 >> 3) & 1
+  if (round_to_nearest_even) {
-  a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
+    // temp = (biased_f32 >> 3) & 1
-  // f24 = biased_f32 + 3
+    a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
-  a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
+    // f24 = biased_f32 + 3
-  // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
+    a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
-  a.OpIAdd(f24_dest, f24_src, temp_src);
+    // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
    a.OpIAdd(f24_dest, f24_src, temp_src);
  }
  // For rounding to the nearest even:
  // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
  // For rounding towards zero:
  // f24 = (biased_f32 >> 3) & 0xFFFFFF
  a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
 }
@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
    // 20e4 conversion.
    PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
                          d32_temp_component, temp_temp, temp_temp_component,
-                          false);
+                          true, false);
  }
  a_.OpElse();
  {
--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
@ -39,42 +39,109 @@ DEFINE_bool(
    "reduce bandwidth usage during transfers as the previous depth won't need "
    "to be read.",
    "GPU");
-// The round trip is done, in particular, in 545407F2.
+// Lossless round trip: 545407F2.
-DEFINE_string(
+// Lossy round trip with the "greater or equal" test afterwards: 4D530919.
-    depth_float24_conversion, "",
+// Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF.
-    "Method for converting 32-bit Z values to 20e4 floating point when using "
+DEFINE_bool(
-    "host depth buffers without native 20e4 support (when not using rasterizer-"
+    depth_float24_round, false,
-    "ordered views / fragment shader interlocks to perform depth testing "
+    "Whether to round to the nearest even, rather than truncating (rounding "
-    "manually).\n"
+    "towards zero), the depth when converting it to 24-bit floating-point "
-    "Use: [any, on_copy, truncate, round]\n"
+    "(20e4) from the host precision (32-bit floating point) when using a host "
-    " on_copy:\n"
+    "depth buffer.\n"
-    "  Do depth testing at host precision, converting when copying between "
+    "false:\n"
-    "color and depth buffers (or between depth buffers of different formats) "
+    " Recommended.\n"
-    "to support reinterpretation, but keeps the last host depth buffer used "
+    " The conversion may move the depth values farther away from the camera.\n"
-    "for each EDRAM range and reloads the host precision value if it's still "
+    " Without depth_float24_convert_in_pixel_shader:\n"
-    "up to date after the EDRAM range was used with a different pixel format.\n"
+    "  The \"greater or equal\" depth test function continues to work fine if "
-    "  + Highest performance, allows early depth test and writing.\n"
+    "the full host precision depth data is lost, it's still possible to draw "
-    "  + Host MSAA is possible with pixel-rate shading where supported.\n"
+    "another pass of the same geometry with it.\n"
-    "  - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
+    "  (See the description of depth_float24_convert_in_pixel_shader for more "
-    "destroys precision irreparably, causing artifacts if another rendering "
+    "information about full precision depth data loss.)\n"
-    "pass is done after the EDRAM reupload.\n"
+    " With depth_float24_convert_in_pixel_shader:\n"
-    " truncate:\n"
+    "  Faster - the pixel shader for hidden surfaces may still be skipped "
-    "  Convert to 20e4 directly in pixel shaders, always rounding down.\n"
+    "(using conservative depth output).\n"
-    "  + Average performance, conservative early depth test is possible.\n"
+    "true:\n"
-    "  + No precision loss when anything changes in the storage of the depth "
+    " Only for special cases of issues caused by minor 32-bit floating-point "
-    "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
+    "rounding errors, for instance, when the game tries to draw something at "
-    "  - Rounding mode is incorrect, sometimes giving results smaller than "
+    "the camera plane by setting Z of the vertex position to W.\n"
-    "they should be - may cause inaccuracy especially in edge cases when the "
+    " The conversion may move the depth values closer or farther.\n"
-    "game wants to write an exact value.\n"
+    " Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
+    " Without depth_float24_convert_in_pixel_shader:\n"
-    " round:\n"
+    "  Not possible to recover from a full host precision depth data loss - in "
-    "  Convert to 20e4 directly in pixel shaders, correctly rounding to the "
+    "subsequent passes of rendering the same geometry, half of the samples "
-    "nearest even.\n"
+    "will be failing the depth test with the \"greater or equal\" depth test "
-    "  + Highest accuracy.\n"
+    "function.\n"
-    "  - Significantly limited performance, early depth test is not possible.\n"
+    " With depth_float24_convert_in_pixel_shader:\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
+    "  Slower - depth rejection before the pixel shader is not possible.\n"
-    " Any other value:\n"
+    "When the depth buffer is emulated in software (via the fragment shader "
-    "  Choose what is considered the most optimal (currently \"on_copy\").",
+    "interlock / rasterizer-ordered view), this is ignored, and rounding to "
    "the nearest even is always done.",
    "GPU");
 // With MSAA, when converting the depth in pixel shaders, they must run at
 // sample frequency - otherwise, if the depth is the same for the entire pixel,
 // intersections of polygons cannot be antialiased.
 //
 // Important usage note: When using this mode, bounds of the fixed-function
 // viewport must be converted to and back from float24 too (preferably using
 // rounding to the nearest even regardless of whether truncation was requested
 // for the values, to reduce the error already caused by truncation rather than
 // to amplify it). This ensures that clamping to the viewport bounds, which
 // happens after the pixel shader even if it overwrites the resulting depth, is
 // never done to a value not representable as float24 (for example, if the
 // minimum Z is a number too small to be represented as float24, but not zero,
 // it won't be possible to write what should become 0x000000 to the depth
 // buffer). Note that this may add some error to the depth values from the
 // rasterizer; however, modifying Z in the vertex shader to make interpolated
 // depth values would cause clipping to be done to different bounds, which may
 // be more undesirable, especially in cases when Z is explicitly set to a value
 // like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
 // space and disappear).
 //
 // If false, doing the depth test at the host precision, converting to 20e4 to
 // support reinterpretation, but keeping track of both the last color (or
 // non-20e4 depth) value (let's call it stored_f24) and the last host depth
 // value (stored_host) for each EDRAM pixel, reloading the last host depth value
 // if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
 // something else, like clearing, or an actually used color buffer; this is
 // inexact though, and will incorrectly load pixels that were overwritten by
 // something else in the EDRAM, but turned out to have the same value on the
 // guest as before - an outdated host-precision value will be loaded in these
 // cases instead).
 DEFINE_bool(
    depth_float24_convert_in_pixel_shader, false,
    "Whether to convert the depth values to 24-bit floating-point (20e4) from "
    "the host precision (32-bit floating point) directly in the pixel shaders "
    "of guest draws when using a host depth buffer.\n"
    "This prevents visual artifacts (interleaved stripes of parts of surfaces "
    "rendered and parts not rendered, having either the same width in case of "
    "the \"greater or equal\" depth test function, or the former being much "
    "thinner than the latter with the \"equal\" function) if the full host "
    "precision depth data is lost.\n"
    "This issue may happen if the game reloads the depth data previously "
    "evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region "
    "that previously contained that depth buffer was overwritten by another "
    "depth buffer, or the game loads it to a different location in the EDRAM "
    "than it was previously placed at, thus Xenia is unable to restore the "
    "depth data with the original precision, and instead falls back to "
    "converting the lower-precision values, so in subsequent rendering passes "
    "for the same geometry, the actual depth values of the surfaces don't "
    "match those stored in the depth buffer anymore.\n"
    "This is a costly option because it makes the GPU unable to use depth "
    "buffer compression, and also with MSAA, forces the pixel shader to run "
    "for every subpixel sample rather than for the entire pixel, making pixel "
    "shading 2 or 4 times heavier depending on the MSAA sample count.\n"
    "The rounding direction is controlled by the depth_float24_round "
    "configuration variable.\n"
    "Note that with depth_float24_round = true, this becomes even more costly "
    "because pixel shaders must be executed regardless of whether the surface "
    "is behind the previously drawn surfaces. With depth_float24_round = "
    "false, conservative depth output is used, however, so depth rejection "
    "before the pixel shader may still work.\n"
    "If sample-rate shading is not supported by the host GPU, the conversion "
    "in the pixel shader is done only when MSAA is not used.\n"
    "When the depth buffer is emulated in software (via the fragment shader "
    "interlock / rasterizer-ordered view), this is ignored because 24-bit "
    "depth is always used directly.",
    "GPU");
 DEFINE_bool(
    draw_resolution_scaled_texture_offsets, true,
@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets(
  return rts_used;
 }
 RenderTargetCache::DepthFloat24Conversion
 RenderTargetCache::GetConfigDepthFloat24Conversion() {
  if (cvars::depth_float24_conversion == "truncate") {
    return DepthFloat24Conversion::kOnOutputTruncating;
  }
  if (cvars::depth_float24_conversion == "round") {
    return DepthFloat24Conversion::kOnOutputRounding;
  }
  return DepthFloat24Conversion::kOnCopy;
 }
 uint32_t RenderTargetCache::GetRenderTargetHeight(
    uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
  if (!pitch_tiles_at_32bpp) {
--- a/src/xenia/gpu/render_target_cache.h
+++ b/src/xenia/gpu/render_target_cache.h
@ -29,6 +29,8 @@
 #include "xenia/gpu/xenos.h"
 DECLARE_bool(depth_transfer_not_equal_test);
 DECLARE_bool(depth_float24_round);
 DECLARE_bool(depth_float24_convert_in_pixel_shader);
 DECLARE_bool(draw_resolution_scaled_texture_offsets);
 DECLARE_bool(gamma_render_target_as_srgb);
 DECLARE_bool(native_2x_msaa);
@ -89,60 +91,6 @@ class RenderTargetCache {
    kPixelShaderInterlock,
  };
  enum class DepthFloat24Conversion {
    // Doing depth test at the host precision, converting to 20e4 to support
    // reinterpretation, but keeping track of both the last color (or non-20e4
    // depth) value (let's call it stored_f24) and the last host depth value
    // (stored_host) for each EDRAM pixel, reloading the last host depth value
    // if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
    // something else, like clearing, or an actually used color buffer; this is
    // inexact though, and will incorrectly load pixels that were overwritten by
    // something else in the EDRAM, but turned out to have the same value on the
    // guest as before - an outdated host-precision value will be loaded in
    // these cases instead).
    //
    // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM
    // round trip destroys precision beyond repair.
    //
    // Full host early Z and MSAA with pixel-rate shading are supported.
    kOnCopy,
    // Converting the depth to the closest host value representable exactly as a
    // 20e4 float in pixel shaders, to support invariance in cases when the
    // guest reuploads a previously resolved depth buffer to the EDRAM, rounding
    // towards zero (which contradicts the rounding used by the Direct3D 9
    // reference rasterizer, but allows less-than-or-equal pixel shader depth
    // output to be used to preserve most of early Z culling when the game is
    // using reversed depth, which is the usual way of doing depth testing on
    // the Xbox 360 and of utilizing the advantages of a floating-point
    // encoding).
    //
    // With MSAA, pixel shaders must run at sample frequency - otherwise, if the
    // depth is the same for the entire pixel, intersections of polygons cannot
    // be antialiased.
    //
    // Important usage note: When using this mode, bounds of the fixed-function
    // viewport must be converted to and back from float24 too (preferably using
    // correct rounding to the nearest even, to reduce the error already caused
    // by truncation rather than to amplify it). This ensures that clamping to
    // the viewport bounds, which happens after the pixel shader even if it
    // overwrites the resulting depth, is never done to a value not
    // representable as float24 (for example, if the minimum Z is a number too
    // small to be represented as float24, but not zero, it won't be possible to
    // write what should become 0x000000 to the depth buffer). Note that this
    // may add some error to the depth values from the rasterizer; however,
    // modifying Z in the vertex shader to make interpolated depth values would
    // cause clipping to be done to different bounds, which may be more
    // undesirable, especially in cases when Z is explicitly set to a value like
    // 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
    // space and disappear).
    kOnOutputTruncating,
    // Similar to kOnOutputTruncating, but rounding to the nearest even, more
    // correctly, however, because the resulting depth can be bigger than the
    // original host value, early depth testing can't be used at all. Same
    // viewport usage rules apply.
    kOnOutputRounding,
  };
  // Useful host-specific values.
  // sRGB conversion from the Direct3D 11.3 functional specification.
  static constexpr float kSrgbToLinearDenominator1 = 12.92f;
@ -512,8 +460,6 @@ class RenderTargetCache {
    }
  };
  static DepthFloat24Conversion GetConfigDepthFloat24Conversion();
  virtual uint32_t GetMaxRenderTargetWidth() const = 0;
  virtual uint32_t GetMaxRenderTargetHeight() const = 0;
--- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h
+++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h
@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z
 ult [precise(z)] r0.z, r0.x, l(0x38800000)
 iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
 movc [precise(x)] r0.x, r0.z, r0.y, r0.x
-iadd [precise(y)] r0.y, r0.x, l(3)
+ubfe [precise(y)] r0.y, l(1), l(3), r0.x
-ubfe [precise(x)] r0.x, l(1), l(3), r0.x
+iadd [precise(x)] r0.x, r0.y, r0.x
-iadd [precise(x)] r0.x, r0.x, r0.y
+iadd [precise(x)] r0.x, r0.x, l(3)
 ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
 firstbit_hi [precise(w)] r0.w, r0.y
 iadd [precise(w)] r0.w, r0.w, l(-11)
@ -76,10 +76,10 @@ ret
 const BYTE float24_round_ps[] =
 {
-     68,  88,  66,  67, 229,  54, 
+     68,  88,  66,  67, 110,  79, 
-     46,   1, 194,  31, 164, 202, 
+     84, 202, 151, 165, 237, 180, 
-    193,  71, 175, 129,  44,  52, 
+     64,  17,   0, 132, 236, 126, 
-    218, 154,   1,   0,   0,   0, 
+    142, 105,   1,   0,   0,   0, 
      8,   7,   0,   0,   5,   0, 
      0,   0,  52,   0,   0,   0, 
    160,   0,   0,   0, 120,   2, 
@ -259,22 +259,22 @@ const BYTE float24_round_ps[] =
      0,   0,   0,   0,  26,   0, 
     16,   0,   0,   0,   0,   0, 
     10,   0,  16,   0,   0,   0, 
-      0,   0,  30,   0,  16,   7, 
+      0,   0, 138,   0,  16,   9, 
     34,   0,  16,   0,   0,   0, 
      0,   0,  10,   0,  16,   0, 
      0,   0,   0,   0,   1,  64, 
      0,   0,   3,   0,   0,   0, 
    138,   0,   8,   9,  18,   0, 
     16,   0,   0,   0,   0,   0, 
      1,  64,   0,   0,   1,   0, 
      0,   0,   1,  64,   0,   0, 
-      3,   0,   0,   0,  10,   0, 
+      1,   0,   0,   0,   1,  64, 
      0,   0,   3,   0,   0,   0, 
     10,   0,  16,   0,   0,   0, 
      0,   0,  30,   0,   8,   7, 
     18,   0,  16,   0,   0,   0, 
      0,   0,  26,   0,  16,   0, 
      0,   0,   0,   0,  10,   0, 
     16,   0,   0,   0,   0,   0, 
     30,   0,   8,   7,  18,   0, 
     16,   0,   0,   0,   0,   0, 
     10,   0,  16,   0,   0,   0, 
-      0,   0,  26,   0,  16,   0, 
+      0,   0,   1,  64,   0,   0, 
-      0,   0,   0,   0, 138,   0, 
+      3,   0,   0,   0, 138,   0, 
     56,  15, 114,   0,  16,   0, 
      0,   0,   0,   0,   2,  64, 
      0,   0,  24,   0,   0,   0, 
--- a/src/xenia/gpu/shaders/float24_round.ps.hlsl
+++ b/src/xenia/gpu/shaders/float24_round.ps.hlsl
@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth {
  // allow for safe reinterpretation of any 24-bit value to and from float24
  // depth using depth output without unrestricted depth range.
  return asfloat(XeFloat20e4To32(
-      XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true));
+      XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true),
      true));
 }
--- a/src/xenia/gpu/shaders/pixel_formats.xesli
+++ b/src/xenia/gpu/shaders/pixel_formats.xesli
@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) {
 // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
 // We also can't clamp the stored value to 1 as load->store->load must be exact.
-uint XeFloat32To20e4(uint f32u32) {
+uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) {
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to the maximum representable value near 2 (also dropping NaNs).
  f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
  uint denormalized =
      ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
  uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
-  return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
+  if (round_to_nearest_even) {
    f24u32 += 3u + ((f24u32 >> 3u) & 1u);
  }
  return (f24u32 >> 3u) & 0xFFFFFFu;
 }
 uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {
--- a/src/xenia/gpu/xenos.cc
+++ b/src/xenia/gpu/xenos.cc
@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) {
 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
 // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
-uint32_t Float32To20e4(float f32) {
+uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
  if (!(f32 > 0.0f)) {
    // Positive only, and not -0 or NaN.
    return 0;
@ -145,7 +145,10 @ uint32_t Float32To20e4(float f32) {
    // Rebias the exponent to represent the value as a normalized 20e4.
    f32u32 += 0xC8000000u;
  }
-  return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF;
+  if (round_to_nearest_even) {
    f32u32 += 3 + ((f32u32 >> 3) & 1);
  }
  return (f32u32 >> 3) & 0xFFFFFF;
 }
 float Float20e4To32(uint32_t f24) {
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10);
 // Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
 // floating-point number.
 // Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
-// depth, rounding to the nearest even.
+// depth, rounding to the nearest even or towards zero.
-uint32_t Float32To20e4(float f32);
+uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
 // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
 // IEEE-754 32-bit floating-point number.
 float Float20e4To32(uint32_t f24);