From 4514050f5549638bc377edf86f4b919bf91778f4 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Wed, 22 Jun 2022 13:25:06 +0300
Subject: [PATCH] [Vulkan] Truncate depth to float24 in EDRAM range ownership
 transfers and resolves by default

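Truncation (rounding towards zero) never produces a stored value greater than the original, so it doesn't break the "greater or equal" depth test in subsequent rendering passes when precision is lost, unlike rounding to the nearest.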
---
 src/xenia/gpu/spirv_shader_translator.h       |  5 ++--
 src/xenia/gpu/spirv_shader_translator_rb.cc   | 30 ++++++++++---------
 .../gpu/vulkan/vulkan_render_target_cache.cc  | 14 ++++++---
 .../gpu/vulkan/vulkan_render_target_cache.h   |  4 +++
 4 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index 3df49136f..075279848 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -208,10 +208,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
                               spv::Id ext_inst_glsl_std_450);
   // Converts the depth value externally clamped to the representable [0, 2)
   // range to 20e4 floating point, with zeros in bits 24:31, rounding to the
-  // nearest even. If remap_from_0_to_0_5 is true, it's assumed that 0...1 is
-  // pre-remapped to 0...0.5 in the input.
+  // nearest even or towards zero. If remap_from_0_to_0_5 is true, it's assumed
+  // that 0...1 is pre-remapped to 0...0.5 in the input.
   static spv::Id PreClampedDepthTo20e4(spv::Builder& builder,
                                        spv::Id f32_scalar,
+                                       bool round_to_nearest_even,
                                        bool remap_from_0_to_0_5,
                                        spv::Id ext_inst_glsl_std_450);
   // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc
index 4cb260bdd..8282016b5 100644
--- a/src/xenia/gpu/spirv_shader_translator_rb.cc
+++ b/src/xenia/gpu/spirv_shader_translator_rb.cc
@@ -230,8 +230,8 @@ spv::Id SpirvShaderTranslator::Float7e3To32(spv::Builder& builder,
 }
 
 spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4(
-    spv::Builder& builder, spv::Id f32_scalar, bool remap_from_0_to_0_5,
-    spv::Id ext_inst_glsl_std_450) {
+    spv::Builder& builder, spv::Id f32_scalar, bool round_to_nearest_even,
+    bool remap_from_0_to_0_5, spv::Id ext_inst_glsl_std_450) {
   // CFloat24 from d3dref9.dll +
   // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
   // Assuming the value is already clamped to [0, 2) (in all places, the depth
@@ -305,18 +305,20 @@ spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4(
           builder.makeUintConstant(0x38800000 - (remap_bias << 23))),
       denormal_biased_f32, normal_biased_f32);
 
-  // Build the 20e4 number rounding to the nearest even.
-  // ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
-  return builder.createTriOp(
-      spv::OpBitFieldUExtract, type_uint,
-      builder.createBinOp(
-          spv::OpIAdd, type_uint,
-          builder.createBinOp(spv::OpIAdd, type_uint, biased_f32,
-                              builder.makeUintConstant(3)),
-          builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32,
-                              builder.makeUintConstant(3),
-                              builder.makeUintConstant(1))),
-      builder.makeUintConstant(3), builder.makeUintConstant(24));
+  // Build the 20e4 number rounding to the nearest even or towards zero.
+  if (round_to_nearest_even) {
+    // biased_f32 += 3 + ((biased_f32 >> 3) & 1)
+    biased_f32 = builder.createBinOp(
+        spv::OpIAdd, type_uint,
+        builder.createBinOp(spv::OpIAdd, type_uint, biased_f32,
+                            builder.makeUintConstant(3)),
+        builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32,
+                            builder.makeUintConstant(3),
+                            builder.makeUintConstant(1)));
+  }
+  return builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32,
+                             builder.makeUintConstant(3),
+                             builder.makeUintConstant(24));
 }
 
 spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder,
diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc
index 4d021ca7a..4d8545fd0 100644
--- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc
+++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc
@@ -416,6 +416,8 @@ bool VulkanRenderTargetCache::Initialize() {
 
   // TODO(Triang3l): All paths (FSI).
 
+  depth_float24_round_ = cvars::depth_float24_round;
+
   // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in
   // transfers.
   if (cvars::native_2x_msaa) {
@@ -3037,7 +3039,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
           } break;
           case xenos::DepthRenderTargetFormat::kD24FS8: {
             depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4(
-                builder, source_depth_float[i], true, ext_inst_glsl_std_450);
+                builder, source_depth_float[i], depth_float24_round(), true,
+                ext_inst_glsl_std_450);
           } break;
         }
         // Merge depth and stencil.
@@ -3353,7 +3356,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
           } break;
           case xenos::DepthRenderTargetFormat::kD24FS8: {
             packed = SpirvShaderTranslator::PreClampedDepthTo20e4(
-                builder, source_depth_float[0], true, ext_inst_glsl_std_450);
+                builder, source_depth_float[0], depth_float24_round(), true,
+                ext_inst_glsl_std_450);
           } break;
         }
         if (mode.output == TransferOutput::kDepth) {
@@ -3855,7 +3859,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
               } break;
               case xenos::DepthRenderTargetFormat::kD24FS8: {
                 host_depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4(
-                    builder, host_depth32, true, ext_inst_glsl_std_450);
+                    builder, host_depth32, depth_float24_round(), true,
+                    ext_inst_glsl_std_450);
               } break;
             }
             assert_true(host_depth24 != spv::NoResult);
@@ -5548,7 +5553,8 @@ VkPipeline VulkanRenderTargetCache::GetDumpPipeline(DumpPipelineKey key) {
       } break;
       case xenos::DepthRenderTargetFormat::kD24FS8: {
         packed[0] = SpirvShaderTranslator::PreClampedDepthTo20e4(
-            builder, source_depth32, true, ext_inst_glsl_std_450);
+            builder, source_depth32, depth_float24_round(), true,
+            ext_inst_glsl_std_450);
       } break;
     }
     id_vector_temp.clear();
diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h
index 10b2c1aed..2857fde1f 100644
--- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h
+++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h
@@ -128,6 +128,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
     return last_update_framebuffer_;
   }
 
+  bool depth_float24_round() const { return depth_float24_round_; }
+
   bool msaa_2x_attachments_supported() const {
     return msaa_2x_attachments_supported_;
   }
@@ -824,6 +826,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
 
   bool gamma_render_target_as_srgb_ = false;
 
+  bool depth_float24_round_ = false;
+
   bool msaa_2x_attachments_supported_ = false;
   bool msaa_2x_no_attachments_supported_ = false;