From 8d07c798978e6e8a2df729b338d33276225b7172 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Sun, 13 Feb 2022 20:50:31 +0300
Subject: [PATCH] [GPU] Cleanup RB_COLOR_MASK and RB_DEPTHCONTROL normalization

---
 .../gpu/d3d12/d3d12_command_processor.cc      | 35 +++-----
 src/xenia/gpu/d3d12/d3d12_command_processor.h | 12 +--
 src/xenia/gpu/d3d12/pipeline_cache.cc         | 16 ++--
 src/xenia/gpu/d3d12/pipeline_cache.h          |  2 +
 src/xenia/gpu/draw_util.cc                    | 45 +++++++++-
 src/xenia/gpu/draw_util.h                     | 13 ++-
 src/xenia/gpu/render_target_cache.cc          | 85 +++++++++----------
 src/xenia/gpu/render_target_cache.h           |  2 +-
 8 files changed, 114 insertions(+), 96 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index ccab20972..bcf5367bb 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -103,22 +103,6 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) {
   render_target_cache_->RestoreEdramSnapshot(snapshot);
 }
 
-uint32_t D3D12CommandProcessor::GetCurrentColorMask(
-    uint32_t shader_writes_color_targets) const {
-  auto& regs = *register_file_;
-  if (regs.Get<reg::RB_MODECONTROL>().edram_mode !=
-      xenos::ModeControl::kColorDepth) {
-    return 0;
-  }
-  uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF;
-  for (uint32_t i = 0; i < 4; ++i) {
-    if (!(shader_writes_color_targets & (1 << i))) {
-      color_mask &= ~(0xF << (i * 4));
-    }
-  }
-  return color_mask;
-}
-
 void D3D12CommandProcessor::PushTransitionBarrier(
     ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
     D3D12_RESOURCE_STATES new_state, UINT subresource) {
@@ -2152,10 +2136,12 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
           : DxbcShaderTranslator::Modification(0);
 
   // Set up the render targets - this may perform dispatches and draws.
-  uint32_t pixel_shader_writes_color_targets =
-      pixel_shader ? pixel_shader->writes_color_targets() : 0;
+  uint32_t normalized_color_mask =
+      pixel_shader ? draw_util::GetNormalizedColorMask(
+                         regs, pixel_shader->writes_color_targets())
+                   : 0;
   if (!render_target_cache_->Update(is_rasterization_done,
-                                    pixel_shader_writes_color_targets)) {
+                                    normalized_color_mask)) {
     return false;
   }
 
@@ -2186,7 +2172,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   ID3D12RootSignature* root_signature;
   if (!pipeline_cache_->ConfigurePipeline(
           vertex_shader_translation, pixel_shader_translation,
-          primitive_processing_result, bound_depth_and_color_render_target_bits,
+          primitive_processing_result, normalized_color_mask,
+          bound_depth_and_color_render_target_bits,
           bound_depth_and_color_render_target_formats, &pipeline_handle,
           &root_signature)) {
     return false;
@@ -2241,9 +2228,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       memexport_used, primitive_polygonal,
       primitive_processing_result.line_loop_closing_index,
       primitive_processing_result.host_index_endian, viewport_info,
-      used_texture_mask,
-      pixel_shader ? GetCurrentColorMask(pixel_shader->writes_color_targets())
-                   : 0);
+      used_texture_mask, normalized_color_mask);
 
   // Update constant buffers, descriptors and root parameters.
   if (!UpdateBindings(vertex_shader, pixel_shader, root_signature)) {
@@ -3114,7 +3099,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
     bool shared_memory_is_uav, bool primitive_polygonal,
     uint32_t line_loop_closing_index, xenos::Endian index_endian,
     const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
-    uint32_t color_mask) {
+    uint32_t normalized_color_mask) {
 #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@@ -3161,7 +3146,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
       // Get the mask for keeping previous color's components unmodified,
       // or two UINT32_MAX if no colors actually existing in the RT are written.
       DxbcShaderTranslator::ROV_GetColorFormatSystemConstants(
-          color_info.color_format, (color_mask >> (i * 4)) & 0b1111,
+          color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111,
           rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3],
           rt_keep_masks[i][0], rt_keep_masks[i][1]);
     }
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index abdd1fbda..032673704 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -83,16 +83,6 @@ class D3D12CommandProcessor : public CommandProcessor {
   uint64_t GetCurrentFrame() const { return frame_current_; }
   uint64_t GetCompletedFrame() const { return frame_completed_; }
 
-  // Gets the current color write mask, taking the pixel shader's write mask
-  // into account. If a shader doesn't write to a render target, it shouldn't be
-  // written to and it shouldn't be even bound - otherwise, in 4D5307E6, one
-  // render target is being destroyed by a shader not writing anything, and in
-  // 58410955, the result of clearing the top tile is being ignored because
-  // there are 4 render targets bound with the same EDRAM base (clearly not
-  // correct usage), but the shader only clears 1, and then EDRAM buffer stores
-  // conflict with each other.
-  uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const;
-
   void PushTransitionBarrier(
       ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
       D3D12_RESOURCE_STATES new_state,
@@ -362,7 +352,7 @@ class D3D12CommandProcessor : public CommandProcessor {
                                   xenos::Endian index_endian,
                                   const draw_util::ViewportInfo& viewport_info,
                                   uint32_t used_texture_mask,
-                                  uint32_t color_mask);
+                                  uint32_t normalized_color_mask);
   bool UpdateBindings(const D3D12Shader* vertex_shader,
                       const D3D12Shader* pixel_shader,
                       ID3D12RootSignature* root_signature);
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc
index 3a0cb3313..ff75f4151 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@@ -2,7 +2,7 @@
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project                                 *
  ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
  * Released under the BSD license - see LICENSE in the root for more details. *
  ******************************************************************************
  */
@@ -934,6 +934,7 @@ bool PipelineCache::ConfigurePipeline(
     D3D12Shader::D3D12Translation* vertex_shader,
     D3D12Shader::D3D12Translation* pixel_shader,
     const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+    uint32_t normalized_color_mask,
     uint32_t bound_depth_and_color_render_target_bits,
     const uint32_t* bound_depth_and_color_render_target_formats,
     void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) {
@@ -1005,7 +1006,7 @@ bool PipelineCache::ConfigurePipeline(
   PipelineRuntimeDescription runtime_description;
   if (!GetCurrentStateDescription(
           vertex_shader, pixel_shader, primitive_processing_result,
-          bound_depth_and_color_render_target_bits,
+          normalized_color_mask, bound_depth_and_color_render_target_bits,
           bound_depth_and_color_render_target_formats, runtime_description)) {
     return false;
   }
@@ -1272,6 +1273,7 @@ bool PipelineCache::GetCurrentStateDescription(
     D3D12Shader::D3D12Translation* vertex_shader,
     D3D12Shader::D3D12Translation* pixel_shader,
     const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+    uint32_t normalized_color_mask,
     uint32_t bound_depth_and_color_render_target_bits,
     const uint32_t* bound_depth_and_color_render_target_formats,
     PipelineRuntimeDescription& runtime_description_out) {
@@ -1547,10 +1549,6 @@ bool PipelineCache::GetCurrentStateDescription(
 
     // Render targets and blending state. 32 because of 0x1F mask, for safety
     // (all unknown to zero).
-    uint32_t color_mask =
-        pixel_shader ? command_processor_.GetCurrentColorMask(
-                           pixel_shader->shader().writes_color_targets())
-                     : 0;
     static const PipelineBlendFactor kBlendFactorMap[32] = {
         /*  0 */ PipelineBlendFactor::kZero,
         /*  1 */ PipelineBlendFactor::kOne,
@@ -1622,8 +1620,7 @@ bool PipelineCache::GetCurrentStateDescription(
           reg::RB_COLOR_INFO::rt_register_indices[i]);
       rt.format = xenos::ColorRenderTargetFormat(
           bound_depth_and_color_render_target_formats[1 + i]);
-      // TODO(Triang3l): Normalize unused bits of the color write mask.
-      rt.write_mask = (color_mask >> (i * 4)) & 0xF;
+      rt.write_mask = (normalized_color_mask >> (i * 4)) & 0xF;
       if (rt.write_mask) {
         auto blendcontrol = regs.Get<reg::RB_BLENDCONTROL>(
             reg::RB_BLENDCONTROL::rt_register_indices[i]);
@@ -2017,9 +2014,6 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
       }
       D3D12_RENDER_TARGET_BLEND_DESC& blend_desc =
           state_desc.BlendState.RenderTarget[i];
-      // Treat 1 * src + 0 * dest as disabled blending (there are opaque
-      // surfaces drawn with blending enabled, but it's 1 * src + 0 * dest, in
-      // 415607E6 - GPU performance is better when not blending.
       if (rt.src_blend != PipelineBlendFactor::kOne ||
           rt.dest_blend != PipelineBlendFactor::kZero ||
           rt.blend_op != xenos::BlendOp::kAdd ||
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h
index 290c47996..2f8f8b02b 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.h
+++ b/src/xenia/gpu/d3d12/pipeline_cache.h
@@ -82,6 +82,7 @@ class PipelineCache {
       D3D12Shader::D3D12Translation* vertex_shader,
       D3D12Shader::D3D12Translation* pixel_shader,
       const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+      uint32_t normalized_color_mask,
       uint32_t bound_depth_and_color_render_target_bits,
       const uint32_t* bound_depth_and_color_render_targets_formats,
       void** pipeline_handle_out, ID3D12RootSignature** root_signature_out);
@@ -247,6 +248,7 @@ class PipelineCache {
       D3D12Shader::D3D12Translation* vertex_shader,
       D3D12Shader::D3D12Translation* pixel_shader,
       const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+      uint32_t normalized_color_mask,
       uint32_t bound_depth_and_color_render_target_bits,
       const uint32_t* bound_depth_and_color_render_target_formats,
       PipelineRuntimeDescription& runtime_description_out);
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc
index 83ea6601d..370ffeb43 100644
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@@ -2,7 +2,7 @@
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project                                 *
  ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
  * Released under the BSD license - see LICENSE in the root for more details. *
  ******************************************************************************
  */
@@ -550,6 +550,49 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
   scissor_out.extent[1] = uint32_t(br_y - tl_y);
 }
 
+uint32_t GetNormalizedColorMask(const RegisterFile& regs,
+                                uint32_t pixel_shader_writes_color_targets) {
+  if (regs.Get<reg::RB_MODECONTROL>().edram_mode !=
+      xenos::ModeControl::kColorDepth) {
+    return 0;
+  }
+  uint32_t normalized_color_mask = 0;
+  uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
+  for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
+    // Exclude the render targets not statically written to by the pixel shader.
+    // If the shader doesn't write to a render target, it shouldn't be written
+    // to, and no ownership transfers should happen to it on the host even -
+    // otherwise, in 4D5307E6, one render target is being destroyed by a shader
+    // not writing anything, and in 58410955, the result of clearing the top
+    // tile is being ignored because there are 4 render targets bound with the
+    // same EDRAM base (clearly not correct usage), but the shader only clears
+    // 1, and then ownership of EDRAM portions by host render targets is
+    // conflicting.
+    if (!(pixel_shader_writes_color_targets & (uint32_t(1) << i))) {
+      continue;
+    }
+    // Check if any existing component is written to.
+    uint32_t format_component_mask =
+        (uint32_t(1) << xenos::GetColorRenderTargetFormatComponentCount(
+             regs.Get<reg::RB_COLOR_INFO>(
+                     reg::RB_COLOR_INFO::rt_register_indices[i])
+                 .color_format)) -
+        1;
+    uint32_t rt_write_mask = (rb_color_mask >> (4 * i)) & format_component_mask;
+    if (!rt_write_mask) {
+      continue;
+    }
+    // Mark the non-existent components as written so in the host driver, no
+    // slow path (involving reading and merging components) is taken if the
+    // driver doesn't perform this check internally, and some components are not
+    // included in the mask even though they actually don't exist in the format.
+    rt_write_mask |= 0b1111 & ~format_component_mask;
+    // Add to the normalized mask.
+    normalized_color_mask |= rt_write_mask << (4 * i);
+  }
+  return normalized_color_mask;
+}
+
 xenos::CopySampleSelect SanitizeCopySampleSelect(
     xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
     bool is_depth) {
diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h
index 39d430676..1474461ec 100644
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@@ -2,7 +2,7 @@
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project                                 *
  ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
  * Released under the BSD license - see LICENSE in the root for more details. *
  ******************************************************************************
  */
@@ -186,6 +186,17 @@ struct Scissor {
 void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
                 bool clamp_to_surface_pitch = true);
 
+// Returns the color component write mask for the draw command taking into
+// account which color targets are written to by the pixel shader, as well as
+// components that don't exist in the formats of the render targets (render
+// targets with only non-existent components written are skipped, but
+// non-existent components are forced to written if some existing components of
+// the render target are actually used to make sure the host driver doesn't try
+// to take a slow path involving reading and mixing if there are any disabled
+// components even if they don't actually exist).
+uint32_t GetNormalizedColorMask(const RegisterFile& regs,
+                                uint32_t pixel_shader_writes_color_targets);
+
 // Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit
 // multiplication result, for fast division and multiplication by
 // EDRAM-tile-related amounts.
diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc
index 9ee219648..9e1fe50ff 100644
--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
@@ -366,7 +366,7 @@ void RenderTargetCache::ClearCache() {
 void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }
 
 bool RenderTargetCache::Update(bool is_rasterization_done,
-                               uint32_t shader_writes_color_targets) {
+                               uint32_t normalized_color_mask) {
   const RegisterFile& regs = register_file();
   bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;
 
@@ -419,9 +419,6 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
     }
   }
 
-  uint32_t rts_remaining;
-  uint32_t rt_index;
-
   // Get used render targets.
   // [0] is depth / stencil where relevant, [1...4] is color.
   // Depth / stencil testing / writing is before color in the pipeline.
@@ -432,7 +429,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
   uint32_t rts_are_64bpp = 0;
   uint32_t color_rts_are_gamma = 0;
   if (is_rasterization_done) {
-    auto rb_depthcontrol = regs.Get<reg::RB_DEPTHCONTROL>();
+    auto rb_depthcontrol = draw_util::GetDepthControlForCurrentEdramMode(regs);
     if (rb_depthcontrol.z_enable || rb_depthcontrol.stencil_enable) {
       depth_and_color_rts_used_bits |= 1;
       auto rb_depth_info = regs.Get<reg::RB_DEPTH_INFO>();
@@ -445,50 +442,46 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
       resource_formats[0] =
           interlock_barrier_only ? 0 : uint32_t(rb_depth_info.depth_format);
     }
-    if (regs.Get<reg::RB_MODECONTROL>().edram_mode ==
-        xenos::ModeControl::kColorDepth) {
-      uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
-      rts_remaining = shader_writes_color_targets;
-      while (xe::bit_scan_forward(rts_remaining, &rt_index)) {
-        rts_remaining &= ~(uint32_t(1) << rt_index);
-        auto color_info = regs.Get<reg::RB_COLOR_INFO>(
-            reg::RB_COLOR_INFO::rt_register_indices[rt_index]);
-        xenos::ColorRenderTargetFormat color_format =
-            regs.Get<reg::RB_COLOR_INFO>(
-                    reg::RB_COLOR_INFO::rt_register_indices[rt_index])
-                .color_format;
-        if ((rb_color_mask >> (rt_index * 4)) &
-            ((uint32_t(1) << xenos::GetColorRenderTargetFormatComponentCount(
-                  color_format)) -
-             1)) {
-          uint32_t rt_bit_index = 1 + rt_index;
-          depth_and_color_rts_used_bits |= uint32_t(1) << rt_bit_index;
-          edram_bases[rt_bit_index] =
-              std::min(color_info.color_base, xenos::kEdramTileCount);
-          bool is_64bpp = xenos::IsColorRenderTargetFormat64bpp(color_format);
-          if (is_64bpp) {
-            rts_are_64bpp |= uint32_t(1) << rt_bit_index;
-          }
-          if (color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
-            color_rts_are_gamma |= uint32_t(1) << rt_index;
-          }
-          xenos::ColorRenderTargetFormat color_resource_format;
-          if (interlock_barrier_only) {
-            // Only changes in mapping between coordinates and addresses are
-            // interesting (along with access overlap between draw calls), thus
-            // only pixel size is relevant.
-            color_resource_format =
-                is_64bpp ? xenos::ColorRenderTargetFormat::k_16_16_16_16
-                         : xenos::ColorRenderTargetFormat::k_8_8_8_8;
-          } else {
-            color_resource_format = GetColorResourceFormat(
-                xenos::GetStorageColorFormat(color_format));
-          }
-          resource_formats[rt_bit_index] = uint32_t(color_resource_format);
-        }
+    for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
+      if (!(normalized_color_mask & (uint32_t(0b1111) << (4 * i)))) {
+        continue;
       }
+      auto color_info = regs.Get<reg::RB_COLOR_INFO>(
+          reg::RB_COLOR_INFO::rt_register_indices[i]);
+      uint32_t rt_bit_index = 1 + i;
+      depth_and_color_rts_used_bits |= uint32_t(1) << rt_bit_index;
+      edram_bases[rt_bit_index] =
+          std::min(color_info.color_base, xenos::kEdramTileCount);
+      xenos::ColorRenderTargetFormat color_format =
+          regs.Get<reg::RB_COLOR_INFO>(
+                  reg::RB_COLOR_INFO::rt_register_indices[i])
+              .color_format;
+      bool is_64bpp = xenos::IsColorRenderTargetFormat64bpp(color_format);
+      if (is_64bpp) {
+        rts_are_64bpp |= uint32_t(1) << rt_bit_index;
+      }
+      if (color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
+        color_rts_are_gamma |= uint32_t(1) << i;
+      }
+      xenos::ColorRenderTargetFormat color_resource_format;
+      if (interlock_barrier_only) {
+        // Only changes in mapping between coordinates and addresses are
+        // interesting (along with access overlap between draw calls), thus only
+        // pixel size is relevant.
+        color_resource_format =
+            is_64bpp ? xenos::ColorRenderTargetFormat::k_16_16_16_16
+                     : xenos::ColorRenderTargetFormat::k_8_8_8_8;
+      } else {
+        color_resource_format =
+            GetColorResourceFormat(xenos::GetStorageColorFormat(color_format));
+      }
+      resource_formats[rt_bit_index] = uint32_t(color_resource_format);
     }
   }
+
+  uint32_t rts_remaining;
+  uint32_t rt_index;
+
   // Eliminate other bound render targets if their EDRAM base conflicts with
   // another render target - it's an error in most host implementations to bind
   // the same render target into multiple slots, also the behavior would be
diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h
index 263d4fe60..743946438 100644
--- a/src/xenia/gpu/render_target_cache.h
+++ b/src/xenia/gpu/render_target_cache.h
@@ -215,7 +215,7 @@ class RenderTargetCache {
   virtual void BeginFrame();
 
   virtual bool Update(bool is_rasterization_done,
-                      uint32_t shader_writes_color_targets);
+                      uint32_t normalized_color_mask);
 
   // Returns bits where 0 is whether a depth render target is currently bound on
   // the host and 1... are whether the same applies to color render targets, and