[GPU] Get unclipped draw height by running VS on the CPU

2022-04-28 22:25:25 +03:00 · 2022-04-28 22:25:25 +03:00 · 0fd578cafd
parent b2b1d7b518
commit 0fd578cafd
12 changed files with 1866 additions and 53 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() {
  // Initialize the render target cache before configuring binding - need to
  // know if using rasterizer-ordered views for the bindless root signature.
  render_target_cache_ = std::make_unique<D3D12RenderTargetCache>(
-      *register_file_, *this, trace_writer_, bindless_resources_used_);
+      *register_file_, *memory_, trace_writer_, *this,
      bindless_resources_used_);
  if (!render_target_cache_->Initialize()) {
    XELOGE("Failed to initialize the render target cache");
    return false;
@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
                   : 0;
  if (!render_target_cache_->Update(is_rasterization_done,
                                    normalized_depth_control,
-                                    normalized_color_mask)) {
+                                    normalized_color_mask, *vertex_shader)) {
    return false;
  }
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() {
 bool D3D12RenderTargetCache::Update(
    bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control,
-    uint32_t shader_writes_color_targets) {
+    uint32_t shader_writes_color_targets, const Shader& vertex_shader) {
  if (!RenderTargetCache::Update(is_rasterization_done,
                                 normalized_depth_control,
-                                 shader_writes_color_targets)) {
+                                 shader_writes_color_targets, vertex_shader)) {
    return false;
  }
  switch (GetPath()) {
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
@ -43,10 +43,10 @@ class D3D12CommandProcessor;
 class D3D12RenderTargetCache final : public RenderTargetCache {
 public:
  D3D12RenderTargetCache(const RegisterFile& register_file,
                         const Memory& memory, TraceWriter& trace_writer,
                         D3D12CommandProcessor& command_processor,
                         TraceWriter& trace_writer,
                         bool bindless_resources_used)
-      : RenderTargetCache(register_file),
+      : RenderTargetCache(register_file, memory, &trace_writer),
        command_processor_(command_processor),
        trace_writer_(trace_writer),
        bindless_resources_used_(bindless_resources_used) {}
@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
  bool Update(bool is_rasterization_done,
              reg::RB_DEPTHCONTROL normalized_depth_control,
-              uint32_t shader_writes_color_targets) override;
+              uint32_t shader_writes_color_targets,
              const Shader& vertex_shader) override;
  void InvalidateCommandListRenderTargets() {
    are_current_command_list_render_targets_valid_ = false;
--- a/src/xenia/gpu/draw_extent_estimator.cc
+++ b/src/xenia/gpu/draw_extent_estimator.cc
@ -0,0 +1,350 @@
 /**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
 #include "xenia/gpu/draw_extent_estimator.h"
 #include <algorithm>
 #include <cfloat>
 #include <cstdint>
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/profiling.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/ucode.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/graphics_util.h"
 // Master switch for estimating the extent of unclipped draws by running their
 // vertex shader on the CPU. Checked in DrawExtentEstimator::EstimateMaxY.
 DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu, true,
    "Execute the vertex shader for draws with clipping disabled, primarily "
    "screen-space draws (such as clears), on the CPU when possible to estimate "
    "the extent of the EDRAM involved in the draw.\n"
    "Enabling this may significantly improve GPU performance as otherwise up "
    "to the entire EDRAM may be considered used in draws without clipping, "
    "potentially resulting in spurious EDRAM range ownership transfer round "
    "trips between host render targets.\n"
    "Also, on hosts where certain render target formats have to be emulated in "
    "a lossy way (for instance, 16-bit fixed-point via 16-bit floating-point), "
    "this prevents corruption of other render targets located after the "
    "current ones in the EDRAM by lossy range ownership transfers done for "
    "those draws.",
    "GPU");
 // Lifts the "only when the scissor is effectively absent" restriction that
 // DrawExtentEstimator::EstimateMaxY otherwise applies before running the
 // vertex shader on the CPU.
 DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
    "non-scissored draws (with the right and the bottom sides of the scissor "
    "rectangle at 8192 or beyond) even though if the scissor rectangle is "
    "present, it's usually sufficient for estimating the height of the render "
    "target.\n"
    "Enabling this may cause excessive processing of vertices on the CPU, as "
    "some games draw rectangles (for their UI, for instance) without clipping, "
    "but with a proper scissor rectangle.",
    "GPU");
 namespace xe {
 namespace gpu {
 // Captures the exported components needed for the Y extent estimation: Y and W
 // of the position export, and the point size and the raw vertex kill bits of
 // the point size / edge flag / kill vertex export. Bit N of value_mask tells
 // whether value[N] is written by this export; unwritten components and all
 // other export registers are ignored.
 void DrawExtentEstimator::PositionYExportSink::Export(
    ucode::ExportRegister export_register, const float* value,
    uint32_t value_mask) {
  if (export_register == ucode::ExportRegister::kVSPosition) {
    if (value_mask & (UINT32_C(1) << 1)) {
      position_y_ = value[1];
    }
    if (value_mask & (UINT32_C(1) << 3)) {
      position_w_ = value[3];
    }
    return;
  }
  if (export_register !=
      ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
    return;
  }
  if (value_mask & (UINT32_C(1) << 0)) {
    point_size_ = value[0];
  }
  if (value_mask & (UINT32_C(1) << 2)) {
    // The kill flag is kept as the raw bit pattern of the exported float.
    vertex_kill_ = *reinterpret_cast<const uint32_t*>(value + 2);
  }
 }
 uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
  if (!vgt_draw_initiator.num_indices) {
    return 0;
  }
  if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA &&
      vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) {
    // TODO(Triang3l): Support immediate indices.
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  // Not reproducing tessellation.
  if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  assert_true(vertex_shader.type() == xenos::ShaderType::kVertex);
  assert_true(vertex_shader.is_ucode_analyzed());
  if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  union {
    const void* index_buffer;
    const uint16_t* index_buffer_16;
    const uint32_t* index_buffer_32;
  };
  xenos::Endian index_endian = vgt_dma_size.swap_mode;
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
    xenos::IndexFormat index_format = vgt_draw_initiator.index_size;
    uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32;
    uint32_t index_buffer_read_count =
        std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words);
    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
      // Handle the index endianness to same way as the PrimitiveProcessor.
      if (index_endian == xenos::Endian::k8in32) {
        index_endian = xenos::Endian::k8in16;
      } else if (index_endian == xenos::Endian::k16in32) {
        index_endian = xenos::Endian::kNone;
      }
      index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint16_t) * index_buffer_read_count);
      }
    } else {
      assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32);
      index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint32_t) * index_buffer_read_count);
      }
    }
    index_buffer = memory_.TranslatePhysical(index_buffer_base);
  }
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
  uint32_t reset_index =
      regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx;
  uint32_t index_offset = regs.Get<reg::VGT_INDX_OFFSET>().indx_offset;
  uint32_t min_index = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
  uint32_t max_index = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
  float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena
                               ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                               : 1.0f;
  float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena
                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                : 0.0f;
  int32_t point_vertex_min_diameter_float = 0;
  int32_t point_vertex_max_diameter_float = 0;
  float point_constant_radius_y = 0.0f;
  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
    *reinterpret_cast<float*>(&point_vertex_min_diameter_float) =
        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
    *reinterpret_cast<float*>(&point_vertex_max_diameter_float) =
        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
    point_constant_radius_y =
        float(regs.Get<reg::PA_SU_POINT_SIZE>().height) * (1.0f / 16.0f);
  }
  float max_y = -FLT_MAX;
  shader_interpreter_.SetShader(vertex_shader);
  PositionYExportSink position_y_export_sink;
  shader_interpreter_.SetExportSink(&position_y_export_sink);
  for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) {
    uint32_t vertex_index;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
      if (i < vgt_dma_size.num_words) {
        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
          vertex_index = index_buffer_16[i];
        } else {
          vertex_index = index_buffer_32[i];
        }
        // The Xenos only uses 24 bits of the index (reset_indx is 24-bit).
        vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF;
      } else {
        vertex_index = 0;
      }
      if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) {
        continue;
      }
    } else {
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kAutoIndex);
      vertex_index = i;
    }
    vertex_index =
        std::min(max_index,
                 std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF));
    position_y_export_sink.Reset();
    shader_interpreter_.temp_registers()[0] = float(vertex_index);
    shader_interpreter_.Execute();
    if (position_y_export_sink.vertex_kill().has_value() &&
        (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) {
      continue;
    }
    if (!position_y_export_sink.position_y().has_value()) {
      continue;
    }
    float vertex_y = position_y_export_sink.position_y().value();
    if (!pa_cl_vte_cntl.vtx_xy_fmt) {
      if (!position_y_export_sink.position_w().has_value()) {
        continue;
      }
      vertex_y /= position_y_export_sink.position_w().value();
    }
    vertex_y = vertex_y * viewport_y_scale + viewport_y_offset;
    if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
      float point_radius_y;
      if (position_y_export_sink.point_size().has_value()) {
        // Vertex-specified diameter. Clamped effectively as a signed integer in
        // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN
        // to the maximum.
        point_radius_y = position_y_export_sink.point_size().value();
        *reinterpret_cast<int32_t*>(&point_radius_y) = std::min(
            point_vertex_max_diameter_float,
            std::max(point_vertex_min_diameter_float,
                     *reinterpret_cast<const int32_t*>(&point_radius_y)));
        point_radius_y *= 0.5f;
      } else {
        // Constant radius.
        point_radius_y = point_constant_radius_y;
      }
      vertex_y += point_radius_y;
    }
    // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
    // always returned - max_y, which is initialized to a normalized value.
    max_y = std::max(max_y, vertex_y);
  }
  shader_interpreter_.SetExportSink(nullptr);
  int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y);
  // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here,
  // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. The
  // range of the window offset plus the half-pixel offset is -16384 to 16384.5,
  // so it's safe to add both - adding it will neither move the 16p8 clamping
  // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor
  // cause 24p8 overflow.
  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    max_y_24p8 += 128;
  }
  if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
    max_y_24p8 += regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset * 256;
  }
  // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA.
  auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) +
          ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127
                                                                     : 255)) >>
         8;
 }
 uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
                                           const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  int32_t window_y_offset = pa_sc_window_offset.window_y_offset;
  // Scissor.
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y);
  bool scissor_window_offset =
      !regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable;
  if (scissor_window_offset) {
    scissor_bottom += window_y_offset;
  }
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y);
  uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0)));
  if (regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
    // Actual extent from the vertices.
    if (try_to_estimate_vertex_max_y &&
        cvars::execute_unclipped_draw_vs_on_cpu) {
      bool estimate_vertex_max_y;
      if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) {
        estimate_vertex_max_y = true;
      } else {
        estimate_vertex_max_y = false;
        if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) {
          // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192
          // may be a normal render target height (80x8192 is well within the
          // EDRAM size, for instance), no need to process the vertices on the
          // CPU in this case.
          int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x);
          if (scissor_window_offset) {
            scissor_right += pa_sc_window_offset.window_x_offset;
          }
          scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x);
          if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) {
            estimate_vertex_max_y = true;
          }
        }
      }
      if (estimate_vertex_max_y) {
        max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader));
      }
    }
  } else {
    // Viewport. Though the Xenos itself doesn't have an implicit viewport
    // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
    // usually exists and can't be disabled.
    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
    float viewport_bottom = 0.0f;
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12 (but not WARP), also like in
    // draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (max_y is
    // well below 2^24) to safely drop very large values.
    max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom)));
  }
  return max_y;
 }
 }  // namespace gpu
 }  // namespace xe
--- a/src/xenia/gpu/draw_extent_estimator.h
+++ b/src/xenia/gpu/draw_extent_estimator.h
@ -0,0 +1,76 @@
 /**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
 #ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
 #define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
 #include <cstdint>
 #include <optional>
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/shader_interpreter.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/memory.h"
 namespace xe {
 namespace gpu {
 // Estimates the vertical extent of the current draw from the register state
 // and, for draws with clipping disabled, optionally from the vertex positions
 // obtained by interpreting the vertex shader on the CPU.
 class DrawExtentEstimator {
  public:
  // The references and the trace writer pointer are stored, not copied. The
  // trace writer may be null (it's checked before every use) and is also
  // forwarded to the internal shader interpreter.
  DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory,
                      TraceWriter* trace_writer)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shader_interpreter_(register_file, memory) {
    shader_interpreter_.SetTraceWriter(trace_writer);
  }
  // Returns the exclusive bottom bound of the vertices of the current draw, or
  // a conservative maximum if the draw can't be processed on the CPU.
  // The shader must have its ucode analyzed.
  uint32_t EstimateVertexMaxY(const Shader& vertex_shader);
  // Returns the exclusive bottom bound of the current draw based on the
  // scissor, and either the viewport or (if try_to_estimate_vertex_max_y is
  // set and the configuration allows it) EstimateVertexMaxY.
  uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y,
                        const Shader& vertex_shader);
  private:
  // Export sink remembering only the exported values needed for the Y extent
  // of a single vertex. Each value is empty until the shader exports it.
  class PositionYExportSink : public ShaderInterpreter::ExportSink {
   public:
    void Export(ucode::ExportRegister export_register, const float* value,
                uint32_t value_mask) override;
    // Forgets all the captured values - to be called before each vertex.
    void Reset() {
      position_y_.reset();
      position_w_.reset();
      point_size_.reset();
      vertex_kill_.reset();
    }
    const std::optional<float>& position_y() const { return position_y_; }
    const std::optional<float>& position_w() const { return position_w_; }
    const std::optional<float>& point_size() const { return point_size_; }
    // Raw bits of the exported kill value.
    const std::optional<uint32_t>& vertex_kill() const { return vertex_kill_; }
   private:
    std::optional<float> position_y_;
    std::optional<float> position_w_;
    std::optional<float> point_size_;
    std::optional<uint32_t> vertex_kill_;
  };
  const RegisterFile& register_file_;
  const Memory& memory_;
  // May be null.
  TraceWriter* trace_writer_;
  ShaderInterpreter shader_interpreter_;
 };
 }  // namespace gpu
 }  // namespace xe
 #endif  // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
--- a/src/xenia/gpu/registers.h
+++ b/src/xenia/gpu/registers.h
@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
 };
 static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));
 // Bit layout of the XE_GPU_REG_SQ_VS_CONST register. The `// +N` comments are
 // the bit offsets of the fields.
 union alignas(uint32_t) SQ_VS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST;
 };
 static_assert_size(SQ_VS_CONST, sizeof(uint32_t));
 // Bit layout of the XE_GPU_REG_SQ_PS_CONST register. Same as SQ_VS_CONST. The
 // `// +N` comments are the bit offsets of the fields.
 union alignas(uint32_t) SQ_PS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST;
 };
 static_assert_size(SQ_PS_CONST, sizeof(uint32_t));
 /*******************************************************************************
 __   _____ ___ _____ _____  __
 \ \ / / __| _ \_   _| __\ \/ /
--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
@ -22,7 +22,6 @@
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
@ -143,6 +142,19 @@ DEFINE_bool(
    "-1...1, remap -32...32 to -1...1 to use the full possible range of "
    "values, at the expense of multiplicative blending correctness.",
    "GPU");
 // Enabled by default as the GPU is overall usually the bottleneck when the
 // pixel shader interlock render backend implementation is used, anything that
 // may improve GPU performance is favorable.
 DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true,
    "If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader "
    "for unclipped draws on the CPU even when using the pixel shader interlock "
    "(rasterizer-ordered view) implementation of the render backend on the "
    "host, for which no expensive copying between host render targets is "
    "needed when the ownership of a EDRAM range is changed.\n"
    "If this is enabled, excessive barriers may be eliminated when switching "
    "between different render targets in separate EDRAM locations.",
    "GPU");
 namespace xe {
 namespace gpu {
@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }
 bool RenderTargetCache::Update(bool is_rasterization_done,
                               reg::RB_DEPTHCONTROL normalized_depth_control,
-                               uint32_t normalized_color_mask) {
+                               uint32_t normalized_color_mask,
                               const Shader& vertex_shader) {
  const RegisterFile& regs = register_file();
  bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;
@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
  // Estimate height used by render targets (for color for writes, for depth /
  // stencil for both reads and writes) from various sources.
-  uint32_t height_used =
+  uint32_t height_used = std::min(
-      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
+      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples),
-  int32_t window_y_offset =
+      draw_extent_estimator_.EstimateMaxY(
-      regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
+          interlock_barrier_only
-  if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
+              ? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend
-    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
+              : true,
-    float viewport_bottom = 0.0f;
+          vertex_shader));
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (cvars::half_pixel_offset &&
        !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (height_used
    // is well below 2^24) to safely drop very large values.
    height_used =
        uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom)));
  }
  int32_t scissor_bottom =
      int32_t(regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y);
  if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
    scissor_bottom += window_y_offset;
  }
  scissor_bottom =
      std::min(scissor_bottom, regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>().br_y);
  height_used =
      std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0))));
  // Sorted by EDRAM base and then by index in the pipeline - for simplicity,
  // treat render targets placed closer to the end of the EDRAM as truncating
--- a/src/xenia/gpu/render_target_cache.h
+++ b/src/xenia/gpu/render_target_cache.h
@ -21,9 +21,11 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/gpu/draw_extent_estimator.h"
 #include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/xenos.h"
 DECLARE_bool(depth_transfer_not_equal_test);
@ -217,7 +219,8 @@ class RenderTargetCache {
  virtual bool Update(bool is_rasterization_done,
                      reg::RB_DEPTHCONTROL normalized_depth_control,
-                      uint32_t normalized_color_mask);
+                      uint32_t normalized_color_mask,
                      const Shader& vertex_shader);
  // Returns bits where 0 is whether a depth render target is currently bound on
  // the host and 1... are whether the same applies to color render targets, and
@ -228,8 +231,10 @@ class RenderTargetCache {
      uint32_t* depth_and_color_formats_out = nullptr) const;
 protected:
-  RenderTargetCache(const RegisterFile& register_file)
+  RenderTargetCache(const RegisterFile& register_file, const Memory& memory,
-      : register_file_(register_file) {}
+                    TraceWriter* trace_writer)
      : register_file_(register_file),
        draw_extent_estimator_(register_file, memory, trace_writer) {}
  const RegisterFile& register_file() const { return register_file_; }
@ -606,6 +611,8 @@ class RenderTargetCache {
 private:
  const RegisterFile& register_file_;
  DrawExtentEstimator draw_extent_estimator_;
  // For host render targets.
  struct OwnershipRange {
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@ -914,6 +914,12 @@ class Shader {
  // True if the current shader has any `kill` instructions.
  bool kills_pixels() const { return kills_pixels_; }
  // True if the shader has any texture-related instructions (any fetch
  // instructions other than vertex fetch) writing any non-constant components.
  bool uses_texture_fetch_instruction_results() const {
    return uses_texture_fetch_instruction_results_;
  }
  // True if the shader overrides the pixel depth.
  bool writes_depth() const { return writes_depth_; }
@ -1002,6 +1008,7 @@ class Shader {
  uint32_t register_static_address_bound_ = 0;
  bool uses_register_dynamic_addressing_ = false;
  bool kills_pixels_ = false;
  bool uses_texture_fetch_instruction_results_ = false;
  bool writes_depth_ = false;
  uint32_t writes_color_targets_ = 0b0000;
--- a/src/xenia/gpu/shader_interpreter.cc
+++ b/src/xenia/gpu/shader_interpreter.cc
--- a/src/xenia/gpu/shader_interpreter.h
+++ b/src/xenia/gpu/shader_interpreter.h
@ -0,0 +1,149 @@
 /**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
 #ifndef XENIA_GPU_SHADER_INTERPRETER_H_
 #define XENIA_GPU_SHADER_INTERPRETER_H_
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/ucode.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/memory.h"
 namespace xe {
 namespace gpu {
 // CPU-side executor of shader microcode. Inputs are written to the temporary
 // registers before Execute() is called, and exports are reported through an
 // optional ExportSink.
 class ShaderInterpreter {
  public:
  // The references are stored, not copied.
  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
      : register_file_(register_file), memory_(memory) {}
  // Receiver of the shader exports. The default implementations ignore
  // everything, so subclasses only need to override what they consume.
  class ExportSink {
   public:
    virtual ~ExportSink() = default;
    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
    // Bit N of value_mask corresponds to value[N].
    virtual void Export(ucode::ExportRegister export_register,
                        const float* value, uint32_t value_mask) {}
  };
  // The trace writer is not owned and may be null.
  void SetTraceWriter(TraceWriter* new_trace_writer) {
    trace_writer_ = new_trace_writer;
  }
  ExportSink* GetExportSink() const { return export_sink_; }
  // The export sink is not owned and may be null.
  void SetExportSink(ExportSink* new_export_sink) {
    export_sink_ = new_export_sink;
  }
  // Flat view of the 64 four-component temporary registers - component C of
  // register R is at index R * 4 + C.
  const float* temp_registers() const { return &temp_registers_[0][0]; }
  float* temp_registers() { return &temp_registers_[0][0]; }
  // Whether the shader only uses features supported by the interpreter. The
  // shader must have its ucode analyzed.
  static bool CanInterpretShader(const Shader& shader) {
    assert_true(shader.is_ucode_analyzed());
    // Texture instructions are not very common in vertex shaders (and not used
    // in Direct3D 9's internal rectangles such as clears) and are extremely
    // complex, not implemented.
    if (shader.uses_texture_fetch_instruction_results()) {
      return false;
    }
    return true;
  }
  // The ucode is not copied - it must stay valid while it's being executed.
  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
    shader_type_ = shader_type;
    ucode_ = ucode;
  }
  void SetShader(const Shader& shader) {
    assert_true(CanInterpretShader(shader));
    SetShader(shader.type(), shader.ucode_dwords());
  }
  void Execute();
  private:
  // Execution state that doesn't include the temporary registers.
  struct State {
    ucode::VertexFetchInstruction vfetch_full_last;
    uint32_t vfetch_address_dwords;
    float previous_scalar;
    uint32_t call_stack_depth;
    uint32_t call_return_addresses[4];
    uint32_t loop_stack_depth;
    xenos::LoopConstant loop_constants[4];
    uint32_t loop_iterators[4];
    int32_t address_register;
    bool predicate;
    // Zeroes every field.
    void Reset() { std::memset(this, 0, sizeof(*this)); }
    // Returns the current loop-relative address offset, clamped to -256...256,
    // or 0 if the loop stack depth is outside 1...3.
    // NOTE(review): the stacks are indexed with loop_stack_depth itself rather
    // than loop_stack_depth - 1, so slot 0 is never read here and a depth of 4
    // is rejected - confirm this matches how Execute() pushes loops.
    int32_t GetLoopAddress() const {
      assert_true(loop_stack_depth && loop_stack_depth < 4);
      if (!loop_stack_depth || loop_stack_depth >= 4) {
        return 0;
      }
      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
      // Clamp to the real range specified in the IPR2015-00325 sequencer
      // specification.
      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
      return std::min(
          INT32_C(256),
          std::max(INT32_C(-256),
                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
                               loop_constant.step +
                           loop_constant.start)));
    }
  };
  // Replaces a value with a zero exponent (zero or denormal) with a zero of the
  // same sign, and returns any other value unchanged.
  static float FlushDenormal(float value) {
    // Copying the bits via memcpy rather than casting pointers, as accessing a
    // float through a uint32_t pointer violates strict aliasing.
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));
    if (!(bits & UINT32_C(0x7F800000))) {
      // Keep only the sign.
      bits &= UINT32_C(1) << 31;
    }
    float flushed;
    std::memcpy(&flushed, &bits, sizeof(flushed));
    return flushed;
  }
  // Returns the four components of a temporary register. The loop address is
  // added if is_relative, and the final index wraps around within the 64
  // registers.
  const float* GetTempRegister(uint32_t address, bool is_relative) const {
    return temp_registers_[(int32_t(address) +
                            (is_relative ? state_.GetLoopAddress() : 0)) &
                           63];
  }
  // For simplicity (due to writability), out-of-range addresses are not
  // rejected - they wrap around within the 64 registers.
  float* GetTempRegister(uint32_t address, bool is_relative) {
    return temp_registers_[(int32_t(address) +
                            (is_relative ? state_.GetLoopAddress() : 0)) &
                           63];
  }
  const float* GetFloatConstant(uint32_t address, bool is_relative,
                                bool relative_address_is_a0) const;
  void ExecuteAluInstruction(ucode::AluInstruction instr);
  void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle,
                        const float* value);
  void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr);
  const RegisterFile& register_file_;
  const Memory& memory_;
  TraceWriter* trace_writer_ = nullptr;
  ExportSink* export_sink_ = nullptr;
  xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex;
  const uint32_t* ucode_ = nullptr;
  // For both inputs and locals.
  float temp_registers_[64][4];
  State state_;
 };
 }  // namespace gpu
 }  // namespace xe
 #endif  // XENIA_GPU_SHADER_INTERPRETER_H_
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
    GatherOperandInformation(binding.fetch_instr.operands[i]);
  }
  if (binding.fetch_instr.result.GetUsedResultComponents()) {
    uses_texture_fetch_instruction_results_ = true;
  }
  switch (op.opcode()) {
    case FetchOpcode::kSetTextureLod:
    case FetchOpcode::kSetTextureGradientsHorz: