[GPU] Get unclipped draw height by running VS on the CPU

This commit is contained in:
Triang3l 2022-04-28 22:25:25 +03:00
parent b2b1d7b518
commit 0fd578cafd
12 changed files with 1866 additions and 53 deletions

View File

@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() {
// Initialize the render target cache before configuring binding - need to
// know if using rasterizer-ordered views for the bindless root signature.
render_target_cache_ = std::make_unique<D3D12RenderTargetCache>(
*register_file_, *this, trace_writer_, bindless_resources_used_);
*register_file_, *memory_, trace_writer_, *this,
bindless_resources_used_);
if (!render_target_cache_->Initialize()) {
XELOGE("Failed to initialize the render target cache");
return false;
@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
: 0;
if (!render_target_cache_->Update(is_rasterization_done,
normalized_depth_control,
normalized_color_mask)) {
normalized_color_mask, *vertex_shader)) {
return false;
}

View File

@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() {
bool D3D12RenderTargetCache::Update(
bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t shader_writes_color_targets) {
uint32_t shader_writes_color_targets, const Shader& vertex_shader) {
if (!RenderTargetCache::Update(is_rasterization_done,
normalized_depth_control,
shader_writes_color_targets)) {
shader_writes_color_targets, vertex_shader)) {
return false;
}
switch (GetPath()) {

View File

@ -43,10 +43,10 @@ class D3D12CommandProcessor;
class D3D12RenderTargetCache final : public RenderTargetCache {
public:
D3D12RenderTargetCache(const RegisterFile& register_file,
const Memory& memory, TraceWriter& trace_writer,
D3D12CommandProcessor& command_processor,
TraceWriter& trace_writer,
bool bindless_resources_used)
: RenderTargetCache(register_file),
: RenderTargetCache(register_file, memory, &trace_writer),
command_processor_(command_processor),
trace_writer_(trace_writer),
bindless_resources_used_(bindless_resources_used) {}
@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
bool Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t shader_writes_color_targets) override;
uint32_t shader_writes_color_targets,
const Shader& vertex_shader) override;
void InvalidateCommandListRenderTargets() {
are_current_command_list_render_targets_valid_ = false;

View File

@ -0,0 +1,350 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/draw_extent_estimator.h"
#include <algorithm>
#include <cfloat>
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"
// Master switch for CPU-side vertex shader execution for unclipped draws -
// consumed by DrawExtentEstimator::EstimateMaxY to decide whether
// EstimateVertexMaxY may be invoked at all.
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu, true,
    "Execute the vertex shader for draws with clipping disabled, primarily "
    "screen-space draws (such as clears), on the CPU when possible to estimate "
    "the extent of the EDRAM involved in the draw.\n"
    "Enabling this may significantly improve GPU performance as otherwise up "
    "to the entire EDRAM may be considered used in draws without clipping, "
    "potentially resulting in spurious EDRAM range ownership transfer round "
    "trips between host render targets.\n"
    "Also, on hosts where certain render target formats have to be emulated in "
    "a lossy way (for instance, 16-bit fixed-point via 16-bit floating-point), "
    "this prevents corruption of other render targets located after the "
    "current ones in the EDRAM by lossy range ownership transfers done for "
    "those draws.",
    "GPU");
// Widens execute_unclipped_draw_vs_on_cpu to scissored draws as well -
// consumed by DrawExtentEstimator::EstimateMaxY.
// Fixed a typo in the user-visible help text: "esimating" -> "estimating".
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
    "non-scissored draws (with the right and the bottom sides of the scissor "
    "rectangle at 8192 or beyond) even though if the scissor rectangle is "
    "present, it's usually sufficient for estimating the height of the render "
    "target.\n"
    "Enabling this may cause excessive processing of vertices on the CPU, as "
    "some games draw rectangles (for their UI, for instance) without clipping, "
    "but with a proper scissor rectangle.",
    "GPU");
namespace xe {
namespace gpu {
// Captures only the export lanes relevant to Y extent estimation: position
// Y (lane 1) and W (lane 3), the point size (lane 0 of the point size /
// edge flag / kill vertex export), and the kill vertex mask (lane 2,
// reinterpreted as its raw 32-bit pattern).
void DrawExtentEstimator::PositionYExportSink::Export(
    ucode::ExportRegister export_register, const float* value,
    uint32_t value_mask) {
  if (export_register == ucode::ExportRegister::kVSPosition) {
    if (value_mask & 0b0010) {
      position_y_ = value[1];
    }
    if (value_mask & 0b1000) {
      position_w_ = value[3];
    }
    return;
  }
  if (export_register != ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
    // Exports other than position and point size are irrelevant here.
    return;
  }
  if (value_mask & 0b0001) {
    point_size_ = value[0];
  }
  if (value_mask & 0b0100) {
    vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
  }
}
// Interprets the draw's vertex shader on the CPU to find the maximum
// post-viewport Y coordinate, in pixels (top-left rasterization rule
// applied), that the draw may cover. Returns
// xenos::kTexture2DCubeMaxWidthHeight (no useful bound) when the draw can't
// be reproduced on the CPU: immediate indices, tessellation, or a shader the
// interpreter can't execute.
// Change from the original: removed the unused local `index_format`
// (assigned from vgt_draw_initiator.index_size but never read - the checks
// below use vgt_draw_initiator.index_size directly).
uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
  if (!vgt_draw_initiator.num_indices) {
    // Empty draw - nothing to rasterize.
    return 0;
  }
  if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA &&
      vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) {
    // TODO(Triang3l): Support immediate indices.
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  // Not reproducing tessellation.
  if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  assert_true(vertex_shader.type() == xenos::ShaderType::kVertex);
  assert_true(vertex_shader.is_ucode_analyzed());
  if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  // Index buffer views - initialized (and read) only for kDMA draws.
  union {
    const void* index_buffer;
    const uint16_t* index_buffer_16;
    const uint32_t* index_buffer_32;
  };
  xenos::Endian index_endian = vgt_dma_size.swap_mode;
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
    uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32;
    uint32_t index_buffer_read_count =
        std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words);
    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
      // Handle the index endianness the same way as the PrimitiveProcessor.
      if (index_endian == xenos::Endian::k8in32) {
        index_endian = xenos::Endian::k8in16;
      } else if (index_endian == xenos::Endian::k16in32) {
        index_endian = xenos::Endian::kNone;
      }
      // Align the base to the index size.
      index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint16_t) * index_buffer_read_count);
      }
    } else {
      assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32);
      index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint32_t) * index_buffer_read_count);
      }
    }
    index_buffer = memory_.TranslatePhysical(index_buffer_base);
  }
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
  uint32_t reset_index =
      regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx;
  uint32_t index_offset = regs.Get<reg::VGT_INDX_OFFSET>().indx_offset;
  uint32_t min_index = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
  uint32_t max_index = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
  float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena
                               ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                               : 1.0f;
  float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena
                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                : 0.0f;
  // The point diameter clamp is done on the raw signed integer bit patterns of
  // the floats (see the loop below), so the bounds are stored as int32_t.
  int32_t point_vertex_min_diameter_float = 0;
  int32_t point_vertex_max_diameter_float = 0;
  float point_constant_radius_y = 0.0f;
  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
    // Registers store sizes in 12.4 fixed point; convert to float diameters /
    // radius.
    *reinterpret_cast<float*>(&point_vertex_min_diameter_float) =
        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
    *reinterpret_cast<float*>(&point_vertex_max_diameter_float) =
        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
    point_constant_radius_y =
        float(regs.Get<reg::PA_SU_POINT_SIZE>().height) * (1.0f / 16.0f);
  }
  float max_y = -FLT_MAX;
  shader_interpreter_.SetShader(vertex_shader);
  PositionYExportSink position_y_export_sink;
  shader_interpreter_.SetExportSink(&position_y_export_sink);
  for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) {
    uint32_t vertex_index;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
      if (i < vgt_dma_size.num_words) {
        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
          vertex_index = index_buffer_16[i];
        } else {
          vertex_index = index_buffer_32[i];
        }
        // The Xenos only uses 24 bits of the index (reset_indx is 24-bit).
        vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF;
      } else {
        // Reading past the end of the DMA buffer - treat as index 0.
        vertex_index = 0;
      }
      if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) {
        // Primitive restart index - not an actual vertex.
        continue;
      }
    } else {
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kAutoIndex);
      vertex_index = i;
    }
    vertex_index =
        std::min(max_index,
                 std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF));
    position_y_export_sink.Reset();
    // The vertex index is passed to the shader in r0.x as a float.
    shader_interpreter_.temp_registers()[0] = float(vertex_index);
    shader_interpreter_.Execute();
    if (position_y_export_sink.vertex_kill().has_value() &&
        (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) {
      // Vertex killed (any kill bit other than the sign bit set).
      continue;
    }
    if (!position_y_export_sink.position_y().has_value()) {
      continue;
    }
    float vertex_y = position_y_export_sink.position_y().value();
    if (!pa_cl_vte_cntl.vtx_xy_fmt) {
      // XY not already divided by W in the shader output - perspective-divide.
      if (!position_y_export_sink.position_w().has_value()) {
        continue;
      }
      vertex_y /= position_y_export_sink.position_w().value();
    }
    vertex_y = vertex_y * viewport_y_scale + viewport_y_offset;
    if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
      // Points extend downwards from the center by their Y radius.
      float point_radius_y;
      if (position_y_export_sink.point_size().has_value()) {
        // Vertex-specified diameter. Clamped effectively as a signed integer in
        // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN
        // to the maximum.
        point_radius_y = position_y_export_sink.point_size().value();
        *reinterpret_cast<int32_t*>(&point_radius_y) = std::min(
            point_vertex_max_diameter_float,
            std::max(point_vertex_min_diameter_float,
                     *reinterpret_cast<const int32_t*>(&point_radius_y)));
        point_radius_y *= 0.5f;
      } else {
        // Constant radius.
        point_radius_y = point_constant_radius_y;
      }
      vertex_y += point_radius_y;
    }
    // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
    // always returned - max_y, which is initialized to a normalized value.
    max_y = std::max(max_y, vertex_y);
  }
  shader_interpreter_.SetExportSink(nullptr);
  int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y);
  // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here,
  // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. The
  // range of the window offset plus the half-pixel offset is -16384 to 16384.5,
  // so it's safe to add both - adding it will neither move the 16p8 clamping
  // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor
  // cause 24p8 overflow.
  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    // Half-pixel offset: +0.5 pixels = +128 in .8 fixed point.
    max_y_24p8 += 128;
  }
  if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
    max_y_24p8 += regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset * 256;
  }
  // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA.
  auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) +
          ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127
                                                                     : 255)) >>
         8;
}
// Returns the maximum Y, in pixels, that the current draw may cover: the
// scissor bottom (window scissor clamped to the screen scissor), further
// bounded for unclipped draws by the CPU-estimated vertex extent (when
// allowed by the cvars and try_to_estimate_vertex_max_y), or, for clipped
// draws, by the implicit host viewport bottom.
uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
                                           const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  int32_t window_y_offset = pa_sc_window_offset.window_y_offset;
  // Scissor.
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y);
  bool scissor_window_offset =
      !regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable;
  if (scissor_window_offset) {
    scissor_bottom += window_y_offset;
  }
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y);
  uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0)));
  if (regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
    // Actual extent from the vertices.
    if (try_to_estimate_vertex_max_y &&
        cvars::execute_unclipped_draw_vs_on_cpu) {
      bool estimate_vertex_max_y;
      if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) {
        estimate_vertex_max_y = true;
      } else {
        estimate_vertex_max_y = false;
        if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) {
          // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192
          // may be a normal render target height (80x8192 is well within the
          // EDRAM size, for instance), no need to process the vertices on the
          // CPU in this case.
          int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x);
          if (scissor_window_offset) {
            scissor_right += pa_sc_window_offset.window_x_offset;
          }
          scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x);
          if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) {
            estimate_vertex_max_y = true;
          }
        }
      }
      if (estimate_vertex_max_y) {
        max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader));
      }
    }
  } else {
    // Viewport. Though the Xenos itself doesn't have an implicit viewport
    // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
    // usually exists and can't be disabled.
    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
    float viewport_bottom = 0.0f;
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12 (but not WARP), also like in
    // draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (max_y is
    // well below 2^24) to safely drop very large values.
    max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom)));
  }
  return max_y;
}
} // namespace gpu
} // namespace xe

View File

@ -0,0 +1,76 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#include <cstdint>
#include <optional>
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shader_interpreter.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {
// Estimates the vertical extent of a draw (to bound the EDRAM range it may
// involve), optionally by executing the draw's vertex shader on the CPU via
// the ShaderInterpreter for draws with clipping disabled.
class DrawExtentEstimator {
 public:
  // trace_writer may be null; it's forwarded to the shader interpreter.
  DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory,
                      TraceWriter* trace_writer)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shader_interpreter_(register_file, memory) {
    shader_interpreter_.SetTraceWriter(trace_writer);
  }

  // The shader must have its ucode analyzed.
  uint32_t EstimateVertexMaxY(const Shader& vertex_shader);
  uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y,
                        const Shader& vertex_shader);

 private:
  // Export sink capturing only what's needed for Y extent estimation:
  // position Y and W, the point size, and the kill vertex flags.
  class PositionYExportSink : public ShaderInterpreter::ExportSink {
   public:
    void Export(ucode::ExportRegister export_register, const float* value,
                uint32_t value_mask) override;

    // Clears all the captured exports before interpreting another vertex.
    void Reset() {
      position_y_.reset();
      position_w_.reset();
      point_size_.reset();
      vertex_kill_.reset();
    }

    // Each accessor is empty if the corresponding component wasn't exported.
    const std::optional<float>& position_y() const { return position_y_; }
    const std::optional<float>& position_w() const { return position_w_; }
    const std::optional<float>& point_size() const { return point_size_; }
    const std::optional<uint32_t>& vertex_kill() const { return vertex_kill_; }

   private:
    std::optional<float> position_y_;
    std::optional<float> position_w_;
    std::optional<float> point_size_;
    std::optional<uint32_t> vertex_kill_;
  };

  const RegisterFile& register_file_;
  const Memory& memory_;
  // May be null.
  TraceWriter* trace_writer_;

  ShaderInterpreter shader_interpreter_;
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_

View File

@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
};
static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));
// Vertex shader constant range: base register index and vec4 count.
union alignas(uint32_t) SQ_VS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST;
};
static_assert_size(SQ_VS_CONST, sizeof(uint32_t));
// Same as SQ_VS_CONST.
// Same as SQ_VS_CONST, for the pixel shader constant range.
union alignas(uint32_t) SQ_PS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST;
};
static_assert_size(SQ_PS_CONST, sizeof(uint32_t));
/*******************************************************************************
__ _____ ___ _____ _____ __
\ \ / / __| _ \_ _| __\ \/ /

View File

@ -22,7 +22,6 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
@ -143,6 +142,19 @@ DEFINE_bool(
"-1...1, remap -32...32 to -1...1 to use the full possible range of "
"values, at the expense of multiplicative blending correctness.",
"GPU");
// Enabled by default as the GPU is overall usually the bottleneck when the
// pixel shader interlock render backend implementation is used, anything that
// may improve GPU performance is favorable.
DEFINE_bool(
execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true,
"If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader "
"for unclipped draws on the CPU even when using the pixel shader interlock "
"(rasterizer-ordered view) implementation of the render backend on the "
"host, for which no expensive copying between host render targets is "
"needed when the ownership of a EDRAM range is changed.\n"
"If this is enabled, excessive barriers may be eliminated when switching "
"between different render targets in separate EDRAM locations.",
"GPU");
namespace xe {
namespace gpu {
@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }
bool RenderTargetCache::Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t normalized_color_mask) {
uint32_t normalized_color_mask,
const Shader& vertex_shader) {
const RegisterFile& regs = register_file();
bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;
@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
// Estimate height used by render targets (for color for writes, for depth /
// stencil for both reads and writes) from various sources.
uint32_t height_used =
GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
int32_t window_y_offset =
regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
float viewport_bottom = 0.0f;
// First calculate all the integer.0 or integer.5 offsetting exactly at full
// precision.
if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
viewport_bottom += float(window_y_offset);
}
if (cvars::half_pixel_offset &&
!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
viewport_bottom += 0.5f;
}
// Then apply the floating-point viewport offset.
if (pa_cl_vte_cntl.vport_y_offset_ena) {
viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
}
viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
: 1.0f;
// Using floor, or, rather, truncation (because maxing with zero anyway)
// similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
// GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
// max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
// argument in the !(a < b) case (always for NaN), min as float (height_used
// is well below 2^24) to safely drop very large values.
height_used =
uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom)));
}
int32_t scissor_bottom =
int32_t(regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y);
if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
scissor_bottom += window_y_offset;
}
scissor_bottom =
std::min(scissor_bottom, regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>().br_y);
height_used =
std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0))));
uint32_t height_used = std::min(
GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples),
draw_extent_estimator_.EstimateMaxY(
interlock_barrier_only
? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend
: true,
vertex_shader));
// Sorted by EDRAM base and then by index in the pipeline - for simplicity,
// treat render targets placed closer to the end of the EDRAM as truncating

View File

@ -21,9 +21,11 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/gpu/draw_extent_estimator.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
DECLARE_bool(depth_transfer_not_equal_test);
@ -217,7 +219,8 @@ class RenderTargetCache {
virtual bool Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t normalized_color_mask);
uint32_t normalized_color_mask,
const Shader& vertex_shader);
// Returns bits where 0 is whether a depth render target is currently bound on
// the host and 1... are whether the same applies to color render targets, and
@ -228,8 +231,10 @@ class RenderTargetCache {
uint32_t* depth_and_color_formats_out = nullptr) const;
protected:
RenderTargetCache(const RegisterFile& register_file)
: register_file_(register_file) {}
RenderTargetCache(const RegisterFile& register_file, const Memory& memory,
TraceWriter* trace_writer)
: register_file_(register_file),
draw_extent_estimator_(register_file, memory, trace_writer) {}
const RegisterFile& register_file() const { return register_file_; }
@ -606,6 +611,8 @@ class RenderTargetCache {
private:
const RegisterFile& register_file_;
DrawExtentEstimator draw_extent_estimator_;
// For host render targets.
struct OwnershipRange {

View File

@ -914,6 +914,12 @@ class Shader {
// True if the current shader has any `kill` instructions.
bool kills_pixels() const { return kills_pixels_; }
// True if the shader has any texture-related instructions (any fetch
// instructions other than vertex fetch) writing any non-constant components.
bool uses_texture_fetch_instruction_results() const {
return uses_texture_fetch_instruction_results_;
}
// True if the shader overrides the pixel depth.
bool writes_depth() const { return writes_depth_; }
@ -1002,6 +1008,7 @@ class Shader {
uint32_t register_static_address_bound_ = 0;
bool uses_register_dynamic_addressing_ = false;
bool kills_pixels_ = false;
bool uses_texture_fetch_instruction_results_ = false;
bool writes_depth_ = false;
uint32_t writes_color_targets_ = 0b0000;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,149 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
#define XENIA_GPU_SHADER_INTERPRETER_H_
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {
// CPU interpreter for Xenos shader microcode. Texture fetch instructions are
// not supported - see CanInterpretShader.
class ShaderInterpreter {
 public:
  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
      : register_file_(register_file), memory_(memory) {}

  // Receives the exports produced while executing the shader.
  class ExportSink {
   public:
    virtual ~ExportSink() = default;
    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
    virtual void Export(ucode::ExportRegister export_register,
                        const float* value, uint32_t value_mask) {}
  };

  void SetTraceWriter(TraceWriter* new_trace_writer) {
    trace_writer_ = new_trace_writer;
  }

  ExportSink* GetExportSink() const { return export_sink_; }
  void SetExportSink(ExportSink* new_export_sink) {
    export_sink_ = new_export_sink;
  }

  // 64 vec4 registers, used both to pass the inputs and as execution-time
  // locals - flattened as floats, 4 per register.
  const float* temp_registers() const { return &temp_registers_[0][0]; }
  float* temp_registers() { return &temp_registers_[0][0]; }

  static bool CanInterpretShader(const Shader& shader) {
    assert_true(shader.is_ucode_analyzed());
    // Texture instructions are not very common in vertex shaders (and not used
    // in Direct3D 9's internal rectangles such as clears) and are extremely
    // complex, not implemented.
    if (shader.uses_texture_fetch_instruction_results()) {
      return false;
    }
    return true;
  }
  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
    shader_type_ = shader_type;
    ucode_ = ucode;
  }
  void SetShader(const Shader& shader) {
    assert_true(CanInterpretShader(shader));
    SetShader(shader.type(), shader.ucode_dwords());
  }

  void Execute();

 private:
  // Per-execution control flow and fetch state.
  struct State {
    ucode::VertexFetchInstruction vfetch_full_last;
    uint32_t vfetch_address_dwords;
    float previous_scalar;
    uint32_t call_stack_depth;
    uint32_t call_return_addresses[4];
    uint32_t loop_stack_depth;
    xenos::LoopConstant loop_constants[4];
    uint32_t loop_iterators[4];
    int32_t address_register;
    bool predicate;

    // Zero-fills the whole state; all members are plain scalars / aggregates.
    void Reset() { std::memset(this, 0, sizeof(*this)); }

    // Returns the current relative addressing value aL, clamped to the real
    // hardware range.
    // NOTE(review): indexes loop_constants / loop_iterators with
    // loop_stack_depth rather than loop_stack_depth - 1 - presumably matches
    // how Execute() maintains the stack; confirm against
    // shader_interpreter.cc.
    int32_t GetLoopAddress() const {
      assert_true(loop_stack_depth && loop_stack_depth < 4);
      if (!loop_stack_depth || loop_stack_depth >= 4) {
        return 0;
      }
      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
      // Clamp to the real range specified in the IPR2015-00325 sequencer
      // specification.
      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
      return std::min(
          INT32_C(256),
          std::max(INT32_C(-256),
                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
                               loop_constant.step +
                           loop_constant.start)));
    }
  };

  // Flushes denormals to zero, preserving only the sign bit when the exponent
  // field is all zeros.
  static float FlushDenormal(float value) {
    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
    return *reinterpret_cast<const float*>(&bits);
  }

  // The register index (plus the loop-relative offset aL if is_relative) is
  // wrapped into the 64 temp registers - note the `+` binds before the `& 63`.
  const float* GetTempRegister(uint32_t address, bool is_relative) const {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  // For simplicity (due to writability), not bounds-checking.
  float* GetTempRegister(uint32_t address, bool is_relative) {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }

  const float* GetFloatConstant(uint32_t address, bool is_relative,
                                bool relative_address_is_a0) const;

  void ExecuteAluInstruction(ucode::AluInstruction instr);
  void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle,
                        const float* value);
  void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr);

  const RegisterFile& register_file_;
  const Memory& memory_;

  // May be null.
  TraceWriter* trace_writer_ = nullptr;

  ExportSink* export_sink_ = nullptr;

  xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex;
  const uint32_t* ucode_ = nullptr;

  // For both inputs and locals.
  float temp_registers_[64][4];

  State state_;
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_SHADER_INTERPRETER_H_

View File

@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
GatherOperandInformation(binding.fetch_instr.operands[i]);
}
if (binding.fetch_instr.result.GetUsedResultComponents()) {
uses_texture_fetch_instruction_results_ = true;
}
switch (op.opcode()) {
case FetchOpcode::kSetTextureLod:
case FetchOpcode::kSetTextureGradientsHorz: