From 0fd578cafd0465134214e5fd38c53cc3f888ace7 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 28 Apr 2022 22:25:25 +0300 Subject: [PATCH] [GPU] Get unclipped draw height by running VS on the CPU --- .../gpu/d3d12/d3d12_command_processor.cc | 5 +- .../gpu/d3d12/d3d12_render_target_cache.cc | 4 +- .../gpu/d3d12/d3d12_render_target_cache.h | 7 +- src/xenia/gpu/draw_extent_estimator.cc | 350 +++++ src/xenia/gpu/draw_extent_estimator.h | 76 ++ src/xenia/gpu/registers.h | 25 + src/xenia/gpu/render_target_cache.cc | 65 +- src/xenia/gpu/render_target_cache.h | 13 +- src/xenia/gpu/shader.h | 7 + src/xenia/gpu/shader_interpreter.cc | 1214 +++++++++++++++++ src/xenia/gpu/shader_interpreter.h | 149 ++ src/xenia/gpu/shader_translator.cc | 4 + 12 files changed, 1866 insertions(+), 53 deletions(-) create mode 100644 src/xenia/gpu/draw_extent_estimator.cc create mode 100644 src/xenia/gpu/draw_extent_estimator.h create mode 100644 src/xenia/gpu/shader_interpreter.cc create mode 100644 src/xenia/gpu/shader_interpreter.h diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 2f06cfe54..694529f04 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() { // Initialize the render target cache before configuring binding - need to // know if using rasterizer-ordered views for the bindless root signature. render_target_cache_ = std::make_unique( - *register_file_, *this, trace_writer_, bindless_resources_used_); + *register_file_, *memory_, trace_writer_, *this, + bindless_resources_used_); if (!render_target_cache_->Initialize()) { XELOGE("Failed to initialize the render target cache"); return false; @@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, : 0; if (!render_target_cache_->Update(is_rasterization_done, normalized_depth_control, - normalized_color_mask)) { + normalized_color_mask, *vertex_shader)) { return false; } diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index b13f8bda1..6ccdf76e2 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() { bool D3D12RenderTargetCache::Update( bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t shader_writes_color_targets) { + uint32_t shader_writes_color_targets, const Shader& vertex_shader) { if (!RenderTargetCache::Update(is_rasterization_done, normalized_depth_control, - shader_writes_color_targets)) { + shader_writes_color_targets, vertex_shader)) { return false; } switch (GetPath()) { diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index 09512838d..8eecb450f 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -43,10 +43,10 @@ class D3D12CommandProcessor; class D3D12RenderTargetCache final : public RenderTargetCache { public: D3D12RenderTargetCache(const RegisterFile& register_file, + const Memory& memory, TraceWriter& trace_writer, D3D12CommandProcessor& command_processor, - TraceWriter& trace_writer, bool bindless_resources_used) - : RenderTargetCache(register_file), + : RenderTargetCache(register_file, memory, &trace_writer), command_processor_(command_processor), 
        trace_writer_(trace_writer),
        bindless_resources_used_(bindless_resources_used) {}
@@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
 
   bool Update(bool is_rasterization_done,
               reg::RB_DEPTHCONTROL normalized_depth_control,
-              uint32_t shader_writes_color_targets) override;
+              uint32_t shader_writes_color_targets,
+              const Shader& vertex_shader) override;
 
   void InvalidateCommandListRenderTargets() {
     are_current_command_list_render_targets_valid_ = false;
diff --git a/src/xenia/gpu/draw_extent_estimator.cc b/src/xenia/gpu/draw_extent_estimator.cc
new file mode 100644
index 000000000..0ab9ed7f2
--- /dev/null
+++ b/src/xenia/gpu/draw_extent_estimator.cc
@@ -0,0 +1,350 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/gpu/draw_extent_estimator.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <cstdint>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/cvar.h"
+#include "xenia/base/profiling.h"
+#include "xenia/gpu/registers.h"
+#include "xenia/gpu/ucode.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/ui/graphics_util.h"
+
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu, true,
+    "Execute the vertex shader for draws with clipping disabled, primarily "
+    "screen-space draws (such as clears), on the CPU when possible to "
+    "estimate the extent of the EDRAM involved in the draw.\n"
+    "Enabling this may significantly improve GPU performance as otherwise up "
+    "to the entire EDRAM may be considered used in draws without clipping, "
+    "potentially resulting in spurious EDRAM range ownership transfer round "
+    "trips between host render targets.\n"
+    "Also, on hosts where certain render target formats have to be emulated "
+    "in a lossy way (for instance, 16-bit fixed-point via 16-bit "
+    "floating-point), this prevents corruption of other render targets "
+    "located after the current ones in the EDRAM by lossy range ownership "
+    "transfers done for those draws.",
+    "GPU");
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
+    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
+    "non-scissored draws (with the right and the bottom sides of the scissor "
+    "rectangle at 8192 or beyond), even though, if the scissor rectangle is "
+    "present, it's usually sufficient for estimating the height of the "
+    "render target.\n"
+    "Enabling this may cause excessive processing of vertices on the CPU, as "
+    "some games draw rectangles (for their UI, for instance) without "
+    "clipping, but with a proper scissor rectangle.",
+    "GPU");
+
+namespace xe {
+namespace gpu {
+
+void DrawExtentEstimator::PositionYExportSink::Export(
+    ucode::ExportRegister export_register, const float* value,
+    uint32_t value_mask) {
+  if (export_register == ucode::ExportRegister::kVSPosition) {
+    if (value_mask & 0b0010) {
+      position_y_ = value[1];
+    }
+    if (value_mask & 0b1000) {
+      position_w_ = value[3];
+    }
+  } else if (export_register ==
+             ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
+    if (value_mask & 0b0001) {
+      point_size_ = value[0];
+    }
+    if (value_mask & 0b0100) {
+      vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
+    }
+  }
+}
+
+uint32_t
DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + + const RegisterFile& regs = register_file_; + + auto vgt_draw_initiator = regs.Get(); + if (!vgt_draw_initiator.num_indices) { + return 0; + } + if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA && + vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) { + // TODO(Triang3l): Support immediate indices. + return xenos::kTexture2DCubeMaxWidthHeight; + } + + // Not reproducing tessellation. + if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode, + vgt_draw_initiator.prim_type) && + regs.Get().path_select == + xenos::VGTOutputPath::kTessellationEnable) { + return xenos::kTexture2DCubeMaxWidthHeight; + } + + assert_true(vertex_shader.type() == xenos::ShaderType::kVertex); + assert_true(vertex_shader.is_ucode_analyzed()); + if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) { + return xenos::kTexture2DCubeMaxWidthHeight; + } + + auto vgt_dma_size = regs.Get(); + union { + const void* index_buffer; + const uint16_t* index_buffer_16; + const uint32_t* index_buffer_32; + }; + xenos::Endian index_endian = vgt_dma_size.swap_mode; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) { + xenos::IndexFormat index_format = vgt_draw_initiator.index_size; + uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32; + uint32_t index_buffer_read_count = + std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words); + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) { + // Handle the index endianness to same way as the PrimitiveProcessor. + if (index_endian == xenos::Endian::k8in32) { + index_endian = xenos::Endian::k8in16; + } else if (index_endian == xenos::Endian::k16in32) { + index_endian = xenos::Endian::kNone; + } + index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1); + if (trace_writer_) { + trace_writer_->WriteMemoryRead( + index_buffer_base, sizeof(uint16_t) * index_buffer_read_count); + } + } else { + assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32); + index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1); + if (trace_writer_) { + trace_writer_->WriteMemoryRead( + index_buffer_base, sizeof(uint32_t) * index_buffer_read_count); + } + } + index_buffer = memory_.TranslatePhysical(index_buffer_base); + } + auto pa_su_sc_mode_cntl = regs.Get(); + uint32_t reset_index = + regs.Get().reset_indx; + uint32_t index_offset = regs.Get().indx_offset; + uint32_t min_index = regs.Get().min_indx; + uint32_t max_index = regs.Get().max_indx; + + auto pa_cl_vte_cntl = regs.Get(); + float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 + : 1.0f; + float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena + ? 
regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + : 0.0f; + + int32_t point_vertex_min_diameter_float = 0; + int32_t point_vertex_max_diameter_float = 0; + float point_constant_radius_y = 0.0f; + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + auto pa_su_point_minmax = regs.Get(); + *reinterpret_cast(&point_vertex_min_diameter_float) = + float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); + *reinterpret_cast(&point_vertex_max_diameter_float) = + float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); + point_constant_radius_y = + float(regs.Get().height) * (1.0f / 16.0f); + } + + float max_y = -FLT_MAX; + + shader_interpreter_.SetShader(vertex_shader); + + PositionYExportSink position_y_export_sink; + shader_interpreter_.SetExportSink(&position_y_export_sink); + for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) { + uint32_t vertex_index; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) { + if (i < vgt_dma_size.num_words) { + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) { + vertex_index = index_buffer_16[i]; + } else { + vertex_index = index_buffer_32[i]; + } + // The Xenos only uses 24 bits of the index (reset_indx is 24-bit). + vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF; + } else { + vertex_index = 0; + } + if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) { + continue; + } + } else { + assert_true(vgt_draw_initiator.source_select == + xenos::SourceSelect::kAutoIndex); + vertex_index = i; + } + vertex_index = + std::min(max_index, + std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF)); + + position_y_export_sink.Reset(); + + shader_interpreter_.temp_registers()[0] = float(vertex_index); + shader_interpreter_.Execute(); + + if (position_y_export_sink.vertex_kill().has_value() && + (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) { + continue; + } + if (!position_y_export_sink.position_y().has_value()) { + continue; + } + float vertex_y = position_y_export_sink.position_y().value(); + if (!pa_cl_vte_cntl.vtx_xy_fmt) { + if (!position_y_export_sink.position_w().has_value()) { + continue; + } + vertex_y /= position_y_export_sink.position_w().value(); + } + + vertex_y = vertex_y * viewport_y_scale + viewport_y_offset; + + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + float point_radius_y; + if (position_y_export_sink.point_size().has_value()) { + // Vertex-specified diameter. Clamped effectively as a signed integer in + // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN + // to the maximum. + point_radius_y = position_y_export_sink.point_size().value(); + *reinterpret_cast(&point_radius_y) = std::min( + point_vertex_max_diameter_float, + std::max(point_vertex_min_diameter_float, + *reinterpret_cast(&point_radius_y))); + point_radius_y *= 0.5f; + } else { + // Constant radius. + point_radius_y = point_constant_radius_y; + } + vertex_y += point_radius_y; + } + + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is + // always returned - max_y, which is initialized to a normalized value. + max_y = std::max(max_y, vertex_y); + } + shader_interpreter_.SetExportSink(nullptr); + + int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y); + // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here, + // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. 
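A condensed standalone sketch of the 24.8 arithmetic described in this comment and carried out by the lines that follow; the helper name and parameters are made up, and the input is assumed to already be the 16.8 subpixel Y produced by the FloatToD3D11Fixed16p8 conversion above (window_y_offset_px is 0 when vertex window offsetting is disabled):

#include <algorithm>
#include <cstdint>

// Sketch (hypothetical helper, not part of the patch): turn an estimated
// subpixel max Y, in 256ths of a pixel, into a row count. Widening to 24.8
// makes the half-pixel and window offset additions overflow-safe.
uint32_t MaxYRowsFrom16p8(int32_t max_y_16p8, int32_t window_y_offset_px,
                          bool half_pixel_offset, bool msaa) {
  int32_t max_y_24p8 = max_y_16p8;
  if (half_pixel_offset) {
    max_y_24p8 += 128;  // + 0.5 pixels.
  }
  max_y_24p8 += window_y_offset_px * 256;  // Whole pixels to 256ths.
  // Top-left rule: .5 exclusive without MSAA, 1.0 exclusive with MSAA.
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) + (msaa ? 255 : 127)) >> 8;
}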
The + // range of the window offset plus the half-pixel offset is -16384 to 16384.5, + // so it's safe to add both - adding it will neither move the 16p8 clamping + // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor + // cause 24p8 overflow. + if (!regs.Get().pix_center) { + max_y_24p8 += 128; + } + if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { + max_y_24p8 += regs.Get().window_y_offset * 256; + } + // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA. + auto rb_surface_info = regs.Get(); + return (uint32_t(std::max(int32_t(0), max_y_24p8)) + + ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127 + : 255)) >> + 8; +} + +uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y, + const Shader& vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + + const RegisterFile& regs = register_file_; + + auto pa_sc_window_offset = regs.Get(); + int32_t window_y_offset = pa_sc_window_offset.window_y_offset; + + // Scissor. + auto pa_sc_window_scissor_br = regs.Get(); + int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y); + bool scissor_window_offset = + !regs.Get().window_offset_disable; + if (scissor_window_offset) { + scissor_bottom += window_y_offset; + } + auto pa_sc_screen_scissor_br = regs.Get(); + scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y); + uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0))); + + if (regs.Get().clip_disable) { + // Actual extent from the vertices. + if (try_to_estimate_vertex_max_y && + cvars::execute_unclipped_draw_vs_on_cpu) { + bool estimate_vertex_max_y; + if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) { + estimate_vertex_max_y = true; + } else { + estimate_vertex_max_y = false; + if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) { + // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192 + // may be a normal render target height (80x8192 is well within the + // EDRAM size, for instance), no need to process the vertices on the + // CPU in this case. + int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x); + if (scissor_window_offset) { + scissor_right += pa_sc_window_offset.window_x_offset; + } + scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x); + if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) { + estimate_vertex_max_y = true; + } + } + } + if (estimate_vertex_max_y) { + max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader)); + } + } + } else { + // Viewport. Though the Xenos itself doesn't have an implicit viewport + // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it + // usually exists and can't be disabled. + auto pa_cl_vte_cntl = regs.Get(); + float viewport_bottom = 0.0f; + // First calculate all the integer.0 or integer.5 offsetting exactly at full + // precision. + if (regs.Get().vtx_window_offset_enable) { + viewport_bottom += float(window_y_offset); + } + if (!regs.Get().pix_center) { + viewport_bottom += 0.5f; + } + // Then apply the floating-point viewport offset. + if (pa_cl_vte_cntl.vport_y_offset_ena) { + viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + } + viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena + ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) + : 1.0f; + // Using floor, or, rather, truncation (because maxing with zero anyway) + // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia + // GPUs on Direct3D 12 (but not WARP), also like in + // draw_util::GetHostViewportInfo. 
+ // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first + // argument in the !(a < b) case (always for NaN), min as float (max_y is + // well below 2^24) to safely drop very large values. + max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom))); + } + + return max_y; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/draw_extent_estimator.h b/src/xenia/gpu/draw_extent_estimator.h new file mode 100644 index 000000000..3e360489e --- /dev/null +++ b/src/xenia/gpu/draw_extent_estimator.h @@ -0,0 +1,76 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ +#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ + +#include +#include + +#include "xenia/gpu/register_file.h" +#include "xenia/gpu/shader.h" +#include "xenia/gpu/shader_interpreter.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/memory.h" + +namespace xe { +namespace gpu { + +class DrawExtentEstimator { + public: + DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory, + TraceWriter* trace_writer) + : register_file_(register_file), + memory_(memory), + trace_writer_(trace_writer), + shader_interpreter_(register_file, memory) { + shader_interpreter_.SetTraceWriter(trace_writer); + } + + // The shader must have its ucode analyzed. + uint32_t EstimateVertexMaxY(const Shader& vertex_shader); + uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y, + const Shader& vertex_shader); + + private: + class PositionYExportSink : public ShaderInterpreter::ExportSink { + public: + void Export(ucode::ExportRegister export_register, const float* value, + uint32_t value_mask) override; + + void Reset() { + position_y_.reset(); + position_w_.reset(); + point_size_.reset(); + vertex_kill_.reset(); + } + + const std::optional& position_y() const { return position_y_; } + const std::optional& position_w() const { return position_w_; } + const std::optional& point_size() const { return point_size_; } + const std::optional& vertex_kill() const { return vertex_kill_; } + + private: + std::optional position_y_; + std::optional position_w_; + std::optional point_size_; + std::optional vertex_kill_; + }; + + const RegisterFile& register_file_; + const Memory& memory_; + TraceWriter* trace_writer_; + + ShaderInterpreter shader_interpreter_; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index f6425c277..1a7e721ce 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL { }; static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t)); +union alignas(uint32_t) SQ_VS_CONST { + uint32_t value; + struct { + uint32_t base : 9; // +0 + uint32_t : 3; // +9 + // Vec4 count minus one. + uint32_t size : 9; // 12 + }; + static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST; +}; +static_assert_size(SQ_VS_CONST, sizeof(uint32_t)); + +// Same as SQ_VS_CONST. 
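For context on how these base/size fields get used, the float-constant remapping that ShaderInterpreter::GetFloatConstant performs later in this patch can be sketched as follows (hypothetical helper, written as if inside namespace xe::gpu, and assuming the RegisterFile accessors the interpreter itself uses):

#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"

// Sketch (hypothetical helper): the index is rejected if it exceeds `size`
// (vec4 count minus one), rebased by `base`, and clamped to the 512-vec4
// shader constant file, mirroring ShaderInterpreter::GetFloatConstant below.
inline const float* LookupFloatConstant(const RegisterFile& regs,
                                        xenos::ShaderType shader_type,
                                        uint32_t index) {
  static const float zero[4] = {};
  auto base_and_size_minus_1 = regs.Get<reg::SQ_VS_CONST>(
      shader_type == xenos::ShaderType::kVertex ? XE_GPU_REG_SQ_VS_CONST
                                                : XE_GPU_REG_SQ_PS_CONST);
  if (index > base_and_size_minus_1.size) {
    return zero;
  }
  index += base_and_size_minus_1.base;
  if (index >= 512) {
    return zero;
  }
  return &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index].f32;
}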
+union alignas(uint32_t) SQ_PS_CONST { + uint32_t value; + struct { + uint32_t base : 9; // +0 + uint32_t : 3; // +9 + // Vec4 count minus one. + uint32_t size : 9; // 12 + }; + static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST; +}; +static_assert_size(SQ_PS_CONST, sizeof(uint32_t)); + /******************************************************************************* __ _____ ___ _____ _____ __ \ \ / / __| _ \_ _| __\ \/ / diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index d840a3550..5b5e9f613 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -22,7 +22,6 @@ #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" -#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/xenos.h" @@ -143,6 +142,19 @@ DEFINE_bool( "-1...1, remap -32...32 to -1...1 to use the full possible range of " "values, at the expense of multiplicative blending correctness.", "GPU"); +// Enabled by default as the GPU is overall usually the bottleneck when the +// pixel shader interlock render backend implementation is used, anything that +// may improve GPU performance is favorable. +DEFINE_bool( + execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true, + "If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader " + "for unclipped draws on the CPU even when using the pixel shader interlock " + "(rasterizer-ordered view) implementation of the render backend on the " + "host, for which no expensive copying between host render targets is " + "needed when the ownership of a EDRAM range is changed.\n" + "If this is enabled, excessive barriers may be eliminated when switching " + "between different render targets in separate EDRAM locations.", + "GPU"); namespace xe { namespace gpu { @@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); } bool RenderTargetCache::Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t normalized_color_mask) { + uint32_t normalized_color_mask, + const Shader& vertex_shader) { const RegisterFile& regs = register_file(); bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock; @@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done, // Estimate height used by render targets (for color for writes, for depth / // stencil for both reads and writes) from various sources. - uint32_t height_used = - GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples); - int32_t window_y_offset = - regs.Get().window_y_offset; - if (!regs.Get().clip_disable) { - auto pa_cl_vte_cntl = regs.Get(); - float viewport_bottom = 0.0f; - // First calculate all the integer.0 or integer.5 offsetting exactly at full - // precision. - if (regs.Get().vtx_window_offset_enable) { - viewport_bottom += float(window_y_offset); - } - if (cvars::half_pixel_offset && - !regs.Get().pix_center) { - viewport_bottom += 0.5f; - } - // Then apply the floating-point viewport offset. - if (pa_cl_vte_cntl.vport_y_offset_ena) { - viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; - } - viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena - ? 
std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) - : 1.0f; - // Using floor, or, rather, truncation (because maxing with zero anyway) - // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia - // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo. - // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first - // argument in the !(a < b) case (always for NaN), min as float (height_used - // is well below 2^24) to safely drop very large values. - height_used = - uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom))); - } - int32_t scissor_bottom = - int32_t(regs.Get().br_y); - if (!regs.Get().window_offset_disable) { - scissor_bottom += window_y_offset; - } - scissor_bottom = - std::min(scissor_bottom, regs.Get().br_y); - height_used = - std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0)))); + uint32_t height_used = std::min( + GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples), + draw_extent_estimator_.EstimateMaxY( + interlock_barrier_only + ? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend + : true, + vertex_shader)); // Sorted by EDRAM base and then by index in the pipeline - for simplicity, // treat render targets placed closer to the end of the EDRAM as truncating diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 2a04df75a..6a1aa3ea7 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -21,9 +21,11 @@ #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/assert.h" #include "xenia/base/cvar.h" +#include "xenia/gpu/draw_extent_estimator.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/shader.h" #include "xenia/gpu/xenos.h" DECLARE_bool(depth_transfer_not_equal_test); @@ -217,7 +219,8 @@ class RenderTargetCache { virtual bool Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t normalized_color_mask); + uint32_t normalized_color_mask, + const Shader& vertex_shader); // Returns bits where 0 is whether a depth render target is currently bound on // the host and 1... are whether the same applies to color render targets, and @@ -228,8 +231,10 @@ class RenderTargetCache { uint32_t* depth_and_color_formats_out = nullptr) const; protected: - RenderTargetCache(const RegisterFile& register_file) - : register_file_(register_file) {} + RenderTargetCache(const RegisterFile& register_file, const Memory& memory, + TraceWriter* trace_writer) + : register_file_(register_file), + draw_extent_estimator_(register_file, memory, trace_writer) {} const RegisterFile& register_file() const { return register_file_; } @@ -606,6 +611,8 @@ class RenderTargetCache { private: const RegisterFile& register_file_; + DrawExtentEstimator draw_extent_estimator_; + // For host render targets. struct OwnershipRange { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 91964e332..99ad84b8a 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -914,6 +914,12 @@ class Shader { // True if the current shader has any `kill` instructions. bool kills_pixels() const { return kills_pixels_; } + // True if the shader has any texture-related instructions (any fetch + // instructions other than vertex fetch) writing any non-constant components. 
+ bool uses_texture_fetch_instruction_results() const { + return uses_texture_fetch_instruction_results_; + } + // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } @@ -1002,6 +1008,7 @@ class Shader { uint32_t register_static_address_bound_ = 0; bool uses_register_dynamic_addressing_ = false; bool kills_pixels_ = false; + bool uses_texture_fetch_instruction_results_ = false; bool writes_depth_ = false; uint32_t writes_color_targets_ = 0b0000; diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc new file mode 100644 index 000000000..566bade43 --- /dev/null +++ b/src/xenia/gpu/shader_interpreter.cc @@ -0,0 +1,1214 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/shader_interpreter.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/byte_order.h" +#include "xenia/base/math.h" +#include "xenia/gpu/registers.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/gpu/xenos.h" + +namespace xe { +namespace gpu { + +void ShaderInterpreter::Execute() { + // For more consistency between invocations in case of a malformed shader. + state_.Reset(); + + const uint32_t* bool_constants = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32; + const xenos::LoopConstant* loop_constants = + reinterpret_cast( + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32); + + bool exec_ended = false; + uint32_t cf_index_next = 1; + for (uint32_t cf_index = 0; !exec_ended; cf_index = cf_index_next) { + cf_index_next = cf_index + 1; + + const uint32_t* cf_pair = &ucode_[3 * (cf_index >> 1)]; + ucode::ControlFlowInstruction cf_instr; + if (cf_index & 1) { + cf_instr.dword_0 = (cf_pair[1] >> 16) | (cf_pair[2] << 16); + cf_instr.dword_1 = cf_pair[2] >> 16; + } else { + cf_instr.dword_0 = cf_pair[0]; + cf_instr.dword_1 = cf_pair[1] & 0xFFFF; + } + + ucode::ControlFlowOpcode cf_opcode = cf_instr.opcode(); + switch (cf_opcode) { + case ucode::ControlFlowOpcode::kNop: { + } break; + + case ucode::ControlFlowOpcode::kExec: + case ucode::ControlFlowOpcode::kExecEnd: + case ucode::ControlFlowOpcode::kCondExec: + case ucode::ControlFlowOpcode::kCondExecEnd: + case ucode::ControlFlowOpcode::kCondExecPred: + case ucode::ControlFlowOpcode::kCondExecPredEnd: + case ucode::ControlFlowOpcode::kCondExecPredClean: + case ucode::ControlFlowOpcode::kCondExecPredCleanEnd: { + ucode::ControlFlowExecInstruction cf_exec = + *reinterpret_cast( + &cf_instr); + + switch (cf_opcode) { + case ucode::ControlFlowOpcode::kCondExec: + case ucode::ControlFlowOpcode::kCondExecEnd: + case ucode::ControlFlowOpcode::kCondExecPredClean: + case ucode::ControlFlowOpcode::kCondExecPredCleanEnd: { + const ucode::ControlFlowCondExecInstruction cf_cond_exec = + *reinterpret_cast( + &cf_exec); + uint32_t bool_address = cf_cond_exec.bool_address(); + if (cf_cond_exec.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } break; + case ucode::ControlFlowOpcode::kCondExecPred: + case ucode::ControlFlowOpcode::kCondExecPredEnd: { + const 
ucode::ControlFlowCondExecPredInstruction cf_cond_exec_pred = + *reinterpret_cast< + const ucode::ControlFlowCondExecPredInstruction*>(&cf_exec); + if (cf_cond_exec_pred.condition() != state_.predicate) { + continue; + } + } break; + default: + break; + } + + for (uint32_t exec_index = 0; exec_index < cf_exec.count(); + ++exec_index) { + const uint32_t* exec_instruction = + &ucode_[3 * (cf_exec.address() + exec_index)]; + if ((cf_exec.sequence() >> (exec_index << 1)) & 0b01) { + const ucode::FetchInstruction& fetch_instr = + *reinterpret_cast( + exec_instruction); + if (fetch_instr.is_predicated() && + fetch_instr.predicate_condition() != state_.predicate) { + continue; + } + if (fetch_instr.opcode() == ucode::FetchOpcode::kVertexFetch) { + ExecuteVertexFetchInstruction(fetch_instr.vertex_fetch()); + } else { + // Not supporting texture fetching (very complex). + float zero_result[4] = {}; + StoreFetchResult(fetch_instr.dest(), + fetch_instr.is_dest_relative(), + fetch_instr.dest_swizzle(), zero_result); + } + } else { + const ucode::AluInstruction& alu_instr = + *reinterpret_cast( + exec_instruction); + if (alu_instr.is_predicated() && + alu_instr.predicate_condition() != state_.predicate) { + continue; + } + ExecuteAluInstruction(alu_instr); + } + } + + if (ucode::DoesControlFlowOpcodeEndShader(cf_opcode)) { + exec_ended = true; + } + } break; + + case ucode::ControlFlowOpcode::kLoopStart: { + ucode::ControlFlowLoopStartInstruction cf_loop_start = + *reinterpret_cast( + &cf_instr); + assert_true(state_.loop_stack_depth < 4); + if (++state_.loop_stack_depth > 4) { + cf_index_next = cf_loop_start.address(); + continue; + } + xenos::LoopConstant loop_constant = + loop_constants[cf_loop_start.loop_id()]; + state_.loop_constants[state_.loop_stack_depth] = loop_constant; + uint32_t& loop_iterator_ref = + state_.loop_iterators[state_.loop_stack_depth]; + if (!cf_loop_start.is_repeat()) { + loop_iterator_ref = 0; + } + if (loop_iterator_ref >= loop_constant.count) { + cf_index_next = cf_loop_start.address(); + continue; + } + ++state_.loop_stack_depth; + } break; + + case ucode::ControlFlowOpcode::kLoopEnd: { + assert_not_zero(state_.loop_stack_depth); + if (!state_.loop_stack_depth) { + continue; + } + assert_true(state_.loop_stack_depth <= 4); + if (state_.loop_stack_depth > 4) { + --state_.loop_stack_depth; + continue; + } + ucode::ControlFlowLoopEndInstruction cf_loop_end = + *reinterpret_cast( + &cf_instr); + xenos::LoopConstant loop_constant = + state_.loop_constants[state_.loop_stack_depth - 1]; + assert_true(loop_constant.value == + loop_constants[cf_loop_end.loop_id()].value); + uint32_t loop_iterator = + ++state_.loop_iterators[state_.loop_stack_depth - 1]; + if (loop_iterator < loop_constant.count && + (!cf_loop_end.is_predicated_break() || + cf_loop_end.condition() != state_.predicate)) { + cf_index_next = cf_loop_end.address(); + continue; + } + --state_.loop_stack_depth; + } break; + + case ucode::ControlFlowOpcode::kCondCall: { + assert_true(state_.call_stack_depth < 4); + if (state_.call_stack_depth >= 4) { + continue; + } + const ucode::ControlFlowCondCallInstruction cf_cond_call = + *reinterpret_cast( + &cf_instr); + if (!cf_cond_call.is_unconditional()) { + if (cf_cond_call.is_predicated()) { + if (cf_cond_call.condition() != state_.predicate) { + continue; + } + } else { + uint32_t bool_address = cf_cond_call.bool_address(); + if (cf_cond_call.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } + } + 
state_.call_return_addresses[state_.call_stack_depth++] = cf_index + 1; + cf_index_next = cf_cond_call.address(); + } break; + + case ucode::ControlFlowOpcode::kReturn: { + // No stack depth assertion - skipping the return is a well-defined + // behavior for `return` outside a function call. + if (!state_.call_stack_depth) { + continue; + } + cf_index_next = state_.call_return_addresses[--state_.call_stack_depth]; + } break; + + case ucode::ControlFlowOpcode::kCondJmp: { + const ucode::ControlFlowCondJmpInstruction cf_cond_jmp = + *reinterpret_cast( + &cf_instr); + if (!cf_cond_jmp.is_unconditional()) { + if (cf_cond_jmp.is_predicated()) { + if (cf_cond_jmp.condition() != state_.predicate) { + continue; + } + } else { + uint32_t bool_address = cf_cond_jmp.bool_address(); + if (cf_cond_jmp.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } + } + cf_index_next = cf_cond_jmp.address(); + } break; + + case ucode::ControlFlowOpcode::kAlloc: { + if (export_sink_) { + const ucode::ControlFlowAllocInstruction& cf_alloc = + *reinterpret_cast( + &cf_instr); + export_sink_->AllocExport(cf_alloc.alloc_type(), cf_alloc.size()); + } + } break; + + case ucode::ControlFlowOpcode::kMarkVsFetchDone: { + } break; + + default: + assert_unhandled_case(cf_opcode); + } + } +} + +const float* ShaderInterpreter::GetFloatConstant( + uint32_t address, bool is_relative, bool relative_address_is_a0) const { + static const float zero[4] = {}; + int32_t index = int32_t(address); + if (is_relative) { + index += relative_address_is_a0 ? state_.address_register + : state_.GetLoopAddress(); + } + if (index < 0) { + return zero; + } + auto base_and_size_minus_1 = register_file_.Get( + shader_type_ == xenos::ShaderType::kVertex ? XE_GPU_REG_SQ_VS_CONST + : XE_GPU_REG_SQ_PS_CONST); + if (uint32_t(index) > base_and_size_minus_1.size) { + return zero; + } + index += base_and_size_minus_1.base; + if (index >= 512) { + return zero; + } + return ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index].f32; +} + +void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { + // Vector operation. 
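Before the operand-fetching loops below, the per-component source modifier handling can be seen in isolation. This is a hedged standalone sketch; FlushDenormalAssumed is a stand-in whose behavior (denormals treated as signed zero) is an assumption, since the interpreter's own FlushDenormal helper is declared in the header rather than in this hunk:

#include <cstdint>
#include <cstring>

// Assumed stand-in for the interpreter's FlushDenormal: denormals become
// (signed) zero, everything else is passed through unchanged.
inline float FlushDenormalAssumed(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  if (!(bits & 0x7F800000u)) {
    bits &= 0x80000000u;
  }
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}

// Sketch: reading one ALU source component the way the loops below do -
// flush denormals, clear the sign bit for |abs|, then XOR in the negate bit.
inline float ReadAluSrcComponent(float value, bool abs, bool negate) {
  value = FlushDenormalAssumed(value);
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  bits &= ~(uint32_t(abs) << 31);
  bits ^= uint32_t(negate) << 31;
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}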
+ float vector_result[4] = {}; + ucode::AluVectorOpcode vector_opcode = instr.vector_opcode(); + const ucode::AluVectorOpcodeInfo& vector_opcode_info = + ucode::GetAluVectorOpcodeInfo(vector_opcode); + uint32_t vector_result_write_mask = instr.GetVectorOpResultWriteMask(); + if (vector_result_write_mask || vector_opcode_info.changed_state) { + float vector_operands[3][4]; + for (uint32_t i = 0; i < 3; ++i) { + if (!vector_opcode_info.operand_components_used[i]) { + continue; + } + const float* vector_src_ptr; + uint32_t vector_src_register = instr.src_reg(1 + i); + bool vector_src_absolute = false; + if (instr.src_is_temp(1 + i)) { + vector_src_ptr = GetTempRegister( + ucode::AluInstruction::src_temp_reg(vector_src_register), + ucode::AluInstruction::is_src_temp_relative(vector_src_register)); + vector_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( + vector_src_register); + } else { + vector_src_ptr = GetFloatConstant( + vector_src_register, instr.src_const_is_addressed(1 + i), + instr.is_const_address_register_relative()); + } + uint32_t vector_src_absolute_mask = + ~(uint32_t(vector_src_absolute) << 31); + uint32_t vector_src_negate_bit = uint32_t(instr.src_negate(1 + i)) << 31; + uint32_t vector_src_swizzle = instr.src_swizzle(1 + i); + for (uint32_t j = 0; j < 4; ++j) { + float vector_src_component = FlushDenormal( + vector_src_ptr[ucode::AluInstruction::GetSwizzledComponentIndex( + vector_src_swizzle, j)]); + *reinterpret_cast(&vector_src_component) = + (*reinterpret_cast(&vector_src_component) & + vector_src_absolute_mask) ^ + vector_src_negate_bit; + vector_operands[i][j] = vector_src_component; + } + } + + bool replicate_vector_result_x = false; + switch (vector_opcode) { + case ucode::AluVectorOpcode::kAdd: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] + vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kMul: { + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[i] = (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + } break; + case ucode::AluVectorOpcode::kMax: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] + ? vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kMin: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] < vector_operands[1][i] + ? 
vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kSeq: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] == vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSgt: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] > vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSge: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] >= vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSne: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] != vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kFrc: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + vector_operands[0][i] - std::floor(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kTrunc: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = std::trunc(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kFloor: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = std::floor(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kMad: { + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition rather than conditional assignment even for zero + // operands because +0 + -0 must be +0. + vector_result[i] = + ((vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f) + + vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndEq: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] == 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndGe: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndGt: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] > 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kDp4: { + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition even for zero operands because +0 + -0 must be + // +0. + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDp3: { + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 3; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition even for zero operands because +0 + -0 must be + // +0. + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDp2Add: { + // Doing the addition even for zero operands because +0 + -0 must be +0. + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 2; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? 
vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + vector_result[0] += vector_operands[2][0]; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kCube: { + // Operand [0] is .z_xy. + float x = vector_operands[0][2]; + float y = vector_operands[0][3]; + float z = vector_operands[0][0]; + float x_abs = std::abs(x), y_abs = std::abs(y), z_abs = std::abs(z); + // Result is T coordinate, S coordinate, 2 * major axis, face ID. + if (z_abs >= x_abs && z_abs >= y_abs) { + vector_result[0] = -y; + vector_result[1] = z < 0.0f ? -x : x; + vector_result[2] = z; + vector_result[3] = z < 0.0f ? 5.0f : 4.0f; + } else if (y_abs >= x_abs) { + vector_result[0] = y < 0.0f ? -z : z; + vector_result[1] = x; + vector_result[2] = y; + vector_result[3] = y < 0.0f ? 3.0f : 2.0f; + } else { + vector_result[0] = -y; + vector_result[1] = x < 0.0f ? z : -z; + vector_result[2] = x; + vector_result[3] = x < 0.0f ? 1.0f : 0.0f; + } + vector_result[2] *= 2.0f; + } break; + case ucode::AluVectorOpcode::kMax4: { + if (vector_operands[0][0] >= vector_operands[0][1] && + vector_operands[0][0] >= vector_operands[0][2] && + vector_operands[0][0] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][0]; + } else if (vector_operands[0][1] >= vector_operands[0][2] && + vector_operands[0][1] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][1]; + } else if (vector_operands[0][2] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][2]; + } else { + vector_result[0] = vector_operands[0][3]; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpEqPush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] == 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] == 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpNePush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] != 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] != 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpGtPush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] > 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] > 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpGePush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] >= 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] >= 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + // Not implementing pixel kill currently, the interpreter is currently + // used only for vertex shaders. 
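The "Direct3D 9 behavior" multiply referenced by the comments above reduces to one rule, shown here as a standalone sketch (hypothetical helper; operands are assumed to have had denormals flushed already):

// Sketch: zero (or a flushed denormal) times anything - including infinity
// and NaN - yields +0. In the mad/dp handlers above the addition is still
// performed unconditionally afterwards so that +0 + -0 produces +0.
inline float MulDirect3D9(float a, float b) {
  return (a && b) ? a * b : 0.0f;
}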
+ case ucode::AluVectorOpcode::kKillEq: { + vector_result[0] = + float(vector_operands[0][0] == vector_operands[1][0] || + vector_operands[0][1] == vector_operands[1][1] || + vector_operands[0][2] == vector_operands[1][2] || + vector_operands[0][3] == vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillGt: { + vector_result[0] = + float(vector_operands[0][0] > vector_operands[1][0] || + vector_operands[0][1] > vector_operands[1][1] || + vector_operands[0][2] > vector_operands[1][2] || + vector_operands[0][3] > vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillGe: { + vector_result[0] = + float(vector_operands[0][0] >= vector_operands[1][0] || + vector_operands[0][1] >= vector_operands[1][1] || + vector_operands[0][2] >= vector_operands[1][2] || + vector_operands[0][3] >= vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillNe: { + vector_result[0] = + float(vector_operands[0][0] != vector_operands[1][0] || + vector_operands[0][1] != vector_operands[1][1] || + vector_operands[0][2] != vector_operands[1][2] || + vector_operands[0][3] != vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDst: { + vector_result[0] = 1.0f; + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[1] = (vector_operands[0][1] && vector_operands[1][1]) + ? vector_operands[0][1] * vector_operands[1][1] + : 0.0f; + vector_result[2] = vector_operands[0][2]; + vector_result[3] = vector_operands[1][3]; + } break; + case ucode::AluVectorOpcode::kMaxA: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t(std::floor( + std::min(255.0f, std::max(-256.0f, vector_operands[0][3])) + 0.5f)); + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] + ? vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + default: { + assert_unhandled_case(vector_opcode); + } + } + if (replicate_vector_result_x) { + for (uint32_t i = 1; i < 4; ++i) { + vector_result[i] = vector_result[0]; + } + } + } + + // Scalar operation. + ucode::AluScalarOpcode scalar_opcode = instr.scalar_opcode(); + const ucode::AluScalarOpcodeInfo& scalar_opcode_info = + ucode::GetAluScalarOpcodeInfo(scalar_opcode); + float scalar_operands[2]; + uint32_t scalar_operand_component_count = 0; + bool scalar_src_absolute = false; + switch (scalar_opcode_info.operand_count) { + case 1: { + // r#/c#.w or r#/c#.wx. + const float* scalar_src_ptr; + uint32_t scalar_src_register = instr.src_reg(3); + if (instr.src_is_temp(3)) { + scalar_src_ptr = GetTempRegister( + ucode::AluInstruction::src_temp_reg(scalar_src_register), + ucode::AluInstruction::is_src_temp_relative(scalar_src_register)); + scalar_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( + scalar_src_register); + } else { + scalar_src_ptr = GetFloatConstant( + scalar_src_register, instr.src_const_is_addressed(3), + instr.is_const_address_register_relative()); + } + uint32_t scalar_src_swizzle = instr.src_swizzle(3); + scalar_operand_component_count = + scalar_opcode_info.single_operand_is_two_component ? 
2 : 1; + for (uint32_t i = 0; i < scalar_operand_component_count; ++i) { + scalar_operands[i] = + scalar_src_ptr[ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, (3 + i) & 3)]; + } + } break; + case 2: { + scalar_operand_component_count = 2; + uint32_t scalar_src_absolute_mask = + ~(uint32_t(instr.abs_constants()) << 31); + uint32_t scalar_src_negate_bit = uint32_t(instr.src_negate(3)) << 31; + uint32_t scalar_src_swizzle = instr.src_swizzle(3); + // c#.w. + scalar_operands[0] = + GetFloatConstant(instr.src_reg(3), instr.src_const_is_addressed(3), + instr.is_const_address_register_relative()) + [ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, 3)]; + // r#.x. + scalar_operands[1] = GetTempRegister( + instr.scalar_const_reg_op_src_temp_reg(), + false)[ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, 0)]; + } break; + } + if (scalar_operand_component_count) { + uint32_t scalar_src_absolute_mask = ~(uint32_t(scalar_src_absolute) << 31); + uint32_t scalar_src_negate_bit = uint32_t(instr.src_negate(3)) << 31; + for (uint32_t i = 0; i < scalar_operand_component_count; ++i) { + float scalar_operand = FlushDenormal(scalar_operands[i]); + *reinterpret_cast(&scalar_operand) = + (*reinterpret_cast(&scalar_operand) & + scalar_src_absolute_mask) ^ + scalar_src_negate_bit; + scalar_operands[i] = scalar_operand; + } + } + switch (scalar_opcode) { + case ucode::AluScalarOpcode::kAdds: + case ucode::AluScalarOpcode::kAddsc0: + case ucode::AluScalarOpcode::kAddsc1: { + state_.previous_scalar = scalar_operands[0] + scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kAddsPrev: { + state_.previous_scalar = scalar_operands[0] + state_.previous_scalar; + } break; + case ucode::AluScalarOpcode::kMuls: + case ucode::AluScalarOpcode::kMulsc0: + case ucode::AluScalarOpcode::kMulsc1: { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = (scalar_operands[0] && scalar_operands[1]) + ? scalar_operands[0] * scalar_operands[1] + : 0.0f; + } break; + case ucode::AluScalarOpcode::kMulsPrev: { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = (scalar_operands[0] && state_.previous_scalar) + ? scalar_operands[0] * state_.previous_scalar + : 0.0f; + } break; + case ucode::AluScalarOpcode::kMulsPrev2: { + if (state_.previous_scalar == -FLT_MAX || + !std::isfinite(state_.previous_scalar) || + !std::isfinite(scalar_operands[1]) || scalar_operands[1] <= 0.0f) { + state_.previous_scalar = -FLT_MAX; + } else { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = + (scalar_operands[0] && state_.previous_scalar) + ? scalar_operands[0] * state_.previous_scalar + : 0.0f; + } + } break; + case ucode::AluScalarOpcode::kMaxs: { + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kMins: { + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? 
scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSeqs: { + state_.previous_scalar = float(scalar_operands[0] == 0.0f); + } break; + case ucode::AluScalarOpcode::kSgts: { + state_.previous_scalar = float(scalar_operands[0] > 0.0f); + } break; + case ucode::AluScalarOpcode::kSges: { + state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + } break; + case ucode::AluScalarOpcode::kSnes: { + state_.previous_scalar = float(scalar_operands[0] != 0.0f); + } break; + case ucode::AluScalarOpcode::kFrcs: { + state_.previous_scalar = + scalar_operands[0] - std::floor(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kTruncs: { + state_.previous_scalar = std::trunc(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kFloors: { + state_.previous_scalar = std::floor(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kExp: { + state_.previous_scalar = std::exp2(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kLogc: { + state_.previous_scalar = std::log2(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kLog: { + state_.previous_scalar = std::log2(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kRcpc: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kRcpf: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -0.0f; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = 0.0f; + } + } break; + case ucode::AluScalarOpcode::kRcp: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + } break; + case ucode::AluScalarOpcode::kRsqc: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kRsqf: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -0.0f; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = 0.0f; + } + } break; + case ucode::AluScalarOpcode::kRsq: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kMaxAs: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t(std::floor( + std::min(255.0f, std::max(-256.0f, scalar_operands[0])) + 0.5f)); + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kMaxAsf: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t( + std::floor(std::min(255.0f, std::max(-256.0f, scalar_operands[0])))); + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? 
scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSubs: + case ucode::AluScalarOpcode::kSubsc0: + case ucode::AluScalarOpcode::kSubsc1: { + state_.previous_scalar = scalar_operands[0] - scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSubsPrev: { + state_.previous_scalar = scalar_operands[0] - state_.previous_scalar; + } break; + case ucode::AluScalarOpcode::kSetpEq: { + state_.predicate = scalar_operands[0] == 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpNe: { + state_.predicate = scalar_operands[0] != 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpGt: { + state_.predicate = scalar_operands[0] > 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpGe: { + state_.predicate = scalar_operands[0] >= 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpInv: { + state_.predicate = scalar_operands[0] == 1.0f; + state_.previous_scalar = + state_.predicate + ? 0.0f + : (scalar_operands[0] == 0.0f ? 1.0f : scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kSetpPop: { + float new_counter = scalar_operands[0] - 1.0f; + state_.predicate = new_counter <= 0.0f; + state_.previous_scalar = state_.predicate ? 0.0f : new_counter; + } break; + case ucode::AluScalarOpcode::kSetpClr: { + state_.predicate = false; + state_.previous_scalar = FLT_MAX; + } break; + case ucode::AluScalarOpcode::kSetpRstr: { + state_.predicate = scalar_operands[0] == 0.0f; + state_.previous_scalar = state_.predicate ? 0.0f : scalar_operands[0]; + } break; + // Not implementing pixel kill currently, the interpreter is currently used + // only for vertex shaders. + case ucode::AluScalarOpcode::kKillsEq: { + state_.previous_scalar = float(scalar_operands[0] == 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsGt: { + state_.previous_scalar = float(scalar_operands[0] > 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsGe: { + state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsNe: { + state_.previous_scalar = float(scalar_operands[0] != 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsOne: { + state_.previous_scalar = float(scalar_operands[0] == 1.0f); + } break; + case ucode::AluScalarOpcode::kSqrt: { + state_.previous_scalar = std::sqrt(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kSin: { + state_.previous_scalar = std::sin(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kCos: { + state_.previous_scalar = std::cos(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kRetainPrev: { + } break; + default: { + assert_unhandled_case(scalar_opcode); + } + } + + if (instr.vector_clamp()) { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = xe::saturate_unsigned(vector_result[i]); + } + } + float scalar_result = instr.scalar_clamp() + ? 
xe::saturate_unsigned(state_.previous_scalar)
+                            : state_.previous_scalar;
+
+  uint32_t scalar_result_write_mask = instr.GetScalarOpResultWriteMask();
+  if (instr.is_export()) {
+    if (export_sink_) {
+      float export_value[4];
+      uint32_t export_constant_1_mask = instr.GetConstant1WriteMask();
+      uint32_t export_mask =
+          vector_result_write_mask | scalar_result_write_mask |
+          instr.GetConstant0WriteMask() | export_constant_1_mask;
+      for (uint32_t i = 0; i < 4; ++i) {
+        uint32_t export_component_bit = UINT32_C(1) << i;
+        float export_component = 0.0f;
+        if (vector_result_write_mask & export_component_bit) {
+          export_component = vector_result[i];
+        } else if (scalar_result_write_mask & export_component_bit) {
+          export_component = scalar_result;
+        } else if (export_constant_1_mask & export_component_bit) {
+          export_component = 1.0f;
+        }
+        export_value[i] = export_component;
+      }
+      export_sink_->Export(ucode::ExportRegister(instr.vector_dest()),
+                           export_value, export_mask);
+    }
+  } else {
+    if (vector_result_write_mask) {
+      float* vector_dest =
+          GetTempRegister(instr.vector_dest(), instr.is_vector_dest_relative());
+      for (uint32_t i = 0; i < 4; ++i) {
+        if (vector_result_write_mask & (UINT32_C(1) << i)) {
+          vector_dest[i] = vector_result[i];
+        }
+      }
+    }
+    if (scalar_result_write_mask) {
+      float* scalar_dest =
+          GetTempRegister(instr.scalar_dest(), instr.is_scalar_dest_relative());
+      for (uint32_t i = 0; i < 4; ++i) {
+        if (scalar_result_write_mask & (UINT32_C(1) << i)) {
+          scalar_dest[i] = scalar_result;
+        }
+      }
+    }
+  }
+}
+
+void ShaderInterpreter::StoreFetchResult(uint32_t dest, bool is_dest_relative,
+                                         uint32_t swizzle, const float* value) {
+  float* dest_data = GetTempRegister(dest, is_dest_relative);
+  for (uint32_t i = 0; i < 4; ++i) {
+    ucode::FetchDestinationSwizzle component_swizzle =
+        ucode::GetFetchDestinationComponentSwizzle(swizzle, i);
+    switch (component_swizzle) {
+      case ucode::FetchDestinationSwizzle::kX:
+        dest_data[i] = value[0];
+        break;
+      case ucode::FetchDestinationSwizzle::kY:
+        dest_data[i] = value[1];
+        break;
+      case ucode::FetchDestinationSwizzle::kZ:
+        dest_data[i] = value[2];
+        break;
+      case ucode::FetchDestinationSwizzle::kW:
+        dest_data[i] = value[3];
+        break;
+      case ucode::FetchDestinationSwizzle::k1:
+        dest_data[i] = 1.0f;
+        break;
+      case ucode::FetchDestinationSwizzle::kKeep:
+        break;
+      default:
+        // ucode::FetchDestinationSwizzle::k0 or the invalid swizzle 6.
+        // TODO(Triang3l): Find the correct handling of the invalid swizzle 6.
+        assert_true(component_swizzle == ucode::FetchDestinationSwizzle::k0);
+        dest_data[i] = 0.0f;
+        break;
+    }
+  }
+}
+
+void ShaderInterpreter::ExecuteVertexFetchInstruction(
+    ucode::VertexFetchInstruction instr) {
+  // FIXME(Triang3l): Bit scan loops over components cause a link-time
+  // optimization internal error in Visual Studio 2019, mainly in the format
+  // unpacking. Using loops with up to 4 iterations here instead.
+
+  if (!instr.is_mini_fetch()) {
+    state_.vfetch_full_last = instr;
+  }
+
+  xenos::xe_gpu_vertex_fetch_t fetch_constant =
+      *reinterpret_cast<const xenos::xe_gpu_vertex_fetch_t*>(
+          &register_file_[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
+                          state_.vfetch_full_last.fetch_constant_index()]);
+
+  if (!instr.is_mini_fetch()) {
+    // Get the part of the address that depends on vfetch_full data.
+    uint32_t vertex_index = uint32_t(std::floor(
+        GetTempRegister(instr.src(),
+                        instr.is_src_relative())[instr.src_swizzle()] +
+        (instr.is_index_rounded() ? 0.5f : 0.0f)));
+    state_.vfetch_address_dwords =
+        instr.stride() * vertex_index + fetch_constant.address;
+  }
+
+  // TODO(Triang3l): Find the default values for unused components.
+  float result[4] = {};
+  uint32_t dest_swizzle = instr.dest_swizzle();
+  uint32_t used_result_components = 0b0000;
+  for (uint32_t i = 0; i < 4; ++i) {
+    uint32_t dest_component_swizzle = (dest_swizzle >> (3 * i)) & 0b111;
+    if (dest_component_swizzle <= 3) {
+      used_result_components |= UINT32_C(1) << dest_component_swizzle;
+    }
+  }
+  uint32_t needed_dwords = xenos::GetVertexFormatNeededWords(
+      instr.data_format(), used_result_components);
+  if (needed_dwords) {
+    uint32_t data[4] = {};
+    const uint32_t* memory_dwords =
+        reinterpret_cast<const uint32_t*>(memory_.physical_membase());
+    uint32_t buffer_end_dwords = fetch_constant.address + fetch_constant.size;
+    uint32_t dword_0_address_dwords =
+        uint32_t(int32_t(state_.vfetch_address_dwords) + instr.offset());
+    for (uint32_t i = 0; i < 4; ++i) {
+      if (!(needed_dwords & (UINT32_C(1) << i))) {
+        continue;
+      }
+      uint32_t dword_value = 0;
+      uint32_t dword_address_dwords = dword_0_address_dwords + i;
+      if (dword_address_dwords >= fetch_constant.address &&
+          dword_address_dwords < buffer_end_dwords) {
+        if (trace_writer_) {
+          trace_writer_->WriteMemoryRead(
+              sizeof(uint32_t) * dword_address_dwords, sizeof(uint32_t));
+        }
+        dword_value = xenos::GpuSwap(memory_dwords[dword_address_dwords],
+                                     fetch_constant.endian);
+      }
+      data[i] = dword_value;
+    }
+
+    uint32_t packed_components = 0b0000;
+    uint32_t packed_widths[4] = {}, packed_offsets[4] = {};
+    uint32_t packed_dwords[] = {data[0], data[0]};
+    switch (instr.data_format()) {
+      case xenos::VertexFormat::k_8_8_8_8: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] =
+            packed_widths[3] = 8;
+        packed_offsets[1] = 8;
+        packed_offsets[2] = 16;
+        packed_offsets[3] = 24;
+      } break;
+      case xenos::VertexFormat::k_2_10_10_10: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] = 10;
+        packed_widths[3] = 2;
+        packed_offsets[1] = 10;
+        packed_offsets[2] = 20;
+        packed_offsets[3] = 30;
+      } break;
+      case xenos::VertexFormat::k_10_11_11: {
+        packed_components = 0b0111;
+        packed_widths[0] = packed_widths[1] = 11;
+        packed_widths[2] = 10;
+        packed_offsets[1] = 11;
+        packed_offsets[2] = 22;
+      } break;
+      case xenos::VertexFormat::k_11_11_10: {
+        packed_components = 0b0111;
+        packed_widths[0] = 10;
+        packed_widths[1] = packed_widths[2] = 11;
+        packed_offsets[1] = 10;
+        packed_offsets[2] = 21;
+      } break;
+      case xenos::VertexFormat::k_16_16: {
+        packed_components = 0b0011;
+        packed_widths[0] = packed_widths[1] = 16;
+        packed_offsets[1] = 16;
+      } break;
+      case xenos::VertexFormat::k_16_16_16_16: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] =
+            packed_widths[3] = 16;
+        packed_offsets[1] = packed_offsets[3] = 16;
+        packed_dwords[1] = data[1];
+      } break;
+      case xenos::VertexFormat::k_16_16_16_16_FLOAT: {
+        if (used_result_components & 0b1000) {
+          result[3] = xe::xenos_half_to_float(uint16_t(data[1] >> 16));
+        }
+        if (used_result_components & 0b0100) {
+          result[2] = xe::xenos_half_to_float(uint16_t(data[1]));
+        }
+      }
+        [[fallthrough]];
+      case xenos::VertexFormat::k_16_16_FLOAT: {
+        if (used_result_components & 0b0010) {
+          result[1] = xe::xenos_half_to_float(uint16_t(data[0] >> 16));
+        }
+        if (used_result_components & 0b0001) {
+          result[0] = xe::xenos_half_to_float(uint16_t(data[0]));
+        }
+      } break;
+      case xenos::VertexFormat::k_32:
+      case xenos::VertexFormat::k_32_32:
+      case xenos::VertexFormat::k_32_32_32_32: {
+        if (instr.is_signed()) {
+          for (uint32_t i = 0; i < 4; ++i) {
+            result[i] = float(int32_t(data[i]));
+          }
+          if (instr.is_normalized()) {
+            if (instr.signed_rf_mode() ==
+                xenos::SignedRepeatingFractionMode::kNoZero) {
+              for (uint32_t i = 0; i < 4; ++i) {
+                result[i] = (result[i] + 0.5f) / 2147483647.5f;
+              }
+            } else {
+              for (uint32_t i = 0; i < 4; ++i) {
+                result[i] /= 2147483647.0f;
+                // No need to clamp to -1 if signed - the smallest value will
+                // be -2^31 / 2^31 due to rounding of the divisor to a float.
+              }
+            }
+          }
+        } else {
+          for (uint32_t i = 0; i < 4; ++i) {
+            result[i] = float(data[i]);
+          }
+          if (instr.is_normalized()) {
+            for (uint32_t i = 0; i < 4; ++i) {
+              result[i] /= 4294967295.0f;
+            }
+          }
+        }
+      } break;
+      case xenos::VertexFormat::k_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_32_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_32_FLOAT: {
+        for (uint32_t i = 0; i < 4; ++i) {
+          result[i] = *reinterpret_cast<const float*>(&data[i]);
+        }
+      } break;
+      default:
+        assert_unhandled_case(instr.data_format());
+        break;
+    }
+
+    packed_components &= used_result_components;
+    if (packed_components) {
+      if (instr.is_signed()) {
+        for (uint32_t i = 0; i < 4; ++i) {
+          if (!(packed_components & (UINT32_C(1) << i))) {
+            continue;
+          }
+          uint32_t packed_width = packed_widths[i];
+          result[i] = float(
+              int32_t(packed_dwords[i >> 1]
+                      << (32 - (packed_width + packed_offsets[i]))) >>
+              (32 - packed_width));
+        }
+        if (instr.is_normalized()) {
+          if (instr.signed_rf_mode() ==
+              xenos::SignedRepeatingFractionMode::kNoZero) {
+            for (uint32_t i = 0; i < 4; ++i) {
+              if (!(packed_components & (UINT32_C(1) << i))) {
+                continue;
+              }
+              result[i] = (result[i] + 0.5f) * 2.0f /
+                          float((UINT32_C(1) << packed_widths[i]) - 1);
+            }
+          } else {
+            for (uint32_t i = 0; i < 4; ++i) {
+              if (!(packed_components & (UINT32_C(1) << i))) {
+                continue;
+              }
+              result[i] = std::max(
+                  -1.0f,
+                  result[i] /
+                      float((UINT32_C(1) << (packed_widths[i] - 1)) - 1));
+            }
+          }
+        }
+      } else {
+        for (uint32_t i = 0; i < 4; ++i) {
+          if (!(packed_components & (UINT32_C(1) << i))) {
+            continue;
+          }
+          result[i] = float((packed_dwords[i >> 1] >> packed_offsets[i]) &
+                            ((UINT32_C(1) << packed_widths[i]) - 1));
+        }
+        if (instr.is_normalized()) {
+          for (uint32_t i = 0; i < 4; ++i) {
+            if (!(packed_components & (UINT32_C(1) << i))) {
+              continue;
+            }
+            result[i] /= float((UINT32_C(1) << packed_widths[i]) - 1);
+          }
+        }
+      }
+    }
+  }
+
+  int32_t exp_adjust = instr.exp_adjust();
+  if (exp_adjust) {
+    float exp_adjust_factor = std::ldexp(1.0f, exp_adjust);
+    for (uint32_t i = 0; i < 4; ++i) {
+      result[i] *= exp_adjust_factor;
+    }
+  }
+
+  StoreFetchResult(instr.dest(), instr.is_dest_relative(), instr.dest_swizzle(),
+                   result);
+}
+
+}  // namespace gpu
+}  // namespace xe
diff --git a/src/xenia/gpu/shader_interpreter.h b/src/xenia/gpu/shader_interpreter.h
new file mode 100644
index 000000000..6182acecf
--- /dev/null
+++ b/src/xenia/gpu/shader_interpreter.h
@@ -0,0 +1,149 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
+#define XENIA_GPU_SHADER_INTERPRETER_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "xenia/base/assert.h"
+#include "xenia/gpu/register_file.h"
+#include "xenia/gpu/shader.h"
+#include "xenia/gpu/trace_writer.h"
+#include "xenia/gpu/ucode.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/memory.h"
+
+namespace xe {
+namespace gpu {
+
+class ShaderInterpreter {
+ public:
+  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
+      : register_file_(register_file), memory_(memory) {}
+
+  class ExportSink {
+   public:
+    virtual ~ExportSink() = default;
+    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
+    virtual void Export(ucode::ExportRegister export_register,
+                        const float* value, uint32_t value_mask) {}
+  };
+
+  void SetTraceWriter(TraceWriter* new_trace_writer) {
+    trace_writer_ = new_trace_writer;
+  }
+
+  ExportSink* GetExportSink() const { return export_sink_; }
+  void SetExportSink(ExportSink* new_export_sink) {
+    export_sink_ = new_export_sink;
+  }
+
+  const float* temp_registers() const { return &temp_registers_[0][0]; }
+  float* temp_registers() { return &temp_registers_[0][0]; }
+
+  static bool CanInterpretShader(const Shader& shader) {
+    assert_true(shader.is_ucode_analyzed());
+    // Texture instructions are not very common in vertex shaders (and are not
+    // used in Direct3D 9's internal rectangles such as clears), and they are
+    // extremely complex, so they are not implemented.
+    if (shader.uses_texture_fetch_instruction_results()) {
+      return false;
+    }
+    return true;
+  }
+  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
+    shader_type_ = shader_type;
+    ucode_ = ucode;
+  }
+  void SetShader(const Shader& shader) {
+    assert_true(CanInterpretShader(shader));
+    SetShader(shader.type(), shader.ucode_dwords());
+  }
+
+  void Execute();
+
+ private:
+  struct State {
+    ucode::VertexFetchInstruction vfetch_full_last;
+    uint32_t vfetch_address_dwords;
+    float previous_scalar;
+    uint32_t call_stack_depth;
+    uint32_t call_return_addresses[4];
+    uint32_t loop_stack_depth;
+    xenos::LoopConstant loop_constants[4];
+    uint32_t loop_iterators[4];
+    int32_t address_register;
+    bool predicate;
+
+    void Reset() { std::memset(this, 0, sizeof(*this)); }
+
+    int32_t GetLoopAddress() const {
+      assert_true(loop_stack_depth && loop_stack_depth < 4);
+      if (!loop_stack_depth || loop_stack_depth >= 4) {
+        return 0;
+      }
+      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
+      // Clamp to the real range specified in the IPR2015-00325 sequencer
+      // specification.
+      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
+      return std::min(
+          INT32_C(256),
+          std::max(INT32_C(-256),
+                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
+                               loop_constant.step +
+                           loop_constant.start)));
+    }
+  };
+
+  static float FlushDenormal(float value) {
+    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
+    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
+    return *reinterpret_cast<const float*>(&bits);
+  }
+
+  const float* GetTempRegister(uint32_t address, bool is_relative) const {
+    return temp_registers_[(
+        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
+  }
+  // For simplicity (due to writability), not bounds-checking.
+  float* GetTempRegister(uint32_t address, bool is_relative) {
+    return temp_registers_[(
+        int32_t(address) + (is_relative ?
state_.GetLoopAddress() : 0) & 63)]; + } + const float* GetFloatConstant(uint32_t address, bool is_relative, + bool relative_address_is_a0) const; + + void ExecuteAluInstruction(ucode::AluInstruction instr); + void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle, + const float* value); + void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr); + + const RegisterFile& register_file_; + const Memory& memory_; + + TraceWriter* trace_writer_ = nullptr; + + ExportSink* export_sink_ = nullptr; + + xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex; + const uint32_t* ucode_ = nullptr; + + // For both inputs and locals. + float temp_registers_[64][4]; + + State state_; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SHADER_INTERPRETER_H_ diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 4e4bce854..4f4c1736c 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, GatherOperandInformation(binding.fetch_instr.operands[i]); } + if (binding.fetch_instr.result.GetUsedResultComponents()) { + uses_texture_fetch_instruction_results_ = true; + } + switch (op.opcode()) { case FetchOpcode::kSetTextureLod: case FetchOpcode::kSetTextureGradientsHorz: