From e6fa0ad13993282a891009ff4a757d5dcc46fd7c Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Sat, 19 Dec 2020 16:14:54 +0300
Subject: [PATCH] [GPU] Dynamic r# count via shader modifications + refactoring

---
 .../gpu/d3d12/d3d12_command_processor.cc      |  189 +-
 src/xenia/gpu/d3d12/d3d12_command_processor.h |    2 +-
 src/xenia/gpu/d3d12/d3d12_shader.cc           |    2 +-
 src/xenia/gpu/d3d12/d3d12_shader.h            |    4 +-
 src/xenia/gpu/d3d12/pipeline_cache.cc         |  360 ++--
 src/xenia/gpu/d3d12/pipeline_cache.h          |   42 +-
 src/xenia/gpu/d3d12/render_target_cache.cc    |    6 +-
 src/xenia/gpu/d3d12/render_target_cache.h     |    2 +-
 src/xenia/gpu/d3d12/texture_cache.cc          |    8 +-
 src/xenia/gpu/d3d12/texture_cache.h           |    4 +-
 src/xenia/gpu/dxbc_shader.cc                  |    2 +-
 src/xenia/gpu/dxbc_shader.h                   |   26 +-
 src/xenia/gpu/dxbc_shader_translator.cc       |  262 +--
 src/xenia/gpu/dxbc_shader_translator.h        |   37 +-
 .../gpu/dxbc_shader_translator_memexport.cc   |    2 +-
 src/xenia/gpu/dxbc_shader_translator_om.cc    |   72 +-
 src/xenia/gpu/registers.h                     |    1 +
 src/xenia/gpu/shader.cc                       |    8 +-
 src/xenia/gpu/shader.h                        |  234 ++-
 src/xenia/gpu/shader_compiler_main.cc         |   19 +-
 src/xenia/gpu/shader_translator.cc            | 1792 ++++++++---------
 src/xenia/gpu/shader_translator.h             |  217 +-
 src/xenia/gpu/spirv_shader_translator.cc      |   26 +-
 src/xenia/gpu/spirv_shader_translator.h       |   16 +-
 src/xenia/gpu/ucode.h                         |   11 +-
 src/xenia/gpu/vulkan/pipeline_cache.cc        |   17 +-
 src/xenia/gpu/vulkan/pipeline_cache.h         |    6 +-
 src/xenia/gpu/vulkan/vulkan_shader.cc         |    2 +-
 src/xenia/gpu/vulkan/vulkan_shader.h          |    4 +-
 src/xenia/gpu/xenos.h                         |   27 -
 30 files changed, 1684 insertions(+), 1716 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index f6af89881..95744b49c 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -99,14 +99,11 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) {
 }
 
 uint32_t D3D12CommandProcessor::GetCurrentColorMask(
-    const Shader* pixel_shader) const {
-  if (pixel_shader == nullptr) {
-    return 0;
-  }
+    uint32_t shader_writes_color_targets) const {
   auto& regs = *register_file_;
   uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF;
   for (uint32_t i = 0; i < 4; ++i) {
-    if (!pixel_shader->writes_color_target(i)) {
+    if (!(shader_writes_color_targets & (1 << i))) {
       color_mask &= ~(0xF << (i * 4));
     }
   }
@@ -167,14 +164,18 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
       tessellated ? D3D12_SHADER_VISIBILITY_DOMAIN
                   : D3D12_SHADER_VISIBILITY_VERTEX;
 
-  uint32_t texture_count_vertex, sampler_count_vertex;
-  vertex_shader->GetTextureBindings(texture_count_vertex);
-  vertex_shader->GetSamplerBindings(sampler_count_vertex);
-  uint32_t texture_count_pixel = 0, sampler_count_pixel = 0;
-  if (pixel_shader != nullptr) {
-    pixel_shader->GetTextureBindings(texture_count_pixel);
-    pixel_shader->GetSamplerBindings(sampler_count_pixel);
-  }
+  uint32_t texture_count_vertex =
+      uint32_t(vertex_shader->GetTextureBindingsAfterTranslation().size());
+  uint32_t sampler_count_vertex =
+      uint32_t(vertex_shader->GetSamplerBindingsAfterTranslation().size());
+  uint32_t texture_count_pixel =
+      pixel_shader
+          ? uint32_t(pixel_shader->GetTextureBindingsAfterTranslation().size())
+          : 0;
+  uint32_t sampler_count_pixel =
+      pixel_shader
+          ? uint32_t(pixel_shader->GetSamplerBindingsAfterTranslation().size())
+          : 0;
 
   // Better put the pixel texture/sampler in the lower bits probably because it
   // changes often.
@@ -383,33 +384,26 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
 uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices(
     const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
     RootBindfulExtraParameterIndices& indices_out) {
-  uint32_t texture_count_pixel = 0, sampler_count_pixel = 0;
-  if (pixel_shader != nullptr) {
-    pixel_shader->GetTextureBindings(texture_count_pixel);
-    pixel_shader->GetSamplerBindings(sampler_count_pixel);
-  }
-  uint32_t texture_count_vertex, sampler_count_vertex;
-  vertex_shader->GetTextureBindings(texture_count_vertex);
-  vertex_shader->GetSamplerBindings(sampler_count_vertex);
-
   uint32_t index = kRootParameter_Bindful_Count_Base;
-  if (texture_count_pixel != 0) {
+  if (pixel_shader &&
+      !pixel_shader->GetTextureBindingsAfterTranslation().empty()) {
     indices_out.textures_pixel = index++;
   } else {
     indices_out.textures_pixel = RootBindfulExtraParameterIndices::kUnavailable;
   }
-  if (sampler_count_pixel != 0) {
+  if (pixel_shader &&
+      !pixel_shader->GetSamplerBindingsAfterTranslation().empty()) {
     indices_out.samplers_pixel = index++;
   } else {
     indices_out.samplers_pixel = RootBindfulExtraParameterIndices::kUnavailable;
   }
-  if (texture_count_vertex != 0) {
+  if (!vertex_shader->GetTextureBindingsAfterTranslation().empty()) {
     indices_out.textures_vertex = index++;
   } else {
     indices_out.textures_vertex =
         RootBindfulExtraParameterIndices::kUnavailable;
   }
-  if (sampler_count_vertex != 0) {
+  if (!vertex_shader->GetSamplerBindingsAfterTranslation().empty()) {
     indices_out.samplers_vertex = index++;
   } else {
     indices_out.samplers_vertex =
@@ -1839,10 +1833,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     // Need a pixel shader in normal color mode.
     return false;
   }
+  // Gather shader ucode information to get the color mask, which is needed by
+  // the render target cache, and memexport configuration, and also get the
+  // current shader modification bits.
   DxbcShaderTranslator::Modification vertex_shader_modification;
   DxbcShaderTranslator::Modification pixel_shader_modification;
-  if (!pipeline_cache_->GetCurrentShaderModifications(
-          vertex_shader_modification, pixel_shader_modification)) {
+  if (!pipeline_cache_->AnalyzeShaderUcodeAndGetCurrentModifications(
+          vertex_shader, pixel_shader, vertex_shader_modification,
+          pixel_shader_modification)) {
     return false;
   }
   D3D12Shader::D3D12Translation* vertex_shader_translation =
@@ -1854,13 +1852,6 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
                          pixel_shader->GetOrCreateTranslation(
                              pixel_shader_modification.value))
                    : nullptr;
-  // Translate the shaders now to get memexport configuration and color mask,
-  // which is needed by the render target cache, and also to get used textures
-  // and samplers.
-  if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader_translation,
-                                                pixel_shader_translation)) {
-    return false;
-  }
   bool tessellated = vertex_shader_modification.host_vertex_shader_type !=
                      Shader::HostVertexShaderType::kVertex;
 
@@ -1889,7 +1880,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   BeginSubmission(true);
 
   // Set up the render targets - this may bind pipelines.
-  if (!render_target_cache_->UpdateRenderTargets(pixel_shader)) {
+  uint32_t pixel_shader_writes_color_targets =
+      pixel_shader ? pixel_shader->writes_color_targets() : 0;
+  if (!render_target_cache_->UpdateRenderTargets(
+          pixel_shader_writes_color_targets)) {
     return false;
   }
   const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets =
@@ -1958,13 +1952,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     line_loop_closing_index = 0;
   }
 
-  // Update the textures - this may bind pipelines.
-  uint32_t used_texture_mask =
-      vertex_shader->GetUsedTextureMask() |
-      (pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0);
-  texture_cache_->RequestTextures(used_texture_mask);
-
-  // Create the pipeline if needed and bind it.
+  // Translate the shaders and create the pipeline if needed.
   void* pipeline_handle;
   ID3D12RootSignature* root_signature;
   if (!pipeline_cache_->ConfigurePipeline(
@@ -1974,6 +1962,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
           pipeline_render_targets, &pipeline_handle, &root_signature)) {
     return false;
   }
+
+  // Update the textures - this may bind pipelines.
+  uint32_t used_texture_mask =
+      vertex_shader->GetUsedTextureMaskAfterTranslation() |
+      (pixel_shader != nullptr
+           ? pixel_shader->GetUsedTextureMaskAfterTranslation()
+           : 0);
+  texture_cache_->RequestTextures(used_texture_mask);
+
+  // Bind the pipeline after configuring it and doing everything that may bind
+  // other pipelines.
   if (current_cached_pipeline_ != pipeline_handle) {
     deferred_command_list_.SetPipelineStateHandle(
         reinterpret_cast<void*>(pipeline_handle));
@@ -2026,7 +2025,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       memexport_used, primitive_polygonal, line_loop_closing_index,
       indexed ? index_buffer_info->endianness : xenos::Endian::kNone,
       viewport_info, pixel_size_x, pixel_size_y, used_texture_mask,
-      GetCurrentColorMask(pixel_shader), pipeline_render_targets);
+      pixel_shader ? GetCurrentColorMask(pixel_shader->writes_color_targets())
+                   : 0,
+      pipeline_render_targets);
 
   // Update constant buffers, descriptors and root parameters.
   if (!UpdateBindings(vertex_shader, pixel_shader, root_signature)) {
@@ -2089,9 +2090,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   MemExportRange memexport_ranges[512];
   uint32_t memexport_range_count = 0;
   if (memexport_used_vertex) {
-    const std::vector<uint32_t>& memexport_stream_constants_vertex =
-        vertex_shader->memexport_stream_constants();
-    for (uint32_t constant_index : memexport_stream_constants_vertex) {
+    for (uint32_t constant_index :
+         vertex_shader->memexport_stream_constants()) {
       const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
           XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
       if (memexport_stream.index_count == 0) {
@@ -2132,9 +2132,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     }
   }
   if (memexport_used_pixel) {
-    const std::vector<uint32_t>& memexport_stream_constants_pixel =
-        pixel_shader->memexport_stream_constants();
-    for (uint32_t constant_index : memexport_stream_constants_pixel) {
+    for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
       const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
           XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
       if (memexport_stream.index_count == 0) {
@@ -3588,20 +3586,21 @@ bool D3D12CommandProcessor::UpdateBindings(
       vertex_shader->GetTextureBindingLayoutUserUID();
   size_t sampler_layout_uid_vertex =
       vertex_shader->GetSamplerBindingLayoutUserUID();
-  uint32_t texture_count_vertex, sampler_count_vertex;
-  const D3D12Shader::TextureBinding* textures_vertex =
-      vertex_shader->GetTextureBindings(texture_count_vertex);
-  const D3D12Shader::SamplerBinding* samplers_vertex =
-      vertex_shader->GetSamplerBindings(sampler_count_vertex);
+  const std::vector<D3D12Shader::TextureBinding>& textures_vertex =
+      vertex_shader->GetTextureBindingsAfterTranslation();
+  const std::vector<D3D12Shader::SamplerBinding>& samplers_vertex =
+      vertex_shader->GetSamplerBindingsAfterTranslation();
+  size_t texture_count_vertex = textures_vertex.size();
+  size_t sampler_count_vertex = samplers_vertex.size();
   if (sampler_count_vertex) {
     if (current_sampler_layout_uid_vertex_ != sampler_layout_uid_vertex) {
       current_sampler_layout_uid_vertex_ = sampler_layout_uid_vertex;
       cbuffer_binding_descriptor_indices_vertex_.up_to_date = false;
       bindful_samplers_written_vertex_ = false;
     }
-    current_samplers_vertex_.resize(std::max(current_samplers_vertex_.size(),
-                                             size_t(sampler_count_vertex)));
-    for (uint32_t i = 0; i < sampler_count_vertex; ++i) {
+    current_samplers_vertex_.resize(
+        std::max(current_samplers_vertex_.size(), sampler_count_vertex));
+    for (size_t i = 0; i < sampler_count_vertex; ++i) {
       TextureCache::SamplerParameters parameters =
           texture_cache_->GetSamplerParameters(samplers_vertex[i]);
       if (current_samplers_vertex_[i] != parameters) {
@@ -3615,14 +3614,16 @@ bool D3D12CommandProcessor::UpdateBindings(
   // Get textures and samplers used by the pixel shader, check if the last used
   // samplers are compatible and update them.
   size_t texture_layout_uid_pixel, sampler_layout_uid_pixel;
-  uint32_t texture_count_pixel, sampler_count_pixel;
-  const D3D12Shader::TextureBinding* textures_pixel;
-  const D3D12Shader::SamplerBinding* samplers_pixel;
+  const std::vector<D3D12Shader::TextureBinding>* textures_pixel;
+  const std::vector<D3D12Shader::SamplerBinding>* samplers_pixel;
+  size_t texture_count_pixel, sampler_count_pixel;
   if (pixel_shader != nullptr) {
     texture_layout_uid_pixel = pixel_shader->GetTextureBindingLayoutUserUID();
     sampler_layout_uid_pixel = pixel_shader->GetSamplerBindingLayoutUserUID();
-    textures_pixel = pixel_shader->GetTextureBindings(texture_count_pixel);
-    samplers_pixel = pixel_shader->GetSamplerBindings(sampler_count_pixel);
+    textures_pixel = &pixel_shader->GetTextureBindingsAfterTranslation();
+    texture_count_pixel = textures_pixel->size();
+    samplers_pixel = &pixel_shader->GetSamplerBindingsAfterTranslation();
+    sampler_count_pixel = samplers_pixel->size();
     if (sampler_count_pixel) {
       if (current_sampler_layout_uid_pixel_ != sampler_layout_uid_pixel) {
         current_sampler_layout_uid_pixel_ = sampler_layout_uid_pixel;
@@ -3633,7 +3634,7 @@ bool D3D12CommandProcessor::UpdateBindings(
                                               size_t(sampler_count_pixel)));
       for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
         TextureCache::SamplerParameters parameters =
-            texture_cache_->GetSamplerParameters(samplers_pixel[i]);
+            texture_cache_->GetSamplerParameters((*samplers_pixel)[i]);
         if (current_samplers_pixel_[i] != parameters) {
           current_samplers_pixel_[i] = parameters;
           cbuffer_binding_descriptor_indices_pixel_.up_to_date = false;
@@ -3663,7 +3664,7 @@ bool D3D12CommandProcessor::UpdateBindings(
         cbuffer_binding_descriptor_indices_vertex_.up_to_date &&
         (current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
          !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_vertex_.data(), textures_vertex,
+             current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
              texture_count_vertex))) {
       cbuffer_binding_descriptor_indices_vertex_.up_to_date = false;
     }
@@ -3671,7 +3672,7 @@ bool D3D12CommandProcessor::UpdateBindings(
         cbuffer_binding_descriptor_indices_pixel_.up_to_date &&
         (current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
          !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_pixel_.data(), textures_pixel,
+             current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
              texture_count_pixel))) {
       cbuffer_binding_descriptor_indices_pixel_.up_to_date = false;
     }
@@ -3804,15 +3805,14 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t* descriptor_indices =
           reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request(
               frame_current_,
-              std::max(texture_count_vertex + sampler_count_vertex,
-                       uint32_t(1)) *
+              std::max(texture_count_vertex + sampler_count_vertex, size_t(1)) *
                   sizeof(uint32_t),
               D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
               &cbuffer_binding_descriptor_indices_vertex_.address));
       if (!descriptor_indices) {
         return false;
       }
-      for (uint32_t i = 0; i < texture_count_vertex; ++i) {
+      for (size_t i = 0; i < texture_count_vertex; ++i) {
         const D3D12Shader::TextureBinding& texture = textures_vertex[i];
         descriptor_indices[texture.bindless_descriptor_index] =
             texture_cache_->GetActiveTextureBindlessSRVIndex(texture) -
@@ -3824,11 +3824,11 @@ bool D3D12CommandProcessor::UpdateBindings(
             std::max(current_texture_srv_keys_vertex_.size(),
                      size_t(texture_count_vertex)));
         texture_cache_->WriteActiveTextureSRVKeys(
-            current_texture_srv_keys_vertex_.data(), textures_vertex,
+            current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
             texture_count_vertex);
       }
       // Current samplers have already been updated.
-      for (uint32_t i = 0; i < sampler_count_vertex; ++i) {
+      for (size_t i = 0; i < sampler_count_vertex; ++i) {
         descriptor_indices[samplers_vertex[i].bindless_descriptor_index] =
             current_sampler_bindless_indices_vertex_[i];
       }
@@ -3841,15 +3841,15 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t* descriptor_indices =
           reinterpret_cast<uint32_t*>(constant_buffer_pool_->Request(
               frame_current_,
-              std::max(texture_count_pixel + sampler_count_pixel, uint32_t(1)) *
+              std::max(texture_count_pixel + sampler_count_pixel, size_t(1)) *
                   sizeof(uint32_t),
               D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, nullptr, nullptr,
               &cbuffer_binding_descriptor_indices_pixel_.address));
       if (!descriptor_indices) {
         return false;
       }
-      for (uint32_t i = 0; i < texture_count_pixel; ++i) {
-        const D3D12Shader::TextureBinding& texture = textures_pixel[i];
+      for (size_t i = 0; i < texture_count_pixel; ++i) {
+        const D3D12Shader::TextureBinding& texture = (*textures_pixel)[i];
         descriptor_indices[texture.bindless_descriptor_index] =
             texture_cache_->GetActiveTextureBindlessSRVIndex(texture) -
             uint32_t(SystemBindlessView::kUnboundedSRVsStart);
@@ -3860,12 +3860,12 @@ bool D3D12CommandProcessor::UpdateBindings(
             std::max(current_texture_srv_keys_pixel_.size(),
                      size_t(texture_count_pixel)));
         texture_cache_->WriteActiveTextureSRVKeys(
-            current_texture_srv_keys_pixel_.data(), textures_pixel,
+            current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
             texture_count_pixel);
       }
       // Current samplers have already been updated.
-      for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
-        descriptor_indices[samplers_pixel[i].bindless_descriptor_index] =
+      for (size_t i = 0; i < sampler_count_pixel; ++i) {
+        descriptor_indices[(*samplers_pixel)[i].bindless_descriptor_index] =
             current_sampler_bindless_indices_pixel_[i];
       }
       cbuffer_binding_descriptor_indices_pixel_.up_to_date = true;
@@ -3884,14 +3884,14 @@ bool D3D12CommandProcessor::UpdateBindings(
         (!bindful_textures_written_vertex_ ||
          current_texture_layout_uid_vertex_ != texture_layout_uid_vertex ||
          !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_vertex_.data(), textures_vertex,
+             current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
              texture_count_vertex));
     bool write_textures_pixel =
         texture_count_pixel &&
         (!bindful_textures_written_pixel_ ||
          current_texture_layout_uid_pixel_ != texture_layout_uid_pixel ||
          !texture_cache_->AreActiveTextureSRVKeysUpToDate(
-             current_texture_srv_keys_pixel_.data(), textures_pixel,
+             current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
              texture_count_pixel));
     bool write_samplers_vertex =
         sampler_count_vertex && !bindful_samplers_written_vertex_;
@@ -3899,7 +3899,7 @@ bool D3D12CommandProcessor::UpdateBindings(
         sampler_count_pixel && !bindful_samplers_written_pixel_;
 
     // Allocate the descriptors.
-    uint32_t view_count_partial_update = 0;
+    size_t view_count_partial_update = 0;
     if (write_textures_vertex) {
       view_count_partial_update += texture_count_vertex;
     }
@@ -3907,7 +3907,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       view_count_partial_update += texture_count_pixel;
     }
     // All the constants + shared memory SRV and UAV + textures.
-    uint32_t view_count_full_update =
+    size_t view_count_full_update =
         2 + texture_count_vertex + texture_count_pixel;
     if (edram_rov_used_) {
       // + EDRAM UAV.
@@ -3917,14 +3917,14 @@ bool D3D12CommandProcessor::UpdateBindings(
     D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle;
     uint32_t descriptor_size_view = provider.GetViewDescriptorSize();
     uint64_t view_heap_index = RequestViewBindfulDescriptors(
-        draw_view_bindful_heap_index_, view_count_partial_update,
-        view_count_full_update, view_cpu_handle, view_gpu_handle);
+        draw_view_bindful_heap_index_, uint32_t(view_count_partial_update),
+        uint32_t(view_count_full_update), view_cpu_handle, view_gpu_handle);
     if (view_heap_index ==
         ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
       XELOGE("Failed to allocate view descriptors");
       return false;
     }
-    uint32_t sampler_count_partial_update = 0;
+    size_t sampler_count_partial_update = 0;
     if (write_samplers_vertex) {
       sampler_count_partial_update += sampler_count_vertex;
     }
@@ -3938,9 +3938,10 @@ bool D3D12CommandProcessor::UpdateBindings(
         ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid;
     if (sampler_count_vertex != 0 || sampler_count_pixel != 0) {
       sampler_heap_index = RequestSamplerBindfulDescriptors(
-          draw_sampler_bindful_heap_index_, sampler_count_partial_update,
-          sampler_count_vertex + sampler_count_pixel, sampler_cpu_handle,
-          sampler_gpu_handle);
+          draw_sampler_bindful_heap_index_,
+          uint32_t(sampler_count_partial_update),
+          uint32_t(sampler_count_vertex + sampler_count_pixel),
+          sampler_cpu_handle, sampler_gpu_handle);
       if (sampler_heap_index ==
           ui::d3d12::D3D12DescriptorHeapPool::kHeapIndexInvalid) {
         XELOGE("Failed to allocate sampler descriptors");
@@ -3985,7 +3986,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       assert_true(current_graphics_root_bindful_extras_.textures_vertex !=
                   RootBindfulExtraParameterIndices::kUnavailable);
       gpu_handle_textures_vertex_ = view_gpu_handle;
-      for (uint32_t i = 0; i < texture_count_vertex; ++i) {
+      for (size_t i = 0; i < texture_count_vertex; ++i) {
         texture_cache_->WriteActiveTextureBindfulSRV(textures_vertex[i],
                                                      view_cpu_handle);
         view_cpu_handle.ptr += descriptor_size_view;
@@ -3996,7 +3997,7 @@ bool D3D12CommandProcessor::UpdateBindings(
           std::max(current_texture_srv_keys_vertex_.size(),
                    size_t(texture_count_vertex)));
       texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_vertex_.data(), textures_vertex,
+          current_texture_srv_keys_vertex_.data(), textures_vertex.data(),
           texture_count_vertex);
       bindful_textures_written_vertex_ = true;
       current_graphics_root_up_to_date_ &=
@@ -4006,8 +4007,8 @@ bool D3D12CommandProcessor::UpdateBindings(
       assert_true(current_graphics_root_bindful_extras_.textures_pixel !=
                   RootBindfulExtraParameterIndices::kUnavailable);
       gpu_handle_textures_pixel_ = view_gpu_handle;
-      for (uint32_t i = 0; i < texture_count_pixel; ++i) {
-        texture_cache_->WriteActiveTextureBindfulSRV(textures_pixel[i],
+      for (size_t i = 0; i < texture_count_pixel; ++i) {
+        texture_cache_->WriteActiveTextureBindfulSRV((*textures_pixel)[i],
                                                      view_cpu_handle);
         view_cpu_handle.ptr += descriptor_size_view;
         view_gpu_handle.ptr += descriptor_size_view;
@@ -4016,7 +4017,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       current_texture_srv_keys_pixel_.resize(std::max(
           current_texture_srv_keys_pixel_.size(), size_t(texture_count_pixel)));
       texture_cache_->WriteActiveTextureSRVKeys(
-          current_texture_srv_keys_pixel_.data(), textures_pixel,
+          current_texture_srv_keys_pixel_.data(), textures_pixel->data(),
           texture_count_pixel);
       bindful_textures_written_pixel_ = true;
       current_graphics_root_up_to_date_ &=
@@ -4026,7 +4027,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       assert_true(current_graphics_root_bindful_extras_.samplers_vertex !=
                   RootBindfulExtraParameterIndices::kUnavailable);
       gpu_handle_samplers_vertex_ = sampler_gpu_handle;
-      for (uint32_t i = 0; i < sampler_count_vertex; ++i) {
+      for (size_t i = 0; i < sampler_count_vertex; ++i) {
         texture_cache_->WriteSampler(current_samplers_vertex_[i],
                                      sampler_cpu_handle);
         sampler_cpu_handle.ptr += descriptor_size_sampler;
@@ -4041,7 +4042,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       assert_true(current_graphics_root_bindful_extras_.samplers_pixel !=
                   RootBindfulExtraParameterIndices::kUnavailable);
       gpu_handle_samplers_pixel_ = sampler_gpu_handle;
-      for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
+      for (size_t i = 0; i < sampler_count_pixel; ++i) {
         texture_cache_->WriteSampler(current_samplers_pixel_[i],
                                      sampler_cpu_handle);
         sampler_cpu_handle.ptr += descriptor_size_sampler;
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h
index a9181f1c3..fc72433fc 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -89,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor {
   // there are 4 render targets bound with the same EDRAM base (clearly not
   // correct usage), but the shader only clears 1, and then EDRAM buffer stores
   // conflict with each other.
-  uint32_t GetCurrentColorMask(const Shader* pixel_shader) const;
+  uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const;
 
   void PushTransitionBarrier(
       ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
diff --git a/src/xenia/gpu/d3d12/d3d12_shader.cc b/src/xenia/gpu/d3d12/d3d12_shader.cc
index 672f1e37d..eef4ca7de 100644
--- a/src/xenia/gpu/d3d12/d3d12_shader.cc
+++ b/src/xenia/gpu/d3d12/d3d12_shader.cc
@@ -99,7 +99,7 @@ void D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil(
 }
 
 Shader::Translation* D3D12Shader::CreateTranslationInstance(
-    uint32_t modification) {
+    uint64_t modification) {
   return new D3D12Translation(*this, modification);
 }
 
diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h
index 384e48a8a..b64681dc7 100644
--- a/src/xenia/gpu/d3d12/d3d12_shader.h
+++ b/src/xenia/gpu/d3d12/d3d12_shader.h
@@ -23,7 +23,7 @@ class D3D12Shader : public DxbcShader {
  public:
   class D3D12Translation : public DxbcTranslation {
    public:
-    D3D12Translation(D3D12Shader& shader, uint32_t modification)
+    D3D12Translation(D3D12Shader& shader, uint64_t modification)
         : DxbcTranslation(shader, modification) {}
 
     void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider,
@@ -60,7 +60,7 @@ class D3D12Shader : public DxbcShader {
   }
 
  protected:
-  Translation* CreateTranslationInstance(uint32_t modification) override;
+  Translation* CreateTranslationInstance(uint64_t modification) override;
 
  private:
   std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT;
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc
index cc9f5c9be..c29dd4c0d 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@@ -18,6 +18,7 @@
 #include <mutex>
 #include <set>
 #include <utility>
+#include <vector>
 
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/assert.h"
@@ -29,6 +30,7 @@
 #include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
 #include "xenia/base/string.h"
+#include "xenia/base/string_buffer.h"
 #include "xenia/base/xxhash.h"
 #include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include "xenia/gpu/gpu_flags.h"
@@ -265,7 +267,7 @@ void PipelineCache::InitializeShaderStorage(
   // collect used shader modifications to translate.
   std::vector<PipelineStoredDescription> pipeline_stored_descriptions;
   // <Shader hash, modification bits>.
-  std::set<std::pair<uint64_t, uint32_t>> shader_translations_needed;
+  std::set<std::pair<uint64_t, uint64_t>> shader_translations_needed;
   auto pipeline_storage_file_path =
       shader_storage_shareable_root /
       fmt::format("{:08X}.{}.d3d12.xpso", title_id,
@@ -292,7 +294,6 @@ void PipelineCache::InitializeShaderStorage(
     uint32_t magic;
     uint32_t magic_api;
     uint32_t version_swapped;
-    uint32_t device_features;
   } pipeline_storage_file_header;
   if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
             1, pipeline_storage_file_) &&
@@ -331,6 +332,9 @@ void PipelineCache::InitializeShaderStorage(
           pipeline_stored_descriptions.resize(i);
           break;
         }
+        // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported
+        // device features (to keep the cache files mostly shareable across
+        // devices).
         // Mark the shader modifications as needed for translation.
         shader_translations_needed.emplace(
             pipeline_stored_description.description.vertex_shader_hash,
@@ -391,14 +395,14 @@ void PipelineCache::InitializeShaderStorage(
     // Threads overlapping file reading.
     std::mutex shaders_translation_thread_mutex;
     std::condition_variable shaders_translation_thread_cond;
-    std::deque<std::pair<ShaderStoredHeader, D3D12Shader::D3D12Translation*>>
-        shaders_to_translate;
+    std::deque<D3D12Shader*> shaders_to_translate;
     size_t shader_translation_threads_busy = 0;
     bool shader_translation_threads_shutdown = false;
     std::mutex shaders_failed_to_translate_mutex;
     std::vector<D3D12Shader::D3D12Translation*> shaders_failed_to_translate;
     auto shader_translation_thread_function = [&]() {
       auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
+      StringBuffer ucode_disasm_buffer;
       DxbcShaderTranslator translator(
           provider.GetAdapterVendorID(), bindless_resources_used_,
           edram_rov_used_, provider.GetGraphicsAnalysis() != nullptr);
@@ -416,8 +420,7 @@ void PipelineCache::InitializeShaderStorage(
                                    IID_PPV_ARGS(&dxc_compiler));
       }
       for (;;) {
-        std::pair<ShaderStoredHeader, D3D12Shader::D3D12Translation*>
-            shader_to_translate;
+        D3D12Shader* shader_to_translate;
         for (;;) {
           std::unique_lock<std::mutex> lock(shaders_translation_thread_mutex);
           if (shaders_to_translate.empty()) {
@@ -432,12 +435,29 @@ void PipelineCache::InitializeShaderStorage(
           ++shader_translation_threads_busy;
           break;
         }
-        assert_not_null(shader_to_translate.second);
-        if (!TranslateShader(translator, *shader_to_translate.second,
-                             shader_to_translate.first.sq_program_cntl,
-                             dxbc_converter, dxc_utils, dxc_compiler)) {
-          std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex);
-          shaders_failed_to_translate.push_back(shader_to_translate.second);
+        shader_to_translate->AnalyzeUcode(ucode_disasm_buffer);
+        // Translate each needed modification on this thread after performing
+        // modification-independent analysis of the whole shader.
+        uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash();
+        for (auto modification_it = shader_translations_needed.lower_bound(
+                 std::make_pair(ucode_data_hash, uint64_t(0)));
+             modification_it != shader_translations_needed.end() &&
+             modification_it->first == ucode_data_hash;
+             ++modification_it) {
+          D3D12Shader::D3D12Translation* translation =
+              static_cast<D3D12Shader::D3D12Translation*>(
+                  shader_to_translate->GetOrCreateTranslation(
+                      modification_it->second));
+          // Only try (and delete in case of failure) if it's a new translation.
+          // If it's a shader previously encountered in the game, translation of
+          // which has failed, and the shader storage is loaded later, keep it
+          // this way not to try to translate it again.
+          if (!translation->is_translated() &&
+              !TranslateAnalyzedShader(translator, *translation, dxbc_converter,
+                                       dxc_utils, dxc_compiler)) {
+            std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex);
+            shaders_failed_to_translate.push_back(translation);
+          }
         }
         {
           std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
@@ -477,59 +497,41 @@ void PipelineCache::InitializeShaderStorage(
         break;
       }
       shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count;
-      // Only add the shader if needed.
-      auto modification_it = shader_translations_needed.lower_bound(
-          std::make_pair(ucode_data_hash, uint32_t(0)));
-      if (modification_it == shader_translations_needed.end() ||
-          modification_it->first != ucode_data_hash) {
-        continue;
-      }
       D3D12Shader* shader =
           LoadShader(shader_header.type, ucode_dwords.data(),
                      shader_header.ucode_dword_count, ucode_data_hash);
+      if (shader->ucode_storage_index() == shader_storage_index_) {
+        // Appeared twice in this file for some reason - skip, otherwise race
+        // condition will be caused by translating twice in parallel.
+        continue;
+      }
       // Loaded from the current storage - don't write again.
       shader->set_ucode_storage_index(shader_storage_index_);
-      // Translate all the needed modifications.
-      for (; modification_it != shader_translations_needed.end() &&
-             modification_it->first == ucode_data_hash;
-           ++modification_it) {
-        bool translation_is_new;
-        D3D12Shader::D3D12Translation* translation =
-            static_cast<D3D12Shader::D3D12Translation*>(
-                shader->GetOrCreateTranslation(modification_it->second,
-                                               &translation_is_new));
-        if (!translation_is_new) {
-          // Already added - usually shaders aren't added without the intention
-          // of translating them imminently, so don't do additional checks to
-          // actually ensure that translation happens right now (they would
-          // cause a race condition with shaders currently queued for
-          // translation).
-          continue;
-        }
-        // Create new threads if the currently existing threads can't keep up
-        // with file reading, but not more than the number of logical processors
-        // minus one.
-        size_t shader_translation_threads_needed;
-        {
-          std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
-          shader_translation_threads_needed =
-              std::min(shader_translation_threads_busy +
-                           shaders_to_translate.size() + size_t(1),
-                       logical_processor_count - size_t(1));
-        }
-        while (shader_translation_threads.size() <
-               shader_translation_threads_needed) {
-          shader_translation_threads.push_back(xe::threading::Thread::Create(
-              {}, shader_translation_thread_function));
-          shader_translation_threads.back()->set_name("Shader Translation");
-        }
-        {
-          std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
-          shaders_to_translate.emplace_back(shader_header, translation);
-        }
-        shaders_translation_thread_cond.notify_one();
-        ++shaders_translated;
+      // Create new threads if the currently existing threads can't keep up
+      // with file reading, but not more than the number of logical processors
+      // minus one.
+      size_t shader_translation_threads_needed;
+      {
+        std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
+        shader_translation_threads_needed =
+            std::min(shader_translation_threads_busy +
+                         shaders_to_translate.size() + size_t(1),
+                     logical_processor_count - size_t(1));
       }
+      while (shader_translation_threads.size() <
+             shader_translation_threads_needed) {
+        shader_translation_threads.push_back(xe::threading::Thread::Create(
+            {}, shader_translation_thread_function));
+        shader_translation_threads.back()->set_name("Shader Translation");
+      }
+      // Request ucode information gathering and translation of all the needed
+      // shaders.
+      {
+        std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
+        shaders_to_translate.push_back(shader);
+      }
+      shaders_translation_thread_cond.notify_one();
+      ++shaders_translated;
     }
     if (!shader_translation_threads.empty()) {
       {
@@ -593,6 +595,8 @@ void PipelineCache::InitializeShaderStorage(
          pipeline_stored_descriptions) {
       const PipelineDescription& pipeline_description =
           pipeline_stored_description.description;
+      // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported device
+      // features (to keep the cache files mostly shareable across devices).
       // Skip already known pipelines - those have already been enqueued.
       auto found_range =
           pipelines_.equal_range(pipeline_stored_description.description_hash);
@@ -621,6 +625,7 @@ void PipelineCache::InitializeShaderStorage(
               vertex_shader->GetTranslation(
                   pipeline_description.vertex_shader_modification));
       if (!pipeline_runtime_description.vertex_shader ||
+          !pipeline_runtime_description.vertex_shader->is_translated() ||
           !pipeline_runtime_description.vertex_shader->is_valid()) {
         continue;
       }
@@ -637,6 +642,7 @@ void PipelineCache::InitializeShaderStorage(
                 pixel_shader->GetTranslation(
                     pipeline_description.pixel_shader_modification));
         if (!pipeline_runtime_description.pixel_shader ||
+            !pipeline_runtime_description.pixel_shader->is_translated() ||
             !pipeline_runtime_description.pixel_shader->is_valid()) {
           continue;
         }
@@ -730,9 +736,6 @@ void PipelineCache::InitializeShaderStorage(
     pipeline_storage_file_header.magic_api = pipeline_storage_magic_api;
     pipeline_storage_file_header.version_swapped =
         pipeline_storage_version_swapped;
-    // Reserved for future (for Vulkan) - host device features affecting legal
-    // pipeline descriptions.
-    pipeline_storage_file_header.device_features = 0;
     fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
            1, pipeline_storage_file_);
   }
@@ -854,52 +857,68 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
   return shader;
 }
 
-bool PipelineCache::GetCurrentShaderModifications(
+bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications(
+    D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
     DxbcShaderTranslator::Modification& vertex_shader_modification_out,
-    DxbcShaderTranslator::Modification& pixel_shader_modification_out) const {
+    DxbcShaderTranslator::Modification& pixel_shader_modification_out) {
   Shader::HostVertexShaderType host_vertex_shader_type =
       GetCurrentHostVertexShaderTypeIfValid();
   if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) {
     return false;
   }
+  const auto& regs = register_file_;
+  auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
+
+  vertex_shader->AnalyzeUcode(ucode_disasm_buffer_);
   vertex_shader_modification_out = DxbcShaderTranslator::Modification(
-      shader_translator_->GetDefaultModification(xenos::ShaderType::kVertex,
-                                                 host_vertex_shader_type));
-  DxbcShaderTranslator::Modification pixel_shader_modification(
-      shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel));
-  if (!edram_rov_used_) {
-    const auto& regs = register_file_;
-    using DepthStencilMode =
-        DxbcShaderTranslator::Modification::DepthStencilMode;
-    if ((depth_float24_conversion_ ==
-             flags::DepthFloat24Conversion::kOnOutputTruncating ||
-         depth_float24_conversion_ ==
-             flags::DepthFloat24Conversion::kOnOutputRounding) &&
-        regs.Get<reg::RB_DEPTHCONTROL>().z_enable &&
-        regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
-            xenos::DepthRenderTargetFormat::kD24FS8) {
-      pixel_shader_modification.depth_stencil_mode =
-          depth_float24_conversion_ ==
-                  flags::DepthFloat24Conversion::kOnOutputTruncating
-              ? DepthStencilMode::kFloat24Truncating
-              : DepthStencilMode::kFloat24Rounding;
-    } else {
-      // Hint to enable early depth/stencil writing if possible - whether it
-      // will actually take effect depends on the shader itself, it's not known
-      // before translation.
-      auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>();
-      if ((!rb_colorcontrol.alpha_test_enable ||
-           rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) &&
-          !rb_colorcontrol.alpha_to_mask_enable) {
+      shader_translator_->GetDefaultModification(
+          xenos::ShaderType::kVertex,
+          vertex_shader->GetDynamicAddressableRegisterCount(
+              sq_program_cntl.vs_num_reg),
+          host_vertex_shader_type));
+
+  if (pixel_shader) {
+    pixel_shader->AnalyzeUcode(ucode_disasm_buffer_);
+    DxbcShaderTranslator::Modification pixel_shader_modification(
+        shader_translator_->GetDefaultModification(
+            xenos::ShaderType::kPixel,
+            pixel_shader->GetDynamicAddressableRegisterCount(
+                sq_program_cntl.ps_num_reg)));
+    if (!edram_rov_used_) {
+      using DepthStencilMode =
+          DxbcShaderTranslator::Modification::DepthStencilMode;
+      if ((depth_float24_conversion_ ==
+               flags::DepthFloat24Conversion::kOnOutputTruncating ||
+           depth_float24_conversion_ ==
+               flags::DepthFloat24Conversion::kOnOutputRounding) &&
+          regs.Get<reg::RB_DEPTHCONTROL>().z_enable &&
+          regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
+              xenos::DepthRenderTargetFormat::kD24FS8) {
         pixel_shader_modification.depth_stencil_mode =
-            DepthStencilMode::kEarlyHint;
+            depth_float24_conversion_ ==
+                    flags::DepthFloat24Conversion::kOnOutputTruncating
+                ? DepthStencilMode::kFloat24Truncating
+                : DepthStencilMode::kFloat24Rounding;
       } else {
-        pixel_shader_modification.depth_stencil_mode =
-            DepthStencilMode::kNoModifiers;
+        auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>();
+        if (pixel_shader->implicit_early_z_write_allowed() &&
+            (!rb_colorcontrol.alpha_test_enable ||
+             rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) &&
+            !rb_colorcontrol.alpha_to_mask_enable) {
+          pixel_shader_modification.depth_stencil_mode =
+              DepthStencilMode::kEarlyHint;
+        } else {
+          pixel_shader_modification.depth_stencil_mode =
+              DepthStencilMode::kNoModifiers;
+        }
       }
     }
+    pixel_shader_modification_out = pixel_shader_modification;
+  } else {
+    pixel_shader_modification_out = DxbcShaderTranslator::Modification(
+        shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel,
+                                                   0));
   }
-  pixel_shader_modification_out = pixel_shader_modification;
   return true;
 }
 
@@ -979,62 +998,6 @@ PipelineCache::GetCurrentHostVertexShaderTypeIfValid() const {
   return Shader::HostVertexShaderType(-1);
 }
 
-bool PipelineCache::EnsureShadersTranslated(
-    D3D12Shader::D3D12Translation* vertex_shader,
-    D3D12Shader::D3D12Translation* pixel_shader) {
-  const auto& regs = register_file_;
-  auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
-
-  // Edge flags are not supported yet (because polygon primitives are not).
-  assert_true(sq_program_cntl.vs_export_mode !=
-                  xenos::VertexShaderExportMode::kPosition2VectorsEdge &&
-              sq_program_cntl.vs_export_mode !=
-                  xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill);
-  assert_false(sq_program_cntl.gen_index_vtx);
-
-  if (!vertex_shader->is_translated()) {
-    if (!TranslateShader(*shader_translator_, *vertex_shader, sq_program_cntl,
-                         dxbc_converter_, dxc_utils_, dxc_compiler_)) {
-      XELOGE("Failed to translate the vertex shader!");
-      return false;
-    }
-    if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() !=
-                                    shader_storage_index_) {
-      vertex_shader->shader().set_ucode_storage_index(shader_storage_index_);
-      assert_not_null(storage_write_thread_);
-      shader_storage_file_flush_needed_ = true;
-      {
-        std::lock_guard<std::mutex> lock(storage_write_request_lock_);
-        storage_write_shader_queue_.push_back(
-            std::make_pair(&vertex_shader->shader(), sq_program_cntl));
-      }
-      storage_write_request_cond_.notify_all();
-    }
-  }
-
-  if (pixel_shader != nullptr && !pixel_shader->is_translated()) {
-    if (!TranslateShader(*shader_translator_, *pixel_shader, sq_program_cntl,
-                         dxbc_converter_, dxc_utils_, dxc_compiler_)) {
-      XELOGE("Failed to translate the pixel shader!");
-      return false;
-    }
-    if (shader_storage_file_ &&
-        pixel_shader->shader().ucode_storage_index() != shader_storage_index_) {
-      pixel_shader->shader().set_ucode_storage_index(shader_storage_index_);
-      assert_not_null(storage_write_thread_);
-      shader_storage_file_flush_needed_ = true;
-      {
-        std::lock_guard<std::mutex> lock(storage_write_request_lock_);
-        storage_write_shader_queue_.push_back(
-            std::make_pair(&pixel_shader->shader(), sq_program_cntl));
-      }
-      storage_write_request_cond_.notify_all();
-    }
-  }
-
-  return true;
-}
-
 bool PipelineCache::ConfigurePipeline(
     D3D12Shader::D3D12Translation* vertex_shader,
     D3D12Shader::D3D12Translation* pixel_shader,
@@ -1078,8 +1041,50 @@ bool PipelineCache::ConfigurePipeline(
     }
   }
 
-  if (!EnsureShadersTranslated(vertex_shader, pixel_shader)) {
-    return false;
+  // Ensure shaders are translated.
+  // Edge flags are not supported yet (because polygon primitives are not).
+  assert_true(register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
+                  xenos::VertexShaderExportMode::kPosition2VectorsEdge &&
+              register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
+                  xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill);
+  assert_false(register_file_.Get<reg::SQ_PROGRAM_CNTL>().gen_index_vtx);
+  if (!vertex_shader->is_translated()) {
+    vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
+    if (!TranslateAnalyzedShader(*shader_translator_, *vertex_shader,
+                                 dxbc_converter_, dxc_utils_, dxc_compiler_)) {
+      XELOGE("Failed to translate the vertex shader!");
+      return false;
+    }
+    if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() !=
+                                    shader_storage_index_) {
+      vertex_shader->shader().set_ucode_storage_index(shader_storage_index_);
+      assert_not_null(storage_write_thread_);
+      shader_storage_file_flush_needed_ = true;
+      {
+        std::lock_guard<std::mutex> lock(storage_write_request_lock_);
+        storage_write_shader_queue_.push_back(&vertex_shader->shader());
+      }
+      storage_write_request_cond_.notify_all();
+    }
+  }
+  if (pixel_shader != nullptr && !pixel_shader->is_translated()) {
+    pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
+    if (!TranslateAnalyzedShader(*shader_translator_, *pixel_shader,
+                                 dxbc_converter_, dxc_utils_, dxc_compiler_)) {
+      XELOGE("Failed to translate the pixel shader!");
+      return false;
+    }
+    if (shader_storage_file_ &&
+        pixel_shader->shader().ucode_storage_index() != shader_storage_index_) {
+      pixel_shader->shader().set_ucode_storage_index(shader_storage_index_);
+      assert_not_null(storage_write_thread_);
+      shader_storage_file_flush_needed_ = true;
+      {
+        std::lock_guard<std::mutex> lock(storage_write_request_lock_);
+        storage_write_shader_queue_.push_back(&pixel_shader->shader());
+      }
+      storage_write_request_cond_.notify_all();
+    }
   }
 
   Pipeline* new_pipeline = new Pipeline;
@@ -1121,17 +1126,15 @@ bool PipelineCache::ConfigurePipeline(
   return true;
 }
 
-bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator,
-                                    D3D12Shader::D3D12Translation& translation,
-                                    reg::SQ_PROGRAM_CNTL cntl,
-                                    IDxbcConverter* dxbc_converter,
-                                    IDxcUtils* dxc_utils,
-                                    IDxcCompiler* dxc_compiler) {
+bool PipelineCache::TranslateAnalyzedShader(
+    DxbcShaderTranslator& translator,
+    D3D12Shader::D3D12Translation& translation, IDxbcConverter* dxbc_converter,
+    IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler) {
   D3D12Shader& shader = static_cast<D3D12Shader&>(translation.shader());
 
   // Perform translation.
   // If this fails the shader will be marked as invalid and ignored later.
-  if (!translator.Translate(translation, cntl)) {
+  if (!translator.TranslateAnalyzedShader(translation)) {
     XELOGE("Shader {:016X} translation failed; marking as ignored",
            shader.ucode_data_hash());
     return false;
@@ -1171,21 +1174,21 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator,
 
   // Set up texture and sampler binding layouts.
   if (shader.EnterBindingLayoutUserUIDSetup()) {
-    uint32_t texture_binding_count;
-    const D3D12Shader::TextureBinding* texture_bindings =
-        shader.GetTextureBindings(texture_binding_count);
-    uint32_t sampler_binding_count;
-    const D3D12Shader::SamplerBinding* sampler_bindings =
-        shader.GetSamplerBindings(sampler_binding_count);
+    const std::vector<D3D12Shader::TextureBinding>& texture_bindings =
+        shader.GetTextureBindingsAfterTranslation();
+    uint32_t texture_binding_count = uint32_t(texture_bindings.size());
+    const std::vector<D3D12Shader::SamplerBinding>& sampler_bindings =
+        shader.GetSamplerBindingsAfterTranslation();
+    uint32_t sampler_binding_count = uint32_t(sampler_bindings.size());
     assert_false(bindless_resources_used_ &&
                  texture_binding_count + sampler_binding_count >
                      D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4);
     size_t texture_binding_layout_bytes =
-        texture_binding_count * sizeof(*texture_bindings);
+        texture_binding_count * sizeof(*texture_bindings.data());
     uint64_t texture_binding_layout_hash = 0;
     if (texture_binding_count) {
       texture_binding_layout_hash =
-          XXH3_64bits(texture_bindings, texture_binding_layout_bytes);
+          XXH3_64bits(texture_bindings.data(), texture_binding_layout_bytes);
     }
     uint32_t bindless_sampler_count =
         bindless_resources_used_ ? sampler_binding_count : 0;
@@ -1223,7 +1226,8 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator,
           if (it->second.vector_span_length == texture_binding_count &&
               !std::memcmp(texture_binding_layouts_.data() +
                                it->second.vector_span_offset,
-                           texture_bindings, texture_binding_layout_bytes)) {
+                           texture_bindings.data(),
+                           texture_binding_layout_bytes)) {
             texture_binding_layout_uid = it->second.uid;
             break;
           }
@@ -1242,7 +1246,7 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator,
                                           texture_binding_count);
           std::memcpy(
               texture_binding_layouts_.data() + new_uid.vector_span_offset,
-              texture_bindings, texture_binding_layout_bytes);
+              texture_bindings.data(), texture_binding_layout_bytes);
           texture_binding_layout_map_.emplace(texture_binding_layout_hash,
                                               new_uid);
         }
@@ -1576,8 +1580,10 @@ bool PipelineCache::GetCurrentStateDescription(
 
     // Render targets and blending state. 32 because of 0x1F mask, for safety
     // (all unknown to zero).
-    uint32_t color_mask = command_processor_.GetCurrentColorMask(
-        pixel_shader ? &pixel_shader->shader() : nullptr);
+    uint32_t color_mask =
+        pixel_shader ? command_processor_.GetCurrentColorMask(
+                           pixel_shader->shader().writes_color_targets())
+                     : 0;
     static const PipelineBlendFactor kBlendFactorMap[32] = {
         /*  0 */ PipelineBlendFactor::kZero,
         /*  1 */ PipelineBlendFactor::kOne,
@@ -2038,7 +2044,7 @@ void PipelineCache::StorageWriteThread() {
       fflush(pipeline_storage_file_);
     }
 
-    std::pair<const Shader*, reg::SQ_PROGRAM_CNTL> shader_pair = {};
+    const Shader* shader = nullptr;
     PipelineStoredDescription pipeline_description;
     bool write_pipeline = false;
     {
@@ -2047,7 +2053,7 @@ void PipelineCache::StorageWriteThread() {
         return;
       }
       if (!storage_write_shader_queue_.empty()) {
-        shader_pair = storage_write_shader_queue_.front();
+        shader = storage_write_shader_queue_.front();
         storage_write_shader_queue_.pop_front();
       } else if (storage_write_flush_shaders_) {
         storage_write_flush_shaders_ = false;
@@ -2063,18 +2069,16 @@ void PipelineCache::StorageWriteThread() {
         storage_write_flush_pipelines_ = false;
         flush_pipelines = true;
       }
-      if (!shader_pair.first && !write_pipeline) {
+      if (!shader && !write_pipeline) {
         storage_write_request_cond_.wait(lock);
         continue;
       }
     }
 
-    const Shader* shader = shader_pair.first;
     if (shader) {
       shader_header.ucode_data_hash = shader->ucode_data_hash();
       shader_header.ucode_dword_count = shader->ucode_dword_count();
       shader_header.type = shader->type();
-      shader_header.sq_program_cntl = shader_pair.second;
       assert_not_null(shader_storage_file_);
       fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_);
       if (shader_header.ucode_dword_count) {
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h
index fe867c82a..9a733e40a 100644
--- a/src/xenia/gpu/d3d12/pipeline_cache.h
+++ b/src/xenia/gpu/d3d12/pipeline_cache.h
@@ -23,6 +23,7 @@
 
 #include "xenia/base/hash.h"
 #include "xenia/base/platform.h"
+#include "xenia/base/string_buffer.h"
 #include "xenia/base/threading.h"
 #include "xenia/gpu/d3d12/d3d12_shader.h"
 #include "xenia/gpu/d3d12/render_target_cache.h"
@@ -63,15 +64,12 @@ class PipelineCache {
   D3D12Shader* LoadShader(xenos::ShaderType shader_type,
                           const uint32_t* host_address, uint32_t dword_count);
 
-  // Retrieves the shader modifications for the current state, and returns
-  // whether they are valid.
-  bool GetCurrentShaderModifications(
+  // Ensures microcode is analyzed, retrieves the shader modifications for the
+  // current state, and returns whether they are valid.
+  bool AnalyzeShaderUcodeAndGetCurrentModifications(
+      D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
       DxbcShaderTranslator::Modification& vertex_shader_modification_out,
-      DxbcShaderTranslator::Modification& pixel_shader_modification_out) const;
-
-  // Translates shaders if needed, also making shader info up to date.
-  bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader,
-                               D3D12Shader::D3D12Translation* pixel_shader);
+      DxbcShaderTranslator::Modification& pixel_shader_modification_out);
 
   bool ConfigurePipeline(
       D3D12Shader::D3D12Translation* vertex_shader,
@@ -93,9 +91,7 @@ class PipelineCache {
     uint32_t ucode_dword_count : 31;
     xenos::ShaderType type : 1;
 
-    reg::SQ_PROGRAM_CNTL sq_program_cntl;
-
-    static constexpr uint32_t kVersion = 0x20201207;
+    static constexpr uint32_t kVersion = 0x20201219;
   });
 
   // Update PipelineDescription::kVersion if any of the Pipeline* enums are
@@ -171,10 +167,10 @@ class PipelineCache {
 
   XEPACKEDSTRUCT(PipelineDescription, {
     uint64_t vertex_shader_hash;
+    uint64_t vertex_shader_modification;
     // 0 if drawing without a pixel shader.
     uint64_t pixel_shader_hash;
-    uint32_t vertex_shader_modification;
-    uint32_t pixel_shader_modification;
+    uint64_t pixel_shader_modification;
 
     int32_t depth_bias;
     float depth_bias_slope_scaled;
@@ -208,7 +204,7 @@ class PipelineCache {
 
     PipelineRenderTarget render_targets[4];
 
-    static constexpr uint32_t kVersion = 0x20201207;
+    static constexpr uint32_t kVersion = 0x20201219;
   });
 
   XEPACKEDSTRUCT(PipelineStoredDescription, {
@@ -232,12 +228,11 @@ class PipelineCache {
                           uint64_t data_hash);
 
   // Can be called from multiple threads.
-  bool TranslateShader(DxbcShaderTranslator& translator,
-                       D3D12Shader::D3D12Translation& translation,
-                       reg::SQ_PROGRAM_CNTL cntl,
-                       IDxbcConverter* dxbc_converter = nullptr,
-                       IDxcUtils* dxc_utils = nullptr,
-                       IDxcCompiler* dxc_compiler = nullptr);
+  bool TranslateAnalyzedShader(DxbcShaderTranslator& translator,
+                               D3D12Shader::D3D12Translation& translation,
+                               IDxbcConverter* dxbc_converter = nullptr,
+                               IDxcUtils* dxc_utils = nullptr,
+                               IDxcCompiler* dxc_compiler = nullptr);
 
   bool GetCurrentStateDescription(
       D3D12Shader::D3D12Translation* vertex_shader,
@@ -257,7 +252,9 @@ class PipelineCache {
   flags::DepthFloat24Conversion depth_float24_conversion_;
   uint32_t resolution_scale_;
 
-  // Reusable shader translator.
+  // Temporary storage for AnalyzeUcode calls on the processor thread.
+  StringBuffer ucode_disasm_buffer_;
+  // Reusable shader translator for the processor thread.
   std::unique_ptr<DxbcShaderTranslator> shader_translator_;
 
   // Command processor thread DXIL conversion/disassembly interfaces, if DXIL
@@ -332,8 +329,7 @@ class PipelineCache {
   std::condition_variable storage_write_request_cond_;
   // Storage thread input is protected with storage_write_request_lock_, and the
   // thread is notified about its change via storage_write_request_cond_.
-  std::deque<std::pair<const Shader*, reg::SQ_PROGRAM_CNTL>>
-      storage_write_shader_queue_;
+  std::deque<const Shader*> storage_write_shader_queue_;
   std::deque<PipelineStoredDescription> storage_write_pipeline_queue_;
   bool storage_write_flush_shaders_ = false;
   bool storage_write_flush_pipelines_ = false;
diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc
index 8669d58a3..f5a4e0c6b 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@@ -535,7 +535,8 @@ void RenderTargetCache::EndFrame() {
   FlushAndUnbindRenderTargets();
 }
 
-bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) {
+bool RenderTargetCache::UpdateRenderTargets(
+    uint32_t shader_writes_color_targets) {
   // There are two kinds of render target binding updates in this implementation
   // in case something has been changed - full and partial.
   //
@@ -635,7 +636,8 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) {
   uint32_t edram_bases[5];
   uint32_t formats[5];
   bool formats_are_64bpp[5];
-  uint32_t color_mask = command_processor_.GetCurrentColorMask(pixel_shader);
+  uint32_t color_mask =
+      command_processor_.GetCurrentColorMask(shader_writes_color_targets);
   for (uint32_t i = 0; i < 4; ++i) {
     enabled[i] = (color_mask & (0xF << (i * 4))) != 0;
     auto color_info = regs.Get<reg::RB_COLOR_INFO>(
diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h
index 2f71c13c8..3bb0af399 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@@ -269,7 +269,7 @@ class RenderTargetCache {
   void EndFrame();
   // Called in the beginning of a draw call - may bind pipelines and change the
   // view descriptor heap.
-  bool UpdateRenderTargets(const D3D12Shader* pixel_shader);
+  bool UpdateRenderTargets(uint32_t shader_writes_color_targets);
   // Returns the host-to-guest mappings and host formats of currently bound
   // render targets for pipeline creation and remapping in shaders. They are
   // consecutive, and format DXGI_FORMAT_UNKNOWN terminates the list. Depth
diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc
index e1f9bdcc4..99909f83f 100644
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
@@ -1334,8 +1334,8 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
 bool TextureCache::AreActiveTextureSRVKeysUpToDate(
     const TextureSRVKey* keys,
     const D3D12Shader::TextureBinding* host_shader_bindings,
-    uint32_t host_shader_binding_count) const {
-  for (uint32_t i = 0; i < host_shader_binding_count; ++i) {
+    size_t host_shader_binding_count) const {
+  for (size_t i = 0; i < host_shader_binding_count; ++i) {
     const TextureSRVKey& key = keys[i];
     const TextureBinding& binding =
         texture_bindings_[host_shader_bindings[i].fetch_constant];
@@ -1350,8 +1350,8 @@ bool TextureCache::AreActiveTextureSRVKeysUpToDate(
 void TextureCache::WriteActiveTextureSRVKeys(
     TextureSRVKey* keys,
     const D3D12Shader::TextureBinding* host_shader_bindings,
-    uint32_t host_shader_binding_count) const {
-  for (uint32_t i = 0; i < host_shader_binding_count; ++i) {
+    size_t host_shader_binding_count) const {
+  for (size_t i = 0; i < host_shader_binding_count; ++i) {
     TextureSRVKey& key = keys[i];
     const TextureBinding& binding =
         texture_bindings_[host_shader_bindings[i].fetch_constant];
diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h
index 85131f25d..465824755 100644
--- a/src/xenia/gpu/d3d12/texture_cache.h
+++ b/src/xenia/gpu/d3d12/texture_cache.h
@@ -196,14 +196,14 @@ class TextureCache {
   bool AreActiveTextureSRVKeysUpToDate(
       const TextureSRVKey* keys,
       const D3D12Shader::TextureBinding* host_shader_bindings,
-      uint32_t host_shader_binding_count) const;
+      size_t host_shader_binding_count) const;
   // Exports the current binding data to texture SRV keys so they can be stored
   // for checking whether subsequent draw calls can keep using the same
   // bindings. Write host_shader_binding_count keys.
   void WriteActiveTextureSRVKeys(
       TextureSRVKey* keys,
       const D3D12Shader::TextureBinding* host_shader_bindings,
-      uint32_t host_shader_binding_count) const;
+      size_t host_shader_binding_count) const;
   // Returns the post-swizzle signedness of a currently bound texture (must be
   // called after RequestTextures).
   uint8_t GetActiveTextureSwizzledSigns(uint32_t index) const {
diff --git a/src/xenia/gpu/dxbc_shader.cc b/src/xenia/gpu/dxbc_shader.cc
index 144308d57..9b0243fca 100644
--- a/src/xenia/gpu/dxbc_shader.cc
+++ b/src/xenia/gpu/dxbc_shader.cc
@@ -19,7 +19,7 @@ DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
     : Shader(shader_type, data_hash, dword_ptr, dword_count) {}
 
 Shader::Translation* DxbcShader::CreateTranslationInstance(
-    uint32_t modification) {
+    uint64_t modification) {
   return new DxbcTranslation(*this, modification);
 }
 
diff --git a/src/xenia/gpu/dxbc_shader.h b/src/xenia/gpu/dxbc_shader.h
index 49439a2a6..477dfdc5d 100644
--- a/src/xenia/gpu/dxbc_shader.h
+++ b/src/xenia/gpu/dxbc_shader.h
@@ -10,6 +10,7 @@
 #ifndef XENIA_GPU_DXBC_SHADER_H_
 #define XENIA_GPU_DXBC_SHADER_H_
 
+#include <atomic>
 #include <vector>
 
 #include "xenia/gpu/dxbc_shader_translator.h"
@@ -23,13 +24,17 @@ class DxbcShader : public Shader {
  public:
   class DxbcTranslation : public Translation {
    public:
-    DxbcTranslation(DxbcShader& shader, uint32_t modification)
+    DxbcTranslation(DxbcShader& shader, uint64_t modification)
         : Translation(shader, modification) {}
   };
 
   DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
              const uint32_t* dword_ptr, uint32_t dword_count);
 
+  // Resource bindings are gathered after the successful translation of any
+  // modification, for simplicity of the translator (they don't depend on the
+  // modification bits, so every modification yields the same bindings).
+
   static constexpr uint32_t kMaxTextureBindingIndexBits =
       DxbcShaderTranslator::kMaxTextureBindingIndexBits;
   static constexpr uint32_t kMaxTextureBindings =
@@ -43,11 +48,13 @@ class DxbcShader : public Shader {
     bool is_signed;
   };
   // Safe to hash and compare with memcmp for layout hashing.
-  const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
-    count_out = uint32_t(texture_bindings_.size());
-    return texture_bindings_.data();
+  const std::vector<TextureBinding>& GetTextureBindingsAfterTranslation()
+      const {
+    return texture_bindings_;
+  }
+  const uint32_t GetUsedTextureMaskAfterTranslation() const {
+    return used_texture_mask_;
   }
-  const uint32_t GetUsedTextureMask() const { return used_texture_mask_; }
 
   static constexpr uint32_t kMaxSamplerBindingIndexBits =
       DxbcShaderTranslator::kMaxSamplerBindingIndexBits;
@@ -61,17 +68,18 @@ class DxbcShader : public Shader {
     xenos::TextureFilter mip_filter;
     xenos::AnisoFilter aniso_filter;
   };
-  const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
-    count_out = uint32_t(sampler_bindings_.size());
-    return sampler_bindings_.data();
+  const std::vector<SamplerBinding>& GetSamplerBindingsAfterTranslation()
+      const {
+    return sampler_bindings_;
   }
 
  protected:
-  Translation* CreateTranslationInstance(uint32_t modification) override;
+  Translation* CreateTranslationInstance(uint64_t modification) override;
 
  private:
   friend class DxbcShaderTranslator;
 
+  std::atomic_flag bindings_setup_entered_ = ATOMIC_FLAG_INIT;
   std::vector<TextureBinding> texture_bindings_;
   std::vector<SamplerBinding> sampler_bindings_;
   uint32_t used_texture_mask_ = 0;
diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index 865fbd77e..534355ce3 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -10,6 +10,7 @@
 #include "xenia/gpu/dxbc_shader_translator.h"
 
 #include <algorithm>
+#include <atomic>
 #include <cstring>
 #include <memory>
 
@@ -78,16 +79,23 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
 DxbcShaderTranslator::~DxbcShaderTranslator() = default;
 
 std::vector<uint8_t> DxbcShaderTranslator::CreateDepthOnlyPixelShader() {
-  Reset(xenos::ShaderType::kPixel);
   is_depth_only_pixel_shader_ = true;
-  StartTranslation();
-  return std::move(CompleteTranslation());
+  // TODO(Triang3l): Handle in a nicer way (is_depth_only_pixel_shader_ is a
+  // leftover from when a Shader object wasn't used during translation).
+  Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0);
+  shader.AnalyzeUcode(instruction_disassembly_buffer_);
+  Shader::Translation& translation = *shader.GetOrCreateTranslation(0);
+  TranslateAnalyzedShader(translation);
+  is_depth_only_pixel_shader_ = false;
+  return translation.translated_binary();
 }
 
-uint32_t DxbcShaderTranslator::GetDefaultModification(
-    xenos::ShaderType shader_type,
+uint64_t DxbcShaderTranslator::GetDefaultModification(
+    xenos::ShaderType shader_type, uint32_t dynamic_addressable_register_count,
     Shader::HostVertexShaderType host_vertex_shader_type) const {
   Modification shader_modification;
+  shader_modification.dynamic_addressable_register_count =
+      dynamic_addressable_register_count;
   switch (shader_type) {
     case xenos::ShaderType::kVertex:
       shader_modification.host_vertex_shader_type = host_vertex_shader_type;
@@ -100,13 +108,11 @@ uint32_t DxbcShaderTranslator::GetDefaultModification(
   return shader_modification.value;
 }
 
-void DxbcShaderTranslator::Reset(xenos::ShaderType shader_type) {
-  ShaderTranslator::Reset(shader_type);
+void DxbcShaderTranslator::Reset() {
+  ShaderTranslator::Reset();
 
   shader_code_.clear();
 
-  is_depth_only_pixel_shader_ = false;
-
   cbuffer_count_ = 0;
   // System constants always used in prologues/epilogues.
   cbuffer_index_system_constants_ = cbuffer_count_++;
@@ -231,6 +237,10 @@ void DxbcShaderTranslator::DxbcSrc::Write(std::vector<uint32_t>& code,
   }
 }
 
+uint32_t DxbcShaderTranslator::GetModificationRegisterCount() const {
+  return GetDxbcShaderModification().dynamic_addressable_register_count;
+}
+
 bool DxbcShaderTranslator::UseSwitchForControlFlow() const {
   // Xenia crashes on Intel HD Graphics 4000 with switch.
   return cvars::dxbc_switch && vendor_id_ != 0x8086;
@@ -239,7 +249,8 @@ bool DxbcShaderTranslator::UseSwitchForControlFlow() const {
 uint32_t DxbcShaderTranslator::PushSystemTemp(uint32_t zero_mask,
                                               uint32_t count) {
   uint32_t register_index = system_temp_count_current_;
-  if (!uses_register_dynamic_addressing() && !is_depth_only_pixel_shader_) {
+  if (!is_depth_only_pixel_shader_ &&
+      !current_shader().uses_register_dynamic_addressing()) {
     // Guest shader registers first if they're not in x0. Depth-only pixel
     // shader is a special case of the DXBC translator usage, where there are no
     // GPRs because there's no shader to translate, and a guest shader is not
@@ -327,10 +338,13 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() {
     return;
   }
 
+  bool uses_register_dynamic_addressing =
+      current_shader().uses_register_dynamic_addressing();
+
   // Writing the index to X of GPR 0 - either directly if not using indexable
   // registers, or via a system temporary register.
   uint32_t reg;
-  if (uses_register_dynamic_addressing()) {
+  if (uses_register_dynamic_addressing) {
     reg = PushSystemTemp();
   } else {
     reg = 0;
@@ -392,7 +406,7 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() {
     DxbcOpBreak();
     DxbcOpEndSwitch();
 
-    if (!uses_register_dynamic_addressing()) {
+    if (!uses_register_dynamic_addressing) {
       // Break register dependency.
       DxbcOpMov(swap_temp_dest, DxbcSrc::LF(0.0f));
     }
@@ -409,7 +423,7 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() {
   // Convert to float.
   DxbcOpIToF(index_dest, index_src);
 
-  if (uses_register_dynamic_addressing()) {
+  if (uses_register_dynamic_addressing) {
     // Store to indexed GPR 0 in x0[0].
     DxbcOpMov(DxbcDest::X(0, 0, 0b0001), index_src);
     PopSystemTemp();
@@ -417,6 +431,9 @@ void DxbcShaderTranslator::StartVertexShader_LoadVertexIndex() {
 }
 
 void DxbcShaderTranslator::StartVertexOrDomainShader() {
+  bool uses_register_dynamic_addressing =
+      current_shader().uses_register_dynamic_addressing();
+
   // Zero the interpolators.
   for (uint32_t i = 0; i < xenos::kMaxInterpolators; ++i) {
     DxbcOpMov(DxbcDest::O(uint32_t(InOutRegister::kVSDSOutInterpolators) + i),
@@ -438,13 +455,13 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
         // Copy the domain location to r0.xyz.
         // ZYX swizzle according to Call of Duty 3 and Viva Pinata.
         in_domain_location_used_ |= 0b0111;
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0111)
-                                                     : DxbcDest::R(0, 0b0111),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0111)
+                                                   : DxbcDest::R(0, 0b0111),
                   DxbcSrc::VDomain(0b000110));
         if (register_count() >= 2) {
           // Copy the control point indices (already swapped and converted to
           // float by the host vertex and hull shaders) to r1.xyz.
-          DxbcDest control_point_index_dest(uses_register_dynamic_addressing()
+          DxbcDest control_point_index_dest(uses_register_dynamic_addressing
                                                 ? DxbcDest::X(0, 1)
                                                 : DxbcDest::R(1));
           in_control_point_index_used_ = true;
@@ -465,16 +482,16 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
         // ZYX swizzle with r1.y == 0, according to the water shader in
         // Banjo-Kazooie: Nuts & Bolts.
         in_domain_location_used_ |= 0b0111;
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0111)
-                                                     : DxbcDest::R(0, 0b0111),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0111)
+                                                   : DxbcDest::R(0, 0b0111),
                   DxbcSrc::VDomain(0b000110));
         if (register_count() >= 2) {
           // Copy the primitive index to r1.x as a float.
           uint32_t primitive_id_temp =
-              uses_register_dynamic_addressing() ? PushSystemTemp() : 1;
+              uses_register_dynamic_addressing ? PushSystemTemp() : 1;
           in_primitive_id_used_ = true;
           DxbcOpUToF(DxbcDest::R(primitive_id_temp, 0b0001), DxbcSrc::VPrim());
-          if (uses_register_dynamic_addressing()) {
+          if (uses_register_dynamic_addressing) {
             DxbcOpMov(DxbcDest::X(0, 1, 0b0001),
                       DxbcSrc::R(primitive_id_temp, DxbcSrc::kXXXX));
             // Release primitive_id_temp.
@@ -499,9 +516,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
           //
           // Direct3D 12 passes the coordinates in a consistent order, so can
           // just use the identity swizzle.
-          DxbcOpMov(uses_register_dynamic_addressing()
-                        ? DxbcDest::X(0, 1, 0b0010)
-                        : DxbcDest::R(1, 0b0010),
+          DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 1, 0b0010)
+                                                     : DxbcDest::R(1, 0b0010),
                     DxbcSrc::LF(0.0f));
         }
       }
@@ -512,8 +528,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
       if (register_count() >= 1) {
         // Copy the domain location to r0.xy.
         in_domain_location_used_ |= 0b0011;
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0011)
-                                                     : DxbcDest::R(0, 0b0011),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0011)
+                                                   : DxbcDest::R(0, 0b0011),
                   DxbcSrc::VDomain());
         // Control point indices according to the shader from the main menu of
         // Defender, which starts from `cndeq r2, c255.xxxy, r1.xyzz, r0.zzzz`,
@@ -524,14 +540,13 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
         // r1.z for (1 - r0.x) * r0.y
         in_control_point_index_used_ = true;
         DxbcOpMov(
-            uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0100)
-                                               : DxbcDest::R(0, 0b0100),
+            uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0100)
+                                             : DxbcDest::R(0, 0b0100),
             DxbcSrc::VICP(0, uint32_t(InOutRegister::kDSInControlPointIndex),
                           DxbcSrc::kXXXX));
         if (register_count() >= 2) {
-          DxbcDest r1_dest(uses_register_dynamic_addressing()
-                               ? DxbcDest::X(0, 1)
-                               : DxbcDest::R(1));
+          DxbcDest r1_dest(uses_register_dynamic_addressing ? DxbcDest::X(0, 1)
+                                                            : DxbcDest::R(1));
           for (uint32_t i = 0; i < 3; ++i) {
             DxbcOpMov(
                 r1_dest.Mask(1 << i),
@@ -549,15 +564,15 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
         // Copy the domain location to r0.yz.
         // XY swizzle according to the ground shader in Viva Pinata.
         in_domain_location_used_ |= 0b0011;
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, 0, 0b0110)
-                                                     : DxbcDest::R(0, 0b0110),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 0, 0b0110)
+                                                   : DxbcDest::R(0, 0b0110),
                   DxbcSrc::VDomain(0b010000));
         // Copy the primitive index to r0.x as a float.
         uint32_t primitive_id_temp =
-            uses_register_dynamic_addressing() ? PushSystemTemp() : 0;
+            uses_register_dynamic_addressing ? PushSystemTemp() : 0;
         in_primitive_id_used_ = true;
         DxbcOpUToF(DxbcDest::R(primitive_id_temp, 0b0001), DxbcSrc::VPrim());
-        if (uses_register_dynamic_addressing()) {
+        if (uses_register_dynamic_addressing) {
           DxbcOpMov(DxbcDest::X(0, 0, 0b0001),
                     DxbcSrc::R(primitive_id_temp, DxbcSrc::kXXXX));
           // Release primitive_id_temp.
@@ -578,9 +593,8 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
           //
           // Direct3D 12 passes the coordinates in a consistent order, so can
           // just use the identity swizzle.
-          DxbcOpMov(uses_register_dynamic_addressing()
-                        ? DxbcDest::X(0, 1, 0b0001)
-                        : DxbcDest::R(1, 0b0001),
+          DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, 1, 0b0001)
+                                                     : DxbcDest::R(1, 0b0001),
                     DxbcSrc::LF(0.0f));
         }
       }
@@ -611,7 +625,10 @@ void DxbcShaderTranslator::StartPixelShader() {
     return;
   }
 
-  if (!edram_rov_used_ && writes_depth()) {
+  bool uses_register_dynamic_addressing =
+      current_shader().uses_register_dynamic_addressing();
+
+  if (!edram_rov_used_ && current_shader().writes_depth()) {
     // Initialize the depth output if used, which must be written to regardless
     // of the taken execution path.
     DxbcOpMov(DxbcDest::ODepth(), DxbcSrc::LF(0.0f));
@@ -623,7 +640,7 @@ void DxbcShaderTranslator::StartPixelShader() {
     // Copy interpolants to GPRs.
     if (edram_rov_used_) {
       uint32_t centroid_temp =
-          uses_register_dynamic_addressing() ? PushSystemTemp() : UINT32_MAX;
+          uses_register_dynamic_addressing ? PushSystemTemp() : UINT32_MAX;
       system_constants_used_ |= 1ull
                                 << kSysConst_InterpolatorSamplingPattern_Index;
       DxbcSrc sampling_pattern_src(
@@ -635,7 +652,7 @@ void DxbcShaderTranslator::StartPixelShader() {
         // With GPR dynamic addressing, first evaluate to centroid_temp r#, then
         // store to the x#.
         uint32_t centroid_register =
-            uses_register_dynamic_addressing() ? centroid_temp : i;
+            uses_register_dynamic_addressing ? centroid_temp : i;
         // Check if the input needs to be interpolated at center (if the bit is
         // set).
         DxbcOpAnd(DxbcDest::R(centroid_register, 0b0001), sampling_pattern_src,
@@ -643,8 +660,8 @@ void DxbcShaderTranslator::StartPixelShader() {
         DxbcOpIf(bool(xenos::SampleLocation::kCenter),
                  DxbcSrc::R(centroid_register, DxbcSrc::kXXXX));
         // At center.
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i)
-                                                     : DxbcDest::R(i),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, i)
+                                                   : DxbcDest::R(i),
                   DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i));
         DxbcOpElse();
         // At centroid. Not really important that 2x MSAA is emulated using
@@ -653,7 +670,7 @@ void DxbcShaderTranslator::StartPixelShader() {
         DxbcOpEvalCentroid(
             DxbcDest::R(centroid_register),
             DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i));
-        if (uses_register_dynamic_addressing()) {
+        if (uses_register_dynamic_addressing) {
           DxbcOpMov(DxbcDest::X(0, i), DxbcSrc::R(centroid_register));
         }
         DxbcOpEndIf();
@@ -665,8 +682,8 @@ void DxbcShaderTranslator::StartPixelShader() {
       // SSAA instead of MSAA without ROV - everything is interpolated at
       // samples, can't extrapolate.
       for (uint32_t i = 0; i < interpolator_count; ++i) {
-        DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i)
-                                                     : DxbcDest::R(i),
+        DxbcOpMov(uses_register_dynamic_addressing ? DxbcDest::X(0, i)
+                                                   : DxbcDest::R(i),
                   DxbcSrc::V(uint32_t(InOutRegister::kPSInInterpolators) + i));
       }
     }
@@ -781,7 +798,7 @@ void DxbcShaderTranslator::StartPixelShader() {
       }
       // Write ps_param_gen to the specified GPR.
       DxbcSrc param_gen_src(DxbcSrc::R(param_gen_temp));
-      if (uses_register_dynamic_addressing()) {
+      if (uses_register_dynamic_addressing) {
         // Copy the GPR number to r# for relative addressing.
         uint32_t param_gen_copy_temp = PushSystemTemp();
         DxbcOpMov(DxbcDest::R(param_gen_copy_temp, 0b0001),
@@ -863,10 +880,12 @@ void DxbcShaderTranslator::StartTranslation() {
       // by the guest code, so initialize because assumptions can't be made
       // about the integrity of the guest code.
       system_temp_depth_stencil_ =
-          PushSystemTemp(writes_depth() ? 0b0001 : 0b1111);
+          PushSystemTemp(current_shader().writes_depth() ? 0b0001 : 0b1111);
     }
+    uint32_t shader_writes_color_targets =
+        current_shader().writes_color_targets();
     for (uint32_t i = 0; i < 4; ++i) {
-      if (writes_color_target(i)) {
+      if (shader_writes_color_targets & (1 << i)) {
         system_temps_color_[i] = PushSystemTemp(0b1111);
       }
     }
@@ -879,8 +898,8 @@ void DxbcShaderTranslator::StartTranslation() {
     std::memset(system_temps_memexport_data_, 0xFF,
                 sizeof(system_temps_memexport_data_));
     system_temp_memexport_written_ = UINT32_MAX;
-    const uint8_t* memexports_written = memexport_eM_written();
-    for (uint32_t i = 0; i < kMaxMemExports; ++i) {
+    const uint8_t* memexports_written = current_shader().memexport_eM_written();
+    for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
       uint32_t memexport_alloc_written = memexports_written[i];
       if (memexport_alloc_written == 0) {
         continue;
@@ -915,8 +934,9 @@ void DxbcShaderTranslator::StartTranslation() {
     // references them after only initializing them conditionally.
     for (uint32_t i = is_pixel_shader() ? xenos::kMaxInterpolators : 0;
          i < register_count(); ++i) {
-      DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i)
-                                                   : DxbcDest::R(i),
+      DxbcOpMov(current_shader().uses_register_dynamic_addressing()
+                    ? DxbcDest::X(0, i)
+                    : DxbcDest::R(i),
                 DxbcSrc::LF(0.0f));
     }
   }
@@ -1120,7 +1140,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
     ExportToMemory();
 
     // Release memexport temporary registers.
-    for (int i = kMaxMemExports - 1; i >= 0; --i) {
+    for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) {
       if (system_temps_memexport_address_[i] == UINT32_MAX) {
         continue;
       }
@@ -1154,8 +1174,10 @@ void DxbcShaderTranslator::CompleteShaderCode() {
     PopSystemTemp(2);
   } else if (is_pixel_shader()) {
     // Release system_temps_color_.
+    uint32_t shader_writes_color_targets =
+        current_shader().writes_color_targets();
     for (int32_t i = 3; i >= 0; --i) {
-      if (writes_color_target(i)) {
+      if (shader_writes_color_targets & (1 << i)) {
         PopSystemTemp();
       }
     }
@@ -1274,40 +1296,42 @@ std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() {
   return shader_object_bytes;
 }
 
-void DxbcShaderTranslator::PostTranslation(
-    Shader::Translation& translation, bool setup_shader_post_translation_info) {
-  if (setup_shader_post_translation_info) {
-    DxbcShader* dxbc_shader = dynamic_cast<DxbcShader*>(&translation.shader());
-    if (dxbc_shader) {
-      dxbc_shader->texture_bindings_.clear();
-      dxbc_shader->texture_bindings_.reserve(texture_bindings_.size());
-      dxbc_shader->used_texture_mask_ = 0;
-      for (const TextureBinding& translator_binding : texture_bindings_) {
-        DxbcShader::TextureBinding& shader_binding =
-            dxbc_shader->texture_bindings_.emplace_back();
-        // For a stable hash.
-        std::memset(&shader_binding, 0, sizeof(shader_binding));
-        shader_binding.bindless_descriptor_index =
-            translator_binding.bindless_descriptor_index;
-        shader_binding.fetch_constant = translator_binding.fetch_constant;
-        shader_binding.dimension = translator_binding.dimension;
-        shader_binding.is_signed = translator_binding.is_signed;
-        dxbc_shader->used_texture_mask_ |= 1u
-                                           << translator_binding.fetch_constant;
-      }
-      dxbc_shader->sampler_bindings_.clear();
-      dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size());
-      for (const SamplerBinding& translator_binding : sampler_bindings_) {
-        DxbcShader::SamplerBinding& shader_binding =
-            dxbc_shader->sampler_bindings_.emplace_back();
-        shader_binding.bindless_descriptor_index =
-            translator_binding.bindless_descriptor_index;
-        shader_binding.fetch_constant = translator_binding.fetch_constant;
-        shader_binding.mag_filter = translator_binding.mag_filter;
-        shader_binding.min_filter = translator_binding.min_filter;
-        shader_binding.mip_filter = translator_binding.mip_filter;
-        shader_binding.aniso_filter = translator_binding.aniso_filter;
-      }
+void DxbcShaderTranslator::PostTranslation() {
+  Shader::Translation& translation = current_translation();
+  if (!translation.is_valid()) {
+    return;
+  }
+  DxbcShader* dxbc_shader = dynamic_cast<DxbcShader*>(&translation.shader());
+  if (dxbc_shader && !dxbc_shader->bindings_setup_entered_.test_and_set(
+                         std::memory_order_relaxed)) {
+    dxbc_shader->texture_bindings_.clear();
+    dxbc_shader->texture_bindings_.reserve(texture_bindings_.size());
+    dxbc_shader->used_texture_mask_ = 0;
+    for (const TextureBinding& translator_binding : texture_bindings_) {
+      DxbcShader::TextureBinding& shader_binding =
+          dxbc_shader->texture_bindings_.emplace_back();
+      // For a stable hash.
+      std::memset(&shader_binding, 0, sizeof(shader_binding));
+      shader_binding.bindless_descriptor_index =
+          translator_binding.bindless_descriptor_index;
+      shader_binding.fetch_constant = translator_binding.fetch_constant;
+      shader_binding.dimension = translator_binding.dimension;
+      shader_binding.is_signed = translator_binding.is_signed;
+      dxbc_shader->used_texture_mask_ |= 1u
+                                         << translator_binding.fetch_constant;
+    }
+    dxbc_shader->sampler_bindings_.clear();
+    dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size());
+    for (const SamplerBinding& translator_binding : sampler_bindings_) {
+      DxbcShader::SamplerBinding& shader_binding =
+          dxbc_shader->sampler_bindings_.emplace_back();
+      shader_binding.bindless_descriptor_index =
+          translator_binding.bindless_descriptor_index;
+      shader_binding.fetch_constant = translator_binding.fetch_constant;
+      shader_binding.mag_filter = translator_binding.mag_filter;
+      shader_binding.min_filter = translator_binding.min_filter;
+      shader_binding.mip_filter = translator_binding.mip_filter;
+      shader_binding.aniso_filter = translator_binding.aniso_filter;
     }
   }
 }
@@ -1373,7 +1397,7 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand(
   DxbcSrc src(DxbcSrc::LF(0.0f));
   switch (operand.storage_source) {
     case InstructionStorageSource::kRegister: {
-      if (uses_register_dynamic_addressing()) {
+      if (current_shader().uses_register_dynamic_addressing()) {
         // Load x#[#] to r# because x#[#] can be used only with mov.
         uint32_t temp = PushSystemTemp();
         temp_pushed_out = true;
@@ -1402,10 +1426,12 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand(
       if (cbuffer_index_float_constants_ == kBindingIndexUnallocated) {
         cbuffer_index_float_constants_ = cbuffer_count_++;
       }
+      const Shader::ConstantRegisterMap& constant_register_map =
+          current_shader().constant_register_map();
       if (operand.storage_addressing_mode ==
           InstructionStorageAddressingMode::kStatic) {
         uint32_t float_constant_index =
-            constant_register_map().GetPackedFloatConstantIndex(
+            constant_register_map.GetPackedFloatConstantIndex(
                 operand.storage_index);
         assert_true(float_constant_index != UINT32_MAX);
         if (float_constant_index == UINT32_MAX) {
@@ -1413,7 +1439,7 @@ DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand(
         }
         index.index_ = float_constant_index;
       } else {
-        assert_true(constant_register_map().float_dynamic_addressing);
+        assert_true(constant_register_map.float_dynamic_addressing);
       }
       src = DxbcSrc::CB(cbuffer_index_float_constants_,
                         uint32_t(CbufferRegister::kFloatConstants), index);
@@ -1453,7 +1479,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     case InstructionStorageTarget::kNone:
       return;
     case InstructionStorageTarget::kRegister:
-      if (uses_register_dynamic_addressing()) {
+      if (current_shader().uses_register_dynamic_addressing()) {
         DxbcIndex register_index(result.storage_index);
         switch (result.storage_addressing_mode) {
           case InstructionStorageAddressingMode::kStatic:
@@ -1488,7 +1514,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     case InstructionStorageTarget::kExportAddress:
       // Validate memexport writes (Halo 3 has some weird invalid ones).
       if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
-          memexport_alloc_current_count_ > kMaxMemExports ||
+          memexport_alloc_current_count_ > Shader::kMaxMemExports ||
           system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
               UINT32_MAX) {
         return;
@@ -1499,7 +1525,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     case InstructionStorageTarget::kExportData: {
       // Validate memexport writes (Halo 3 has some weird invalid ones).
       if (memexport_alloc_current_count_ == 0 ||
-          memexport_alloc_current_count_ > kMaxMemExports ||
+          memexport_alloc_current_count_ > Shader::kMaxMemExports ||
           system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
                                       [result.storage_index] == UINT32_MAX) {
         return;
@@ -1519,7 +1545,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     } break;
     case InstructionStorageTarget::kColor:
       assert_not_zero(used_write_mask);
-      assert_true(writes_color_target(result.storage_index));
+      assert_true(current_shader().writes_color_target(result.storage_index));
       dest = DxbcDest::R(system_temps_color_[result.storage_index]);
       if (edram_rov_used_) {
         // For ROV output, mark that the color has been written to.
@@ -1539,7 +1565,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
       // Writes X to scalar oDepth or to X of system_temp_depth_stencil_, no
       // additional swizzling needed.
       assert_true(used_write_mask == 0b0001);
-      assert_true(writes_depth());
+      assert_true(current_shader().writes_depth());
       if (IsDepthStencilSystemTempUsed()) {
         dest = DxbcDest::R(system_temp_depth_stencil_);
       } else {
@@ -2077,6 +2103,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
   uint32_t chunk_position_dwords = uint32_t(shader_object_.size());
   uint32_t new_offset;
 
+  const Shader::ConstantRegisterMap& constant_register_map =
+      current_shader().constant_register_map();
+
   // ***************************************************************************
   // Header
   // ***************************************************************************
@@ -2162,7 +2191,7 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
         // Declaring a 0-sized array may not be safe, so write something valid
         // even if they aren't used.
         shader_object_.push_back(
-            std::max(constant_register_map().float_count, uint32_t(1)));
+            std::max(constant_register_map.float_count, uint32_t(1)));
         break;
       case RdefTypeIndex::kUint4DescriptorIndexArray:
         shader_object_.push_back(std::max(
@@ -2278,10 +2307,10 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
   // Float constants.
   uint32_t constant_offset_float = new_offset;
   if (cbuffer_index_float_constants_ != kBindingIndexUnallocated) {
-    assert_not_zero(constant_register_map().float_count);
+    assert_not_zero(constant_register_map.float_count);
     shader_object_.push_back(constant_name_offset_float);
     shader_object_.push_back(0);
-    shader_object_.push_back(constant_register_map().float_count * 4 *
+    shader_object_.push_back(constant_register_map.float_count * 4 *
                              sizeof(float));
     shader_object_.push_back(kDxbcRdefVariableFlagUsed);
     shader_object_.push_back(types_offset +
@@ -2405,11 +2434,11 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
       // No D3D_SHADER_CBUFFER_FLAGS.
       shader_object_.push_back(0);
     } else if (i == cbuffer_index_float_constants_) {
-      assert_not_zero(constant_register_map().float_count);
+      assert_not_zero(constant_register_map.float_count);
       shader_object_.push_back(cbuffer_name_offset_float);
       shader_object_.push_back(1);
       shader_object_.push_back(constant_offset_float);
-      shader_object_.push_back(constant_register_map().float_count * 4 *
+      shader_object_.push_back(constant_register_map.float_count * 4 *
                                sizeof(float));
       shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer));
       shader_object_.push_back(0);
@@ -3211,7 +3240,7 @@ void DxbcShaderTranslator::WriteOutputSignature() {
     if (!edram_rov_used_) {
       // Color render targets (SV_Target#).
       size_t target_position = SIZE_MAX;
-      if (writes_any_color_target()) {
+      if (current_shader().writes_color_targets()) {
         target_position = shader_object_.size();
         shader_object_.resize(shader_object_.size() + 4 * kParameterDwords);
         parameter_count += 4;
@@ -3233,7 +3262,7 @@ void DxbcShaderTranslator::WriteOutputSignature() {
       Modification::DepthStencilMode depth_stencil_mode =
           GetDxbcShaderModification().depth_stencil_mode;
       size_t depth_position = SIZE_MAX;
-      if (writes_depth() || DSV_IsWritingFloat24Depth()) {
+      if (current_shader().writes_depth() || DSV_IsWritingFloat24Depth()) {
         depth_position = shader_object_.size();
         shader_object_.resize(shader_object_.size() + kParameterDwords);
         ++parameter_count;
@@ -3268,7 +3297,7 @@ void DxbcShaderTranslator::WriteOutputSignature() {
           depth.semantic_name = semantic_offset;
         }
         const char* depth_semantic_name;
-        if (!writes_depth() &&
+        if (!current_shader().writes_depth() &&
             GetDxbcShaderModification().depth_stencil_mode ==
                 Modification::DepthStencilMode::kFloat24Truncating) {
           depth_semantic_name = "SV_DepthLessEqual";
@@ -3361,7 +3390,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
   if (is_pixel_shader() &&
       GetDxbcShaderModification().depth_stencil_mode ==
           Modification::DepthStencilMode::kEarlyHint &&
-      !edram_rov_used_ && CanWriteZEarly()) {
+      !edram_rov_used_ && current_shader().implicit_early_z_write_allowed()) {
     global_flags_opcode |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL;
   }
   shader_object_.push_back(global_flags_opcode);
@@ -3369,11 +3398,13 @@ void DxbcShaderTranslator::WriteShaderCode() {
   // Constant buffers, from most frequenly accessed to least frequently accessed
   // (the order is a hint to the driver according to the DXBC header).
   if (cbuffer_index_float_constants_ != kBindingIndexUnallocated) {
-    assert_not_zero(constant_register_map().float_count);
+    const Shader::ConstantRegisterMap& constant_register_map =
+        current_shader().constant_register_map();
+    assert_not_zero(constant_register_map.float_count);
     shader_object_.push_back(
         ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) |
         ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN(
-            constant_register_map().float_dynamic_addressing
+            constant_register_map.float_dynamic_addressing
                 ? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED
                 : D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) |
         ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
@@ -3382,7 +3413,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
     shader_object_.push_back(cbuffer_index_float_constants_);
     shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
     shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
-    shader_object_.push_back(constant_register_map().float_count);
+    shader_object_.push_back(constant_register_map.float_count);
     shader_object_.push_back(0);
   }
   if (cbuffer_index_system_constants_ != kBindingIndexUnallocated) {
@@ -3715,6 +3746,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
     ++stat_.dcl_count;
   } else if (is_pixel_shader()) {
     bool is_writing_float24_depth = DSV_IsWritingFloat24Depth();
+    bool shader_writes_depth = current_shader().writes_depth();
     // Interpolator input.
     if (!is_depth_only_pixel_shader_) {
       uint32_t interpolator_count =
@@ -3766,7 +3798,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
       // applicable here) position is mandatory. However, with depth output, on
       // the guest, there's only one depth value for the whole pixel.
       D3D10_SB_INTERPOLATION_MODE position_interpolation_mode =
-          is_writing_float24_depth && !writes_depth()
+          is_writing_float24_depth && !shader_writes_depth
               ? D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE_SAMPLE
               : D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE;
       shader_object_.push_back(
@@ -3806,7 +3838,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
           EncodeScalarOperand(D3D11_SB_OPERAND_TYPE_INPUT_COVERAGE_MASK, 0));
       ++stat_.dcl_count;
     } else {
-      if (writes_any_color_target()) {
+      if (current_shader().writes_color_targets()) {
         // Color output.
         for (uint32_t i = 0; i < 4; ++i) {
           shader_object_.push_back(
@@ -3819,9 +3851,9 @@ void DxbcShaderTranslator::WriteShaderCode() {
         }
       }
       // Depth output.
-      if (is_writing_float24_depth || writes_depth()) {
+      if (is_writing_float24_depth || shader_writes_depth) {
         D3D10_SB_OPERAND_TYPE depth_operand_type;
-        if (!writes_depth() &&
+        if (!shader_writes_depth &&
             GetDxbcShaderModification().depth_stencil_mode ==
                 Modification::DepthStencilMode::kFloat24Truncating) {
           depth_operand_type = D3D11_SB_OPERAND_TYPE_OUTPUT_DEPTH_LESS_EQUAL;
@@ -3840,7 +3872,8 @@ void DxbcShaderTranslator::WriteShaderCode() {
   // Temporary registers - guest general-purpose registers if not using dynamic
   // indexing and Xenia internal registers.
   stat_.temp_register_count = system_temp_count_max_;
-  if (!is_depth_only_pixel_shader_ && !uses_register_dynamic_addressing()) {
+  if (!is_depth_only_pixel_shader_ &&
+      !current_shader().uses_register_dynamic_addressing()) {
     stat_.temp_register_count += register_count();
   }
   if (stat_.temp_register_count != 0) {
@@ -3851,7 +3884,8 @@ void DxbcShaderTranslator::WriteShaderCode() {
   }
 
   // General-purpose registers if using dynamic indexing (x0).
-  if (!is_depth_only_pixel_shader_ && uses_register_dynamic_addressing()) {
+  if (!is_depth_only_pixel_shader_ &&
+      current_shader().uses_register_dynamic_addressing()) {
     assert_true(register_count() != 0);
     shader_object_.push_back(
         ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INDEXABLE_TEMP) |
diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index 1e9891771..808b311fa 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -106,13 +106,12 @@ class DxbcShaderTranslator : public ShaderTranslator {
     // If anything in this is structure is changed in a way not compatible with
     // the previous layout, invalidate the pipeline storages by increasing this
     // version number (0xYYYYMMDD)!
-    static constexpr uint32_t kVersion = 0x20201203;
+    static constexpr uint32_t kVersion = 0x20201219;
 
     enum class DepthStencilMode : uint32_t {
       kNoModifiers,
       // [earlydepthstencil] - enable if alpha test and alpha to coverage are
-      // disabled; ignored if anything in the shader blocks early Z writing
-      // (which is not known before translation, so this will be set anyway).
+      // disabled; ignored if anything in the shader blocks early Z writing.
       kEarlyHint,
       // Converting the depth to the closest 32-bit float representable exactly
       // as a 20e4 float, to support invariance in cases when the guest
@@ -136,15 +135,17 @@ class DxbcShaderTranslator : public ShaderTranslator {
     };
 
     struct {
+      // Both - dynamically indexable register count from SQ_PROGRAM_CNTL.
+      uint32_t dynamic_addressable_register_count : 8;
       // VS - pipeline stage and input configuration.
       Shader::HostVertexShaderType host_vertex_shader_type
           : Shader::kHostVertexShaderTypeBitCount;
       // PS, non-ROV - depth / stencil output mode.
       DepthStencilMode depth_stencil_mode : 2;
     };
-    uint32_t value = 0;
+    uint64_t value = 0;
 
-    Modification(uint32_t modification_value = 0) : value(modification_value) {}
+    Modification(uint64_t modification_value = 0) : value(modification_value) {}
   };
 
   // Constant buffer bindings in space 0.
@@ -467,8 +468,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
       float& clamp_alpha_high, uint32_t& keep_mask_low,
       uint32_t& keep_mask_high);
 
-  uint32_t GetDefaultModification(
+  uint64_t GetDefaultModification(
       xenos::ShaderType shader_type,
+      uint32_t dynamic_addressable_register_count,
       Shader::HostVertexShaderType host_vertex_shader_type =
           Shader::HostVertexShaderType::kVertex) const override;
 
@@ -477,12 +479,13 @@ class DxbcShaderTranslator : public ShaderTranslator {
   std::vector<uint8_t> CreateDepthOnlyPixelShader();
 
  protected:
-  void Reset(xenos::ShaderType shader_type) override;
+  void Reset() override;
+
+  uint32_t GetModificationRegisterCount() const override;
 
   void StartTranslation() override;
   std::vector<uint8_t> CompleteTranslation() override;
-  void PostTranslation(Shader::Translation& translation,
-                       bool setup_shader_post_translation_info) override;
+  void PostTranslation() override;
 
   void ProcessLabel(uint32_t cf_index) override;
 
@@ -2184,7 +2187,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
   }
 
   Modification GetDxbcShaderModification() const {
-    return Modification(modification());
+    return Modification(current_translation().modification());
   }
 
   bool IsDxbcVertexShader() const {
@@ -2227,9 +2230,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
   bool IsDepthStencilSystemTempUsed() const {
     // See system_temp_depth_stencil_ documentation for explanation of cases.
     if (edram_rov_used_) {
-      return writes_depth() || ROV_IsDepthStencilEarly();
+      return current_shader().writes_depth() || ROV_IsDepthStencilEarly();
     }
-    return writes_depth() && DSV_IsWritingFloat24Depth();
+    return current_shader().writes_depth() && DSV_IsWritingFloat24Depth();
   }
   // Whether the current non-ROV pixel shader should convert the depth to 20e4.
   bool DSV_IsWritingFloat24Depth() const {
@@ -2246,8 +2249,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
   // Whether it's possible and worth skipping running the translated shader for
   // 2x2 quads.
   bool ROV_IsDepthStencilEarly() const {
-    return !is_depth_only_pixel_shader_ && !writes_depth() &&
-           memexport_stream_constants().empty();
+    return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
+           current_shader().memexport_stream_constants().empty();
   }
   // Converts the depth value to 24-bit (storing the result in bits 0:23 and
   // zeros in 24:31, not creating room for stencil - since this may be involved
@@ -2467,7 +2470,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
 
   // Is currently writing the empty depth-only pixel shader, for
   // CompleteTranslation.
-  bool is_depth_only_pixel_shader_;
+  bool is_depth_only_pixel_shader_ = false;
 
   // Data types used in constants buffers. Listed in dependency order.
   enum class RdefTypeIndex {
@@ -2604,9 +2607,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
   // 4 `alloc export`s per component.
   uint32_t system_temp_memexport_written_;
   // eA in each `alloc export`, or UINT32_MAX if not used.
-  uint32_t system_temps_memexport_address_[kMaxMemExports];
+  uint32_t system_temps_memexport_address_[Shader::kMaxMemExports];
   // eM# in each `alloc export`, or UINT32_MAX if not used.
-  uint32_t system_temps_memexport_data_[kMaxMemExports][5];
+  uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5];
 
   // Vector ALU or fetch result/scratch (since Xenos write masks can contain
   // swizzles).
diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc
index 5f3d47bc0..76bec3e60 100644
--- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc
@@ -136,7 +136,7 @@ void DxbcShaderTranslator::ExportToMemory() {
   DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
   // control_temp.x is now free.
 
-  for (uint32_t i = 0; i < kMaxMemExports; ++i) {
+  for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
     uint32_t eA_temp = system_temps_memexport_address_[i];
     if (eA_temp == UINT32_MAX) {
       // Export not used.
diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc
index ea79b737c..8c01648f1 100644
--- a/src/xenia/gpu/dxbc_shader_translator_om.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_om.cc
@@ -144,7 +144,7 @@ void DxbcShaderTranslator::ROV_GetColorFormatSystemConstants(
 }
 
 void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
-  bool color_targets_written = writes_any_color_target();
+  bool any_color_targets_written = current_shader().writes_color_targets() != 0;
 
   // ***************************************************************************
   // Get EDRAM offsets for the pixel:
@@ -272,7 +272,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
   DxbcOpIAdd(DxbcDest::R(system_temp_rov_params_, 0b0001),
              DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kZZZZ),
              DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX));
-  if (color_targets_written) {
+  if (any_color_targets_written) {
     // Write 32bpp color offset to system_temp_rov_params_.z.
     // system_temp_rov_params_.x = X sample 0 position within the depth tile
     // system_temp_rov_params_.y = row offset
@@ -303,8 +303,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
   // Release resolution_scale_log2_temp.
   PopSystemTemp();
   {
-    DxbcDest offsets_dest(DxbcDest::R(system_temp_rov_params_,
-                                      color_targets_written ? 0b0110 : 0b0010));
+    DxbcDest offsets_dest(DxbcDest::R(
+        system_temp_rov_params_, any_color_targets_written ? 0b0110 : 0b0010));
     // Scale the offsets by the resolution scale.
     // system_temp_rov_params_.y = scaled 32bpp depth/stencil first host pixel
     //                             address
@@ -329,7 +329,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
   // Close the resolution scale conditional.
   DxbcOpEndIf();
 
-  if (color_targets_written) {
+  if (any_color_targets_written) {
     // Get the 64bpp color offset to system_temp_rov_params_.w.
     // TODO(Triang3l): Find some game that aliases 64bpp with 32bpp to emulate
     // the real layout.
@@ -388,8 +388,6 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
 }
 
 void DxbcShaderTranslator::ROV_DepthStencilTest() {
-  bool depth_stencil_early = ROV_IsDepthStencilEarly();
-
   uint32_t temp = PushSystemTemp();
   DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001));
   DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX));
@@ -413,6 +411,9 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
   // temp.x = free
   DxbcOpIf(true, temp_x_src);
 
+  bool depth_stencil_early = ROV_IsDepthStencilEarly();
+  bool shader_writes_depth = current_shader().writes_depth();
+
   for (uint32_t i = 0; i < 4; ++i) {
     // With early depth/stencil, depth/stencil writing may be deferred to the
     // end of the shader to prevent writing in case something (like alpha test,
@@ -427,7 +428,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
                             : temp_x_src);
 
     if (!i) {
-      if (writes_depth()) {
+      if (shader_writes_depth) {
         // Clamp oDepth to the lower viewport depth bound (depth clamp happens
         // after the pixel shader in the pipeline, at least on Direct3D 11 and
         // Vulkan, thus applies to the shader's depth output too).
@@ -569,7 +570,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
     // temp.w = free
     DxbcOpIf(true, temp_w_src);
 
-    if (writes_depth()) {
+    if (shader_writes_depth) {
       // Copy the 24-bit depth common to all samples to sample_depth_stencil.
       // temp.x = shader-generated 24-bit depth
       DxbcOpMov(sample_depth_stencil_dest,
@@ -1024,7 +1025,8 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
     // temp.z = viewport maximum depth if not writing to oDepth
     // temp.w = whether depth/stencil has been modified
     DxbcOpINE(temp_w_dest, sample_depth_stencil_src, temp_w_src);
-    if (depth_stencil_early && !CanWriteZEarly()) {
+    if (depth_stencil_early &&
+        !current_shader().implicit_early_z_write_allowed()) {
       // Set the sample bit in bits 4:7 of system_temp_rov_params_.x - always
       // need to write late in this shader, as it may do something like
       // explicitly killing pixels.
@@ -1734,7 +1736,7 @@ void DxbcShaderTranslator::ROV_HandleAlphaBlendFactorCases(
 
 void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() {
   // Check if alpha to coverage can be done at all in this shader.
-  if (!writes_color_target(0)) {
+  if (!current_shader().writes_color_target(0)) {
     return;
   }
 
@@ -1863,21 +1865,22 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() {
 }
 
 void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
-  if (!writes_any_color_target()) {
+  uint32_t shader_writes_color_targets =
+      current_shader().writes_color_targets();
+  if (!shader_writes_color_targets) {
     return;
   }
 
   // Check if this sample needs to be discarded by alpha to coverage.
   CompletePixelShader_WriteToRTVs_AlphaToMask();
 
-  // Get the write mask as components, and also apply the exponent bias after
-  // alpha to coverage because it needs the unbiased alpha from the shader.
-  uint32_t guest_rt_mask = 0;
+  uint32_t gamma_temp = PushSystemTemp();
   for (uint32_t i = 0; i < 4; ++i) {
-    if (!writes_color_target(i)) {
+    if (!(shader_writes_color_targets & (1 << i))) {
       continue;
     }
-    guest_rt_mask |= 1 << i;
+    // Apply the exponent bias after alpha to coverage because it needs the
+    // unbiased alpha from the shader
     system_constants_used_ |= 1ull << kSysConst_ColorExpBias_Index;
     DxbcOpMul(DxbcDest::R(system_temps_color_[i]),
               DxbcSrc::R(system_temps_color_[i]),
@@ -1885,16 +1888,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
                           uint32_t(CbufferRegister::kSystemConstants),
                           kSysConst_ColorExpBias_Vec)
                   .Select(i));
-  }
-
-  // Convert to gamma space - this is incorrect, since it must be done after
-  // blending on the Xbox 360, but this is just one of many blending issues in
-  // the RTV path.
-  uint32_t gamma_temp = PushSystemTemp();
-  for (uint32_t i = 0; i < 4; ++i) {
-    if (!(guest_rt_mask & (1 << i))) {
-      continue;
-    }
+    // Convert to gamma space - this is incorrect, since it must be done after
+    // blending on the Xbox 360, but this is just one of many blending issues in
+    // the RTV path.
     system_constants_used_ |= 1ull << kSysConst_Flags_Index;
     DxbcOpAnd(DxbcDest::R(gamma_temp, 0b0001),
               DxbcSrc::CB(cbuffer_index_system_constants_,
@@ -1923,7 +1919,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
   // Host RT i, guest RT j.
   for (uint32_t i = 0; i < 4; ++i) {
     // mask = map.iiii == (0, 1, 2, 3)
-    DxbcOpIEq(DxbcDest::R(remap_movc_mask_temp, guest_rt_mask),
+    DxbcOpIEq(DxbcDest::R(remap_movc_mask_temp, shader_writes_color_targets),
               DxbcSrc::CB(cbuffer_index_system_constants_,
                           uint32_t(CbufferRegister::kSystemConstants),
                           kSysConst_ColorOutputMap_Vec)
@@ -1932,7 +1928,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
     bool guest_rt_first = true;
     for (uint32_t j = 0; j < 4; ++j) {
       // If map.i == j, move guest color j to the temporary host color.
-      if (!(guest_rt_mask & (1 << j))) {
+      if (!(shader_writes_color_targets & (1 << j))) {
         continue;
       }
       DxbcOpMovC(DxbcDest::R(remap_movc_target_temp),
@@ -1954,8 +1950,10 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
     return;
   }
 
+  bool shader_writes_depth = current_shader().writes_depth();
+
   uint32_t temp;
-  if (writes_depth()) {
+  if (shader_writes_depth) {
     // The depth is already written to system_temp_depth_stencil_.x and clamped
     // to 0...1 with NaNs dropped (saturating in StoreResult); yzw are free.
     temp = system_temp_depth_stencil_;
@@ -1991,8 +1989,8 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
     // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa
     // bits at -34.
     // Anything smaller than 2^-34 becomes 0.
-    DxbcDest truncate_dest(writes_depth() ? DxbcDest::ODepth()
-                                          : DxbcDest::ODepthLE());
+    DxbcDest truncate_dest(shader_writes_depth ? DxbcDest::ODepth()
+                                               : DxbcDest::ODepthLE());
     // Check if the number is representable as a float24 after truncation - the
     // exponent is at least -34.
     DxbcOpUGE(temp_y_dest, temp_x_src, DxbcSrc::LU(0x2E800000));
@@ -2076,7 +2074,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
               temp_y_src);
   }
 
-  if (!writes_depth()) {
+  if (!shader_writes_depth) {
     // Release temp.
     PopSystemTemp();
   }
@@ -2106,7 +2104,7 @@ void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMaskSample(
 
 void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMask() {
   // Check if alpha to coverage can be done at all in this shader.
-  if (!writes_color_target(0)) {
+  if (!current_shader().writes_color_target(0)) {
     return;
   }
 
@@ -2269,8 +2267,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
   }
 
   // Write color values.
+  uint32_t shader_writes_color_targets =
+      current_shader().writes_color_targets();
   for (uint32_t i = 0; i < 4; ++i) {
-    if (!writes_color_target(i)) {
+    if (!(shader_writes_color_targets & (1 << i))) {
       continue;
     }
 
@@ -3156,7 +3156,7 @@ void DxbcShaderTranslator::CompletePixelShader() {
     return;
   }
 
-  if (writes_color_target(0)) {
+  if (current_shader().writes_color_target(0)) {
     // Alpha test.
     // X - mask, then masked result (SGPR for loading, VGPR for masking).
     // Y - operation result (SGPR for mask operations, VGPR for alpha
diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h
index 07986b169..deaecaf39 100644
--- a/src/xenia/gpu/registers.h
+++ b/src/xenia/gpu/registers.h
@@ -97,6 +97,7 @@ union SQ_PROGRAM_CNTL {
     // Note from a2xx.xml:
     // Only 0x3F worth of valid register values for VS_NUM_REG and PS_NUM_REG,
     // but high bit is set to indicate "0 registers used".
+    // (Register count = (num_reg & 0x80) ? 0 : (num_reg + 1))
     uint32_t vs_num_reg : 8;                           // +0
     uint32_t ps_num_reg : 8;                           // +8
     uint32_t vs_resource : 1;                          // +16
diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc
index 6df03fb81..78451035d 100644
--- a/src/xenia/gpu/shader.cc
+++ b/src/xenia/gpu/shader.cc
@@ -55,7 +55,7 @@ std::filesystem::path Shader::Translation::Dump(
   }
   path = path /
          fmt::format(
-             "shader_{:016X}_{:08X}.{}.{}", shader().ucode_data_hash(),
+             "shader_{:016X}_{:016X}.{}.{}", shader().ucode_data_hash(),
              modification(), path_prefix,
              shader().type() == xenos::ShaderType::kVertex ? "vert" : "frag");
   FILE* f = filesystem::OpenFile(path, "wb");
@@ -78,7 +78,7 @@ std::filesystem::path Shader::Translation::Dump(
   return std::move(path);
 }
 
-Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification,
+Shader::Translation* Shader::GetOrCreateTranslation(uint64_t modification,
                                                     bool* is_new) {
   auto it = translations_.find(modification);
   if (it != translations_.end()) {
@@ -95,7 +95,7 @@ Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification,
   return translation;
 }
 
-void Shader::DestroyTranslation(uint32_t modification) {
+void Shader::DestroyTranslation(uint64_t modification) {
   auto it = translations_.find(modification);
   if (it == translations_.end()) {
     return;
@@ -124,7 +124,7 @@ std::filesystem::path Shader::DumpUcodeBinary(
   return std::move(path);
 }
 
-Shader::Translation* Shader::CreateTranslationInstance(uint32_t modification) {
+Shader::Translation* Shader::CreateTranslationInstance(uint64_t modification) {
   // Default implementation for simple cases like ucode disassembly.
   return new Translation(*this, modification);
 }
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index e533ba9b8..9f849ee8b 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -11,9 +11,9 @@
 #define XENIA_GPU_SHADER_H_
 
 #include <algorithm>
-#include <atomic>
 #include <cstdint>
 #include <filesystem>
+#include <set>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -593,6 +593,41 @@ struct ParsedAluInstruction {
   void Disassemble(StringBuffer* out) const;
 };
 
+void ParseControlFlowExec(const ucode::ControlFlowExecInstruction& cf,
+                          uint32_t cf_index, ParsedExecInstruction& instr);
+void ParseControlFlowCondExec(const ucode::ControlFlowCondExecInstruction& cf,
+                              uint32_t cf_index, ParsedExecInstruction& instr);
+void ParseControlFlowCondExecPred(
+    const ucode::ControlFlowCondExecPredInstruction& cf, uint32_t cf_index,
+    ParsedExecInstruction& instr);
+void ParseControlFlowLoopStart(const ucode::ControlFlowLoopStartInstruction& cf,
+                               uint32_t cf_index,
+                               ParsedLoopStartInstruction& instr);
+void ParseControlFlowLoopEnd(const ucode::ControlFlowLoopEndInstruction& cf,
+                             uint32_t cf_index,
+                             ParsedLoopEndInstruction& instr);
+void ParseControlFlowCondCall(const ucode::ControlFlowCondCallInstruction& cf,
+                              uint32_t cf_index, ParsedCallInstruction& instr);
+void ParseControlFlowReturn(const ucode::ControlFlowReturnInstruction& cf,
+                            uint32_t cf_index, ParsedReturnInstruction& instr);
+void ParseControlFlowCondJmp(const ucode::ControlFlowCondJmpInstruction& cf,
+                             uint32_t cf_index, ParsedJumpInstruction& instr);
+void ParseControlFlowAlloc(const ucode::ControlFlowAllocInstruction& cf,
+                           uint32_t cf_index, bool is_vertex_shader,
+                           ParsedAllocInstruction& instr);
+
+// Returns whether the fetch is a full one; if so, the next parsed mini
+// vertex fetch should inherit most of its parameters.
+bool ParseVertexFetchInstruction(
+    const ucode::VertexFetchInstruction& op,
+    const ucode::VertexFetchInstruction& previous_full_op,
+    ParsedVertexFetchInstruction& instr);
+void ParseTextureFetchInstruction(const ucode::TextureFetchInstruction& op,
+                                  ParsedTextureFetchInstruction& instr);
+void ParseAluInstruction(const ucode::AluInstruction& op,
+                         xenos::ShaderType shader_type,
+                         ParsedAluInstruction& instr);
+
 class Shader {
  public:
   // Type of the vertex shader in a D3D11-like rendering pipeline - shader
@@ -619,12 +654,8 @@ class Shader {
 
   struct VertexBinding {
     struct Attribute {
-      // Attribute index, 0-based in the entire shader.
-      int attrib_index;
       // Fetch instruction with all parameters.
       ParsedVertexFetchInstruction fetch_instr;
-      // Size of the attribute, in words.
-      uint32_t size_words;
     };
 
     // Index within the vertex binding listing.
@@ -691,6 +722,10 @@ class Shader {
     }
   };
 
+  // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
+  // .pdb.
+  static constexpr uint32_t kMaxMemExports = 16;
+
   class Translation {
    public:
     virtual ~Translation() {}
@@ -698,7 +733,7 @@ class Shader {
     Shader& shader() const { return shader_; }
 
     // Translator-specific modification bits.
-    uint32_t modification() const { return modification_; }
+    uint64_t modification() const { return modification_; }
 
     // True if the shader was translated and prepared without error.
     bool is_valid() const { return is_valid_; }
@@ -735,7 +770,7 @@ class Shader {
                                const char* path_prefix);
 
    protected:
-    Translation(Shader& shader, uint32_t modification)
+    Translation(Shader& shader, uint64_t modification)
         : shader_(shader), modification_(modification) {}
 
    private:
@@ -743,7 +778,7 @@ class Shader {
     friend class ShaderTranslator;
 
     Shader& shader_;
-    uint32_t modification_;
+    uint64_t modification_;
 
     bool is_valid_ = false;
     bool is_translated_ = false;
@@ -765,32 +800,23 @@ class Shader {
   const uint32_t* ucode_dwords() const { return ucode_data_.data(); }
   size_t ucode_dword_count() const { return ucode_data_.size(); }
 
-  // Host translations with the specified modification bits. Not thread-safe
-  // with respect to translation creation/destruction.
-  const std::unordered_map<uint32_t, Translation*>& translations() const {
-    return translations_;
-  }
-  Translation* GetTranslation(uint32_t modification) const {
-    auto it = translations_.find(modification);
-    if (it != translations_.cend()) {
-      return it->second;
-    }
-    return nullptr;
-  }
-  Translation* GetOrCreateTranslation(uint32_t modification,
-                                      bool* is_new = nullptr);
-  // For shader storage loading, to remove a modification in case of translation
-  // failure. Not thread-safe.
-  void DestroyTranslation(uint32_t modification);
+  bool is_ucode_analyzed() const { return is_ucode_analyzed_; }
+  // ucode_disasm_buffer is temporary storage for disassembly (provided
+  // externally so it won't need to be reallocated for every shader).
+  void AnalyzeUcode(StringBuffer& ucode_disasm_buffer);
+
+  // The following parameters, until the translation, are valid if ucode
+  // information has been gathered.
+
+  // Microcode disassembly in D3D format.
+  const std::string& ucode_disassembly() const { return ucode_disassembly_; }
 
   // All vertex bindings used in the shader.
-  // Valid for vertex shaders only.
   const std::vector<VertexBinding>& vertex_bindings() const {
     return vertex_bindings_;
   }
 
   // All texture bindings used in the shader.
-  // Valid for both vertex and pixel shaders.
   const std::vector<TextureBinding>& texture_bindings() const {
     return texture_bindings_;
   }
@@ -800,24 +826,99 @@ class Shader {
     return constant_register_map_;
   }
 
+  // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have
+  // been written to after each `alloc export`, for up to Shader::kMaxMemExports
+  // exports. This will contain zero for certain corrupt exports - for those to
+  // which a valid eA was not written via a MAD with a stream constant.
+  const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
+
   // All c# registers used as the addend in MAD operations to eA.
-  const std::vector<uint32_t>& memexport_stream_constants() const {
+  const std::set<uint32_t>& memexport_stream_constants() const {
     return memexport_stream_constants_;
   }
 
-  // Returns true if the given color target index [0-3].
-  bool writes_color_target(uint32_t i) const {
-    return writes_color_targets_[i];
+  // Labels that jumps (explicit or from loops) can be done to.
+  const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
+
+  // Exclusive upper bound of the indexes of paired control flow instructions
+  // (each corresponds to 3 dwords).
+  uint32_t cf_pair_index_bound() const { return cf_pair_index_bound_; }
+
+  // Upper bound of temporary registers addressed statically by the shader -
+  // highest static register address + 1, or 0 if no registers referenced this
+  // way. SQ_PROGRAM_CNTL is not always reliable - some draws (like single point
+  // draws with oPos = 0001 that are done by Xbox 360's Direct3D 9 sometimes;
+  // can be reproduced by launching Arrival in Halo 3 from the campaign lobby)
+  // that aren't supposed to cover any pixels use an invalid (zero)
+  // SQ_PROGRAM_CNTL, but with an outdated pixel shader loaded, in this case
+  // SQ_PROGRAM_CNTL may contain a number smaller than actually needed by the
+  // pixel shader - SQ_PROGRAM_CNTL should be used to go above this count if
+  // uses_register_dynamic_addressing is true.
+  uint32_t register_static_address_bound() const {
+    return register_static_address_bound_;
   }
 
-  // True if the shader overrides the pixel depth.
-  bool writes_depth() const { return writes_depth_; }
+  // Whether the shader addresses temporary registers dynamically, thus
+  // SQ_PROGRAM_CNTL should determine the number of registers to use, not only
+  // register_static_address_bound.
+  bool uses_register_dynamic_addressing() const {
+    return uses_register_dynamic_addressing_;
+  }
+
+  // For building shader modification bits (and also for normalization of them),
+  // returns the amount of temporary registers that need to be allocated
+  // explicitly - if not using register dynamic addressing, the shader
+  // translator will use register_static_address_bound directly.
+  uint32_t GetDynamicAddressableRegisterCount(
+      uint32_t program_cntl_num_reg) const {
+    if (!uses_register_dynamic_addressing()) {
+      return 0;
+    }
+    return std::max((program_cntl_num_reg & 0x80)
+                        ? uint32_t(0)
+                        : (program_cntl_num_reg + uint32_t(1)),
+                    register_static_address_bound());
+  }
 
   // True if the current shader has any `kill` instructions.
   bool kills_pixels() const { return kills_pixels_; }
 
-  // Microcode disassembly in D3D format.
-  const std::string& ucode_disassembly() const { return ucode_disassembly_; }
+  // True if the shader overrides the pixel depth.
+  bool writes_depth() const { return writes_depth_; }
+
+  // Whether the shader can have early depth and stencil writing enabled, unless
+  // alpha test or alpha to coverage is enabled.
+  bool implicit_early_z_write_allowed() const {
+    // TODO(Triang3l): Investigate what happens to memexport when the pixel
+    // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
+    // depth/stencil.
+    return !writes_depth() && !kills_pixels() &&
+           memexport_stream_constants().empty();
+  }
+
+  // Whether each color render target is written to on any execution path.
+  uint32_t writes_color_targets() const { return writes_color_targets_; }
+  bool writes_color_target(uint32_t i) const {
+    return (writes_color_targets() & (uint32_t(1) << i)) != 0;
+  }
+
+  // Host translations with the specified modification bits. Not thread-safe
+  // with respect to translation creation/destruction.
+  const std::unordered_map<uint64_t, Translation*>& translations() const {
+    return translations_;
+  }
+  Translation* GetTranslation(uint64_t modification) const {
+    auto it = translations_.find(modification);
+    if (it != translations_.cend()) {
+      return it->second;
+    }
+    return nullptr;
+  }
+  Translation* GetOrCreateTranslation(uint64_t modification,
+                                      bool* is_new = nullptr);
+  // For shader storage loading, to remove a modification in case of translation
+  // failure. Not thread-safe.
+  void DestroyTranslation(uint64_t modification);
 
   // An externally managed identifier of the shader storage the microcode of the
   // shader was last written to, or was loaded from, to only write the shader
@@ -835,33 +936,68 @@ class Shader {
  protected:
   friend class ShaderTranslator;
 
-  virtual Translation* CreateTranslationInstance(uint32_t modification);
+  virtual Translation* CreateTranslationInstance(uint64_t modification);
 
   xenos::ShaderType shader_type_;
   std::vector<uint32_t> ucode_data_;
   uint64_t ucode_data_hash_;
 
-  // Modification bits -> translation.
-  std::unordered_map<uint32_t, Translation*> translations_;
+  // Whether info needed before translating has been gathered already - may be
+  // needed to determine which modifications are actually needed and make sense
+  // (for instance, there may be draws not covering anything and not allocating
+  // any pixel shader registers in SQ_PROGRAM_CNTL, but still using the pixel
+  // shader from the previous draw - in this case, every shader that happens to
+  // be before such draw will need to be translated again with a different
+  // dynamically addressed register count, which may cause compilation of
+  // different random pipelines across many random frames, thus causing
+  // stuttering - normally host pipeline states are deterministically only
+  // compiled when a new material appears in the game, and having the order of
+  // draws also matter in such unpredictable way would break this rule; limit
+  // the effect to shaders with dynamic register addressing only, which are
+  // extremely rare), also some info needed for drawing is collected during the
+  // ucode analysis.
+  bool is_ucode_analyzed_ = false;
 
-  // Whether setup of the post-translation parameters (listed below, plus those
-  // specific to the implementation) has been initiated, by any thread. If
-  // translation is performed on multiple threads, only one thread must be
-  // setting this up (other threads would write the same data anyway).
-  std::atomic_flag post_translation_info_set_up_ = ATOMIC_FLAG_INIT;
-
-  // Initialized after the first successful translation (these don't depend on
-  // the host-side modification bits).
   std::string ucode_disassembly_;
   std::vector<VertexBinding> vertex_bindings_;
   std::vector<TextureBinding> texture_bindings_;
   ConstantRegisterMap constant_register_map_ = {0};
-  bool writes_color_targets_[4] = {false, false, false, false};
-  bool writes_depth_ = false;
+  uint8_t memexport_eM_written_[kMaxMemExports] = {};
+  std::set<uint32_t> memexport_stream_constants_;
+  std::set<uint32_t> label_addresses_;
+  uint32_t cf_pair_index_bound_ = 0;
+  uint32_t register_static_address_bound_ = 0;
+  bool uses_register_dynamic_addressing_ = false;
   bool kills_pixels_ = false;
-  std::vector<uint32_t> memexport_stream_constants_;
+  bool writes_depth_ = false;
+  uint32_t writes_color_targets_ = 0b0000;
+
+  // Modification bits -> translation.
+  std::unordered_map<uint64_t, Translation*> translations_;
 
   uint32_t ucode_storage_index_ = UINT32_MAX;
+
+ private:
+  void GatherExecInformation(
+      const ParsedExecInstruction& instr,
+      ucode::VertexFetchInstruction& previous_vfetch_full,
+      uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
+      uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer);
+  void GatherVertexFetchInformation(
+      const ucode::VertexFetchInstruction& op,
+      ucode::VertexFetchInstruction& previous_vfetch_full,
+      StringBuffer& ucode_disasm_buffer);
+  void GatherTextureFetchInformation(const ucode::TextureFetchInstruction& op,
+                                     uint32_t& unique_texture_bindings,
+                                     StringBuffer& ucode_disasm_buffer);
+  void GatherAluInstructionInformation(const ucode::AluInstruction& op,
+                                       uint32_t memexport_alloc_current_count,
+                                       uint32_t& memexport_eA_written,
+                                       StringBuffer& ucode_disasm_buffer);
+  void GatherOperandInformation(const InstructionOperand& operand);
+  void GatherFetchResultInformation(const InstructionResult& result);
+  void GatherAluResultInformation(const InstructionResult& result,
+                                  uint32_t memexport_alloc_current_count);
 };
 
 }  // namespace gpu
diff --git a/src/xenia/gpu/shader_compiler_main.cc b/src/xenia/gpu/shader_compiler_main.cc
index a9a744955..4874928d3 100644
--- a/src/xenia/gpu/shader_compiler_main.cc
+++ b/src/xenia/gpu/shader_compiler_main.cc
@@ -17,6 +17,7 @@
 #include "xenia/base/main.h"
 #include "xenia/base/platform.h"
 #include "xenia/base/string.h"
+#include "xenia/base/string_buffer.h"
 #include "xenia/gpu/dxbc_shader_translator.h"
 #include "xenia/gpu/shader_translator.h"
 #include "xenia/gpu/spirv_shader_translator.h"
@@ -104,6 +105,8 @@ int shader_compiler_main(const std::vector<std::string>& args) {
   auto shader = std::make_unique<Shader>(
       shader_type, ucode_data_hash, ucode_dwords.data(), ucode_dwords.size());
 
+  shader->AnalyzeUcode(StringBuffer());
+
   std::unique_ptr<ShaderTranslator> translator;
   if (cvars::shader_output_type == "spirv" ||
       cvars::shader_output_type == "spirvtext") {
@@ -114,7 +117,15 @@ int shader_compiler_main(const std::vector<std::string>& args) {
         0, cvars::shader_output_bindless_resources,
         cvars::shader_output_dxbc_rov);
   } else {
-    translator = std::make_unique<UcodeShaderTranslator>();
+    // Just output microcode disassembly generated during microcode information
+    // gathering.
+    if (!cvars::shader_output.empty()) {
+      auto output_file = filesystem::OpenFile(cvars::shader_output, "wb");
+      fwrite(shader->ucode_disassembly().c_str(), 1,
+             shader->ucode_disassembly().length(), output_file);
+      fclose(output_file);
+    }
+    return 0;
   }
 
   Shader::HostVertexShaderType host_vertex_shader_type =
@@ -140,12 +151,12 @@ int shader_compiler_main(const std::vector<std::string>& args) {
           Shader::HostVertexShaderType::kQuadDomainPatchIndexed;
     }
   }
-  uint32_t modification =
-      translator->GetDefaultModification(shader_type, host_vertex_shader_type);
+  uint64_t modification = translator->GetDefaultModification(
+      shader_type, 64, host_vertex_shader_type);
 
   Shader::Translation* translation =
       shader->GetOrCreateTranslation(modification);
-  translator->Translate(*translation);
+  translator->TranslateAnalyzedShader(*translation);
 
   const void* source_data = translation->translated_binary().data();
   size_t source_data_size = translation->translated_binary().size();
diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index 6d79e82c2..80f122ba9 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -9,7 +9,9 @@
 
 #include "xenia/gpu/shader_translator.h"
 
+#include <algorithm>
 #include <cstdarg>
+#include <cstring>
 #include <set>
 #include <string>
 
@@ -42,91 +44,159 @@ using namespace ucode;
 // Lots of naming comes from the disassembly spit out by the XNA GS compiler
 // and dumps of d3dcompiler and games: https://pastebin.com/i4kAv7bB
 
-ShaderTranslator::ShaderTranslator() = default;
-
-ShaderTranslator::~ShaderTranslator() = default;
-
-void ShaderTranslator::Reset(xenos::ShaderType shader_type) {
-  shader_type_ = shader_type;
-  modification_ = GetDefaultModification(shader_type);
-  errors_.clear();
-  ucode_disasm_buffer_.Reset();
-  ucode_disasm_line_number_ = 0;
-  previous_ucode_disasm_scan_offset_ = 0;
-  register_count_ = 64;
-  label_addresses_.clear();
-  total_attrib_count_ = 0;
-  vertex_bindings_.clear();
-  unique_vertex_bindings_ = 0;
-  texture_bindings_.clear();
-  unique_texture_bindings_ = 0;
-  std::memset(&constant_register_map_, 0, sizeof(constant_register_map_));
-  uses_register_dynamic_addressing_ = false;
-  for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
-    writes_color_targets_[i] = false;
+void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
+  if (is_ucode_analyzed_) {
+    return;
   }
-  writes_depth_ = false;
-  kills_pixels_ = false;
-  memexport_alloc_count_ = 0;
-  memexport_eA_written_ = 0;
-  std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_));
-  memexport_stream_constants_.clear();
-}
-
-bool ShaderTranslator::Translate(Shader::Translation& translation,
-                                 reg::SQ_PROGRAM_CNTL cntl) {
-  xenos::ShaderType shader_type = translation.shader().type();
-  Reset(shader_type);
-  uint32_t cntl_num_reg = shader_type == xenos::ShaderType::kVertex
-                              ? cntl.vs_num_reg
-                              : cntl.ps_num_reg;
-  register_count_ = (cntl_num_reg & 0x80) ? 0 : (cntl_num_reg + 1);
-
-  return TranslateInternal(translation);
-}
-
-bool ShaderTranslator::Translate(Shader::Translation& translation) {
-  Reset(translation.shader().type());
-  return TranslateInternal(translation);
-}
-
-bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) {
-  Shader& shader = translation.shader();
-  assert_true(shader_type_ == shader.type());
-  shader_type_ = shader.type();
-  ucode_dwords_ = shader.ucode_dwords();
-  ucode_dword_count_ = shader.ucode_dword_count();
-  modification_ = translation.modification();
 
   // Control flow instructions come paired in blocks of 3 dwords and all are
   // listed at the top of the ucode.
   // Each control flow instruction is executed sequentially until the final
   // ending instruction.
-  uint32_t max_cf_dword_index = static_cast<uint32_t>(ucode_dword_count_);
-  std::vector<ControlFlowInstruction> cf_instructions;
-  for (uint32_t i = 0; i < max_cf_dword_index; i += 3) {
-    ControlFlowInstruction cf_a;
-    ControlFlowInstruction cf_b;
-    UnpackControlFlowInstructions(ucode_dwords_ + i, &cf_a, &cf_b);
-    // Guess how long the control flow program is by scanning for the first
-    // kExec-ish and instruction and using its address as the upper bound.
-    // This is what freedreno does.
-    if (IsControlFlowOpcodeExec(cf_a.opcode())) {
-      max_cf_dword_index =
-          std::min(max_cf_dword_index, cf_a.exec.address() * 3);
+  // Gather the upper bound of the control flow instructions, and label
+  // addresses, which are needed for disassembly.
+  cf_pair_index_bound_ = uint32_t(ucode_data_.size() / 3);
+  for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
+    ControlFlowInstruction cf_ab[2];
+    UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
+    for (uint32_t j = 0; j < 2; ++j) {
+      // Guess how long the control flow program is by scanning for the first
+      // kExec-ish instruction and using its address as the upper bound.
+      // This is what freedreno does.
+      const ControlFlowInstruction& cf = cf_ab[j];
+      if (IsControlFlowOpcodeExec(cf.opcode())) {
+        cf_pair_index_bound_ =
+            std::min(cf_pair_index_bound_, cf.exec.address());
+      }
+      switch (cf.opcode()) {
+        case ControlFlowOpcode::kCondCall:
+          label_addresses_.insert(cf.cond_call.address());
+          break;
+        case ControlFlowOpcode::kCondJmp:
+          label_addresses_.insert(cf.cond_jmp.address());
+          break;
+        case ControlFlowOpcode::kLoopStart:
+          label_addresses_.insert(cf.loop_start.address());
+          break;
+        case ControlFlowOpcode::kLoopEnd:
+          label_addresses_.insert(cf.loop_end.address());
+          break;
+        default:
+          break;
+      }
     }
-    if (IsControlFlowOpcodeExec(cf_b.opcode())) {
-      max_cf_dword_index =
-          std::min(max_cf_dword_index, cf_b.exec.address() * 3);
-    }
-    // Gather all labels, binding, operand addressing and export information.
-    // Translators may need this before they start codegen.
-    GatherInstructionInformation(cf_a);
-    GatherInstructionInformation(cf_b);
-    cf_instructions.push_back(cf_a);
-    cf_instructions.push_back(cf_b);
   }
 
+  // Disassemble and gather information.
+  ucode_disasm_buffer.Reset();
+  VertexFetchInstruction previous_vfetch_full;
+  std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full));
+  uint32_t unique_texture_bindings = 0;
+  uint32_t memexport_alloc_count = 0;
+  uint32_t memexport_eA_written = 0;
+  for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
+    ControlFlowInstruction cf_ab[2];
+    UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
+    for (uint32_t j = 0; j < 2; ++j) {
+      uint32_t cf_index = i * 2 + j;
+      if (label_addresses_.find(cf_index) != label_addresses_.end()) {
+        ucode_disasm_buffer.AppendFormat("                label L{}\n",
+                                         cf_index);
+      }
+      ucode_disasm_buffer.AppendFormat("/* {:4d}.{} */ ", i, j);
+
+      const ControlFlowInstruction& cf = cf_ab[j];
+      uint32_t bool_constant_index = UINT32_MAX;
+      switch (cf.opcode()) {
+        case ControlFlowOpcode::kNop:
+          ucode_disasm_buffer.Append("      cnop\n");
+          break;
+        case ControlFlowOpcode::kExec:
+        case ControlFlowOpcode::kExecEnd: {
+          ParsedExecInstruction instr;
+          ParseControlFlowExec(cf.exec, cf_index, instr);
+          GatherExecInformation(instr, previous_vfetch_full,
+                                unique_texture_bindings, memexport_alloc_count,
+                                memexport_eA_written, ucode_disasm_buffer);
+        } break;
+        case ControlFlowOpcode::kCondExec:
+        case ControlFlowOpcode::kCondExecEnd:
+        case ControlFlowOpcode::kCondExecPredClean:
+        case ControlFlowOpcode::kCondExecPredCleanEnd: {
+          bool_constant_index = cf.cond_exec.bool_address();
+          ParsedExecInstruction instr;
+          ParseControlFlowCondExec(cf.cond_exec, cf_index, instr);
+          GatherExecInformation(instr, previous_vfetch_full,
+                                unique_texture_bindings, memexport_alloc_count,
+                                memexport_eA_written, ucode_disasm_buffer);
+        } break;
+        case ControlFlowOpcode::kCondExecPred:
+        case ControlFlowOpcode::kCondExecPredEnd: {
+          ParsedExecInstruction instr;
+          ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr);
+          GatherExecInformation(instr, previous_vfetch_full,
+                                unique_texture_bindings, memexport_alloc_count,
+                                memexport_eA_written, ucode_disasm_buffer);
+        } break;
+        case ControlFlowOpcode::kLoopStart: {
+          ParsedLoopStartInstruction instr;
+          ParseControlFlowLoopStart(cf.loop_start, cf_index, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+          constant_register_map_.loop_bitmap |= uint32_t(1)
+                                                << instr.loop_constant_index;
+        } break;
+        case ControlFlowOpcode::kLoopEnd: {
+          ParsedLoopEndInstruction instr;
+          ParseControlFlowLoopEnd(cf.loop_end, cf_index, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+          constant_register_map_.loop_bitmap |= uint32_t(1)
+                                                << instr.loop_constant_index;
+        } break;
+        case ControlFlowOpcode::kCondCall: {
+          ParsedCallInstruction instr;
+          ParseControlFlowCondCall(cf.cond_call, cf_index, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+          if (instr.type == ParsedCallInstruction::Type::kConditional) {
+            bool_constant_index = instr.bool_constant_index;
+          }
+        } break;
+        case ControlFlowOpcode::kReturn: {
+          ParsedReturnInstruction instr;
+          ParseControlFlowReturn(cf.ret, cf_index, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+        } break;
+        case ControlFlowOpcode::kCondJmp: {
+          ParsedJumpInstruction instr;
+          ParseControlFlowCondJmp(cf.cond_jmp, cf_index, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+          if (instr.type == ParsedJumpInstruction::Type::kConditional) {
+            bool_constant_index = instr.bool_constant_index;
+          }
+        } break;
+        case ControlFlowOpcode::kAlloc: {
+          ParsedAllocInstruction instr;
+          ParseControlFlowAlloc(cf.alloc, cf_index,
+                                type() == xenos::ShaderType::kVertex, instr);
+          instr.Disassemble(&ucode_disasm_buffer);
+          if (instr.type == AllocType::kMemory) {
+            ++memexport_alloc_count;
+          }
+        } break;
+        case ControlFlowOpcode::kMarkVsFetchDone:
+          break;
+        default:
+          assert_unhandled_case(cf.opcode);
+          break;
+      }
+      if (bool_constant_index != UINT32_MAX) {
+        constant_register_map_.bool_bitmap[bool_constant_index / 32] |=
+            uint32_t(1) << (bool_constant_index % 32);
+      }
+      // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))?
+    }
+  }
+  ucode_disassembly_ = ucode_disasm_buffer.to_string();
+
   if (constant_register_map_.float_dynamic_addressing) {
     // All potentially can be referenced.
     constant_register_map_.float_count = 256;
@@ -143,335 +213,75 @@ bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) {
 
   // Cleanup invalid/unneeded memexport allocs.
   for (uint32_t i = 0; i < kMaxMemExports; ++i) {
-    if (!(memexport_eA_written_ & (uint32_t(1) << i))) {
+    if (!(memexport_eA_written & (uint32_t(1) << i))) {
       memexport_eM_written_[i] = 0;
     } else if (!memexport_eM_written_[i]) {
-      memexport_eA_written_ &= ~(uint32_t(1) << i);
+      memexport_eA_written &= ~(uint32_t(1) << i);
     }
   }
-  if (memexport_eA_written_ == 0) {
+  if (memexport_eA_written == 0) {
     memexport_stream_constants_.clear();
   }
 
-  StartTranslation();
+  is_ucode_analyzed_ = true;
+}
 
-  PreProcessControlFlowInstructions(cf_instructions);
-
-  // Translate all instructions.
-  for (uint32_t i = 0, cf_index = 0; i < max_cf_dword_index; i += 3) {
-    ControlFlowInstruction cf_a;
-    ControlFlowInstruction cf_b;
-    UnpackControlFlowInstructions(ucode_dwords_ + i, &cf_a, &cf_b);
-
-    cf_index_ = cf_index;
-    MarkUcodeInstruction(i);
-    if (label_addresses_.find(cf_index) != label_addresses_.end()) {
-      AppendUcodeDisasmFormat("                label L%u\n", cf_index);
-      ProcessLabel(cf_index);
+void Shader::GatherExecInformation(
+    const ParsedExecInstruction& instr,
+    ucode::VertexFetchInstruction& previous_vfetch_full,
+    uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
+    uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
+  instr.Disassemble(&ucode_disasm_buffer);
+  uint32_t sequence = instr.sequence;
+  for (uint32_t instr_offset = instr.instruction_address;
+       instr_offset < instr.instruction_address + instr.instruction_count;
+       ++instr_offset, sequence >>= 2) {
+    ucode_disasm_buffer.AppendFormat("/* {:4d}   */ ", instr_offset);
+    if (sequence & 0b10) {
+      ucode_disasm_buffer.Append("         serialize\n             ");
     }
-    AppendUcodeDisasmFormat("/* %4u.0 */ ", cf_index / 2);
-    ProcessControlFlowInstructionBegin(cf_index);
-    TranslateControlFlowInstruction(cf_a);
-    ProcessControlFlowInstructionEnd(cf_index);
-    ++cf_index;
-
-    cf_index_ = cf_index;
-    MarkUcodeInstruction(i);
-    if (label_addresses_.find(cf_index) != label_addresses_.end()) {
-      AppendUcodeDisasmFormat("                label L%u\n", cf_index);
-      ProcessLabel(cf_index);
-    }
-    AppendUcodeDisasmFormat("/* %4u.1 */ ", cf_index / 2);
-    ProcessControlFlowInstructionBegin(cf_index);
-    TranslateControlFlowInstruction(cf_b);
-    ProcessControlFlowInstructionEnd(cf_index);
-    ++cf_index;
-  }
-
-  translation.errors_ = std::move(errors_);
-  translation.translated_binary_ = CompleteTranslation();
-  translation.is_translated_ = true;
-
-  bool is_valid = true;
-  for (const auto& error : translation.errors_) {
-    if (error.is_fatal) {
-      is_valid = false;
-      break;
-    }
-  }
-  translation.is_valid_ = is_valid;
-
-  // Setup info that doesn't depend on the modification only once.
-  bool setup_shader_post_translation_info =
-      is_valid && !shader.post_translation_info_set_up_.test_and_set();
-  if (setup_shader_post_translation_info) {
-    shader.ucode_disassembly_ = ucode_disasm_buffer_.to_string();
-    shader.vertex_bindings_ = std::move(vertex_bindings_);
-    shader.texture_bindings_ = std::move(texture_bindings_);
-    shader.constant_register_map_ = std::move(constant_register_map_);
-    for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
-      shader.writes_color_targets_[i] = writes_color_targets_[i];
-    }
-    shader.writes_depth_ = writes_depth_;
-    shader.kills_pixels_ = kills_pixels_;
-    shader.memexport_stream_constants_.clear();
-    shader.memexport_stream_constants_.reserve(
-        memexport_stream_constants_.size());
-    shader.memexport_stream_constants_.insert(
-        shader.memexport_stream_constants_.cend(),
-        memexport_stream_constants_.cbegin(),
-        memexport_stream_constants_.cend());
-  }
-  PostTranslation(translation, setup_shader_post_translation_info);
-
-  // In case is_valid_ is modified by PostTranslation, reload.
-  return translation.is_valid_;
-}
-
-void ShaderTranslator::MarkUcodeInstruction(uint32_t dword_offset) {
-  auto disasm = ucode_disasm_buffer_.buffer();
-  size_t current_offset = ucode_disasm_buffer_.length();
-  for (size_t i = previous_ucode_disasm_scan_offset_; i < current_offset; ++i) {
-    if (disasm[i] == '\n') {
-      ++ucode_disasm_line_number_;
-    }
-  }
-  previous_ucode_disasm_scan_offset_ = current_offset;
-}
-
-void ShaderTranslator::AppendUcodeDisasm(char c) {
-  ucode_disasm_buffer_.Append(c);
-}
-
-void ShaderTranslator::AppendUcodeDisasm(const char* value) {
-  ucode_disasm_buffer_.Append(value);
-}
-
-void ShaderTranslator::AppendUcodeDisasmFormat(const char* format, ...) {
-  va_list va;
-  va_start(va, format);
-  ucode_disasm_buffer_.AppendVarargs(format, va);
-  va_end(va);
-}
-
-void ShaderTranslator::EmitTranslationError(const char* message,
-                                            bool is_fatal) {
-  Shader::Error error;
-  error.is_fatal = is_fatal;
-  error.message = message;
-  // TODO(benvanik): location information.
-  errors_.push_back(std::move(error));
-  XELOGE("Shader translation {}error: {}", is_fatal ? "fatal " : "", message);
-}
-
-void ShaderTranslator::GatherInstructionInformation(
-    const ControlFlowInstruction& cf) {
-  uint32_t bool_constant_index = UINT32_MAX;
-  switch (cf.opcode()) {
-    case ControlFlowOpcode::kCondExec:
-    case ControlFlowOpcode::kCondExecEnd:
-    case ControlFlowOpcode::kCondExecPredClean:
-    case ControlFlowOpcode::kCondExecPredCleanEnd:
-      bool_constant_index = cf.cond_exec.bool_address();
-      break;
-    case ControlFlowOpcode::kCondCall:
-      label_addresses_.insert(cf.cond_call.address());
-      if (!cf.cond_call.is_unconditional() && !cf.cond_call.is_predicated()) {
-        bool_constant_index = cf.cond_call.bool_address();
+    if (sequence & 0b01) {
+      auto fetch_opcode = FetchOpcode(ucode_data_[instr_offset * 3] & 0x1F);
+      if (fetch_opcode == FetchOpcode::kVertexFetch) {
+        auto& op = *reinterpret_cast<const VertexFetchInstruction*>(
+            ucode_data_.data() + instr_offset * 3);
+        GatherVertexFetchInformation(op, previous_vfetch_full,
+                                     ucode_disasm_buffer);
+      } else {
+        auto& op = *reinterpret_cast<const TextureFetchInstruction*>(
+            ucode_data_.data() + instr_offset * 3);
+        GatherTextureFetchInformation(op, unique_texture_bindings,
+                                      ucode_disasm_buffer);
       }
-      break;
-    case ControlFlowOpcode::kCondJmp:
-      label_addresses_.insert(cf.cond_jmp.address());
-      if (!cf.cond_jmp.is_unconditional() && !cf.cond_jmp.is_predicated()) {
-        bool_constant_index = cf.cond_jmp.bool_address();
-      }
-      break;
-    case ControlFlowOpcode::kLoopStart:
-      label_addresses_.insert(cf.loop_start.address());
-      constant_register_map_.loop_bitmap |= uint32_t(1)
-                                            << cf.loop_start.loop_id();
-      break;
-    case ControlFlowOpcode::kLoopEnd:
-      label_addresses_.insert(cf.loop_end.address());
-      constant_register_map_.loop_bitmap |= uint32_t(1)
-                                            << cf.loop_end.loop_id();
-      break;
-    case ControlFlowOpcode::kAlloc:
-      if (cf.alloc.alloc_type() == AllocType::kMemory) {
-        ++memexport_alloc_count_;
-      }
-      break;
-    default:
-      break;
-  }
-  if (bool_constant_index != UINT32_MAX) {
-    constant_register_map_.bool_bitmap[bool_constant_index / 32] |=
-        uint32_t(1) << (bool_constant_index % 32);
-  }
-
-  switch (cf.opcode()) {
-    case ControlFlowOpcode::kExec:
-    case ControlFlowOpcode::kExecEnd:
-    case ControlFlowOpcode::kCondExec:
-    case ControlFlowOpcode::kCondExecEnd:
-    case ControlFlowOpcode::kCondExecPred:
-    case ControlFlowOpcode::kCondExecPredEnd:
-    case ControlFlowOpcode::kCondExecPredClean:
-    case ControlFlowOpcode::kCondExecPredCleanEnd: {
-      uint32_t sequence = cf.exec.sequence();
-      for (uint32_t instr_offset = cf.exec.address();
-           instr_offset < cf.exec.address() + cf.exec.count();
-           ++instr_offset, sequence >>= 2) {
-        bool is_fetch = (sequence & 0x1) == 0x1;
-        if (is_fetch) {
-          // Gather vertex and texture fetches.
-          auto fetch_opcode =
-              static_cast<FetchOpcode>(ucode_dwords_[instr_offset * 3] & 0x1F);
-          if (fetch_opcode == FetchOpcode::kVertexFetch) {
-            assert_true(is_vertex_shader());
-            GatherVertexFetchInformation(
-                *reinterpret_cast<const VertexFetchInstruction*>(
-                    ucode_dwords_ + instr_offset * 3));
-          } else {
-            GatherTextureFetchInformation(
-                *reinterpret_cast<const TextureFetchInstruction*>(
-                    ucode_dwords_ + instr_offset * 3));
-          }
-        } else {
-          // Gather info needed for the translation pass because having such
-          // state changed in the middle of translation may break things. Check
-          // the comments for each specific variable set here to see usage
-          // restrictions that can be assumed here (such as only marking exports
-          // as written if the used write mask is non-empty).
-          auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ +
-                                                              instr_offset * 3);
-          ParsedAluInstruction instr;
-          ParseAluInstruction(op, instr);
-
-          kills_pixels_ = kills_pixels_ ||
-                          ucode::AluVectorOpcodeIsKill(op.vector_opcode()) ||
-                          ucode::AluScalarOpcodeIsKill(op.scalar_opcode());
-
-          if (instr.vector_and_constant_result.storage_target !=
-                  InstructionStorageTarget::kRegister ||
-              instr.scalar_result.storage_target !=
-                  InstructionStorageTarget::kRegister) {
-            // Export is done to vector_dest of the ucode instruction for both
-            // vector and scalar operations - no need to check separately.
-            assert_true(instr.vector_and_constant_result.storage_target ==
-                            instr.scalar_result.storage_target &&
-                        instr.vector_and_constant_result.storage_index ==
-                            instr.scalar_result.storage_index);
-            if (instr.vector_and_constant_result.GetUsedWriteMask() ||
-                instr.scalar_result.GetUsedWriteMask()) {
-              InstructionStorageTarget export_target =
-                  instr.vector_and_constant_result.storage_target;
-              uint32_t export_index =
-                  instr.vector_and_constant_result.storage_index;
-              switch (export_target) {
-                case InstructionStorageTarget::kExportAddress:
-                  // Store used memexport constants because CPU code needs
-                  // addresses and sizes, and also whether there have been
-                  // writes to eA and eM# for register allocation in shader
-                  // translator implementations.
-                  // eA is (hopefully) always written to using:
-                  // mad eA, r#, const0100, c#
-                  // (though there are some exceptions, shaders in Halo 3 for
-                  // some reason set eA to zeros, but the swizzle of the
-                  // constant is not .xyzw in this case, and they don't write to
-                  // eM#).
-                  if (memexport_alloc_count_ > 0 &&
-                      memexport_alloc_count_ <= kMaxMemExports) {
-                    uint32_t memexport_stream_constant =
-                        instr.GetMemExportStreamConstant();
-                    if (memexport_stream_constant != UINT32_MAX) {
-                      memexport_eA_written_ |= uint32_t(1)
-                                               << (memexport_alloc_count_ - 1);
-                      memexport_stream_constants_.insert(
-                          memexport_stream_constant);
-                    } else {
-                      XELOGE(
-                          "ShaderTranslator::GatherInstructionInformation: "
-                          "Couldn't extract memexport stream constant index");
-                    }
-                  }
-                  break;
-                case InstructionStorageTarget::kExportData:
-                  if (memexport_alloc_count_ > 0 &&
-                      memexport_alloc_count_ <= kMaxMemExports) {
-                    memexport_eM_written_[memexport_alloc_count_ - 1] |=
-                        uint32_t(1) << export_index;
-                  }
-                  break;
-                case InstructionStorageTarget::kColor:
-                  writes_color_targets_[export_index] = true;
-                  break;
-                case InstructionStorageTarget::kDepth:
-                  writes_depth_ = true;
-                  break;
-                default:
-                  break;
-              }
-            }
-          } else {
-            if ((instr.vector_and_constant_result.GetUsedWriteMask() &&
-                 instr.vector_and_constant_result.storage_addressing_mode !=
-                     InstructionStorageAddressingMode::kStatic) ||
-                (instr.scalar_result.GetUsedWriteMask() &&
-                 instr.scalar_result.storage_addressing_mode !=
-                     InstructionStorageAddressingMode::kStatic)) {
-              uses_register_dynamic_addressing_ = true;
-            }
-          }
-
-          uint32_t total_operand_count =
-              instr.vector_operand_count + instr.scalar_operand_count;
-          for (uint32_t i = 0; i < total_operand_count; ++i) {
-            const InstructionOperand& operand =
-                (i < instr.vector_operand_count)
-                    ? instr.vector_operands[i]
-                    : instr.scalar_operands[i - instr.vector_operand_count];
-            if (operand.storage_source == InstructionStorageSource::kRegister) {
-              if (operand.storage_addressing_mode !=
-                  InstructionStorageAddressingMode::kStatic) {
-                uses_register_dynamic_addressing_ = true;
-              }
-            } else if (operand.storage_source ==
-                       InstructionStorageSource::kConstantFloat) {
-              if (operand.storage_addressing_mode ==
-                  InstructionStorageAddressingMode::kStatic) {
-                // Store used float constants before translating so the
-                // translator can use tightly packed indices if not dynamically
-                // indexed.
-                uint32_t constant_index = operand.storage_index;
-                constant_register_map_.float_bitmap[constant_index / 64] |=
-                    uint64_t(1) << (constant_index % 64);
-              } else {
-                constant_register_map_.float_dynamic_addressing = true;
-              }
-            }
-          }
-        }
-      }
-    } break;
-    default:
-      break;
+    } else {
+      auto& op = *reinterpret_cast<const AluInstruction*>(ucode_data_.data() +
+                                                          instr_offset * 3);
+      GatherAluInstructionInformation(op, memexport_alloc_current_count,
+                                      memexport_eA_written,
+                                      ucode_disasm_buffer);
+    }
   }
 }
 
-void ShaderTranslator::GatherVertexFetchInformation(
-    const VertexFetchInstruction& op) {
+void Shader::GatherVertexFetchInformation(
+    const VertexFetchInstruction& op,
+    VertexFetchInstruction& previous_vfetch_full,
+    StringBuffer& ucode_disasm_buffer) {
   ParsedVertexFetchInstruction fetch_instr;
-  ParseVertexFetchInstruction(op, &fetch_instr);
+  if (ParseVertexFetchInstruction(op, previous_vfetch_full, fetch_instr)) {
+    previous_vfetch_full = op;
+  }
+  fetch_instr.Disassemble(&ucode_disasm_buffer);
+
+  GatherFetchResultInformation(fetch_instr.result);
 
   // Don't bother setting up a binding for an instruction that fetches nothing.
-  if (!op.fetches_any_data()) {
+  if (!fetch_instr.result.GetUsedResultComponents()) {
     return;
   }
 
-  // Check if using dynamic register indices.
-  if (op.is_dest_relative() || op.is_src_relative()) {
-    uses_register_dynamic_addressing_ = true;
+  for (size_t i = 0; i < fetch_instr.operand_count; ++i) {
+    GatherOperandInformation(fetch_instr.operands[i]);
   }
 
   // Try to allocate an attribute on an existing binding.
@@ -500,17 +310,19 @@ void ShaderTranslator::GatherVertexFetchInformation(
   }
 
   // Populate attribute.
-  attrib->attrib_index = total_attrib_count_++;
   attrib->fetch_instr = fetch_instr;
-  attrib->size_words = xenos::GetVertexFormatSizeInWords(
-      attrib->fetch_instr.attributes.data_format);
 }
 
-void ShaderTranslator::GatherTextureFetchInformation(
-    const TextureFetchInstruction& op) {
-  // Check if using dynamic register indices.
-  if (op.is_dest_relative() || op.is_src_relative()) {
-    uses_register_dynamic_addressing_ = true;
+void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
+                                           uint32_t& unique_texture_bindings,
+                                           StringBuffer& ucode_disasm_buffer) {
+  TextureBinding binding;
+  ParseTextureFetchInstruction(op, binding.fetch_instr);
+  binding.fetch_instr.Disassemble(&ucode_disasm_buffer);
+
+  GatherFetchResultInformation(binding.fetch_instr.result);
+  for (size_t i = 0; i < binding.fetch_instr.operand_count; ++i) {
+    GatherOperandInformation(binding.fetch_instr.operands[i]);
   }
 
   switch (op.opcode()) {
@@ -523,9 +335,7 @@ void ShaderTranslator::GatherTextureFetchInformation(
       // Continue.
       break;
   }
-  Shader::TextureBinding binding;
   binding.binding_index = -1;
-  ParseTextureFetchInstruction(op, &binding.fetch_instr);
   binding.fetch_constant = binding.fetch_instr.operands[1].storage_index;
 
   // Check and see if this fetch constant was previously used...
@@ -538,349 +348,502 @@ void ShaderTranslator::GatherTextureFetchInformation(
 
   if (binding.binding_index == -1) {
     // Assign a unique binding index.
-    binding.binding_index = unique_texture_bindings_++;
+    binding.binding_index = unique_texture_bindings++;
   }
 
   texture_bindings_.emplace_back(std::move(binding));
 }
 
-std::vector<uint8_t> UcodeShaderTranslator::CompleteTranslation() {
-  return ucode_disasm_buffer().to_bytes();
+void Shader::GatherAluInstructionInformation(
+    const AluInstruction& op, uint32_t memexport_alloc_current_count,
+    uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
+  ParsedAluInstruction instr;
+  ParseAluInstruction(op, type(), instr);
+  instr.Disassemble(&ucode_disasm_buffer);
+
+  kills_pixels_ = kills_pixels_ ||
+                  ucode::AluVectorOpcodeIsKill(op.vector_opcode()) ||
+                  ucode::AluScalarOpcodeIsKill(op.scalar_opcode());
+
+  GatherAluResultInformation(instr.vector_and_constant_result,
+                             memexport_alloc_current_count);
+  GatherAluResultInformation(instr.scalar_result,
+                             memexport_alloc_current_count);
+  for (size_t i = 0; i < instr.vector_operand_count; ++i) {
+    GatherOperandInformation(instr.vector_operands[i]);
+  }
+  for (size_t i = 0; i < instr.scalar_operand_count; ++i) {
+    GatherOperandInformation(instr.scalar_operands[i]);
+  }
+
+  // Store used memexport constants because CPU code needs addresses and sizes,
+  // and also whether there have been writes to eA and eM# for register
+  // allocation in shader translator implementations.
+  // eA is (hopefully) always written to using:
+  // mad eA, r#, const0100, c#
+  // (though there are some exceptions, shaders in Halo 3 for some reason set eA
+  // to zeros, but the swizzle of the constant is not .xyzw in this case, and
+  // they don't write to eM#).
+  // Export is done to vector_dest of the ucode instruction for both vector and
+  // scalar operations - no need to check separately.
+  if (instr.vector_and_constant_result.storage_target ==
+          InstructionStorageTarget::kExportAddress &&
+      memexport_alloc_current_count > 0 &&
+      memexport_alloc_current_count <= Shader::kMaxMemExports) {
+    uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant();
+    if (memexport_stream_constant != UINT32_MAX) {
+      memexport_eA_written |= uint32_t(1)
+                              << (memexport_alloc_current_count - 1);
+      memexport_stream_constants_.insert(memexport_stream_constant);
+    } else {
+      XELOGE(
+          "Shader::GatherAluInstructionInformation: Couldn't extract "
+          "memexport stream constant index");
+    }
+  }
+}
+
+void Shader::GatherOperandInformation(const InstructionOperand& operand) {
+  switch (operand.storage_source) {
+    case InstructionStorageSource::kRegister:
+      if (operand.storage_addressing_mode ==
+          InstructionStorageAddressingMode::kStatic) {
+        register_static_address_bound_ =
+            std::max(register_static_address_bound_,
+                     operand.storage_index + uint32_t(1));
+      } else {
+        uses_register_dynamic_addressing_ = true;
+      }
+      break;
+    case InstructionStorageSource::kConstantFloat:
+      if (operand.storage_addressing_mode ==
+          InstructionStorageAddressingMode::kStatic) {
+        // Store used float constants before translating so the
+        // translator can use tightly packed indices if not dynamically
+        // indexed.
+        constant_register_map_.float_bitmap[operand.storage_index >> 6] |=
+            uint64_t(1) << (operand.storage_index & 63);
+      } else {
+        constant_register_map_.float_dynamic_addressing = true;
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+void Shader::GatherFetchResultInformation(const InstructionResult& result) {
+  if (!result.GetUsedWriteMask()) {
+    return;
+  }
+  // Fetch instructions can't export - don't need the current memexport count
+  // operand.
+  assert_true(result.storage_target == InstructionStorageTarget::kRegister);
+  if (result.storage_addressing_mode ==
+      InstructionStorageAddressingMode::kStatic) {
+    register_static_address_bound_ = std::max(
+        register_static_address_bound_, result.storage_index + uint32_t(1));
+  } else {
+    uses_register_dynamic_addressing_ = true;
+  }
+}
+
+void Shader::GatherAluResultInformation(
+    const InstructionResult& result, uint32_t memexport_alloc_current_count) {
+  if (!result.GetUsedWriteMask()) {
+    return;
+  }
+  switch (result.storage_target) {
+    case InstructionStorageTarget::kRegister:
+      if (result.storage_addressing_mode ==
+          InstructionStorageAddressingMode::kStatic) {
+        register_static_address_bound_ = std::max(
+            register_static_address_bound_, result.storage_index + uint32_t(1));
+      } else {
+        uses_register_dynamic_addressing_ = true;
+      }
+      break;
+    case InstructionStorageTarget::kExportData:
+      if (memexport_alloc_current_count > 0 &&
+          memexport_alloc_current_count <= Shader::kMaxMemExports) {
+        memexport_eM_written_[memexport_alloc_current_count - 1] |=
+            uint32_t(1) << result.storage_index;
+      }
+      break;
+    case InstructionStorageTarget::kColor:
+      writes_color_targets_ |= uint32_t(1) << result.storage_index;
+      break;
+    case InstructionStorageTarget::kDepth:
+      writes_depth_ = true;
+      break;
+  }
+}
+
+ShaderTranslator::ShaderTranslator() = default;
+
+ShaderTranslator::~ShaderTranslator() = default;
+
+void ShaderTranslator::Reset() {
+  errors_.clear();
+  std::memset(&previous_vfetch_full_, 0, sizeof(previous_vfetch_full_));
+}
+
+bool ShaderTranslator::TranslateAnalyzedShader(
+    Shader::Translation& translation) {
+  const Shader& shader = translation.shader();
+  assert_true(shader.is_ucode_analyzed());
+  if (!shader.is_ucode_analyzed()) {
+    XELOGE("AnalyzeUcode must be done on the shader before translation");
+    return false;
+  }
+  translation_ = &translation;
+
+  Reset();
+
+  register_count_ = shader.register_static_address_bound();
+  if (shader.uses_register_dynamic_addressing()) {
+    // An array of registers at the end of the r# space may be dynamically
+    // addressable - ensure enough space, as specified in SQ_PROGRAM_CNTL, is
+    // allocated.
+    register_count_ = std::max(register_count_, GetModificationRegisterCount());
+  }
+
+  StartTranslation();
+
+  const uint32_t* ucode_dwords = shader.ucode_data().data();
+
+  // TODO(Triang3l): Remove when the old SPIR-V shader translator is deleted.
+  uint32_t cf_pair_index_bound = shader.cf_pair_index_bound();
+  std::vector<ControlFlowInstruction> cf_instructions;
+  for (uint32_t i = 0; i < cf_pair_index_bound; ++i) {
+    ControlFlowInstruction cf_ab[2];
+    UnpackControlFlowInstructions(ucode_dwords + i * 3, cf_ab);
+    cf_instructions.push_back(cf_ab[0]);
+    cf_instructions.push_back(cf_ab[1]);
+  }
+  PreProcessControlFlowInstructions(cf_instructions);
+
+  // Translate all instructions.
+  const std::set<uint32_t>& label_addresses = shader.label_addresses();
+  for (uint32_t i = 0; i < cf_pair_index_bound; ++i) {
+    ControlFlowInstruction cf_ab[2];
+    UnpackControlFlowInstructions(ucode_dwords + i * 3, cf_ab);
+    for (uint32_t j = 0; j < 2; ++j) {
+      uint32_t cf_index = i * 2 + j;
+      cf_index_ = cf_index;
+      if (label_addresses.find(cf_index) != label_addresses.end()) {
+        ProcessLabel(cf_index);
+      }
+      ProcessControlFlowInstructionBegin(cf_index);
+      TranslateControlFlowInstruction(cf_ab[j]);
+      ProcessControlFlowInstructionEnd(cf_index);
+    }
+  }
+
+  translation.errors_ = std::move(errors_);
+  translation.translated_binary_ = CompleteTranslation();
+  translation.is_translated_ = true;
+
+  bool is_valid = true;
+  for (const auto& error : translation.errors_) {
+    if (error.is_fatal) {
+      is_valid = false;
+      break;
+    }
+  }
+  translation.is_valid_ = is_valid;
+
+  PostTranslation();
+
+  // In case is_valid_ is modified by PostTranslation, reload.
+  return translation.is_valid_;
+}
+
+void ShaderTranslator::EmitTranslationError(const char* message,
+                                            bool is_fatal) {
+  Shader::Error error;
+  error.is_fatal = is_fatal;
+  error.message = message;
+  // TODO(benvanik): location information.
+  errors_.push_back(std::move(error));
+  XELOGE("Shader translation {}error: {}", is_fatal ? "fatal " : "", message);
 }
 
 void ShaderTranslator::TranslateControlFlowInstruction(
     const ControlFlowInstruction& cf) {
   switch (cf.opcode()) {
     case ControlFlowOpcode::kNop:
-      TranslateControlFlowNop(cf);
+      ProcessControlFlowNopInstruction(cf_index_);
       break;
     case ControlFlowOpcode::kExec:
-      TranslateControlFlowExec(cf.exec);
-      break;
-    case ControlFlowOpcode::kExecEnd:
-      TranslateControlFlowExec(cf.exec);
-      break;
+    case ControlFlowOpcode::kExecEnd: {
+      ParsedExecInstruction instr;
+      ParseControlFlowExec(cf.exec, cf_index_, instr);
+      TranslateExecInstructions(instr);
+    } break;
     case ControlFlowOpcode::kCondExec:
-      TranslateControlFlowCondExec(cf.cond_exec);
-      break;
     case ControlFlowOpcode::kCondExecEnd:
-      TranslateControlFlowCondExec(cf.cond_exec);
-      break;
-    case ControlFlowOpcode::kCondExecPred:
-      TranslateControlFlowCondExecPred(cf.cond_exec_pred);
-      break;
-    case ControlFlowOpcode::kCondExecPredEnd:
-      TranslateControlFlowCondExecPred(cf.cond_exec_pred);
-      break;
     case ControlFlowOpcode::kCondExecPredClean:
-      TranslateControlFlowCondExec(cf.cond_exec);
-      break;
-    case ControlFlowOpcode::kCondExecPredCleanEnd:
-      TranslateControlFlowCondExec(cf.cond_exec);
-      break;
-    case ControlFlowOpcode::kLoopStart:
-      TranslateControlFlowLoopStart(cf.loop_start);
-      break;
-    case ControlFlowOpcode::kLoopEnd:
-      TranslateControlFlowLoopEnd(cf.loop_end);
-      break;
-    case ControlFlowOpcode::kCondCall:
-      TranslateControlFlowCondCall(cf.cond_call);
-      break;
-    case ControlFlowOpcode::kReturn:
-      TranslateControlFlowReturn(cf.ret);
-      break;
-    case ControlFlowOpcode::kCondJmp:
-      TranslateControlFlowCondJmp(cf.cond_jmp);
-      break;
-    case ControlFlowOpcode::kAlloc:
-      TranslateControlFlowAlloc(cf.alloc);
-      break;
+    case ControlFlowOpcode::kCondExecPredCleanEnd: {
+      ParsedExecInstruction instr;
+      ParseControlFlowCondExec(cf.cond_exec, cf_index_, instr);
+      TranslateExecInstructions(instr);
+    } break;
+    case ControlFlowOpcode::kCondExecPred:
+    case ControlFlowOpcode::kCondExecPredEnd: {
+      ParsedExecInstruction instr;
+      ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index_, instr);
+      TranslateExecInstructions(instr);
+    } break;
+    case ControlFlowOpcode::kLoopStart: {
+      ParsedLoopStartInstruction instr;
+      ParseControlFlowLoopStart(cf.loop_start, cf_index_, instr);
+      ProcessLoopStartInstruction(instr);
+    } break;
+    case ControlFlowOpcode::kLoopEnd: {
+      ParsedLoopEndInstruction instr;
+      ParseControlFlowLoopEnd(cf.loop_end, cf_index_, instr);
+      ProcessLoopEndInstruction(instr);
+    } break;
+    case ControlFlowOpcode::kCondCall: {
+      ParsedCallInstruction instr;
+      ParseControlFlowCondCall(cf.cond_call, cf_index_, instr);
+      ProcessCallInstruction(instr);
+    } break;
+    case ControlFlowOpcode::kReturn: {
+      ParsedReturnInstruction instr;
+      ParseControlFlowReturn(cf.ret, cf_index_, instr);
+      ProcessReturnInstruction(instr);
+    } break;
+    case ControlFlowOpcode::kCondJmp: {
+      ParsedJumpInstruction instr;
+      ParseControlFlowCondJmp(cf.cond_jmp, cf_index_, instr);
+      ProcessJumpInstruction(instr);
+    } break;
+    case ControlFlowOpcode::kAlloc: {
+      ParsedAllocInstruction instr;
+      ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr);
+      ProcessAllocInstruction(instr);
+    } break;
     case ControlFlowOpcode::kMarkVsFetchDone:
       break;
     default:
       assert_unhandled_case(cf.opcode);
       break;
   }
-  bool ends_shader = DoesControlFlowOpcodeEndShader(cf.opcode());
-  if (ends_shader) {
-    // TODO(benvanik): return?
-  }
+  // TODO(benvanik): return if (DoesControlFlowOpcodeEndShader(cf.opcode()))?
 }
 
-void ShaderTranslator::TranslateControlFlowNop(
-    const ControlFlowInstruction& cf) {
-  ucode_disasm_buffer_.Append("      cnop\n");
-
-  ProcessControlFlowNopInstruction(cf_index_);
+void ParseControlFlowExec(const ControlFlowExecInstruction& cf,
+                          uint32_t cf_index, ParsedExecInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.opcode = cf.opcode();
+  instr.opcode_name =
+      cf.opcode() == ControlFlowOpcode::kExecEnd ? "exece" : "exec";
+  instr.instruction_address = cf.address();
+  instr.instruction_count = cf.count();
+  instr.type = ParsedExecInstruction::Type::kUnconditional;
+  instr.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd;
+  instr.clean = cf.clean();
+  instr.is_yield = cf.is_yield();
+  instr.sequence = cf.sequence();
 }
 
-void ShaderTranslator::TranslateControlFlowExec(
-    const ControlFlowExecInstruction& cf) {
-  ParsedExecInstruction i;
-  i.dword_index = cf_index_;
-  i.opcode = cf.opcode();
-  i.opcode_name = cf.opcode() == ControlFlowOpcode::kExecEnd ? "exece" : "exec";
-  i.instruction_address = cf.address();
-  i.instruction_count = cf.count();
-  i.type = ParsedExecInstruction::Type::kUnconditional;
-  i.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd;
-  i.clean = cf.clean();
-  i.is_yield = cf.is_yield();
-  i.sequence = cf.sequence();
-
-  TranslateExecInstructions(i);
-}
-
-void ShaderTranslator::TranslateControlFlowCondExec(
-    const ControlFlowCondExecInstruction& cf) {
-  ParsedExecInstruction i;
-  i.dword_index = cf_index_;
-  i.opcode = cf.opcode();
-  i.opcode_name = "cexec";
+void ParseControlFlowCondExec(const ControlFlowCondExecInstruction& cf,
+                              uint32_t cf_index, ParsedExecInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.opcode = cf.opcode();
+  instr.opcode_name = "cexec";
   switch (cf.opcode()) {
     case ControlFlowOpcode::kCondExecEnd:
     case ControlFlowOpcode::kCondExecPredCleanEnd:
-      i.opcode_name = "cexece";
-      i.is_end = true;
+      instr.opcode_name = "cexece";
+      instr.is_end = true;
       break;
     default:
       break;
   }
-  i.instruction_address = cf.address();
-  i.instruction_count = cf.count();
-  i.type = ParsedExecInstruction::Type::kConditional;
-  i.bool_constant_index = cf.bool_address();
-  assert_not_zero(
-      constant_register_map_.bool_bitmap[i.bool_constant_index / 32] &
-      (uint32_t(1) << (i.bool_constant_index % 32)));
-  i.condition = cf.condition();
+  instr.instruction_address = cf.address();
+  instr.instruction_count = cf.count();
+  instr.type = ParsedExecInstruction::Type::kConditional;
+  instr.bool_constant_index = cf.bool_address();
+  instr.condition = cf.condition();
   switch (cf.opcode()) {
     case ControlFlowOpcode::kCondExec:
     case ControlFlowOpcode::kCondExecEnd:
-      i.clean = false;
+      instr.clean = false;
       break;
     default:
       break;
   }
-  i.is_yield = cf.is_yield();
-  i.sequence = cf.sequence();
-
-  TranslateExecInstructions(i);
+  instr.is_yield = cf.is_yield();
+  instr.sequence = cf.sequence();
 }
 
-void ShaderTranslator::TranslateControlFlowCondExecPred(
-    const ControlFlowCondExecPredInstruction& cf) {
-  ParsedExecInstruction i;
-  i.dword_index = cf_index_;
-  i.opcode = cf.opcode();
-  i.opcode_name =
+void ParseControlFlowCondExecPred(const ControlFlowCondExecPredInstruction& cf,
+                                  uint32_t cf_index,
+                                  ParsedExecInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.opcode = cf.opcode();
+  instr.opcode_name =
       cf.opcode() == ControlFlowOpcode::kCondExecPredEnd ? "exece" : "exec";
-  i.instruction_address = cf.address();
-  i.instruction_count = cf.count();
-  i.type = ParsedExecInstruction::Type::kPredicated;
-  i.condition = cf.condition();
-  i.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd;
-  i.clean = cf.clean();
-  i.is_yield = cf.is_yield();
-  i.sequence = cf.sequence();
-
-  TranslateExecInstructions(i);
+  instr.instruction_address = cf.address();
+  instr.instruction_count = cf.count();
+  instr.type = ParsedExecInstruction::Type::kPredicated;
+  instr.condition = cf.condition();
+  instr.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd;
+  instr.clean = cf.clean();
+  instr.is_yield = cf.is_yield();
+  instr.sequence = cf.sequence();
 }
 
-void ShaderTranslator::TranslateControlFlowLoopStart(
-    const ControlFlowLoopStartInstruction& cf) {
-  ParsedLoopStartInstruction i;
-  i.dword_index = cf_index_;
-  i.loop_constant_index = cf.loop_id();
-  assert_not_zero(constant_register_map_.loop_bitmap &
-                  (uint32_t(1) << i.loop_constant_index));
-  i.is_repeat = cf.is_repeat();
-  i.loop_skip_address = cf.address();
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessLoopStartInstruction(i);
+void ParseControlFlowLoopStart(const ControlFlowLoopStartInstruction& cf,
+                               uint32_t cf_index,
+                               ParsedLoopStartInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.loop_constant_index = cf.loop_id();
+  instr.is_repeat = cf.is_repeat();
+  instr.loop_skip_address = cf.address();
 }
 
-void ShaderTranslator::TranslateControlFlowLoopEnd(
-    const ControlFlowLoopEndInstruction& cf) {
-  ParsedLoopEndInstruction i;
-  i.dword_index = cf_index_;
-  i.is_predicated_break = cf.is_predicated_break();
-  i.predicate_condition = cf.condition();
-  i.loop_constant_index = cf.loop_id();
-  assert_not_zero(constant_register_map_.loop_bitmap &
-                  (uint32_t(1) << i.loop_constant_index));
-  i.loop_body_address = cf.address();
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessLoopEndInstruction(i);
+void ParseControlFlowLoopEnd(const ControlFlowLoopEndInstruction& cf,
+                             uint32_t cf_index,
+                             ParsedLoopEndInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.is_predicated_break = cf.is_predicated_break();
+  instr.predicate_condition = cf.condition();
+  instr.loop_constant_index = cf.loop_id();
+  instr.loop_body_address = cf.address();
 }
 
-void ShaderTranslator::TranslateControlFlowCondCall(
-    const ControlFlowCondCallInstruction& cf) {
-  ParsedCallInstruction i;
-  i.dword_index = cf_index_;
-  i.target_address = cf.address();
+void ParseControlFlowCondCall(const ControlFlowCondCallInstruction& cf,
+                              uint32_t cf_index, ParsedCallInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.target_address = cf.address();
   if (cf.is_unconditional()) {
-    i.type = ParsedCallInstruction::Type::kUnconditional;
+    instr.type = ParsedCallInstruction::Type::kUnconditional;
   } else if (cf.is_predicated()) {
-    i.type = ParsedCallInstruction::Type::kPredicated;
-    i.condition = cf.condition();
+    instr.type = ParsedCallInstruction::Type::kPredicated;
+    instr.condition = cf.condition();
   } else {
-    i.type = ParsedCallInstruction::Type::kConditional;
-    i.bool_constant_index = cf.bool_address();
-    assert_not_zero(
-        constant_register_map_.bool_bitmap[i.bool_constant_index / 32] &
-        (uint32_t(1) << (i.bool_constant_index % 32)));
-    i.condition = cf.condition();
+    instr.type = ParsedCallInstruction::Type::kConditional;
+    instr.bool_constant_index = cf.bool_address();
+    instr.condition = cf.condition();
   }
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessCallInstruction(i);
 }
 
-void ShaderTranslator::TranslateControlFlowReturn(
-    const ControlFlowReturnInstruction& cf) {
-  ParsedReturnInstruction i;
-  i.dword_index = cf_index_;
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessReturnInstruction(i);
+void ParseControlFlowReturn(const ControlFlowReturnInstruction& cf,
+                            uint32_t cf_index, ParsedReturnInstruction& instr) {
+  instr.dword_index = cf_index;
 }
 
-void ShaderTranslator::TranslateControlFlowCondJmp(
-    const ControlFlowCondJmpInstruction& cf) {
-  ParsedJumpInstruction i;
-  i.dword_index = cf_index_;
-  i.target_address = cf.address();
+void ParseControlFlowCondJmp(const ControlFlowCondJmpInstruction& cf,
+                             uint32_t cf_index, ParsedJumpInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.target_address = cf.address();
   if (cf.is_unconditional()) {
-    i.type = ParsedJumpInstruction::Type::kUnconditional;
+    instr.type = ParsedJumpInstruction::Type::kUnconditional;
   } else if (cf.is_predicated()) {
-    i.type = ParsedJumpInstruction::Type::kPredicated;
-    i.condition = cf.condition();
+    instr.type = ParsedJumpInstruction::Type::kPredicated;
+    instr.condition = cf.condition();
   } else {
-    i.type = ParsedJumpInstruction::Type::kConditional;
-    i.bool_constant_index = cf.bool_address();
-    assert_not_zero(
-        constant_register_map_.bool_bitmap[i.bool_constant_index / 32] &
-        (uint32_t(1) << (i.bool_constant_index % 32)));
-    i.condition = cf.condition();
+    instr.type = ParsedJumpInstruction::Type::kConditional;
+    instr.bool_constant_index = cf.bool_address();
+    instr.condition = cf.condition();
   }
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessJumpInstruction(i);
 }
 
-void ShaderTranslator::TranslateControlFlowAlloc(
-    const ControlFlowAllocInstruction& cf) {
-  ParsedAllocInstruction i;
-  i.dword_index = cf_index_;
-  i.type = cf.alloc_type();
-  i.count = cf.size();
-  i.is_vertex_shader = is_vertex_shader();
-
-  i.Disassemble(&ucode_disasm_buffer_);
-
-  ProcessAllocInstruction(i);
+void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf,
+                           uint32_t cf_index, bool is_vertex_shader,
+                           ParsedAllocInstruction& instr) {
+  instr.dword_index = cf_index;
+  instr.type = cf.alloc_type();
+  instr.count = cf.size();
+  instr.is_vertex_shader = is_vertex_shader;
 }
 
 void ShaderTranslator::TranslateExecInstructions(
     const ParsedExecInstruction& instr) {
-  instr.Disassemble(&ucode_disasm_buffer_);
-
   ProcessExecInstructionBegin(instr);
-
+  const uint32_t* ucode_dwords = current_shader().ucode_data().data();
   uint32_t sequence = instr.sequence;
   for (uint32_t instr_offset = instr.instruction_address;
        instr_offset < instr.instruction_address + instr.instruction_count;
        ++instr_offset, sequence >>= 2) {
-    MarkUcodeInstruction(instr_offset);
-    AppendUcodeDisasmFormat("/* %4u   */ ", instr_offset);
-    bool is_sync = (sequence & 0x2) == 0x2;
-    bool is_fetch = (sequence & 0x1) == 0x1;
-    if (is_sync) {
-      AppendUcodeDisasm("         serialize\n             ");
-    }
-    if (is_fetch) {
+    if (sequence & 0b01) {
       auto fetch_opcode =
-          static_cast<FetchOpcode>(ucode_dwords_[instr_offset * 3] & 0x1F);
+          static_cast<FetchOpcode>(ucode_dwords[instr_offset * 3] & 0x1F);
       if (fetch_opcode == FetchOpcode::kVertexFetch) {
         auto& op = *reinterpret_cast<const VertexFetchInstruction*>(
-            ucode_dwords_ + instr_offset * 3);
-        TranslateVertexFetchInstruction(op);
+            ucode_dwords + instr_offset * 3);
+        ParsedVertexFetchInstruction vfetch_instr;
+        if (ParseVertexFetchInstruction(op, previous_vfetch_full_,
+                                        vfetch_instr)) {
+          previous_vfetch_full_ = op;
+        }
+        ProcessVertexFetchInstruction(vfetch_instr);
       } else {
         auto& op = *reinterpret_cast<const TextureFetchInstruction*>(
-            ucode_dwords_ + instr_offset * 3);
-        TranslateTextureFetchInstruction(op);
+            ucode_dwords + instr_offset * 3);
+        ParsedTextureFetchInstruction tfetch_instr;
+        ParseTextureFetchInstruction(op, tfetch_instr);
+        ProcessTextureFetchInstruction(tfetch_instr);
       }
     } else {
-      auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ +
+      auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords +
                                                           instr_offset * 3);
-      TranslateAluInstruction(op);
+      ParsedAluInstruction alu_instr;
+      ParseAluInstruction(op, current_shader().type(), alu_instr);
+      ProcessAluInstruction(alu_instr);
     }
   }
-
   ProcessExecInstructionEnd(instr);
 }
 
-void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle,
-                                 bool is_relative,
-                                 InstructionResult* out_result) {
-  out_result->storage_target = InstructionStorageTarget::kRegister;
-  out_result->storage_index = dest;
-  out_result->is_clamped = false;
-  out_result->storage_addressing_mode =
+static void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle,
+                                        bool is_relative,
+                                        InstructionResult& result) {
+  result.storage_target = InstructionStorageTarget::kRegister;
+  result.storage_index = dest;
+  result.is_clamped = false;
+  result.storage_addressing_mode =
       is_relative ? InstructionStorageAddressingMode::kAddressRelative
                   : InstructionStorageAddressingMode::kStatic;
-  out_result->original_write_mask = 0b1111;
+  result.original_write_mask = 0b1111;
   for (int i = 0; i < 4; ++i) {
     switch (swizzle & 0x7) {
       case 4:
       case 6:
-        out_result->components[i] = SwizzleSource::k0;
+        result.components[i] = SwizzleSource::k0;
         break;
       case 5:
-        out_result->components[i] = SwizzleSource::k1;
+        result.components[i] = SwizzleSource::k1;
         break;
       case 7:
-        out_result->original_write_mask &= ~uint32_t(1 << i);
+        result.original_write_mask &= ~uint32_t(1 << i);
         break;
       default:
-        out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3);
+        result.components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3);
     }
     swizzle >>= 3;
   }
 }
 
-void ShaderTranslator::TranslateVertexFetchInstruction(
-    const VertexFetchInstruction& op) {
-  ParsedVertexFetchInstruction instr;
-  ParseVertexFetchInstruction(op, &instr);
-  instr.Disassemble(&ucode_disasm_buffer_);
-  ProcessVertexFetchInstruction(instr);
-}
-
-void ShaderTranslator::ParseVertexFetchInstruction(
-    const VertexFetchInstruction& op, ParsedVertexFetchInstruction* out_instr) {
-  auto& i = *out_instr;
-  i.opcode = FetchOpcode::kVertexFetch;
-  i.opcode_name = op.is_mini_fetch() ? "vfetch_mini" : "vfetch_full";
-  i.is_mini_fetch = op.is_mini_fetch();
-  i.is_predicated = op.is_predicated();
-  i.predicate_condition = op.predicate_condition();
+bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
+                                 const VertexFetchInstruction& previous_full_op,
+                                 ParsedVertexFetchInstruction& instr) {
+  instr.opcode = FetchOpcode::kVertexFetch;
+  instr.opcode_name = op.is_mini_fetch() ? "vfetch_mini" : "vfetch_full";
+  instr.is_mini_fetch = op.is_mini_fetch();
+  instr.is_predicated = op.is_predicated();
+  instr.predicate_condition = op.predicate_condition();
 
   ParseFetchInstructionResult(op.dest(), op.dest_swizzle(),
-                              op.is_dest_relative(), &i.result);
+                              op.is_dest_relative(), instr.result);
 
   // Reuse previous vfetch_full if this is a mini.
-  const auto& full_op = op.is_mini_fetch() ? previous_vfetch_full_ : op;
-  auto& src_op = i.operands[i.operand_count++];
+  const auto& full_op = op.is_mini_fetch() ? previous_full_op : op;
+  auto& src_op = instr.operands[instr.operand_count++];
   src_op.storage_source = InstructionStorageSource::kRegister;
   src_op.storage_index = full_op.src();
   src_op.storage_addressing_mode =
@@ -895,37 +858,25 @@ void ShaderTranslator::ParseVertexFetchInstruction(
     src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3);
   }
 
-  auto& const_op = i.operands[i.operand_count++];
+  auto& const_op = instr.operands[instr.operand_count++];
   const_op.storage_source = InstructionStorageSource::kVertexFetchConstant;
   const_op.storage_index = full_op.fetch_constant_index();
 
-  i.attributes.data_format = op.data_format();
-  i.attributes.offset = op.offset();
-  i.attributes.stride = full_op.stride();
-  i.attributes.exp_adjust = op.exp_adjust();
-  i.attributes.prefetch_count = op.prefetch_count();
-  i.attributes.is_index_rounded = op.is_index_rounded();
-  i.attributes.is_signed = op.is_signed();
-  i.attributes.is_integer = !op.is_normalized();
-  i.attributes.signed_rf_mode = op.signed_rf_mode();
+  instr.attributes.data_format = op.data_format();
+  instr.attributes.offset = op.offset();
+  instr.attributes.stride = full_op.stride();
+  instr.attributes.exp_adjust = op.exp_adjust();
+  instr.attributes.prefetch_count = op.prefetch_count();
+  instr.attributes.is_index_rounded = op.is_index_rounded();
+  instr.attributes.is_signed = op.is_signed();
+  instr.attributes.is_integer = !op.is_normalized();
+  instr.attributes.signed_rf_mode = op.signed_rf_mode();
 
-  // Store for later use by mini fetches.
-  if (!op.is_mini_fetch()) {
-    previous_vfetch_full_ = op;
-  }
+  return !op.is_mini_fetch();
 }
 
-void ShaderTranslator::TranslateTextureFetchInstruction(
-    const TextureFetchInstruction& op) {
-  ParsedTextureFetchInstruction instr;
-  ParseTextureFetchInstruction(op, &instr);
-  instr.Disassemble(&ucode_disasm_buffer_);
-  ProcessTextureFetchInstruction(instr);
-}
-
-void ShaderTranslator::ParseTextureFetchInstruction(
-    const TextureFetchInstruction& op,
-    ParsedTextureFetchInstruction* out_instr) {
+void ParseTextureFetchInstruction(const TextureFetchInstruction& op,
+                                  ParsedTextureFetchInstruction& instr) {
   struct TextureFetchOpcodeInfo {
     const char* name;
     bool has_dest;
@@ -975,21 +926,20 @@ void ShaderTranslator::ParseTextureFetchInstruction(
       return;
   }
 
-  auto& i = *out_instr;
-  i.opcode = op.opcode();
-  i.opcode_name = opcode_info.name;
-  i.dimension = op.dimension();
-  i.is_predicated = op.is_predicated();
-  i.predicate_condition = op.predicate_condition();
+  instr.opcode = op.opcode();
+  instr.opcode_name = opcode_info.name;
+  instr.dimension = op.dimension();
+  instr.is_predicated = op.is_predicated();
+  instr.predicate_condition = op.predicate_condition();
 
   if (opcode_info.has_dest) {
     ParseFetchInstructionResult(op.dest(), op.dest_swizzle(),
-                                op.is_dest_relative(), &i.result);
+                                op.is_dest_relative(), instr.result);
   } else {
-    i.result.storage_target = InstructionStorageTarget::kNone;
+    instr.result.storage_target = InstructionStorageTarget::kNone;
   }
 
-  auto& src_op = i.operands[i.operand_count++];
+  auto& src_op = instr.operands[instr.operand_count++];
   src_op.storage_source = InstructionStorageSource::kRegister;
   src_op.storage_index = op.src();
   src_op.storage_addressing_mode =
@@ -1007,27 +957,27 @@ void ShaderTranslator::ParseTextureFetchInstruction(
   }
 
   if (opcode_info.has_const) {
-    auto& const_op = i.operands[i.operand_count++];
+    auto& const_op = instr.operands[instr.operand_count++];
     const_op.storage_source = InstructionStorageSource::kTextureFetchConstant;
     const_op.storage_index = op.fetch_constant_index();
   }
 
   if (opcode_info.has_attributes) {
-    i.attributes.fetch_valid_only = op.fetch_valid_only();
-    i.attributes.unnormalized_coordinates = op.unnormalized_coordinates();
-    i.attributes.mag_filter = op.mag_filter();
-    i.attributes.min_filter = op.min_filter();
-    i.attributes.mip_filter = op.mip_filter();
-    i.attributes.aniso_filter = op.aniso_filter();
-    i.attributes.vol_mag_filter = op.vol_mag_filter();
-    i.attributes.vol_min_filter = op.vol_min_filter();
-    i.attributes.use_computed_lod = op.use_computed_lod();
-    i.attributes.use_register_lod = op.use_register_lod();
-    i.attributes.use_register_gradients = op.use_register_gradients();
-    i.attributes.lod_bias = op.lod_bias();
-    i.attributes.offset_x = op.offset_x();
-    i.attributes.offset_y = op.offset_y();
-    i.attributes.offset_z = op.offset_z();
+    instr.attributes.fetch_valid_only = op.fetch_valid_only();
+    instr.attributes.unnormalized_coordinates = op.unnormalized_coordinates();
+    instr.attributes.mag_filter = op.mag_filter();
+    instr.attributes.min_filter = op.min_filter();
+    instr.attributes.mip_filter = op.mip_filter();
+    instr.attributes.aniso_filter = op.aniso_filter();
+    instr.attributes.vol_mag_filter = op.vol_mag_filter();
+    instr.attributes.vol_min_filter = op.vol_min_filter();
+    instr.attributes.use_computed_lod = op.use_computed_lod();
+    instr.attributes.use_register_lod = op.use_register_lod();
+    instr.attributes.use_register_gradients = op.use_register_gradients();
+    instr.attributes.lod_bias = op.lod_bias();
+    instr.attributes.offset_x = op.offset_x();
+    instr.attributes.offset_y = op.offset_y();
+    instr.attributes.offset_z = op.offset_z();
   }
 }
 
@@ -1079,250 +1029,102 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const {
   return result.GetUsedResultComponents() & components;
 }
 
-const ShaderTranslator::AluOpcodeInfo
-    ShaderTranslator::alu_vector_opcode_infos_[0x20] = {
-        {"add", 2, 4},           // 0
-        {"mul", 2, 4},           // 1
-        {"max", 2, 4},           // 2
-        {"min", 2, 4},           // 3
-        {"seq", 2, 4},           // 4
-        {"sgt", 2, 4},           // 5
-        {"sge", 2, 4},           // 6
-        {"sne", 2, 4},           // 7
-        {"frc", 1, 4},           // 8
-        {"trunc", 1, 4},         // 9
-        {"floor", 1, 4},         // 10
-        {"mad", 3, 4},           // 11
-        {"cndeq", 3, 4},         // 12
-        {"cndge", 3, 4},         // 13
-        {"cndgt", 3, 4},         // 14
-        {"dp4", 2, 4},           // 15
-        {"dp3", 2, 4},           // 16
-        {"dp2add", 3, 4},        // 17
-        {"cube", 2, 4},          // 18
-        {"max4", 1, 4},          // 19
-        {"setp_eq_push", 2, 4},  // 20
-        {"setp_ne_push", 2, 4},  // 21
-        {"setp_gt_push", 2, 4},  // 22
-        {"setp_ge_push", 2, 4},  // 23
-        {"kill_eq", 2, 4},       // 24
-        {"kill_gt", 2, 4},       // 25
-        {"kill_ge", 2, 4},       // 26
-        {"kill_ne", 2, 4},       // 27
-        {"dst", 2, 4},           // 28
-        {"maxa", 2, 4},          // 29
+struct AluOpcodeInfo {
+  const char* name;
+  uint32_t argument_count;
+  uint32_t src_swizzle_component_count;
 };
 
-const ShaderTranslator::AluOpcodeInfo
-    ShaderTranslator::alu_scalar_opcode_infos_[0x40] = {
-        {"adds", 1, 2},         // 0
-        {"adds_prev", 1, 1},    // 1
-        {"muls", 1, 2},         // 2
-        {"muls_prev", 1, 1},    // 3
-        {"muls_prev2", 1, 2},   // 4
-        {"maxs", 1, 2},         // 5
-        {"mins", 1, 2},         // 6
-        {"seqs", 1, 1},         // 7
-        {"sgts", 1, 1},         // 8
-        {"sges", 1, 1},         // 9
-        {"snes", 1, 1},         // 10
-        {"frcs", 1, 1},         // 11
-        {"truncs", 1, 1},       // 12
-        {"floors", 1, 1},       // 13
-        {"exp", 1, 1},          // 14
-        {"logc", 1, 1},         // 15
-        {"log", 1, 1},          // 16
-        {"rcpc", 1, 1},         // 17
-        {"rcpf", 1, 1},         // 18
-        {"rcp", 1, 1},          // 19
-        {"rsqc", 1, 1},         // 20
-        {"rsqf", 1, 1},         // 21
-        {"rsq", 1, 1},          // 22
-        {"maxas", 1, 2},        // 23
-        {"maxasf", 1, 2},       // 24
-        {"subs", 1, 2},         // 25
-        {"subs_prev", 1, 1},    // 26
-        {"setp_eq", 1, 1},      // 27
-        {"setp_ne", 1, 1},      // 28
-        {"setp_gt", 1, 1},      // 29
-        {"setp_ge", 1, 1},      // 30
-        {"setp_inv", 1, 1},     // 31
-        {"setp_pop", 1, 1},     // 32
-        {"setp_clr", 0, 0},     // 33
-        {"setp_rstr", 1, 1},    // 34
-        {"kills_eq", 1, 1},     // 35
-        {"kills_gt", 1, 1},     // 36
-        {"kills_ge", 1, 1},     // 37
-        {"kills_ne", 1, 1},     // 38
-        {"kills_one", 1, 1},    // 39
-        {"sqrt", 1, 1},         // 40
-        {"UNKNOWN", 0, 0},      // 41
-        {"mulsc", 2, 1},        // 42
-        {"mulsc", 2, 1},        // 43
-        {"addsc", 2, 1},        // 44
-        {"addsc", 2, 1},        // 45
-        {"subsc", 2, 1},        // 46
-        {"subsc", 2, 1},        // 47
-        {"sin", 1, 1},          // 48
-        {"cos", 1, 1},          // 49
-        {"retain_prev", 0, 0},  // 50
+static const AluOpcodeInfo alu_vector_opcode_infos[0x20] = {
+    {"add", 2, 4},           // 0
+    {"mul", 2, 4},           // 1
+    {"max", 2, 4},           // 2
+    {"min", 2, 4},           // 3
+    {"seq", 2, 4},           // 4
+    {"sgt", 2, 4},           // 5
+    {"sge", 2, 4},           // 6
+    {"sne", 2, 4},           // 7
+    {"frc", 1, 4},           // 8
+    {"trunc", 1, 4},         // 9
+    {"floor", 1, 4},         // 10
+    {"mad", 3, 4},           // 11
+    {"cndeq", 3, 4},         // 12
+    {"cndge", 3, 4},         // 13
+    {"cndgt", 3, 4},         // 14
+    {"dp4", 2, 4},           // 15
+    {"dp3", 2, 4},           // 16
+    {"dp2add", 3, 4},        // 17
+    {"cube", 2, 4},          // 18
+    {"max4", 1, 4},          // 19
+    {"setp_eq_push", 2, 4},  // 20
+    {"setp_ne_push", 2, 4},  // 21
+    {"setp_gt_push", 2, 4},  // 22
+    {"setp_ge_push", 2, 4},  // 23
+    {"kill_eq", 2, 4},       // 24
+    {"kill_gt", 2, 4},       // 25
+    {"kill_ge", 2, 4},       // 26
+    {"kill_ne", 2, 4},       // 27
+    {"dst", 2, 4},           // 28
+    {"maxa", 2, 4},          // 29
 };
 
-void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) {
-  ParsedAluInstruction instr;
-  ParseAluInstruction(op, instr);
-  instr.Disassemble(&ucode_disasm_buffer_);
-  ProcessAluInstruction(instr);
-}
+static const AluOpcodeInfo alu_scalar_opcode_infos[0x40] = {
+    {"adds", 1, 2},         // 0
+    {"adds_prev", 1, 1},    // 1
+    {"muls", 1, 2},         // 2
+    {"muls_prev", 1, 1},    // 3
+    {"muls_prev2", 1, 2},   // 4
+    {"maxs", 1, 2},         // 5
+    {"mins", 1, 2},         // 6
+    {"seqs", 1, 1},         // 7
+    {"sgts", 1, 1},         // 8
+    {"sges", 1, 1},         // 9
+    {"snes", 1, 1},         // 10
+    {"frcs", 1, 1},         // 11
+    {"truncs", 1, 1},       // 12
+    {"floors", 1, 1},       // 13
+    {"exp", 1, 1},          // 14
+    {"logc", 1, 1},         // 15
+    {"log", 1, 1},          // 16
+    {"rcpc", 1, 1},         // 17
+    {"rcpf", 1, 1},         // 18
+    {"rcp", 1, 1},          // 19
+    {"rsqc", 1, 1},         // 20
+    {"rsqf", 1, 1},         // 21
+    {"rsq", 1, 1},          // 22
+    {"maxas", 1, 2},        // 23
+    {"maxasf", 1, 2},       // 24
+    {"subs", 1, 2},         // 25
+    {"subs_prev", 1, 1},    // 26
+    {"setp_eq", 1, 1},      // 27
+    {"setp_ne", 1, 1},      // 28
+    {"setp_gt", 1, 1},      // 29
+    {"setp_ge", 1, 1},      // 30
+    {"setp_inv", 1, 1},     // 31
+    {"setp_pop", 1, 1},     // 32
+    {"setp_clr", 0, 0},     // 33
+    {"setp_rstr", 1, 1},    // 34
+    {"kills_eq", 1, 1},     // 35
+    {"kills_gt", 1, 1},     // 36
+    {"kills_ge", 1, 1},     // 37
+    {"kills_ne", 1, 1},     // 38
+    {"kills_one", 1, 1},    // 39
+    {"sqrt", 1, 1},         // 40
+    {"UNKNOWN", 0, 0},      // 41
+    {"mulsc", 2, 1},        // 42
+    {"mulsc", 2, 1},        // 43
+    {"addsc", 2, 1},        // 44
+    {"addsc", 2, 1},        // 45
+    {"subsc", 2, 1},        // 46
+    {"subsc", 2, 1},        // 47
+    {"sin", 1, 1},          // 48
+    {"cos", 1, 1},          // 49
+    {"retain_prev", 0, 0},  // 50
+};
 
-void ShaderTranslator::ParseAluInstruction(const AluInstruction& op,
-                                           ParsedAluInstruction& instr) const {
-  instr.is_predicated = op.is_predicated();
-  instr.predicate_condition = op.predicate_condition();
-
-  bool is_export = op.is_export();
-
-  InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister;
-  uint32_t storage_index_export = 0;
-  if (is_export) {
-    storage_target = InstructionStorageTarget::kNone;
-    // Both vector and scalar operation export to vector_dest.
-    ExportRegister export_register = ExportRegister(op.vector_dest());
-    if (export_register == ExportRegister::kExportAddress) {
-      storage_target = InstructionStorageTarget::kExportAddress;
-    } else if (export_register >= ExportRegister::kExportData0 &&
-               export_register <= ExportRegister::kExportData4) {
-      storage_target = InstructionStorageTarget::kExportData;
-      storage_index_export =
-          uint32_t(export_register) - uint32_t(ExportRegister::kExportData0);
-    } else if (is_vertex_shader()) {
-      if (export_register >= ExportRegister::kVSInterpolator0 &&
-          export_register <= ExportRegister::kVSInterpolator15) {
-        storage_target = InstructionStorageTarget::kInterpolator;
-        storage_index_export = uint32_t(export_register) -
-                               uint32_t(ExportRegister::kVSInterpolator0);
-      } else if (export_register == ExportRegister::kVSPosition) {
-        storage_target = InstructionStorageTarget::kPosition;
-      } else if (export_register ==
-                 ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
-        storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex;
-      }
-    } else if (is_pixel_shader()) {
-      if (export_register >= ExportRegister::kPSColor0 &&
-          export_register <= ExportRegister::kPSColor3) {
-        storage_target = InstructionStorageTarget::kColor;
-        storage_index_export =
-            uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0);
-      } else if (export_register == ExportRegister::kPSDepth) {
-        storage_target = InstructionStorageTarget::kDepth;
-      }
-    }
-    if (storage_target == InstructionStorageTarget::kNone) {
-      assert_always();
-      XELOGE(
-          "ShaderTranslator::ParseAluInstruction: Unsupported write to export "
-          "{}",
-          uint32_t(export_register));
-    }
-  }
-
-  // Vector operation and constant 0/1 writes.
-
-  instr.vector_opcode = op.vector_opcode();
-  const auto& vector_opcode_info =
-      alu_vector_opcode_infos_[uint32_t(instr.vector_opcode)];
-  instr.vector_opcode_name = vector_opcode_info.name;
-
-  instr.vector_and_constant_result.storage_target = storage_target;
-  instr.vector_and_constant_result.storage_addressing_mode =
-      InstructionStorageAddressingMode::kStatic;
-  if (is_export) {
-    instr.vector_and_constant_result.storage_index = storage_index_export;
-  } else {
-    instr.vector_and_constant_result.storage_index = op.vector_dest();
-    assert_true(op.vector_dest() < register_count());
-    if (op.is_vector_dest_relative()) {
-      instr.vector_and_constant_result.storage_addressing_mode =
-          InstructionStorageAddressingMode::kAddressRelative;
-    }
-  }
-  instr.vector_and_constant_result.is_clamped = op.vector_clamp();
-  uint32_t constant_0_mask = op.GetConstant0WriteMask();
-  uint32_t constant_1_mask = op.GetConstant1WriteMask();
-  instr.vector_and_constant_result.original_write_mask =
-      op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask;
-  for (uint32_t i = 0; i < 4; ++i) {
-    SwizzleSource component = GetSwizzleFromComponentIndex(i);
-    if (constant_0_mask & (1 << i)) {
-      component = SwizzleSource::k0;
-    } else if (constant_1_mask & (1 << i)) {
-      component = SwizzleSource::k1;
-    }
-    instr.vector_and_constant_result.components[i] = component;
-  }
-
-  instr.vector_operand_count = vector_opcode_info.argument_count;
-  for (uint32_t i = 0; i < instr.vector_operand_count; ++i) {
-    InstructionOperand& vector_operand = instr.vector_operands[i];
-    ParseAluInstructionOperand(op, i + 1,
-                               vector_opcode_info.src_swizzle_component_count,
-                               vector_operand);
-  }
-
-  // Scalar operation.
-
-  instr.scalar_opcode = op.scalar_opcode();
-  const auto& scalar_opcode_info =
-      alu_scalar_opcode_infos_[uint32_t(instr.scalar_opcode)];
-  instr.scalar_opcode_name = scalar_opcode_info.name;
-
-  instr.scalar_result.storage_target = storage_target;
-  instr.scalar_result.storage_addressing_mode =
-      InstructionStorageAddressingMode::kStatic;
-  if (is_export) {
-    instr.scalar_result.storage_index = storage_index_export;
-  } else {
-    instr.scalar_result.storage_index = op.scalar_dest();
-    assert_true(op.scalar_dest() < register_count());
-    if (op.is_scalar_dest_relative()) {
-      instr.scalar_result.storage_addressing_mode =
-          InstructionStorageAddressingMode::kAddressRelative;
-    }
-  }
-  instr.scalar_result.is_clamped = op.scalar_clamp();
-  instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask();
-  for (uint32_t i = 0; i < 4; ++i) {
-    instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i);
-  }
-
-  instr.scalar_operand_count = scalar_opcode_info.argument_count;
-  if (instr.scalar_operand_count) {
-    if (instr.scalar_operand_count == 1) {
-      ParseAluInstructionOperand(op, 3,
-                                 scalar_opcode_info.src_swizzle_component_count,
-                                 instr.scalar_operands[0]);
-    } else {
-      uint32_t src3_swizzle = op.src_swizzle(3);
-      uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3;
-      uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3;
-      uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) |
-                      (static_cast<int>(op.scalar_opcode()) & 1);
-      int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0;
-
-      ParseAluInstructionOperandSpecial(
-          op, InstructionStorageSource::kConstantFloat, op.src_reg(3),
-          op.src_negate(3), 0, component_a, instr.scalar_operands[0]);
-
-      ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister,
-                                        reg2, op.src_negate(3), const_slot,
-                                        component_b, instr.scalar_operands[1]);
-    }
-  }
-}
-
-void ShaderTranslator::ParseAluInstructionOperand(
-    const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count,
-    InstructionOperand& out_op) {
+static void ParseAluInstructionOperand(const AluInstruction& op, uint32_t i,
+                                       uint32_t swizzle_component_count,
+                                       InstructionOperand& out_op) {
   int const_slot = 0;
   switch (i) {
     case 2:
@@ -1378,7 +1180,7 @@ void ShaderTranslator::ParseAluInstructionOperand(
   }
 }
 
-void ShaderTranslator::ParseAluInstructionOperandSpecial(
+static void ParseAluInstructionOperandSpecial(
     const AluInstruction& op, InstructionStorageSource storage_source,
     uint32_t reg, bool negate, int const_slot, uint32_t component_index,
     InstructionOperand& out_op) {
@@ -1448,6 +1250,150 @@ bool ParsedAluInstruction::IsVectorOpDefaultNop() const {
   return true;
 }
 
+void ParseAluInstruction(const AluInstruction& op,
+                         xenos::ShaderType shader_type,
+                         ParsedAluInstruction& instr) {
+  instr.is_predicated = op.is_predicated();
+  instr.predicate_condition = op.predicate_condition();
+
+  bool is_export = op.is_export();
+
+  InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister;
+  uint32_t storage_index_export = 0;
+  if (is_export) {
+    storage_target = InstructionStorageTarget::kNone;
+    // Both vector and scalar operation export to vector_dest.
+    ExportRegister export_register = ExportRegister(op.vector_dest());
+    if (export_register == ExportRegister::kExportAddress) {
+      storage_target = InstructionStorageTarget::kExportAddress;
+    } else if (export_register >= ExportRegister::kExportData0 &&
+               export_register <= ExportRegister::kExportData4) {
+      storage_target = InstructionStorageTarget::kExportData;
+      storage_index_export =
+          uint32_t(export_register) - uint32_t(ExportRegister::kExportData0);
+    } else if (shader_type == xenos::ShaderType::kVertex) {
+      if (export_register >= ExportRegister::kVSInterpolator0 &&
+          export_register <= ExportRegister::kVSInterpolator15) {
+        storage_target = InstructionStorageTarget::kInterpolator;
+        storage_index_export = uint32_t(export_register) -
+                               uint32_t(ExportRegister::kVSInterpolator0);
+      } else if (export_register == ExportRegister::kVSPosition) {
+        storage_target = InstructionStorageTarget::kPosition;
+      } else if (export_register ==
+                 ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
+        storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex;
+      }
+    } else if (shader_type == xenos::ShaderType::kPixel) {
+      if (export_register >= ExportRegister::kPSColor0 &&
+          export_register <= ExportRegister::kPSColor3) {
+        storage_target = InstructionStorageTarget::kColor;
+        storage_index_export =
+            uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0);
+      } else if (export_register == ExportRegister::kPSDepth) {
+        storage_target = InstructionStorageTarget::kDepth;
+      }
+    }
+    if (storage_target == InstructionStorageTarget::kNone) {
+      assert_always();
+      XELOGE(
+          "ShaderTranslator::ParseAluInstruction: Unsupported write to export "
+          "{}",
+          uint32_t(export_register));
+    }
+  }
+
+  // Vector operation and constant 0/1 writes.
+
+  instr.vector_opcode = op.vector_opcode();
+  const auto& vector_opcode_info =
+      alu_vector_opcode_infos[uint32_t(instr.vector_opcode)];
+  instr.vector_opcode_name = vector_opcode_info.name;
+
+  instr.vector_and_constant_result.storage_target = storage_target;
+  instr.vector_and_constant_result.storage_addressing_mode =
+      InstructionStorageAddressingMode::kStatic;
+  if (is_export) {
+    instr.vector_and_constant_result.storage_index = storage_index_export;
+  } else {
+    instr.vector_and_constant_result.storage_index = op.vector_dest();
+    if (op.is_vector_dest_relative()) {
+      instr.vector_and_constant_result.storage_addressing_mode =
+          InstructionStorageAddressingMode::kAddressRelative;
+    }
+  }
+  instr.vector_and_constant_result.is_clamped = op.vector_clamp();
+  uint32_t constant_0_mask = op.GetConstant0WriteMask();
+  uint32_t constant_1_mask = op.GetConstant1WriteMask();
+  instr.vector_and_constant_result.original_write_mask =
+      op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask;
+  for (uint32_t i = 0; i < 4; ++i) {
+    SwizzleSource component = GetSwizzleFromComponentIndex(i);
+    if (constant_0_mask & (1 << i)) {
+      component = SwizzleSource::k0;
+    } else if (constant_1_mask & (1 << i)) {
+      component = SwizzleSource::k1;
+    }
+    instr.vector_and_constant_result.components[i] = component;
+  }
+
+  instr.vector_operand_count = vector_opcode_info.argument_count;
+  for (uint32_t i = 0; i < instr.vector_operand_count; ++i) {
+    InstructionOperand& vector_operand = instr.vector_operands[i];
+    ParseAluInstructionOperand(op, i + 1,
+                               vector_opcode_info.src_swizzle_component_count,
+                               vector_operand);
+  }
+
+  // Scalar operation.
+
+  instr.scalar_opcode = op.scalar_opcode();
+  const auto& scalar_opcode_info =
+      alu_scalar_opcode_infos[uint32_t(instr.scalar_opcode)];
+  instr.scalar_opcode_name = scalar_opcode_info.name;
+
+  instr.scalar_result.storage_target = storage_target;
+  instr.scalar_result.storage_addressing_mode =
+      InstructionStorageAddressingMode::kStatic;
+  if (is_export) {
+    instr.scalar_result.storage_index = storage_index_export;
+  } else {
+    instr.scalar_result.storage_index = op.scalar_dest();
+    if (op.is_scalar_dest_relative()) {
+      instr.scalar_result.storage_addressing_mode =
+          InstructionStorageAddressingMode::kAddressRelative;
+    }
+  }
+  instr.scalar_result.is_clamped = op.scalar_clamp();
+  instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask();
+  for (uint32_t i = 0; i < 4; ++i) {
+    instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i);
+  }
+
+  instr.scalar_operand_count = scalar_opcode_info.argument_count;
+  if (instr.scalar_operand_count) {
+    if (instr.scalar_operand_count == 1) {
+      ParseAluInstructionOperand(op, 3,
+                                 scalar_opcode_info.src_swizzle_component_count,
+                                 instr.scalar_operands[0]);
+    } else {
+      uint32_t src3_swizzle = op.src_swizzle(3);
+      uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3;
+      uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3;
+      uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) |
+                      (static_cast<int>(op.scalar_opcode()) & 1);
+      int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0;
+
+      ParseAluInstructionOperandSpecial(
+          op, InstructionStorageSource::kConstantFloat, op.src_reg(3),
+          op.src_negate(3), 0, component_a, instr.scalar_operands[0]);
+
+      ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister,
+                                        reg2, op.src_negate(3), const_slot,
+                                        component_b, instr.scalar_operands[1]);
+    }
+  }
+}
+
 bool ParsedAluInstruction::IsScalarOpDefaultNop() const {
   if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev ||
       scalar_result.original_write_mask || scalar_result.is_clamped) {
diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h
index e1c97808a..d5d3677d5 100644
--- a/src/xenia/gpu/shader_translator.h
+++ b/src/xenia/gpu/shader_translator.h
@@ -29,106 +29,43 @@ class ShaderTranslator {
  public:
   virtual ~ShaderTranslator();
 
-  virtual uint32_t GetDefaultModification(
+  virtual uint64_t GetDefaultModification(
       xenos::ShaderType shader_type,
+      uint32_t dynamic_addressable_register_count,
       Shader::HostVertexShaderType host_vertex_shader_type =
           Shader::HostVertexShaderType::kVertex) const {
     return 0;
   }
 
-  bool Translate(Shader::Translation& translation, reg::SQ_PROGRAM_CNTL cntl);
-  bool Translate(Shader::Translation& translation);
+  // AnalyzeUcode must be done on the shader before translating!
+  bool TranslateAnalyzedShader(Shader::Translation& translation);
 
  protected:
   ShaderTranslator();
 
   // Resets translator state before beginning translation.
-  // shader_type is passed here so translator implementations can generate
-  // special fixed shaders for internal use, and set up the type for this
-  // purpose.
-  virtual void Reset(xenos::ShaderType shader_type);
+  virtual void Reset();
 
-  // Current host-side modification being generated.
-  uint32_t modification() const { return modification_; }
+  // Shader and modification currently being translated.
+  Shader::Translation& current_translation() const { return *translation_; }
+  Shader& current_shader() const { return current_translation().shader(); }
+
+  // Register count from SQ_PROGRAM_CNTL, stored by the implementation in its
+  // modification bits.
+  virtual uint32_t GetModificationRegisterCount() const { return 64; }
 
-  // Register count.
-  uint32_t register_count() const { return register_count_; }
   // True if the current shader is a vertex shader.
   bool is_vertex_shader() const {
-    return shader_type_ == xenos::ShaderType::kVertex;
+    return current_shader().type() == xenos::ShaderType::kVertex;
   }
   // True if the current shader is a pixel shader.
   bool is_pixel_shader() const {
-    return shader_type_ == xenos::ShaderType::kPixel;
-  }
-  // Labels that jumps (explicit or from loops) can be done to, gathered before
-  // translation.
-  const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
-  // Used constant register info, populated before translation.
-  const Shader::ConstantRegisterMap& constant_register_map() const {
-    return constant_register_map_;
-  }
-  // True if the current shader addresses general-purpose registers with dynamic
-  // indices, set before translation. Doesn't include writes to r[#+a#] with an
-  // empty used write mask.
-  bool uses_register_dynamic_addressing() const {
-    return uses_register_dynamic_addressing_;
-  }
-  // True if the current shader writes to a color target on any execution path,
-  // set before translation. Doesn't include writes with an empty used write
-  // mask.
-  bool writes_color_target(int i) const { return writes_color_targets_[i]; }
-  bool writes_any_color_target() const {
-    for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
-      if (writes_color_targets_[i]) {
-        return true;
-      }
-    }
-    return false;
-  }
-  // True if the current shader overrides the pixel depth, set before
-  // translation. Doesn't include writes with an empty used write mask.
-  bool writes_depth() const { return writes_depth_; }
-  // True if the current shader has any `kill` instructions.
-  bool kills_pixels() const { return kills_pixels_; }
-  // A list of all vertex bindings, populated before translation occurs.
-  const std::vector<Shader::VertexBinding>& vertex_bindings() const {
-    return vertex_bindings_;
-  }
-  // A list of all texture bindings, populated before translation occurs.
-  const std::vector<Shader::TextureBinding>& texture_bindings() const {
-    return texture_bindings_;
+    return current_shader().type() == xenos::ShaderType::kPixel;
   }
 
-  // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
-  // .pdb.
-  static constexpr uint32_t kMaxMemExports = 16;
-  // Bits indicating which eM# registers have been written to after each
-  // `alloc export`, for up to kMaxMemExports exports. This will contain zero
-  // for certain corrupt exports - that don't write to eA before writing to eM#,
-  // or if the write was done any way other than MAD with a stream constant.
-  const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
-  // All c# registers used as the addend in MAD operations to eA, populated
-  // before translation occurs.
-  const std::set<uint32_t>& memexport_stream_constants() const {
-    return memexport_stream_constants_;
-  }
+  // Temporary register count, accessible via static and dynamic addressing.
+  uint32_t register_count() const { return register_count_; }
 
-  // Whether the shader can have early depth and stencil writing enabled, unless
-  // alpha test or alpha to coverage is enabled. Data gathered before
-  // translation.
-  bool CanWriteZEarly() const {
-    // TODO(Triang3l): Investigate what happens to memexport when the pixel
-    // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
-    // depth/stencil.
-    return !writes_depth_ && !kills_pixels_ &&
-           memexport_stream_constants_.empty();
-  }
-
-  // Current line number in the ucode disassembly.
-  size_t ucode_disasm_line_number() const { return ucode_disasm_line_number_; }
-  // Ucode disassembly buffer accumulated during translation.
-  StringBuffer& ucode_disasm_buffer() { return ucode_disasm_buffer_; }
   // Emits a translation error that will be passed back in the result.
   virtual void EmitTranslationError(const char* message, bool is_fatal = true);
 
@@ -143,10 +80,7 @@ class ShaderTranslator {
   }
 
   // Handles post-translation tasks when the shader has been fully translated.
-  // setup_shader_post_translation_info if non-modification-specific parameters
-  // of the Shader object behind the Translation can be set by this invocation.
-  virtual void PostTranslation(Shader::Translation& translation,
-                               bool setup_shader_post_translation_info) {}
+  virtual void PostTranslation() {}
   // Sets the host disassembly on a shader.
   void set_host_disassembly(Shader::Translation& translation,
                             std::string value) {
@@ -201,130 +135,23 @@ class ShaderTranslator {
   virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {}
 
  private:
-  struct AluOpcodeInfo {
-    const char* name;
-    uint32_t argument_count;
-    uint32_t src_swizzle_component_count;
-  };
-
-  bool TranslateInternal(Shader::Translation& translation);
-
-  void MarkUcodeInstruction(uint32_t dword_offset);
-  void AppendUcodeDisasm(char c);
-  void AppendUcodeDisasm(const char* value);
-  void AppendUcodeDisasmFormat(const char* format, ...);
-
-  void GatherInstructionInformation(const ucode::ControlFlowInstruction& cf);
-  void GatherVertexFetchInformation(const ucode::VertexFetchInstruction& op);
-  void GatherTextureFetchInformation(const ucode::TextureFetchInstruction& op);
   void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf);
-  void TranslateControlFlowNop(const ucode::ControlFlowInstruction& cf);
-  void TranslateControlFlowExec(const ucode::ControlFlowExecInstruction& cf);
-  void TranslateControlFlowCondExec(
-      const ucode::ControlFlowCondExecInstruction& cf);
-  void TranslateControlFlowCondExecPred(
-      const ucode::ControlFlowCondExecPredInstruction& cf);
-  void TranslateControlFlowLoopStart(
-      const ucode::ControlFlowLoopStartInstruction& cf);
-  void TranslateControlFlowLoopEnd(
-      const ucode::ControlFlowLoopEndInstruction& cf);
-  void TranslateControlFlowCondCall(
-      const ucode::ControlFlowCondCallInstruction& cf);
-  void TranslateControlFlowReturn(
-      const ucode::ControlFlowReturnInstruction& cf);
-  void TranslateControlFlowCondJmp(
-      const ucode::ControlFlowCondJmpInstruction& cf);
-  void TranslateControlFlowAlloc(const ucode::ControlFlowAllocInstruction& cf);
-
   void TranslateExecInstructions(const ParsedExecInstruction& instr);
 
-  void TranslateVertexFetchInstruction(const ucode::VertexFetchInstruction& op);
-  void ParseVertexFetchInstruction(const ucode::VertexFetchInstruction& op,
-                                   ParsedVertexFetchInstruction* out_instr);
-
-  void TranslateTextureFetchInstruction(
-      const ucode::TextureFetchInstruction& op);
-  void ParseTextureFetchInstruction(const ucode::TextureFetchInstruction& op,
-                                    ParsedTextureFetchInstruction* out_instr);
-
-  void TranslateAluInstruction(const ucode::AluInstruction& op);
-  void ParseAluInstruction(const ucode::AluInstruction& op,
-                           ParsedAluInstruction& out_instr) const;
-  static void ParseAluInstructionOperand(const ucode::AluInstruction& op,
-                                         uint32_t i,
-                                         uint32_t swizzle_component_count,
-                                         InstructionOperand& out_op);
-  static void ParseAluInstructionOperandSpecial(
-      const ucode::AluInstruction& op, InstructionStorageSource storage_source,
-      uint32_t reg, bool negate, int const_slot, uint32_t component_index,
-      InstructionOperand& out_op);
-
-  // Input shader metadata and microcode.
-  xenos::ShaderType shader_type_;
-  const uint32_t* ucode_dwords_;
-  size_t ucode_dword_count_;
-  uint32_t register_count_;
-
-  // Current host-side modification being generated.
-  uint32_t modification_ = 0;
+  // Current shader and modification being translated.
+  Shader::Translation* translation_ = nullptr;
 
   // Accumulated translation errors.
   std::vector<Shader::Error> errors_;
 
+  // Temporary register count, accessible via static and dynamic addressing.
+  uint32_t register_count_ = 0;
+
   // Current control flow dword index.
   uint32_t cf_index_ = 0;
 
-  // Microcode disassembly buffer, accumulated throughout the translation.
-  StringBuffer ucode_disasm_buffer_;
-  // Current line number in the disasm, which can be used for source annotation.
-  size_t ucode_disasm_line_number_ = 0;
-  // Last offset used when scanning for line numbers.
-  size_t previous_ucode_disasm_scan_offset_ = 0;
-
   // Kept for supporting vfetch_mini.
   ucode::VertexFetchInstruction previous_vfetch_full_;
-
-  // Labels that jumps (explicit or from loops) can be done to, gathered before
-  // translation.
-  std::set<uint32_t> label_addresses_;
-
-  // Detected binding information gathered before translation. Must not be
-  // affected by the modification index.
-  int total_attrib_count_ = 0;
-  std::vector<Shader::VertexBinding> vertex_bindings_;
-  std::vector<Shader::TextureBinding> texture_bindings_;
-  uint32_t unique_vertex_bindings_ = 0;
-  uint32_t unique_texture_bindings_ = 0;
-
-  // These all are gathered before translation.
-  // uses_register_dynamic_addressing_ for writes, writes_color_targets_,
-  // writes_depth_ don't include empty used write masks.
-  // Must not be affected by the modification index.
-  Shader::ConstantRegisterMap constant_register_map_ = {0};
-  bool uses_register_dynamic_addressing_ = false;
-  bool writes_color_targets_[4] = {false, false, false, false};
-  bool writes_depth_ = false;
-  bool kills_pixels_ = false;
-
-  // Memexport info is gathered before translation.
-  // Must not be affected by the modification index.
-  uint32_t memexport_alloc_count_ = 0;
-  // For register allocation in implementations - what was used after each
-  // `alloc export`.
-  uint32_t memexport_eA_written_ = 0;
-  uint8_t memexport_eM_written_[kMaxMemExports] = {0};
-  std::set<uint32_t> memexport_stream_constants_;
-
-  static const AluOpcodeInfo alu_vector_opcode_infos_[0x20];
-  static const AluOpcodeInfo alu_scalar_opcode_infos_[0x40];
-};
-
-class UcodeShaderTranslator : public ShaderTranslator {
- public:
-  UcodeShaderTranslator() = default;
-
- protected:
-  std::vector<uint8_t> CompleteTranslation() override;
 };
 
 }  // namespace gpu
diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index 0ff228d53..1063e8e0c 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -203,7 +203,9 @@ void SpirvShaderTranslator::StartTranslation() {
   push_consts_ = b.createVariable(spv::StorageClass::StorageClassPushConstant,
                                   push_constants_type, "push_consts");
 
-  if (!texture_bindings().empty()) {
+  const std::vector<Shader::TextureBinding>& texture_bindings =
+      current_shader().texture_bindings();
+  if (!texture_bindings.empty()) {
     image_2d_type_ =
         b.makeImageType(float_type_, spv::Dim::Dim2D, false, false, false, 1,
                         spv::ImageFormat::ImageFormatUnknown);
@@ -220,7 +222,7 @@ void SpirvShaderTranslator::StartTranslation() {
                   b.makeSampledImageType(image_cube_type_)};
 
     uint32_t num_tex_bindings = 0;
-    for (const auto& binding : texture_bindings()) {
+    for (const auto& binding : texture_bindings) {
       // Calculate the highest binding index.
       num_tex_bindings =
           std::max(num_tex_bindings, uint32_t(binding.binding_index + 1));
@@ -241,7 +243,7 @@ void SpirvShaderTranslator::StartTranslation() {
     }
 
     // Set up the map from binding -> ssbo index
-    for (const auto& binding : texture_bindings()) {
+    for (const auto& binding : texture_bindings) {
       tex_binding_map_[binding.fetch_constant] =
           uint32_t(binding.binding_index);
     }
@@ -254,7 +256,9 @@ void SpirvShaderTranslator::StartTranslation() {
     // Vertex inputs/outputs
     // Inputs: 32 SSBOs on DS 2 binding 0
 
-    if (!vertex_bindings().empty()) {
+    const std::vector<Shader::VertexBinding>& vertex_bindings =
+        current_shader().vertex_bindings();
+    if (!vertex_bindings.empty()) {
       // Runtime array for vertex data
       Id vtx_t = b.makeRuntimeArray(uint_type_);
       b.addDecoration(vtx_t, spv::Decoration::DecorationArrayStride,
@@ -269,7 +273,7 @@ void SpirvShaderTranslator::StartTranslation() {
 
       // Create the vertex bindings variable.
       Id vtx_a_t = b.makeArrayType(
-          vtx_s, b.makeUintConstant(uint32_t(vertex_bindings().size())), 0);
+          vtx_s, b.makeUintConstant(uint32_t(vertex_bindings.size())), 0);
       vtx_ = b.createVariable(spv::StorageClass::StorageClassUniform, vtx_a_t,
                               "vertex_bindings");
 
@@ -279,7 +283,7 @@ void SpirvShaderTranslator::StartTranslation() {
       b.addDecoration(vtx_, spv::Decoration::DecorationNonWritable);
 
       // Set up the map from binding -> ssbo index
-      for (const auto& binding : vertex_bindings()) {
+      for (const auto& binding : vertex_bindings) {
         vtx_binding_map_[binding.fetch_constant] = binding.binding_index;
       }
     }
@@ -494,7 +498,7 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
     b.addExecutionMode(mainFn, spv::ExecutionModeOriginUpperLeft);
 
     // If we write a new depth value, we must declare this mode!
-    if (writes_depth()) {
+    if (current_shader().writes_depth()) {
       b.addExecutionMode(mainFn, spv::ExecutionModeDepthReplacing);
     }
 
@@ -667,8 +671,12 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
   return spirv_bytes;
 }
 
-void SpirvShaderTranslator::PostTranslation(
-    Shader::Translation& translation, bool setup_shader_post_translation_info) {
+void SpirvShaderTranslator::PostTranslation() {
+  Shader::Translation& translation = current_translation();
+  if (!translation.is_valid()) {
+    return;
+  }
+
   // Validation.
   if (cvars::spv_validate) {
     auto validation = validator_.Validate(
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index 478aa3428..05e147895 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -58,11 +58,23 @@ class SpirvShaderTranslator : public ShaderTranslator {
   SpirvShaderTranslator();
   ~SpirvShaderTranslator() override;
 
+  // Not storing anything else in modifications (as this shader translator is
+  // being replaced anyway).
+  uint64_t GetDefaultModification(
+      xenos::ShaderType shader_type,
+      uint32_t dynamic_addressable_register_count,
+      Shader::HostVertexShaderType host_vertex_shader_type =
+          Shader::HostVertexShaderType::kVertex) const override {
+    return dynamic_addressable_register_count;
+  }
+
  protected:
+  uint32_t GetModificationRegisterCount() const override {
+    return uint32_t(current_translation().modification());
+  }
   void StartTranslation() override;
   std::vector<uint8_t> CompleteTranslation() override;
-  void PostTranslation(Shader::Translation& translation,
-                       bool setup_shader_post_translation_info) override;
+  void PostTranslation() override;
 
   void PreProcessControlFlowInstructions(
       std::vector<ucode::ControlFlowInstruction> instrs) override;
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index ea11f10cd..4570f9515 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -431,15 +431,14 @@ XEPACKEDUNION(ControlFlowInstruction, {
 static_assert_size(ControlFlowInstruction, 8);
 
 inline void UnpackControlFlowInstructions(const uint32_t* dwords,
-                                          ControlFlowInstruction* out_a,
-                                          ControlFlowInstruction* out_b) {
+                                          ControlFlowInstruction* out_ab) {
   uint32_t dword_0 = dwords[0];
   uint32_t dword_1 = dwords[1];
   uint32_t dword_2 = dwords[2];
-  out_a->dword_0 = dword_0;
-  out_a->dword_1 = dword_1 & 0xFFFF;
-  out_b->dword_0 = (dword_1 >> 16) | (dword_2 << 16);
-  out_b->dword_1 = dword_2 >> 16;
+  out_ab[0].dword_0 = dword_0;
+  out_ab[0].dword_1 = dword_1 & 0xFFFF;
+  out_ab[1].dword_0 = (dword_1 >> 16) | (dword_2 << 16);
+  out_ab[1].dword_1 = dword_2 >> 16;
 }
 
 enum class FetchOpcode : uint32_t {
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc
index 52bb607f4..1fbe5681c 100644
--- a/src/xenia/gpu/vulkan/pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/pipeline_cache.cc
@@ -364,10 +364,11 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state,
 }
 
 bool PipelineCache::TranslateShader(
-    VulkanShader::VulkanTranslation& translation, reg::SQ_PROGRAM_CNTL cntl) {
+    VulkanShader::VulkanTranslation& translation) {
+  translation.shader().AnalyzeUcode(ucode_disasm_buffer_);
   // Perform translation.
   // If this fails the shader will be marked as invalid and ignored later.
-  if (!shader_translator_->Translate(translation, cntl)) {
+  if (!shader_translator_->TranslateAnalyzedShader(translation)) {
     XELOGE("Shader translation failed; marking shader as ignored");
     return false;
   }
@@ -1071,9 +1072,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages(
       static_cast<VulkanShader::VulkanTranslation*>(
           vertex_shader->GetOrCreateTranslation(
               shader_translator_->GetDefaultModification(
-                  xenos::ShaderType::kVertex)));
+                  xenos::ShaderType::kVertex,
+                  vertex_shader->GetDynamicAddressableRegisterCount(
+                      regs.sq_program_cntl.vs_num_reg))));
   if (!vertex_shader_translation->is_translated() &&
-      !TranslateShader(*vertex_shader_translation, regs.sq_program_cntl)) {
+      !TranslateShader(*vertex_shader_translation)) {
     XELOGE("Failed to translate the vertex shader!");
     return UpdateStatus::kError;
   }
@@ -1083,9 +1086,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages(
     pixel_shader_translation = static_cast<VulkanShader::VulkanTranslation*>(
         pixel_shader->GetOrCreateTranslation(
             shader_translator_->GetDefaultModification(
-                xenos::ShaderType::kPixel)));
+                xenos::ShaderType::kPixel,
+                pixel_shader->GetDynamicAddressableRegisterCount(
+                    regs.sq_program_cntl.ps_num_reg))));
     if (!pixel_shader_translation->is_translated() &&
-        !TranslateShader(*pixel_shader_translation, regs.sq_program_cntl)) {
+        !TranslateShader(*pixel_shader_translation)) {
       XELOGE("Failed to translate the pixel shader!");
       return UpdateStatus::kError;
     }
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h
index d6a88fdcf..64d319165 100644
--- a/src/xenia/gpu/vulkan/pipeline_cache.h
+++ b/src/xenia/gpu/vulkan/pipeline_cache.h
@@ -12,6 +12,7 @@
 
 #include <unordered_map>
 
+#include "xenia/base/string_buffer.h"
 #include "xenia/base/xxhash.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/spirv_shader_translator.h"
@@ -78,8 +79,7 @@ class PipelineCache {
   // state.
   VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key);
 
-  bool TranslateShader(VulkanShader::VulkanTranslation& translation,
-                       reg::SQ_PROGRAM_CNTL cntl);
+  bool TranslateShader(VulkanShader::VulkanTranslation& translation);
 
   void DumpShaderDisasmAMD(VkPipeline pipeline);
   void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info);
@@ -92,6 +92,8 @@ class PipelineCache {
   RegisterFile* register_file_ = nullptr;
   ui::vulkan::VulkanDevice* device_ = nullptr;
 
+  // Temporary storage for AnalyzeUcode calls.
+  StringBuffer ucode_disasm_buffer_;
   // Reusable shader translator.
   std::unique_ptr<ShaderTranslator> shader_translator_ = nullptr;
   // Disassembler used to get the SPIRV disasm. Only used in debug.
diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc
index 2eb41e9e5..99333f062 100644
--- a/src/xenia/gpu/vulkan/vulkan_shader.cc
+++ b/src/xenia/gpu/vulkan/vulkan_shader.cc
@@ -73,7 +73,7 @@ bool VulkanShader::VulkanTranslation::Prepare() {
 }
 
 Shader::Translation* VulkanShader::CreateTranslationInstance(
-    uint32_t modification) {
+    uint64_t modification) {
   return new VulkanTranslation(*this, modification);
 }
 
diff --git a/src/xenia/gpu/vulkan/vulkan_shader.h b/src/xenia/gpu/vulkan/vulkan_shader.h
index 7d948ac71..76a196bff 100644
--- a/src/xenia/gpu/vulkan/vulkan_shader.h
+++ b/src/xenia/gpu/vulkan/vulkan_shader.h
@@ -23,7 +23,7 @@ class VulkanShader : public Shader {
  public:
   class VulkanTranslation : public Translation {
    public:
-    VulkanTranslation(VulkanShader& shader, uint32_t modification)
+    VulkanTranslation(VulkanShader& shader, uint64_t modification)
         : Translation(shader, modification) {}
     ~VulkanTranslation() override;
 
@@ -41,7 +41,7 @@ class VulkanShader : public Shader {
                uint32_t dword_count);
 
  protected:
-  Translation* CreateTranslationInstance(uint32_t modification) override;
+  Translation* CreateTranslationInstance(uint64_t modification) override;
 
  private:
   ui::vulkan::VulkanDevice* device_ = nullptr;
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index 1c21ed8ff..f8e178f15 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -546,33 +546,6 @@ inline int GetVertexFormatComponentCount(VertexFormat format) {
   }
 }
 
-inline int GetVertexFormatSizeInWords(VertexFormat format) {
-  switch (format) {
-    case VertexFormat::k_8_8_8_8:
-    case VertexFormat::k_2_10_10_10:
-    case VertexFormat::k_10_11_11:
-    case VertexFormat::k_11_11_10:
-    case VertexFormat::k_16_16:
-    case VertexFormat::k_16_16_FLOAT:
-    case VertexFormat::k_32:
-    case VertexFormat::k_32_FLOAT:
-      return 1;
-    case VertexFormat::k_16_16_16_16:
-    case VertexFormat::k_16_16_16_16_FLOAT:
-    case VertexFormat::k_32_32:
-    case VertexFormat::k_32_32_FLOAT:
-      return 2;
-    case VertexFormat::k_32_32_32_FLOAT:
-      return 3;
-    case VertexFormat::k_32_32_32_32:
-    case VertexFormat::k_32_32_32_32_FLOAT:
-      return 4;
-    default:
-      assert_unhandled_case(format);
-      return 1;
-  }
-}
-
 inline uint32_t GetVertexFormatNeededWords(VertexFormat format,
                                            uint32_t used_components) {
   assert_zero(used_components & ~uint32_t(0b1111));