diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 278162d63..29bb913a5 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2961,6 +2961,14 @@ bool D3D12CommandProcessor::UpdateBindings( (!samplers_written_pixel_ || current_samplers_hash_pixel_ != samplers_hash_pixel); + // These are the constant base addresses/ranges for shaders. + // We have these hardcoded right now because nothing seems to differ on the Xbox + // 360 (however, OpenGL ES on Adreno 200 on Android has different ranges). + assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || + regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || + regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); + // Check if the float constant layout is still the same and get the counts. const Shader::ConstantRegisterMap& float_constant_map_vertex = vertex_shader->constant_register_map(); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 8151d90eb..2a694d59a 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -809,14 +809,6 @@ bool PipelineCache::EnsureShadersTranslated( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, Shader::HostVertexShaderType host_vertex_shader_type) { auto& regs = *register_file_; - - // These are the constant base addresses/ranges for shaders. - // We have these hardcoded right now cause nothing seems to differ. - assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || - regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); - assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || - regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); - auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>(); // Edge flags are not supported yet (because polygon primitives are not). diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 5f3ed1c38..31f2a680e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -18,6 +18,7 @@ #include "xenia/base/assert.h" #include "xenia/base/cvar.h" +#include "xenia/base/math.h" DEFINE_bool(dxbc_switch, true, "Use switch rather than if for flow control. Turning this off or " @@ -86,7 +87,6 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, // Don't allocate again and again for the first shader. shader_code_.reserve(8192); shader_object_.reserve(16384); - float_constant_index_offsets_.reserve(512); } DxbcShaderTranslator::~DxbcShaderTranslator() = default; @@ -161,8 +161,6 @@ void DxbcShaderTranslator::Reset() { cbuffer_index_fetch_constants_ = kCbufferIndexUnallocated; system_constants_used_ = 0; - float_constants_dynamic_indexed_ = false; - float_constant_index_offsets_.clear(); in_control_point_index_used_ = false; @@ -1166,29 +1164,6 @@ void DxbcShaderTranslator::CompleteShaderCode() { // Release system_temps_subroutine_. PopSystemTemp(system_temps_subroutine_count_); - - // Remap float constant indices if not indexed dynamically.
- if (!float_constants_dynamic_indexed_ && - !float_constant_index_offsets_.empty()) { - uint8_t float_constant_map[256] = {}; - uint32_t float_constant_count = 0; - for (uint32_t i = 0; i < 4; ++i) { - uint64_t float_constants_used = constant_register_map().float_bitmap[i]; - uint32_t float_constant_index; - while ( - xe::bit_scan_forward(float_constants_used, &float_constant_index)) { - float_constants_used &= ~(1ull << float_constant_index); - float_constant_map[i * 64 + float_constant_index] = - float_constant_count++; - } - } - size_t index_count = float_constant_index_offsets_.size(); - for (size_t i = 0; i < index_count; ++i) { - uint32_t index_offset = float_constant_index_offsets_[i]; - shader_code_[index_offset] = - float_constant_map[shader_code_[index_offset] & 255]; - } - } } std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() { @@ -1420,7 +1395,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, kSwizzleXYZW, 2)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(operand.storage_index)); + shader_code_.push_back(operand.storage_index); } else { shader_code_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | @@ -1433,7 +1408,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( D3D10_SB_OPERAND_INDEX_IMMEDIATE32, D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(operand.storage_index)); + shader_code_.push_back(operand.storage_index); shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1)); shader_code_.push_back(dynamic_address_register); @@ -1445,7 +1420,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( assert_true(operand.storage_addressing_mode == InstructionStorageAddressingMode::kStatic); dxbc_operand.type = DxbcSourceOperand::Type::kRegister; - dxbc_operand.index = uint32_t(operand.storage_index); + dxbc_operand.index = operand.storage_index; } break; @@ -1457,11 +1432,18 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( cbuffer_index_float_constants_ = cbuffer_count_++; } dxbc_operand.type = DxbcSourceOperand::Type::kConstantFloat; - dxbc_operand.index = uint32_t(operand.storage_index); dxbc_operand.addressing_mode = operand.storage_addressing_mode; - if (operand.storage_addressing_mode != + if (operand.storage_addressing_mode == InstructionStorageAddressingMode::kStatic) { - float_constants_dynamic_indexed_ = true; + uint32_t float_constant_index = + constant_register_map().GetPackedFloatConstantIndex( + operand.storage_index); + assert_true(float_constant_index != UINT32_MAX); + dxbc_operand.index = + float_constant_index != UINT32_MAX ? float_constant_index : 0; + } else { + assert_true(constant_register_map().float_dynamic_addressing); + dxbc_operand.index = operand.storage_index; } break; @@ -1652,11 +1634,6 @@ void DxbcShaderTranslator::UseDxbcSourceOperand( } shader_code_.push_back(cbuffer_index_float_constants_); shader_code_.push_back(uint32_t(CbufferRegister::kFloatConstants)); - if (!float_constants_dynamic_indexed_) { - // If there's no dynamic indexing in the shader, constants are compacted - // and remapped. Store where the index has been written.
- float_constant_index_offsets_.push_back(uint32_t(shader_code_.size())); - } shader_code_.push_back(operand.index); if (!is_static) { uint32_t dynamic_address_register, dynamic_address_component; @@ -1718,8 +1695,9 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand( void DxbcShaderTranslator::StoreResult(const InstructionResult& result, uint32_t reg, bool replicate_x, bool can_store_memexport_address) { + uint32_t used_write_mask = result.GetUsedWriteMask(); if (result.storage_target == InstructionStorageTarget::kNone || - !result.has_any_writes()) { + !used_write_mask) { return; } @@ -1744,10 +1722,9 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped); // Scalar targets get only one component. + // TODO(Triang3l): It's not replicated, it's X specifically. if (result.storage_target == InstructionStorageTarget::kDepth) { - if (!result.write_mask[0]) { - return; - } + assert_not_zero(used_write_mask & 0b0001); SwizzleSource component = result.components[0]; if (replicate_x && component <= SwizzleSource::kW) { component = SwizzleSource::kX; } @@ -1802,7 +1779,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, uint32_t constant_mask = 0; uint32_t constant_values = 0; for (uint32_t i = 0; i < 4; ++i) { - if (!result.write_mask[i]) { + if (!(used_write_mask & (1 << i))) { continue; } SwizzleSource component = result.components[i]; @@ -1858,7 +1835,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32 : D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(result.storage_index)); + shader_code_.push_back(result.storage_index); if (!is_static) { shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1)); @@ -1874,11 +1851,11 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, saturate_bit); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(uint32_t(result.storage_index)); + shader_code_.push_back(result.storage_index); } break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: ++stat_.instruction_count; ++stat_.mov_instruction_count; shader_code_.push_back( @@ -1943,7 +1920,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, [uint32_t(result.storage_index)]); break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: ++stat_.instruction_count; ++stat_.mov_instruction_count; shader_code_.push_back( @@ -1952,8 +1929,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, saturate_bit); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back( - system_temps_color_[uint32_t(result.storage_index)]); + shader_code_.push_back(system_temps_color_[result.storage_index]); break; default: @@ -1989,13 +1965,13 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back( - 1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3))); + uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3))); ++stat_.instruction_count; ++stat_.uint_instruction_count; } if (edram_rov_used_ && - result.storage_target == 
InstructionStorageTarget::kColorTarget) { + result.storage_target == InstructionStorageTarget::kColor) { // For ROV output, mark that the color has been written to. // According to: // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color @@ -2014,7 +1990,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, shader_code_.push_back(system_temp_rov_params_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(1 << (8 + uint32_t(result.storage_index))); + shader_code_.push_back(1 << (8 + result.storage_index)); ++stat_.instruction_count; ++stat_.uint_instruction_count; } @@ -2479,19 +2455,6 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator:: }; void DxbcShaderTranslator::WriteResourceDefinitions() { - // *************************************************************************** - // Preparation - // *************************************************************************** - - // Float constant count. - uint32_t float_constant_count = 0; - if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { - for (uint32_t i = 0; i < 4; ++i) { - float_constant_count += - xe::bit_count(constant_register_map().float_bitmap[i]); - } - } - uint32_t chunk_position_dwords = uint32_t(shader_object_.size()); uint32_t new_offset; @@ -2583,7 +2546,8 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { if (RdefTypeIndex(i) == RdefTypeIndex::kFloat4ConstantArray) { // Declaring a 0-sized array may not be safe, so write something valid // even if they aren't used. - shader_object_.push_back(std::max(float_constant_count, 1u)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1))); } else { shader_object_.push_back(type.element_count | (type.struct_member_count << 16)); @@ -2692,8 +2656,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { shader_object_.push_back(constant_name_offset_float); shader_object_.push_back(0); - shader_object_.push_back(std::max(float_constant_count, 1u) * 4 * - sizeof(float)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1)) * 4 * + sizeof(float)); shader_object_.push_back(kDxbcRdefVariableFlagUsed); shader_object_.push_back(types_offset + uint32_t(RdefTypeIndex::kFloat4ConstantArray) * @@ -2795,8 +2760,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { shader_object_.push_back(cbuffer_name_offset_float); shader_object_.push_back(1); shader_object_.push_back(constant_offset_float); - shader_object_.push_back(std::max(float_constant_count, 1u) * 4 * - sizeof(float)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1)) * 4 * + sizeof(float)); shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer)); shader_object_.push_back(0); } else if (i == cbuffer_index_bool_loop_constants_) { @@ -3646,15 +3612,10 @@ void DxbcShaderTranslator::WriteShaderCode() { // Constant buffers, from most frequently accessed to least frequently accessed // (the order is a hint to the driver according to the DXBC header).
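// (A dynamically addressed constant buffer must be declared with the dynamicIndexed access pattern in dcl_constantBuffer - immediateIndexed presumably lets the driver assume that only immediately specified offsets are read - which is why float_dynamic_addressing has to be known before the declaration below is emitted.)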
if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { - uint32_t float_constant_count = 0; - for (uint32_t i = 0; i < 4; ++i) { - float_constant_count += - xe::bit_count(constant_register_map().float_bitmap[i]); - } shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) | ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN( - float_constants_dynamic_indexed_ + constant_register_map().float_dynamic_addressing ? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED : D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); @@ -3663,7 +3624,7 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(cbuffer_index_float_constants_); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); - shader_object_.push_back(float_constant_count); + shader_object_.push_back(constant_register_map().float_count); shader_object_.push_back(0); } if (cbuffer_index_system_constants_ != kCbufferIndexUnallocated) { diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index cb23fa511..3fff2c561 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -857,10 +857,10 @@ class DxbcShaderTranslator : public ShaderTranslator { return 0b0000; } } - DxbcDest Mask(uint32_t write_mask) const { + [[nodiscard]] DxbcDest Mask(uint32_t write_mask) const { return DxbcDest(type_, write_mask, index_1d_, index_2d_, index_3d_); } - DxbcDest MaskMasked(uint32_t write_mask) const { + [[nodiscard]] DxbcDest MaskMasked(uint32_t write_mask) const { return DxbcDest(type_, write_mask_ & write_mask, index_1d_, index_2d_, index_3d_); } @@ -991,26 +991,28 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcSrc(DxbcOperandType::kInputCoverageMask, kXXXX); } - DxbcSrc WithModifiers(bool absolute, bool negate) const { + [[nodiscard]] DxbcSrc WithModifiers(bool absolute, bool negate) const { DxbcSrc new_src(*this); new_src.absolute_ = absolute; new_src.negate_ = negate; return new_src; } - DxbcSrc WithAbs(bool absolute) const { + [[nodiscard]] DxbcSrc WithAbs(bool absolute) const { return WithModifiers(absolute, negate_); } - DxbcSrc WithNeg(bool negate) const { + [[nodiscard]] DxbcSrc WithNeg(bool negate) const { return WithModifiers(absolute_, negate); } - DxbcSrc Abs() const { return WithModifiers(true, false); } - DxbcSrc operator-() const { return WithModifiers(absolute_, !negate_); } - DxbcSrc Swizzle(uint32_t swizzle) const { + [[nodiscard]] DxbcSrc Abs() const { return WithModifiers(true, false); } + [[nodiscard]] DxbcSrc operator-() const { + return WithModifiers(absolute_, !negate_); + } + [[nodiscard]] DxbcSrc Swizzle(uint32_t swizzle) const { DxbcSrc new_src(*this); new_src.swizzle_ = swizzle; return new_src; } - DxbcSrc SwizzleSwizzled(uint32_t swizzle) const { + [[nodiscard]] DxbcSrc SwizzleSwizzled(uint32_t swizzle) const { DxbcSrc new_src(*this); new_src.swizzle_ = 0; for (uint32_t i = 0; i < 4; ++i) { @@ -1019,12 +1021,12 @@ class DxbcShaderTranslator : public ShaderTranslator { } return new_src; } - DxbcSrc Select(uint32_t component) const { + [[nodiscard]] DxbcSrc Select(uint32_t component) const { DxbcSrc new_src(*this); new_src.swizzle_ = component * 0b01010101; return new_src; } - DxbcSrc SelectFromSwizzled(uint32_t component) const { + [[nodiscard]] DxbcSrc SelectFromSwizzled(uint32_t component) const { DxbcSrc new_src(*this); new_src.swizzle_ = 
((swizzle_ >> (component * 2)) & 3) * 0b01010101; return new_src; } @@ -2026,6 +2028,7 @@ class DxbcShaderTranslator : public ShaderTranslator { void EmitInstructionDisassembly(); // Abstract 4-component vector source operand. + // TODO(Triang3l): Remove after fully moving to the new emitter. struct DxbcSourceOperand { enum class Type { // GPR number in the index - used only when GPRs are not dynamically @@ -2064,18 +2067,22 @@ class DxbcShaderTranslator : public ShaderTranslator { }; // Each Load must be followed by Unload, otherwise there may be a temporary // register leak. + // TODO(Triang3l): Remove after fully moving to the new emitter. void LoadDxbcSourceOperand(const InstructionOperand& operand, DxbcSourceOperand& dxbc_operand); // Number of tokens this operand adds to the instruction length when used. + // TODO(Triang3l): Remove after fully moving to the new emitter. uint32_t DxbcSourceOperandLength(const DxbcSourceOperand& operand, bool negate = false, bool absolute = false) const; // Writes the operand access tokens to the instruction (either for a scalar if // select_component is <= 3, or for a vector). + // TODO(Triang3l): Remove after fully moving to the new emitter. void UseDxbcSourceOperand(const DxbcSourceOperand& operand, uint32_t additional_swizzle = kSwizzleXYZW, uint32_t select_component = 4, bool negate = false, bool absolute = false); + // TODO(Triang3l): Remove after fully moving to the new emitter. void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand); // Writes xyzw or xxxx of the specified r# to the destination. @@ -2258,15 +2265,6 @@ class DxbcShaderTranslator : public ShaderTranslator { // the remaining ones can be marked as unused in RDEF. uint64_t system_constants_used_; - // Whether constants are dynamically indexed and need to be marked as such in - // dcl_constantBuffer. - bool float_constants_dynamic_indexed_; - - // Offsets of float constant indices in shader_code_, for remapping in - // CompleteTranslation (initially, at these offsets, guest float constant - // indices are written). - std::vector<uint32_t> float_constant_index_offsets_; - // Whether InOutRegister::kDSInControlPointIndex has been used in the shader.
bool in_control_point_index_used_; diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index fecdf6fdf..6b253dd2e 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -23,7 +23,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( replicate_result_x = false; predicate_written = false; - if (!instr.has_vector_op) { + if (!instr.vector_and_constant_result.GetUsedWriteMask() && + !AluVectorOpHasSideEffects(instr.vector_opcode)) { return false; } @@ -32,7 +33,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( if (instr.vector_opcode == AluVectorOpcode::kCube) { operand_count = 1; } else { - operand_count = uint32_t(instr.vector_operand_count); + operand_count = instr.vector_operand_count; } DxbcSourceOperand dxbc_operands[3]; // Whether the operand is the same as any previous operand, and thus is loaded @@ -42,7 +43,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( for (uint32_t i = 0; i < operand_count; ++i) { const InstructionOperand& operand = instr.vector_operands[i]; for (uint32_t j = 0; j < i; ++j) { - if (operand == instr.vector_operands[j]) { + if (operand.GetIdenticalComponents(instr.vector_operands[j]) == 0b1111) { operands_duplicate[i] = true; dxbc_operands[i] = dxbc_operands[j]; break; @@ -117,7 +118,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[1]); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) != 0b1111) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0), // flushing denormals (must be done using eq - doing bitwise comparison // doesn't flush denormals). @@ -281,7 +283,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[2]); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) != 0b1111) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). // If any operand is zero or denormalized, just leave the addition part. uint32_t is_subnormal_temp = PushSystemTemp(); @@ -388,7 +391,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( case AluVectorOpcode::kDp4: case AluVectorOpcode::kDp3: case AluVectorOpcode::kDp2Add: { - if (instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) == 0b1111) { // The operands are the same when calculating vector length, no need to // emulate 0 * anything = 0 in this case. shader_code_.push_back( @@ -1092,7 +1096,9 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) & + 0b0010)) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). // This is an attenuation calculation function, so infinity is probably // not very unlikely.
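// (Concretely: under IEEE arithmetic 0 * INF is NaN rather than the 0 the guest expects, so without this fixup an infinite operand could leak NaN into the result.)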
@@ -1294,7 +1300,8 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( const ParsedAluInstruction& instr, bool& predicate_written) { predicate_written = false; - if (!instr.has_scalar_op) { + if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !instr.scalar_result.GetUsedWriteMask()) { return false; } @@ -1306,7 +1313,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( for (uint32_t i = 0; i < uint32_t(instr.scalar_operand_count); ++i) { const InstructionOperand& operand = instr.scalar_operands[i]; for (uint32_t j = 0; j < i; ++j) { - if (operand == instr.scalar_operands[j]) { + if (operand.GetIdenticalComponents(instr.scalar_operands[j]) == 0b1111) { operands_duplicate[i] = true; dxbc_operands[i] = dxbc_operands[j]; break; @@ -2303,7 +2310,9 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 0); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.scalar_operands[0].EqualsAbsolute(instr.scalar_operands[1])) { + if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + instr.scalar_operands[1]) & + 0b0001)) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). uint32_t is_subnormal_temp = PushSystemTemp(); // Get the non-NaN multiplicand closer to zero to check if any of them @@ -2421,7 +2430,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( void DxbcShaderTranslator::ProcessAluInstruction( const ParsedAluInstruction& instr) { - if (instr.is_nop()) { + if (instr.IsNop()) { return; } @@ -2445,7 +2454,8 @@ void DxbcShaderTranslator::ProcessAluInstruction( ProcessScalarAluOperation(instr, predicate_written_scalar); if (store_vector) { - StoreResult(instr.vector_result, system_temp_pv_, replicate_vector_x, + StoreResult(instr.vector_and_constant_result, system_temp_pv_, + replicate_vector_x, instr.GetMemExportStreamConstant() != UINT32_MAX); } if (store_scalar) { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index b1502ce1e..25941c4bd 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -10,10 +10,12 @@ #ifndef XENIA_GPU_SHADER_H_ #define XENIA_GPU_SHADER_H_ +#include <algorithm> #include <memory> #include <string> #include <vector> +#include "xenia/base/math.h" #include "xenia/base/string_buffer.h" #include "xenia/gpu/ucode.h" #include "xenia/gpu/xenos.h" @@ -21,23 +23,32 @@ namespace xe { namespace gpu { +// The structures here are used for both translation and disassembly. +// +// Because disassembly uses them too, to make sure the "assemble -> disassemble +// -> reassemble" round trip is always successful with the XNA assembler (as it +// is the accuracy benchmark for translation), only generalization - not +// optimization like nop skipping/replacement - must be done while converting +// microcode to these structures (in other words, parsed shader code should be +// enough to accurately reconstruct the microcode for any shader that could be +// written by a human in assembly). +// +// During the "parsed -> host" part of the translation, however, translators +// are free to make any optimizations they find appropriate (as long as they +// don't affect the result, of course). + enum class InstructionStorageTarget { // Result is not stored. kNone, // Result is stored to a temporary register indexed by storage_index [0-31]. kRegister, - // Result is stored into a vertex shader interpolant export [0-15]. - kInterpolant, + // Result is stored into a vertex shader interpolator export [0-15]. + kInterpolator, // Result is stored to the position export (gl_Position).
kPosition, - // Result is stored to the vertex shader misc export register. - // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG, - // USE_VTX_KILL_FLAG). - // X - PSIZE (gl_PointSize). - // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point - // drawing. - // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set - // for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition. + // Result is stored to the vertex shader misc export register, see + // ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex for description of + // components. kPointSizeEdgeFlagKillVertex, // Result is stored as memexport destination address // (see xenos::xe_gpu_memexport_stream_t). @@ -45,11 +56,29 @@ enum class InstructionStorageTarget { // Result is stored to memexport destination data. kExportData, // Result is stored to a color target export indexed by storage_index [0-3]. - kColorTarget, - // Result is stored to the depth export (gl_FragDepth). + kColor, + // X of the result is stored to the depth export (gl_FragDepth). kDepth, }; +// Must be used only in translation to skip unused components, but not in +// disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both +// skipped components and zeros, which cannot be encoded, and therefore it will +// not be assembled). +constexpr uint32_t GetInstructionStorageTargetUsedComponents( + InstructionStorageTarget target) { + switch (target) { + case InstructionStorageTarget::kNone: + return 0b0000; + case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: + return 0b0111; + case InstructionStorageTarget::kDepth: + return 0b0001; + default: + return 0b1111; + } +} enum class InstructionStorageAddressingMode { // The storage index is not dynamically addressed. kStatic, @@ -75,71 +104,63 @@ enum class SwizzleSource { k1, }; -constexpr SwizzleSource GetSwizzleFromComponentIndex(int i) { +constexpr SwizzleSource GetSwizzleFromComponentIndex(uint32_t i) { return static_cast<SwizzleSource>(i); } -inline char GetCharForComponentIndex(int i) { +inline char GetCharForComponentIndex(uint32_t i) { const static char kChars[] = {'x', 'y', 'z', 'w'}; return kChars[i]; } inline char GetCharForSwizzle(SwizzleSource swizzle_source) { const static char kChars[] = {'x', 'y', 'z', 'w', '0', '1'}; - return kChars[static_cast<int>(swizzle_source)]; + return kChars[static_cast<uint32_t>(swizzle_source)]; } struct InstructionResult { // Where the result is going. InstructionStorageTarget storage_target = InstructionStorageTarget::kNone; // Index into the storage_target, if it is indexed. - int storage_index = 0; + uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = InstructionStorageAddressingMode::kStatic; - // True if the result is exporting from the shader. - bool is_export = false; // True to clamp the result value to [0-1]. bool is_clamped = false; - // Defines whether each output component is written. - bool write_mask[4] = {false, false, false, false}; + // Defines whether each output component is written, though this is from the + // original microcode, not taking into account whether such components + // actually exist in the target. + uint32_t original_write_mask = 0b0000; // Defines the source for each output component xyzw. SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY, SwizzleSource::kZ, SwizzleSource::kW}; - // Returns true if any component is written to.
- bool has_any_writes() const { - return write_mask[0] || write_mask[1] || write_mask[2] || write_mask[3]; - } - // Returns true if all components are written to. - bool has_all_writes() const { - return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3]; - } - // Returns number of components written - uint32_t num_writes() const { - uint32_t total = 0; - for (int i = 0; i < 4; i++) { - if (write_mask[i]) { - total++; - } - } - - return total; - } - // Returns true if any non-constant components are written. - bool stores_non_constants() const { - for (int i = 0; i < 4; ++i) { - if (write_mask[i] && components[i] != SwizzleSource::k0 && - components[i] != SwizzleSource::k1) { - return true; - } - } - return false; + // Returns the write mask containing only components actually present in the + // target. + uint32_t GetUsedWriteMask() const { + return original_write_mask & + GetInstructionStorageTargetUsedComponents(storage_target); } // True if the components are in their 'standard' swizzle arrangement (xyzw). - bool is_standard_swizzle() const { - return has_all_writes() && components[0] == SwizzleSource::kX && + bool IsStandardSwizzle() const { + return (GetUsedWriteMask() == 0b1111) && + components[0] == SwizzleSource::kX && components[1] == SwizzleSource::kY && components[2] == SwizzleSource::kZ && components[3] == SwizzleSource::kW; } + // Returns the components of the result, before swizzling, that won't be + // discarded or replaced with a constant. + uint32_t GetUsedResultComponents() const { + uint32_t used_write_mask = GetUsedWriteMask(); + uint32_t used_components = 0b0000; + for (uint32_t i = 0; i < 4; ++i) { + if ((used_write_mask & (1 << i)) && components[i] >= SwizzleSource::kX && + components[i] <= SwizzleSource::kW) { + used_components |= + 1 << (uint32_t(components[i]) - uint32_t(SwizzleSource::kX)); + } + } + return used_components; + } }; enum class InstructionStorageSource { @@ -159,7 +180,7 @@ struct InstructionOperand { // Where the source comes from. InstructionStorageSource storage_source = InstructionStorageSource::kRegister; // Index into the storage_target, if it is indexed. - int storage_index = 0; + uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = InstructionStorageAddressingMode::kStatic; @@ -168,13 +189,19 @@ struct InstructionOperand { // True to take the absolute value of the source (before any negation). bool is_absolute_value = false; // Number of components taken from the source operand. - int component_count = 0; + uint32_t component_count = 4; // Defines the source for each component xyzw (up to the given // component_count). SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY, SwizzleSource::kZ, SwizzleSource::kW}; + // Returns the swizzle source for the component, replicating the rightmost + // component if there are less than 4 components (similar to what the Xbox 360 + // shader compiler does as a general rule for unspecified components). + SwizzleSource GetComponent(uint32_t index) const { + return components[std::min(index, component_count - 1)]; + } // True if the components are in their 'standard' swizzle arrangement (xyzw). 
- bool is_standard_swizzle() const { + bool IsStandardSwizzle() const { switch (component_count) { case 4: return components[0] == SwizzleSource::kX && @@ -185,26 +212,32 @@ return false; } - // Whether absolute values of two operands are identical (useful for emulating - // Shader Model 3 0*anything=0 multiplication behavior). - bool EqualsAbsolute(const InstructionOperand& other) const { + // Returns which components of two operands are identical, but may have + // different signs (for simplicity of usage with GetComponent, treating the + // rightmost component as replicated). + uint32_t GetAbsoluteIdenticalComponents( + const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode || - component_count != other.component_count) { - return false; + storage_addressing_mode != other.storage_addressing_mode) { + return 0; } - for (int i = 0; i < component_count; ++i) { - if (components[i] != other.components[i]) { - return false; - } + uint32_t identical_components = 0; + for (uint32_t i = 0; i < 4; ++i) { + identical_components |= uint32_t(GetComponent(i) == other.GetComponent(i)) + << i; } - return true; + return identical_components; } - - bool operator==(const InstructionOperand& other) const { - return EqualsAbsolute(other) && is_negated == other.is_negated && - is_absolute_value == other.is_absolute_value; + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). + uint32_t GetIdenticalComponents(const InstructionOperand& other) const { + if (is_negated != other.is_negated || + is_absolute_value != other.is_absolute_value) { + return 0; + } + return GetAbsoluteIdenticalComponents(other); } }; @@ -365,9 +398,6 @@ struct ParsedAllocInstruction { }; struct ParsedVertexFetchInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - // Opcode for the instruction. ucode::FetchOpcode opcode; // Friendly name of the instruction. @@ -409,9 +439,6 @@ struct ParsedTextureFetchInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - // Opcode for the instruction. ucode::FetchOpcode opcode; // Friendly name of the instruction. @@ -462,17 +489,6 @@ struct ParsedAluInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - - // True if the vector part of the instruction needs to be executed and data - // about it in this structure is valid. - bool has_vector_op = false; - // True if the scalar part of the instruction needs to be executed and data - // about it in this structure is valid. - bool has_scalar_op = false; - bool is_nop() const { return !has_vector_op && !has_scalar_op; } - // Opcode for the vector part of the instruction. ucode::AluVectorOpcode vector_opcode = ucode::AluVectorOpcode::kAdd; // Opcode for the scalar part of the instruction. @@ -488,8 +504,20 @@ struct ParsedAluInstruction { // Expected predication condition value if predicated. bool predicate_condition = false; - // Describes how the vector operation result is stored. - InstructionResult vector_result; + // Describes how the vector operation result and, for exports, constant 0/1 + // are stored.
For simplicity of translation and disassembly, constant 0/1 + // writes are treated as a part of the vector operation - they need to be + // expressed somehow in the disassembly anyway with a properly disassembled + // instruction even if only constants are being exported. The XNA disassembler + // falls back to displaying the whole vector operation, even if only constant + // components are written, if the scalar operation is a nop or if the vector + // operation has side effects (but if the scalar operation isn't a nop, it + // outputs the entire constant mask in the scalar operation destination). + // Normally the XNA disassembler outputs the constant mask in both vector and + // scalar operations, but that's not required by the assembler, so it doesn't + // really matter whether it's specified in the vector operation, in the scalar + // operation, or in both. + InstructionResult vector_and_constant_result; // Describes how the scalar operation result is stored. InstructionResult scalar_result; // Both operations must be executed before any result is stored if vector and @@ -499,27 +527,109 @@ struct ParsedAluInstruction { // operations. // Number of source operands of the vector operation. - size_t vector_operand_count = 0; + uint32_t vector_operand_count = 0; // Describes each source operand of the vector operation. InstructionOperand vector_operands[3]; // Number of source operands of the scalar operation. - size_t scalar_operand_count = 0; + uint32_t scalar_operand_count = 0; // Describes each source operand of the scalar operation. InstructionOperand scalar_operands[2]; - // If this is a valid eA write (MAD with a stream constant), returns the index - // of the stream float constant, otherwise returns UINT32_MAX. + // Whether the vector part of the instruction is the same as if it was omitted + // in the assembly (if compiled or assembled with the Xbox 360 shader + // compiler), and thus reassembling the shader with this instruction omitted + // will result in the same microcode (since instructions with just an empty + // write mask may have different values in other fields). + // This is for disassembly! Translators should use the write masks and + // AluVectorOpHasSideEffects to skip operations, as this only covers one very + // specific nop format!
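+ // (For reference, the pattern the checks below match - the compiler's default vector nop - is kMax of r0 with itself written back to r0 with an empty write mask, roughly `max r0._, r0, r0` in assembly.)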
+ bool IsVectorOpDefaultNop() const { + if (vector_opcode != ucode::AluVectorOpcode::kMax || + vector_and_constant_result.original_write_mask || + vector_and_constant_result.is_clamped || + vector_operands[0].storage_source != + InstructionStorageSource::kRegister || + vector_operands[0].storage_index != 0 || + vector_operands[0].storage_addressing_mode != + InstructionStorageAddressingMode::kStatic || + vector_operands[0].is_negated || vector_operands[0].is_absolute_value || + !vector_operands[0].IsStandardSwizzle() || + vector_operands[1].storage_source != + InstructionStorageSource::kRegister || + vector_operands[1].storage_index != 0 || + vector_operands[1].storage_addressing_mode != + InstructionStorageAddressingMode::kStatic || + vector_operands[1].is_negated || vector_operands[1].is_absolute_value || + !vector_operands[1].IsStandardSwizzle()) { + return false; + } + if (vector_and_constant_result.storage_target == + InstructionStorageTarget::kRegister) { + if (vector_and_constant_result.storage_index != 0 || + vector_and_constant_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + return false; + } + } else { + // In case both vector and scalar operations are nop, still need to write + // somewhere that it's an export, not mov r0._, r0 + retain_prev r0._. + // Accurate round trip is possible only if the target is o0 or oC0, + // because if the total write mask is empty, the XNA assembler forces the + // destination to be o0/oC0, but this doesn't really matter in this case. + if (IsScalarOpDefaultNop()) { + return false; + } + } + return true; + } + + // Whether the scalar part of the instruction is the same as if it was omitted + // in the assembly (if compiled or assembled with the Xbox 360 shader + // compiler), and thus reassembling the shader with this instruction omitted + // will result in the same microcode (since instructions with just an empty + // write mask may have different values in other fields). + bool IsScalarOpDefaultNop() const { + if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev || + scalar_result.original_write_mask || scalar_result.is_clamped) { + return false; + } + if (scalar_result.storage_target == InstructionStorageTarget::kRegister) { + if (scalar_result.storage_index != 0 || + scalar_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + return false; + } + } + // For exports, if both are nop, the vector operation will be kept to state + // in the microcode that the destination in the microcode is an export. + return true; + } + + // For translation (not disassembly) - whether this instruction has totally no + // effect. + bool IsNop() const { + return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !scalar_result.GetUsedWriteMask() && + !vector_and_constant_result.GetUsedWriteMask() && + !ucode::AluVectorOpHasSideEffects(vector_opcode); + } + + // If this is a "normal" eA write recognized by Xenia (MAD with a stream + // constant), returns the index of the stream float constant, otherwise + // returns UINT32_MAX. 
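+ // (Illustrative, with hypothetical register numbers: for `mad eA, r3, const0100, c10` this returns 10.)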
uint32_t GetMemExportStreamConstant() const { - if (has_vector_op && - vector_result.storage_target == + if (vector_and_constant_result.storage_target == InstructionStorageTarget::kExportAddress && vector_opcode == ucode::AluVectorOpcode::kMad && - vector_result.has_all_writes() && + vector_and_constant_result.GetUsedResultComponents() == 0b1111 && + !vector_and_constant_result.is_clamped && vector_operands[2].storage_source == InstructionStorageSource::kConstantFloat && vector_operands[2].storage_addressing_mode == InstructionStorageAddressingMode::kStatic && - vector_operands[2].is_standard_swizzle()) { + vector_operands[2].IsStandardSwizzle() && + !vector_operands[2].is_negated && + !vector_operands[2].is_absolute_value) { return vector_operands[2].storage_index; } return UINT32_MAX; @@ -581,9 +691,8 @@ class Shader { struct ConstantRegisterMap { // Bitmap of all kConstantFloat registers read by the shader. // Any shader can only read up to 256 of the 512, and the base is dependent - // on the shader type. Each bit corresponds to a storage index from the type - // base, so bit 0 in a vertex shader is register 0, and bit 0 in a fragment - // shader is register 256. + // on the shader type and SQ_VS/PS_CONST registers. Each bit corresponds to + // a storage index from the type base. uint64_t float_bitmap[256 / 64]; // Bitmap of all loop constants read by the shader. // Each bit corresponds to a storage index [0-31]. @@ -595,8 +704,33 @@ class Shader { // Total number of kConstantFloat registers read by the shader. uint32_t float_count; - // Computed byte count of all registers required when packed. - uint32_t packed_byte_length; + // Whether kConstantFloat registers are indexed dynamically - in this case, + // float_bitmap must be set to all 1, and tight packing must not be done. + bool float_dynamic_addressing; + + // Returns the index of the float4 constant as if all float4 constant + // registers actually referenced were tightly packed in a buffer, or + // UINT32_MAX if not found. + uint32_t GetPackedFloatConstantIndex(uint32_t float_constant) const { + if (float_constant >= 256) { + return UINT32_MAX; + } + if (float_dynamic_addressing) { + // Any can potentially be read - not packing. + return float_constant; + } + uint32_t block_index = float_constant / 64; + uint32_t bit_index = float_constant % 64; + if (!(float_bitmap[block_index] & (uint64_t(1) << bit_index))) { + return UINT32_MAX; + } + uint32_t offset = 0; + for (uint32_t i = 0; i < block_index; ++i) { + offset += xe::bit_count(float_bitmap[i]); + } + return offset + xe::bit_count(float_bitmap[block_index] & + ((uint64_t(1) << bit_index) - 1)); + } }; Shader(ShaderType shader_type, uint64_t ucode_data_hash, @@ -642,7 +776,9 @@ class Shader { } // Returns true if the shader writes to the given color target index [0-3]. - bool writes_color_target(int i) const { return writes_color_targets_[i]; } + bool writes_color_target(uint32_t i) const { + return writes_color_targets_[i]; + } // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index bce8af1ab..d8efbc4d0 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -131,9 +131,8 @@ bool ShaderTranslator::TranslateInternal( ucode_dwords_ = shader->ucode_dwords(); ucode_dword_count_ = shader->ucode_dword_count(); - // Run through and gather all binding information and to check whether - // registers are dynamically addressed.
- // Translators may need this before they start codegen. + // Run through and gather all binding, operand addressing and export + // information. Translators may need this before they start codegen. uint32_t max_cf_dword_index = static_cast<uint32_t>(ucode_dword_count_); for (uint32_t i = 0; i < max_cf_dword_index; i += 3) { ControlFlowInstruction cf_a; @@ -151,10 +150,27 @@ bool ShaderTranslator::TranslateInternal( GatherInstructionInformation(cf_a); GatherInstructionInformation(cf_b); } + + if (constant_register_map_.float_dynamic_addressing) { + // All potentially can be referenced. + constant_register_map_.float_count = 256; + memset(constant_register_map_.float_bitmap, UINT8_MAX, + sizeof(constant_register_map_.float_bitmap)); + } else { + constant_register_map_.float_count = 0; + for (int i = 0; i < 4; ++i) { + // Each bit indicates a vec4 (4 floats). + constant_register_map_.float_count += + xe::bit_count(constant_register_map_.float_bitmap[i]); + } + } + // Cleanup invalid/unneeded memexport allocs. for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!memexport_eM_written_[i]) { - memexport_eA_written_ &= ~(1u << i); + if (!(memexport_eA_written_ & (uint32_t(1) << i))) { + memexport_eM_written_[i] = 0; + } else if (!memexport_eM_written_[i]) { + memexport_eA_written_ &= ~(uint32_t(1) << i); } } if (memexport_eA_written_ == 0) { @@ -171,27 +187,6 @@ bool ShaderTranslator::TranslateInternal( TranslateBlocks(); - // Compute total number of float registers and total bytes used by the - // register map. This saves us work later when we need to pack them. - constant_register_map_.packed_byte_length = 0; - constant_register_map_.float_count = 0; - for (int i = 0; i < 4; ++i) { - // Each bit indicates a vec4 (4 floats). - constant_register_map_.float_count += - xe::bit_count(constant_register_map_.float_bitmap[i]); - } - constant_register_map_.packed_byte_length += - 4 * 4 * constant_register_map_.float_count; - // Each bit indicates a single word. - constant_register_map_.packed_byte_length += - 4 * xe::bit_count(constant_register_map_.loop_bitmap); - // Direct map between words and words we upload.
- for (int i = 0; i < 8; ++i) { - if (constant_register_map_.bool_bitmap[i]) { - constant_register_map_.packed_byte_length += 4; - } - } - shader->errors_ = std::move(errors_); shader->translated_binary_ = CompleteTranslation(); shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string(); @@ -267,6 +262,43 @@ void ShaderTranslator::EmitUnimplementedTranslationError() { void ShaderTranslator::GatherInstructionInformation( const ControlFlowInstruction& cf) { + uint32_t bool_constant_index = UINT32_MAX; + switch (cf.opcode()) { + case ControlFlowOpcode::kCondExec: + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredClean: + case ControlFlowOpcode::kCondExecPredCleanEnd: + bool_constant_index = cf.cond_exec.bool_address(); + break; + case ControlFlowOpcode::kCondCall: + if (!cf.cond_call.is_unconditional() && !cf.cond_call.is_predicated()) { + bool_constant_index = cf.cond_call.bool_address(); + } + break; + case ControlFlowOpcode::kCondJmp: + if (!cf.cond_jmp.is_unconditional() && !cf.cond_jmp.is_predicated()) { + bool_constant_index = cf.cond_jmp.bool_address(); + } + break; + case ControlFlowOpcode::kLoopStart: + constant_register_map_.loop_bitmap |= uint32_t(1) + << cf.loop_start.loop_id(); + break; + case ControlFlowOpcode::kLoopEnd: + constant_register_map_.loop_bitmap |= uint32_t(1) + << cf.loop_end.loop_id(); + break; + case ControlFlowOpcode::kAlloc: + if (cf.alloc.alloc_type() == AllocType::kMemory) { + ++memexport_alloc_count_; + } + break; + } + if (bool_constant_index != UINT32_MAX) { + constant_register_map_.bool_bitmap[bool_constant_index / 32] |= + uint32_t(1) << (bool_constant_index % 32); + } + switch (cf.opcode()) { case ControlFlowOpcode::kExec: case ControlFlowOpcode::kExecEnd: @@ -296,99 +328,128 @@ void ShaderTranslator::GatherInstructionInformation( ucode_dwords_ + instr_offset * 3)); } } else { - // Gather up color targets written to, and check if using dynamic - // register indices. + // Gather info needed for the translation pass because having such + // state changed in the middle of translation may break things. Check + // the comments for each specific variable set here to see usage + // restrictions that can be assumed here (such as only marking exports + // as written if the used write mask is non-empty). 
auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ + instr_offset * 3); - if (op.has_vector_op()) { - const auto& opcode_info = - alu_vector_opcode_infos_[static_cast<int>(op.vector_opcode())]; - implicit_early_z_allowed_ &= !opcode_info.disable_implicit_early_z; - for (size_t i = 0; i < opcode_info.argument_count; ++i) { - if (op.src_is_temp(i + 1) && (op.src_reg(i + 1) & 0x40)) { - uses_register_dynamic_addressing_ = true; - } - } - if (op.is_export()) { - if (is_pixel_shader()) { - if (op.vector_dest() <= 3) { - writes_color_targets_[op.vector_dest()] = true; - } else if (op.vector_dest() == 61) { + ParsedAluInstruction instr; + ParseAluInstruction(op, instr); + + const auto& vector_opcode_info = + alu_vector_opcode_infos_[uint32_t(op.vector_opcode())]; + implicit_early_z_allowed_ &= + !vector_opcode_info.disable_implicit_early_z; + const auto& scalar_opcode_info = + alu_scalar_opcode_infos_[uint32_t(op.scalar_opcode())]; + implicit_early_z_allowed_ &= + !scalar_opcode_info.disable_implicit_early_z; + + if (instr.vector_and_constant_result.storage_target != + InstructionStorageTarget::kRegister || + instr.scalar_result.storage_target != + InstructionStorageTarget::kRegister) { + // Export is done to vector_dest of the ucode instruction for both + // vector and scalar operations - no need to check separately. + assert_true(instr.vector_and_constant_result.storage_target == + instr.scalar_result.storage_target && + instr.vector_and_constant_result.storage_index == + instr.scalar_result.storage_index); + if (instr.vector_and_constant_result.GetUsedWriteMask() || + instr.scalar_result.GetUsedWriteMask()) { + InstructionStorageTarget export_target = + instr.vector_and_constant_result.storage_target; + uint32_t export_index = + instr.vector_and_constant_result.storage_index; + switch (export_target) { + case InstructionStorageTarget::kExportAddress: + // Store used memexport constants because CPU code needs + // addresses and sizes, and also whether there have been + // writes to eA and eM# for register allocation in shader + // translator implementations. + // eA is (hopefully) always written to using: + // mad eA, r#, const0100, c# + // (though there are some exceptions, shaders in Halo 3 for + // some reason set eA to zeros, but the swizzle of the + // constant is not .xyzw in this case, and they don't write to + // eM#).
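+ // (const0100 presumably holds (0.0f, 1.0f, 0.0f, 0.0f), so the mad adds only r#.y - the element index - to the stream constant c#.)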
+ if (memexport_alloc_count_ > 0 && + memexport_alloc_count_ <= kMaxMemExports) { + uint32_t memexport_stream_constant = + instr.GetMemExportStreamConstant(); + if (memexport_stream_constant != UINT32_MAX) { + memexport_eA_written_ |= uint32_t(1) + << (memexport_alloc_count_ - 1); + memexport_stream_constants_.insert( + memexport_stream_constant); + } else { + XELOGE( + "ShaderTranslator::GatherInstructionInformation: " + "Couldn't extract memexport stream constant index"); + } + } + break; + case InstructionStorageTarget::kExportData: + if (memexport_alloc_count_ > 0 && + memexport_alloc_count_ <= kMaxMemExports) { + memexport_eM_written_[memexport_alloc_count_ - 1] |= + uint32_t(1) << export_index; + } + break; + case InstructionStorageTarget::kColor: + writes_color_targets_[export_index] = true; + break; + case InstructionStorageTarget::kDepth: writes_depth_ = true; implicit_early_z_allowed_ = false; - } - } - if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports) { - // Store used memexport constants because CPU code needs - // addresses and sizes, and also whether there have been writes - // to eA and eM# for register allocation in shader translator - // implementations. - // eA is (hopefully) always written to using: - // mad eA, r#, const0100, c# - // (though there are some exceptions, shaders in Halo 3 for some - // reason set eA to zeros, but the swizzle of the constant is - // not .xyzw in this case, and they don't write to eM#). - uint32_t memexport_alloc_index = memexport_alloc_count_ - 1; - if (op.vector_dest() == 32 && - op.vector_opcode() == AluVectorOpcode::kMad && - op.vector_write_mask() == 0b1111 && !op.src_is_temp(3) && - op.src_swizzle(3) == 0) { - memexport_eA_written_ |= 1u << memexport_alloc_index; - memexport_stream_constants_.insert(op.src_reg(3)); - } else if (op.vector_dest() >= 33 && op.vector_dest() <= 37) { - if (memexport_eA_written_ & (1u << memexport_alloc_index)) { - memexport_eM_written_[memexport_alloc_index] |= - 1 << (op.vector_dest() - 33); - } - } - } - } else { - if (op.is_vector_dest_relative()) { - uses_register_dynamic_addressing_ = true; + break; + default: + break; } } - } - if (op.has_scalar_op()) { - const auto& opcode_info = - alu_scalar_opcode_infos_[static_cast<int>(op.scalar_opcode())]; - implicit_early_z_allowed_ &= !opcode_info.disable_implicit_early_z; - if (opcode_info.argument_count == 1 && op.src_is_temp(3) && - (op.src_reg(3) & 0x40)) { + } else { + if ((instr.vector_and_constant_result.GetUsedWriteMask() && + instr.vector_and_constant_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) || + (instr.scalar_result.GetUsedWriteMask() && + instr.scalar_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic)) { uses_register_dynamic_addressing_ = true; } - if (op.is_export()) { - if (is_pixel_shader()) { - if (op.scalar_dest() <= 3) { - writes_color_targets_[op.scalar_dest()] = true; - } else if (op.scalar_dest() == 61) { - writes_depth_ = true; - implicit_early_z_allowed_ = false; - } - } - if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports && - op.scalar_dest() >= 33 && op.scalar_dest() <= 37) { - uint32_t memexport_alloc_index = memexport_alloc_count_ - 1; - if (memexport_eA_written_ & (1u << memexport_alloc_index)) { - memexport_eM_written_[memexport_alloc_index] |= - 1 << (op.scalar_dest() - 33); - } - } - } else { - if (op.is_scalar_dest_relative()) { + } + + uint32_t total_operand_count = + instr.vector_operand_count + 
instr.scalar_operand_count; + for (uint32_t i = 0; i < total_operand_count; ++i) { + const InstructionOperand& operand = + (i < instr.vector_operand_count) + ? instr.vector_operands[i] + : instr.scalar_operands[i - instr.vector_operand_count]; + if (operand.storage_source == InstructionStorageSource::kRegister) { + if (operand.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + uses_register_dynamic_addressing_ = true; + } + } else if (operand.storage_source == + InstructionStorageSource::kConstantFloat) { + if (operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + // Store used float constants before translating so the + // translator can use tightly packed indices if not dynamically + // indexed. + uint32_t constant_index = operand.storage_index; + constant_register_map_.float_bitmap[constant_index / 64] |= + uint64_t(1) << (constant_index % 64); + } else { + constant_register_map_.float_dynamic_addressing = true; + } + } + } } } break; - case ControlFlowOpcode::kAlloc: - if (cf.alloc.alloc_type() == AllocType::kMemory) { - ++memexport_alloc_count_; - } - break; default: break; } @@ -674,8 +735,9 @@ void ShaderTranslator::TranslateControlFlowCondExec( i.instruction_count = cf.count(); i.type = ParsedExecInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); switch (cf.opcode()) { case ControlFlowOpcode::kCondExec: @@ -715,7 +777,8 @@ void ShaderTranslator::TranslateControlFlowLoopStart( ParsedLoopStartInstruction i; i.dword_index = cf_index_; i.loop_constant_index = cf.loop_id(); - constant_register_map_.loop_bitmap |= 1 << i.loop_constant_index; + assert_not_zero(constant_register_map_.loop_bitmap & + (uint32_t(1) << i.loop_constant_index)); i.is_repeat = cf.is_repeat(); i.loop_skip_address = cf.address(); @@ -731,7 +794,8 @@ void ShaderTranslator::TranslateControlFlowLoopEnd( i.is_predicated_break = cf.is_predicated_break(); i.predicate_condition = cf.condition(); i.loop_constant_index = cf.loop_id(); - constant_register_map_.loop_bitmap |= 1 << i.loop_constant_index; + assert_not_zero(constant_register_map_.loop_bitmap & + (uint32_t(1) << i.loop_constant_index)); i.loop_body_address = cf.address(); i.Disassemble(&ucode_disasm_buffer_); @@ -752,8 +816,9 @@ void ShaderTranslator::TranslateControlFlowCondCall( } else { i.type = ParsedCallInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); } @@ -785,8 +850,9 @@ void ShaderTranslator::TranslateControlFlowCondJmp( } else { i.type = ParsedJumpInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); } @@ -852,23 +918,25 @@ void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle, bool is_relative, InstructionResult* out_result) {
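// Each 3-bit field of the fetch swizzle selects a source component (0-3 = x/y/z/w), writes constant 0 (4 or 6) or constant 1 (5), or, with 7, masks the component out of the write.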
out_result->storage_target = InstructionStorageTarget::kRegister; out_result->storage_index = dest; - out_result->is_export = false; out_result->is_clamped = false; out_result->storage_addressing_mode = is_relative ? InstructionStorageAddressingMode::kAddressRelative : InstructionStorageAddressingMode::kStatic; + out_result->original_write_mask = 0b1111; for (int i = 0; i < 4; ++i) { - out_result->write_mask[i] = true; - if ((swizzle & 0x7) == 4) { - out_result->components[i] = SwizzleSource::k0; - } else if ((swizzle & 0x7) == 5) { - out_result->components[i] = SwizzleSource::k1; - } else if ((swizzle & 0x7) == 6) { - out_result->components[i] = SwizzleSource::k0; - } else if ((swizzle & 0x7) == 7) { - out_result->write_mask[i] = false; - } else { - out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); + switch (swizzle & 0x7) { + case 4: + case 6: + out_result->components[i] = SwizzleSource::k0; + break; + case 5: + out_result->components[i] = SwizzleSource::k1; + break; + case 7: + out_result->original_write_mask &= ~uint32_t(1 << i); + break; + default: + out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); } swizzle >>= 3; } @@ -885,7 +953,6 @@ void ShaderTranslator::TranslateVertexFetchInstruction( void ShaderTranslator::ParseVertexFetchInstruction( const VertexFetchInstruction& op, ParsedVertexFetchInstruction* out_instr) { auto& i = *out_instr; - i.dword_index = 0; i.opcode = FetchOpcode::kVertexFetch; i.opcode_name = op.is_mini_fetch() ? "vfetch_mini" : "vfetch_full"; i.is_mini_fetch = op.is_mini_fetch(); @@ -908,7 +975,7 @@ void ShaderTranslator::ParseVertexFetchInstruction( src_op.is_absolute_value = false; src_op.component_count = 1; uint32_t swizzle = full_op.src_swizzle(); - for (int j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { + for (uint32_t j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3); } @@ -947,7 +1014,7 @@ void ShaderTranslator::ParseTextureFetchInstruction( bool has_dest; bool has_const; bool has_attributes; - int override_component_count; + uint32_t override_component_count; } opcode_info; switch (op.opcode()) { case FetchOpcode::kTextureFetch: { @@ -993,7 +1060,6 @@ void ShaderTranslator::ParseTextureFetchInstruction( } auto& i = *out_instr; - i.dword_index = 0; i.opcode = op.opcode(); i.opcode_name = opcode_info.name; i.dimension = op.dimension(); @@ -1020,7 +1086,7 @@ void ShaderTranslator::ParseTextureFetchInstruction( ? 
opcode_info.override_component_count : GetTextureDimensionComponentCount(op.dimension()); uint32_t swizzle = op.src_swizzle(); - for (int j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { + for (uint32_t j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3); } @@ -1118,7 +1184,7 @@ const ShaderTranslator::AluOpcodeInfo {"setp_ge", 1, 1, false}, // 30 {"setp_inv", 1, 1, false}, // 31 {"setp_pop", 1, 1, false}, // 32 - {"setp_clr", 1, 1, false}, // 33 + {"setp_clr", 0, 0, false}, // 33 {"setp_rstr", 1, 1, false}, // 34 {"kills_eq", 1, 1, true}, // 35 {"kills_gt", 1, 1, true}, // 36 @@ -1135,28 +1201,164 @@ const ShaderTranslator::AluOpcodeInfo {"subsc", 2, 1, false}, // 47 {"sin", 1, 1, false}, // 48 {"cos", 1, 1, false}, // 49 - {"retain_prev", 1, 1, false}, // 50 + {"retain_prev", 0, 0, false}, // 50 }; void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { ParsedAluInstruction instr; - - instr.dword_index = 0; - - instr.is_predicated = op.is_predicated(); - instr.predicate_condition = op.predicate_condition(); - - ParseAluVectorOperation(op, instr); - ParseAluScalarOperation(op, instr); - + ParseAluInstruction(op, instr); instr.Disassemble(&ucode_disasm_buffer_); - ProcessAluInstruction(instr); } -void ParseAluInstructionOperand(const AluInstruction& op, int i, - int swizzle_component_count, - InstructionOperand* out_op) { +void ShaderTranslator::ParseAluInstruction(const AluInstruction& op, + ParsedAluInstruction& instr) const { + instr.is_predicated = op.is_predicated(); + instr.predicate_condition = op.predicate_condition(); + + bool is_export = op.is_export(); + + InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister; + uint32_t storage_index_export = 0; + if (is_export) { + storage_target = InstructionStorageTarget::kNone; + // Both vector and scalar operation export to vector_dest. 
+ ExportRegister export_register = ExportRegister(op.vector_dest()); + if (export_register == ExportRegister::kExportAddress) { + storage_target = InstructionStorageTarget::kExportAddress; + } else if (export_register >= ExportRegister::kExportData0 && + export_register <= ExportRegister::kExportData4) { + storage_target = InstructionStorageTarget::kExportData; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kExportData0); + } else if (is_vertex_shader()) { + if (export_register >= ExportRegister::kVSInterpolator0 && + export_register <= ExportRegister::kVSInterpolator15) { + storage_target = InstructionStorageTarget::kInterpolator; + storage_index_export = uint32_t(export_register) - + uint32_t(ExportRegister::kVSInterpolator0); + } else if (export_register == ExportRegister::kVSPosition) { + storage_target = InstructionStorageTarget::kPosition; + } else if (export_register == + ExportRegister::kVSPointSizeEdgeFlagKillVertex) { + storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; + } + } else if (is_pixel_shader()) { + if (export_register >= ExportRegister::kPSColor0 && + export_register <= ExportRegister::kPSColor3) { + storage_target = InstructionStorageTarget::kColor; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0); + } else if (export_register == ExportRegister::kPSDepth) { + storage_target = InstructionStorageTarget::kDepth; + } + } + if (storage_target == InstructionStorageTarget::kNone) { + assert_always(); + XELOGE( + "ShaderTranslator::ParseAluInstruction: Unsupported write to export " + "{}", + uint32_t(export_register)); + } + } + + // Vector operation and constant 0/1 writes. + + instr.vector_opcode = op.vector_opcode(); + const auto& vector_opcode_info = + alu_vector_opcode_infos_[uint32_t(instr.vector_opcode)]; + instr.vector_opcode_name = vector_opcode_info.name; + + instr.vector_and_constant_result.storage_target = storage_target; + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.vector_and_constant_result.storage_index = storage_index_export; + } else { + instr.vector_and_constant_result.storage_index = op.vector_dest(); + assert_true(op.vector_dest() < register_count()); + if (op.is_vector_dest_relative()) { + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.vector_and_constant_result.is_clamped = op.vector_clamp(); + uint32_t constant_0_mask = op.GetConstant0WriteMask(); + uint32_t constant_1_mask = op.GetConstant1WriteMask(); + instr.vector_and_constant_result.original_write_mask = + op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask; + for (uint32_t i = 0; i < 4; ++i) { + SwizzleSource component = GetSwizzleFromComponentIndex(i); + if (constant_0_mask & (1 << i)) { + component = SwizzleSource::k0; + } else if (constant_1_mask & (1 << i)) { + component = SwizzleSource::k1; + } + instr.vector_and_constant_result.components[i] = component; + } + + instr.vector_operand_count = vector_opcode_info.argument_count; + for (uint32_t i = 0; i < instr.vector_operand_count; ++i) { + InstructionOperand& vector_operand = instr.vector_operands[i]; + ParseAluInstructionOperand(op, i + 1, + vector_opcode_info.src_swizzle_component_count, + vector_operand); + } + + // Scalar operation. 
+ + instr.scalar_opcode = op.scalar_opcode(); + const auto& scalar_opcode_info = + alu_scalar_opcode_infos_[uint32_t(instr.scalar_opcode)]; + instr.scalar_opcode_name = scalar_opcode_info.name; + + instr.scalar_result.storage_target = storage_target; + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.scalar_result.storage_index = storage_index_export; + } else { + instr.scalar_result.storage_index = op.scalar_dest(); + assert_true(op.scalar_dest() < register_count()); + if (op.is_scalar_dest_relative()) { + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.scalar_result.is_clamped = op.scalar_clamp(); + instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask(); + for (uint32_t i = 0; i < 4; ++i) { + instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i); + } + + instr.scalar_operand_count = scalar_opcode_info.argument_count; + if (instr.scalar_operand_count) { + if (instr.scalar_operand_count == 1) { + ParseAluInstructionOperand(op, 3, + scalar_opcode_info.src_swizzle_component_count, + instr.scalar_operands[0]); + } else { + uint32_t src3_swizzle = op.src_swizzle(3); + uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3; + uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3; + uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | + (static_cast(op.scalar_opcode()) & 1); + int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0; + + ParseAluInstructionOperandSpecial( + op, InstructionStorageSource::kConstantFloat, op.src_reg(3), + op.src_negate(3), 0, component_a, instr.scalar_operands[0]); + + ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, + reg2, op.src_negate(3), const_slot, + component_b, instr.scalar_operands[1]); + } + } +} + +void ShaderTranslator::ParseAluInstructionOperand( + const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count, + InstructionOperand& out_op) { int const_slot = 0; switch (i) { case 2: @@ -1166,393 +1368,80 @@ void ParseAluInstructionOperand(const AluInstruction& op, int i, const_slot = op.src_is_temp(1) && op.src_is_temp(2) ? 0 : 1; break; } - out_op->is_negated = op.src_negate(i); + out_op.is_negated = op.src_negate(i); uint32_t reg = op.src_reg(i); if (op.src_is_temp(i)) { - out_op->storage_source = InstructionStorageSource::kRegister; - out_op->storage_index = reg & 0x1F; - out_op->is_absolute_value = (reg & 0x80) == 0x80; - out_op->storage_addressing_mode = + out_op.storage_source = InstructionStorageSource::kRegister; + out_op.storage_index = reg & 0x1F; + out_op.is_absolute_value = (reg & 0x80) == 0x80; + out_op.storage_addressing_mode = (reg & 0x40) ? 
InstructionStorageAddressingMode::kAddressRelative : InstructionStorageAddressingMode::kStatic; } else { - out_op->storage_source = InstructionStorageSource::kConstantFloat; - out_op->storage_index = reg; + out_op.storage_source = InstructionStorageSource::kConstantFloat; + out_op.storage_index = reg; if ((const_slot == 0 && op.is_const_0_addressed()) || (const_slot == 1 && op.is_const_1_addressed())) { if (op.is_address_relative()) { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressAbsolute; } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressRelative; } } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kStatic; } - out_op->is_absolute_value = op.abs_constants(); + out_op.is_absolute_value = op.abs_constants(); } - out_op->component_count = swizzle_component_count; + out_op.component_count = swizzle_component_count; uint32_t swizzle = op.src_swizzle(i); if (swizzle_component_count == 1) { uint32_t a = ((swizzle >> 6) + 3) & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); + out_op.components[0] = GetSwizzleFromComponentIndex(a); } else if (swizzle_component_count == 2) { uint32_t a = ((swizzle >> 6) + 3) & 0x3; uint32_t b = ((swizzle >> 0) + 0) & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); - out_op->components[1] = GetSwizzleFromComponentIndex(b); + out_op.components[0] = GetSwizzleFromComponentIndex(a); + out_op.components[1] = GetSwizzleFromComponentIndex(b); } else if (swizzle_component_count == 3) { assert_always(); } else if (swizzle_component_count == 4) { - for (int j = 0; j < swizzle_component_count; ++j, swizzle >>= 2) { - out_op->components[j] = GetSwizzleFromComponentIndex((swizzle + j) & 0x3); + for (uint32_t j = 0; j < swizzle_component_count; ++j, swizzle >>= 2) { + out_op.components[j] = GetSwizzleFromComponentIndex((swizzle + j) & 0x3); } } } -void ParseAluInstructionOperandSpecial(const AluInstruction& op, - InstructionStorageSource storage_source, - uint32_t reg, bool negate, - int const_slot, uint32_t swizzle, - InstructionOperand* out_op) { - out_op->is_negated = negate; - out_op->is_absolute_value = op.abs_constants(); - out_op->storage_source = storage_source; +void ShaderTranslator::ParseAluInstructionOperandSpecial( + const AluInstruction& op, InstructionStorageSource storage_source, + uint32_t reg, bool negate, int const_slot, uint32_t component_index, + InstructionOperand& out_op) { + out_op.is_negated = negate; + out_op.is_absolute_value = op.abs_constants(); + out_op.storage_source = storage_source; if (storage_source == InstructionStorageSource::kRegister) { - out_op->storage_index = reg & 0x7F; - out_op->storage_addressing_mode = InstructionStorageAddressingMode::kStatic; + out_op.storage_index = reg & 0x7F; + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kStatic; } else { - out_op->storage_index = reg; + out_op.storage_index = reg; if ((const_slot == 0 && op.is_const_0_addressed()) || (const_slot == 1 && op.is_const_1_addressed())) { if (op.is_address_relative()) { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressAbsolute; } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressRelative; } } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = 
InstructionStorageAddressingMode::kStatic; } } - out_op->component_count = 1; - uint32_t a = swizzle & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); -} - -void ShaderTranslator::ParseAluVectorOperation(const AluInstruction& op, - ParsedAluInstruction& i) { - i.has_vector_op = op.has_vector_op(); - if (!i.has_vector_op) { - return; - } - i.vector_opcode = op.vector_opcode(); - const auto& opcode_info = - alu_vector_opcode_infos_[static_cast(op.vector_opcode())]; - i.vector_opcode_name = opcode_info.name; - - i.vector_result.is_export = op.is_export(); - i.vector_result.is_clamped = op.vector_clamp(); - i.vector_result.storage_target = InstructionStorageTarget::kRegister; - i.vector_result.storage_index = 0; - uint32_t dest_num = op.vector_dest(); - if (!op.is_export()) { - assert_true(dest_num < 32); - i.vector_result.storage_target = InstructionStorageTarget::kRegister; - i.vector_result.storage_index = dest_num; - i.vector_result.storage_addressing_mode = - op.is_vector_dest_relative() - ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; - } else if (is_vertex_shader()) { - switch (dest_num) { - case 32: - i.vector_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.vector_result.storage_index = dest_num - 33; - i.vector_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 62: - i.vector_result.storage_target = InstructionStorageTarget::kPosition; - break; - case 63: - i.vector_result.storage_target = - InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; - break; - default: - if (dest_num < 16) { - i.vector_result.storage_target = - InstructionStorageTarget::kInterpolant; - i.vector_result.storage_index = dest_num; - } else { - // Unimplemented. - // assert_always(); - XELOGE( - "ShaderTranslator::ParseAluVectorOperation: Unsupported write to " - "export {}", - dest_num); - i.vector_result.storage_target = InstructionStorageTarget::kNone; - i.vector_result.storage_index = 0; - } - break; - } - } else if (is_pixel_shader()) { - switch (dest_num) { - case 0: - case 63: // ? masked? 
- i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 0; - break; - case 1: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 1; - break; - case 2: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 2; - break; - case 3: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 3; - break; - case 32: - i.vector_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.vector_result.storage_index = dest_num - 33; - i.vector_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 61: - i.vector_result.storage_target = InstructionStorageTarget::kDepth; - break; - default: - XELOGE( - "ShaderTranslator::ParseAluVectorOperation: Unsupported write to " - "export {}", - dest_num); - i.vector_result.storage_target = InstructionStorageTarget::kNone; - i.vector_result.storage_index = 0; - } - } - if (op.is_export()) { - uint32_t write_mask = op.vector_write_mask(); - uint32_t const_1_mask = op.scalar_write_mask(); - if (!write_mask) { - for (int j = 0; j < 4; ++j) { - i.vector_result.write_mask[j] = false; - } - } else { - for (int j = 0; j < 4; ++j, write_mask >>= 1, const_1_mask >>= 1) { - i.vector_result.write_mask[j] = true; - if (write_mask & 0x1) { - if (const_1_mask & 0x1) { - i.vector_result.components[j] = SwizzleSource::k1; - } else { - i.vector_result.components[j] = GetSwizzleFromComponentIndex(j); - } - } else { - if (op.is_scalar_dest_relative()) { - i.vector_result.components[j] = SwizzleSource::k0; - } else { - i.vector_result.write_mask[j] = false; - } - } - } - } - } else { - uint32_t write_mask = op.vector_write_mask(); - for (int j = 0; j < 4; ++j, write_mask >>= 1) { - i.vector_result.write_mask[j] = (write_mask & 0x1) == 0x1; - i.vector_result.components[j] = GetSwizzleFromComponentIndex(j); - } - } - - i.vector_operand_count = opcode_info.argument_count; - for (int j = 0; j < i.vector_operand_count; ++j) { - ParseAluInstructionOperand(op, j + 1, - opcode_info.src_swizzle_component_count, - &i.vector_operands[j]); - - // Track constant float register loads. - if (i.vector_operands[j].storage_source == - InstructionStorageSource::kConstantFloat) { - if (i.vector_operands[j].storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { - // Dynamic addressing makes all constants required. 
- std::memset(constant_register_map_.float_bitmap, 0xFF, - sizeof(constant_register_map_.float_bitmap)); - } else { - auto register_index = i.vector_operands[j].storage_index; - constant_register_map_.float_bitmap[register_index / 64] |= - 1ull << (register_index % 64); - } - } - } -} - -void ShaderTranslator::ParseAluScalarOperation(const AluInstruction& op, - ParsedAluInstruction& i) { - i.has_scalar_op = op.has_scalar_op(); - if (!i.has_scalar_op) { - return; - } - i.scalar_opcode = op.scalar_opcode(); - const auto& opcode_info = - alu_scalar_opcode_infos_[static_cast(op.scalar_opcode())]; - i.scalar_opcode_name = opcode_info.name; - - uint32_t dest_num; - uint32_t write_mask; - if (op.is_export()) { - dest_num = op.vector_dest(); - write_mask = op.scalar_write_mask() & ~op.vector_write_mask(); - } else { - dest_num = op.scalar_dest(); - write_mask = op.scalar_write_mask(); - } - i.scalar_result.is_export = op.is_export(); - i.scalar_result.is_clamped = op.scalar_clamp(); - i.scalar_result.storage_target = InstructionStorageTarget::kRegister; - i.scalar_result.storage_index = 0; - if (!op.is_export()) { - assert_true(dest_num < 32); - i.scalar_result.storage_target = InstructionStorageTarget::kRegister; - i.scalar_result.storage_index = dest_num; - i.scalar_result.storage_addressing_mode = - op.is_scalar_dest_relative() - ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; - } else if (is_vertex_shader()) { - switch (dest_num) { - case 32: - i.scalar_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.scalar_result.storage_index = dest_num - 33; - i.scalar_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 62: - i.scalar_result.storage_target = InstructionStorageTarget::kPosition; - break; - case 63: - i.scalar_result.storage_target = - InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; - break; - default: - if (dest_num < 16) { - i.scalar_result.storage_target = - InstructionStorageTarget::kInterpolant; - i.scalar_result.storage_index = dest_num; - } else { - // Unimplemented. - // assert_always(); - XELOGE( - "ShaderTranslator::ParseAluScalarOperation: Unsupported write to " - "export {}", - dest_num); - i.scalar_result.storage_target = InstructionStorageTarget::kNone; - i.scalar_result.storage_index = 0; - } - break; - } - } else if (is_pixel_shader()) { - switch (dest_num) { - case 0: - case 63: // ? masked? 
- i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 0; - break; - case 1: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 1; - break; - case 2: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 2; - break; - case 3: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 3; - break; - case 32: - i.scalar_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.scalar_result.storage_index = dest_num - 33; - i.scalar_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 61: - i.scalar_result.storage_target = InstructionStorageTarget::kDepth; - break; - } - } - for (int j = 0; j < 4; ++j, write_mask >>= 1) { - i.scalar_result.write_mask[j] = (write_mask & 0x1) == 0x1; - i.scalar_result.components[j] = GetSwizzleFromComponentIndex(j); - } - - i.scalar_operand_count = opcode_info.argument_count; - if (opcode_info.argument_count == 1) { - ParseAluInstructionOperand(op, 3, opcode_info.src_swizzle_component_count, - &i.scalar_operands[0]); - } else { - uint32_t src3_swizzle = op.src_swizzle(3); - uint32_t swiz_a = ((src3_swizzle >> 6) + 3) & 0x3; - uint32_t swiz_b = ((src3_swizzle >> 0) + 0) & 0x3; - uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | - (static_cast(op.scalar_opcode()) & 1); - - int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0; - - ParseAluInstructionOperandSpecial( - op, InstructionStorageSource::kConstantFloat, op.src_reg(3), - op.src_negate(3), 0, swiz_a, &i.scalar_operands[0]); - - ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, - reg2, op.src_negate(3), const_slot, - swiz_b, &i.scalar_operands[1]); - } - - // Track constant float register loads - in either case, a float constant may - // be used in operand 0. - if (i.scalar_operands[0].storage_source == - InstructionStorageSource::kConstantFloat) { - auto register_index = i.scalar_operands[0].storage_index; - if (i.scalar_operands[0].storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { - // Dynamic addressing makes all constants required. - std::memset(constant_register_map_.float_bitmap, 0xFF, - sizeof(constant_register_map_.float_bitmap)); - } else { - constant_register_map_.float_bitmap[register_index / 64] |= - 1ull << (register_index % 64); - } - } + out_op.component_count = 1; + out_op.components[0] = GetSwizzleFromComponentIndex(component_index); } } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index a41253669..300e00b48 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -57,15 +57,19 @@ class ShaderTranslator { } // True if the current shader is a pixel shader. bool is_pixel_shader() const { return shader_type_ == ShaderType::kPixel; } + // Used constant register info, populated before translation. const Shader::ConstantRegisterMap& constant_register_map() const { return constant_register_map_; } // True if the current shader addresses general-purpose registers with dynamic - // indices. + // indices, set before translation. Doesn't include writes to r[#+a#] with an + // empty used write mask. 
bool uses_register_dynamic_addressing() const {
     return uses_register_dynamic_addressing_;
   }
-  // True if the current shader writes to a color target on any execution path.
+  // True if the current shader writes to a color target on any execution path,
+  // set before translation. Doesn't include writes with an empty used write
+  // mask.
   bool writes_color_target(int i) const { return writes_color_targets_[i]; }
   bool writes_any_color_target() const {
     for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
@@ -75,7 +79,8 @@ class ShaderTranslator {
     }
     return false;
   }
-  // True if the current shader overrides the pixel depth.
+  // True if the current shader overrides the pixel depth, set before
+  // translation. Doesn't include writes with an empty used write mask.
   bool writes_depth() const { return writes_depth_; }
   // True if Xenia can automatically enable early depth/stencil for the pixel
   // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
@@ -181,8 +186,8 @@ class ShaderTranslator {
  private:
   struct AluOpcodeInfo {
     const char* name;
-    size_t argument_count;
-    int src_swizzle_component_count;
+    uint32_t argument_count;
+    uint32_t src_swizzle_component_count;
     bool disable_implicit_early_z;
   };
@@ -229,10 +234,16 @@ class ShaderTranslator {
                                    ParsedTextureFetchInstruction* out_instr);
   void TranslateAluInstruction(const ucode::AluInstruction& op);
-  void ParseAluVectorOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
-  void ParseAluScalarOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
+  void ParseAluInstruction(const ucode::AluInstruction& op,
+                           ParsedAluInstruction& out_instr) const;
+  static void ParseAluInstructionOperand(const ucode::AluInstruction& op,
+                                         uint32_t i,
+                                         uint32_t swizzle_component_count,
+                                         InstructionOperand& out_op);
+  static void ParseAluInstructionOperandSpecial(
+      const ucode::AluInstruction& op, InstructionStorageSource storage_source,
+      uint32_t reg, bool negate, int const_slot, uint32_t component_index,
+      InstructionOperand& out_op);

   // Input shader metadata and microcode.
   ShaderType shader_type_;
@@ -265,12 +276,16 @@ class ShaderTranslator {
   uint32_t unique_vertex_bindings_ = 0;
   uint32_t unique_texture_bindings_ = 0;

+  // All of these are gathered before translation.
+  // For writes, uses_register_dynamic_addressing_, writes_color_targets_ and
+  // writes_depth_ don't include empty used write masks.
   Shader::ConstantRegisterMap constant_register_map_ = {0};
   bool uses_register_dynamic_addressing_ = false;
   bool writes_color_targets_[4] = {false, false, false, false};
   bool writes_depth_ = false;
   bool implicit_early_z_allowed_ = true;

+  // Memexport info is gathered before translation.
   uint32_t memexport_alloc_count_ = 0;
   // For register allocation in implementations - what was used after each
   // `alloc export`.
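The compaction that this pre-translation gathering enables is straightforward: when all float constant accesses are static, a constant's position in the compacted buffer is simply the number of used constants below it in float_bitmap, computable with two population counts. Below is a minimal self-contained sketch of such a lookup; the names PopCount64 and GetPackedFloatConstantIndexSketch are illustrative, not part of the patch.

#include <cstdint>

// Counts set bits in a 64-bit word; a portable stand-in for a library
// popcount such as xe::bit_count.
static uint32_t PopCount64(uint64_t v) {
  uint32_t count = 0;
  while (v) {
    v &= v - 1;  // Clears the lowest set bit.
    ++count;
  }
  return count;
}

// Returns the tightly packed index of float constant c[storage_index] given
// a 256-bit usage bitmap stored as four 64-bit words, or UINT32_MAX if the
// constant was never marked as used during gathering.
static uint32_t GetPackedFloatConstantIndexSketch(
    const uint64_t float_bitmap[4], uint32_t storage_index) {
  uint32_t word_index = storage_index >> 6;
  uint64_t bit = uint64_t(1) << (storage_index & 63);
  if (!(float_bitmap[word_index] & bit)) {
    return UINT32_MAX;
  }
  // The packed index is the number of used constants preceding this one.
  uint32_t packed_index = 0;
  for (uint32_t i = 0; i < word_index; ++i) {
    packed_index += PopCount64(float_bitmap[i]);
  }
  return packed_index + PopCount64(float_bitmap[word_index] & (bit - 1));
}

When float_dynamic_addressing is set instead, any c# may be read at runtime, so no compaction is possible and the full constant range has to be bound.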
diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc index 7a0bdf179..2a9536b26 100644 --- a/src/xenia/gpu/shader_translator_disasm.cc +++ b/src/xenia/gpu/shader_translator_disasm.cc @@ -28,7 +28,7 @@ void DisassembleResultOperand(const InstructionResult& result, out->Append('r'); uses_storage_index = true; break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: out->Append('o'); uses_storage_index = true; break; @@ -45,7 +45,7 @@ void DisassembleResultOperand(const InstructionResult& result, out->Append("eM"); uses_storage_index = true; break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: out->Append("oC"); uses_storage_index = true; break; @@ -68,12 +68,19 @@ void DisassembleResultOperand(const InstructionResult& result, break; } } - if (!result.has_any_writes()) { + // Not using GetUsedWriteMask/IsStandardSwizzle because they filter out + // components not having any runtime effect, but those components are still + // present in the microcode. + if (!result.original_write_mask) { out->Append("._"); - } else if (!result.is_standard_swizzle()) { + } else if (result.original_write_mask != 0b1111 || + result.components[0] != SwizzleSource::kX || + result.components[1] != SwizzleSource::kY || + result.components[2] != SwizzleSource::kZ || + result.components[3] != SwizzleSource::kW) { out->Append('.'); for (int i = 0; i < 4; ++i) { - if (result.write_mask[i]) { + if (result.original_write_mask & (1 << i)) { out->Append(GetCharForSwizzle(result.components[i])); } else { out->Append('_'); @@ -116,7 +123,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) { out->AppendFormat("[{}+aL]", op.storage_index); break; } - if (!op.is_standard_swizzle()) { + if (!op.IsStandardSwizzle()) { out->Append('.'); if (op.component_count == 1) { out->Append(GetCharForSwizzle(op.components[0])); @@ -124,7 +131,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) { out->Append(GetCharForSwizzle(op.components[0])); out->Append(GetCharForSwizzle(op.components[1])); } else { - for (int j = 0; j < op.component_count; ++j) { + for (uint32_t j = 0; j < op.component_count; ++j) { out->Append(GetCharForSwizzle(op.components[j])); } } @@ -454,11 +461,19 @@ void ParsedTextureFetchInstruction::Disassemble(StringBuffer* out) const { } void ParsedAluInstruction::Disassemble(StringBuffer* out) const { - if (is_nop()) { - out->Append(" nop\n"); + bool is_vector_op_default_nop = IsVectorOpDefaultNop(); + bool is_scalar_op_default_nop = IsScalarOpDefaultNop(); + if (is_vector_op_default_nop && is_scalar_op_default_nop) { + out->Append(" "); + if (is_predicated) { + out->Append(predicate_condition ? " (p0) " : "(!p0) "); + } else { + out->Append(" "); + } + out->Append("nop\n"); return; } - if (has_vector_op) { + if (!is_vector_op_default_nop) { out->Append(" "); if (is_predicated) { out->Append(predicate_condition ? 
" (p0) " : "(!p0) "); @@ -466,19 +481,19 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const { out->Append(" "); } out->Append(vector_opcode_name); - if (vector_result.is_clamped) { + if (vector_and_constant_result.is_clamped) { out->Append("_sat"); } out->Append(' '); - DisassembleResultOperand(vector_result, out); - for (int i = 0; i < vector_operand_count; ++i) { + DisassembleResultOperand(vector_and_constant_result, out); + for (uint32_t i = 0; i < vector_operand_count; ++i) { out->Append(", "); DisassembleSourceOperand(vector_operands[i], out); } out->Append('\n'); } - if (has_scalar_op) { - out->Append(has_vector_op ? " + " : " "); + if (!is_scalar_op_default_nop) { + out->Append(is_vector_op_default_nop ? " " : " + "); if (is_predicated) { out->Append(predicate_condition ? " (p0) " : "(!p0) "); } else { @@ -490,7 +505,7 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const { } out->Append(' '); DisassembleResultOperand(scalar_result, out); - for (int i = 0; i < scalar_operand_count; ++i) { + for (uint32_t i = 0; i < scalar_operand_count; ++i) { out->Append(", "); DisassembleSourceOperand(scalar_operands[i], out); } diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 5680f9eca..bd8f2217e 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -2003,7 +2003,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( void SpirvShaderTranslator::ProcessAluInstruction( const ParsedAluInstruction& instr) { - if (instr.is_nop()) { + if (instr.IsNop()) { return; } @@ -2044,7 +2044,7 @@ void SpirvShaderTranslator::ProcessAluInstruction( ProcessScalarAluOperation(instr, close_predicated_block_scalar); if (store_vector) { - StoreToResult(b.createLoad(pv_), instr.vector_result); + StoreToResult(b.createLoad(pv_), instr.vector_and_constant_result); } if (store_scalar) { StoreToResult(b.createLoad(ps_), instr.scalar_result); @@ -2252,7 +2252,8 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation( const ParsedAluInstruction& instr, bool& close_predicated_block) { close_predicated_block = false; - if (!instr.has_vector_op) { + if (!instr.vector_and_constant_result.GetUsedWriteMask() && + !AluVectorOpHasSideEffects(instr.vector_opcode)) { return false; } @@ -2261,7 +2262,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation( // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = vec4_float_zero_; - for (size_t i = 0; i < instr.vector_operand_count; i++) { + for (uint32_t i = 0; i < instr.vector_operand_count; i++) { sources[i] = LoadFromOperand(instr.vector_operands[i]); } @@ -2636,7 +2637,8 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation( const ParsedAluInstruction& instr, bool& close_predicated_block) { close_predicated_block = false; - if (!instr.has_scalar_op) { + if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !instr.scalar_result.GetUsedWriteMask()) { return false; } @@ -2645,12 +2647,12 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation( // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = b.makeFloatConstant(0); - for (size_t i = 0, x = 0; i < instr.scalar_operand_count; i++) { + for (uint32_t i = 0, x = 0; i < instr.scalar_operand_count; i++) { auto src = LoadFromOperand(instr.scalar_operands[i]); // Pull components out of the vector operands and use them as sources. 
if (instr.scalar_operands[i].component_count > 1) { - for (int j = 0; j < instr.scalar_operands[i].component_count; j++) { + for (uint32_t j = 0; j < instr.scalar_operands[i].component_count; j++) { sources[x++] = b.createCompositeExtract(src, float_type_, j); } } else { @@ -3191,7 +3193,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { } // swizzle - if (op.component_count > 1 && !op.is_standard_swizzle()) { + if (op.component_count > 1 && !op.IsStandardSwizzle()) { std::vector operands; operands.push_back(storage_value); operands.push_back(b.makeCompositeConstant( @@ -3200,7 +3202,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { // Components start from left and are duplicated rightwards // e.g. count = 1, xxxx / count = 2, xyyy ... - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { auto swiz = op.components[i]; if (i > op.component_count - 1) { swiz = op.components[op.component_count - 1]; @@ -3244,7 +3246,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, return; } - if (!result.has_any_writes()) { + uint32_t used_write_mask = result.GetUsedWriteMask(); + if (!used_write_mask) { return; } @@ -3285,7 +3288,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, storage_array = true; assert_true(uint32_t(result.storage_index) < register_count()); break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: assert_true(is_vertex_shader()); storage_pointer = interpolators_; storage_class = spv::StorageClass::StorageClassOutput; @@ -3310,7 +3313,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, storage_offsets.push_back(0); storage_array = false; break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: assert_true(is_pixel_shader()); assert_not_zero(frag_outputs_); storage_pointer = frag_outputs_; @@ -3351,7 +3354,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, // Only load from storage if we need it later. Id storage_value = 0; - if ((source_is_scalar && !storage_is_scalar) || !result.has_all_writes()) { + if ((source_is_scalar && !storage_is_scalar) || used_write_mask != 0b1111) { storage_value = b.createLoad(storage_pointer); } @@ -3366,7 +3369,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } // destination swizzle - if (!result.is_standard_swizzle() && !source_is_scalar) { + if (!result.IsStandardSwizzle() && !source_is_scalar) { std::vector operands; operands.push_back(source_value_id); operands.push_back(b.makeCompositeConstant( @@ -3377,7 +3380,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, // e.g. count = 1, xxxx / count = 2, xyyy ... uint32_t source_components = b.getNumComponents(source_value_id); for (int i = 0; i < 4; i++) { - if (!result.write_mask[i]) { + if (!(used_write_mask & (1 << i))) { // Undefined / don't care. operands.push_back(0); continue; @@ -3411,29 +3414,30 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } // write mask - if (!result.has_all_writes() && !source_is_scalar && !storage_is_scalar) { + if (used_write_mask != 0b1111 && !source_is_scalar && !storage_is_scalar) { std::vector operands; operands.push_back(source_value_id); operands.push_back(storage_value); for (int i = 0; i < b.getNumTypeComponents(storage_type); i++) { - operands.push_back( - result.write_mask[i] ? i : b.getNumComponents(source_value_id) + i); + operands.push_back((used_write_mask & (1 << i)) + ? 
i + : b.getNumComponents(source_value_id) + i); } source_value_id = b.createOp(spv::Op::OpVectorShuffle, storage_type, operands); } else if (source_is_scalar && !storage_is_scalar) { - assert_true(result.num_writes() >= 1); + assert_not_zero(used_write_mask); - if (result.has_all_writes()) { + if (used_write_mask == 0b1111) { source_value_id = b.smearScalar(spv::NoPrecision, source_value_id, storage_type); } else { // Find first enabled component uint32_t index = 0; for (uint32_t i = 0; i < 4; i++) { - if (result.write_mask[i]) { + if (used_write_mask & (1 << i)) { index = i; break; } @@ -3443,10 +3447,10 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } } else if (!source_is_scalar && storage_is_scalar) { // Num writes /needs/ to be 1, and let's assume it's the first element. - assert_true(result.num_writes() == 1); + assert_true(xe::bit_count(used_write_mask) == 1); for (uint32_t i = 0; i < 4; i++) { - if (result.write_mask[i]) { + if (used_write_mask & (1 << i)) { source_value_id = b.createCompositeExtract(source_value_id, storage_type, 0); break; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 7aa135bce..b588f6776 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -667,7 +667,11 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 * anything = 0) and for NaN in min/max. +// multiplication (0 * anything = 0) wherever it's present (mul, mad, dp, +// etc.) and for NaN in min/max. It's very important to respect this rule for +// multiplication, as games often rely on it in vector normalization (rcp and +// mul), Infinity * 0 resulting in NaN breaks a lot of things in games - +// causes white screen in Halo 3, white specular on characters in GTA IV. enum class AluScalarOpcode : uint32_t { // Floating-Point Add @@ -1300,8 +1304,10 @@ enum class AluVectorOpcode : uint32_t { // Whether the vector instruction has side effects such as discarding a pixel or // setting the predicate and can't be ignored even if it doesn't write to -// anywhere. -inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) { +// anywhere. Note that all scalar operations except for retain_prev have a side +// effect of modifying the previous scalar result register, so they must always +// be executed even if not writing. +constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { switch (vector_opcode) { case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpNePush: @@ -1319,7 +1325,126 @@ inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) { return false; } +// Whether each component of a source operand is used at all in the instruction +// (doesn't check the operand count though). +constexpr uint32_t GetAluVectorOpUsedSourceComponents( + AluVectorOpcode vector_opcode, uint32_t src_index) { + switch (vector_opcode) { + case AluVectorOpcode::kDp3: + return 0b0111; + case AluVectorOpcode::kDp2Add: + return src_index == 3 ? 0b0001 : 0b0011; + case AluVectorOpcode::kSetpEqPush: + case AluVectorOpcode::kSetpNePush: + case AluVectorOpcode::kSetpGtPush: + case AluVectorOpcode::kSetpGePush: + return 0b1001; + case AluVectorOpcode::kDst: + return src_index == 2 ? 
0b1010 : 0b0110; + default: + break; + } + return 0b1111; +} + +// Whether each component of a source operand is needed for the instruction if +// executed with the specified write mask, and thus can't be thrown away or be +// undefined in translation. For per-component operations, for example, only the +// components specified in the write mask are needed, but there are instructions +// with special behavior for certain components. +constexpr uint32_t GetAluVectorOpNeededSourceComponents( + AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) { + uint32_t components = write_mask; + switch (vector_opcode) { + case AluVectorOpcode::kDp4: + case AluVectorOpcode::kMax4: + components = write_mask ? 0b1111 : 0; + break; + case AluVectorOpcode::kDp3: + components = write_mask ? 0b0111 : 0; + break; + case AluVectorOpcode::kDp2Add: + components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0; + break; + case AluVectorOpcode::kCube: + components = write_mask ? 0b1111 : 0; + break; + case AluVectorOpcode::kSetpEqPush: + case AluVectorOpcode::kSetpNePush: + case AluVectorOpcode::kSetpGtPush: + case AluVectorOpcode::kSetpGePush: + components = write_mask ? 0b1001 : 0b1000; + break; + case AluVectorOpcode::kKillEq: + case AluVectorOpcode::kKillGt: + case AluVectorOpcode::kKillGe: + case AluVectorOpcode::kKillNe: + components = 0b1111; + break; + // kDst is per-component, but not all components are used - + // GetAluVectorOpUsedSourceComponents will filter out the unused ones. + case AluVectorOpcode::kMaxA: + if (src_index == 1) { + components |= 0b1000; + } + break; + default: + break; + } + return components & + GetAluVectorOpUsedSourceComponents(vector_opcode, src_index); +} + +enum class ExportRegister : uint32_t { + kVSInterpolator0 = 0, + kVSInterpolator1, + kVSInterpolator2, + kVSInterpolator3, + kVSInterpolator4, + kVSInterpolator5, + kVSInterpolator6, + kVSInterpolator7, + kVSInterpolator8, + kVSInterpolator9, + kVSInterpolator10, + kVSInterpolator11, + kVSInterpolator12, + kVSInterpolator13, + kVSInterpolator14, + kVSInterpolator15, + + kVSPosition = 62, + + // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG, + // USE_VTX_KILL_FLAG). + // X - PSIZE (gl_PointSize). + // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point + // drawing. + // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set + // for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition. + kVSPointSizeEdgeFlagKillVertex = 63, + + kPSColor0 = 0, + kPSColor1, + kPSColor2, + kPSColor3, + + // In X. + kPSDepth = 61, + + // Memory export: index.?y?? * 0100 + xe_gpu_memexport_stream_t.xyzw. + kExportAddress = 32, + // Memory export: values for texels [index+0], [index+1], ..., [index+4]. + kExportData0 = 33, + kExportData1, + kExportData2, + kExportData3, + kExportData4, +}; + struct AluInstruction { + // Raw accessors. + // Whether data is being exported (or written to local registers). 
bool is_export() const { return data_.export_data == 1; }
   bool export_write_mask() const { return data_.scalar_dest_rel == 1; }
@@ -1334,20 +1459,12 @@ struct AluInstruction {
   bool is_const_1_addressed() const { return data_.const_1_rel_abs == 1; }
   bool is_address_relative() const { return data_.address_absolute == 1; }
 
-  bool has_vector_op() const {
-    return vector_write_mask() || is_export() ||
-           AluVectorOpcodeHasSideEffects(vector_opcode());
-  }
   AluVectorOpcode vector_opcode() const { return data_.vector_opc; }
   uint32_t vector_write_mask() const { return data_.vector_write_mask; }
   uint32_t vector_dest() const { return data_.vector_dest; }
   bool is_vector_dest_relative() const { return data_.vector_dest_rel == 1; }
   bool vector_clamp() const { return data_.vector_clamp == 1; }
 
-  bool has_scalar_op() const {
-    return scalar_opcode() != AluScalarOpcode::kRetainPrev ||
-           (!is_export() && scalar_write_mask() != 0);
-  }
   AluScalarOpcode scalar_opcode() const { return data_.scalar_opc; }
   uint32_t scalar_write_mask() const { return data_.scalar_write_mask; }
   uint32_t scalar_dest() const { return data_.scalar_dest; }
@@ -1407,14 +1524,62 @@ struct AluInstruction {
     }
   }
 
+  // Helpers.
+
+  // Note that even if the export component is unused (like W of the vertex
+  // shader misc register, YZW of pixel shader depth), it must still not be
+  // excluded - that may make disassembly not reassemblable if there are
+  // constant 0 writes in the export; for example, oPts.x000 will be assembled,
+  // but oPts.x00_ will not, even though W has no effect on anything.
+  uint32_t GetVectorOpResultWriteMask() const {
+    uint32_t mask = vector_write_mask();
+    if (is_export()) {
+      mask &= ~scalar_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetScalarOpResultWriteMask() const {
+    uint32_t mask = scalar_write_mask();
+    if (is_export()) {
+      mask &= ~vector_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetConstant0WriteMask() const {
+    if (!is_export() || !is_scalar_dest_relative()) {
+      return 0b0000;
+    }
+    return 0b1111 & ~(vector_write_mask() | scalar_write_mask());
+  }
+  uint32_t GetConstant1WriteMask() const {
+    if (!is_export()) {
+      return 0b0000;
+    }
+    return vector_write_mask() & scalar_write_mask();
+  }
+
  private:
   XEPACKEDSTRUCT(Data, {
     XEPACKEDSTRUCTANONYMOUS({
+      // If exporting, both vector and scalar operations use the vector
+      // destination (which can't be relative in this case).
+      // Not very important note: If both the scalar and the vector operation
+      // of an export have an empty write mask, the XNA assembler forces
+      // vector_dest to 0 (interpolator 0 or color 0) in the microcode itself.
       uint32_t vector_dest : 6;
       uint32_t vector_dest_rel : 1;
       uint32_t abs_constants : 1;
       uint32_t scalar_dest : 6;
       uint32_t scalar_dest_rel : 1;
+      // Exports have different write masking (export is done to vector_dest by
+      // both the vector and the scalar operation, and exports can write
+      // constant 0 and 1). For each component:
+      // - vector_write_mask 0, scalar_write_mask 0:
+      //   - scalar_dest_rel 0 - unchanged.
+      //   - scalar_dest_rel 1 - constant 0 (all components must be written).
+      // - vector_write_mask 1, scalar_write_mask 0 - from vector operation.
+      // - vector_write_mask 0, scalar_write_mask 1 - from scalar operation.
+      // - vector_write_mask 1, scalar_write_mask 1 - constant 1. 
uint32_t export_data : 1; uint32_t vector_write_mask : 4; uint32_t scalar_write_mask : 4; diff --git a/tools/shader-playground/Editor.cs b/tools/shader-playground/Editor.cs index dd5c46e0d..017773b39 100644 --- a/tools/shader-playground/Editor.cs +++ b/tools/shader-playground/Editor.cs @@ -267,6 +267,7 @@ namespace shader_playground { "--shader_output=" + translatedDisasmPath, "--shader_output_type=" + outputType, "--vertex_shader_output_type=" + vertexShaderType, + "--dxbc_source_map=true", }; if (translationComboBox.SelectedIndex == 1) { startArguments.Add("--shader_output_dxbc_rov=true");
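As a worked illustration of the export write masking documented in the ucode.h comments above, here is a self-contained sketch that mirrors what GetVectorOpResultWriteMask, GetScalarOpResultWriteMask, GetConstant0WriteMask and GetConstant1WriteMask derive from the packed instruction words; the struct and function names are illustrative, not part of the patch.

#include <cstdint>

// How the four possible sources map onto the components of one export.
struct ExportComponentMasks {
  uint32_t from_vector;  // Components written by the vector operation.
  uint32_t from_scalar;  // Components written by the scalar operation.
  uint32_t constant_0;   // Components set to 0.0f.
  uint32_t constant_1;   // Components set to 1.0f.
};

ExportComponentMasks CombineExportWriteMasks(uint32_t vector_write_mask,
                                             uint32_t scalar_write_mask,
                                             bool scalar_dest_rel) {
  ExportComponentMasks masks;
  // Both masks set: the component becomes constant 1.
  masks.constant_1 = vector_write_mask & scalar_write_mask;
  // Exactly one mask set: the component comes from that operation.
  masks.from_vector = vector_write_mask & ~scalar_write_mask;
  masks.from_scalar = scalar_write_mask & ~vector_write_mask;
  // Neither mask set: constant 0 if scalar_dest_rel, otherwise unchanged.
  masks.constant_0 = scalar_dest_rel
                         ? 0b1111 & ~(vector_write_mask | scalar_write_mask)
                         : 0b0000;
  return masks;
}

For instance, a vector write mask of 0b0011 combined with a scalar write mask of 0b0110 exports the vector result to x, the scalar result to z and constant 1 to y; if scalar_dest_rel is also set, w is written as constant 0.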