[GPU] Shader translator refactoring (mostly ALU), fixes for disassembly round trip and write masks
parent 8f91e580f4
commit 3aa0ce3096
@@ -2961,6 +2961,14 @@ bool D3D12CommandProcessor::UpdateBindings(
                  (!samplers_written_pixel_ ||
                   current_samplers_hash_pixel_ != samplers_hash_pixel);
 
+  // These are the constant base addresses/ranges for shaders.
+  // We have these hardcoded right now cause nothing seems to differ on the Xbox
+  // 360 (however, OpenGL ES on Adreno 200 on Android has different ranges).
+  assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 ||
+              regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000);
+  assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 ||
+              regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000);
+
   // Check if the float constant layout is still the same and get the counts.
   const Shader::ConstantRegisterMap& float_constant_map_vertex =
       vertex_shader->constant_register_map();
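The asserted magic values look like packed base/size pairs that place vertex constants at c0 and pixel constants at c256. The sketch below is a hypothetical decode, not taken from the commit: the field widths and the meaning of the bits are assumptions chosen so the two known values come out consistent.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical decode of the SQ_VS/PS_CONST values, assuming BASE in the low
// 12 bits and SIZE (minus one, in vec4 registers) in the bits above it.
struct ConstBaseRange {
  uint32_t base, size;
};
ConstBaseRange Decode(uint32_t reg) {
  return {reg & 0xFFF, ((reg >> 12) & 0x1FF) + 1};
}

int main() {
  // 0x000FF000: vertex constants at c0, 256 registers.
  assert(Decode(0x000FF000).base == 0 && Decode(0x000FF000).size == 256);
  // 0x000FF100: pixel constants at c256, 256 registers.
  assert(Decode(0x000FF100).base == 256 && Decode(0x000FF100).size == 256);
}
```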
@@ -809,14 +809,6 @@ bool PipelineCache::EnsureShadersTranslated(
     D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
     Shader::HostVertexShaderType host_vertex_shader_type) {
   auto& regs = *register_file_;
-
-  // These are the constant base addresses/ranges for shaders.
-  // We have these hardcoded right now cause nothing seems to differ.
-  assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 ||
-              regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000);
-  assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 ||
-              regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000);
 
   auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
 
   // Edge flags are not supported yet (because polygon primitives are not).
@@ -18,6 +18,7 @@
 
+#include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/math.h"
 
 DEFINE_bool(dxbc_switch, true,
             "Use switch rather than if for flow control. Turning this off or "
@@ -86,7 +87,6 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
   // Don't allocate again and again for the first shader.
   shader_code_.reserve(8192);
   shader_object_.reserve(16384);
-  float_constant_index_offsets_.reserve(512);
 }
 DxbcShaderTranslator::~DxbcShaderTranslator() = default;
@@ -161,8 +161,6 @@ void DxbcShaderTranslator::Reset() {
   cbuffer_index_fetch_constants_ = kCbufferIndexUnallocated;
 
   system_constants_used_ = 0;
-  float_constants_dynamic_indexed_ = false;
-  float_constant_index_offsets_.clear();
 
   in_control_point_index_used_ = false;
@@ -1166,29 +1164,6 @@ void DxbcShaderTranslator::CompleteShaderCode() {
 
   // Release system_temps_subroutine_.
   PopSystemTemp(system_temps_subroutine_count_);
-
-  // Remap float constant indices if not indexed dynamically.
-  if (!float_constants_dynamic_indexed_ &&
-      !float_constant_index_offsets_.empty()) {
-    uint8_t float_constant_map[256] = {};
-    uint32_t float_constant_count = 0;
-    for (uint32_t i = 0; i < 4; ++i) {
-      uint64_t float_constants_used = constant_register_map().float_bitmap[i];
-      uint32_t float_constant_index;
-      while (
-          xe::bit_scan_forward(float_constants_used, &float_constant_index)) {
-        float_constants_used &= ~(1ull << float_constant_index);
-        float_constant_map[i * 64 + float_constant_index] =
-            float_constant_count++;
-      }
-    }
-    size_t index_count = float_constant_index_offsets_.size();
-    for (size_t i = 0; i < index_count; ++i) {
-      uint32_t index_offset = float_constant_index_offsets_[i];
-      shader_code_[index_offset] =
-          float_constant_map[shader_code_[index_offset] & 255];
-    }
-  }
 }
 
 std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() {
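The block deleted above was a post-pass: it patched guest float-constant indices in the already-emitted DXBC token stream once the usage bitmap was final. The replacement (visible in the LoadDxbcSourceOperand hunks below) computes the packed index up front from the bitmap. A minimal standalone sketch of that bitmap-compaction idea, assuming a 256-bit usage map stored as four uint64_t words like ConstantRegisterMap::float_bitmap; `__builtin_popcountll` stands in for the project's xe::bit_count:

```cpp
#include <cstdint>
#include <cstdio>

// Counts how many used constants precede float_constant in the bitmap; that
// count is the index the constant gets once used constants are tightly packed.
uint32_t PackedIndex(const uint64_t bitmap[4], uint32_t float_constant) {
  uint32_t block = float_constant / 64, bit = float_constant % 64;
  if (!(bitmap[block] & (uint64_t(1) << bit))) {
    return UINT32_MAX;  // Not referenced by the shader at all.
  }
  uint32_t offset = 0;
  for (uint32_t i = 0; i < block; ++i) {
    offset += uint32_t(__builtin_popcountll(bitmap[i]));
  }
  return offset + uint32_t(__builtin_popcountll(
                      bitmap[block] & ((uint64_t(1) << bit) - 1)));
}

int main() {
  // A shader reading only c2, c5 and c130 packs them to slots 0, 1 and 2.
  uint64_t bitmap[4] = {(1ull << 2) | (1ull << 5), 0, 1ull << 2, 0};
  std::printf("%u %u %u\n", PackedIndex(bitmap, 2), PackedIndex(bitmap, 5),
              PackedIndex(bitmap, 130));  // Prints: 0 1 2
}
```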
@@ -1420,7 +1395,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
         shader_code_.push_back(EncodeVectorSwizzledOperand(
             D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, kSwizzleXYZW, 2));
         shader_code_.push_back(0);
-        shader_code_.push_back(uint32_t(operand.storage_index));
+        shader_code_.push_back(operand.storage_index);
       } else {
         shader_code_.push_back(
             ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
@@ -1433,7 +1408,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
             D3D10_SB_OPERAND_INDEX_IMMEDIATE32,
             D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE));
         shader_code_.push_back(0);
-        shader_code_.push_back(uint32_t(operand.storage_index));
+        shader_code_.push_back(operand.storage_index);
         shader_code_.push_back(EncodeVectorSelectOperand(
             D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1));
         shader_code_.push_back(dynamic_address_register);
@@ -1445,7 +1420,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
         assert_true(operand.storage_addressing_mode ==
                     InstructionStorageAddressingMode::kStatic);
         dxbc_operand.type = DxbcSourceOperand::Type::kRegister;
-        dxbc_operand.index = uint32_t(operand.storage_index);
+        dxbc_operand.index = operand.storage_index;
       }
       break;
@@ -1457,11 +1432,18 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
         cbuffer_index_float_constants_ = cbuffer_count_++;
       }
       dxbc_operand.type = DxbcSourceOperand::Type::kConstantFloat;
-      dxbc_operand.index = uint32_t(operand.storage_index);
       dxbc_operand.addressing_mode = operand.storage_addressing_mode;
-      if (operand.storage_addressing_mode !=
+      if (operand.storage_addressing_mode ==
           InstructionStorageAddressingMode::kStatic) {
-        float_constants_dynamic_indexed_ = true;
+        uint32_t float_constant_index =
+            constant_register_map().GetPackedFloatConstantIndex(
+                operand.storage_index);
+        assert_true(float_constant_index != UINT32_MAX);
+        dxbc_operand.index =
+            float_constant_index != UINT32_MAX ? float_constant_index : 0;
+      } else {
+        assert_true(constant_register_map().float_dynamic_addressing);
+        dxbc_operand.index = operand.storage_index;
       }
       break;
@@ -1652,11 +1634,6 @@ void DxbcShaderTranslator::UseDxbcSourceOperand(
       }
       shader_code_.push_back(cbuffer_index_float_constants_);
       shader_code_.push_back(uint32_t(CbufferRegister::kFloatConstants));
-      if (!float_constants_dynamic_indexed_) {
-        // If there's no dynamic indexing in the shader, constants are compacted
-        // and remapped. Store where the index has been written.
-        float_constant_index_offsets_.push_back(uint32_t(shader_code_.size()));
-      }
       shader_code_.push_back(operand.index);
       if (!is_static) {
         uint32_t dynamic_address_register, dynamic_address_component;
@@ -1718,8 +1695,9 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand(
 void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
                                        uint32_t reg, bool replicate_x,
                                        bool can_store_memexport_address) {
+  uint32_t used_write_mask = result.GetUsedWriteMask();
   if (result.storage_target == InstructionStorageTarget::kNone ||
-      !result.has_any_writes()) {
+      !result.GetUsedWriteMask()) {
     return;
   }
@@ -1744,10 +1722,9 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
       ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped);
 
   // Scalar targets get only one component.
+  // TODO(Triang3l): It's not replicated, it's X specifically.
   if (result.storage_target == InstructionStorageTarget::kDepth) {
-    if (!result.write_mask[0]) {
-      return;
-    }
+    assert_not_zero(used_write_mask & 0b0001);
     SwizzleSource component = result.components[0];
     if (replicate_x && component <= SwizzleSource::kW) {
       component = SwizzleSource::kX;
@@ -1802,7 +1779,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     uint32_t constant_mask = 0;
     uint32_t constant_values = 0;
     for (uint32_t i = 0; i < 4; ++i) {
-      if (!result.write_mask[i]) {
+      if (!(used_write_mask & (1 << i))) {
         continue;
       }
       SwizzleSource component = result.components[i];
@@ -1858,7 +1835,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
           is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32
                     : D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE));
       shader_code_.push_back(0);
-      shader_code_.push_back(uint32_t(result.storage_index));
+      shader_code_.push_back(result.storage_index);
       if (!is_static) {
         shader_code_.push_back(EncodeVectorSelectOperand(
             D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1));
@@ -1874,11 +1851,11 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
             saturate_bit);
         shader_code_.push_back(
             EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
-        shader_code_.push_back(uint32_t(result.storage_index));
+        shader_code_.push_back(result.storage_index);
       }
       break;
 
-    case InstructionStorageTarget::kInterpolant:
+    case InstructionStorageTarget::kInterpolator:
       ++stat_.instruction_count;
       ++stat_.mov_instruction_count;
       shader_code_.push_back(
@@ -1943,7 +1920,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
               [uint32_t(result.storage_index)]);
       break;
 
-    case InstructionStorageTarget::kColorTarget:
+    case InstructionStorageTarget::kColor:
       ++stat_.instruction_count;
       ++stat_.mov_instruction_count;
       shader_code_.push_back(
@@ -1952,8 +1929,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
           saturate_bit);
       shader_code_.push_back(
           EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
-      shader_code_.push_back(
-          system_temps_color_[uint32_t(result.storage_index)]);
+      shader_code_.push_back(system_temps_color_[result.storage_index]);
       break;
 
     default:
@@ -1989,13 +1965,13 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     shader_code_.push_back(
         EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
     shader_code_.push_back(
-        1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3)));
+        uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3)));
     ++stat_.instruction_count;
     ++stat_.uint_instruction_count;
   }
 
   if (edram_rov_used_ &&
-      result.storage_target == InstructionStorageTarget::kColorTarget) {
+      result.storage_target == InstructionStorageTarget::kColor) {
     // For ROV output, mark that the color has been written to.
     // According to:
     // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color
@@ -2014,7 +1990,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
     shader_code_.push_back(system_temp_rov_params_);
     shader_code_.push_back(
         EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
-    shader_code_.push_back(1 << (8 + uint32_t(result.storage_index)));
+    shader_code_.push_back(1 << (8 + result.storage_index));
     ++stat_.instruction_count;
     ++stat_.uint_instruction_count;
   }
@@ -2479,19 +2455,6 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator::
 };
 
 void DxbcShaderTranslator::WriteResourceDefinitions() {
   // ***************************************************************************
   // Preparation
   // ***************************************************************************
-
-  // Float constant count.
-  uint32_t float_constant_count = 0;
-  if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
-    for (uint32_t i = 0; i < 4; ++i) {
-      float_constant_count +=
-          xe::bit_count(constant_register_map().float_bitmap[i]);
-    }
-  }
 
   uint32_t chunk_position_dwords = uint32_t(shader_object_.size());
   uint32_t new_offset;
@@ -2583,7 +2546,8 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
     if (RdefTypeIndex(i) == RdefTypeIndex::kFloat4ConstantArray) {
       // Declaring a 0-sized array may not be safe, so write something valid
       // even if they aren't used.
-      shader_object_.push_back(std::max(float_constant_count, 1u));
+      shader_object_.push_back(
+          std::max(constant_register_map().float_count, uint32_t(1)));
     } else {
       shader_object_.push_back(type.element_count |
                                (type.struct_member_count << 16));
@@ -2692,8 +2656,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
   if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
     shader_object_.push_back(constant_name_offset_float);
     shader_object_.push_back(0);
-    shader_object_.push_back(std::max(float_constant_count, 1u) * 4 *
-                             sizeof(float));
+    shader_object_.push_back(
+        std::max(constant_register_map().float_count, uint32_t(1)) * 4 *
+        sizeof(float));
     shader_object_.push_back(kDxbcRdefVariableFlagUsed);
     shader_object_.push_back(types_offset +
                              uint32_t(RdefTypeIndex::kFloat4ConstantArray) *
@@ -2795,8 +2760,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
       shader_object_.push_back(cbuffer_name_offset_float);
       shader_object_.push_back(1);
       shader_object_.push_back(constant_offset_float);
-      shader_object_.push_back(std::max(float_constant_count, 1u) * 4 *
-                               sizeof(float));
+      shader_object_.push_back(
+          std::max(constant_register_map().float_count, uint32_t(1)) * 4 *
+          sizeof(float));
       shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer));
       shader_object_.push_back(0);
     } else if (i == cbuffer_index_bool_loop_constants_) {
@@ -3646,15 +3612,10 @@ void DxbcShaderTranslator::WriteShaderCode() {
   // Constant buffers, from most frequenly accessed to least frequently accessed
   // (the order is a hint to the driver according to the DXBC header).
   if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
-    uint32_t float_constant_count = 0;
-    for (uint32_t i = 0; i < 4; ++i) {
-      float_constant_count +=
-          xe::bit_count(constant_register_map().float_bitmap[i]);
-    }
     shader_object_.push_back(
         ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) |
         ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN(
-            float_constants_dynamic_indexed_
+            constant_register_map().float_dynamic_addressing
                 ? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED
                 : D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) |
         ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
@@ -3663,7 +3624,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
     shader_object_.push_back(cbuffer_index_float_constants_);
     shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
     shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
-    shader_object_.push_back(float_constant_count);
+    shader_object_.push_back(constant_register_map().float_count);
     shader_object_.push_back(0);
   }
   if (cbuffer_index_system_constants_ != kCbufferIndexUnallocated) {
@@ -857,10 +857,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
           return 0b0000;
       }
     }
-    DxbcDest Mask(uint32_t write_mask) const {
+    [[nodiscard]] DxbcDest Mask(uint32_t write_mask) const {
      return DxbcDest(type_, write_mask, index_1d_, index_2d_, index_3d_);
    }
-    DxbcDest MaskMasked(uint32_t write_mask) const {
+    [[nodiscard]] DxbcDest MaskMasked(uint32_t write_mask) const {
      return DxbcDest(type_, write_mask_ & write_mask, index_1d_, index_2d_,
                      index_3d_);
    }
@@ -991,26 +991,28 @@ class DxbcShaderTranslator : public ShaderTranslator {
       return DxbcSrc(DxbcOperandType::kInputCoverageMask, kXXXX);
     }
 
-    DxbcSrc WithModifiers(bool absolute, bool negate) const {
+    [[nodiscard]] DxbcSrc WithModifiers(bool absolute, bool negate) const {
       DxbcSrc new_src(*this);
       new_src.absolute_ = absolute;
       new_src.negate_ = negate;
       return new_src;
     }
-    DxbcSrc WithAbs(bool absolute) const {
+    [[nodiscard]] DxbcSrc WithAbs(bool absolute) const {
       return WithModifiers(absolute, negate_);
     }
-    DxbcSrc WithNeg(bool negate) const {
+    [[nodiscard]] DxbcSrc WithNeg(bool negate) const {
       return WithModifiers(absolute_, negate);
     }
-    DxbcSrc Abs() const { return WithModifiers(true, false); }
-    DxbcSrc operator-() const { return WithModifiers(absolute_, !negate_); }
-    DxbcSrc Swizzle(uint32_t swizzle) const {
+    [[nodiscard]] DxbcSrc Abs() const { return WithModifiers(true, false); }
+    [[nodiscard]] DxbcSrc operator-() const {
+      return WithModifiers(absolute_, !negate_);
+    }
+    [[nodiscard]] DxbcSrc Swizzle(uint32_t swizzle) const {
       DxbcSrc new_src(*this);
       new_src.swizzle_ = swizzle;
       return new_src;
     }
-    DxbcSrc SwizzleSwizzled(uint32_t swizzle) const {
+    [[nodiscard]] DxbcSrc SwizzleSwizzled(uint32_t swizzle) const {
       DxbcSrc new_src(*this);
       new_src.swizzle_ = 0;
       for (uint32_t i = 0; i < 4; ++i) {
@@ -1019,12 +1021,12 @@ class DxbcShaderTranslator : public ShaderTranslator {
       }
       return new_src;
     }
-    DxbcSrc Select(uint32_t component) const {
+    [[nodiscard]] DxbcSrc Select(uint32_t component) const {
       DxbcSrc new_src(*this);
       new_src.swizzle_ = component * 0b01010101;
       return new_src;
     }
-    DxbcSrc SelectFromSwizzled(uint32_t component) const {
+    [[nodiscard]] DxbcSrc SelectFromSwizzled(uint32_t component) const {
       DxbcSrc new_src(*this);
       new_src.swizzle_ = ((swizzle_ >> (component * 2)) & 3) * 0b01010101;
       return new_src;
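Select and SelectFromSwizzled rely on DXBC-style swizzles being packed as four 2-bit component selectors (x=0, y=1, z=2, w=3), low bits first. Multiplying a selector by 0b01010101 replicates it into all four fields; SelectFromSwizzled first extracts the selector at `component` from the existing swizzle. A small self-contained check of the bit trick (standalone sketch, not code from the commit):

```cpp
#include <cassert>
#include <cstdint>

// Replicates one 2-bit component selector into all four swizzle fields.
uint32_t ReplicateComponent(uint32_t component) {
  return component * 0b01010101;  // c | c << 2 | c << 4 | c << 6
}

// Extracts the selector used at the given position of an existing swizzle.
uint32_t SwizzleAt(uint32_t swizzle, uint32_t position) {
  return (swizzle >> (position * 2)) & 3;
}

int main() {
  const uint32_t kSwizzleXYZW = 0b11100100;  // w z y x, low bits first
  assert(ReplicateComponent(2) == 0b10101010);  // .zzzz
  assert(SwizzleAt(kSwizzleXYZW, 3) == 3);      // Position w selects w.
  // SelectFromSwizzled(1) on .xyzw yields .yyyy:
  assert(ReplicateComponent(SwizzleAt(kSwizzleXYZW, 1)) == 0b01010101);
}
```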
@@ -2026,6 +2028,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
   void EmitInstructionDisassembly();
 
   // Abstract 4-component vector source operand.
+  // TODO(Triang3l): Remove after fully moving to the new emitter.
   struct DxbcSourceOperand {
     enum class Type {
       // GPR number in the index - used only when GPRs are not dynamically
@@ -2064,18 +2067,22 @@ class DxbcShaderTranslator : public ShaderTranslator {
   };
   // Each Load must be followed by Unload, otherwise there may be a temporary
   // register leak.
+  // TODO(Triang3l): Remove after fully moving to the new emitter.
   void LoadDxbcSourceOperand(const InstructionOperand& operand,
                              DxbcSourceOperand& dxbc_operand);
   // Number of tokens this operand adds to the instruction length when used.
+  // TODO(Triang3l): Remove after fully moving to the new emitter.
   uint32_t DxbcSourceOperandLength(const DxbcSourceOperand& operand,
                                    bool negate = false,
                                    bool absolute = false) const;
   // Writes the operand access tokens to the instruction (either for a scalar if
   // select_component is <= 3, or for a vector).
+  // TODO(Triang3l): Remove after fully moving to the new emitter.
   void UseDxbcSourceOperand(const DxbcSourceOperand& operand,
                             uint32_t additional_swizzle = kSwizzleXYZW,
                             uint32_t select_component = 4, bool negate = false,
                             bool absolute = false);
+  // TODO(Triang3l): Remove after fully moving to the new emitter.
   void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand);
 
   // Writes xyzw or xxxx of the specified r# to the destination.
@@ -2258,15 +2265,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
   // the remaining ones can be marked as unused in RDEF.
   uint64_t system_constants_used_;
 
-  // Whether constants are dynamically indexed and need to be marked as such in
-  // dcl_constantBuffer.
-  bool float_constants_dynamic_indexed_;
-
-  // Offsets of float constant indices in shader_code_, for remapping in
-  // CompleteTranslation (initially, at these offsets, guest float constant
-  // indices are written).
-  std::vector<uint32_t> float_constant_index_offsets_;
-
   // Whether InOutRegister::kDSInControlPointIndex has been used in the shader.
   bool in_control_point_index_used_;
@@ -23,7 +23,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
   replicate_result_x = false;
   predicate_written = false;
 
-  if (!instr.has_vector_op) {
+  if (!instr.vector_and_constant_result.GetUsedWriteMask() &&
+      !AluVectorOpHasSideEffects(instr.vector_opcode)) {
     return false;
   }
@@ -32,7 +33,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
   if (instr.vector_opcode == AluVectorOpcode::kCube) {
     operand_count = 1;
   } else {
-    operand_count = uint32_t(instr.vector_operand_count);
+    operand_count = instr.vector_operand_count;
   }
   DxbcSourceOperand dxbc_operands[3];
   // Whether the operand is the same as any previous operand, and thus is loaded
@@ -42,7 +43,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
   for (uint32_t i = 0; i < operand_count; ++i) {
     const InstructionOperand& operand = instr.vector_operands[i];
     for (uint32_t j = 0; j < i; ++j) {
-      if (operand == instr.vector_operands[j]) {
+      if (operand.GetIdenticalComponents(instr.vector_operands[j]) == 0b1111) {
        operands_duplicate[i] = true;
        dxbc_operands[i] = dxbc_operands[j];
        break;
@@ -117,7 +118,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
       UseDxbcSourceOperand(dxbc_operands[1]);
       ++stat_.instruction_count;
       ++stat_.float_instruction_count;
-      if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
+      if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+              instr.vector_operands[1]) != 0b1111) {
         // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0),
         // flushing denormals (must be done using eq - doing bitwise comparison
         // doesn't flush denormals).
@@ -281,7 +283,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
       UseDxbcSourceOperand(dxbc_operands[2]);
       ++stat_.instruction_count;
       ++stat_.float_instruction_count;
-      if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
+      if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+              instr.vector_operands[1]) != 0b1111) {
         // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
         // If any operand is zero or denormalized, just leave the addition part.
         uint32_t is_subnormal_temp = PushSystemTemp();
@@ -388,7 +391,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
     case AluVectorOpcode::kDp4:
     case AluVectorOpcode::kDp3:
     case AluVectorOpcode::kDp2Add: {
-      if (instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
+      if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+              instr.vector_operands[1]) != 0b1111) {
         // The operands are the same when calculating vector length, no need to
         // emulate 0 * anything = 0 in this case.
         shader_code_.push_back(
@@ -1092,7 +1096,9 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
       UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1);
       ++stat_.instruction_count;
       ++stat_.float_instruction_count;
-      if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
+      if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents(
+                instr.vector_operands[1]) &
+            0b0010)) {
         // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
         // This is an attenuation calculation function, so infinity is probably
         // not very unlikely.
@@ -1294,7 +1300,8 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
     const ParsedAluInstruction& instr, bool& predicate_written) {
   predicate_written = false;
 
-  if (!instr.has_scalar_op) {
+  if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
+      !instr.scalar_result.GetUsedWriteMask()) {
     return false;
   }
@@ -1306,7 +1313,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
   for (uint32_t i = 0; i < uint32_t(instr.scalar_operand_count); ++i) {
     const InstructionOperand& operand = instr.scalar_operands[i];
     for (uint32_t j = 0; j < i; ++j) {
-      if (operand == instr.scalar_operands[j]) {
+      if (operand.GetIdenticalComponents(instr.scalar_operands[j]) == 0b1111) {
        operands_duplicate[i] = true;
        dxbc_operands[i] = dxbc_operands[j];
        break;
@@ -2303,7 +2310,9 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
       UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 0);
       ++stat_.instruction_count;
       ++stat_.float_instruction_count;
-      if (!instr.scalar_operands[0].EqualsAbsolute(instr.scalar_operands[1])) {
+      if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents(
+                instr.scalar_operands[1]) &
+            0b0001)) {
         // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
         uint32_t is_subnormal_temp = PushSystemTemp();
         // Get the non-NaN multiplicand closer to zero to check if any of them
@@ -2421,7 +2430,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
 
 void DxbcShaderTranslator::ProcessAluInstruction(
     const ParsedAluInstruction& instr) {
-  if (instr.is_nop()) {
+  if (instr.IsNop()) {
     return;
   }
@@ -2445,7 +2454,8 @@ void DxbcShaderTranslator::ProcessAluInstruction(
       ProcessScalarAluOperation(instr, predicate_written_scalar);
 
   if (store_vector) {
-    StoreResult(instr.vector_result, system_temp_pv_, replicate_vector_x,
+    StoreResult(instr.vector_and_constant_result, system_temp_pv_,
+                replicate_vector_x,
                 instr.GetMemExportStreamConstant() != UINT32_MAX);
   }
   if (store_scalar) {
@@ -10,10 +10,12 @@
 #ifndef XENIA_GPU_SHADER_H_
 #define XENIA_GPU_SHADER_H_
 
+#include <algorithm>
 #include <filesystem>
 #include <string>
 #include <vector>
 
+#include "xenia/base/math.h"
 #include "xenia/base/string_buffer.h"
 #include "xenia/gpu/ucode.h"
 #include "xenia/gpu/xenos.h"
@@ -21,23 +23,32 @@
 namespace xe {
 namespace gpu {
 
+// The structures here are used for both translation and disassembly.
+//
+// Because disassembly uses them too, to make sure "assemble -> disassemble ->
+// reassemble" round trip is always successful with the XNA assembler (as it is
+// the accuracy benchmark for translation), only generalization - not
+// optimization like nop skipping/replacement - must be done while converting
+// microcode to these structures (in other words, parsed shader code should be
+// enough to accurately reconstruct the microcode for any shader that could be
+// written by a human in assembly).
+//
+// During the "parsed -> host" part of the translation, however, translators are
+// free to make any optimizations (as long as they don't affect the result, of
+// course) they find appropriate.
+
 enum class InstructionStorageTarget {
   // Result is not stored.
   kNone,
   // Result is stored to a temporary register indexed by storage_index [0-31].
   kRegister,
-  // Result is stored into a vertex shader interpolant export [0-15].
-  kInterpolant,
+  // Result is stored into a vertex shader interpolator export [0-15].
+  kInterpolator,
   // Result is stored to the position export (gl_Position).
   kPosition,
-  // Result is stored to the vertex shader misc export register.
-  // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG,
-  // USE_VTX_KILL_FLAG).
-  // X - PSIZE (gl_PointSize).
-  // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point
-  // drawing.
-  // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set
-  // for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition.
+  // Result is stored to the vertex shader misc export register, see
+  // ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex for description of
+  // components.
   kPointSizeEdgeFlagKillVertex,
   // Result is stored as memexport destination address
   // (see xenos::xe_gpu_memexport_stream_t).
@@ -45,11 +56,29 @@ enum class InstructionStorageTarget {
   // Result is stored to memexport destination data.
   kExportData,
   // Result is stored to a color target export indexed by storage_index [0-3].
-  kColorTarget,
-  // Result is stored to the depth export (gl_FragDepth).
+  kColor,
+  // X of the result is stored to the depth export (gl_FragDepth).
   kDepth,
 };
 
+// Must be used only in translation to skip unused components, but not in
+// disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both
+// skipped components and zeros, which cannot be encoded, and therefore it will
+// not).
+constexpr uint32_t GetInstructionStorageTargetUsedComponents(
+    InstructionStorageTarget target) {
+  switch (target) {
+    case InstructionStorageTarget::kNone:
+      return 0b0000;
+    case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex:
+      return 0b0111;
+    case InstructionStorageTarget::kDepth:
+      return 0b0001;
+    default:
+      return 0b1111;
+  }
+}
+
 enum class InstructionStorageAddressingMode {
   // The storage index is not dynamically addressed.
   kStatic,
@@ -75,71 +104,63 @@ enum class SwizzleSource {
   k1,
 };
 
-constexpr SwizzleSource GetSwizzleFromComponentIndex(int i) {
+constexpr SwizzleSource GetSwizzleFromComponentIndex(uint32_t i) {
   return static_cast<SwizzleSource>(i);
 }
-inline char GetCharForComponentIndex(int i) {
+inline char GetCharForComponentIndex(uint32_t i) {
   const static char kChars[] = {'x', 'y', 'z', 'w'};
   return kChars[i];
 }
 inline char GetCharForSwizzle(SwizzleSource swizzle_source) {
   const static char kChars[] = {'x', 'y', 'z', 'w', '0', '1'};
-  return kChars[static_cast<int>(swizzle_source)];
+  return kChars[static_cast<uint32_t>(swizzle_source)];
 }
 
 struct InstructionResult {
   // Where the result is going.
   InstructionStorageTarget storage_target = InstructionStorageTarget::kNone;
   // Index into the storage_target, if it is indexed.
-  int storage_index = 0;
+  uint32_t storage_index = 0;
   // How the storage index is dynamically addressed, if it is.
   InstructionStorageAddressingMode storage_addressing_mode =
       InstructionStorageAddressingMode::kStatic;
   // True if the result is exporting from the shader.
   bool is_export = false;
   // True to clamp the result value to [0-1].
   bool is_clamped = false;
-  // Defines whether each output component is written.
-  bool write_mask[4] = {false, false, false, false};
+  // Defines whether each output component is written, though this is from the
+  // original microcode, not taking into account whether such components
+  // actually exist in the target.
+  uint32_t original_write_mask = 0b0000;
   // Defines the source for each output component xyzw.
   SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY,
                                  SwizzleSource::kZ, SwizzleSource::kW};
-  // Returns true if any component is written to.
-  bool has_any_writes() const {
-    return write_mask[0] || write_mask[1] || write_mask[2] || write_mask[3];
-  }
-  // Returns true if all components are written to.
-  bool has_all_writes() const {
-    return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3];
-  }
-  // Returns number of components written
-  uint32_t num_writes() const {
-    uint32_t total = 0;
-    for (int i = 0; i < 4; i++) {
-      if (write_mask[i]) {
-        total++;
-      }
-    }
-
-    return total;
-  }
-  // Returns true if any non-constant components are written.
-  bool stores_non_constants() const {
-    for (int i = 0; i < 4; ++i) {
-      if (write_mask[i] && components[i] != SwizzleSource::k0 &&
-          components[i] != SwizzleSource::k1) {
-        return true;
-      }
-    }
-    return false;
+  // Returns the write mask containing only components actually present in the
+  // target.
+  uint32_t GetUsedWriteMask() const {
+    return original_write_mask &
+           GetInstructionStorageTargetUsedComponents(storage_target);
   }
   // True if the components are in their 'standard' swizzle arrangement (xyzw).
-  bool is_standard_swizzle() const {
-    return has_all_writes() && components[0] == SwizzleSource::kX &&
+  bool IsStandardSwizzle() const {
+    return (GetUsedWriteMask() == 0b1111) &&
+           components[0] == SwizzleSource::kX &&
           components[1] == SwizzleSource::kY &&
           components[2] == SwizzleSource::kZ &&
           components[3] == SwizzleSource::kW;
   }
+  // Returns the components of the result, before swizzling, that won't be
+  // discarded or replaced with a constant.
+  uint32_t GetUsedResultComponents() const {
+    uint32_t used_write_mask = GetUsedWriteMask();
+    uint32_t used_components = 0b0000;
+    for (uint32_t i = 0; i < 4; ++i) {
+      if ((used_write_mask & (1 << i)) && components[i] >= SwizzleSource::kX &&
+          components[i] <= SwizzleSource::kW) {
+        used_components |=
+            1 << (uint32_t(components[i]) - uint32_t(SwizzleSource::kX));
+      }
+    }
+    return used_components;
+  }
 };
 
 enum class InstructionStorageSource {
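To make the two new mask queries concrete: GetUsedWriteMask drops written components that do not exist in the target (for example W of the oPts misc export), and GetUsedResultComponents then reports which pre-swizzle source components still matter. The following is a hedged standalone rework with simplified stand-ins for the real enums, not the commit's own code:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-ins for InstructionStorageTarget / SwizzleSource.
enum Target { kPointSizeEdgeFlagKillVertex, kDepth, kColor };
enum Swizzle { kX, kY, kZ, kW, k0, k1 };

uint32_t TargetComponents(Target t) {
  if (t == kPointSizeEdgeFlagKillVertex) return 0b0111;
  if (t == kDepth) return 0b0001;
  return 0b1111;
}

struct Result {
  Target target;
  uint32_t original_write_mask;
  Swizzle components[4];

  uint32_t UsedWriteMask() const {
    return original_write_mask & TargetComponents(target);
  }
  // Source components (before swizzling) that are neither discarded by the
  // target nor replaced with the constants 0/1.
  uint32_t UsedResultComponents() const {
    uint32_t used = 0;
    for (uint32_t i = 0; i < 4; ++i) {
      if ((UsedWriteMask() & (1u << i)) && components[i] <= kW) {
        used |= 1u << components[i];
      }
    }
    return used;
  }
};

int main() {
  // oPts.xyzw in the microcode: W doesn't exist in the target, and the Z slot
  // is fed by constant 0, so only source components X and Y are really used.
  Result r{kPointSizeEdgeFlagKillVertex, 0b1111, {kX, kY, k0, kW}};
  assert(r.UsedWriteMask() == 0b0111);
  assert(r.UsedResultComponents() == 0b0011);
}
```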
@@ -159,7 +180,7 @@ struct InstructionOperand {
   // Where the source comes from.
   InstructionStorageSource storage_source = InstructionStorageSource::kRegister;
   // Index into the storage_target, if it is indexed.
-  int storage_index = 0;
+  uint32_t storage_index = 0;
   // How the storage index is dynamically addressed, if it is.
   InstructionStorageAddressingMode storage_addressing_mode =
       InstructionStorageAddressingMode::kStatic;
@@ -168,13 +189,19 @@ struct InstructionOperand {
   // True to take the absolute value of the source (before any negation).
   bool is_absolute_value = false;
   // Number of components taken from the source operand.
-  int component_count = 0;
+  uint32_t component_count = 4;
   // Defines the source for each component xyzw (up to the given
   // component_count).
   SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY,
                                  SwizzleSource::kZ, SwizzleSource::kW};
+  // Returns the swizzle source for the component, replicating the rightmost
+  // component if there are less than 4 components (similar to what the Xbox 360
+  // shader compiler does as a general rule for unspecified components).
+  SwizzleSource GetComponent(uint32_t index) const {
+    return components[std::min(index, component_count - 1)];
+  }
   // True if the components are in their 'standard' swizzle arrangement (xyzw).
-  bool is_standard_swizzle() const {
+  bool IsStandardSwizzle() const {
     switch (component_count) {
       case 4:
         return components[0] == SwizzleSource::kX &&
@@ -185,26 +212,32 @@ struct InstructionOperand {
         return false;
   }
 
-  // Whether absolute values of two operands are identical (useful for emulating
-  // Shader Model 3 0*anything=0 multiplication behavior).
-  bool EqualsAbsolute(const InstructionOperand& other) const {
+  // Returns which components of two operands are identical, but may have
+  // different signs (for simplicity of usage with GetComponent, treating the
+  // rightmost component as replicated).
+  uint32_t GetAbsoluteIdenticalComponents(
+      const InstructionOperand& other) const {
     if (storage_source != other.storage_source ||
         storage_index != other.storage_index ||
-        storage_addressing_mode != other.storage_addressing_mode ||
-        component_count != other.component_count) {
-      return false;
+        storage_addressing_mode != other.storage_addressing_mode) {
+      return 0;
     }
-    for (int i = 0; i < component_count; ++i) {
-      if (components[i] != other.components[i]) {
-        return false;
-      }
+    uint32_t identical_components = 0;
+    for (uint32_t i = 0; i < 4; ++i) {
+      identical_components |= uint32_t(GetComponent(i) == other.GetComponent(i))
+                              << i;
     }
-    return true;
+    return identical_components;
   }
 
-  bool operator==(const InstructionOperand& other) const {
-    return EqualsAbsolute(other) && is_negated == other.is_negated &&
-           is_absolute_value == other.is_absolute_value;
+  // Returns which components of two operands will always be bitwise equal, but
+  // may have different signs (disregarding component_count for simplicity of
+  // usage with GetComponent, treating the rightmost component as replicated).
+  uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
+    if (is_negated != other.is_negated ||
+        is_absolute_value != other.is_absolute_value) {
+      return 0;
+    }
+    return GetAbsoluteIdenticalComponents(other);
   }
 };
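The per-component identity masks replace the old all-or-nothing EqualsAbsolute: the ALU hunks above (such as the multiply and kDp2Add cases) only need to know whether the specific components being multiplied read the same register component, since that is when the Shader Model 3 "0 * anything = 0" emulation can be skipped. A toy sketch of the call pattern, with an operand type reduced to what the check needs (an assumption-laden illustration, not the project's type):

```cpp
#include <cassert>
#include <cstdint>

// Toy stand-in for InstructionOperand with a 4-component swizzle.
struct Operand {
  uint32_t reg;
  uint32_t swizzle[4];  // Source component per output component.

  uint32_t IdenticalComponents(const Operand& other) const {
    if (reg != other.reg) return 0;
    uint32_t mask = 0;
    for (uint32_t i = 0; i < 4; ++i) {
      mask |= uint32_t(swizzle[i] == other.swizzle[i]) << i;
    }
    return mask;
  }
};

int main() {
  Operand a{0, {0, 1, 2, 3}};  // r0.xyzw
  Operand b{0, {0, 3, 2, 1}};  // r0.xwzy
  // X and Z come from the same component of the same register, so a
  // translator only has to emulate 0 * anything = 0 for Y and W.
  assert(a.IdenticalComponents(b) == 0b0101);
  // Per-component check, as used for e.g. a scalar multiply reading .y:
  bool need_zero_emulation_y = !(a.IdenticalComponents(b) & 0b0010);
  assert(need_zero_emulation_y);
}
```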
@@ -365,9 +398,6 @@ struct ParsedAllocInstruction {
 };
 
 struct ParsedVertexFetchInstruction {
-  // Index into the ucode dword source.
-  uint32_t dword_index = 0;
-
   // Opcode for the instruction.
   ucode::FetchOpcode opcode;
   // Friendly name of the instruction.
@@ -409,9 +439,6 @@ struct ParsedVertexFetchInstruction {
 };
 
 struct ParsedTextureFetchInstruction {
-  // Index into the ucode dword source.
-  uint32_t dword_index = 0;
-
   // Opcode for the instruction.
   ucode::FetchOpcode opcode;
   // Friendly name of the instruction.
@@ -462,17 +489,6 @@ struct ParsedTextureFetchInstruction {
 };
 
 struct ParsedAluInstruction {
-  // Index into the ucode dword source.
-  uint32_t dword_index = 0;
-
-  // True if the vector part of the instruction needs to be executed and data
-  // about it in this structure is valid.
-  bool has_vector_op = false;
-  // True if the scalar part of the instruction needs to be executed and data
-  // about it in this structure is valid.
-  bool has_scalar_op = false;
-  bool is_nop() const { return !has_vector_op && !has_scalar_op; }
-
   // Opcode for the vector part of the instruction.
   ucode::AluVectorOpcode vector_opcode = ucode::AluVectorOpcode::kAdd;
   // Opcode for the scalar part of the instruction.
@@ -488,8 +504,20 @@ struct ParsedAluInstruction {
   // Expected predication condition value if predicated.
   bool predicate_condition = false;
 
-  // Describes how the vector operation result is stored.
-  InstructionResult vector_result;
+  // Describes how the vector operation result and, for exports, constant 0/1
+  // are stored. For simplicity of translation and disassembly, treating
+  // constant 0/1 writes as a part of the vector operation - they need to be
+  // expressed somehow in the disassembly anyway with a properly disassembled
+  // instruction even if only constants are being exported. The XNA disassembler
+  // falls back to displaying the whole vector operation, even if only constant
+  // components are written, if the scalar operation is a nop or if the vector
+  // operation has side effects (but if the scalar operation isn't nop, it
+  // outputs the entire constant mask in the scalar operation destination).
+  // Normally the XNA disassembler outputs the constant mask in both vector and
+  // scalar operations, but that's not required by assembler, so it doesn't
+  // really matter whether it's specified in the vector operation, in the scalar
+  // operation, or in both.
+  InstructionResult vector_and_constant_result;
   // Describes how the scalar operation result is stored.
   InstructionResult scalar_result;
   // Both operations must be executed before any result is stored if vector and
@@ -499,27 +527,109 @@ struct ParsedAluInstruction {
   // operations.
 
   // Number of source operands of the vector operation.
-  size_t vector_operand_count = 0;
+  uint32_t vector_operand_count = 0;
   // Describes each source operand of the vector operation.
   InstructionOperand vector_operands[3];
   // Number of source operands of the scalar operation.
-  size_t scalar_operand_count = 0;
+  uint32_t scalar_operand_count = 0;
   // Describes each source operand of the scalar operation.
   InstructionOperand scalar_operands[2];
 
-  // If this is a valid eA write (MAD with a stream constant), returns the index
-  // of the stream float constant, otherwise returns UINT32_MAX.
+  // Whether the vector part of the instruction is the same as if it was omitted
+  // in the assembly (if compiled or assembled with the Xbox 360 shader
+  // compiler), and thus reassembling the shader with this instruction omitted
+  // will result in the same microcode (since instructions with just an empty
+  // write mask may have different values in other fields).
+  // This is for disassembly! Translators should use the write masks and
+  // AluVectorOpHasSideEffects to skip operations, as this only covers one very
+  // specific nop format!
+  bool IsVectorOpDefaultNop() const {
+    if (vector_opcode != ucode::AluVectorOpcode::kMax ||
+        vector_and_constant_result.original_write_mask ||
+        vector_and_constant_result.is_clamped ||
+        vector_operands[0].storage_source !=
+            InstructionStorageSource::kRegister ||
+        vector_operands[0].storage_index != 0 ||
+        vector_operands[0].storage_addressing_mode !=
+            InstructionStorageAddressingMode::kStatic ||
+        vector_operands[0].is_negated || vector_operands[0].is_absolute_value ||
+        !vector_operands[0].IsStandardSwizzle() ||
+        vector_operands[1].storage_source !=
+            InstructionStorageSource::kRegister ||
+        vector_operands[1].storage_index != 0 ||
+        vector_operands[1].storage_addressing_mode !=
+            InstructionStorageAddressingMode::kStatic ||
+        vector_operands[1].is_negated || vector_operands[1].is_absolute_value ||
+        !vector_operands[1].IsStandardSwizzle()) {
+      return false;
+    }
+    if (vector_and_constant_result.storage_target ==
+        InstructionStorageTarget::kRegister) {
+      if (vector_and_constant_result.storage_index != 0 ||
+          vector_and_constant_result.storage_addressing_mode !=
+              InstructionStorageAddressingMode::kStatic) {
+        return false;
+      }
+    } else {
+      // In case both vector and scalar operations are nop, still need to write
+      // somewhere that it's an export, not mov r0._, r0 + retain_prev r0._.
+      // Accurate round trip is possible only if the target is o0 or oC0,
+      // because if the total write mask is empty, the XNA assembler forces the
+      // destination to be o0/oC0, but this doesn't really matter in this case.
+      if (IsScalarOpDefaultNop()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Whether the scalar part of the instruction is the same as if it was omitted
+  // in the assembly (if compiled or assembled with the Xbox 360 shader
+  // compiler), and thus reassembling the shader with this instruction omitted
+  // will result in the same microcode (since instructions with just an empty
+  // write mask may have different values in other fields).
+  bool IsScalarOpDefaultNop() const {
+    if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev ||
+        scalar_result.original_write_mask || scalar_result.is_clamped) {
+      return false;
+    }
+    if (scalar_result.storage_target == InstructionStorageTarget::kRegister) {
+      if (scalar_result.storage_index != 0 ||
+          scalar_result.storage_addressing_mode !=
+              InstructionStorageAddressingMode::kStatic) {
+        return false;
+      }
+    }
+    // For exports, if both are nop, the vector operation will be kept to state
+    // in the microcode that the destination in the microcode is an export.
+    return true;
+  }
+
+  // For translation (not disassembly) - whether this instruction has totally no
+  // effect.
+  bool IsNop() const {
+    return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
+           !scalar_result.GetUsedWriteMask() &&
+           !vector_and_constant_result.GetUsedWriteMask() &&
+           !ucode::AluVectorOpHasSideEffects(vector_opcode);
+  }
+
+  // If this is a "normal" eA write recognized by Xenia (MAD with a stream
+  // constant), returns the index of the stream float constant, otherwise
+  // returns UINT32_MAX.
   uint32_t GetMemExportStreamConstant() const {
-    if (has_vector_op &&
-        vector_result.storage_target ==
+    if (vector_and_constant_result.storage_target ==
             InstructionStorageTarget::kExportAddress &&
         vector_opcode == ucode::AluVectorOpcode::kMad &&
-        vector_result.has_all_writes() &&
+        vector_and_constant_result.GetUsedResultComponents() == 0b1111 &&
+        !vector_and_constant_result.is_clamped &&
         vector_operands[2].storage_source ==
            InstructionStorageSource::kConstantFloat &&
        vector_operands[2].storage_addressing_mode ==
            InstructionStorageAddressingMode::kStatic &&
-        vector_operands[2].is_standard_swizzle()) {
+        vector_operands[2].IsStandardSwizzle() &&
+        !vector_operands[2].is_negated &&
+        !vector_operands[2].is_absolute_value) {
       return vector_operands[2].storage_index;
     }
     return UINT32_MAX;
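For context on the two default-nop predicates above: when only one half of an ALU instruction is written in assembly, the Xbox 360 shader compiler fills the other half with `max r0, r0, r0` (vector) or `retain_prev r0` (scalar) with an empty write mask, so exactly those patterns, and only those, may be omitted when disassembling without breaking the round trip. A toy matcher over a deliberately simplified instruction record (field names loosely modeled on ParsedAluInstruction, not the real type):

```cpp
#include <cassert>
#include <cstdint>

enum VectorOp { kAdd, kMax, kMad };
enum ScalarOp { kAdds, kRetainPrev };

struct ToyAlu {
  VectorOp vector_op;
  uint32_t vector_write_mask;  // Empty for compiler-inserted filler.
  uint32_t vector_src_reg[2];
  ScalarOp scalar_op;
  uint32_t scalar_write_mask;
};

// The compiler's filler for an omitted vector half: max r0, r0, r0 with an
// empty write mask - nothing else round-trips to the same microcode.
bool IsVectorDefaultNop(const ToyAlu& i) {
  return i.vector_op == kMax && i.vector_write_mask == 0 &&
         i.vector_src_reg[0] == 0 && i.vector_src_reg[1] == 0;
}

bool IsScalarDefaultNop(const ToyAlu& i) {
  return i.scalar_op == kRetainPrev && i.scalar_write_mask == 0;
}

int main() {
  ToyAlu filler{kMax, 0, {0, 0}, kRetainPrev, 0};
  assert(IsVectorDefaultNop(filler) && IsScalarDefaultNop(filler));
  ToyAlu real{kMad, 0b1111, {1, 2}, kRetainPrev, 0};
  assert(!IsVectorDefaultNop(real) && IsScalarDefaultNop(real));
}
```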
@@ -581,9 +691,8 @@ class Shader {
   struct ConstantRegisterMap {
     // Bitmap of all kConstantFloat registers read by the shader.
     // Any shader can only read up to 256 of the 512, and the base is dependent
-    // on the shader type. Each bit corresponds to a storage index from the type
-    // base, so bit 0 in a vertex shader is register 0, and bit 0 in a fragment
-    // shader is register 256.
+    // on the shader type and SQ_VS/PS_CONST registers. Each bit corresponds to
+    // a storage index from the type base.
     uint64_t float_bitmap[256 / 64];
     // Bitmap of all loop constants read by the shader.
     // Each bit corresponds to a storage index [0-31].
@@ -595,8 +704,33 @@ class Shader {
     // Total number of kConstantFloat registers read by the shader.
     uint32_t float_count;
 
-    // Computed byte count of all registers required when packed.
-    uint32_t packed_byte_length;
+    // Whether kConstantFloat registers are indexed dynamically - in this case,
+    // float_bitmap must be set to all 1, and tight packing must not be done.
+    bool float_dynamic_addressing;
+
+    // Returns the index of the float4 constant as if all float4 constant
+    // registers actually referenced were tightly packed in a buffer, or
+    // UINT32_MAX if not found.
+    uint32_t GetPackedFloatConstantIndex(uint32_t float_constant) const {
+      if (float_constant >= 256) {
+        return UINT32_MAX;
+      }
+      if (float_dynamic_addressing) {
+        // Any can potentially be read - not packing.
+        return float_constant;
+      }
+      uint32_t block_index = float_constant / 64;
+      uint32_t bit_index = float_constant % 64;
+      if (!(float_bitmap[block_index] & (uint64_t(1) << bit_index))) {
+        return UINT32_MAX;
+      }
+      uint32_t offset = 0;
+      for (uint32_t i = 0; i < block_index; ++i) {
+        offset += xe::bit_count(float_bitmap[i]);
+      }
+      return offset + xe::bit_count(float_bitmap[block_index] &
+                                    ((uint64_t(1) << bit_index) - 1));
+    }
   };
 
   Shader(ShaderType shader_type, uint64_t ucode_data_hash,
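One subtlety worth spelling out: with float_dynamic_addressing the bitmap is all ones and packing degenerates to the identity, because `c[aL + k]` can touch any register at runtime. A trivial sketch of that branch in isolation (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// With dynamic addressing every constant is potentially live, so the "packed"
// buffer must keep guest indexing - the packed index is the guest index.
uint32_t PackedIndexDynamic(uint32_t float_constant) {
  return float_constant < 256 ? float_constant : UINT32_MAX;
}

int main() {
  assert(PackedIndexDynamic(130) == 130);  // No compaction possible.
  assert(PackedIndexDynamic(300) == UINT32_MAX);
}
```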
@@ -642,7 +776,9 @@ class Shader {
   }
 
   // Returns true if the given color target index [0-3].
-  bool writes_color_target(int i) const { return writes_color_targets_[i]; }
+  bool writes_color_target(uint32_t i) const {
+    return writes_color_targets_[i];
+  }
 
   // True if the shader overrides the pixel depth.
   bool writes_depth() const { return writes_depth_; }
(File diff suppressed because it is too large.)
@@ -57,15 +57,19 @@ class ShaderTranslator {
   }
   // True if the current shader is a pixel shader.
   bool is_pixel_shader() const { return shader_type_ == ShaderType::kPixel; }
+  // Used constant register info, populated before translation.
   const Shader::ConstantRegisterMap& constant_register_map() const {
     return constant_register_map_;
   }
   // True if the current shader addresses general-purpose registers with dynamic
-  // indices.
+  // indices, set before translation. Doesn't include writes to r[#+a#] with an
+  // empty used write mask.
   bool uses_register_dynamic_addressing() const {
     return uses_register_dynamic_addressing_;
   }
-  // True if the current shader writes to a color target on any execution path.
+  // True if the current shader writes to a color target on any execution path,
+  // set before translation. Doesn't include writes with an empty used write
+  // mask.
   bool writes_color_target(int i) const { return writes_color_targets_[i]; }
   bool writes_any_color_target() const {
     for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
@@ -75,7 +79,8 @@ class ShaderTranslator {
     }
     return false;
   }
-  // True if the current shader overrides the pixel depth.
+  // True if the current shader overrides the pixel depth, set before
+  // translation. Doesn't include writes with an empty used write mask.
   bool writes_depth() const { return writes_depth_; }
   // True if Xenia can automatically enable early depth/stencil for the pixel
   // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
@@ -181,8 +186,8 @@ class ShaderTranslator {
  private:
  struct AluOpcodeInfo {
    const char* name;
-    size_t argument_count;
-    int src_swizzle_component_count;
+    uint32_t argument_count;
+    uint32_t src_swizzle_component_count;
    bool disable_implicit_early_z;
  };
@@ -229,10 +234,16 @@ class ShaderTranslator {
                                     ParsedTextureFetchInstruction* out_instr);
 
   void TranslateAluInstruction(const ucode::AluInstruction& op);
-  void ParseAluVectorOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
-  void ParseAluScalarOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
+  void ParseAluInstruction(const ucode::AluInstruction& op,
+                           ParsedAluInstruction& out_instr) const;
+  static void ParseAluInstructionOperand(const ucode::AluInstruction& op,
+                                         uint32_t i,
+                                         uint32_t swizzle_component_count,
+                                         InstructionOperand& out_op);
+  static void ParseAluInstructionOperandSpecial(
+      const ucode::AluInstruction& op, InstructionStorageSource storage_source,
+      uint32_t reg, bool negate, int const_slot, uint32_t component_index,
+      InstructionOperand& out_op);
 
   // Input shader metadata and microcode.
   ShaderType shader_type_;
@@ -265,12 +276,16 @@ class ShaderTranslator {
   uint32_t unique_vertex_bindings_ = 0;
   uint32_t unique_texture_bindings_ = 0;
 
+  // These all are gathered before translation.
+  // uses_register_dynamic_addressing_ for writes, writes_color_targets_,
+  // writes_depth_ don't include empty used write masks.
   Shader::ConstantRegisterMap constant_register_map_ = {0};
   bool uses_register_dynamic_addressing_ = false;
   bool writes_color_targets_[4] = {false, false, false, false};
   bool writes_depth_ = false;
   bool implicit_early_z_allowed_ = true;
 
   // Memexport info is gathered before translation.
   uint32_t memexport_alloc_count_ = 0;
   // For register allocation in implementations - what was used after each
   // `alloc export`.
|
|
@ -28,7 +28,7 @@ void DisassembleResultOperand(const InstructionResult& result,
|
|||
out->Append('r');
|
||||
uses_storage_index = true;
|
||||
break;
|
||||
case InstructionStorageTarget::kInterpolant:
|
||||
case InstructionStorageTarget::kInterpolator:
|
||||
out->Append('o');
|
||||
uses_storage_index = true;
|
||||
break;
|
||||
|
@@ -45,7 +45,7 @@ void DisassembleResultOperand(const InstructionResult& result,
       out->Append("eM");
       uses_storage_index = true;
       break;
-    case InstructionStorageTarget::kColorTarget:
+    case InstructionStorageTarget::kColor:
       out->Append("oC");
       uses_storage_index = true;
       break;
@@ -68,12 +68,19 @@ void DisassembleResultOperand(const InstructionResult& result,
       break;
     }
   }
-  if (!result.has_any_writes()) {
+  // Not using GetUsedWriteMask/IsStandardSwizzle because they filter out
+  // components not having any runtime effect, but those components are still
+  // present in the microcode.
+  if (!result.original_write_mask) {
     out->Append("._");
-  } else if (!result.is_standard_swizzle()) {
+  } else if (result.original_write_mask != 0b1111 ||
+             result.components[0] != SwizzleSource::kX ||
+             result.components[1] != SwizzleSource::kY ||
+             result.components[2] != SwizzleSource::kZ ||
+             result.components[3] != SwizzleSource::kW) {
     out->Append('.');
     for (int i = 0; i < 4; ++i) {
-      if (result.write_mask[i]) {
+      if (result.original_write_mask & (1 << i)) {
        out->Append(GetCharForSwizzle(result.components[i]));
      } else {
        out->Append('_');
@@ -116,7 +123,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) {
       out->AppendFormat("[{}+aL]", op.storage_index);
       break;
   }
-  if (!op.is_standard_swizzle()) {
+  if (!op.IsStandardSwizzle()) {
     out->Append('.');
     if (op.component_count == 1) {
       out->Append(GetCharForSwizzle(op.components[0]));
@@ -124,7 +131,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) {
       out->Append(GetCharForSwizzle(op.components[0]));
       out->Append(GetCharForSwizzle(op.components[1]));
     } else {
-      for (int j = 0; j < op.component_count; ++j) {
+      for (uint32_t j = 0; j < op.component_count; ++j) {
        out->Append(GetCharForSwizzle(op.components[j]));
      }
    }
@@ -454,11 +461,19 @@ void ParsedTextureFetchInstruction::Disassemble(StringBuffer* out) const {
 }
 
 void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
-  if (is_nop()) {
-    out->Append(" nop\n");
+  bool is_vector_op_default_nop = IsVectorOpDefaultNop();
+  bool is_scalar_op_default_nop = IsScalarOpDefaultNop();
+  if (is_vector_op_default_nop && is_scalar_op_default_nop) {
+    out->Append(" ");
+    if (is_predicated) {
+      out->Append(predicate_condition ? " (p0) " : "(!p0) ");
+    } else {
+      out->Append(" ");
+    }
+    out->Append("nop\n");
     return;
   }
-  if (has_vector_op) {
+  if (!is_vector_op_default_nop) {
     out->Append(" ");
     if (is_predicated) {
       out->Append(predicate_condition ? " (p0) " : "(!p0) ");
@@ -466,19 +481,19 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
       out->Append("      ");
     }
     out->Append(vector_opcode_name);
-    if (vector_result.is_clamped) {
+    if (vector_and_constant_result.is_clamped) {
       out->Append("_sat");
     }
     out->Append(' ');
-    DisassembleResultOperand(vector_result, out);
-    for (int i = 0; i < vector_operand_count; ++i) {
+    DisassembleResultOperand(vector_and_constant_result, out);
+    for (uint32_t i = 0; i < vector_operand_count; ++i) {
       out->Append(", ");
       DisassembleSourceOperand(vector_operands[i], out);
     }
     out->Append('\n');
   }
-  if (has_scalar_op) {
-    out->Append(has_vector_op ? " + " : "   ");
+  if (!is_scalar_op_default_nop) {
+    out->Append(is_vector_op_default_nop ? "   " : " + ");
     if (is_predicated) {
       out->Append(predicate_condition ? " (p0) " : "(!p0) ");
     } else {
@@ -490,7 +505,7 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
     }
     out->Append(' ');
     DisassembleResultOperand(scalar_result, out);
-    for (int i = 0; i < scalar_operand_count; ++i) {
+    for (uint32_t i = 0; i < scalar_operand_count; ++i) {
       out->Append(", ");
       DisassembleSourceOperand(scalar_operands[i], out);
     }
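For illustration only (the opcode and register names below are invented, not taken from this diff), the formatting above lines up co-issued vector and scalar operations in fixed columns:

        (p0) mul r0.x__w, r1, r2
     +       adds r3.x, r0
             nop

Three spaces of indent are followed by a six-character predicate field (" (p0) ", "(!p0) ", or spaces when unpredicated); a co-issued scalar operation replaces the indent with " + ", so opcodes always start in the same column and the text can be fed back to the assembler.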
@@ -2003,7 +2003,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction(
 
 void SpirvShaderTranslator::ProcessAluInstruction(
     const ParsedAluInstruction& instr) {
-  if (instr.is_nop()) {
+  if (instr.IsNop()) {
     return;
   }
 
@@ -2044,7 +2044,7 @@ void SpirvShaderTranslator::ProcessAluInstruction(
   ProcessScalarAluOperation(instr, close_predicated_block_scalar);
 
   if (store_vector) {
-    StoreToResult(b.createLoad(pv_), instr.vector_result);
+    StoreToResult(b.createLoad(pv_), instr.vector_and_constant_result);
   }
   if (store_scalar) {
     StoreToResult(b.createLoad(ps_), instr.scalar_result);
@@ -2252,7 +2252,8 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation(
     const ParsedAluInstruction& instr, bool& close_predicated_block) {
   close_predicated_block = false;
 
-  if (!instr.has_vector_op) {
+  if (!instr.vector_and_constant_result.GetUsedWriteMask() &&
+      !AluVectorOpHasSideEffects(instr.vector_opcode)) {
     return false;
   }
 
@@ -2261,7 +2262,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation(
   // TODO: If we have identical operands, reuse previous one.
   Id sources[3] = {0};
   Id dest = vec4_float_zero_;
-  for (size_t i = 0; i < instr.vector_operand_count; i++) {
+  for (uint32_t i = 0; i < instr.vector_operand_count; i++) {
     sources[i] = LoadFromOperand(instr.vector_operands[i]);
   }
 
@@ -2636,7 +2637,8 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation(
     const ParsedAluInstruction& instr, bool& close_predicated_block) {
   close_predicated_block = false;
 
-  if (!instr.has_scalar_op) {
+  if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
+      !instr.scalar_result.GetUsedWriteMask()) {
     return false;
   }
 
@@ -2645,12 +2647,12 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation(
   // TODO: If we have identical operands, reuse previous one.
   Id sources[3] = {0};
   Id dest = b.makeFloatConstant(0);
-  for (size_t i = 0, x = 0; i < instr.scalar_operand_count; i++) {
+  for (uint32_t i = 0, x = 0; i < instr.scalar_operand_count; i++) {
     auto src = LoadFromOperand(instr.scalar_operands[i]);
 
     // Pull components out of the vector operands and use them as sources.
     if (instr.scalar_operands[i].component_count > 1) {
-      for (int j = 0; j < instr.scalar_operands[i].component_count; j++) {
+      for (uint32_t j = 0; j < instr.scalar_operands[i].component_count; j++) {
         sources[x++] = b.createCompositeExtract(src, float_type_, j);
       }
     } else {
@@ -3191,7 +3193,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
   }
 
   // swizzle
-  if (op.component_count > 1 && !op.is_standard_swizzle()) {
+  if (op.component_count > 1 && !op.IsStandardSwizzle()) {
     std::vector<uint32_t> operands;
     operands.push_back(storage_value);
     operands.push_back(b.makeCompositeConstant(
@@ -3200,7 +3202,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
 
     // Components start from left and are duplicated rightwards
     // e.g. count = 1, xxxx / count = 2, xyyy ...
-    for (int i = 0; i < 4; i++) {
+    for (uint32_t i = 0; i < 4; i++) {
       auto swiz = op.components[i];
       if (i > op.component_count - 1) {
         swiz = op.components[op.component_count - 1];
@@ -3244,7 +3246,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
     return;
   }
 
-  if (!result.has_any_writes()) {
+  uint32_t used_write_mask = result.GetUsedWriteMask();
+  if (!used_write_mask) {
     return;
   }
 
@@ -3285,7 +3288,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
       storage_array = true;
       assert_true(uint32_t(result.storage_index) < register_count());
       break;
-    case InstructionStorageTarget::kInterpolant:
+    case InstructionStorageTarget::kInterpolator:
       assert_true(is_vertex_shader());
       storage_pointer = interpolators_;
       storage_class = spv::StorageClass::StorageClassOutput;
@@ -3310,7 +3313,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
       storage_offsets.push_back(0);
       storage_array = false;
       break;
-    case InstructionStorageTarget::kColorTarget:
+    case InstructionStorageTarget::kColor:
       assert_true(is_pixel_shader());
       assert_not_zero(frag_outputs_);
       storage_pointer = frag_outputs_;
@@ -3351,7 +3354,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
 
   // Only load from storage if we need it later.
   Id storage_value = 0;
-  if ((source_is_scalar && !storage_is_scalar) || !result.has_all_writes()) {
+  if ((source_is_scalar && !storage_is_scalar) || used_write_mask != 0b1111) {
     storage_value = b.createLoad(storage_pointer);
   }
 
@@ -3366,7 +3369,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
   }
 
   // destination swizzle
-  if (!result.is_standard_swizzle() && !source_is_scalar) {
+  if (!result.IsStandardSwizzle() && !source_is_scalar) {
     std::vector<uint32_t> operands;
     operands.push_back(source_value_id);
     operands.push_back(b.makeCompositeConstant(
@@ -3377,7 +3380,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
     // e.g. count = 1, xxxx / count = 2, xyyy ...
     uint32_t source_components = b.getNumComponents(source_value_id);
     for (int i = 0; i < 4; i++) {
-      if (!result.write_mask[i]) {
+      if (!(used_write_mask & (1 << i))) {
         // Undefined / don't care.
         operands.push_back(0);
         continue;
@@ -3411,29 +3414,30 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
   }
 
   // write mask
-  if (!result.has_all_writes() && !source_is_scalar && !storage_is_scalar) {
+  if (used_write_mask != 0b1111 && !source_is_scalar && !storage_is_scalar) {
     std::vector<uint32_t> operands;
     operands.push_back(source_value_id);
     operands.push_back(storage_value);
 
     for (int i = 0; i < b.getNumTypeComponents(storage_type); i++) {
-      operands.push_back(
-          result.write_mask[i] ? i : b.getNumComponents(source_value_id) + i);
+      operands.push_back((used_write_mask & (1 << i))
+                             ? i
+                             : b.getNumComponents(source_value_id) + i);
     }
 
     source_value_id =
         b.createOp(spv::Op::OpVectorShuffle, storage_type, operands);
   } else if (source_is_scalar && !storage_is_scalar) {
-    assert_true(result.num_writes() >= 1);
+    assert_not_zero(used_write_mask);
 
-    if (result.has_all_writes()) {
+    if (used_write_mask == 0b1111) {
       source_value_id =
           b.smearScalar(spv::NoPrecision, source_value_id, storage_type);
     } else {
       // Find first enabled component
       uint32_t index = 0;
       for (uint32_t i = 0; i < 4; i++) {
-        if (result.write_mask[i]) {
+        if (used_write_mask & (1 << i)) {
           index = i;
           break;
         }
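A small sketch (not from the commit) of the OpVectorShuffle index pattern the loop above builds; indices 0..3 select components of the new value, 4..7 select components of the previously loaded storage value:

    // For used_write_mask == 0b0101 and a float4 source:
    uint32_t source_components = 4;  // b.getNumComponents(source_value_id)
    uint32_t shuffle_indices[4];
    for (uint32_t i = 0; i < 4; ++i) {
      shuffle_indices[i] = (0b0101u & (1u << i)) ? i : source_components + i;
    }
    // shuffle_indices == {0, 5, 2, 7}: X and Z come from the new value,
    // Y and W keep their previous storage contents.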
@@ -3443,10 +3447,10 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
     }
   } else if (!source_is_scalar && storage_is_scalar) {
     // Num writes /needs/ to be 1, and let's assume it's the first element.
-    assert_true(result.num_writes() == 1);
+    assert_true(xe::bit_count(used_write_mask) == 1);
 
     for (uint32_t i = 0; i < 4; i++) {
-      if (result.write_mask[i]) {
+      if (used_write_mask & (1 << i)) {
         source_value_id =
             b.createCompositeExtract(source_value_id, storage_type, 0);
         break;
@@ -667,7 +667,11 @@ static_assert_size(TextureFetchInstruction, 12);
 // Both are valid only within the current ALU clause. They are not modified
 // when the instruction that would write them fails its predication check.
 // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
-//   multiplication (0 * anything = 0) and for NaN in min/max.
+//   multiplication (0 * anything = 0) wherever it's present (mul, mad, dp,
+//   etc.) and for NaN in min/max. It's very important to respect this rule for
+//   multiplication, as games often rely on it in vector normalization (rcp and
+//   mul); Infinity * 0 resulting in NaN breaks a lot of things in games - it
+//   causes a white screen in Halo 3 and white specular on characters in GTA IV.
 
 enum class AluScalarOpcode : uint32_t {
   // Floating-Point Add
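A sketch of the multiplication rule described in the comment above (not from the commit; MulLegacy is a hypothetical name):

    // If either factor is +/-0, the result is 0 even when the other factor
    // is Infinity or NaN - like GCN v_mul_legacy_f32; otherwise IEEE multiply.
    inline float MulLegacy(float a, float b) {
      return (a == 0.0f || b == 0.0f) ? 0.0f : a * b;
    }
    // Why games depend on this: normalizing a zero-length vector via rcp/rsq
    // produces Infinity, and IEEE Infinity * 0 would be NaN, while
    // MulLegacy(infinity, 0.0f) stays 0.0f.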
@@ -1300,8 +1304,10 @@ enum class AluVectorOpcode : uint32_t {
 
 // Whether the vector instruction has side effects such as discarding a pixel or
 // setting the predicate and can't be ignored even if it doesn't write to
-// anywhere.
-inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) {
+// anywhere. Note that all scalar operations except for retain_prev have a side
+// effect of modifying the previous scalar result register, so they must always
+// be executed even if not writing.
+constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) {
   switch (vector_opcode) {
     case AluVectorOpcode::kSetpEqPush:
     case AluVectorOpcode::kSetpNePush:
@@ -1319,7 +1325,126 @@ inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) {
   return false;
 }
 
+// Whether each component of a source operand is used at all in the instruction
+// (doesn't check the operand count though).
+constexpr uint32_t GetAluVectorOpUsedSourceComponents(
+    AluVectorOpcode vector_opcode, uint32_t src_index) {
+  switch (vector_opcode) {
+    case AluVectorOpcode::kDp3:
+      return 0b0111;
+    case AluVectorOpcode::kDp2Add:
+      return src_index == 3 ? 0b0001 : 0b0011;
+    case AluVectorOpcode::kSetpEqPush:
+    case AluVectorOpcode::kSetpNePush:
+    case AluVectorOpcode::kSetpGtPush:
+    case AluVectorOpcode::kSetpGePush:
+      return 0b1001;
+    case AluVectorOpcode::kDst:
+      return src_index == 2 ? 0b1010 : 0b0110;
+    default:
+      break;
+  }
+  return 0b1111;
+}
+
+// Whether each component of a source operand is needed for the instruction if
+// executed with the specified write mask, and thus can't be thrown away or be
+// undefined in translation. For per-component operations, for example, only the
+// components specified in the write mask are needed, but there are instructions
+// with special behavior for certain components.
+constexpr uint32_t GetAluVectorOpNeededSourceComponents(
+    AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) {
+  uint32_t components = write_mask;
+  switch (vector_opcode) {
+    case AluVectorOpcode::kDp4:
+    case AluVectorOpcode::kMax4:
+      components = write_mask ? 0b1111 : 0;
+      break;
+    case AluVectorOpcode::kDp3:
+      components = write_mask ? 0b0111 : 0;
+      break;
+    case AluVectorOpcode::kDp2Add:
+      components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0;
+      break;
+    case AluVectorOpcode::kCube:
+      components = write_mask ? 0b1111 : 0;
+      break;
+    case AluVectorOpcode::kSetpEqPush:
+    case AluVectorOpcode::kSetpNePush:
+    case AluVectorOpcode::kSetpGtPush:
+    case AluVectorOpcode::kSetpGePush:
+      components = write_mask ? 0b1001 : 0b1000;
+      break;
+    case AluVectorOpcode::kKillEq:
+    case AluVectorOpcode::kKillGt:
+    case AluVectorOpcode::kKillGe:
+    case AluVectorOpcode::kKillNe:
+      components = 0b1111;
+      break;
+    // kDst is per-component, but not all components are used -
+    // GetAluVectorOpUsedSourceComponents will filter out the unused ones.
+    case AluVectorOpcode::kMaxA:
+      if (src_index == 1) {
+        components |= 0b1000;
+      }
+      break;
+    default:
+      break;
+  }
+  return components &
+         GetAluVectorOpUsedSourceComponents(vector_opcode, src_index);
+}
+
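A brief usage sketch (not from the commit) of the two helpers above, for dp2add - dest = src0.x * src1.x + src0.y * src1.y + src2.x - where src_index is 1-based, matching the src_index == 3 checks:

    // With write mask .x, only X of the third operand has to be loaded;
    // Y/Z/W may be left undefined by the translator.
    static_assert(GetAluVectorOpNeededSourceComponents(
                      AluVectorOpcode::kDp2Add, 3, 0b0001) == 0b0001,
                  "dp2add src3 contributes only its X component");
    // With an empty write mask, no components of the operand are needed,
    // since dp2add has no side effects.
    static_assert(GetAluVectorOpNeededSourceComponents(
                      AluVectorOpcode::kDp2Add, 3, 0b0000) == 0b0000,
                  "nothing is needed when nothing is written");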
+enum class ExportRegister : uint32_t {
+  kVSInterpolator0 = 0,
+  kVSInterpolator1,
+  kVSInterpolator2,
+  kVSInterpolator3,
+  kVSInterpolator4,
+  kVSInterpolator5,
+  kVSInterpolator6,
+  kVSInterpolator7,
+  kVSInterpolator8,
+  kVSInterpolator9,
+  kVSInterpolator10,
+  kVSInterpolator11,
+  kVSInterpolator12,
+  kVSInterpolator13,
+  kVSInterpolator14,
+  kVSInterpolator15,
+
+  kVSPosition = 62,
+
+  // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG,
+  // USE_VTX_KILL_FLAG).
+  // X - PSIZE (gl_PointSize).
+  // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point
+  //     drawing.
+  // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set
+  //     for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition.
+  kVSPointSizeEdgeFlagKillVertex = 63,
+
+  kPSColor0 = 0,
+  kPSColor1,
+  kPSColor2,
+  kPSColor3,
+
+  // In X.
+  kPSDepth = 61,
+
+  // Memory export: index.?y?? * 0100 + xe_gpu_memexport_stream_t.xyzw.
+  kExportAddress = 32,
+  // Memory export: values for texels [index+0], [index+1], ..., [index+4].
+  kExportData0 = 33,
+  kExportData1,
+  kExportData2,
+  kExportData3,
+  kExportData4,
+};
+
 struct AluInstruction {
   // Raw accessors.
 
   // Whether data is being exported (or written to local registers).
   bool is_export() const { return data_.export_data == 1; }
+  bool export_write_mask() const { return data_.scalar_dest_rel == 1; }
@@ -1334,20 +1459,12 @@ struct AluInstruction {
   bool is_const_1_addressed() const { return data_.const_1_rel_abs == 1; }
   bool is_address_relative() const { return data_.address_absolute == 1; }
 
-  bool has_vector_op() const {
-    return vector_write_mask() || is_export() ||
-           AluVectorOpcodeHasSideEffects(vector_opcode());
-  }
   AluVectorOpcode vector_opcode() const { return data_.vector_opc; }
   uint32_t vector_write_mask() const { return data_.vector_write_mask; }
   uint32_t vector_dest() const { return data_.vector_dest; }
   bool is_vector_dest_relative() const { return data_.vector_dest_rel == 1; }
   bool vector_clamp() const { return data_.vector_clamp == 1; }
 
-  bool has_scalar_op() const {
-    return scalar_opcode() != AluScalarOpcode::kRetainPrev ||
-           (!is_export() && scalar_write_mask() != 0);
-  }
   AluScalarOpcode scalar_opcode() const { return data_.scalar_opc; }
   uint32_t scalar_write_mask() const { return data_.scalar_write_mask; }
   uint32_t scalar_dest() const { return data_.scalar_dest; }
@@ -1407,14 +1524,62 @@ struct AluInstruction {
     }
   }
 
+  // Helpers.
+
+  // Note that even if the export component is unused (like W of the vertex
+  // shader misc register, YZW of pixel shader depth), it must still not be
+  // excluded - that may make the disassembly not reassemblable if there are
+  // constant 0 writes in the export; for example, oPts.x000 will be assembled,
+  // but oPts.x00_ will not, even though W has no effect on anything.
+  uint32_t GetVectorOpResultWriteMask() const {
+    uint32_t mask = vector_write_mask();
+    if (is_export()) {
+      mask &= ~scalar_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetScalarOpResultWriteMask() const {
+    uint32_t mask = scalar_write_mask();
+    if (is_export()) {
+      mask &= ~vector_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetConstant0WriteMask() const {
+    if (!is_export() || !is_scalar_dest_relative()) {
+      return 0b0000;
+    }
+    return 0b1111 & ~(vector_write_mask() | scalar_write_mask());
+  }
+  uint32_t GetConstant1WriteMask() const {
+    if (!is_export()) {
+      return 0b0000;
+    }
+    return vector_write_mask() & scalar_write_mask();
+  }
 
  private:
   XEPACKEDSTRUCT(Data, {
     XEPACKEDSTRUCTANONYMOUS({
+      // If exporting, both vector and scalar operations use the vector
+      // destination (which can't be relative in this case).
+      // Not very important note: If both scalar and vector operations exporting
+      // something have an empty write mask, the XNA assembler forces vector_dest
+      // to 0 (interpolator 0 or color 0) directly in the microcode.
       uint32_t vector_dest : 6;
       uint32_t vector_dest_rel : 1;
       uint32_t abs_constants : 1;
       uint32_t scalar_dest : 6;
       uint32_t scalar_dest_rel : 1;
+      // Exports have different write masking (export is done to vector_dest by
+      // both the vector and the scalar operation, and exports can write
+      // constant 0 and 1). For each component:
+      // - vector_write_mask 0, scalar_write_mask 0:
+      //   - scalar_dest_rel 0 - unchanged.
+      //   - scalar_dest_rel 1 - constant 0 (all components must be written).
+      // - vector_write_mask 1, scalar_write_mask 0 - from the vector operation.
+      // - vector_write_mask 0, scalar_write_mask 1 - from the scalar operation.
+      // - vector_write_mask 1, scalar_write_mask 1 - constant 1.
       uint32_t export_data : 1;
       uint32_t vector_write_mask : 4;
       uint32_t scalar_write_mask : 4;
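As a worked example of the export write-mask table above (constructed, not from the commit): for an export with vector_write_mask 0b0011, scalar_write_mask 0b0110, and scalar_dest_rel set, component 0 comes from the vector op, component 1 is constant 1, component 2 comes from the scalar op, and component 3 is constant 0, so the four helpers return 0b0001, 0b0010, 0b0100, and 0b1000 respectively. A hypothetical sanity check (ExportWriteMasksPartition is not in the commit):

    // When an export has scalar_dest_rel set, the four masks partition all
    // four components: each component is written by exactly one source.
    inline bool ExportWriteMasksPartition(const AluInstruction& instr) {
      uint32_t v = instr.GetVectorOpResultWriteMask();
      uint32_t s = instr.GetScalarOpResultWriteMask();
      uint32_t c0 = instr.GetConstant0WriteMask();
      uint32_t c1 = instr.GetConstant1WriteMask();
      // Pairwise disjoint...
      if ((v & s) || (v & c0) || (v & c1) || (s & c0) || (s & c1) ||
          (c0 & c1)) {
        return false;
      }
      // ...and covering all components.
      return (v | s | c0 | c1) == 0b1111;
    }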
@@ -267,6 +267,7 @@ namespace shader_playground {
         "--shader_output=" + translatedDisasmPath,
         "--shader_output_type=" + outputType,
         "--vertex_shader_output_type=" + vertexShaderType,
+        "--dxbc_source_map=true",
       };
       if (translationComboBox.SelectedIndex == 1) {
         startArguments.Add("--shader_output_dxbc_rov=true");