[GPU] Shader ALU refactoring + documentation

Mainly moves instruction info from the ShaderTranslator to xe::gpu::ucode for future use in the CPU shader interpreter.
2022-04-27 20:52:20 +03:00 · 2022-04-27 20:52:20 +03:00 · b42680abf7
parent df9a37f798
commit b42680abf7
6 changed files with 229 additions and 193 deletions
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
@ -28,7 +28,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
  uint32_t used_result_components =
      instr.vector_and_constant_result.GetUsedResultComponents();
  if (!used_result_components &&
-      !AluVectorOpHasSideEffects(instr.vector_opcode)) {
+      !ucode::GetAluVectorOpcodeInfo(instr.vector_opcode).changed_state) {
    return;
  }

--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@ -561,12 +561,12 @@ struct ParsedAluInstruction {
  // instruction even if only constants are being exported. The XNA disassembler
  // falls back to displaying the whole vector operation, even if only constant
  // components are written, if the scalar operation is a nop or if the vector
-  // operation has side effects (but if the scalar operation isn't nop, it
-  // outputs the entire constant mask in the scalar operation destination).
-  // Normally the XNA disassembler outputs the constant mask in both vector and
-  // scalar operations, but that's not required by assembler, so it doesn't
-  // really matter whether it's specified in the vector operation, in the scalar
-  // operation, or in both.
+  // operation changes a0, p0 or kills pixels (but if the scalar operation isn't
+  // nop, it outputs the entire constant mask in the scalar operation
+  // destination). Normally the XNA disassembler outputs the constant mask in
+  // both vector and scalar operations, but that's not required by assembler, so
+  // it doesn't really matter whether it's specified in the vector operation, in
+  // the scalar operation, or in both.
  InstructionResult vector_and_constant_result;
  // Describes how the scalar operation result is stored.
  InstructionResult scalar_result;
@ -591,8 +591,8 @@ struct ParsedAluInstruction {
  // will result in the same microcode (since instructions with just an empty
  // write mask may have different values in other fields).
  // This is for disassembly! Translators should use the write masks and
-  // AluVectorOpHasSideEffects to skip operations, as this only covers one very
-  // specific nop format!
+  // the changed state bits in the opcode info to skip operations, as this only
+  // covers one very specific nop format!
  bool IsVectorOpDefaultNop() const;
  // Whether the scalar part of the instruction is the same as if it was omitted
  // in the assembly (if compiled or assembled with the Xbox 360 shader
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@ -370,9 +370,12 @@ void Shader::GatherAluInstructionInformation(
  ParseAluInstruction(op, type(), instr);
  instr.Disassemble(&ucode_disasm_buffer);

-  kills_pixels_ = kills_pixels_ ||
-                  ucode::AluVectorOpcodeIsKill(op.vector_opcode()) ||
-                  ucode::AluScalarOpcodeIsKill(op.scalar_opcode());
+  kills_pixels_ =
+      kills_pixels_ ||
+      (ucode::GetAluVectorOpcodeInfo(op.vector_opcode()).changed_state &
+       ucode::kAluOpChangedStatePixelKill) ||
+      (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state &
+       ucode::kAluOpChangedStatePixelKill);

  GatherAluResultInformation(instr.vector_and_constant_result,
                             memexport_alloc_current_count);
@ -1055,99 +1058,6 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const {
  return result.GetUsedResultComponents() & components;
 }

-struct AluOpcodeInfo {
-  const char* name;
-  uint32_t argument_count;
-  uint32_t src_swizzle_component_count;
-};
-
-static const AluOpcodeInfo alu_vector_opcode_infos[0x20] = {
-    {"add", 2, 4},           // 0
-    {"mul", 2, 4},           // 1
-    {"max", 2, 4},           // 2
-    {"min", 2, 4},           // 3
-    {"seq", 2, 4},           // 4
-    {"sgt", 2, 4},           // 5
-    {"sge", 2, 4},           // 6
-    {"sne", 2, 4},           // 7
-    {"frc", 1, 4},           // 8
-    {"trunc", 1, 4},         // 9
-    {"floor", 1, 4},         // 10
-    {"mad", 3, 4},           // 11
-    {"cndeq", 3, 4},         // 12
-    {"cndge", 3, 4},         // 13
-    {"cndgt", 3, 4},         // 14
-    {"dp4", 2, 4},           // 15
-    {"dp3", 2, 4},           // 16
-    {"dp2add", 3, 4},        // 17
-    {"cube", 2, 4},          // 18
-    {"max4", 1, 4},          // 19
-    {"setp_eq_push", 2, 4},  // 20
-    {"setp_ne_push", 2, 4},  // 21
-    {"setp_gt_push", 2, 4},  // 22
-    {"setp_ge_push", 2, 4},  // 23
-    {"kill_eq", 2, 4},       // 24
-    {"kill_gt", 2, 4},       // 25
-    {"kill_ge", 2, 4},       // 26
-    {"kill_ne", 2, 4},       // 27
-    {"dst", 2, 4},           // 28
-    {"maxa", 2, 4},          // 29
-};
-
-static const AluOpcodeInfo alu_scalar_opcode_infos[0x40] = {
-    {"adds", 1, 2},         // 0
-    {"adds_prev", 1, 1},    // 1
-    {"muls", 1, 2},         // 2
-    {"muls_prev", 1, 1},    // 3
-    {"muls_prev2", 1, 2},   // 4
-    {"maxs", 1, 2},         // 5
-    {"mins", 1, 2},         // 6
-    {"seqs", 1, 1},         // 7
-    {"sgts", 1, 1},         // 8
-    {"sges", 1, 1},         // 9
-    {"snes", 1, 1},         // 10
-    {"frcs", 1, 1},         // 11
-    {"truncs", 1, 1},       // 12
-    {"floors", 1, 1},       // 13
-    {"exp", 1, 1},          // 14
-    {"logc", 1, 1},         // 15
-    {"log", 1, 1},          // 16
-    {"rcpc", 1, 1},         // 17
-    {"rcpf", 1, 1},         // 18
-    {"rcp", 1, 1},          // 19
-    {"rsqc", 1, 1},         // 20
-    {"rsqf", 1, 1},         // 21
-    {"rsq", 1, 1},          // 22
-    {"maxas", 1, 2},        // 23
-    {"maxasf", 1, 2},       // 24
-    {"subs", 1, 2},         // 25
-    {"subs_prev", 1, 1},    // 26
-    {"setp_eq", 1, 1},      // 27
-    {"setp_ne", 1, 1},      // 28
-    {"setp_gt", 1, 1},      // 29
-    {"setp_ge", 1, 1},      // 30
-    {"setp_inv", 1, 1},     // 31
-    {"setp_pop", 1, 1},     // 32
-    {"setp_clr", 0, 0},     // 33
-    {"setp_rstr", 1, 1},    // 34
-    {"kills_eq", 1, 1},     // 35
-    {"kills_gt", 1, 1},     // 36
-    {"kills_ge", 1, 1},     // 37
-    {"kills_ne", 1, 1},     // 38
-    {"kills_one", 1, 1},    // 39
-    {"sqrt", 1, 1},         // 40
-    {"UNKNOWN", 0, 0},      // 41
-    {"mulsc", 2, 1},        // 42
-    {"mulsc", 2, 1},        // 43
-    {"addsc", 2, 1},        // 44
-    {"addsc", 2, 1},        // 45
-    {"subsc", 2, 1},        // 46
-    {"subsc", 2, 1},        // 47
-    {"sin", 1, 1},          // 48
-    {"cos", 1, 1},          // 49
-    {"retain_prev", 0, 0},  // 50
-};
-
 static void ParseAluInstructionOperand(const AluInstruction& op, uint32_t i,
                                       uint32_t swizzle_component_count,
                                       InstructionOperand& out_op) {
@ -1290,9 +1200,10 @@ void ParseAluInstruction(const AluInstruction& op,

  // Vector operation and constant 0/1 writes.

-  instr.vector_opcode = op.vector_opcode();
-  const auto& vector_opcode_info =
-      alu_vector_opcode_infos[uint32_t(instr.vector_opcode)];
+  ucode::AluVectorOpcode vector_opcode = op.vector_opcode();
+  instr.vector_opcode = vector_opcode;
+  const ucode::AluVectorOpcodeInfo& vector_opcode_info =
+      ucode::GetAluVectorOpcodeInfo(vector_opcode);
  instr.vector_opcode_name = vector_opcode_info.name;

  instr.vector_and_constant_result.storage_target = storage_target;
@ -1322,19 +1233,18 @@ void ParseAluInstruction(const AluInstruction& op,
    instr.vector_and_constant_result.components[i] = component;
  }

-  instr.vector_operand_count = vector_opcode_info.argument_count;
+  instr.vector_operand_count = vector_opcode_info.GetOperandCount();
  for (uint32_t i = 0; i < instr.vector_operand_count; ++i) {
    InstructionOperand& vector_operand = instr.vector_operands[i];
-    ParseAluInstructionOperand(op, i + 1,
-                               vector_opcode_info.src_swizzle_component_count,
-                               vector_operand);
+    ParseAluInstructionOperand(op, i + 1, 4, vector_operand);
  }

  // Scalar operation.

-  instr.scalar_opcode = op.scalar_opcode();
-  const auto& scalar_opcode_info =
-      alu_scalar_opcode_infos[uint32_t(instr.scalar_opcode)];
+  ucode::AluScalarOpcode scalar_opcode = op.scalar_opcode();
+  instr.scalar_opcode = scalar_opcode;
+  const ucode::AluScalarOpcodeInfo& scalar_opcode_info =
+      ucode::GetAluScalarOpcodeInfo(scalar_opcode);
  instr.scalar_opcode_name = scalar_opcode_info.name;

  instr.scalar_result.storage_target = storage_target;
@ -1355,12 +1265,12 @@ void ParseAluInstruction(const AluInstruction& op,
    instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i);
  }

-  instr.scalar_operand_count = scalar_opcode_info.argument_count;
+  instr.scalar_operand_count = scalar_opcode_info.operand_count;
  if (instr.scalar_operand_count) {
    if (instr.scalar_operand_count == 1) {
-      ParseAluInstructionOperand(op, 3,
-                                 scalar_opcode_info.src_swizzle_component_count,
-                                 instr.scalar_operands[0]);
+      ParseAluInstructionOperand(
+          op, 3, scalar_opcode_info.single_operand_is_two_component ? 2 : 1,
+          instr.scalar_operands[0]);
    } else {
      // Constant and temporary register.

@ -1393,7 +1303,7 @@ void ParseAluInstruction(const AluInstruction& op,
      temp_op.is_negated = src3_negate;
      temp_op.is_absolute_value = op.abs_constants();
      temp_op.storage_source = InstructionStorageSource::kRegister;
-      temp_op.storage_index = op.scalar_const_op_src_temp_reg();
+      temp_op.storage_index = op.scalar_const_reg_op_src_temp_reg();
      temp_op.storage_addressing_mode =
          InstructionStorageAddressingMode::kAbsolute;
      temp_op.component_count = 1;
@ -1423,7 +1333,7 @@ bool ParsedAluInstruction::IsNop() const {
  return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
         !scalar_result.GetUsedWriteMask() &&
         !vector_and_constant_result.GetUsedWriteMask() &&
-         !ucode::AluVectorOpHasSideEffects(vector_opcode);
+         !ucode::GetAluVectorOpcodeInfo(vector_opcode).changed_state;
 }

 uint32_t ParsedAluInstruction::GetMemExportStreamConstant() const {
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@ -2264,7 +2264,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation(
  close_predicated_block = false;

  if (!instr.vector_and_constant_result.GetUsedWriteMask() &&
-      !AluVectorOpHasSideEffects(instr.vector_opcode)) {
+      !ucode::GetAluVectorOpcodeInfo(instr.vector_opcode).changed_state) {
    return false;
  }

--- a/src/xenia/gpu/ucode.cc
+++ b/src/xenia/gpu/ucode.cc
@ -0,0 +1,120 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/gpu/ucode.h"
+
+namespace xe {
+namespace gpu {
+namespace ucode {
+
+const AluScalarOpcodeInfo kAluScalarOpcodeInfos[64] = {
+    {"adds", 1, true, kAluOpChangedStateNone},
+    {"adds_prev", 1, false, kAluOpChangedStateNone},
+    {"muls", 1, true, kAluOpChangedStateNone},
+    {"muls_prev", 1, false, kAluOpChangedStateNone},
+    {"muls_prev2", 1, true, kAluOpChangedStateNone},
+    {"maxs", 1, true, kAluOpChangedStateNone},
+    {"mins", 1, true, kAluOpChangedStateNone},
+    {"seqs", 1, false, kAluOpChangedStateNone},
+    {"sgts", 1, false, kAluOpChangedStateNone},
+    {"sges", 1, false, kAluOpChangedStateNone},
+    {"snes", 1, false, kAluOpChangedStateNone},
+    {"frcs", 1, false, kAluOpChangedStateNone},
+    {"truncs", 1, false, kAluOpChangedStateNone},
+    {"floors", 1, false, kAluOpChangedStateNone},
+    {"exp", 1, false, kAluOpChangedStateNone},
+    {"logc", 1, false, kAluOpChangedStateNone},
+    {"log", 1, false, kAluOpChangedStateNone},
+    {"rcpc", 1, false, kAluOpChangedStateNone},
+    {"rcpf", 1, false, kAluOpChangedStateNone},
+    {"rcp", 1, false, kAluOpChangedStateNone},
+    {"rsqc", 1, false, kAluOpChangedStateNone},
+    {"rsqf", 1, false, kAluOpChangedStateNone},
+    {"rsq", 1, false, kAluOpChangedStateNone},
+    {"maxas", 1, true, kAluOpChangedStateAddressRegister},
+    {"maxasf", 1, true, kAluOpChangedStateAddressRegister},
+    {"subs", 1, true, kAluOpChangedStateNone},
+    {"subs_prev", 1, false, kAluOpChangedStateNone},
+    {"setp_eq", 1, false, kAluOpChangedStatePredicate},
+    {"setp_ne", 1, false, kAluOpChangedStatePredicate},
+    {"setp_gt", 1, false, kAluOpChangedStatePredicate},
+    {"setp_ge", 1, false, kAluOpChangedStatePredicate},
+    {"setp_inv", 1, false, kAluOpChangedStatePredicate},
+    {"setp_pop", 1, false, kAluOpChangedStatePredicate},
+    {"setp_clr", 0, false, kAluOpChangedStatePredicate},
+    {"setp_rstr", 1, false, kAluOpChangedStatePredicate},
+    {"kills_eq", 1, false, kAluOpChangedStatePixelKill},
+    {"kills_gt", 1, false, kAluOpChangedStatePixelKill},
+    {"kills_ge", 1, false, kAluOpChangedStatePixelKill},
+    {"kills_ne", 1, false, kAluOpChangedStatePixelKill},
+    {"kills_one", 1, false, kAluOpChangedStatePixelKill},
+    {"sqrt", 1, false, kAluOpChangedStateNone},
+    {"opcode_41", 0, false, kAluOpChangedStateNone},
+    {"mulsc", 2, false, kAluOpChangedStateNone},
+    {"mulsc", 2, false, kAluOpChangedStateNone},
+    {"addsc", 2, false, kAluOpChangedStateNone},
+    {"addsc", 2, false, kAluOpChangedStateNone},
+    {"subsc", 2, false, kAluOpChangedStateNone},
+    {"subsc", 2, false, kAluOpChangedStateNone},
+    {"sin", 1, false, kAluOpChangedStateNone},
+    {"cos", 1, false, kAluOpChangedStateNone},
+    {"retain_prev", 0, false, kAluOpChangedStateNone},
+    {"opcode_51", 0, false, kAluOpChangedStateNone},
+    {"opcode_52", 0, false, kAluOpChangedStateNone},
+    {"opcode_53", 0, false, kAluOpChangedStateNone},
+    {"opcode_54", 0, false, kAluOpChangedStateNone},
+    {"opcode_55", 0, false, kAluOpChangedStateNone},
+    {"opcode_56", 0, false, kAluOpChangedStateNone},
+    {"opcode_57", 0, false, kAluOpChangedStateNone},
+    {"opcode_58", 0, false, kAluOpChangedStateNone},
+    {"opcode_59", 0, false, kAluOpChangedStateNone},
+    {"opcode_60", 0, false, kAluOpChangedStateNone},
+    {"opcode_61", 0, false, kAluOpChangedStateNone},
+    {"opcode_62", 0, false, kAluOpChangedStateNone},
+    {"opcode_63", 0, false, kAluOpChangedStateNone},
+};
+
+const AluVectorOpcodeInfo kAluVectorOpcodeInfos[32] = {
+    {"add", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"mul", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"max", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"min", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"seq", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"sgt", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"sge", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"sne", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"frc", {0b1111}, kAluOpChangedStateNone},
+    {"trunc", {0b1111}, kAluOpChangedStateNone},
+    {"floor", {0b1111}, kAluOpChangedStateNone},
+    {"mad", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"cndeq", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"cndge", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"cndgt", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"dp4", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"dp3", {0b0111, 0b0111}, kAluOpChangedStateNone},
+    {"dp2add", {0b0011, 0b0011, 0b0001}, kAluOpChangedStateNone},
+    {"cube", {0b1111, 0b1111}, kAluOpChangedStateNone},
+    {"max4", {0b1111}, kAluOpChangedStateNone},
+    {"setp_eq_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate},
+    {"setp_ne_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate},
+    {"setp_gt_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate},
+    {"setp_ge_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate},
+    {"kill_eq", {0b1111, 0b1111}, kAluOpChangedStatePixelKill},
+    {"kill_gt", {0b1111, 0b1111}, kAluOpChangedStatePixelKill},
+    {"kill_ge", {0b1111, 0b1111}, kAluOpChangedStatePixelKill},
+    {"kill_ne", {0b1111, 0b1111}, kAluOpChangedStatePixelKill},
+    {"dst", {0b0110, 0b1010}, kAluOpChangedStateNone},
+    {"maxa", {0b1111, 0b1111}, kAluOpChangedStateAddressRegister},
+    {"opcode_30", {}, kAluOpChangedStateNone},
+    {"opcode_31", {}, kAluOpChangedStateNone},
+};
+
+}  // namespace ucode
+}  // namespace gpu
+}  // namespace xe
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@ -13,6 +13,7 @@
 #include <cstdint>

 #include "xenia/base/assert.h"
+#include "xenia/base/math.h"
 #include "xenia/base/platform.h"
 #include "xenia/gpu/xenos.h"

@ -900,8 +901,9 @@ static_assert_size(FetchInstruction, sizeof(uint32_t) * 3);
 // Conventions:
 // - All temporary registers are vec4s.
 // - Most scalar ALU operations work with one or two components of the source
-//   register passed as the third operand of the whole co-issued ALU operation,
-//   denoted by `a` (the left-hand operand) and `b` (the right-hand operand).
+//   register or the float constant passed as the third operand of the whole
+//   co-issued ALU operation, denoted by `a` (the left-hand operand) and `b`
+//   (the right-hand operand).
 //   `a` is the [(3 + src3_swizzle[6:7]) & 3] component (W - alpha).
 //   `b` is the [(0 + src3_swizzle[0:1]) & 3] component (X - red).
 // - mulsc, addsc, subsc scalar ALU operations accept two operands - a float
@ -948,6 +950,14 @@ static_assert_size(FetchInstruction, sizeof(uint32_t) * 3);
 //   use instructions that may be interpreted by the host GPU as fused
 //   multiply-add.

+// For analysis of shaders and skipping instructions that write nothing.
+enum AluOpChangedState {
+  kAluOpChangedStateNone = 0,
+  kAluOpChangedStateAddressRegister = 1 << 0,
+  kAluOpChangedStatePredicate = 1 << 1,
+  kAluOpChangedStatePixelKill = 1 << 2,
+};
+
 enum class AluScalarOpcode : uint32_t {
  // Floating-Point Add
  // adds/ADDs dest, src0.ab
@ -1277,17 +1287,28 @@ enum class AluScalarOpcode : uint32_t {
  kRetainPrev = 50,
 };

-constexpr bool AluScalarOpcodeIsKill(AluScalarOpcode scalar_opcode) {
-  switch (scalar_opcode) {
-    case AluScalarOpcode::kKillsEq:
-    case AluScalarOpcode::kKillsGt:
-    case AluScalarOpcode::kKillsGe:
-    case AluScalarOpcode::kKillsNe:
-    case AluScalarOpcode::kKillsOne:
-      return true;
-    default:
-      return false;
-  }
+struct AluScalarOpcodeInfo {
+  const char* name;
+  // 0 - no operands.
+  // 1 - one single-component (W) or two-component (WX) r# or c#.
+  // 2 - c#.w and r#.x.
+  uint32_t operand_count;
+  // If operand_count is 1, whether both W and X of the operand are used rather
+  // than only W.
+  bool single_operand_is_two_component;
+  // Note that all scalar instructions except for retain_prev modify the
+  // previous scalar register, so they must be executed even if they don't write
+  // any result and don't perform any other state changes.
+  AluOpChangedState changed_state;
+};
+
+// 6 scalar opcode bits - 64 entries.
+extern const AluScalarOpcodeInfo kAluScalarOpcodeInfos[64];
+
+inline const AluScalarOpcodeInfo& GetAluScalarOpcodeInfo(
+    AluScalarOpcode opcode) {
+  assert_true(uint32_t(opcode) < xe::countof(kAluScalarOpcodeInfos));
+  return kAluScalarOpcodeInfos[uint32_t(opcode)];
 }

 enum class AluVectorOpcode : uint32_t {
@ -1385,6 +1406,9 @@ enum class AluVectorOpcode : uint32_t {
  //     dest.y = src0.y * src1.y + src2.y;
  //     dest.z = src0.z * src1.z + src2.z;
  //     dest.w = src0.w * src1.w + src2.w;
+  // According to SQ_ALU::multiply_add (used in the isHardwareAccurate case)
+  // from IPR2015-00325 sq_alu, this is FMA - rounding to single-precision only
+  // after the addition.
  kMad = 11,

  // Per-Component Floating-Point Conditional Move If Equal
@ -1490,6 +1514,17 @@ enum class AluVectorOpcode : uint32_t {
  //     } else {
  //       dest.xyzw = src0.w;
  //     }
+  // However, the comparisons may actually be >= - the XNA documentation on
+  // MSDN, as well as the R600 and GCN documentation, describes `max` as
+  // being implemented via >= rather than >. `max4` is documented vaguely,
+  // without the exact calculations for each component - MSDN describes it as
+  // max(xyzw), and the R600 documentation as max(wzyx). A case closer to
+  // `max4` shows a similar discrepancy between IPR2015-00325 sq_alu and the
+  // GCN documentation: `cube` has max3 in zyx priority order, for which the
+  // GCN uses a >= comparison, while in IPR2015-00325 sq_alu it's implemented
+  // via >. It's possible that in an early version of the R400 the comparison
+  // was >, and that it was later changed to >=, but this is merely a
+  // guess.
  kMax4 = 19,

  // Floating-Point Predicate Counter Increment If Equal
@ -1627,60 +1662,32 @@ enum class AluVectorOpcode : uint32_t {
  kMaxA = 29,
 };

-constexpr bool AluVectorOpcodeIsKill(AluVectorOpcode vector_opcode) {
-  switch (vector_opcode) {
-    case AluVectorOpcode::kKillEq:
-    case AluVectorOpcode::kKillGt:
-    case AluVectorOpcode::kKillGe:
-    case AluVectorOpcode::kKillNe:
-      return true;
-    default:
-      return false;
-  }
-}
+struct AluVectorOpcodeInfo {
+  const char* name;
+  uint32_t operand_components_used[3];
+  AluOpChangedState changed_state;

-// Whether the vector instruction has side effects such as discarding a pixel or
-// setting the predicate and can't be ignored even if it doesn't write to
-// anywhere. Note that all scalar operations except for retain_prev have a side
-// effect of modifying the previous scalar result register, so they must always
-// be executed even if not writing.
-constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) {
-  if (AluVectorOpcodeIsKill(vector_opcode)) {
-    return true;
+  uint32_t GetOperandCount() const {
+    if (!operand_components_used[2]) {
+      if (!operand_components_used[1]) {
+        if (!operand_components_used[0]) {
+          return 0;
+        }
+        return 1;
+      }
+      return 2;
+    }
+    return 3;
  }
-  switch (vector_opcode) {
-    case AluVectorOpcode::kSetpEqPush:
-    case AluVectorOpcode::kSetpNePush:
-    case AluVectorOpcode::kSetpGtPush:
-    case AluVectorOpcode::kSetpGePush:
-    case AluVectorOpcode::kMaxA:
-      return true;
-    default:
-      return false;
-  }
-}
+};

-// Whether each component of a source operand is used at all in the instruction
-// (doesn't check the operand count though).
-constexpr uint32_t GetAluVectorOpUsedSourceComponents(
-    AluVectorOpcode vector_opcode, uint32_t src_index) {
-  assert_not_zero(src_index);
-  switch (vector_opcode) {
-    case AluVectorOpcode::kDp3:
-      return 0b0111;
-    case AluVectorOpcode::kDp2Add:
-      return src_index == 3 ? 0b0001 : 0b0011;
-    case AluVectorOpcode::kSetpEqPush:
-    case AluVectorOpcode::kSetpNePush:
-    case AluVectorOpcode::kSetpGtPush:
-    case AluVectorOpcode::kSetpGePush:
-      return 0b1001;
-    case AluVectorOpcode::kDst:
-      return src_index == 2 ? 0b1010 : 0b0110;
-    default:
-      break;
-  }
-  return 0b1111;
+// 5 vector opcode bits - 32 entries.
+extern const AluVectorOpcodeInfo kAluVectorOpcodeInfos[32];
+
+inline const AluVectorOpcodeInfo& GetAluVectorOpcodeInfo(
+    AluVectorOpcode opcode) {
+  assert_true(uint32_t(opcode) < xe::countof(kAluVectorOpcodeInfos));
+  return kAluVectorOpcodeInfos[uint32_t(opcode)];
 }

 // Whether each component of a source operand is needed for the instruction if
@ -1688,7 +1695,7 @@ constexpr uint32_t GetAluVectorOpUsedSourceComponents(
 // undefined in translation. For per-component operations, for example, only the
 // components specified in the write mask are needed, but there are instructions
 // with special behavior for certain components.
-constexpr uint32_t GetAluVectorOpNeededSourceComponents(
+inline uint32_t GetAluVectorOpNeededSourceComponents(
    AluVectorOpcode vector_opcode, uint32_t src_index,
    uint32_t used_result_components) {
  assert_not_zero(src_index);
@ -1721,8 +1728,8 @@ constexpr uint32_t GetAluVectorOpNeededSourceComponents(
    case AluVectorOpcode::kKillNe:
      components = 0b1111;
      break;
-    // kDst is per-component, but not all components are used -
-    // GetAluVectorOpUsedSourceComponents will filter out the unused ones.
+    // kDst is per-component, but not all components are used.
+    // operand_components_used will filter out the unused ones.
    case AluVectorOpcode::kMaxA:
      if (src_index == 1) {
        components |= 0b1000;
@ -1731,8 +1738,8 @@ constexpr uint32_t GetAluVectorOpNeededSourceComponents(
    default:
      break;
  }
-  return components &
-         GetAluVectorOpUsedSourceComponents(vector_opcode, src_index);
+  return components & GetAluVectorOpcodeInfo(vector_opcode)
+                          .operand_components_used[src_index - 1];
 }

 enum class ExportRegister : uint32_t {
@ -1787,7 +1794,6 @@ struct alignas(uint32_t) AluInstruction {

  // Whether data is being exported (or written to local registers).
  bool is_export() const { return data_.export_data == 1; }
-  bool export_write_mask() const { return data_.scalar_dest_rel == 1; }

  // Whether the jump is predicated (or conditional).
  bool is_predicated() const { return data_.is_predicated; }
@ -1921,7 +1927,7 @@ struct alignas(uint32_t) AluInstruction {
    }
  }

-  uint32_t scalar_const_op_src_temp_reg() const {
+  uint32_t scalar_const_reg_op_src_temp_reg() const {
    return (uint32_t(data_.scalar_opc) & 1) | (data_.src3_sel << 1) |
           (data_.src3_swiz & 0x3C);
  }