diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 7c7280338..7331a7e2a 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -28,7 +28,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( uint32_t used_result_components = instr.vector_and_constant_result.GetUsedResultComponents(); if (!used_result_components && - !AluVectorOpHasSideEffects(instr.vector_opcode)) { + !ucode::GetAluVectorOpcodeInfo(instr.vector_opcode).changed_state) { return; } diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index d33baf565..427837a6e 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -561,12 +561,12 @@ struct ParsedAluInstruction { // instruction even if only constants are being exported. The XNA disassembler // falls back to displaying the whole vector operation, even if only constant // components are written, if the scalar operation is a nop or if the vector - // operation has side effects (but if the scalar operation isn't nop, it - // outputs the entire constant mask in the scalar operation destination). - // Normally the XNA disassembler outputs the constant mask in both vector and - // scalar operations, but that's not required by assembler, so it doesn't - // really matter whether it's specified in the vector operation, in the scalar - // operation, or in both. + // operation changes a0, p0 or kills pixels (but if the scalar operation isn't + // nop, it outputs the entire constant mask in the scalar operation + // destination). Normally the XNA disassembler outputs the constant mask in + // both vector and scalar operations, but that's not required by assembler, so + // it doesn't really matter whether it's specified in the vector operation, in + // the scalar operation, or in both. InstructionResult vector_and_constant_result; // Describes how the scalar operation result is stored. 
InstructionResult scalar_result; @@ -591,8 +591,8 @@ struct ParsedAluInstruction { // will result in the same microcode (since instructions with just an empty // write mask may have different values in other fields). // This is for disassembly! Translators should use the write masks and - // AluVectorOpHasSideEffects to skip operations, as this only covers one very - // specific nop format! + // the changed state bits in the opcode info to skip operations, as this only + // covers one very specific nop format! bool IsVectorOpDefaultNop() const; // Whether the scalar part of the instruction is the same as if it was omitted // in the assembly (if compiled or assembled with the Xbox 360 shader diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index d98fa5b7e..adc56656e 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -370,9 +370,12 @@ void Shader::GatherAluInstructionInformation( ParseAluInstruction(op, type(), instr); instr.Disassemble(&ucode_disasm_buffer); - kills_pixels_ = kills_pixels_ || - ucode::AluVectorOpcodeIsKill(op.vector_opcode()) || - ucode::AluScalarOpcodeIsKill(op.scalar_opcode()); + kills_pixels_ = + kills_pixels_ || + (ucode::GetAluVectorOpcodeInfo(op.vector_opcode()).changed_state & + ucode::kAluOpChangedStatePixelKill) || + (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state & + ucode::kAluOpChangedStatePixelKill); GatherAluResultInformation(instr.vector_and_constant_result, memexport_alloc_current_count); @@ -1055,99 +1058,6 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const { return result.GetUsedResultComponents() & components; } -struct AluOpcodeInfo { - const char* name; - uint32_t argument_count; - uint32_t src_swizzle_component_count; -}; - -static const AluOpcodeInfo alu_vector_opcode_infos[0x20] = { - {"add", 2, 4}, // 0 - {"mul", 2, 4}, // 1 - {"max", 2, 4}, // 2 - {"min", 2, 4}, // 3 - {"seq", 2, 4}, // 4 - {"sgt", 2, 4}, 
// 5 - {"sge", 2, 4}, // 6 - {"sne", 2, 4}, // 7 - {"frc", 1, 4}, // 8 - {"trunc", 1, 4}, // 9 - {"floor", 1, 4}, // 10 - {"mad", 3, 4}, // 11 - {"cndeq", 3, 4}, // 12 - {"cndge", 3, 4}, // 13 - {"cndgt", 3, 4}, // 14 - {"dp4", 2, 4}, // 15 - {"dp3", 2, 4}, // 16 - {"dp2add", 3, 4}, // 17 - {"cube", 2, 4}, // 18 - {"max4", 1, 4}, // 19 - {"setp_eq_push", 2, 4}, // 20 - {"setp_ne_push", 2, 4}, // 21 - {"setp_gt_push", 2, 4}, // 22 - {"setp_ge_push", 2, 4}, // 23 - {"kill_eq", 2, 4}, // 24 - {"kill_gt", 2, 4}, // 25 - {"kill_ge", 2, 4}, // 26 - {"kill_ne", 2, 4}, // 27 - {"dst", 2, 4}, // 28 - {"maxa", 2, 4}, // 29 -}; - -static const AluOpcodeInfo alu_scalar_opcode_infos[0x40] = { - {"adds", 1, 2}, // 0 - {"adds_prev", 1, 1}, // 1 - {"muls", 1, 2}, // 2 - {"muls_prev", 1, 1}, // 3 - {"muls_prev2", 1, 2}, // 4 - {"maxs", 1, 2}, // 5 - {"mins", 1, 2}, // 6 - {"seqs", 1, 1}, // 7 - {"sgts", 1, 1}, // 8 - {"sges", 1, 1}, // 9 - {"snes", 1, 1}, // 10 - {"frcs", 1, 1}, // 11 - {"truncs", 1, 1}, // 12 - {"floors", 1, 1}, // 13 - {"exp", 1, 1}, // 14 - {"logc", 1, 1}, // 15 - {"log", 1, 1}, // 16 - {"rcpc", 1, 1}, // 17 - {"rcpf", 1, 1}, // 18 - {"rcp", 1, 1}, // 19 - {"rsqc", 1, 1}, // 20 - {"rsqf", 1, 1}, // 21 - {"rsq", 1, 1}, // 22 - {"maxas", 1, 2}, // 23 - {"maxasf", 1, 2}, // 24 - {"subs", 1, 2}, // 25 - {"subs_prev", 1, 1}, // 26 - {"setp_eq", 1, 1}, // 27 - {"setp_ne", 1, 1}, // 28 - {"setp_gt", 1, 1}, // 29 - {"setp_ge", 1, 1}, // 30 - {"setp_inv", 1, 1}, // 31 - {"setp_pop", 1, 1}, // 32 - {"setp_clr", 0, 0}, // 33 - {"setp_rstr", 1, 1}, // 34 - {"kills_eq", 1, 1}, // 35 - {"kills_gt", 1, 1}, // 36 - {"kills_ge", 1, 1}, // 37 - {"kills_ne", 1, 1}, // 38 - {"kills_one", 1, 1}, // 39 - {"sqrt", 1, 1}, // 40 - {"UNKNOWN", 0, 0}, // 41 - {"mulsc", 2, 1}, // 42 - {"mulsc", 2, 1}, // 43 - {"addsc", 2, 1}, // 44 - {"addsc", 2, 1}, // 45 - {"subsc", 2, 1}, // 46 - {"subsc", 2, 1}, // 47 - {"sin", 1, 1}, // 48 - {"cos", 1, 1}, // 49 - {"retain_prev", 0, 0}, // 50 -}; - 
static void ParseAluInstructionOperand(const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count, InstructionOperand& out_op) { @@ -1290,9 +1200,10 @@ void ParseAluInstruction(const AluInstruction& op, // Vector operation and constant 0/1 writes. - instr.vector_opcode = op.vector_opcode(); - const auto& vector_opcode_info = - alu_vector_opcode_infos[uint32_t(instr.vector_opcode)]; + ucode::AluVectorOpcode vector_opcode = op.vector_opcode(); + instr.vector_opcode = vector_opcode; + const ucode::AluVectorOpcodeInfo& vector_opcode_info = + ucode::GetAluVectorOpcodeInfo(vector_opcode); instr.vector_opcode_name = vector_opcode_info.name; instr.vector_and_constant_result.storage_target = storage_target; @@ -1322,19 +1233,18 @@ void ParseAluInstruction(const AluInstruction& op, instr.vector_and_constant_result.components[i] = component; } - instr.vector_operand_count = vector_opcode_info.argument_count; + instr.vector_operand_count = vector_opcode_info.GetOperandCount(); for (uint32_t i = 0; i < instr.vector_operand_count; ++i) { InstructionOperand& vector_operand = instr.vector_operands[i]; - ParseAluInstructionOperand(op, i + 1, - vector_opcode_info.src_swizzle_component_count, - vector_operand); + ParseAluInstructionOperand(op, i + 1, 4, vector_operand); } // Scalar operation. 
- instr.scalar_opcode = op.scalar_opcode(); - const auto& scalar_opcode_info = - alu_scalar_opcode_infos[uint32_t(instr.scalar_opcode)]; + ucode::AluScalarOpcode scalar_opcode = op.scalar_opcode(); + instr.scalar_opcode = scalar_opcode; + const ucode::AluScalarOpcodeInfo& scalar_opcode_info = + ucode::GetAluScalarOpcodeInfo(scalar_opcode); instr.scalar_opcode_name = scalar_opcode_info.name; instr.scalar_result.storage_target = storage_target; @@ -1355,12 +1265,12 @@ void ParseAluInstruction(const AluInstruction& op, instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i); } - instr.scalar_operand_count = scalar_opcode_info.argument_count; + instr.scalar_operand_count = scalar_opcode_info.operand_count; if (instr.scalar_operand_count) { if (instr.scalar_operand_count == 1) { - ParseAluInstructionOperand(op, 3, - scalar_opcode_info.src_swizzle_component_count, - instr.scalar_operands[0]); + ParseAluInstructionOperand( + op, 3, scalar_opcode_info.single_operand_is_two_component ? 2 : 1, + instr.scalar_operands[0]); } else { // Constant and temporary register. 
@@ -1393,7 +1303,7 @@ void ParseAluInstruction(const AluInstruction& op, temp_op.is_negated = src3_negate; temp_op.is_absolute_value = op.abs_constants(); temp_op.storage_source = InstructionStorageSource::kRegister; - temp_op.storage_index = op.scalar_const_op_src_temp_reg(); + temp_op.storage_index = op.scalar_const_reg_op_src_temp_reg(); temp_op.storage_addressing_mode = InstructionStorageAddressingMode::kAbsolute; temp_op.component_count = 1; @@ -1423,7 +1333,7 @@ bool ParsedAluInstruction::IsNop() const { return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && !scalar_result.GetUsedWriteMask() && !vector_and_constant_result.GetUsedWriteMask() && - !ucode::AluVectorOpHasSideEffects(vector_opcode); + !ucode::GetAluVectorOpcodeInfo(vector_opcode).changed_state; } uint32_t ParsedAluInstruction::GetMemExportStreamConstant() const { diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 7ba883a19..cf1298e7d 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -2264,7 +2264,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation( close_predicated_block = false; if (!instr.vector_and_constant_result.GetUsedWriteMask() && - !AluVectorOpHasSideEffects(instr.vector_opcode)) { + !ucode::GetAluVectorOpcodeInfo(instr.vector_opcode).changed_state) { return false; } diff --git a/src/xenia/gpu/ucode.cc b/src/xenia/gpu/ucode.cc new file mode 100644 index 000000000..0efd5fb10 --- /dev/null +++ b/src/xenia/gpu/ucode.cc @@ -0,0 +1,120 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/ucode.h" + +namespace xe { +namespace gpu { +namespace ucode { + +const AluScalarOpcodeInfo kAluScalarOpcodeInfos[64] = { + {"adds", 1, true, kAluOpChangedStateNone}, + {"adds_prev", 1, false, kAluOpChangedStateNone}, + {"muls", 1, true, kAluOpChangedStateNone}, + {"muls_prev", 1, false, kAluOpChangedStateNone}, + {"muls_prev2", 1, true, kAluOpChangedStateNone}, + {"maxs", 1, true, kAluOpChangedStateNone}, + {"mins", 1, true, kAluOpChangedStateNone}, + {"seqs", 1, false, kAluOpChangedStateNone}, + {"sgts", 1, false, kAluOpChangedStateNone}, + {"sges", 1, false, kAluOpChangedStateNone}, + {"snes", 1, false, kAluOpChangedStateNone}, + {"frcs", 1, false, kAluOpChangedStateNone}, + {"truncs", 1, false, kAluOpChangedStateNone}, + {"floors", 1, false, kAluOpChangedStateNone}, + {"exp", 1, false, kAluOpChangedStateNone}, + {"logc", 1, false, kAluOpChangedStateNone}, + {"log", 1, false, kAluOpChangedStateNone}, + {"rcpc", 1, false, kAluOpChangedStateNone}, + {"rcpf", 1, false, kAluOpChangedStateNone}, + {"rcp", 1, false, kAluOpChangedStateNone}, + {"rsqc", 1, false, kAluOpChangedStateNone}, + {"rsqf", 1, false, kAluOpChangedStateNone}, + {"rsq", 1, false, kAluOpChangedStateNone}, + {"maxas", 1, true, kAluOpChangedStateAddressRegister}, + {"maxasf", 1, true, kAluOpChangedStateAddressRegister}, + {"subs", 1, true, kAluOpChangedStateNone}, + {"subs_prev", 1, false, kAluOpChangedStateNone}, + {"setp_eq", 1, false, kAluOpChangedStatePredicate}, + {"setp_ne", 1, false, kAluOpChangedStatePredicate}, + {"setp_gt", 1, false, kAluOpChangedStatePredicate}, + {"setp_ge", 1, false, kAluOpChangedStatePredicate}, + {"setp_inv", 1, false, kAluOpChangedStatePredicate}, + {"setp_pop", 1, false, kAluOpChangedStatePredicate}, + {"setp_clr", 0, false, kAluOpChangedStatePredicate}, + {"setp_rstr", 1, false, kAluOpChangedStatePredicate}, + {"kills_eq", 1, false, 
kAluOpChangedStatePixelKill}, + {"kills_gt", 1, false, kAluOpChangedStatePixelKill}, + {"kills_ge", 1, false, kAluOpChangedStatePixelKill}, + {"kills_ne", 1, false, kAluOpChangedStatePixelKill}, + {"kills_one", 1, false, kAluOpChangedStatePixelKill}, + {"sqrt", 1, false, kAluOpChangedStateNone}, + {"opcode_41", 0, false, kAluOpChangedStateNone}, + {"mulsc", 2, false, kAluOpChangedStateNone}, + {"mulsc", 2, false, kAluOpChangedStateNone}, + {"addsc", 2, false, kAluOpChangedStateNone}, + {"addsc", 2, false, kAluOpChangedStateNone}, + {"subsc", 2, false, kAluOpChangedStateNone}, + {"subsc", 2, false, kAluOpChangedStateNone}, + {"sin", 1, false, kAluOpChangedStateNone}, + {"cos", 1, false, kAluOpChangedStateNone}, + {"retain_prev", 0, false, kAluOpChangedStateNone}, + {"opcode_51", 0, false, kAluOpChangedStateNone}, + {"opcode_52", 0, false, kAluOpChangedStateNone}, + {"opcode_53", 0, false, kAluOpChangedStateNone}, + {"opcode_54", 0, false, kAluOpChangedStateNone}, + {"opcode_55", 0, false, kAluOpChangedStateNone}, + {"opcode_56", 0, false, kAluOpChangedStateNone}, + {"opcode_57", 0, false, kAluOpChangedStateNone}, + {"opcode_58", 0, false, kAluOpChangedStateNone}, + {"opcode_59", 0, false, kAluOpChangedStateNone}, + {"opcode_60", 0, false, kAluOpChangedStateNone}, + {"opcode_61", 0, false, kAluOpChangedStateNone}, + {"opcode_62", 0, false, kAluOpChangedStateNone}, + {"opcode_63", 0, false, kAluOpChangedStateNone}, +}; + +const AluVectorOpcodeInfo kAluVectorOpcodeInfos[32] = { + {"add", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"mul", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"max", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"min", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"seq", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"sgt", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"sge", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"sne", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"frc", {0b1111}, kAluOpChangedStateNone}, + {"trunc", {0b1111}, 
kAluOpChangedStateNone}, + {"floor", {0b1111}, kAluOpChangedStateNone}, + {"mad", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone}, + {"cndeq", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone}, + {"cndge", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone}, + {"cndgt", {0b1111, 0b1111, 0b1111}, kAluOpChangedStateNone}, + {"dp4", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"dp3", {0b0111, 0b0111}, kAluOpChangedStateNone}, + {"dp2add", {0b0011, 0b0011, 0b0001}, kAluOpChangedStateNone}, + {"cube", {0b1111, 0b1111}, kAluOpChangedStateNone}, + {"max4", {0b1111}, kAluOpChangedStateNone}, + {"setp_eq_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate}, + {"setp_ne_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate}, + {"setp_gt_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate}, + {"setp_ge_push", {0b1001, 0b1001}, kAluOpChangedStatePredicate}, + {"kill_eq", {0b1111, 0b1111}, kAluOpChangedStatePixelKill}, + {"kill_gt", {0b1111, 0b1111}, kAluOpChangedStatePixelKill}, + {"kill_ge", {0b1111, 0b1111}, kAluOpChangedStatePixelKill}, + {"kill_ne", {0b1111, 0b1111}, kAluOpChangedStatePixelKill}, + {"dst", {0b0110, 0b1010}, kAluOpChangedStateNone}, + {"maxa", {0b1111, 0b1111}, kAluOpChangedStateAddressRegister}, + {"opcode_30", {}, kAluOpChangedStateNone}, + {"opcode_31", {}, kAluOpChangedStateNone}, +}; + +} // namespace ucode +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 12c5886ac..47aabd63a 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -13,6 +13,7 @@ #include <cstdint> #include "xenia/base/assert.h" +#include "xenia/base/math.h" #include "xenia/base/platform.h" #include "xenia/gpu/xenos.h" @@ -900,8 +901,9 @@ static_assert_size(FetchInstruction, sizeof(uint32_t) * 3); // Conventions: // - All temporary registers are vec4s. 
// - Most scalar ALU operations work with one or two components of the source -// register passed as the third operand of the whole co-issued ALU operation, -// denoted by `a` (the left-hand operand) and `b` (the right-hand operand). +// register or the float constant passed as the third operand of the whole +// co-issued ALU operation, denoted by `a` (the left-hand operand) and `b` +// (the right-hand operand). // `a` is the [(3 + src3_swizzle[6:7]) & 3] component (W - alpha). // `b` is the [(0 + src3_swizzle[0:1]) & 3] component (X - red). // - mulsc, addsc, subsc scalar ALU operations accept two operands - a float @@ -948,6 +950,14 @@ static_assert_size(FetchInstruction, sizeof(uint32_t) * 3); // use instructions that may be interpreted by the host GPU as fused // multiply-add. +// For analysis of shaders and skipping instructions that write nothing. +enum AluOpChangedState { + kAluOpChangedStateNone = 0, + kAluOpChangedStateAddressRegister = 1 << 0, + kAluOpChangedStatePredicate = 1 << 1, + kAluOpChangedStatePixelKill = 1 << 2, +}; + enum class AluScalarOpcode : uint32_t { // Floating-Point Add // adds/ADDs dest, src0.ab @@ -1277,17 +1287,28 @@ enum class AluScalarOpcode : uint32_t { kRetainPrev = 50, }; -constexpr bool AluScalarOpcodeIsKill(AluScalarOpcode scalar_opcode) { - switch (scalar_opcode) { - case AluScalarOpcode::kKillsEq: - case AluScalarOpcode::kKillsGt: - case AluScalarOpcode::kKillsGe: - case AluScalarOpcode::kKillsNe: - case AluScalarOpcode::kKillsOne: - return true; - default: - return false; - } +struct AluScalarOpcodeInfo { + const char* name; + // 0 - no operands. + // 1 - one single-component (W) or two-component (WX) r# or c#. + // 2 - c#.w and r#.x. + uint32_t operand_count; + // If operand_count is 1, whether both W and X of the operand are used rather + // than only W. 
+ bool single_operand_is_two_component; + // Note that all scalar instructions except for retain_prev modify the + // previous scalar register, so they must be executed even if they don't write + // any result and don't perform any other state changes. + AluOpChangedState changed_state; +}; + +// 6 scalar opcode bits - 64 entries. +extern const AluScalarOpcodeInfo kAluScalarOpcodeInfos[64]; + +inline const AluScalarOpcodeInfo& GetAluScalarOpcodeInfo( + AluScalarOpcode opcode) { + assert_true(uint32_t(opcode) < xe::countof(kAluScalarOpcodeInfos)); + return kAluScalarOpcodeInfos[uint32_t(opcode)]; } enum class AluVectorOpcode : uint32_t { @@ -1385,6 +1406,9 @@ enum class AluVectorOpcode : uint32_t { // dest.y = src0.y * src1.y + src2.y; // dest.z = src0.z * src1.z + src2.z; // dest.w = src0.w * src1.w + src2.w; + // According to SQ_ALU::multiply_add (used in the isHardwareAccurate case) + // from IPR2015-00325 sq_alu, this is FMA - rounding to single-precision only + // after the addition. kMad = 11, // Per-Component Floating-Point Conditional Move If Equal @@ -1490,6 +1514,17 @@ enum class AluVectorOpcode : uint32_t { // } else { // dest.xyzw = src0.w; // } + // However, the comparisons may actually be >= - the XNA documentation on + // MSDN, as well as R600 and GCN documentation, describe `max` as being + // implemented via >= rather than >. `max4` is documented vaguely, without the + // exact calculations for each component - MSDN describes it as max(xyzw), and + // in the R600 documentation it's max(wzyx). There's also a case more similar + // to `max4` with a discrepancy between IPR2015-00325 sq_alu + // and the GCN documentation - `cube` has max3 in zyx priority order, and a >= + // comparison is used for this purpose on the GCN, but in IPR2015-00325 sq_alu + // it's implemented via >. It's possible that in an early version of the R400 + // the comparison was > and was later changed to >=, though this is merely a + // guess. 
kMax4 = 19, // Floating-Point Predicate Counter Increment If Equal @@ -1627,60 +1662,32 @@ enum class AluVectorOpcode : uint32_t { kMaxA = 29, }; -constexpr bool AluVectorOpcodeIsKill(AluVectorOpcode vector_opcode) { - switch (vector_opcode) { - case AluVectorOpcode::kKillEq: - case AluVectorOpcode::kKillGt: - case AluVectorOpcode::kKillGe: - case AluVectorOpcode::kKillNe: - return true; - default: - return false; - } -} +struct AluVectorOpcodeInfo { + const char* name; + uint32_t operand_components_used[3]; + AluOpChangedState changed_state; -// Whether the vector instruction has side effects such as discarding a pixel or -// setting the predicate and can't be ignored even if it doesn't write to -// anywhere. Note that all scalar operations except for retain_prev have a side -// effect of modifying the previous scalar result register, so they must always -// be executed even if not writing. -constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { - if (AluVectorOpcodeIsKill(vector_opcode)) { - return true; + uint32_t GetOperandCount() const { + if (!operand_components_used[2]) { + if (!operand_components_used[1]) { + if (!operand_components_used[0]) { + return 0; + } + return 1; + } + return 2; + } + return 3; } - switch (vector_opcode) { - case AluVectorOpcode::kSetpEqPush: - case AluVectorOpcode::kSetpNePush: - case AluVectorOpcode::kSetpGtPush: - case AluVectorOpcode::kSetpGePush: - case AluVectorOpcode::kMaxA: - return true; - default: - return false; - } -} +}; -// Whether each component of a source operand is used at all in the instruction -// (doesn't check the operand count though). -constexpr uint32_t GetAluVectorOpUsedSourceComponents( - AluVectorOpcode vector_opcode, uint32_t src_index) { - assert_not_zero(src_index); - switch (vector_opcode) { - case AluVectorOpcode::kDp3: - return 0b0111; - case AluVectorOpcode::kDp2Add: - return src_index == 3 ? 
0b0001 : 0b0011; - case AluVectorOpcode::kSetpEqPush: - case AluVectorOpcode::kSetpNePush: - case AluVectorOpcode::kSetpGtPush: - case AluVectorOpcode::kSetpGePush: - return 0b1001; - case AluVectorOpcode::kDst: - return src_index == 2 ? 0b1010 : 0b0110; - default: - break; - } - return 0b1111; +// 5 vector opcode bits - 32 entries. +extern const AluVectorOpcodeInfo kAluVectorOpcodeInfos[32]; + +inline const AluVectorOpcodeInfo& GetAluVectorOpcodeInfo( + AluVectorOpcode opcode) { + assert_true(uint32_t(opcode) < xe::countof(kAluVectorOpcodeInfos)); + return kAluVectorOpcodeInfos[uint32_t(opcode)]; } // Whether each component of a source operand is needed for the instruction if @@ -1688,7 +1695,7 @@ constexpr uint32_t GetAluVectorOpUsedSourceComponents( // undefined in translation. For per-component operations, for example, only the // components specified in the write mask are needed, but there are instructions // with special behavior for certain components. -constexpr uint32_t GetAluVectorOpNeededSourceComponents( +inline uint32_t GetAluVectorOpNeededSourceComponents( AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t used_result_components) { assert_not_zero(src_index); @@ -1721,8 +1728,8 @@ constexpr uint32_t GetAluVectorOpNeededSourceComponents( case AluVectorOpcode::kKillNe: components = 0b1111; break; - // kDst is per-component, but not all components are used - - // GetAluVectorOpUsedSourceComponents will filter out the unused ones. + // kDst is per-component, but not all components are used. + // operand_components_used will filter out the unused ones. 
case AluVectorOpcode::kMaxA: if (src_index == 1) { components |= 0b1000; @@ -1731,8 +1738,8 @@ constexpr uint32_t GetAluVectorOpNeededSourceComponents( default: break; } - return components & - GetAluVectorOpUsedSourceComponents(vector_opcode, src_index); + return components & GetAluVectorOpcodeInfo(vector_opcode) + .operand_components_used[src_index - 1]; } enum class ExportRegister : uint32_t { @@ -1787,7 +1794,6 @@ struct alignas(uint32_t) AluInstruction { // Whether data is being exported (or written to local registers). bool is_export() const { return data_.export_data == 1; } - bool export_write_mask() const { return data_.scalar_dest_rel == 1; } // Whether the jump is predicated (or conditional). bool is_predicated() const { return data_.is_predicated; } @@ -1921,7 +1927,7 @@ struct alignas(uint32_t) AluInstruction { } } - uint32_t scalar_const_op_src_temp_reg() const { + uint32_t scalar_const_reg_op_src_temp_reg() const { return (uint32_t(data_.scalar_opc) & 1) | (data_.src3_sel << 1) | (data_.src3_swiz & 0x3C); }