From fea430f1f91d6be82b6fa3b1775036aa2819071e Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 13 Apr 2022 23:08:19 +0300 Subject: [PATCH] [GPU] Fix scalar c[#+aL], shader docs/refactoring --- src/xenia/gpu/dxbc_shader_translator.cc | 18 +- src/xenia/gpu/shader.h | 24 +- src/xenia/gpu/shader_translator.cc | 229 ++++++++-------- src/xenia/gpu/shader_translator_disasm.cc | 12 +- src/xenia/gpu/spirv_shader_translator.cc | 12 +- src/xenia/gpu/ucode.h | 301 ++++++++++++++++++---- 6 files changed, 395 insertions(+), 201 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 350ea6895..0febe78f3 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1331,12 +1331,12 @@ dxbc::Src DxbcShaderTranslator::LoadOperand(const InstructionOperand& operand, dxbc::Index index(operand.storage_index); switch (operand.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: + case InstructionStorageAddressingMode::kAbsolute: break; - case InstructionStorageAddressingMode::kAddressAbsolute: + case InstructionStorageAddressingMode::kAddressRegisterRelative: index = dxbc::Index(system_temp_ps_pc_p0_a0_, 3, operand.storage_index); break; - case InstructionStorageAddressingMode::kAddressRelative: + case InstructionStorageAddressingMode::kLoopRelative: index = dxbc::Index(system_temp_aL_, 0, operand.storage_index); break; } @@ -1365,7 +1365,7 @@ dxbc::Src DxbcShaderTranslator::LoadOperand(const InstructionOperand& operand, src = dxbc::Src::R(temp); } else { assert_true(operand.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic); + InstructionStorageAddressingMode::kAbsolute); src = dxbc::Src::R(index.index_); } } break; @@ -1376,7 +1376,7 @@ dxbc::Src DxbcShaderTranslator::LoadOperand(const InstructionOperand& operand, const Shader::ConstantRegisterMap& constant_register_map = current_shader().constant_register_map(); if 
(operand.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { uint32_t float_constant_index = constant_register_map.GetPackedFloatConstantIndex( operand.storage_index); @@ -1429,13 +1429,13 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, if (current_shader().uses_register_dynamic_addressing()) { dxbc::Index register_index(result.storage_index); switch (result.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: + case InstructionStorageAddressingMode::kAbsolute: break; - case InstructionStorageAddressingMode::kAddressAbsolute: + case InstructionStorageAddressingMode::kAddressRegisterRelative: register_index = dxbc::Index(system_temp_ps_pc_p0_a0_, 3, result.storage_index); break; - case InstructionStorageAddressingMode::kAddressRelative: + case InstructionStorageAddressingMode::kLoopRelative: register_index = dxbc::Index(system_temp_aL_, 0, result.storage_index); break; @@ -1443,7 +1443,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, dest = dxbc::Dest::X(0, register_index); } else { assert_true(result.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic); + InstructionStorageAddressingMode::kAbsolute); dest = dxbc::Dest::R(result.storage_index); } break; diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 9603134d4..8422cafdc 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -44,7 +44,7 @@ namespace gpu { enum class InstructionStorageTarget { // Result is not stored. kNone, - // Result is stored to a temporary register indexed by storage_index [0-31]. + // Result is stored to a temporary register indexed by storage_index [0-63]. kRegister, // Result is stored into a vertex shader interpolator export [0-15]. 
kInterpolator, @@ -85,11 +85,13 @@ constexpr uint32_t GetInstructionStorageTargetUsedComponentCount( enum class InstructionStorageAddressingMode { // The storage index is not dynamically addressed. - kStatic, + kAbsolute, // The storage index is addressed by a0. - kAddressAbsolute, + // Float constants only. + kAddressRegisterRelative, // The storage index is addressed by aL. - kAddressRelative, + // Float constants and temporary registers only. + kLoopRelative, }; // Describes the source value of a particular component. @@ -111,6 +113,12 @@ enum class SwizzleSource { constexpr SwizzleSource GetSwizzleFromComponentIndex(uint32_t i) { return static_cast(i); } +constexpr SwizzleSource GetSwizzledAluSourceComponent( + uint32_t swizzle, uint32_t component_index) { + return GetSwizzleFromComponentIndex( + ucode::AluInstruction::GetSwizzledComponentIndex(swizzle, + component_index)); +} inline char GetCharForComponentIndex(uint32_t i) { const static char kChars[] = {'x', 'y', 'z', 'w'}; return kChars[i]; @@ -127,7 +135,7 @@ struct InstructionResult { uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; + InstructionStorageAddressingMode::kAbsolute; // True to clamp the result value to [0-1]. bool is_clamped = false; // Defines whether each output component is written, though this is from the @@ -191,9 +199,9 @@ struct InstructionResult { }; enum class InstructionStorageSource { - // Source is stored in a temporary register indexed by storage_index [0-31]. + // Source is stored in a temporary register indexed by storage_index [0-63]. kRegister, - // Source is stored in a float constant indexed by storage_index [0-511]. + // Source is stored in a float constant indexed by storage_index [0-255]. kConstantFloat, // Source is stored in a vertex fetch constant indexed by storage_index // [0-95]. 
@@ -210,7 +218,7 @@ struct InstructionOperand { uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; + InstructionStorageAddressingMode::kAbsolute; // True to negate the operand value. bool is_negated = false; // True to take the absolute value of the source (before any negation). diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 9c1837779..d98fa5b7e 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -247,22 +247,18 @@ void Shader::GatherExecInformation( if (sequence & 0b10) { ucode_disasm_buffer.Append(" serialize\n "); } + const uint32_t* op_ptr = ucode_data_.data() + instr_offset * 3; if (sequence & 0b01) { - auto fetch_opcode = FetchOpcode(ucode_data_[instr_offset * 3] & 0x1F); - if (fetch_opcode == FetchOpcode::kVertexFetch) { - auto& op = *reinterpret_cast( - ucode_data_.data() + instr_offset * 3); - GatherVertexFetchInformation(op, previous_vfetch_full, + auto& op = *reinterpret_cast(op_ptr); + if (op.opcode() == FetchOpcode::kVertexFetch) { + GatherVertexFetchInformation(op.vertex_fetch(), previous_vfetch_full, ucode_disasm_buffer); } else { - auto& op = *reinterpret_cast( - ucode_data_.data() + instr_offset * 3); - GatherTextureFetchInformation(op, unique_texture_bindings, - ucode_disasm_buffer); + GatherTextureFetchInformation( + op.texture_fetch(), unique_texture_bindings, ucode_disasm_buffer); } } else { - auto& op = *reinterpret_cast(ucode_data_.data() + - instr_offset * 3); + auto& op = *reinterpret_cast(op_ptr); GatherAluInstructionInformation(op, memexport_alloc_current_count, memexport_eA_written, ucode_disasm_buffer); @@ -420,7 +416,7 @@ void Shader::GatherOperandInformation(const InstructionOperand& operand) { switch (operand.storage_source) { case InstructionStorageSource::kRegister: if (operand.storage_addressing_mode == - 
InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { register_static_address_bound_ = std::max(register_static_address_bound_, operand.storage_index + uint32_t(1)); @@ -430,7 +426,7 @@ void Shader::GatherOperandInformation(const InstructionOperand& operand) { break; case InstructionStorageSource::kConstantFloat: if (operand.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { // Store used float constants before translating so the // translator can use tightly packed indices if not dynamically // indexed. @@ -457,7 +453,7 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) { // operand. assert_true(result.storage_target == InstructionStorageTarget::kRegister); if (result.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { register_static_address_bound_ = std::max( register_static_address_bound_, result.storage_index + uint32_t(1)); } else { @@ -473,7 +469,7 @@ void Shader::GatherAluResultInformation( switch (result.storage_target) { case InstructionStorageTarget::kRegister: if (result.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { register_static_address_bound_ = std::max( register_static_address_bound_, result.storage_index + uint32_t(1)); } else { @@ -789,28 +785,24 @@ void ShaderTranslator::TranslateExecInstructions( for (uint32_t instr_offset = instr.instruction_address; instr_offset < instr.instruction_address + instr.instruction_count; ++instr_offset, sequence >>= 2) { + const uint32_t* op_ptr = ucode_dwords + instr_offset * 3; if (sequence & 0b01) { - auto fetch_opcode = - static_cast(ucode_dwords[instr_offset * 3] & 0x1F); - if (fetch_opcode == FetchOpcode::kVertexFetch) { - auto& op = *reinterpret_cast( - ucode_dwords + instr_offset * 3); + auto& op = *reinterpret_cast(op_ptr); 
+ if (op.opcode() == FetchOpcode::kVertexFetch) { + const VertexFetchInstruction& vfetch_op = op.vertex_fetch(); ParsedVertexFetchInstruction vfetch_instr; - if (ParseVertexFetchInstruction(op, previous_vfetch_full_, + if (ParseVertexFetchInstruction(vfetch_op, previous_vfetch_full_, vfetch_instr)) { - previous_vfetch_full_ = op; + previous_vfetch_full_ = vfetch_op; } ProcessVertexFetchInstruction(vfetch_instr); } else { - auto& op = *reinterpret_cast( - ucode_dwords + instr_offset * 3); ParsedTextureFetchInstruction tfetch_instr; - ParseTextureFetchInstruction(op, tfetch_instr); + ParseTextureFetchInstruction(op.texture_fetch(), tfetch_instr); ProcessTextureFetchInstruction(tfetch_instr); } } else { - auto& op = *reinterpret_cast(ucode_dwords + - instr_offset * 3); + auto& op = *reinterpret_cast(op_ptr); ParsedAluInstruction alu_instr; ParseAluInstruction(op, current_shader().type(), alu_instr); ProcessAluInstruction(alu_instr); @@ -826,25 +818,40 @@ static void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle, result.storage_index = dest; result.is_clamped = false; result.storage_addressing_mode = - is_relative ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; + is_relative ? 
InstructionStorageAddressingMode::kLoopRelative + : InstructionStorageAddressingMode::kAbsolute; result.original_write_mask = 0b1111; for (int i = 0; i < 4; ++i) { - switch (swizzle & 0x7) { - case 4: - case 6: - result.components[i] = SwizzleSource::k0; + SwizzleSource component_source = SwizzleSource::k0; + ucode::FetchDestinationSwizzle component_swizzle = + ucode::GetFetchDestinationComponentSwizzle(swizzle, i); + switch (component_swizzle) { + case ucode::FetchDestinationSwizzle::kX: + component_source = SwizzleSource::kX; break; - case 5: - result.components[i] = SwizzleSource::k1; + case ucode::FetchDestinationSwizzle::kY: + component_source = SwizzleSource::kY; break; - case 7: - result.original_write_mask &= ~uint32_t(1 << i); + case ucode::FetchDestinationSwizzle::kZ: + component_source = SwizzleSource::kZ; + break; + case ucode::FetchDestinationSwizzle::kW: + component_source = SwizzleSource::kW; + break; + case ucode::FetchDestinationSwizzle::k1: + component_source = SwizzleSource::k1; + break; + case ucode::FetchDestinationSwizzle::kKeep: + result.original_write_mask &= ~(UINT32_C(1) << i); break; default: - result.components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); + // ucode::FetchDestinationSwizzle::k0 or the invalid swizzle 6. + // TODO(Triang3l): Find the correct handling of the invalid swizzle 6. + assert_true(component_swizzle == ucode::FetchDestinationSwizzle::k0); + component_source = SwizzleSource::k0; + break; } - swizzle >>= 3; + result.components[i] = component_source; } } @@ -867,8 +874,8 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op, src_op.storage_index = full_op.src(); src_op.storage_addressing_mode = full_op.is_src_relative() - ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; + ? 
InstructionStorageAddressingMode::kLoopRelative + : InstructionStorageAddressingMode::kAbsolute; src_op.is_negated = false; src_op.is_absolute_value = false; src_op.component_count = 1; @@ -962,8 +969,8 @@ void ParseTextureFetchInstruction(const TextureFetchInstruction& op, src_op.storage_source = InstructionStorageSource::kRegister; src_op.storage_index = op.src(); src_op.storage_addressing_mode = - op.is_src_relative() ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; + op.is_src_relative() ? InstructionStorageAddressingMode::kLoopRelative + : InstructionStorageAddressingMode::kAbsolute; src_op.is_negated = false; src_op.is_absolute_value = false; src_op.component_count = @@ -1144,91 +1151,51 @@ static const AluOpcodeInfo alu_scalar_opcode_infos[0x40] = { static void ParseAluInstructionOperand(const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count, InstructionOperand& out_op) { - int const_slot = 0; - switch (i) { - case 2: - const_slot = op.src_is_temp(1) ? 0 : 1; - break; - case 3: - const_slot = op.src_is_temp(1) && op.src_is_temp(2) ? 0 : 1; - break; - } out_op.is_negated = op.src_negate(i); uint32_t reg = op.src_reg(i); if (op.src_is_temp(i)) { out_op.storage_source = InstructionStorageSource::kRegister; - out_op.storage_index = reg & 0x1F; - out_op.is_absolute_value = (reg & 0x80) == 0x80; + out_op.storage_index = AluInstruction::src_temp_reg(reg); + out_op.is_absolute_value = AluInstruction::is_src_temp_value_absolute(reg); out_op.storage_addressing_mode = - (reg & 0x40) ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; + AluInstruction::is_src_temp_relative(reg) + ? 
InstructionStorageAddressingMode::kLoopRelative + : InstructionStorageAddressingMode::kAbsolute; } else { out_op.storage_source = InstructionStorageSource::kConstantFloat; out_op.storage_index = reg; - if ((const_slot == 0 && op.is_const_0_addressed()) || - (const_slot == 1 && op.is_const_1_addressed())) { - if (op.is_address_relative()) { + if (op.src_const_is_addressed(i)) { + if (op.is_const_address_register_relative()) { out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressAbsolute; + InstructionStorageAddressingMode::kAddressRegisterRelative; } else { out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; + InstructionStorageAddressingMode::kLoopRelative; } } else { out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; + InstructionStorageAddressingMode::kAbsolute; } out_op.is_absolute_value = op.abs_constants(); } out_op.component_count = swizzle_component_count; uint32_t swizzle = op.src_swizzle(i); if (swizzle_component_count == 1) { - uint32_t a = ((swizzle >> 6) + 3) & 0x3; - out_op.components[0] = GetSwizzleFromComponentIndex(a); + // Scalar `a` (W). + out_op.components[0] = GetSwizzledAluSourceComponent(swizzle, 3); } else if (swizzle_component_count == 2) { - uint32_t a = ((swizzle >> 6) + 3) & 0x3; - uint32_t b = ((swizzle >> 0) + 0) & 0x3; - out_op.components[0] = GetSwizzleFromComponentIndex(a); - out_op.components[1] = GetSwizzleFromComponentIndex(b); + // Scalar left-hand `a` (W) and right-hand `b` (X). 
+ out_op.components[0] = GetSwizzledAluSourceComponent(swizzle, 3); + out_op.components[1] = GetSwizzledAluSourceComponent(swizzle, 0); } else if (swizzle_component_count == 3) { assert_always(); } else if (swizzle_component_count == 4) { - for (uint32_t j = 0; j < swizzle_component_count; ++j, swizzle >>= 2) { - out_op.components[j] = GetSwizzleFromComponentIndex((swizzle + j) & 0x3); + for (uint32_t j = 0; j < swizzle_component_count; ++j) { + out_op.components[j] = GetSwizzledAluSourceComponent(swizzle, j); } } } -static void ParseAluInstructionOperandSpecial( - const AluInstruction& op, InstructionStorageSource storage_source, - uint32_t reg, bool negate, int const_slot, uint32_t component_index, - InstructionOperand& out_op) { - out_op.is_negated = negate; - out_op.is_absolute_value = op.abs_constants(); - out_op.storage_source = storage_source; - if (storage_source == InstructionStorageSource::kRegister) { - out_op.storage_index = reg & 0x7F; - out_op.storage_addressing_mode = InstructionStorageAddressingMode::kStatic; - } else { - out_op.storage_index = reg; - if ((const_slot == 0 && op.is_const_0_addressed()) || - (const_slot == 1 && op.is_const_1_addressed())) { - if (op.is_address_relative()) { - out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressAbsolute; - } else { - out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; - } - } else { - out_op.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; - } - } - out_op.component_count = 1; - out_op.components[0] = GetSwizzleFromComponentIndex(component_index); -} - bool ParsedAluInstruction::IsVectorOpDefaultNop() const { if (vector_opcode != ucode::AluVectorOpcode::kMax || vector_and_constant_result.original_write_mask || @@ -1237,14 +1204,14 @@ bool ParsedAluInstruction::IsVectorOpDefaultNop() const { InstructionStorageSource::kRegister || vector_operands[0].storage_index != 0 || vector_operands[0].storage_addressing_mode != - 
InstructionStorageAddressingMode::kStatic || + InstructionStorageAddressingMode::kAbsolute || vector_operands[0].is_negated || vector_operands[0].is_absolute_value || !vector_operands[0].IsStandardSwizzle() || vector_operands[1].storage_source != InstructionStorageSource::kRegister || vector_operands[1].storage_index != 0 || vector_operands[1].storage_addressing_mode != - InstructionStorageAddressingMode::kStatic || + InstructionStorageAddressingMode::kAbsolute || vector_operands[1].is_negated || vector_operands[1].is_absolute_value || !vector_operands[1].IsStandardSwizzle()) { return false; @@ -1253,7 +1220,7 @@ bool ParsedAluInstruction::IsVectorOpDefaultNop() const { InstructionStorageTarget::kRegister) { if (vector_and_constant_result.storage_index != 0 || vector_and_constant_result.storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { return false; } } else { @@ -1330,14 +1297,14 @@ void ParseAluInstruction(const AluInstruction& op, instr.vector_and_constant_result.storage_target = storage_target; instr.vector_and_constant_result.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; + InstructionStorageAddressingMode::kAbsolute; if (is_export) { instr.vector_and_constant_result.storage_index = storage_index_export; } else { instr.vector_and_constant_result.storage_index = op.vector_dest(); if (op.is_vector_dest_relative()) { instr.vector_and_constant_result.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; + InstructionStorageAddressingMode::kLoopRelative; } } instr.vector_and_constant_result.is_clamped = op.vector_clamp(); @@ -1372,14 +1339,14 @@ void ParseAluInstruction(const AluInstruction& op, instr.scalar_result.storage_target = storage_target; instr.scalar_result.storage_addressing_mode = - InstructionStorageAddressingMode::kStatic; + InstructionStorageAddressingMode::kAbsolute; if (is_export) { instr.scalar_result.storage_index = 
storage_index_export; } else { instr.scalar_result.storage_index = op.scalar_dest(); if (op.is_scalar_dest_relative()) { instr.scalar_result.storage_addressing_mode = - InstructionStorageAddressingMode::kAddressRelative; + InstructionStorageAddressingMode::kLoopRelative; } } instr.scalar_result.is_clamped = op.scalar_clamp(); @@ -1395,20 +1362,42 @@ void ParseAluInstruction(const AluInstruction& op, scalar_opcode_info.src_swizzle_component_count, instr.scalar_operands[0]); } else { + // Constant and temporary register. + + bool src3_negate = op.src_negate(3); uint32_t src3_swizzle = op.src_swizzle(3); - uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3; - uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3; - uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | - (static_cast(op.scalar_opcode()) & 1); - int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0; - ParseAluInstructionOperandSpecial( - op, InstructionStorageSource::kConstantFloat, op.src_reg(3), - op.src_negate(3), 0, component_a, instr.scalar_operands[0]); + // Left-hand constant operand (`a` - W swizzle). 
+ InstructionOperand& const_op = instr.scalar_operands[0]; + const_op.is_negated = src3_negate; + const_op.is_absolute_value = op.abs_constants(); + const_op.storage_source = InstructionStorageSource::kConstantFloat; + const_op.storage_index = op.src_reg(3); + if (op.src_const_is_addressed(3)) { + if (op.is_const_address_register_relative()) { + const_op.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRegisterRelative; + } else { + const_op.storage_addressing_mode = + InstructionStorageAddressingMode::kLoopRelative; + } + } else { + const_op.storage_addressing_mode = + InstructionStorageAddressingMode::kAbsolute; + } + const_op.component_count = 1; + const_op.components[0] = GetSwizzledAluSourceComponent(src3_swizzle, 3); - ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, - reg2, op.src_negate(3), const_slot, - component_b, instr.scalar_operands[1]); + // Right-hand temporary register operand (`b` - X swizzle). + InstructionOperand& temp_op = instr.scalar_operands[1]; + temp_op.is_negated = src3_negate; + temp_op.is_absolute_value = op.abs_constants(); + temp_op.storage_source = InstructionStorageSource::kRegister; + temp_op.storage_index = op.scalar_const_op_src_temp_reg(); + temp_op.storage_addressing_mode = + InstructionStorageAddressingMode::kAbsolute; + temp_op.component_count = 1; + temp_op.components[0] = GetSwizzledAluSourceComponent(src3_swizzle, 0); } } } @@ -1421,7 +1410,7 @@ bool ParsedAluInstruction::IsScalarOpDefaultNop() const { if (scalar_result.storage_target == InstructionStorageTarget::kRegister) { if (scalar_result.storage_index != 0 || scalar_result.storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { + InstructionStorageAddressingMode::kAbsolute) { return false; } } @@ -1446,7 +1435,7 @@ uint32_t ParsedAluInstruction::GetMemExportStreamConstant() const { vector_operands[2].storage_source == InstructionStorageSource::kConstantFloat && vector_operands[2].storage_addressing_mode 
== - InstructionStorageAddressingMode::kStatic && + InstructionStorageAddressingMode::kAbsolute && vector_operands[2].IsStandardSwizzle() && !vector_operands[2].is_negated && !vector_operands[2].is_absolute_value) { return vector_operands[2].storage_index; diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc index 8dd72413a..8c6440e3e 100644 --- a/src/xenia/gpu/shader_translator_disasm.cc +++ b/src/xenia/gpu/shader_translator_disasm.cc @@ -57,13 +57,13 @@ void DisassembleResultOperand(const InstructionResult& result, } if (uses_storage_index) { switch (result.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: + case InstructionStorageAddressingMode::kAbsolute: out->AppendFormat("{}", result.storage_index); break; - case InstructionStorageAddressingMode::kAddressAbsolute: + case InstructionStorageAddressingMode::kAddressRegisterRelative: out->AppendFormat("[{}+a0]", result.storage_index); break; - case InstructionStorageAddressingMode::kAddressRelative: + case InstructionStorageAddressingMode::kLoopRelative: out->AppendFormat("[{}+aL]", result.storage_index); break; } @@ -109,17 +109,17 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) { out->Append("_abs"); } switch (op.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: + case InstructionStorageAddressingMode::kAbsolute: if (op.is_absolute_value) { out->AppendFormat("[{}]", op.storage_index); } else { out->AppendFormat("{}", op.storage_index); } break; - case InstructionStorageAddressingMode::kAddressAbsolute: + case InstructionStorageAddressingMode::kAddressRegisterRelative: out->AppendFormat("[{}+a0]", op.storage_index); break; - case InstructionStorageAddressingMode::kAddressRelative: + case InstructionStorageAddressingMode::kLoopRelative: out->AppendFormat("[{}+aL]", op.storage_index); break; } diff --git a/src/xenia/gpu/spirv_shader_translator.cc 
b/src/xenia/gpu/spirv_shader_translator.cc index 1063e8e0c..7ba883a19 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -3110,16 +3110,16 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { } switch (op.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: { + case InstructionStorageAddressingMode::kAbsolute: { storage_index = b.makeUintConstant(storage_base + op.storage_index); } break; - case InstructionStorageAddressingMode::kAddressAbsolute: { + case InstructionStorageAddressingMode::kAddressRegisterRelative: { // storage_index + a0 storage_index = b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_), b.makeUintConstant(storage_base + op.storage_index)); } break; - case InstructionStorageAddressingMode::kAddressRelative: { + case InstructionStorageAddressingMode::kLoopRelative: { // storage_index + aL.x auto idx = b.createCompositeExtract(b.createLoad(aL_), uint_type_, 0); storage_index = @@ -3269,16 +3269,16 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, std::vector storage_offsets; // Offsets in nested arrays -> storage switch (result.storage_addressing_mode) { - case InstructionStorageAddressingMode::kStatic: { + case InstructionStorageAddressingMode::kAbsolute: { storage_index = b.makeUintConstant(result.storage_index); } break; - case InstructionStorageAddressingMode::kAddressAbsolute: { + case InstructionStorageAddressingMode::kAddressRegisterRelative: { // storage_index + a0 storage_index = b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_), b.makeUintConstant(result.storage_index)); } break; - case InstructionStorageAddressingMode::kAddressRelative: { + case InstructionStorageAddressingMode::kLoopRelative: { // storage_index + aL.x auto idx = b.createCompositeExtract(b.createLoad(aL_), uint_type_, 0); storage_index = b.createBinOp(spv::Op::OpIAdd, uint_type_, idx, diff --git a/src/xenia/gpu/ucode.h 
b/src/xenia/gpu/ucode.h index e86387535..12c5886ac 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2015 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -16,11 +16,45 @@ #include "xenia/base/platform.h" #include "xenia/gpu/xenos.h" -// Closest AMD doc: +// The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource, +// which, for TargetPlatform.Xbox360, can validate and assemble Xbox 360 shader +// microcode from Xbox 360 and Direct3D 9 shader assembly, returning the binary, +// as well as validation warnings and errors and the disassembly via the warning +// output. It is the primary source of information about the binary encoding of +// the instructions, as well as valid usage of instruction parameters and +// sequences. +// https://www.microsoft.com/en-us/download/details.aspx?id=39 +// (XNAGS31_setup.exe) +// Xenia provides a tool, tools/shader-playground, that invokes the assembler, +// displays the binary and the disassembly from the official assembler, and also +// shows the disassembly generated by Xenia, and passes it back to the assembler +// to validate Xenia's microcode parsing and disassembly by checking if +// reassembling the disassembly results in the same binary. 
+// +// The behavior and the parameters of some of the instructions were previously +// documented on MSDN in the XNA Game Studio programming guide: +// http://web.archive.org/web/20081211005537/http://msdn.microsoft.com/en-us/library/bb313877.aspx +// +// A great amount of documentation, such as the R400 sequencer specification and +// the official emulator code, was made available during the LG Electronics, +// Inc. v. ATI Technologies ULC "Multi-thread Graphics Processing System" patent +// dispute IPR2015-00325, with the motion to seal having been denied due to "a +// strong public policy interest in making all information filed in an inter +// partes review publicly available". Most of the documents attached, however, +// cover early versions - the development process - of the R400 architecture, so +// there are some differences from the final Xenos GPU (DOT2ADDv is defined +// differently, for example, and MUL/ADD/SUB_CONST are missing). +// https://portal.unifiedpatents.com/ptab/case/IPR2015-00325 +// +// Also, the R600, while having a different 5-scalar, as opposed to vec4|scalar, +// parallelism model and instruction encodings and targeting Direct3D 10 rather +// than 9, inherits a lot of instructions and architectural concepts from the +// R400. +// https://www.x.org/docs/AMD/old/r600isa.pdf +// https://developer.amd.com/wordpress/media/2012/10/r600isa.pdf // https://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf -// Microcode format differs, but most fields/enums are the same. -// This code comes from the freedreno project: +// Parts of this code also come from the freedreno project: // https://github.com/freedreno/freedreno/blob/master/includes/instr-a2xx.h /* * Copyright (c) 2012 Rob Clark @@ -156,7 +190,8 @@ struct ControlFlowExecInstruction { uint32_t address() const { return address_; } // Number of instructions being executed.
uint32_t count() const { return count_; } - // Sequence bits, 2 per instruction, indicating whether ALU or fetch. + // Sequence bits, 2 per instruction. + // [0] - ALU (0) or fetch (1), [1] - serialize. uint32_t sequence() const { return serialize_; } // Whether to reset the current predicate. bool clean() const { return clean_ == 1; } @@ -189,7 +224,8 @@ struct ControlFlowCondExecInstruction { uint32_t address() const { return address_; } // Number of instructions being executed. uint32_t count() const { return count_; } - // Sequence bits, 2 per instruction, indicating whether ALU or fetch. + // Sequence bits, 2 per instruction. + // [0] - ALU (0) or fetch (1), [1] - serialize. uint32_t sequence() const { return serialize_; } // Constant index used as the conditional. uint32_t bool_address() const { return bool_address_; } @@ -224,7 +260,8 @@ struct ControlFlowCondExecPredInstruction { uint32_t address() const { return address_; } // Number of instructions being executed. uint32_t count() const { return count_; } - // Sequence bits, 2 per instruction, indicating whether ALU or fetch. + // Sequence bits, 2 per instruction. + // [0] - ALU (0) or fetch (1), [1] - serialize. uint32_t sequence() const { return serialize_; } // Whether to reset the current predicate. bool clean() const { return clean_ == 1; } @@ -591,6 +628,24 @@ enum class FetchOpcode : uint32_t { kSetTextureGradientsVert = 26, }; +enum class FetchDestinationSwizzle { + // The component indices are absolute (not relative to the component itself, + // unlike in ALU operation sources). + kX = 0, + kY = 1, + kZ = 2, + kW = 3, + k0 = 4, + k1 = 5, + // Keep the current value of the destination register (don't write). 
+ kKeep = 7, +}; + +constexpr FetchDestinationSwizzle GetFetchDestinationComponentSwizzle( + uint32_t swizzle, uint32_t component) { + return FetchDestinationSwizzle((swizzle >> (3 * component)) & 0b111); +} + struct alignas(uint32_t) VertexFetchInstruction { FetchOpcode opcode() const { return data_.opcode_value; } @@ -614,29 +669,6 @@ struct alignas(uint32_t) VertexFetchInstruction { uint32_t src_swizzle() const { return data_.src_swiz; } bool is_src_relative() const { return data_.src_reg_am; } - // Returns true if the fetch actually fetches data. - // This may be false if it's used only to populate constants. - bool fetches_any_data() const { - uint32_t dst_swiz = data_.dst_swiz; - bool fetches_any_data = false; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - // 0.0 - } else if ((dst_swiz & 0x7) == 5) { - // 1.0 - } else if ((dst_swiz & 0x7) == 6) { - // ? - } else if ((dst_swiz & 0x7) == 7) { - // Previous register value. - } else { - fetches_any_data = true; - break; - } - dst_swiz >>= 3; - } - return fetches_any_data; - } - uint32_t prefetch_count() const { return data_.prefetch_count; } bool is_mini_fetch() const { return data_.is_mini_fetch == 1; } @@ -676,6 +708,7 @@ struct alignas(uint32_t) VertexFetchInstruction { uint32_t const_index_sel : 2; // Prefetch count minus 1. uint32_t prefetch_count : 3; + // Absolute, one component. uint32_t src_swiz : 2; }; struct { @@ -769,10 +802,11 @@ struct alignas(uint32_t) TextureFetchInstruction { uint32_t fetch_valid_only : 1; uint32_t const_index : 5; uint32_t tx_coord_denorm : 1; - uint32_t src_swiz : 6; // xyz + // Absolute, three components. 
+ uint32_t src_swiz : 6; }; struct { - uint32_t dst_swiz : 12; // xyzw + uint32_t dst_swiz : 12; xenos::TextureFilter mag_filter : 2; xenos::TextureFilter min_filter : 2; xenos::TextureFilter mip_filter : 2; @@ -801,21 +835,96 @@ struct alignas(uint32_t) TextureFetchInstruction { }; static_assert_size(TextureFetchInstruction, sizeof(uint32_t) * 3); +union alignas(uint32_t) FetchInstruction { + public: + FetchOpcode opcode() const { return data_.opcode_value; } + + // Whether the fetch is predicated (or conditional). + bool is_predicated() const { return data_.is_predicated; } + // Required condition value of the comparison (true or false). + bool predicate_condition() const { return data_.pred_condition == 1; } + + uint32_t dest() const { return data_.dst_reg; } + uint32_t dest_swizzle() const { return data_.dst_swiz; } + bool is_dest_relative() const { return data_.dst_reg_am; } + uint32_t src() const { return data_.src_reg; } + bool is_src_relative() const { return data_.src_reg_am; } + + // For FetchOpcode::kVertexFetch. + const VertexFetchInstruction& vertex_fetch() const { return vertex_fetch_; } + // For operations other than FetchOpcode::kVertexFetch. + const TextureFetchInstruction& texture_fetch() const { + return texture_fetch_; + } + + private: + struct Data { + struct { + FetchOpcode opcode_value : 5; + uint32_t src_reg : 6; + uint32_t src_reg_am : 1; + uint32_t dst_reg : 6; + uint32_t dst_reg_am : 1; + // Specific to vertex or texture fetch. + uint32_t : 1; + // [0-31], points to one tf# or three vf# constants. + uint32_t const_index : 5; + // Specific to vertex or texture fetch. + uint32_t : 7; + }; + struct { + uint32_t dst_swiz : 12; + // Specific to vertex or texture fetch. + uint32_t : 19; + uint32_t is_predicated : 1; + }; + struct { + // Specific to vertex or texture fetch.
+ uint32_t : 31; + uint32_t pred_condition : 1; + }; + }; + Data data_; + VertexFetchInstruction vertex_fetch_; + TextureFetchInstruction texture_fetch_; +}; +static_assert_size(FetchInstruction, sizeof(uint32_t) * 3); + // What follows is largely a mash up of the microcode assembly naming and the -// R600 docs that have a near 1:1 with the instructions available in the xenos +// R600 docs that have a near 1:1 with the instructions available in the Xenos // GPU, and Adreno 2xx instruction names found in Freedreno. Some of the -// behavior has been experimentally verified. Some has been guessed. -// Docs: https://www.x.org/docs/AMD/old/r600isa.pdf +// behavior has been experimentally verified. Some has been guessed. Some +// instructions are implemented in the Exhibit 2092 - sq_alu of IPR2015-00325, +// however, the code provided there is early and incomplete. // // Conventions: // - All temporary registers are vec4s. -// - Scalar ops swizzle out a single component of their source registers denoted -// by 'a' or 'b'. src0.a means 'the first component specified for src0' and -// src0.ab means 'two components specified for src0, in order'. -// - Scalar ops write the result to the entire destination register. -// - pv and ps are the previous results of a vector or scalar ALU operation. -// Both are valid only within the current ALU clause. They are not modified -// when the instruction that would write them fails its predication check. +// - Most scalar ALU operations work with one or two components of the source +// register passed as the third operand of the whole co-issued ALU operation, +// denoted by `a` (the left-hand operand) and `b` (the right-hand operand). +// `a` is the [(3 + src3_swizzle[6:7]) & 3] component (W - alpha). +// `b` is the [(0 + src3_swizzle[0:1]) & 3] component (X - red). 
+// - mulsc, addsc, subsc scalar ALU operations accept two operands - a float +// constant with the `a` (W) swizzle (addressed by the third operand index and +// addressing mode) being the left-hand operand, and a temporary register with +// the `b` (X) swizzle with the index constructed from: +// - [0:0] = scalar_opcode[0:0] +// - [1:1] = src3_sel[0:0] +// - [2:5] = src3_swizzle[2:5] +// abs_constants and third source's negation are applied to both the constant +// and the temporary register. +// - Some scalar ALU instructions don't have operands. +// - Scalar ALU operations replicate the result into all masked components. +// - Overall, the WXYZ order is pretty commonly used in the Exhibit 2092 - +// sq_alu of IPR2015-00325, this is where the AB = WX order of scalar operands +// likely comes from. Vector predicate instructions also involve the W and X +// components, and in IPR2015-00325 sq_alu, individual components in the +// emulated vector instructions are handled in the WXYZ order. However, max4's +// "greater than the rest" check order is RGBA (XYZW) there. dp4, though, sums +// the products in WXYZ order in IPR2015-00325 sq_alu (but in XYZW order on +// MSDN). +// - ps is the previous result of a scalar ALU operation. It is not modified +// when the instruction that would write it fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for // multiplication (+-0 or denormal * anything = +0) wherever it's present // (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect @@ -1137,6 +1246,9 @@ enum class AluScalarOpcode : uint32_t { // dest.xyzw = sqrt(src0.a); kSqrt = 40, + // 0 and 1 are the same instruction - one bit of the register index is stored + // in the opcode field. 
+ // mulsc/MUL_CONST_0 dest, src0.a, src1.a kMulsc0 = 42, // mulsc/MUL_CONST_1 dest, src0.a, src1.a @@ -1303,19 +1415,24 @@ enum class AluVectorOpcode : uint32_t { // dp4/DOT4v dest, src0, src1 // dest.xyzw = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + // src0.w * src1.w; - // Note: only pv.x contains the value. kDp4 = 15, // Three-Element Dot Product // dp3/DOT3v dest, src0, src1 // dest.xyzw = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z; - // Note: only pv.x contains the value. kDp3 = 16, // Two-Element Dot Product and Add // dp2add/DOT2ADDv dest, src0, src1, src2 // dest.xyzw = src0.x * src1.x + src0.y * src1.y + src2.x; - // Note: only pv.x contains the value. + // IPR2015-00325 sq_alu may be an outdated and unreliable reference (Sequencer + // Parts Development folder history lists a few changes regarding the swizzle + // in dot2add, sq_alu though implements the instruction as + // src0.x * src1.x + src0.z * src1.z + src2.y, but MSDN specifies the correct + // order as provided in the beginning of this comment, further proven by + // assembling PC shader assembly using XNA, with Shader Model 2 dp2add being + // translated directly into Xenos dp2add without additional swizzling). + // http://web.archive.org/web/20100705150552/http://msdn.microsoft.com/en-us/library/bb313922.aspx kDp2Add = 17, // Cube Map @@ -1363,8 +1480,16 @@ enum class AluVectorOpcode : uint32_t { // Four-Element Maximum // max4/MAX4v dest, src0 - // dest.xyzw = max(src0.x, src0.y, src0.z, src0.w); - // Note: only pv.x contains the value. 
+ // According to IPR2015-00325 sq_alu: + // if (src0.x > src0.y && src0.x > src0.z && src0.x > src0.w) { + // dest.xyzw = src0.x; + // } else if (src0.y > src0.z && src0.y > src0.w) { + // dest.xyzw = src0.y; + // } else if (src0.z > src0.w) { + // dest.xyzw = src0.z; + // } else { + // dest.xyzw = src0.w; + // } kMax4 = 19, // Floating-Point Predicate Counter Increment If Equal @@ -1672,7 +1797,9 @@ struct alignas(uint32_t) AluInstruction { bool abs_constants() const { return data_.abs_constants == 1; } bool is_const_0_addressed() const { return data_.const_0_rel_abs == 1; } bool is_const_1_addressed() const { return data_.const_1_rel_abs == 1; } - bool is_address_relative() const { return data_.address_absolute == 1; } + bool is_const_address_register_relative() const { + return data_.const_address_register_relative == 1; + } AluVectorOpcode vector_opcode() const { return data_.vector_opc; } uint32_t vector_write_mask() const { return data_.vector_write_mask; } @@ -1686,6 +1813,18 @@ struct alignas(uint32_t) AluInstruction { bool is_scalar_dest_relative() const { return data_.scalar_dest_rel == 1; } bool scalar_clamp() const { return data_.scalar_clamp == 1; } + static constexpr uint32_t src_temp_reg(uint32_t src_reg) { + return src_reg & 0x3F; + } + static constexpr bool is_src_temp_relative(uint32_t src_reg) { + return (src_reg & 0x40) != 0; + } + static constexpr bool is_src_temp_value_absolute(uint32_t src_reg) { + return (src_reg & 0x80) != 0; + } + // Full register index for constants, packed structure for temporary + // registers (unpack using src_temp_reg, is_src_temp_relative, + // is_src_temp_value_absolute). 
uint32_t src_reg(size_t i) const { switch (i) { case 1: @@ -1702,16 +1841,59 @@ struct alignas(uint32_t) AluInstruction { bool src_is_temp(size_t i) const { switch (i) { case 1: - return data_.src1_sel == 1; + return bool(data_.src1_sel); case 2: - return data_.src2_sel == 1; + return bool(data_.src2_sel); case 3: - return data_.src3_sel == 1; + return bool(data_.src3_sel); default: assert_unhandled_case(i); return 0; } } + // Whether the specified operand is actually a constant is disregarded in this + // function so its scope is limited to just parsing the structure's layout - + // to decide whether to use relative addressing for the operand as a whole, + // check externally whether the operand is actually a constant first. + // + // For the constant operand in mulsc, addsc, subsc, this should be called for + // the operand index 3. Note that the XNA disassembler takes the addressing + // mode for the constant scalar operand unconditionally from const_1_rel_abs, + // and ignores the +aL for it unless the scalar operation is co-issued with a + // vector operation reading from a constant. However, the XNA assembler treats + // the constant scalar operand as a constant in the third operand, and places + // the addressing mode for it in const_0_rel_abs if no other constants are + // used in the whole ALU instruction. The validator also doesn't report + // anything if +aL is used when the constant scalar operand is the only + // constant in the instruction (and explicitly calls it the third constant in + // the error message in case both vector operands are constants, and different + // addressing modes are used for the second vector operand and the constant + // scalar operand). Passing the disassembly produced by XNA back to the + // assembler results in different microcode in this case. 
This indicates that + // most likely there's a bug in the XNA disassembler, and that the addressing + // mode for the constant scalar operand should actually be taken the same way + // as for the third vector operand - from const_0_rel_abs if there are no + // constant vector operands, or from const_1_rel_abs if there is at least one. + bool src_const_is_addressed(size_t i) const { + // "error X7100: When three constants are used in one instruction, the + // second and third constant must either both be non-relative, or both be + // relative." + // Whether to use const_0_rel_abs or const_1_rel_abs is essentially + // min(sum of whether the previous operands are constants, 1). + switch (i) { + case 1: + return bool(data_.const_0_rel_abs); + case 2: + return bool(src_is_temp(1) ? data_.const_0_rel_abs + : data_.const_1_rel_abs); + case 3: + return bool((src_is_temp(1) && src_is_temp(2)) ? data_.const_0_rel_abs + : data_.const_1_rel_abs); + default: + assert_unhandled_case(i); + return false; + } + } uint32_t src_swizzle(size_t i) const { switch (i) { case 1: @@ -1739,8 +1921,20 @@ struct alignas(uint32_t) AluInstruction { } } + uint32_t scalar_const_op_src_temp_reg() const { + return (uint32_t(data_.scalar_opc) & 1) | (data_.src3_sel << 1) | + (data_.src3_swiz & 0x3C); + } + // Helpers. + // Returns the absolute component index calculated from the relative swizzle + // in an ALU instruction. + static constexpr uint32_t GetSwizzledComponentIndex( + uint32_t swizzle, uint32_t component_index) { + return ((swizzle >> (2 * component_index)) + component_index) & 3; + } + // Note that even if the export component is unused (like W of the vertex // shader misc register, YZW of pixel shader depth), it must still not be // excluded - that may make disassembly not reassemblable if there are @@ -1803,6 +1997,7 @@ struct alignas(uint32_t) AluInstruction { AluScalarOpcode scalar_opc : 6; }; struct { + // Swizzles are component-relative. 
uint32_t src3_swiz : 8; uint32_t src2_swiz : 8; uint32_t src1_swiz : 8; @@ -1811,7 +2006,9 @@ struct alignas(uint32_t) AluInstruction { uint32_t src1_reg_negate : 1; uint32_t pred_condition : 1; uint32_t is_predicated : 1; - uint32_t address_absolute : 1; + // Temporary registers can have only absolute and aL-relative indices, not + // a0-relative. + uint32_t const_address_register_relative : 1; uint32_t const_1_rel_abs : 1; uint32_t const_0_rel_abs : 1; };