diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 278162d63..29bb913a5 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2961,6 +2961,14 @@ bool D3D12CommandProcessor::UpdateBindings( (!samplers_written_pixel_ || current_samplers_hash_pixel_ != samplers_hash_pixel); + // These are the constant base addresses/ranges for shaders. + // We have these hardcoded right now because nothing seems to differ on the Xbox + // 360 (however, OpenGL ES on Adreno 200 on Android has different ranges). + assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || + regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || + regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); + // Check if the float constant layout is still the same and get the counts. const Shader::ConstantRegisterMap& float_constant_map_vertex = vertex_shader->constant_register_map(); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 8151d90eb..2a694d59a 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -809,14 +809,6 @@ bool PipelineCache::EnsureShadersTranslated( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, Shader::HostVertexShaderType host_vertex_shader_type) { auto& regs = *register_file_; - - // These are the constant base addresses/ranges for shaders. - // We have these hardcoded right now cause nothing seems to differ. - assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || - regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); - assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || - regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); - auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>(); // Edge flags are not supported yet (because polygon primitives are not). diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 5f3ed1c38..31f2a680e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -18,6 +18,7 @@ #include "xenia/base/assert.h" #include "xenia/base/cvar.h" +#include "xenia/base/math.h" DEFINE_bool(dxbc_switch, true, "Use switch rather than if for flow control. Turning this off or " @@ -86,7 +87,6 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, // Don't allocate again and again for the first shader. shader_code_.reserve(8192); shader_object_.reserve(16384); - float_constant_index_offsets_.reserve(512); } DxbcShaderTranslator::~DxbcShaderTranslator() = default; @@ -161,8 +161,6 @@ void DxbcShaderTranslator::Reset() { cbuffer_index_fetch_constants_ = kCbufferIndexUnallocated; system_constants_used_ = 0; - float_constants_dynamic_indexed_ = false; - float_constant_index_offsets_.clear(); in_control_point_index_used_ = false; @@ -1166,29 +1164,6 @@ void DxbcShaderTranslator::CompleteShaderCode() { // Release system_temps_subroutine_. PopSystemTemp(system_temps_subroutine_count_); - - // Remap float constant indices if not indexed dynamically.
- if (!float_constants_dynamic_indexed_ && - !float_constant_index_offsets_.empty()) { - uint8_t float_constant_map[256] = {}; - uint32_t float_constant_count = 0; - for (uint32_t i = 0; i < 4; ++i) { - uint64_t float_constants_used = constant_register_map().float_bitmap[i]; - uint32_t float_constant_index; - while ( - xe::bit_scan_forward(float_constants_used, &float_constant_index)) { - float_constants_used &= ~(1ull << float_constant_index); - float_constant_map[i * 64 + float_constant_index] = - float_constant_count++; - } - } - size_t index_count = float_constant_index_offsets_.size(); - for (size_t i = 0; i < index_count; ++i) { - uint32_t index_offset = float_constant_index_offsets_[i]; - shader_code_[index_offset] = - float_constant_map[shader_code_[index_offset] & 255]; - } - } } std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() { @@ -1420,7 +1395,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, kSwizzleXYZW, 2)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(operand.storage_index)); + shader_code_.push_back(operand.storage_index); } else { shader_code_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | @@ -1433,7 +1408,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( D3D10_SB_OPERAND_INDEX_IMMEDIATE32, D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(operand.storage_index)); + shader_code_.push_back(operand.storage_index); shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1)); shader_code_.push_back(dynamic_address_register); @@ -1445,7 +1420,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( assert_true(operand.storage_addressing_mode == InstructionStorageAddressingMode::kStatic); dxbc_operand.type = DxbcSourceOperand::Type::kRegister; - dxbc_operand.index = uint32_t(operand.storage_index); + dxbc_operand.index = operand.storage_index; } break; @@ -1457,11 +1432,18 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand( cbuffer_index_float_constants_ = cbuffer_count_++; } dxbc_operand.type = DxbcSourceOperand::Type::kConstantFloat; - dxbc_operand.index = uint32_t(operand.storage_index); dxbc_operand.addressing_mode = operand.storage_addressing_mode; - if (operand.storage_addressing_mode != + if (operand.storage_addressing_mode == InstructionStorageAddressingMode::kStatic) { - float_constants_dynamic_indexed_ = true; + uint32_t float_constant_index = + constant_register_map().GetPackedFloatConstantIndex( + operand.storage_index); + assert_true(float_constant_index != UINT32_MAX); + dxbc_operand.index = + float_constant_index != UINT32_MAX ? float_constant_index : 0; + } else { + assert_true(constant_register_map().float_dynamic_addressing); + dxbc_operand.index = operand.storage_index; } break; @@ -1652,11 +1634,6 @@ void DxbcShaderTranslator::UseDxbcSourceOperand( } shader_code_.push_back(cbuffer_index_float_constants_); shader_code_.push_back(uint32_t(CbufferRegister::kFloatConstants)); - if (!float_constants_dynamic_indexed_) { - // If there's no dynamic indexing in the shader, constants are compacted - // and remapped. Store where the index has been written.
- float_constant_index_offsets_.push_back(uint32_t(shader_code_.size())); - } shader_code_.push_back(operand.index); if (!is_static) { uint32_t dynamic_address_register, dynamic_address_component; @@ -1718,8 +1695,9 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand( void DxbcShaderTranslator::StoreResult(const InstructionResult& result, uint32_t reg, bool replicate_x, bool can_store_memexport_address) { + uint32_t used_write_mask = result.GetUsedWriteMask(); if (result.storage_target == InstructionStorageTarget::kNone || - !result.has_any_writes()) { + !used_write_mask) { return; } @@ -1744,10 +1722,9 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped); // Scalar targets get only one component. + // TODO(Triang3l): It's not replicated, it's X specifically. if (result.storage_target == InstructionStorageTarget::kDepth) { - if (!result.write_mask[0]) { - return; - } + assert_not_zero(used_write_mask & 0b0001); SwizzleSource component = result.components[0]; if (replicate_x && component <= SwizzleSource::kW) { component = SwizzleSource::kX; } @@ -1802,7 +1779,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, uint32_t constant_mask = 0; uint32_t constant_values = 0; for (uint32_t i = 0; i < 4; ++i) { - if (!result.write_mask[i]) { + if (!(used_write_mask & (1 << i))) { continue; } SwizzleSource component = result.components[i]; @@ -1858,7 +1835,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32 : D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE)); shader_code_.push_back(0); - shader_code_.push_back(uint32_t(result.storage_index)); + shader_code_.push_back(result.storage_index); if (!is_static) { shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1)); @@ -1874,11 +1851,11 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, saturate_bit); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(uint32_t(result.storage_index)); + shader_code_.push_back(result.storage_index); } break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: ++stat_.instruction_count; ++stat_.mov_instruction_count; shader_code_.push_back( @@ -1943,7 +1920,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, [uint32_t(result.storage_index)]); break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: ++stat_.instruction_count; ++stat_.mov_instruction_count; shader_code_.push_back( @@ -1952,8 +1929,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, saturate_bit); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back( - system_temps_color_[uint32_t(result.storage_index)]); + shader_code_.push_back(system_temps_color_[result.storage_index]); break; default: @@ -1989,13 +1965,13 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back( - 1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3))); + uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3))); ++stat_.instruction_count; ++stat_.uint_instruction_count; } if (edram_rov_used_ && - result.storage_target == 
InstructionStorageTarget::kColorTarget) { + result.storage_target == InstructionStorageTarget::kColor) { // For ROV output, mark that the color has been written to. // According to: // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color @@ -2014,7 +1990,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, shader_code_.push_back(system_temp_rov_params_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(1 << (8 + uint32_t(result.storage_index))); + shader_code_.push_back(1 << (8 + result.storage_index)); ++stat_.instruction_count; ++stat_.uint_instruction_count; } @@ -2479,19 +2455,6 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator:: }; void DxbcShaderTranslator::WriteResourceDefinitions() { - // *************************************************************************** - // Preparation - // *************************************************************************** - - // Float constant count. - uint32_t float_constant_count = 0; - if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { - for (uint32_t i = 0; i < 4; ++i) { - float_constant_count += - xe::bit_count(constant_register_map().float_bitmap[i]); - } - } - uint32_t chunk_position_dwords = uint32_t(shader_object_.size()); uint32_t new_offset; @@ -2583,7 +2546,8 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { if (RdefTypeIndex(i) == RdefTypeIndex::kFloat4ConstantArray) { // Declaring a 0-sized array may not be safe, so write something valid // even if they aren't used. - shader_object_.push_back(std::max(float_constant_count, 1u)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1))); } else { shader_object_.push_back(type.element_count | (type.struct_member_count << 16)); @@ -2692,8 +2656,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { shader_object_.push_back(constant_name_offset_float); shader_object_.push_back(0); - shader_object_.push_back(std::max(float_constant_count, 1u) * 4 * - sizeof(float)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1)) * 4 * + sizeof(float)); shader_object_.push_back(kDxbcRdefVariableFlagUsed); shader_object_.push_back(types_offset + uint32_t(RdefTypeIndex::kFloat4ConstantArray) * @@ -2795,8 +2760,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { shader_object_.push_back(cbuffer_name_offset_float); shader_object_.push_back(1); shader_object_.push_back(constant_offset_float); - shader_object_.push_back(std::max(float_constant_count, 1u) * 4 * - sizeof(float)); + shader_object_.push_back( + std::max(constant_register_map().float_count, uint32_t(1)) * 4 * + sizeof(float)); shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer)); shader_object_.push_back(0); } else if (i == cbuffer_index_bool_loop_constants_) { @@ -3646,15 +3612,10 @@ void DxbcShaderTranslator::WriteShaderCode() { // Constant buffers, from most frequently accessed to least frequently accessed // (the order is a hint to the driver according to the DXBC header).
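// (A dynamically addressed constant buffer must be declared with the dynamicIndexed access pattern in dcl_constantBuffer - immediateIndexed presumably lets the driver assume that only immediately specified offsets are read - which is why float_dynamic_addressing has to be known before the declaration below is emitted.)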
if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) { - uint32_t float_constant_count = 0; - for (uint32_t i = 0; i < 4; ++i) { - float_constant_count += - xe::bit_count(constant_register_map().float_bitmap[i]); - } shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) | ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN( - float_constants_dynamic_indexed_ + constant_register_map().float_dynamic_addressing ? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED : D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); @@ -3663,7 +3624,7 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(cbuffer_index_float_constants_); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants)); - shader_object_.push_back(float_constant_count); + shader_object_.push_back(constant_register_map().float_count); shader_object_.push_back(0); } if (cbuffer_index_system_constants_ != kCbufferIndexUnallocated) { diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index cb23fa511..3fff2c561 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -857,10 +857,10 @@ class DxbcShaderTranslator : public ShaderTranslator { return 0b0000; } } - DxbcDest Mask(uint32_t write_mask) const { + [[nodiscard]] DxbcDest Mask(uint32_t write_mask) const { return DxbcDest(type_, write_mask, index_1d_, index_2d_, index_3d_); } - DxbcDest MaskMasked(uint32_t write_mask) const { + [[nodiscard]] DxbcDest MaskMasked(uint32_t write_mask) const { return DxbcDest(type_, write_mask_ & write_mask, index_1d_, index_2d_, index_3d_); } @@ -991,26 +991,28 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcSrc(DxbcOperandType::kInputCoverageMask, kXXXX); } - DxbcSrc WithModifiers(bool absolute, bool negate) const { + [[nodiscard]] DxbcSrc WithModifiers(bool absolute, bool negate) const { DxbcSrc new_src(*this); new_src.absolute_ = absolute; new_src.negate_ = negate; return new_src; } - DxbcSrc WithAbs(bool absolute) const { + [[nodiscard]] DxbcSrc WithAbs(bool absolute) const { return WithModifiers(absolute, negate_); } - DxbcSrc WithNeg(bool negate) const { + [[nodiscard]] DxbcSrc WithNeg(bool negate) const { return WithModifiers(absolute_, negate); } - DxbcSrc Abs() const { return WithModifiers(true, false); } - DxbcSrc operator-() const { return WithModifiers(absolute_, !negate_); } - DxbcSrc Swizzle(uint32_t swizzle) const { + [[nodiscard]] DxbcSrc Abs() const { return WithModifiers(true, false); } + [[nodiscard]] DxbcSrc operator-() const { + return WithModifiers(absolute_, !negate_); + } + [[nodiscard]] DxbcSrc Swizzle(uint32_t swizzle) const { DxbcSrc new_src(*this); new_src.swizzle_ = swizzle; return new_src; } - DxbcSrc SwizzleSwizzled(uint32_t swizzle) const { + [[nodiscard]] DxbcSrc SwizzleSwizzled(uint32_t swizzle) const { DxbcSrc new_src(*this); new_src.swizzle_ = 0; for (uint32_t i = 0; i < 4; ++i) { @@ -1019,12 +1021,12 @@ class DxbcShaderTranslator : public ShaderTranslator { } return new_src; } - DxbcSrc Select(uint32_t component) const { + [[nodiscard]] DxbcSrc Select(uint32_t component) const { DxbcSrc new_src(*this); new_src.swizzle_ = component * 0b01010101; return new_src; } - DxbcSrc SelectFromSwizzled(uint32_t component) const { + [[nodiscard]] DxbcSrc SelectFromSwizzled(uint32_t component) const { DxbcSrc new_src(*this); new_src.swizzle_ = 
((swizzle_ >> (component * 2)) & 3) * 0b01010101; return new_src; } @@ -2026,6 +2028,7 @@ class DxbcShaderTranslator : public ShaderTranslator { void EmitInstructionDisassembly(); // Abstract 4-component vector source operand. + // TODO(Triang3l): Remove after fully moving to the new emitter. struct DxbcSourceOperand { enum class Type { // GPR number in the index - used only when GPRs are not dynamically @@ -2064,18 +2067,22 @@ class DxbcShaderTranslator : public ShaderTranslator { }; // Each Load must be followed by Unload, otherwise there may be a temporary // register leak. + // TODO(Triang3l): Remove after fully moving to the new emitter. void LoadDxbcSourceOperand(const InstructionOperand& operand, DxbcSourceOperand& dxbc_operand); // Number of tokens this operand adds to the instruction length when used. + // TODO(Triang3l): Remove after fully moving to the new emitter. uint32_t DxbcSourceOperandLength(const DxbcSourceOperand& operand, bool negate = false, bool absolute = false) const; // Writes the operand access tokens to the instruction (either for a scalar if // select_component is <= 3, or for a vector). + // TODO(Triang3l): Remove after fully moving to the new emitter. void UseDxbcSourceOperand(const DxbcSourceOperand& operand, uint32_t additional_swizzle = kSwizzleXYZW, uint32_t select_component = 4, bool negate = false, bool absolute = false); + // TODO(Triang3l): Remove after fully moving to the new emitter. void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand); // Writes xyzw or xxxx of the specified r# to the destination. @@ -2258,15 +2265,6 @@ class DxbcShaderTranslator : public ShaderTranslator { // the remaining ones can be marked as unused in RDEF. uint64_t system_constants_used_; - // Whether constants are dynamically indexed and need to be marked as such in - // dcl_constantBuffer. - bool float_constants_dynamic_indexed_; - - // Offsets of float constant indices in shader_code_, for remapping in - // CompleteTranslation (initially, at these offsets, guest float constant - // indices are written). - std::vector<uint32_t> float_constant_index_offsets_; - // Whether InOutRegister::kDSInControlPointIndex has been used in the shader.
bool in_control_point_index_used_; diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index fecdf6fdf..6b253dd2e 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -23,7 +23,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( replicate_result_x = false; predicate_written = false; - if (!instr.has_vector_op) { + if (!instr.vector_and_constant_result.GetUsedWriteMask() && + !AluVectorOpHasSideEffects(instr.vector_opcode)) { return false; } @@ -32,7 +33,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( if (instr.vector_opcode == AluVectorOpcode::kCube) { operand_count = 1; } else { - operand_count = uint32_t(instr.vector_operand_count); + operand_count = instr.vector_operand_count; } DxbcSourceOperand dxbc_operands[3]; // Whether the operand is the same as any previous operand, and thus is loaded @@ -42,7 +43,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( for (uint32_t i = 0; i < operand_count; ++i) { const InstructionOperand& operand = instr.vector_operands[i]; for (uint32_t j = 0; j < i; ++j) { - if (operand == instr.vector_operands[j]) { + if (operand.GetIdenticalComponents(instr.vector_operands[j]) == 0b1111) { operands_duplicate[i] = true; dxbc_operands[i] = dxbc_operands[j]; break; @@ -117,7 +118,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[1]); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) != 0b1111) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0), // flushing denormals (must be done using eq - doing bitwise comparison // doesn't flush denormals). @@ -281,7 +283,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[2]); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) != 0b1111) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). // If any operand is zero or denormalized, just leave the addition part. uint32_t is_subnormal_temp = PushSystemTemp(); @@ -388,7 +391,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( case AluVectorOpcode::kDp4: case AluVectorOpcode::kDp3: case AluVectorOpcode::kDp2Add: { - if (instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) == 0b1111) { // The operands are the same when calculating vector length, no need to // emulate 0 * anything = 0 in this case. shader_code_.push_back( @@ -1092,7 +1096,9 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) { + if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) & + 0b0010)) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). // This is an attenuation calculation function, so infinity is probably // not very unlikely.
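// (Concretely: under IEEE arithmetic 0 * INF is NaN rather than the 0 the guest expects, so without this fixup an infinite operand could leak NaN into the result.)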
@@ -1294,7 +1300,8 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( const ParsedAluInstruction& instr, bool& predicate_written) { predicate_written = false; - if (!instr.has_scalar_op) { + if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !instr.scalar_result.GetUsedWriteMask()) { return false; } @@ -1306,7 +1313,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( for (uint32_t i = 0; i < uint32_t(instr.scalar_operand_count); ++i) { const InstructionOperand& operand = instr.scalar_operands[i]; for (uint32_t j = 0; j < i; ++j) { - if (operand == instr.scalar_operands[j]) { + if (operand.GetIdenticalComponents(instr.scalar_operands[j]) == 0b1111) { operands_duplicate[i] = true; dxbc_operands[i] = dxbc_operands[j]; break; @@ -2303,7 +2310,9 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 0); ++stat_.instruction_count; ++stat_.float_instruction_count; - if (!instr.scalar_operands[0].EqualsAbsolute(instr.scalar_operands[1])) { + if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + instr.scalar_operands[1]) & + 0b0001)) { // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). uint32_t is_subnormal_temp = PushSystemTemp(); // Get the non-NaN multiplicand closer to zero to check if any of them @@ -2421,7 +2430,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation( void DxbcShaderTranslator::ProcessAluInstruction( const ParsedAluInstruction& instr) { - if (instr.is_nop()) { + if (instr.IsNop()) { return; } @@ -2445,7 +2454,8 @@ void DxbcShaderTranslator::ProcessAluInstruction( ProcessScalarAluOperation(instr, predicate_written_scalar); if (store_vector) { - StoreResult(instr.vector_result, system_temp_pv_, replicate_vector_x, + StoreResult(instr.vector_and_constant_result, system_temp_pv_, + replicate_vector_x, instr.GetMemExportStreamConstant() != UINT32_MAX); } if (store_scalar) { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index b1502ce1e..25941c4bd 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -10,10 +10,12 @@ #ifndef XENIA_GPU_SHADER_H_ #define XENIA_GPU_SHADER_H_ +#include <algorithm> #include <memory> #include <string> #include <vector> +#include "xenia/base/math.h" #include "xenia/base/string_buffer.h" #include "xenia/gpu/ucode.h" #include "xenia/gpu/xenos.h" @@ -21,23 +23,32 @@ namespace xe { namespace gpu { +// The structures here are used for both translation and disassembly. +// +// Because disassembly uses them too, to make sure the "assemble -> disassemble +// -> reassemble" round trip is always successful with the XNA assembler (as it +// is the accuracy benchmark for translation), only generalization - not +// optimization like nop skipping/replacement - must be done while converting +// microcode to these structures (in other words, parsed shader code should be +// enough to accurately reconstruct the microcode for any shader that could be +// written by a human in assembly). +// +// During the "parsed -> host" part of the translation, however, translators +// are free to make any optimizations they find appropriate (as long as they +// don't affect the result, of course). + enum class InstructionStorageTarget { // Result is not stored. kNone, // Result is stored to a temporary register indexed by storage_index [0-31]. kRegister, - // Result is stored into a vertex shader interpolant export [0-15]. - kInterpolant, + // Result is stored into a vertex shader interpolator export [0-15]. + kInterpolator, // Result is stored to the position export (gl_Position).
kPosition, - // Result is stored to the vertex shader misc export register. - // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG, - // USE_VTX_KILL_FLAG). - // X - PSIZE (gl_PointSize). - // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point - // drawing. - // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set - // for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition. + // Result is stored to the vertex shader misc export register, see + // ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex for description of + // components. kPointSizeEdgeFlagKillVertex, // Result is stored as memexport destination address // (see xenos::xe_gpu_memexport_stream_t). @@ -45,11 +56,29 @@ enum class InstructionStorageTarget { // Result is stored to memexport destination data. kExportData, // Result is stored to a color target export indexed by storage_index [0-3]. - kColorTarget, - // Result is stored to the depth export (gl_FragDepth). + kColor, + // X of the result is stored to the depth export (gl_FragDepth). kDepth, }; +// Must be used only in translation to skip unused components, but not in +// disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both +// skipped components and zeros, which cannot be encoded, and therefore it will +// not be assembled). +constexpr uint32_t GetInstructionStorageTargetUsedComponents( + InstructionStorageTarget target) { + switch (target) { + case InstructionStorageTarget::kNone: + return 0b0000; + case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: + return 0b0111; + case InstructionStorageTarget::kDepth: + return 0b0001; + default: + return 0b1111; + } +} enum class InstructionStorageAddressingMode { // The storage index is not dynamically addressed. kStatic, @@ -75,71 +104,63 @@ enum class SwizzleSource { k1, }; -constexpr SwizzleSource GetSwizzleFromComponentIndex(int i) { +constexpr SwizzleSource GetSwizzleFromComponentIndex(uint32_t i) { return static_cast<SwizzleSource>(i); } -inline char GetCharForComponentIndex(int i) { +inline char GetCharForComponentIndex(uint32_t i) { const static char kChars[] = {'x', 'y', 'z', 'w'}; return kChars[i]; } inline char GetCharForSwizzle(SwizzleSource swizzle_source) { const static char kChars[] = {'x', 'y', 'z', 'w', '0', '1'}; - return kChars[static_cast<int>(swizzle_source)]; + return kChars[static_cast<uint32_t>(swizzle_source)]; } struct InstructionResult { // Where the result is going. InstructionStorageTarget storage_target = InstructionStorageTarget::kNone; // Index into the storage_target, if it is indexed. - int storage_index = 0; + uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = InstructionStorageAddressingMode::kStatic; - // True if the result is exporting from the shader. - bool is_export = false; // True to clamp the result value to [0-1]. bool is_clamped = false; - // Defines whether each output component is written. - bool write_mask[4] = {false, false, false, false}; + // Defines whether each output component is written, though this is from the + // original microcode, not taking into account whether such components + // actually exist in the target. + uint32_t original_write_mask = 0b0000; // Defines the source for each output component xyzw. SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY, SwizzleSource::kZ, SwizzleSource::kW}; - // Returns true if any component is written to.
- bool has_any_writes() const { - return write_mask[0] || write_mask[1] || write_mask[2] || write_mask[3]; - } - // Returns true if all components are written to. - bool has_all_writes() const { - return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3]; - } - // Returns number of components written - uint32_t num_writes() const { - uint32_t total = 0; - for (int i = 0; i < 4; i++) { - if (write_mask[i]) { - total++; - } - } - - return total; - } - // Returns true if any non-constant components are written. - bool stores_non_constants() const { - for (int i = 0; i < 4; ++i) { - if (write_mask[i] && components[i] != SwizzleSource::k0 && - components[i] != SwizzleSource::k1) { - return true; - } - } - return false; + // Returns the write mask containing only components actually present in the + // target. + uint32_t GetUsedWriteMask() const { + return original_write_mask & + GetInstructionStorageTargetUsedComponents(storage_target); } // True if the components are in their 'standard' swizzle arrangement (xyzw). - bool is_standard_swizzle() const { - return has_all_writes() && components[0] == SwizzleSource::kX && + bool IsStandardSwizzle() const { + return (GetUsedWriteMask() == 0b1111) && + components[0] == SwizzleSource::kX && components[1] == SwizzleSource::kY && components[2] == SwizzleSource::kZ && components[3] == SwizzleSource::kW; } + // Returns the components of the result, before swizzling, that won't be + // discarded or replaced with a constant. + uint32_t GetUsedResultComponents() const { + uint32_t used_write_mask = GetUsedWriteMask(); + uint32_t used_components = 0b0000; + for (uint32_t i = 0; i < 4; ++i) { + if ((used_write_mask & (1 << i)) && components[i] >= SwizzleSource::kX && + components[i] <= SwizzleSource::kW) { + used_components |= + 1 << (uint32_t(components[i]) - uint32_t(SwizzleSource::kX)); + } + } + return used_components; + } }; enum class InstructionStorageSource { @@ -159,7 +180,7 @@ struct InstructionOperand { // Where the source comes from. InstructionStorageSource storage_source = InstructionStorageSource::kRegister; // Index into the storage_target, if it is indexed. - int storage_index = 0; + uint32_t storage_index = 0; // How the storage index is dynamically addressed, if it is. InstructionStorageAddressingMode storage_addressing_mode = InstructionStorageAddressingMode::kStatic; @@ -168,13 +189,19 @@ struct InstructionOperand { // True to take the absolute value of the source (before any negation). bool is_absolute_value = false; // Number of components taken from the source operand. - int component_count = 0; + uint32_t component_count = 4; // Defines the source for each component xyzw (up to the given // component_count). SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY, SwizzleSource::kZ, SwizzleSource::kW}; + // Returns the swizzle source for the component, replicating the rightmost + // component if there are less than 4 components (similar to what the Xbox 360 + // shader compiler does as a general rule for unspecified components). + SwizzleSource GetComponent(uint32_t index) const { + return components[std::min(index, component_count - 1)]; + } // True if the components are in their 'standard' swizzle arrangement (xyzw). 
- bool is_standard_swizzle() const { + bool IsStandardSwizzle() const { switch (component_count) { case 4: return components[0] == SwizzleSource::kX && @@ -185,26 +212,32 @@ return false; } - // Whether absolute values of two operands are identical (useful for emulating - // Shader Model 3 0*anything=0 multiplication behavior). - bool EqualsAbsolute(const InstructionOperand& other) const { + // Returns which components of two operands are identical, but may have + // different signs (for simplicity of usage with GetComponent, treating the + // rightmost component as replicated). + uint32_t GetAbsoluteIdenticalComponents( + const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode || - component_count != other.component_count) { - return false; + storage_addressing_mode != other.storage_addressing_mode) { + return 0; } - for (int i = 0; i < component_count; ++i) { - if (components[i] != other.components[i]) { - return false; - } + uint32_t identical_components = 0; + for (uint32_t i = 0; i < 4; ++i) { + identical_components |= uint32_t(GetComponent(i) == other.GetComponent(i)) + << i; } - return true; + return identical_components; } - - bool operator==(const InstructionOperand& other) const { - return EqualsAbsolute(other) && is_negated == other.is_negated && - is_absolute_value == other.is_absolute_value; + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). + uint32_t GetIdenticalComponents(const InstructionOperand& other) const { + if (is_negated != other.is_negated || + is_absolute_value != other.is_absolute_value) { + return 0; + } + return GetAbsoluteIdenticalComponents(other); } }; @@ -365,9 +398,6 @@ struct ParsedAllocInstruction { }; struct ParsedVertexFetchInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - // Opcode for the instruction. ucode::FetchOpcode opcode; // Friendly name of the instruction. @@ -409,9 +439,6 @@ struct ParsedTextureFetchInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - // Opcode for the instruction. ucode::FetchOpcode opcode; // Friendly name of the instruction. @@ -462,17 +489,6 @@ struct ParsedAluInstruction { - // Index into the ucode dword source. - uint32_t dword_index = 0; - - // True if the vector part of the instruction needs to be executed and data - // about it in this structure is valid. - bool has_vector_op = false; - // True if the scalar part of the instruction needs to be executed and data - // about it in this structure is valid. - bool has_scalar_op = false; - bool is_nop() const { return !has_vector_op && !has_scalar_op; } - // Opcode for the vector part of the instruction. ucode::AluVectorOpcode vector_opcode = ucode::AluVectorOpcode::kAdd; // Opcode for the scalar part of the instruction. @@ -488,8 +504,20 @@ struct ParsedAluInstruction { // Expected predication condition value if predicated. bool predicate_condition = false; - // Describes how the vector operation result is stored. - InstructionResult vector_result; + // Describes how the vector operation result and, for exports, constant 0/1 + // are stored.
For simplicity of translation and disassembly, constant 0/1 + // writes are treated as a part of the vector operation - they need to be + // expressed somehow in the disassembly anyway with a properly disassembled + // instruction even if only constants are being exported. The XNA disassembler + // falls back to displaying the whole vector operation, even if only constant + // components are written, if the scalar operation is a nop or if the vector + // operation has side effects (but if the scalar operation isn't a nop, it + // outputs the entire constant mask in the scalar operation destination). + // Normally the XNA disassembler outputs the constant mask in both vector and + // scalar operations, but that's not required by the assembler, so it doesn't + // really matter whether it's specified in the vector operation, in the scalar + // operation, or in both. + InstructionResult vector_and_constant_result; // Describes how the scalar operation result is stored. InstructionResult scalar_result; // Both operations must be executed before any result is stored if vector and @@ -499,27 +527,109 @@ struct ParsedAluInstruction { // operations. // Number of source operands of the vector operation. - size_t vector_operand_count = 0; + uint32_t vector_operand_count = 0; // Describes each source operand of the vector operation. InstructionOperand vector_operands[3]; // Number of source operands of the scalar operation. - size_t scalar_operand_count = 0; + uint32_t scalar_operand_count = 0; // Describes each source operand of the scalar operation. InstructionOperand scalar_operands[2]; - // If this is a valid eA write (MAD with a stream constant), returns the index - // of the stream float constant, otherwise returns UINT32_MAX. + // Whether the vector part of the instruction is the same as if it was omitted + // in the assembly (if compiled or assembled with the Xbox 360 shader + // compiler), and thus reassembling the shader with this instruction omitted + // will result in the same microcode (since instructions with just an empty + // write mask may have different values in other fields). + // This is for disassembly! Translators should use the write masks and + // AluVectorOpHasSideEffects to skip operations, as this only covers one very + // specific nop format!
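+ // (For reference, the pattern the checks below match - the compiler's default vector nop - is kMax of r0 with itself written back to r0 with an empty write mask, roughly `max r0._, r0, r0` in assembly.)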
+ bool IsVectorOpDefaultNop() const { + if (vector_opcode != ucode::AluVectorOpcode::kMax || + vector_and_constant_result.original_write_mask || + vector_and_constant_result.is_clamped || + vector_operands[0].storage_source != + InstructionStorageSource::kRegister || + vector_operands[0].storage_index != 0 || + vector_operands[0].storage_addressing_mode != + InstructionStorageAddressingMode::kStatic || + vector_operands[0].is_negated || vector_operands[0].is_absolute_value || + !vector_operands[0].IsStandardSwizzle() || + vector_operands[1].storage_source != + InstructionStorageSource::kRegister || + vector_operands[1].storage_index != 0 || + vector_operands[1].storage_addressing_mode != + InstructionStorageAddressingMode::kStatic || + vector_operands[1].is_negated || vector_operands[1].is_absolute_value || + !vector_operands[1].IsStandardSwizzle()) { + return false; + } + if (vector_and_constant_result.storage_target == + InstructionStorageTarget::kRegister) { + if (vector_and_constant_result.storage_index != 0 || + vector_and_constant_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + return false; + } + } else { + // In case both vector and scalar operations are nop, still need to write + // somewhere that it's an export, not mov r0._, r0 + retain_prev r0._. + // Accurate round trip is possible only if the target is o0 or oC0, + // because if the total write mask is empty, the XNA assembler forces the + // destination to be o0/oC0, but this doesn't really matter in this case. + if (IsScalarOpDefaultNop()) { + return false; + } + } + return true; + } + + // Whether the scalar part of the instruction is the same as if it was omitted + // in the assembly (if compiled or assembled with the Xbox 360 shader + // compiler), and thus reassembling the shader with this instruction omitted + // will result in the same microcode (since instructions with just an empty + // write mask may have different values in other fields). + bool IsScalarOpDefaultNop() const { + if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev || + scalar_result.original_write_mask || scalar_result.is_clamped) { + return false; + } + if (scalar_result.storage_target == InstructionStorageTarget::kRegister) { + if (scalar_result.storage_index != 0 || + scalar_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + return false; + } + } + // For exports, if both are nop, the vector operation will be kept to state + // in the microcode that the destination in the microcode is an export. + return true; + } + + // For translation (not disassembly) - whether this instruction has totally no + // effect. + bool IsNop() const { + return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !scalar_result.GetUsedWriteMask() && + !vector_and_constant_result.GetUsedWriteMask() && + !ucode::AluVectorOpHasSideEffects(vector_opcode); + } + + // If this is a "normal" eA write recognized by Xenia (MAD with a stream + // constant), returns the index of the stream float constant, otherwise + // returns UINT32_MAX. 
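+ // (Illustrative, with hypothetical register numbers: for `mad eA, r3, const0100, c10` this returns 10.)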
uint32_t GetMemExportStreamConstant() const { - if (has_vector_op && - vector_result.storage_target == + if (vector_and_constant_result.storage_target == InstructionStorageTarget::kExportAddress && vector_opcode == ucode::AluVectorOpcode::kMad && - vector_result.has_all_writes() && + vector_and_constant_result.GetUsedResultComponents() == 0b1111 && + !vector_and_constant_result.is_clamped && vector_operands[2].storage_source == InstructionStorageSource::kConstantFloat && vector_operands[2].storage_addressing_mode == InstructionStorageAddressingMode::kStatic && - vector_operands[2].is_standard_swizzle()) { + vector_operands[2].IsStandardSwizzle() && + !vector_operands[2].is_negated && + !vector_operands[2].is_absolute_value) { return vector_operands[2].storage_index; } return UINT32_MAX; @@ -581,9 +691,8 @@ class Shader { struct ConstantRegisterMap { // Bitmap of all kConstantFloat registers read by the shader. // Any shader can only read up to 256 of the 512, and the base is dependent - // on the shader type. Each bit corresponds to a storage index from the type - // base, so bit 0 in a vertex shader is register 0, and bit 0 in a fragment - // shader is register 256. + // on the shader type and SQ_VS/PS_CONST registers. Each bit corresponds to + // a storage index from the type base. uint64_t float_bitmap[256 / 64]; // Bitmap of all loop constants read by the shader. // Each bit corresponds to a storage index [0-31]. @@ -595,8 +704,33 @@ class Shader { // Total number of kConstantFloat registers read by the shader. uint32_t float_count; - // Computed byte count of all registers required when packed. - uint32_t packed_byte_length; + // Whether kConstantFloat registers are indexed dynamically - in this case, + // float_bitmap must be set to all 1, and tight packing must not be done. + bool float_dynamic_addressing; + + // Returns the index of the float4 constant as if all float4 constant + // registers actually referenced were tightly packed in a buffer, or + // UINT32_MAX if not found. + uint32_t GetPackedFloatConstantIndex(uint32_t float_constant) const { + if (float_constant >= 256) { + return UINT32_MAX; + } + if (float_dynamic_addressing) { + // Any can potentially be read - not packing. + return float_constant; + } + uint32_t block_index = float_constant / 64; + uint32_t bit_index = float_constant % 64; + if (!(float_bitmap[block_index] & (uint64_t(1) << bit_index))) { + return UINT32_MAX; + } + uint32_t offset = 0; + for (uint32_t i = 0; i < block_index; ++i) { + offset += xe::bit_count(float_bitmap[i]); + } + return offset + xe::bit_count(float_bitmap[block_index] & + ((uint64_t(1) << bit_index) - 1)); + } }; Shader(ShaderType shader_type, uint64_t ucode_data_hash, @@ -642,7 +776,9 @@ class Shader { } // Returns true if the shader writes to the given color target index [0-3]. - bool writes_color_target(int i) const { return writes_color_targets_[i]; } + bool writes_color_target(uint32_t i) const { + return writes_color_targets_[i]; + } // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index bce8af1ab..d8efbc4d0 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -131,9 +131,8 @@ bool ShaderTranslator::TranslateInternal( ucode_dwords_ = shader->ucode_dwords(); ucode_dword_count_ = shader->ucode_dword_count(); - // Run through and gather all binding information and to check whether - // registers are dynamically addressed.
- // Translators may need this before they start codegen. + // Run through and gather all binding, operand addressing and export + // information. Translators may need this before they start codegen. uint32_t max_cf_dword_index = static_cast<uint32_t>(ucode_dword_count_); for (uint32_t i = 0; i < max_cf_dword_index; i += 3) { ControlFlowInstruction cf_a; @@ -151,10 +150,27 @@ bool ShaderTranslator::TranslateInternal( GatherInstructionInformation(cf_a); GatherInstructionInformation(cf_b); } + + if (constant_register_map_.float_dynamic_addressing) { + // All potentially can be referenced. + constant_register_map_.float_count = 256; + memset(constant_register_map_.float_bitmap, UINT8_MAX, + sizeof(constant_register_map_.float_bitmap)); + } else { + constant_register_map_.float_count = 0; + for (int i = 0; i < 4; ++i) { + // Each bit indicates a vec4 (4 floats). + constant_register_map_.float_count += + xe::bit_count(constant_register_map_.float_bitmap[i]); + } + } + // Cleanup invalid/unneeded memexport allocs. for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!memexport_eM_written_[i]) { - memexport_eA_written_ &= ~(1u << i); + if (!(memexport_eA_written_ & (uint32_t(1) << i))) { + memexport_eM_written_[i] = 0; + } else if (!memexport_eM_written_[i]) { + memexport_eA_written_ &= ~(uint32_t(1) << i); } } if (memexport_eA_written_ == 0) { @@ -171,27 +187,6 @@ bool ShaderTranslator::TranslateInternal( TranslateBlocks(); - // Compute total number of float registers and total bytes used by the - // register map. This saves us work later when we need to pack them. - constant_register_map_.packed_byte_length = 0; - constant_register_map_.float_count = 0; - for (int i = 0; i < 4; ++i) { - // Each bit indicates a vec4 (4 floats). - constant_register_map_.float_count += - xe::bit_count(constant_register_map_.float_bitmap[i]); - } - constant_register_map_.packed_byte_length += - 4 * 4 * constant_register_map_.float_count; - // Each bit indicates a single word. - constant_register_map_.packed_byte_length += - 4 * xe::bit_count(constant_register_map_.loop_bitmap); - // Direct map between words and words we upload.
- for (int i = 0; i < 8; ++i) { - if (constant_register_map_.bool_bitmap[i]) { - constant_register_map_.packed_byte_length += 4; - } - } - shader->errors_ = std::move(errors_); shader->translated_binary_ = CompleteTranslation(); shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string(); @@ -267,6 +262,43 @@ void ShaderTranslator::EmitUnimplementedTranslationError() { void ShaderTranslator::GatherInstructionInformation( const ControlFlowInstruction& cf) { + uint32_t bool_constant_index = UINT32_MAX; + switch (cf.opcode()) { + case ControlFlowOpcode::kCondExec: + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredClean: + case ControlFlowOpcode::kCondExecPredCleanEnd: + bool_constant_index = cf.cond_exec.bool_address(); + break; + case ControlFlowOpcode::kCondCall: + if (!cf.cond_call.is_unconditional() && !cf.cond_call.is_predicated()) { + bool_constant_index = cf.cond_call.bool_address(); + } + break; + case ControlFlowOpcode::kCondJmp: + if (!cf.cond_jmp.is_unconditional() && !cf.cond_jmp.is_predicated()) { + bool_constant_index = cf.cond_jmp.bool_address(); + } + break; + case ControlFlowOpcode::kLoopStart: + constant_register_map_.loop_bitmap |= uint32_t(1) + << cf.loop_start.loop_id(); + break; + case ControlFlowOpcode::kLoopEnd: + constant_register_map_.loop_bitmap |= uint32_t(1) + << cf.loop_end.loop_id(); + break; + case ControlFlowOpcode::kAlloc: + if (cf.alloc.alloc_type() == AllocType::kMemory) { + ++memexport_alloc_count_; + } + break; + } + if (bool_constant_index != UINT32_MAX) { + constant_register_map_.bool_bitmap[bool_constant_index / 32] |= + uint32_t(1) << (bool_constant_index % 32); + } + switch (cf.opcode()) { case ControlFlowOpcode::kExec: case ControlFlowOpcode::kExecEnd: @@ -296,99 +328,128 @@ void ShaderTranslator::GatherInstructionInformation( ucode_dwords_ + instr_offset * 3)); } } else { - // Gather up color targets written to, and check if using dynamic - // register indices. + // Gather info needed for the translation pass because having such + // state changed in the middle of translation may break things. Check + // the comments for each specific variable set here to see usage + // restrictions that can be assumed here (such as only marking exports + // as written if the used write mask is non-empty). 
auto& op = *reinterpret_cast<const AluInstruction*>(ucode_dwords_ + instr_offset * 3); - if (op.has_vector_op()) { - const auto& opcode_info = - alu_vector_opcode_infos_[static_cast<int>(op.vector_opcode())]; - implicit_early_z_allowed_ &= !opcode_info.disable_implicit_early_z; - for (size_t i = 0; i < opcode_info.argument_count; ++i) { - if (op.src_is_temp(i + 1) && (op.src_reg(i + 1) & 0x40)) { - uses_register_dynamic_addressing_ = true; - } - } - if (op.is_export()) { - if (is_pixel_shader()) { - if (op.vector_dest() <= 3) { - writes_color_targets_[op.vector_dest()] = true; - } else if (op.vector_dest() == 61) { + ParsedAluInstruction instr; + ParseAluInstruction(op, instr); + + const auto& vector_opcode_info = + alu_vector_opcode_infos_[uint32_t(op.vector_opcode())]; + implicit_early_z_allowed_ &= + !vector_opcode_info.disable_implicit_early_z; + const auto& scalar_opcode_info = + alu_scalar_opcode_infos_[uint32_t(op.scalar_opcode())]; + implicit_early_z_allowed_ &= + !scalar_opcode_info.disable_implicit_early_z; + + if (instr.vector_and_constant_result.storage_target != + InstructionStorageTarget::kRegister || + instr.scalar_result.storage_target != + InstructionStorageTarget::kRegister) { + // Export is done to vector_dest of the ucode instruction for both + // vector and scalar operations - no need to check separately. + assert_true(instr.vector_and_constant_result.storage_target == + instr.scalar_result.storage_target && + instr.vector_and_constant_result.storage_index == + instr.scalar_result.storage_index); + if (instr.vector_and_constant_result.GetUsedWriteMask() || + instr.scalar_result.GetUsedWriteMask()) { + InstructionStorageTarget export_target = + instr.vector_and_constant_result.storage_target; + uint32_t export_index = + instr.vector_and_constant_result.storage_index; + switch (export_target) { + case InstructionStorageTarget::kExportAddress: + // Store used memexport constants because CPU code needs + // addresses and sizes, and also whether there have been + // writes to eA and eM# for register allocation in shader + // translator implementations. + // eA is (hopefully) always written to using: + // mad eA, r#, const0100, c# + // (though there are some exceptions, shaders in Halo 3 for + // some reason set eA to zeros, but the swizzle of the + // constant is not .xyzw in this case, and they don't write to + // eM#).
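+ // (const0100 presumably holds (0.0f, 1.0f, 0.0f, 0.0f), so the mad adds only r#.y - the element index - to the stream constant c#.)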
+ if (memexport_alloc_count_ > 0 && + memexport_alloc_count_ <= kMaxMemExports) { + uint32_t memexport_stream_constant = + instr.GetMemExportStreamConstant(); + if (memexport_stream_constant != UINT32_MAX) { + memexport_eA_written_ |= uint32_t(1) + << (memexport_alloc_count_ - 1); + memexport_stream_constants_.insert( + memexport_stream_constant); + } else { + XELOGE( + "ShaderTranslator::GatherInstructionInformation: " + "Couldn't extract memexport stream constant index"); + } + } + break; + case InstructionStorageTarget::kExportData: + if (memexport_alloc_count_ > 0 && + memexport_alloc_count_ <= kMaxMemExports) { + memexport_eM_written_[memexport_alloc_count_ - 1] |= + uint32_t(1) << export_index; + } + break; + case InstructionStorageTarget::kColor: + writes_color_targets_[export_index] = true; + break; + case InstructionStorageTarget::kDepth: writes_depth_ = true; implicit_early_z_allowed_ = false; - } - } - if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports) { - // Store used memexport constants because CPU code needs - // addresses and sizes, and also whether there have been writes - // to eA and eM# for register allocation in shader translator - // implementations. - // eA is (hopefully) always written to using: - // mad eA, r#, const0100, c# - // (though there are some exceptions, shaders in Halo 3 for some - // reason set eA to zeros, but the swizzle of the constant is - // not .xyzw in this case, and they don't write to eM#). - uint32_t memexport_alloc_index = memexport_alloc_count_ - 1; - if (op.vector_dest() == 32 && - op.vector_opcode() == AluVectorOpcode::kMad && - op.vector_write_mask() == 0b1111 && !op.src_is_temp(3) && - op.src_swizzle(3) == 0) { - memexport_eA_written_ |= 1u << memexport_alloc_index; - memexport_stream_constants_.insert(op.src_reg(3)); - } else if (op.vector_dest() >= 33 && op.vector_dest() <= 37) { - if (memexport_eA_written_ & (1u << memexport_alloc_index)) { - memexport_eM_written_[memexport_alloc_index] |= - 1 << (op.vector_dest() - 33); - } - } - } - } else { - if (op.is_vector_dest_relative()) { - uses_register_dynamic_addressing_ = true; + break; + default: + break; } } - } - if (op.has_scalar_op()) { - const auto& opcode_info = - alu_scalar_opcode_infos_[static_cast<int>(op.scalar_opcode())]; - implicit_early_z_allowed_ &= !opcode_info.disable_implicit_early_z; - if (opcode_info.argument_count == 1 && op.src_is_temp(3) && - (op.src_reg(3) & 0x40)) { + } else { + if ((instr.vector_and_constant_result.GetUsedWriteMask() && + instr.vector_and_constant_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) || + (instr.scalar_result.GetUsedWriteMask() && + instr.scalar_result.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic)) { uses_register_dynamic_addressing_ = true; } - if (op.is_export()) { - if (is_pixel_shader()) { - if (op.scalar_dest() <= 3) { - writes_color_targets_[op.scalar_dest()] = true; - } else if (op.scalar_dest() == 61) { - writes_depth_ = true; - implicit_early_z_allowed_ = false; - } - } - if (memexport_alloc_count_ > 0 && - memexport_alloc_count_ <= kMaxMemExports && - op.scalar_dest() >= 33 && op.scalar_dest() <= 37) { - uint32_t memexport_alloc_index = memexport_alloc_count_ - 1; - if (memexport_eA_written_ & (1u << memexport_alloc_index)) { - memexport_eM_written_[memexport_alloc_index] |= - 1 << (op.scalar_dest() - 33); - } - } - } else { - if (op.is_scalar_dest_relative()) { + } + + uint32_t total_operand_count = + instr.vector_operand_count + 
instr.scalar_operand_count; + for (uint32_t i = 0; i < total_operand_count; ++i) { + const InstructionOperand& operand = + (i < instr.vector_operand_count) + ? instr.vector_operands[i] + : instr.scalar_operands[i - instr.vector_operand_count]; + if (operand.storage_source == InstructionStorageSource::kRegister) { + if (operand.storage_addressing_mode != + InstructionStorageAddressingMode::kStatic) { + uses_register_dynamic_addressing_ = true; + } + } else if (operand.storage_source == + InstructionStorageSource::kConstantFloat) { + if (operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + // Store used float constants before translating so the + // translator can use tightly packed indices if not dynamically + // indexed. + uint32_t constant_index = operand.storage_index; + constant_register_map_.float_bitmap[constant_index / 64] |= + uint64_t(1) << (constant_index % 64); + } else { + constant_register_map_.float_dynamic_addressing = true; + } + } + } } } break; - case ControlFlowOpcode::kAlloc: - if (cf.alloc.alloc_type() == AllocType::kMemory) { - ++memexport_alloc_count_; - } - break; default: break; } @@ -674,8 +735,9 @@ void ShaderTranslator::TranslateControlFlowCondExec( i.instruction_count = cf.count(); i.type = ParsedExecInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); switch (cf.opcode()) { case ControlFlowOpcode::kCondExec: @@ -715,7 +777,8 @@ void ShaderTranslator::TranslateControlFlowLoopStart( ParsedLoopStartInstruction i; i.dword_index = cf_index_; i.loop_constant_index = cf.loop_id(); - constant_register_map_.loop_bitmap |= 1 << i.loop_constant_index; + assert_not_zero(constant_register_map_.loop_bitmap & + (uint32_t(1) << i.loop_constant_index)); i.is_repeat = cf.is_repeat(); i.loop_skip_address = cf.address(); @@ -731,7 +794,8 @@ void ShaderTranslator::TranslateControlFlowLoopEnd( i.is_predicated_break = cf.is_predicated_break(); i.predicate_condition = cf.condition(); i.loop_constant_index = cf.loop_id(); - constant_register_map_.loop_bitmap |= 1 << i.loop_constant_index; + assert_not_zero(constant_register_map_.loop_bitmap & + (uint32_t(1) << i.loop_constant_index)); i.loop_body_address = cf.address(); i.Disassemble(&ucode_disasm_buffer_); @@ -752,8 +816,9 @@ void ShaderTranslator::TranslateControlFlowCondCall( } else { i.type = ParsedCallInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); } @@ -785,8 +850,9 @@ void ShaderTranslator::TranslateControlFlowCondJmp( } else { i.type = ParsedJumpInstruction::Type::kConditional; i.bool_constant_index = cf.bool_address(); - constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |= - 1 << (i.bool_constant_index % 32); + assert_not_zero( + constant_register_map_.bool_bitmap[i.bool_constant_index / 32] & + (uint32_t(1) << (i.bool_constant_index % 32))); i.condition = cf.condition(); } @@ -852,23 +918,25 @@ void ParseFetchInstructionResult(uint32_t dest, uint32_t swizzle, bool is_relative, InstructionResult* out_result) {
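// Each 3-bit field of the fetch swizzle selects a source component (0-3 = x/y/z/w), writes constant 0 (4 or 6) or constant 1 (5), or, with 7, masks the component out of the write.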
out_result->storage_target = InstructionStorageTarget::kRegister; out_result->storage_index = dest; - out_result->is_export = false; out_result->is_clamped = false; out_result->storage_addressing_mode = is_relative ? InstructionStorageAddressingMode::kAddressRelative : InstructionStorageAddressingMode::kStatic; + out_result->original_write_mask = 0b1111; for (int i = 0; i < 4; ++i) { - out_result->write_mask[i] = true; - if ((swizzle & 0x7) == 4) { - out_result->components[i] = SwizzleSource::k0; - } else if ((swizzle & 0x7) == 5) { - out_result->components[i] = SwizzleSource::k1; - } else if ((swizzle & 0x7) == 6) { - out_result->components[i] = SwizzleSource::k0; - } else if ((swizzle & 0x7) == 7) { - out_result->write_mask[i] = false; - } else { - out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); + switch (swizzle & 0x7) { + case 4: + case 6: + out_result->components[i] = SwizzleSource::k0; + break; + case 5: + out_result->components[i] = SwizzleSource::k1; + break; + case 7: + out_result->original_write_mask &= ~uint32_t(1 << i); + break; + default: + out_result->components[i] = GetSwizzleFromComponentIndex(swizzle & 0x3); } swizzle >>= 3; } @@ -885,7 +953,6 @@ void ShaderTranslator::TranslateVertexFetchInstruction( void ShaderTranslator::ParseVertexFetchInstruction( const VertexFetchInstruction& op, ParsedVertexFetchInstruction* out_instr) { auto& i = *out_instr; - i.dword_index = 0; i.opcode = FetchOpcode::kVertexFetch; i.opcode_name = op.is_mini_fetch() ? "vfetch_mini" : "vfetch_full"; i.is_mini_fetch = op.is_mini_fetch(); @@ -908,7 +975,7 @@ void ShaderTranslator::ParseVertexFetchInstruction( src_op.is_absolute_value = false; src_op.component_count = 1; uint32_t swizzle = full_op.src_swizzle(); - for (int j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { + for (uint32_t j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3); } @@ -947,7 +1014,7 @@ void ShaderTranslator::ParseTextureFetchInstruction( bool has_dest; bool has_const; bool has_attributes; - int override_component_count; + uint32_t override_component_count; } opcode_info; switch (op.opcode()) { case FetchOpcode::kTextureFetch: { @@ -993,7 +1060,6 @@ void ShaderTranslator::ParseTextureFetchInstruction( } auto& i = *out_instr; - i.dword_index = 0; i.opcode = op.opcode(); i.opcode_name = opcode_info.name; i.dimension = op.dimension(); @@ -1020,7 +1086,7 @@ void ShaderTranslator::ParseTextureFetchInstruction( ? 
opcode_info.override_component_count : GetTextureDimensionComponentCount(op.dimension()); uint32_t swizzle = op.src_swizzle(); - for (int j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { + for (uint32_t j = 0; j < src_op.component_count; ++j, swizzle >>= 2) { src_op.components[j] = GetSwizzleFromComponentIndex(swizzle & 0x3); } @@ -1118,7 +1184,7 @@ const ShaderTranslator::AluOpcodeInfo {"setp_ge", 1, 1, false}, // 30 {"setp_inv", 1, 1, false}, // 31 {"setp_pop", 1, 1, false}, // 32 - {"setp_clr", 1, 1, false}, // 33 + {"setp_clr", 0, 0, false}, // 33 {"setp_rstr", 1, 1, false}, // 34 {"kills_eq", 1, 1, true}, // 35 {"kills_gt", 1, 1, true}, // 36 @@ -1135,28 +1201,164 @@ const ShaderTranslator::AluOpcodeInfo {"subsc", 2, 1, false}, // 47 {"sin", 1, 1, false}, // 48 {"cos", 1, 1, false}, // 49 - {"retain_prev", 1, 1, false}, // 50 + {"retain_prev", 0, 0, false}, // 50 }; void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { ParsedAluInstruction instr; - - instr.dword_index = 0; - - instr.is_predicated = op.is_predicated(); - instr.predicate_condition = op.predicate_condition(); - - ParseAluVectorOperation(op, instr); - ParseAluScalarOperation(op, instr); - + ParseAluInstruction(op, instr); instr.Disassemble(&ucode_disasm_buffer_); - ProcessAluInstruction(instr); } -void ParseAluInstructionOperand(const AluInstruction& op, int i, - int swizzle_component_count, - InstructionOperand* out_op) { +void ShaderTranslator::ParseAluInstruction(const AluInstruction& op, + ParsedAluInstruction& instr) const { + instr.is_predicated = op.is_predicated(); + instr.predicate_condition = op.predicate_condition(); + + bool is_export = op.is_export(); + + InstructionStorageTarget storage_target = InstructionStorageTarget::kRegister; + uint32_t storage_index_export = 0; + if (is_export) { + storage_target = InstructionStorageTarget::kNone; + // Both vector and scalar operation export to vector_dest. 
+ ExportRegister export_register = ExportRegister(op.vector_dest()); + if (export_register == ExportRegister::kExportAddress) { + storage_target = InstructionStorageTarget::kExportAddress; + } else if (export_register >= ExportRegister::kExportData0 && + export_register <= ExportRegister::kExportData4) { + storage_target = InstructionStorageTarget::kExportData; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kExportData0); + } else if (is_vertex_shader()) { + if (export_register >= ExportRegister::kVSInterpolator0 && + export_register <= ExportRegister::kVSInterpolator15) { + storage_target = InstructionStorageTarget::kInterpolator; + storage_index_export = uint32_t(export_register) - + uint32_t(ExportRegister::kVSInterpolator0); + } else if (export_register == ExportRegister::kVSPosition) { + storage_target = InstructionStorageTarget::kPosition; + } else if (export_register == + ExportRegister::kVSPointSizeEdgeFlagKillVertex) { + storage_target = InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; + } + } else if (is_pixel_shader()) { + if (export_register >= ExportRegister::kPSColor0 && + export_register <= ExportRegister::kPSColor3) { + storage_target = InstructionStorageTarget::kColor; + storage_index_export = + uint32_t(export_register) - uint32_t(ExportRegister::kPSColor0); + } else if (export_register == ExportRegister::kPSDepth) { + storage_target = InstructionStorageTarget::kDepth; + } + } + if (storage_target == InstructionStorageTarget::kNone) { + assert_always(); + XELOGE( + "ShaderTranslator::ParseAluInstruction: Unsupported write to export " + "{}", + uint32_t(export_register)); + } + } + + // Vector operation and constant 0/1 writes. + + instr.vector_opcode = op.vector_opcode(); + const auto& vector_opcode_info = + alu_vector_opcode_infos_[uint32_t(instr.vector_opcode)]; + instr.vector_opcode_name = vector_opcode_info.name; + + instr.vector_and_constant_result.storage_target = storage_target; + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.vector_and_constant_result.storage_index = storage_index_export; + } else { + instr.vector_and_constant_result.storage_index = op.vector_dest(); + assert_true(op.vector_dest() < register_count()); + if (op.is_vector_dest_relative()) { + instr.vector_and_constant_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.vector_and_constant_result.is_clamped = op.vector_clamp(); + uint32_t constant_0_mask = op.GetConstant0WriteMask(); + uint32_t constant_1_mask = op.GetConstant1WriteMask(); + instr.vector_and_constant_result.original_write_mask = + op.GetVectorOpResultWriteMask() | constant_0_mask | constant_1_mask; + for (uint32_t i = 0; i < 4; ++i) { + SwizzleSource component = GetSwizzleFromComponentIndex(i); + if (constant_0_mask & (1 << i)) { + component = SwizzleSource::k0; + } else if (constant_1_mask & (1 << i)) { + component = SwizzleSource::k1; + } + instr.vector_and_constant_result.components[i] = component; + } + + instr.vector_operand_count = vector_opcode_info.argument_count; + for (uint32_t i = 0; i < instr.vector_operand_count; ++i) { + InstructionOperand& vector_operand = instr.vector_operands[i]; + ParseAluInstructionOperand(op, i + 1, + vector_opcode_info.src_swizzle_component_count, + vector_operand); + } + + // Scalar operation. 
+ + instr.scalar_opcode = op.scalar_opcode(); + const auto& scalar_opcode_info = + alu_scalar_opcode_infos_[uint32_t(instr.scalar_opcode)]; + instr.scalar_opcode_name = scalar_opcode_info.name; + + instr.scalar_result.storage_target = storage_target; + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kStatic; + if (is_export) { + instr.scalar_result.storage_index = storage_index_export; + } else { + instr.scalar_result.storage_index = op.scalar_dest(); + assert_true(op.scalar_dest() < register_count()); + if (op.is_scalar_dest_relative()) { + instr.scalar_result.storage_addressing_mode = + InstructionStorageAddressingMode::kAddressRelative; + } + } + instr.scalar_result.is_clamped = op.scalar_clamp(); + instr.scalar_result.original_write_mask = op.GetScalarOpResultWriteMask(); + for (uint32_t i = 0; i < 4; ++i) { + instr.scalar_result.components[i] = GetSwizzleFromComponentIndex(i); + } + + instr.scalar_operand_count = scalar_opcode_info.argument_count; + if (instr.scalar_operand_count) { + if (instr.scalar_operand_count == 1) { + ParseAluInstructionOperand(op, 3, + scalar_opcode_info.src_swizzle_component_count, + instr.scalar_operands[0]); + } else { + uint32_t src3_swizzle = op.src_swizzle(3); + uint32_t component_a = ((src3_swizzle >> 6) + 3) & 0x3; + uint32_t component_b = ((src3_swizzle >> 0) + 0) & 0x3; + uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | + (static_cast(op.scalar_opcode()) & 1); + int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0; + + ParseAluInstructionOperandSpecial( + op, InstructionStorageSource::kConstantFloat, op.src_reg(3), + op.src_negate(3), 0, component_a, instr.scalar_operands[0]); + + ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, + reg2, op.src_negate(3), const_slot, + component_b, instr.scalar_operands[1]); + } + } +} + +void ShaderTranslator::ParseAluInstructionOperand( + const AluInstruction& op, uint32_t i, uint32_t swizzle_component_count, + InstructionOperand& out_op) { int const_slot = 0; switch (i) { case 2: @@ -1166,393 +1368,80 @@ void ParseAluInstructionOperand(const AluInstruction& op, int i, const_slot = op.src_is_temp(1) && op.src_is_temp(2) ? 0 : 1; break; } - out_op->is_negated = op.src_negate(i); + out_op.is_negated = op.src_negate(i); uint32_t reg = op.src_reg(i); if (op.src_is_temp(i)) { - out_op->storage_source = InstructionStorageSource::kRegister; - out_op->storage_index = reg & 0x1F; - out_op->is_absolute_value = (reg & 0x80) == 0x80; - out_op->storage_addressing_mode = + out_op.storage_source = InstructionStorageSource::kRegister; + out_op.storage_index = reg & 0x1F; + out_op.is_absolute_value = (reg & 0x80) == 0x80; + out_op.storage_addressing_mode = (reg & 0x40) ? 
InstructionStorageAddressingMode::kAddressRelative : InstructionStorageAddressingMode::kStatic; } else { - out_op->storage_source = InstructionStorageSource::kConstantFloat; - out_op->storage_index = reg; + out_op.storage_source = InstructionStorageSource::kConstantFloat; + out_op.storage_index = reg; if ((const_slot == 0 && op.is_const_0_addressed()) || (const_slot == 1 && op.is_const_1_addressed())) { if (op.is_address_relative()) { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressAbsolute; } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressRelative; } } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kStatic; } - out_op->is_absolute_value = op.abs_constants(); + out_op.is_absolute_value = op.abs_constants(); } - out_op->component_count = swizzle_component_count; + out_op.component_count = swizzle_component_count; uint32_t swizzle = op.src_swizzle(i); if (swizzle_component_count == 1) { uint32_t a = ((swizzle >> 6) + 3) & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); + out_op.components[0] = GetSwizzleFromComponentIndex(a); } else if (swizzle_component_count == 2) { uint32_t a = ((swizzle >> 6) + 3) & 0x3; uint32_t b = ((swizzle >> 0) + 0) & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); - out_op->components[1] = GetSwizzleFromComponentIndex(b); + out_op.components[0] = GetSwizzleFromComponentIndex(a); + out_op.components[1] = GetSwizzleFromComponentIndex(b); } else if (swizzle_component_count == 3) { assert_always(); } else if (swizzle_component_count == 4) { - for (int j = 0; j < swizzle_component_count; ++j, swizzle >>= 2) { - out_op->components[j] = GetSwizzleFromComponentIndex((swizzle + j) & 0x3); + for (uint32_t j = 0; j < swizzle_component_count; ++j, swizzle >>= 2) { + out_op.components[j] = GetSwizzleFromComponentIndex((swizzle + j) & 0x3); } } } -void ParseAluInstructionOperandSpecial(const AluInstruction& op, - InstructionStorageSource storage_source, - uint32_t reg, bool negate, - int const_slot, uint32_t swizzle, - InstructionOperand* out_op) { - out_op->is_negated = negate; - out_op->is_absolute_value = op.abs_constants(); - out_op->storage_source = storage_source; +void ShaderTranslator::ParseAluInstructionOperandSpecial( + const AluInstruction& op, InstructionStorageSource storage_source, + uint32_t reg, bool negate, int const_slot, uint32_t component_index, + InstructionOperand& out_op) { + out_op.is_negated = negate; + out_op.is_absolute_value = op.abs_constants(); + out_op.storage_source = storage_source; if (storage_source == InstructionStorageSource::kRegister) { - out_op->storage_index = reg & 0x7F; - out_op->storage_addressing_mode = InstructionStorageAddressingMode::kStatic; + out_op.storage_index = reg & 0x7F; + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kStatic; } else { - out_op->storage_index = reg; + out_op.storage_index = reg; if ((const_slot == 0 && op.is_const_0_addressed()) || (const_slot == 1 && op.is_const_1_addressed())) { if (op.is_address_relative()) { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressAbsolute; } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = InstructionStorageAddressingMode::kAddressRelative; } } else { - out_op->storage_addressing_mode = + out_op.storage_addressing_mode = 
InstructionStorageAddressingMode::kStatic; } } - out_op->component_count = 1; - uint32_t a = swizzle & 0x3; - out_op->components[0] = GetSwizzleFromComponentIndex(a); -} - -void ShaderTranslator::ParseAluVectorOperation(const AluInstruction& op, - ParsedAluInstruction& i) { - i.has_vector_op = op.has_vector_op(); - if (!i.has_vector_op) { - return; - } - i.vector_opcode = op.vector_opcode(); - const auto& opcode_info = - alu_vector_opcode_infos_[static_cast(op.vector_opcode())]; - i.vector_opcode_name = opcode_info.name; - - i.vector_result.is_export = op.is_export(); - i.vector_result.is_clamped = op.vector_clamp(); - i.vector_result.storage_target = InstructionStorageTarget::kRegister; - i.vector_result.storage_index = 0; - uint32_t dest_num = op.vector_dest(); - if (!op.is_export()) { - assert_true(dest_num < 32); - i.vector_result.storage_target = InstructionStorageTarget::kRegister; - i.vector_result.storage_index = dest_num; - i.vector_result.storage_addressing_mode = - op.is_vector_dest_relative() - ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; - } else if (is_vertex_shader()) { - switch (dest_num) { - case 32: - i.vector_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.vector_result.storage_index = dest_num - 33; - i.vector_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 62: - i.vector_result.storage_target = InstructionStorageTarget::kPosition; - break; - case 63: - i.vector_result.storage_target = - InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; - break; - default: - if (dest_num < 16) { - i.vector_result.storage_target = - InstructionStorageTarget::kInterpolant; - i.vector_result.storage_index = dest_num; - } else { - // Unimplemented. - // assert_always(); - XELOGE( - "ShaderTranslator::ParseAluVectorOperation: Unsupported write to " - "export {}", - dest_num); - i.vector_result.storage_target = InstructionStorageTarget::kNone; - i.vector_result.storage_index = 0; - } - break; - } - } else if (is_pixel_shader()) { - switch (dest_num) { - case 0: - case 63: // ? masked? 
- i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 0; - break; - case 1: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 1; - break; - case 2: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 2; - break; - case 3: - i.vector_result.storage_target = InstructionStorageTarget::kColorTarget; - i.vector_result.storage_index = 3; - break; - case 32: - i.vector_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.vector_result.storage_index = dest_num - 33; - i.vector_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 61: - i.vector_result.storage_target = InstructionStorageTarget::kDepth; - break; - default: - XELOGE( - "ShaderTranslator::ParseAluVectorOperation: Unsupported write to " - "export {}", - dest_num); - i.vector_result.storage_target = InstructionStorageTarget::kNone; - i.vector_result.storage_index = 0; - } - } - if (op.is_export()) { - uint32_t write_mask = op.vector_write_mask(); - uint32_t const_1_mask = op.scalar_write_mask(); - if (!write_mask) { - for (int j = 0; j < 4; ++j) { - i.vector_result.write_mask[j] = false; - } - } else { - for (int j = 0; j < 4; ++j, write_mask >>= 1, const_1_mask >>= 1) { - i.vector_result.write_mask[j] = true; - if (write_mask & 0x1) { - if (const_1_mask & 0x1) { - i.vector_result.components[j] = SwizzleSource::k1; - } else { - i.vector_result.components[j] = GetSwizzleFromComponentIndex(j); - } - } else { - if (op.is_scalar_dest_relative()) { - i.vector_result.components[j] = SwizzleSource::k0; - } else { - i.vector_result.write_mask[j] = false; - } - } - } - } - } else { - uint32_t write_mask = op.vector_write_mask(); - for (int j = 0; j < 4; ++j, write_mask >>= 1) { - i.vector_result.write_mask[j] = (write_mask & 0x1) == 0x1; - i.vector_result.components[j] = GetSwizzleFromComponentIndex(j); - } - } - - i.vector_operand_count = opcode_info.argument_count; - for (int j = 0; j < i.vector_operand_count; ++j) { - ParseAluInstructionOperand(op, j + 1, - opcode_info.src_swizzle_component_count, - &i.vector_operands[j]); - - // Track constant float register loads. - if (i.vector_operands[j].storage_source == - InstructionStorageSource::kConstantFloat) { - if (i.vector_operands[j].storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { - // Dynamic addressing makes all constants required. 
- std::memset(constant_register_map_.float_bitmap, 0xFF, - sizeof(constant_register_map_.float_bitmap)); - } else { - auto register_index = i.vector_operands[j].storage_index; - constant_register_map_.float_bitmap[register_index / 64] |= - 1ull << (register_index % 64); - } - } - } -} - -void ShaderTranslator::ParseAluScalarOperation(const AluInstruction& op, - ParsedAluInstruction& i) { - i.has_scalar_op = op.has_scalar_op(); - if (!i.has_scalar_op) { - return; - } - i.scalar_opcode = op.scalar_opcode(); - const auto& opcode_info = - alu_scalar_opcode_infos_[static_cast(op.scalar_opcode())]; - i.scalar_opcode_name = opcode_info.name; - - uint32_t dest_num; - uint32_t write_mask; - if (op.is_export()) { - dest_num = op.vector_dest(); - write_mask = op.scalar_write_mask() & ~op.vector_write_mask(); - } else { - dest_num = op.scalar_dest(); - write_mask = op.scalar_write_mask(); - } - i.scalar_result.is_export = op.is_export(); - i.scalar_result.is_clamped = op.scalar_clamp(); - i.scalar_result.storage_target = InstructionStorageTarget::kRegister; - i.scalar_result.storage_index = 0; - if (!op.is_export()) { - assert_true(dest_num < 32); - i.scalar_result.storage_target = InstructionStorageTarget::kRegister; - i.scalar_result.storage_index = dest_num; - i.scalar_result.storage_addressing_mode = - op.is_scalar_dest_relative() - ? InstructionStorageAddressingMode::kAddressRelative - : InstructionStorageAddressingMode::kStatic; - } else if (is_vertex_shader()) { - switch (dest_num) { - case 32: - i.scalar_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.scalar_result.storage_index = dest_num - 33; - i.scalar_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 62: - i.scalar_result.storage_target = InstructionStorageTarget::kPosition; - break; - case 63: - i.scalar_result.storage_target = - InstructionStorageTarget::kPointSizeEdgeFlagKillVertex; - break; - default: - if (dest_num < 16) { - i.scalar_result.storage_target = - InstructionStorageTarget::kInterpolant; - i.scalar_result.storage_index = dest_num; - } else { - // Unimplemented. - // assert_always(); - XELOGE( - "ShaderTranslator::ParseAluScalarOperation: Unsupported write to " - "export {}", - dest_num); - i.scalar_result.storage_target = InstructionStorageTarget::kNone; - i.scalar_result.storage_index = 0; - } - break; - } - } else if (is_pixel_shader()) { - switch (dest_num) { - case 0: - case 63: // ? masked? 
- i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 0; - break; - case 1: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 1; - break; - case 2: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 2; - break; - case 3: - i.scalar_result.storage_target = InstructionStorageTarget::kColorTarget; - i.scalar_result.storage_index = 3; - break; - case 32: - i.scalar_result.storage_target = - InstructionStorageTarget::kExportAddress; - break; - case 33: - case 34: - case 35: - case 36: - case 37: - i.scalar_result.storage_index = dest_num - 33; - i.scalar_result.storage_target = InstructionStorageTarget::kExportData; - break; - case 61: - i.scalar_result.storage_target = InstructionStorageTarget::kDepth; - break; - } - } - for (int j = 0; j < 4; ++j, write_mask >>= 1) { - i.scalar_result.write_mask[j] = (write_mask & 0x1) == 0x1; - i.scalar_result.components[j] = GetSwizzleFromComponentIndex(j); - } - - i.scalar_operand_count = opcode_info.argument_count; - if (opcode_info.argument_count == 1) { - ParseAluInstructionOperand(op, 3, opcode_info.src_swizzle_component_count, - &i.scalar_operands[0]); - } else { - uint32_t src3_swizzle = op.src_swizzle(3); - uint32_t swiz_a = ((src3_swizzle >> 6) + 3) & 0x3; - uint32_t swiz_b = ((src3_swizzle >> 0) + 0) & 0x3; - uint32_t reg2 = (src3_swizzle & 0x3C) | (op.src_is_temp(3) << 1) | - (static_cast(op.scalar_opcode()) & 1); - - int const_slot = (op.src_is_temp(1) || op.src_is_temp(2)) ? 1 : 0; - - ParseAluInstructionOperandSpecial( - op, InstructionStorageSource::kConstantFloat, op.src_reg(3), - op.src_negate(3), 0, swiz_a, &i.scalar_operands[0]); - - ParseAluInstructionOperandSpecial(op, InstructionStorageSource::kRegister, - reg2, op.src_negate(3), const_slot, - swiz_b, &i.scalar_operands[1]); - } - - // Track constant float register loads - in either case, a float constant may - // be used in operand 0. - if (i.scalar_operands[0].storage_source == - InstructionStorageSource::kConstantFloat) { - auto register_index = i.scalar_operands[0].storage_index; - if (i.scalar_operands[0].storage_addressing_mode != - InstructionStorageAddressingMode::kStatic) { - // Dynamic addressing makes all constants required. - std::memset(constant_register_map_.float_bitmap, 0xFF, - sizeof(constant_register_map_.float_bitmap)); - } else { - constant_register_map_.float_bitmap[register_index / 64] |= - 1ull << (register_index % 64); - } - } + out_op.component_count = 1; + out_op.components[0] = GetSwizzleFromComponentIndex(component_index); } } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index a41253669..300e00b48 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -57,15 +57,19 @@ class ShaderTranslator { } // True if the current shader is a pixel shader. bool is_pixel_shader() const { return shader_type_ == ShaderType::kPixel; } + // Used constant register info, populated before translation. const Shader::ConstantRegisterMap& constant_register_map() const { return constant_register_map_; } // True if the current shader addresses general-purpose registers with dynamic - // indices. + // indices, set before translation. Doesn't include writes to r[#+a#] with an + // empty used write mask. 
bool uses_register_dynamic_addressing() const {
     return uses_register_dynamic_addressing_;
   }
-  // True if the current shader writes to a color target on any execution path.
+  // True if the current shader writes to a color target on any execution path,
+  // set before translation. Doesn't include writes with an empty used write
+  // mask.
   bool writes_color_target(int i) const { return writes_color_targets_[i]; }
   bool writes_any_color_target() const {
     for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
@@ -75,7 +79,8 @@ class ShaderTranslator {
     }
     return false;
   }
-  // True if the current shader overrides the pixel depth.
+  // True if the current shader overrides the pixel depth, set before
+  // translation. Doesn't include writes with an empty used write mask.
   bool writes_depth() const { return writes_depth_; }
   // True if Xenia can automatically enable early depth/stencil for the pixel
   // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
@@ -181,8 +186,8 @@ class ShaderTranslator {
  private:
   struct AluOpcodeInfo {
     const char* name;
-    size_t argument_count;
-    int src_swizzle_component_count;
+    uint32_t argument_count;
+    uint32_t src_swizzle_component_count;
     bool disable_implicit_early_z;
   };
@@ -229,10 +234,16 @@ class ShaderTranslator {
                                    ParsedTextureFetchInstruction* out_instr);
   void TranslateAluInstruction(const ucode::AluInstruction& op);
-  void ParseAluVectorOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
-  void ParseAluScalarOperation(const ucode::AluInstruction& op,
-                               ParsedAluInstruction& instr);
+  void ParseAluInstruction(const ucode::AluInstruction& op,
+                           ParsedAluInstruction& out_instr) const;
+  static void ParseAluInstructionOperand(const ucode::AluInstruction& op,
+                                         uint32_t i,
+                                         uint32_t swizzle_component_count,
+                                         InstructionOperand& out_op);
+  static void ParseAluInstructionOperandSpecial(
+      const ucode::AluInstruction& op, InstructionStorageSource storage_source,
+      uint32_t reg, bool negate, int const_slot, uint32_t component_index,
+      InstructionOperand& out_op);

   // Input shader metadata and microcode.
   ShaderType shader_type_;
@@ -265,12 +276,16 @@ class ShaderTranslator {
   uint32_t unique_vertex_bindings_ = 0;
   uint32_t unique_texture_bindings_ = 0;

+  // All of these are gathered before translation.
+  // For writes, uses_register_dynamic_addressing_, writes_color_targets_ and
+  // writes_depth_ don't include empty used write masks.
   Shader::ConstantRegisterMap constant_register_map_ = {0};
   bool uses_register_dynamic_addressing_ = false;
   bool writes_color_targets_[4] = {false, false, false, false};
   bool writes_depth_ = false;
   bool implicit_early_z_allowed_ = true;

+  // Memexport info is gathered before translation.
   uint32_t memexport_alloc_count_ = 0;
   // For register allocation in implementations - what was used after each
   // `alloc export`.
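The compaction that this pre-translation gathering enables is straightforward: when all float constant accesses are static, a constant's position in the compacted buffer is simply the number of used constants below it in float_bitmap, computable with two population counts. Below is a minimal self-contained sketch of such a lookup; the names PopCount64 and GetPackedFloatConstantIndexSketch are illustrative, not part of the patch.

#include <cstdint>

// Counts set bits in a 64-bit word; a portable stand-in for a library
// popcount such as xe::bit_count.
static uint32_t PopCount64(uint64_t v) {
  uint32_t count = 0;
  while (v) {
    v &= v - 1;  // Clears the lowest set bit.
    ++count;
  }
  return count;
}

// Returns the tightly packed index of float constant c[storage_index] given
// a 256-bit usage bitmap stored as four 64-bit words, or UINT32_MAX if the
// constant was never marked as used during gathering.
static uint32_t GetPackedFloatConstantIndexSketch(
    const uint64_t float_bitmap[4], uint32_t storage_index) {
  uint32_t word_index = storage_index >> 6;
  uint64_t bit = uint64_t(1) << (storage_index & 63);
  if (!(float_bitmap[word_index] & bit)) {
    return UINT32_MAX;
  }
  // The packed index is the number of used constants preceding this one.
  uint32_t packed_index = 0;
  for (uint32_t i = 0; i < word_index; ++i) {
    packed_index += PopCount64(float_bitmap[i]);
  }
  return packed_index + PopCount64(float_bitmap[word_index] & (bit - 1));
}

When float_dynamic_addressing is set instead, any c# may be read at runtime, so no compaction is possible and the full constant range has to be bound.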
diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc index 7a0bdf179..2a9536b26 100644 --- a/src/xenia/gpu/shader_translator_disasm.cc +++ b/src/xenia/gpu/shader_translator_disasm.cc @@ -28,7 +28,7 @@ void DisassembleResultOperand(const InstructionResult& result, out->Append('r'); uses_storage_index = true; break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: out->Append('o'); uses_storage_index = true; break; @@ -45,7 +45,7 @@ void DisassembleResultOperand(const InstructionResult& result, out->Append("eM"); uses_storage_index = true; break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: out->Append("oC"); uses_storage_index = true; break; @@ -68,12 +68,19 @@ void DisassembleResultOperand(const InstructionResult& result, break; } } - if (!result.has_any_writes()) { + // Not using GetUsedWriteMask/IsStandardSwizzle because they filter out + // components not having any runtime effect, but those components are still + // present in the microcode. + if (!result.original_write_mask) { out->Append("._"); - } else if (!result.is_standard_swizzle()) { + } else if (result.original_write_mask != 0b1111 || + result.components[0] != SwizzleSource::kX || + result.components[1] != SwizzleSource::kY || + result.components[2] != SwizzleSource::kZ || + result.components[3] != SwizzleSource::kW) { out->Append('.'); for (int i = 0; i < 4; ++i) { - if (result.write_mask[i]) { + if (result.original_write_mask & (1 << i)) { out->Append(GetCharForSwizzle(result.components[i])); } else { out->Append('_'); @@ -116,7 +123,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) { out->AppendFormat("[{}+aL]", op.storage_index); break; } - if (!op.is_standard_swizzle()) { + if (!op.IsStandardSwizzle()) { out->Append('.'); if (op.component_count == 1) { out->Append(GetCharForSwizzle(op.components[0])); @@ -124,7 +131,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) { out->Append(GetCharForSwizzle(op.components[0])); out->Append(GetCharForSwizzle(op.components[1])); } else { - for (int j = 0; j < op.component_count; ++j) { + for (uint32_t j = 0; j < op.component_count; ++j) { out->Append(GetCharForSwizzle(op.components[j])); } } @@ -454,11 +461,19 @@ void ParsedTextureFetchInstruction::Disassemble(StringBuffer* out) const { } void ParsedAluInstruction::Disassemble(StringBuffer* out) const { - if (is_nop()) { - out->Append(" nop\n"); + bool is_vector_op_default_nop = IsVectorOpDefaultNop(); + bool is_scalar_op_default_nop = IsScalarOpDefaultNop(); + if (is_vector_op_default_nop && is_scalar_op_default_nop) { + out->Append(" "); + if (is_predicated) { + out->Append(predicate_condition ? " (p0) " : "(!p0) "); + } else { + out->Append(" "); + } + out->Append("nop\n"); return; } - if (has_vector_op) { + if (!is_vector_op_default_nop) { out->Append(" "); if (is_predicated) { out->Append(predicate_condition ? 
" (p0) " : "(!p0) "); @@ -466,19 +481,19 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const { out->Append(" "); } out->Append(vector_opcode_name); - if (vector_result.is_clamped) { + if (vector_and_constant_result.is_clamped) { out->Append("_sat"); } out->Append(' '); - DisassembleResultOperand(vector_result, out); - for (int i = 0; i < vector_operand_count; ++i) { + DisassembleResultOperand(vector_and_constant_result, out); + for (uint32_t i = 0; i < vector_operand_count; ++i) { out->Append(", "); DisassembleSourceOperand(vector_operands[i], out); } out->Append('\n'); } - if (has_scalar_op) { - out->Append(has_vector_op ? " + " : " "); + if (!is_scalar_op_default_nop) { + out->Append(is_vector_op_default_nop ? " " : " + "); if (is_predicated) { out->Append(predicate_condition ? " (p0) " : "(!p0) "); } else { @@ -490,7 +505,7 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const { } out->Append(' '); DisassembleResultOperand(scalar_result, out); - for (int i = 0; i < scalar_operand_count; ++i) { + for (uint32_t i = 0; i < scalar_operand_count; ++i) { out->Append(", "); DisassembleSourceOperand(scalar_operands[i], out); } diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 5680f9eca..bd8f2217e 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -2003,7 +2003,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( void SpirvShaderTranslator::ProcessAluInstruction( const ParsedAluInstruction& instr) { - if (instr.is_nop()) { + if (instr.IsNop()) { return; } @@ -2044,7 +2044,7 @@ void SpirvShaderTranslator::ProcessAluInstruction( ProcessScalarAluOperation(instr, close_predicated_block_scalar); if (store_vector) { - StoreToResult(b.createLoad(pv_), instr.vector_result); + StoreToResult(b.createLoad(pv_), instr.vector_and_constant_result); } if (store_scalar) { StoreToResult(b.createLoad(ps_), instr.scalar_result); @@ -2252,7 +2252,8 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation( const ParsedAluInstruction& instr, bool& close_predicated_block) { close_predicated_block = false; - if (!instr.has_vector_op) { + if (!instr.vector_and_constant_result.GetUsedWriteMask() && + !AluVectorOpHasSideEffects(instr.vector_opcode)) { return false; } @@ -2261,7 +2262,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation( // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = vec4_float_zero_; - for (size_t i = 0; i < instr.vector_operand_count; i++) { + for (uint32_t i = 0; i < instr.vector_operand_count; i++) { sources[i] = LoadFromOperand(instr.vector_operands[i]); } @@ -2636,7 +2637,8 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation( const ParsedAluInstruction& instr, bool& close_predicated_block) { close_predicated_block = false; - if (!instr.has_scalar_op) { + if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev && + !instr.scalar_result.GetUsedWriteMask()) { return false; } @@ -2645,12 +2647,12 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation( // TODO: If we have identical operands, reuse previous one. Id sources[3] = {0}; Id dest = b.makeFloatConstant(0); - for (size_t i = 0, x = 0; i < instr.scalar_operand_count; i++) { + for (uint32_t i = 0, x = 0; i < instr.scalar_operand_count; i++) { auto src = LoadFromOperand(instr.scalar_operands[i]); // Pull components out of the vector operands and use them as sources. 
if (instr.scalar_operands[i].component_count > 1) { - for (int j = 0; j < instr.scalar_operands[i].component_count; j++) { + for (uint32_t j = 0; j < instr.scalar_operands[i].component_count; j++) { sources[x++] = b.createCompositeExtract(src, float_type_, j); } } else { @@ -3191,7 +3193,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { } // swizzle - if (op.component_count > 1 && !op.is_standard_swizzle()) { + if (op.component_count > 1 && !op.IsStandardSwizzle()) { std::vector operands; operands.push_back(storage_value); operands.push_back(b.makeCompositeConstant( @@ -3200,7 +3202,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) { // Components start from left and are duplicated rightwards // e.g. count = 1, xxxx / count = 2, xyyy ... - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { auto swiz = op.components[i]; if (i > op.component_count - 1) { swiz = op.components[op.component_count - 1]; @@ -3244,7 +3246,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, return; } - if (!result.has_any_writes()) { + uint32_t used_write_mask = result.GetUsedWriteMask(); + if (!used_write_mask) { return; } @@ -3285,7 +3288,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, storage_array = true; assert_true(uint32_t(result.storage_index) < register_count()); break; - case InstructionStorageTarget::kInterpolant: + case InstructionStorageTarget::kInterpolator: assert_true(is_vertex_shader()); storage_pointer = interpolators_; storage_class = spv::StorageClass::StorageClassOutput; @@ -3310,7 +3313,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, storage_offsets.push_back(0); storage_array = false; break; - case InstructionStorageTarget::kColorTarget: + case InstructionStorageTarget::kColor: assert_true(is_pixel_shader()); assert_not_zero(frag_outputs_); storage_pointer = frag_outputs_; @@ -3351,7 +3354,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, // Only load from storage if we need it later. Id storage_value = 0; - if ((source_is_scalar && !storage_is_scalar) || !result.has_all_writes()) { + if ((source_is_scalar && !storage_is_scalar) || used_write_mask != 0b1111) { storage_value = b.createLoad(storage_pointer); } @@ -3366,7 +3369,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } // destination swizzle - if (!result.is_standard_swizzle() && !source_is_scalar) { + if (!result.IsStandardSwizzle() && !source_is_scalar) { std::vector operands; operands.push_back(source_value_id); operands.push_back(b.makeCompositeConstant( @@ -3377,7 +3380,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, // e.g. count = 1, xxxx / count = 2, xyyy ... uint32_t source_components = b.getNumComponents(source_value_id); for (int i = 0; i < 4; i++) { - if (!result.write_mask[i]) { + if (!(used_write_mask & (1 << i))) { // Undefined / don't care. operands.push_back(0); continue; @@ -3411,29 +3414,30 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } // write mask - if (!result.has_all_writes() && !source_is_scalar && !storage_is_scalar) { + if (used_write_mask != 0b1111 && !source_is_scalar && !storage_is_scalar) { std::vector operands; operands.push_back(source_value_id); operands.push_back(storage_value); for (int i = 0; i < b.getNumTypeComponents(storage_type); i++) { - operands.push_back( - result.write_mask[i] ? i : b.getNumComponents(source_value_id) + i); + operands.push_back((used_write_mask & (1 << i)) + ? 
i + : b.getNumComponents(source_value_id) + i); } source_value_id = b.createOp(spv::Op::OpVectorShuffle, storage_type, operands); } else if (source_is_scalar && !storage_is_scalar) { - assert_true(result.num_writes() >= 1); + assert_not_zero(used_write_mask); - if (result.has_all_writes()) { + if (used_write_mask == 0b1111) { source_value_id = b.smearScalar(spv::NoPrecision, source_value_id, storage_type); } else { // Find first enabled component uint32_t index = 0; for (uint32_t i = 0; i < 4; i++) { - if (result.write_mask[i]) { + if (used_write_mask & (1 << i)) { index = i; break; } @@ -3443,10 +3447,10 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id, } } else if (!source_is_scalar && storage_is_scalar) { // Num writes /needs/ to be 1, and let's assume it's the first element. - assert_true(result.num_writes() == 1); + assert_true(xe::bit_count(used_write_mask) == 1); for (uint32_t i = 0; i < 4; i++) { - if (result.write_mask[i]) { + if (used_write_mask & (1 << i)) { source_value_id = b.createCompositeExtract(source_value_id, storage_type, 0); break; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 7aa135bce..b588f6776 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -667,7 +667,11 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 * anything = 0) and for NaN in min/max. +// multiplication (0 * anything = 0) wherever it's present (mul, mad, dp, +// etc.) and for NaN in min/max. It's very important to respect this rule for +// multiplication, as games often rely on it in vector normalization (rcp and +// mul), Infinity * 0 resulting in NaN breaks a lot of things in games - +// causes white screen in Halo 3, white specular on characters in GTA IV. enum class AluScalarOpcode : uint32_t { // Floating-Point Add @@ -1300,8 +1304,10 @@ enum class AluVectorOpcode : uint32_t { // Whether the vector instruction has side effects such as discarding a pixel or // setting the predicate and can't be ignored even if it doesn't write to -// anywhere. -inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) { +// anywhere. Note that all scalar operations except for retain_prev have a side +// effect of modifying the previous scalar result register, so they must always +// be executed even if not writing. +constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { switch (vector_opcode) { case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpNePush: @@ -1319,7 +1325,126 @@ inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) { return false; } +// Whether each component of a source operand is used at all in the instruction +// (doesn't check the operand count though). +constexpr uint32_t GetAluVectorOpUsedSourceComponents( + AluVectorOpcode vector_opcode, uint32_t src_index) { + switch (vector_opcode) { + case AluVectorOpcode::kDp3: + return 0b0111; + case AluVectorOpcode::kDp2Add: + return src_index == 3 ? 0b0001 : 0b0011; + case AluVectorOpcode::kSetpEqPush: + case AluVectorOpcode::kSetpNePush: + case AluVectorOpcode::kSetpGtPush: + case AluVectorOpcode::kSetpGePush: + return 0b1001; + case AluVectorOpcode::kDst: + return src_index == 2 ? 
0b1010 : 0b0110; + default: + break; + } + return 0b1111; +} + +// Whether each component of a source operand is needed for the instruction if +// executed with the specified write mask, and thus can't be thrown away or be +// undefined in translation. For per-component operations, for example, only the +// components specified in the write mask are needed, but there are instructions +// with special behavior for certain components. +constexpr uint32_t GetAluVectorOpNeededSourceComponents( + AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) { + uint32_t components = write_mask; + switch (vector_opcode) { + case AluVectorOpcode::kDp4: + case AluVectorOpcode::kMax4: + components = write_mask ? 0b1111 : 0; + break; + case AluVectorOpcode::kDp3: + components = write_mask ? 0b0111 : 0; + break; + case AluVectorOpcode::kDp2Add: + components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0; + break; + case AluVectorOpcode::kCube: + components = write_mask ? 0b1111 : 0; + break; + case AluVectorOpcode::kSetpEqPush: + case AluVectorOpcode::kSetpNePush: + case AluVectorOpcode::kSetpGtPush: + case AluVectorOpcode::kSetpGePush: + components = write_mask ? 0b1001 : 0b1000; + break; + case AluVectorOpcode::kKillEq: + case AluVectorOpcode::kKillGt: + case AluVectorOpcode::kKillGe: + case AluVectorOpcode::kKillNe: + components = 0b1111; + break; + // kDst is per-component, but not all components are used - + // GetAluVectorOpUsedSourceComponents will filter out the unused ones. + case AluVectorOpcode::kMaxA: + if (src_index == 1) { + components |= 0b1000; + } + break; + default: + break; + } + return components & + GetAluVectorOpUsedSourceComponents(vector_opcode, src_index); +} + +enum class ExportRegister : uint32_t { + kVSInterpolator0 = 0, + kVSInterpolator1, + kVSInterpolator2, + kVSInterpolator3, + kVSInterpolator4, + kVSInterpolator5, + kVSInterpolator6, + kVSInterpolator7, + kVSInterpolator8, + kVSInterpolator9, + kVSInterpolator10, + kVSInterpolator11, + kVSInterpolator12, + kVSInterpolator13, + kVSInterpolator14, + kVSInterpolator15, + + kVSPosition = 62, + + // See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG, + // USE_VTX_KILL_FLAG). + // X - PSIZE (gl_PointSize). + // Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point + // drawing. + // Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set + // for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition. + kVSPointSizeEdgeFlagKillVertex = 63, + + kPSColor0 = 0, + kPSColor1, + kPSColor2, + kPSColor3, + + // In X. + kPSDepth = 61, + + // Memory export: index.?y?? * 0100 + xe_gpu_memexport_stream_t.xyzw. + kExportAddress = 32, + // Memory export: values for texels [index+0], [index+1], ..., [index+4]. + kExportData0 = 33, + kExportData1, + kExportData2, + kExportData3, + kExportData4, +}; + struct AluInstruction { + // Raw accessors. + // Whether data is being exported (or written to local registers). 
bool is_export() const { return data_.export_data == 1; }
   bool export_write_mask() const { return data_.scalar_dest_rel == 1; }
@@ -1334,20 +1459,12 @@ struct AluInstruction {
   bool is_const_1_addressed() const { return data_.const_1_rel_abs == 1; }
   bool is_address_relative() const { return data_.address_absolute == 1; }
 
-  bool has_vector_op() const {
-    return vector_write_mask() || is_export() ||
-           AluVectorOpcodeHasSideEffects(vector_opcode());
-  }
   AluVectorOpcode vector_opcode() const { return data_.vector_opc; }
   uint32_t vector_write_mask() const { return data_.vector_write_mask; }
   uint32_t vector_dest() const { return data_.vector_dest; }
   bool is_vector_dest_relative() const { return data_.vector_dest_rel == 1; }
   bool vector_clamp() const { return data_.vector_clamp == 1; }
 
-  bool has_scalar_op() const {
-    return scalar_opcode() != AluScalarOpcode::kRetainPrev ||
-           (!is_export() && scalar_write_mask() != 0);
-  }
   AluScalarOpcode scalar_opcode() const { return data_.scalar_opc; }
   uint32_t scalar_write_mask() const { return data_.scalar_write_mask; }
   uint32_t scalar_dest() const { return data_.scalar_dest; }
@@ -1407,14 +1524,62 @@ struct AluInstruction {
     }
   }
 
+  // Helpers.
+
+  // Note that even if the export component is unused (like W of the vertex
+  // shader misc register, YZW of pixel shader depth), it must still not be
+  // excluded - that may make disassembly not reassemblable if there are
+  // constant 0 writes in the export; for example, oPts.x000 will be assembled,
+  // but oPts.x00_ will not, even though W has no effect on anything.
+  uint32_t GetVectorOpResultWriteMask() const {
+    uint32_t mask = vector_write_mask();
+    if (is_export()) {
+      mask &= ~scalar_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetScalarOpResultWriteMask() const {
+    uint32_t mask = scalar_write_mask();
+    if (is_export()) {
+      mask &= ~vector_write_mask();
+    }
+    return mask;
+  }
+  uint32_t GetConstant0WriteMask() const {
+    if (!is_export() || !is_scalar_dest_relative()) {
+      return 0b0000;
+    }
+    return 0b1111 & ~(vector_write_mask() | scalar_write_mask());
+  }
+  uint32_t GetConstant1WriteMask() const {
+    if (!is_export()) {
+      return 0b0000;
+    }
+    return vector_write_mask() & scalar_write_mask();
+  }
+
  private:
   XEPACKEDSTRUCT(Data, {
     XEPACKEDSTRUCTANONYMOUS({
+      // If exporting, both vector and scalar operations use the vector
+      // destination (which can't be relative in this case).
+      // Not very important note: If both the scalar and the vector operation
+      // of an export have an empty write mask, the XNA assembler forces
+      // vector_dest to 0 (interpolator 0 or color 0) in the microcode itself.
       uint32_t vector_dest : 6;
       uint32_t vector_dest_rel : 1;
       uint32_t abs_constants : 1;
       uint32_t scalar_dest : 6;
       uint32_t scalar_dest_rel : 1;
+      // Exports have different write masking (export is done to vector_dest by
+      // both the vector and the scalar operation, and exports can write
+      // constant 0 and 1). For each component:
+      // - vector_write_mask 0, scalar_write_mask 0:
+      //   - scalar_dest_rel 0 - unchanged.
+      //   - scalar_dest_rel 1 - constant 0 (all components must be written).
+      // - vector_write_mask 1, scalar_write_mask 0 - from vector operation.
+      // - vector_write_mask 0, scalar_write_mask 1 - from scalar operation.
+      // - vector_write_mask 1, scalar_write_mask 1 - constant 1. 
uint32_t export_data : 1; uint32_t vector_write_mask : 4; uint32_t scalar_write_mask : 4; diff --git a/tools/shader-playground/Editor.cs b/tools/shader-playground/Editor.cs index dd5c46e0d..017773b39 100644 --- a/tools/shader-playground/Editor.cs +++ b/tools/shader-playground/Editor.cs @@ -267,6 +267,7 @@ namespace shader_playground { "--shader_output=" + translatedDisasmPath, "--shader_output_type=" + outputType, "--vertex_shader_output_type=" + vertexShaderType, + "--dxbc_source_map=true", }; if (translationComboBox.SelectedIndex == 1) { startArguments.Add("--shader_output_dxbc_rov=true");
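As a worked illustration of the export write masking documented in the ucode.h comments above, here is a self-contained sketch that mirrors what GetVectorOpResultWriteMask, GetScalarOpResultWriteMask, GetConstant0WriteMask and GetConstant1WriteMask derive from the packed instruction words; the struct and function names are illustrative, not part of the patch.

#include <cstdint>

// How the four possible sources map onto the components of one export.
struct ExportComponentMasks {
  uint32_t from_vector;  // Components written by the vector operation.
  uint32_t from_scalar;  // Components written by the scalar operation.
  uint32_t constant_0;   // Components set to 0.0f.
  uint32_t constant_1;   // Components set to 1.0f.
};

ExportComponentMasks CombineExportWriteMasks(uint32_t vector_write_mask,
                                             uint32_t scalar_write_mask,
                                             bool scalar_dest_rel) {
  ExportComponentMasks masks;
  // Both masks set: the component becomes constant 1.
  masks.constant_1 = vector_write_mask & scalar_write_mask;
  // Exactly one mask set: the component comes from that operation.
  masks.from_vector = vector_write_mask & ~scalar_write_mask;
  masks.from_scalar = scalar_write_mask & ~vector_write_mask;
  // Neither mask set: constant 0 if scalar_dest_rel, otherwise unchanged.
  masks.constant_0 = scalar_dest_rel
                         ? 0b1111 & ~(vector_write_mask | scalar_write_mask)
                         : 0b0000;
  return masks;
}

For instance, a vector write mask of 0b0011 combined with a scalar write mask of 0b0110 exports the vector result to x, the scalar result to z and constant 1 to y; if scalar_dest_rel is also set, w is written as constant 0.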