diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 63f99358f..8c1740e42 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -126,6 +126,8 @@ void DxbcShaderTranslator::Reset() { texture_srvs_.clear(); sampler_bindings_.clear(); + memexport_alloc_current_count_ = 0; + std::memset(&stat_, 0, sizeof(stat_)); } @@ -967,6 +969,33 @@ void DxbcShaderTranslator::StartTranslation() { } if (!is_depth_only_pixel_shader_) { + // Allocate temporary registers for memexport addresses and data. + std::memset(system_temps_memexport_address_, 0xFF, + sizeof(system_temps_memexport_address_)); + std::memset(system_temps_memexport_data_, 0xFF, + sizeof(system_temps_memexport_data_)); + system_temp_memexport_written_ = UINT32_MAX; + const uint8_t* memexports_written = memexport_eM_written(); + for (uint32_t i = 0; i < kMaxMemExports; ++i) { + uint32_t memexport_alloc_written = memexports_written[i]; + if (memexport_alloc_written == 0) { + continue; + } + // If memexport is used at all, allocate a register containing whether eM# + // have actually been written to. + if (system_temp_memexport_written_ == UINT32_MAX) { + system_temp_memexport_written_ = PushSystemTemp(true); + } + system_temps_memexport_address_[i] = PushSystemTemp(true); + uint32_t memexport_data_index; + while (xe::bit_scan_forward(memexport_alloc_written, + &memexport_data_index)) { + memexport_alloc_written &= ~(1u << memexport_data_index); + system_temps_memexport_data_[i][memexport_data_index] = + PushSystemTemp(); + } + } + // Allocate system temporary variables for the translated code. system_temp_pv_ = PushSystemTemp(true); system_temp_ps_pc_p0_a0_ = PushSystemTemp(true); @@ -1266,6 +1295,26 @@ void DxbcShaderTranslator::CompleteShaderCode() { // - system_temp_grad_h_lod_. // - system_temp_grad_v_. PopSystemTemp(6); + + // TODO(Triang3l): Do memexport. + + // Release memexport temporary registers. 
+ for (int i = kMaxMemExports - 1; i >= 0; --i) { + if (system_temps_memexport_address_[i] == UINT32_MAX) { + continue; + } + // Release the eM# data registers (reverse of allocation order). + for (int j = 4; j >= 0; --j) { + if (system_temps_memexport_data_[i][j] != UINT32_MAX) { + PopSystemTemp(); + } + } + // Release the eA address register. + PopSystemTemp(); + } + if (system_temp_memexport_written_ != UINT32_MAX) { + PopSystemTemp(); + } } // Write stage-specific epilogue. @@ -2009,12 +2058,30 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand( } void DxbcShaderTranslator::StoreResult(const InstructionResult& result, - uint32_t reg, bool replicate_x) { + uint32_t reg, bool replicate_x, + bool can_store_memexport_address) { if (result.storage_target == InstructionStorageTarget::kNone || !result.has_any_writes()) { return; } + // Validate memexport writes (Halo 3 has some weird invalid ones). + if (result.storage_target == InstructionStorageTarget::kExportAddress) { + if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || + memexport_alloc_current_count_ > kMaxMemExports || + system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == + UINT32_MAX) { + return; + } + } else if (result.storage_target == InstructionStorageTarget::kExportData) { + if (memexport_alloc_current_count_ == 0 || + memexport_alloc_current_count_ > kMaxMemExports || + system_temps_memexport_data_[memexport_alloc_current_count_ - 1] + [result.storage_index] == UINT32_MAX) { + return; + } + } + uint32_t saturate_bit = ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped); @@ -2187,6 +2254,34 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, shader_code_.push_back(system_temp_position_); break; + case InstructionStorageTarget::kExportAddress: + ++stat_.instruction_count; + ++stat_.mov_instruction_count; + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | + saturate_bit); 
+ shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); + shader_code_.push_back( + system_temps_memexport_address_[memexport_alloc_current_count_ - + 1]); + break; + + case InstructionStorageTarget::kExportData: + ++stat_.instruction_count; + ++stat_.mov_instruction_count; + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | + saturate_bit); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); + shader_code_.push_back( + system_temps_memexport_data_[memexport_alloc_current_count_ - 1] + [uint32_t(result.storage_index)]); + break; + case InstructionStorageTarget::kColorTarget: ++stat_.instruction_count; ++stat_.mov_instruction_count; @@ -2219,6 +2314,25 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, } } + if (result.storage_target == InstructionStorageTarget::kExportData) { + // Mark that the eM# has been written to and needs to be exported. + uint32_t memexport_index = memexport_alloc_current_count_ - 1; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back(EncodeVectorMaskedOperand( + D3D10_SB_OPERAND_TYPE_TEMP, 1 << (memexport_index >> 2), 1)); + shader_code_.push_back(system_temp_memexport_written_); + shader_code_.push_back(EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, + memexport_index >> 2, 1)); + shader_code_.push_back(system_temp_memexport_written_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back( + 1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3))); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + } + if (edram_rov_used_ && result.storage_target == InstructionStorageTarget::kColorTarget) { // For ROV output, mark that the color has been written to. 
@@ -2862,6 +2976,19 @@ void DxbcShaderTranslator::ProcessJumpInstruction( JumpToLabel(instr.target_address); } +void DxbcShaderTranslator::ProcessAllocInstruction( + const ParsedAllocInstruction& instr) { + if (FLAGS_dxbc_source_map) { + instruction_disassembly_buffer_.Reset(); + instr.Disassemble(&instruction_disassembly_buffer_); + EmitInstructionDisassembly(); + } + + if (instr.type == AllocType::kMemory) { + ++memexport_alloc_current_count_; + } +} + uint32_t DxbcShaderTranslator::AppendString(std::vector& dest, const char* source) { size_t size = std::strlen(source) + 1; diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index df004ffc8..73cd3a964 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -521,6 +521,7 @@ class DxbcShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; @@ -965,8 +966,10 @@ class DxbcShaderTranslator : public ShaderTranslator { void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand); // Writes xyzw or xxxx of the specified r# to the destination. + // eA writes are dropped unless can_store_memexport_address is set - for + // safety, only proper MADs with a stream constant may write to eA. void StoreResult(const InstructionResult& result, uint32_t reg, - bool replicate_x); + bool replicate_x, bool can_store_memexport_address = false); // The nesting of `if` instructions is the following: // - pc checks (labels). @@ -1149,20 +1152,6 @@ class DxbcShaderTranslator : public ShaderTranslator { // translation (for the declaration). uint32_t system_temp_count_max_; - // Vector ALU result/scratch (since Xenos write masks can contain swizzles). 
- uint32_t system_temp_pv_; - // Temporary register ID for previous scalar result, program counter, - // predicate and absolute address register. - uint32_t system_temp_ps_pc_p0_a0_; - // Loop index stack - .x is the active loop, shifted right to .yzw on push. - uint32_t system_temp_aL_; - // Loop counter stack, .x is the active loop. Represents number of times - // remaining to loop. - uint32_t system_temp_loop_count_; - // Explicitly set texture gradients and LOD. - uint32_t system_temp_grad_h_lod_; - uint32_t system_temp_grad_v_; - // Position in vertex shaders (because viewport and W transformations can be // applied in the end of the shader). uint32_t system_temp_position_; @@ -1182,6 +1171,29 @@ class DxbcShaderTranslator : public ShaderTranslator { // - Z - depth Y derivative. uint32_t system_temp_depth_; + // Temp register with bits telling whether each eM# has been written, or + // UINT32_MAX if memexport is not used. 8 bits (5 used) per `alloc export`, + // 4 `alloc export`s per component (up to 16 in total). + uint32_t system_temp_memexport_written_; + // Temp register for eA of each `alloc export`, or UINT32_MAX if not used. + uint32_t system_temps_memexport_address_[kMaxMemExports]; + // Temp register for eM# of each `alloc export`, or UINT32_MAX if not used. + uint32_t system_temps_memexport_data_[kMaxMemExports][5]; + + // Vector ALU result/scratch (since Xenos write masks can contain swizzles). + uint32_t system_temp_pv_; + // Temporary register ID for previous scalar result, program counter, + // predicate and absolute address register. + uint32_t system_temp_ps_pc_p0_a0_; + // Loop index stack - .x is the active loop, shifted right to .yzw on push. + uint32_t system_temp_aL_; + // Loop counter stack, .x is the active loop. Represents number of times + // remaining to loop. + uint32_t system_temp_loop_count_; + // Explicitly set texture gradients and LOD. 
+ uint32_t system_temp_grad_h_lod_; + uint32_t system_temp_grad_v_; + // The bool constant number containing the condition for the currently // processed exec (or the last - unless a label has reset this), or // kCfExecBoolConstantNone if it's not checked. @@ -1209,6 +1221,10 @@ class DxbcShaderTranslator : public ShaderTranslator { std::vector texture_srvs_; std::vector sampler_bindings_; + // Number of `alloc export`s encountered so far in the translation. The index + // of the current eA/eM# temp register set is this minus 1, if it's not 0. + uint32_t memexport_alloc_current_count_; + // The STAT chunk (based on Wine d3dcompiler_parse_stat). struct Statistics { uint32_t instruction_count; diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 4486a2201..73467a961 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -1289,7 +1289,8 @@ void DxbcShaderTranslator::ProcessVectorAluInstruction( } } - StoreResult(instr.result, system_temp_pv_, replicate_result); + StoreResult(instr.result, system_temp_pv_, replicate_result, + instr.GetMemExportStreamConstant() != UINT32_MAX); if (predicate_written) { cf_exec_predicate_written_ = true;