diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 54d15d5d2..97035eb94 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1798,6 +1798,8 @@ void DxbcShaderTranslator::ProcessLoopStartInstruction( } // Break if the loop counter is 0 (since the condition is checked in the end). + // TODO(Triang3l): Move this before pushing and address loading. This won't + // pop if the counter is 0. a_.OpIf(false, dxbc::Src::R(system_temp_loop_count_, dxbc::Src::kXXXX)); JumpToLabel(instr.loop_skip_address); a_.OpEndIf(); diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 427837a6e..91964e332 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -302,8 +302,9 @@ struct ParsedExecInstruction { // Whether this exec ends the shader. bool is_end = false; - // Whether to reset the current predicate. - bool clean = true; + // Whether the hardware doesn't have to wait for the predicate to be updated + // after this exec. + bool is_predicate_clean = true; // ? bool is_yield = false; diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index adc56656e..4e4bce854 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -662,7 +662,7 @@ void ParseControlFlowExec(const ControlFlowExecInstruction& cf, instr.instruction_count = cf.count(); instr.type = ParsedExecInstruction::Type::kUnconditional; instr.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd; - instr.clean = cf.clean(); + instr.is_predicate_clean = cf.is_predicate_clean(); instr.is_yield = cf.is_yield(); instr.sequence = cf.sequence(); } @@ -689,7 +689,7 @@ void ParseControlFlowCondExec(const ControlFlowCondExecInstruction& cf, switch (cf.opcode()) { case ControlFlowOpcode::kCondExec: case ControlFlowOpcode::kCondExecEnd: - instr.clean = false; + instr.is_predicate_clean = false; break; default: break; @@ -710,7 +710,7 @@ void ParseControlFlowCondExecPred(const ControlFlowCondExecPredInstruction& cf, instr.type = ParsedExecInstruction::Type::kPredicated; instr.condition = cf.condition(); instr.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd; - instr.clean = cf.clean(); + instr.is_predicate_clean = cf.is_predicate_clean(); instr.is_yield = cf.is_yield(); instr.sequence = cf.sequence(); } diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc index 3abb25893..d12cccd83 100644 --- a/src/xenia/gpu/shader_translator_disasm.cc +++ b/src/xenia/gpu/shader_translator_disasm.cc @@ -160,7 +160,7 @@ void ParsedExecInstruction::Disassemble(StringBuffer* out) const { } out->Append(" Yield=true"); } - if (!clean) { + if (!is_predicate_clean) { out->Append(" // PredicateClean=false"); } out->Append('\n'); diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 47aabd63a..e4ce497d1 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -97,18 +97,44 @@ enum class ControlFlowOpcode : uint32_t { // Executes fetch or ALU instructions then ends execution. kExecEnd = 2, // Conditionally executes based on a bool const. + // PredicateClean = false. kCondExec = 3, // Conditionally executes based on a bool const then ends execution. + // PredicateClean = false. + // According to the IPR2015-00325 sequencer specification, execution ends only + // if the condition is actually met (unlike on the R600, where execution ends + // unconditionally if END_OF_PROGRAM is set in the control flow instruction). + // 4D5307ED has many shaders (used, in particular, in the Press Start screen + // background) with if / else in its tail done via cexece - cexece b130, then + // cexece !b130, and then an empty exece (if the condition was ignored, the + // second cexece would have never been reached). Also, the validator reports + // "Shader will try to execute instruction 3.0, but last possible instruction + // is 2.1" for a shader that contains just one cexece without an exece. kCondExecEnd = 4, // Conditionally executes based on the current predicate. + // Since 64 vertices or pixels are processed by each sequencer in the Xenos + // hardware, the actual condition is AND of the predicate values for all + // active (and not killed) invocations for (!p0) exec, and OR of them for + // (p0) exec - if any of the invocations passes the predicate check, all of + // them will enter the exec. This is according to the IPR2015-00325 sequencer + // specification. Because of this, the compiler makes the ALU and fetch + // instructions themselves inside a predicated exec predicated as well. The + // validator also reports mismatch between the control flow predication and + // ALU / fetch predication. kCondExecPred = 5, // Conditionally executes based on the current predicate then ends execution. + // According to the IPR2015-00325 sequencer specification, execution ends only + // if the condition is actually met for any of the invocations (unlike on the + // R600, where execution ends unconditionally if END_OF_PROGRAM is set in the + // control flow instruction). kCondExecPredEnd = 6, - // Starts a loop that must be terminated with kLoopEnd. + // Starts a loop that must be terminated with kLoopEnd, with depth of up to 4. kLoopStart = 7, - // Continues or breaks out of a loop started with kLoopStart. + // Continues or breaks out of a loop started with kLoopStart. According to the + // IPR2015-00325 sequencer specification, the incrementing of the loop + // iteration happens before the count and the predicated break checks. kLoopEnd = 8, - // Conditionally calls a function. + // Conditionally calls a function, with depth of up to 4. // A return address is pushed to the stack to be used by a kReturn. kCondCall = 9, // Returns from the current function as called by kCondCall. @@ -118,11 +144,21 @@ enum class ControlFlowOpcode : uint32_t { kCondJmp = 11, // Allocates output values. kAlloc = 12, - // Conditionally executes based on the current predicate. - // Optionally resets the predicate value. + // Conditionally executes based on a bool const. + // PredicateClean = true. + // This is cexec with a bool constant (kCondExec, can be verified by comparing + // the XNA disassembler output with cexec containing and not containing a setp + // instruction), not a kCondExecPred. kCondExec doesn't have a predicate clean + // field (the space is occupied by the bool constant index), while + // kCondExecPred itself does. This is unlike what the IPR2015-00325 sequencer + // specification says about the Conditional_Execute_Predicates_No_Stall + // instruction using this opcode (in the specification, this is described as + // behaving like kCondExecPred with PredicateClean = false, but the + // specification is likely highly outdated - it doesn't even have predicate + // clean fields in exec instructions overall). kCondExecPredClean = 13, - // Conditionally executes based on the current predicate then ends execution. - // Optionally resets the predicate value. + // Conditionally executes based on a bool const then ends execution. + // PredicateClean = true. kCondExecPredCleanEnd = 14, // Hints that no more vertex fetches will be performed. kMarkVsFetchDone = 15, @@ -150,9 +186,9 @@ constexpr bool DoesControlFlowOpcodeEndShader(ControlFlowOpcode opcode) { opcode == ControlFlowOpcode::kCondExecPredCleanEnd; } -// Returns true if the given control flow opcode resets the predicate prior to -// execution. -constexpr bool DoesControlFlowOpcodeCleanPredicate(ControlFlowOpcode opcode) { +// See the description of ControlFlowOpcode::kCondExecPredClean. +constexpr bool DoesControlFlowCondExecHaveCleanPredicate( + ControlFlowOpcode opcode) { return opcode == ControlFlowOpcode::kCondExecPredClean || opcode == ControlFlowOpcode::kCondExecPredCleanEnd; } @@ -193,31 +229,39 @@ struct ControlFlowExecInstruction { uint32_t count() const { return count_; } // Sequence bits, 2 per instruction. // [0] - ALU (0) or fetch (1), [1] - serialize. - uint32_t sequence() const { return serialize_; } - // Whether to reset the current predicate. - bool clean() const { return clean_ == 1; } + uint32_t sequence() const { return sequence_; } + bool is_predicate_clean() const { return is_predicate_clean_ == 1; } // ? - bool is_yield() const { return is_yeild_ == 1; } + bool is_yield() const { return is_yield_ == 1; } private: // Word 0: (32 bits) uint32_t address_ : 12; uint32_t count_ : 3; - uint32_t is_yeild_ : 1; - uint32_t serialize_ : 12; + uint32_t is_yield_ : 1; + uint32_t sequence_ : 12; uint32_t vc_hi_ : 4; // Vertex cache? // Word 1: (16 bits) uint32_t vc_lo_ : 2; uint32_t : 7; - uint32_t clean_ : 1; + // According to the description of Conditional_Execute_Predicates_No_Stall in + // the IPR2015-00325 sequencer specification, the sequencer's control flow + // logic will not wait for the predicate to be updated (apparently after this + // exec). The compiler specifies PredicateClean=false for the exec if the + // instructions inside it modify the predicate (but if the predicate set is + // only a refinement of the current predicate, like in case of a nested `if`, + // PredicateClean=true may still be set according to the IPR2015-00325 + // sequencer specification, because the optimization would still work). + uint32_t is_predicate_clean_ : 1; uint32_t : 1; AddressingMode address_mode_ : 1; ControlFlowOpcode opcode_ : 4; }; static_assert_size(ControlFlowExecInstruction, sizeof(uint32_t) * 2); -// Instruction data for ControlFlowOpcode::kCondExec and kCondExecEnd. +// Instruction data for ControlFlowOpcode::kCondExec, kCondExecEnd, +// kCondExecPredClean and kCondExecPredCleanEnd. struct ControlFlowCondExecInstruction { ControlFlowOpcode opcode() const { return opcode_; } AddressingMode addressing_mode() const { return address_mode_; } @@ -227,20 +271,20 @@ struct ControlFlowCondExecInstruction { uint32_t count() const { return count_; } // Sequence bits, 2 per instruction. // [0] - ALU (0) or fetch (1), [1] - serialize. - uint32_t sequence() const { return serialize_; } + uint32_t sequence() const { return sequence_; } // Constant index used as the conditional. uint32_t bool_address() const { return bool_address_; } // Required condition value of the comparision (true or false). bool condition() const { return condition_ == 1; } // ? - bool is_yield() const { return is_yeild_ == 1; } + bool is_yield() const { return is_yield_ == 1; } private: // Word 0: (32 bits) uint32_t address_ : 12; uint32_t count_ : 3; - uint32_t is_yeild_ : 1; - uint32_t serialize_ : 12; + uint32_t is_yield_ : 1; + uint32_t sequence_ : 12; uint32_t vc_hi_ : 4; // Vertex cache? // Word 1: (16 bits) @@ -252,8 +296,7 @@ struct ControlFlowCondExecInstruction { }; static_assert_size(ControlFlowCondExecInstruction, sizeof(uint32_t) * 2); -// Instruction data for ControlFlowOpcode::kCondExecPred, kCondExecPredEnd, -// kCondExecPredClean, kCondExecPredCleanEnd. +// Instruction data for ControlFlowOpcode::kCondExecPred and kCondExecPredEnd. struct ControlFlowCondExecPredInstruction { ControlFlowOpcode opcode() const { return opcode_; } AddressingMode addressing_mode() const { return address_mode_; } @@ -263,26 +306,25 @@ struct ControlFlowCondExecPredInstruction { uint32_t count() const { return count_; } // Sequence bits, 2 per instruction. // [0] - ALU (0) or fetch (1), [1] - serialize. - uint32_t sequence() const { return serialize_; } - // Whether to reset the current predicate. - bool clean() const { return clean_ == 1; } + uint32_t sequence() const { return sequence_; } + bool is_predicate_clean() const { return is_predicate_clean_ == 1; } // Required condition value of the comparision (true or false). bool condition() const { return condition_ == 1; } // ? - bool is_yield() const { return is_yeild_ == 1; } + bool is_yield() const { return is_yield_ == 1; } private: // Word 0: (32 bits) uint32_t address_ : 12; uint32_t count_ : 3; - uint32_t is_yeild_ : 1; - uint32_t serialize_ : 12; + uint32_t is_yield_ : 1; + uint32_t sequence_ : 12; uint32_t vc_hi_ : 4; // Vertex cache? // Word 1: (16 bits) uint32_t vc_lo_ : 2; uint32_t : 7; - uint32_t clean_ : 1; + uint32_t is_predicate_clean_ : 1; uint32_t condition_ : 1; AddressingMode address_mode_ : 1; ControlFlowOpcode opcode_ : 4; @@ -293,12 +335,13 @@ static_assert_size(ControlFlowCondExecPredInstruction, sizeof(uint32_t) * 2); struct ControlFlowLoopStartInstruction { ControlFlowOpcode opcode() const { return opcode_; } AddressingMode addressing_mode() const { return address_mode_; } - // Target address to jump to when skipping the loop. + // Target address to jump to when skipping the loop (normally points to the + // instruction right after the `endloop` instruction). uint32_t address() const { return address_; } // Whether to reuse the current aL instead of reset it to loop start. bool is_repeat() const { return is_repeat_; } - // Integer constant register that holds the loop parameters. - // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. + // Integer constant register that holds the loop parameters + // (xenos::LoopConstant). uint32_t loop_id() const { return loop_id_; } private: @@ -320,12 +363,14 @@ static_assert_size(ControlFlowLoopStartInstruction, sizeof(uint32_t) * 2); struct ControlFlowLoopEndInstruction { ControlFlowOpcode opcode() const { return opcode_; } AddressingMode addressing_mode() const { return address_mode_; } - // Target address of the start of the loop body. + // Target address of the start of the loop body (normally points to the + // instruction right after the `loop` instruction). uint32_t address() const { return address_; } - // Integer constant register that holds the loop parameters. - // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. + // Integer constant register that holds the loop parameters + // (xenos::LoopConstant). uint32_t loop_id() const { return loop_id_; } - // Break from the loop if the predicate matches the expected value. + // Break from the loop if the predicate in all 64 invocations matches the + // expected value. bool is_predicated_break() const { return is_predicated_break_; } // Required condition value of the comparision (true or false). bool condition() const { return condition_ == 1; } diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index d2279a7b8..e9c7ff53f 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -975,6 +975,21 @@ inline uint32_t GpuToCpu(uint32_t p) { return p; } inline uint32_t CpuToGpu(uint32_t p) { return p & 0x1FFFFFFF; } +// XE_GPU_REG_SHADER_CONSTANT_LOOP_* +union alignas(uint32_t) LoopConstant { + uint32_t value; + struct { + uint32_t count : 8; // +0 + // Address (aL) start and step. + // The resulting aL is `iterator * step + start`, 10-bit, and has the real + // range of [-256, 256], according to the IPR2015-00325 sequencer + // specification. + uint32_t start : 8; // +8 + int32_t step : 8; // +16 + }; +}; +static_assert_size(LoopConstant, sizeof(uint32_t)); + // SQ_TEX_VTX_INVALID/VALID_TEXTURE/BUFFER enum class FetchConstantType : uint32_t { kInvalidTexture,