[GPU] Shader control flow documentation improvements

This commit is contained in:
Triang3l 2022-04-27 21:34:08 +03:00
parent b42680abf7
commit 5519dbb39f
6 changed files with 107 additions and 44 deletions

View File

@ -1798,6 +1798,8 @@ void DxbcShaderTranslator::ProcessLoopStartInstruction(
}
// Break if the loop counter is 0 (since the condition is checked at the end).
// TODO(Triang3l): Move this before pushing and address loading. This won't
// pop if the counter is 0.
a_.OpIf(false, dxbc::Src::R(system_temp_loop_count_, dxbc::Src::kXXXX));
JumpToLabel(instr.loop_skip_address);
a_.OpEndIf();

View File

@ -302,8 +302,9 @@ struct ParsedExecInstruction {
// Whether this exec ends the shader.
bool is_end = false;
// Whether to reset the current predicate.
bool clean = true;
// Whether the hardware doesn't have to wait for the predicate to be updated
// after this exec.
bool is_predicate_clean = true;
// ?
bool is_yield = false;

View File

@ -662,7 +662,7 @@ void ParseControlFlowExec(const ControlFlowExecInstruction& cf,
instr.instruction_count = cf.count();
instr.type = ParsedExecInstruction::Type::kUnconditional;
instr.is_end = cf.opcode() == ControlFlowOpcode::kExecEnd;
instr.clean = cf.clean();
instr.is_predicate_clean = cf.is_predicate_clean();
instr.is_yield = cf.is_yield();
instr.sequence = cf.sequence();
}
@ -689,7 +689,7 @@ void ParseControlFlowCondExec(const ControlFlowCondExecInstruction& cf,
switch (cf.opcode()) {
case ControlFlowOpcode::kCondExec:
case ControlFlowOpcode::kCondExecEnd:
instr.clean = false;
instr.is_predicate_clean = false;
break;
default:
break;
@ -710,7 +710,7 @@ void ParseControlFlowCondExecPred(const ControlFlowCondExecPredInstruction& cf,
instr.type = ParsedExecInstruction::Type::kPredicated;
instr.condition = cf.condition();
instr.is_end = cf.opcode() == ControlFlowOpcode::kCondExecPredEnd;
instr.clean = cf.clean();
instr.is_predicate_clean = cf.is_predicate_clean();
instr.is_yield = cf.is_yield();
instr.sequence = cf.sequence();
}

View File

@ -160,7 +160,7 @@ void ParsedExecInstruction::Disassemble(StringBuffer* out) const {
}
out->Append(" Yield=true");
}
if (!clean) {
if (!is_predicate_clean) {
out->Append(" // PredicateClean=false");
}
out->Append('\n');

View File

@ -97,18 +97,44 @@ enum class ControlFlowOpcode : uint32_t {
// Executes fetch or ALU instructions then ends execution.
kExecEnd = 2,
// Conditionally executes based on a bool const.
// PredicateClean = false.
kCondExec = 3,
// Conditionally executes based on a bool const then ends execution.
// PredicateClean = false.
// According to the IPR2015-00325 sequencer specification, execution ends only
// if the condition is actually met (unlike on the R600, where execution ends
// unconditionally if END_OF_PROGRAM is set in the control flow instruction).
// 4D5307ED has many shaders (used, in particular, in the Press Start screen
// background) with if / else in its tail done via cexece - cexece b130, then
// cexece !b130, and then an empty exece (if the condition was ignored, the
// second cexece would have never been reached). Also, the validator reports
// "Shader will try to execute instruction 3.0, but last possible instruction
// is 2.1" for a shader that contains just one cexece without an exece.
kCondExecEnd = 4,
// Conditionally executes based on the current predicate.
// Since 64 vertices or pixels are processed by each sequencer in the Xenos
// hardware, the actual condition is AND of the predicate values for all
// active (and not killed) invocations for (!p0) exec, and OR of them for
// (p0) exec - if any of the invocations passes the predicate check, all of
// them will enter the exec. This is according to the IPR2015-00325 sequencer
// specification. Because of this, the compiler makes the ALU and fetch
// instructions themselves inside a predicated exec predicated as well. The
// validator also reports a mismatch between the control flow predication
// and ALU / fetch predication.
kCondExecPred = 5,
// Conditionally executes based on the current predicate then ends execution.
// According to the IPR2015-00325 sequencer specification, execution ends only
// if the condition is actually met for any of the invocations (unlike on the
// R600, where execution ends unconditionally if END_OF_PROGRAM is set in the
// control flow instruction).
kCondExecPredEnd = 6,
// Starts a loop that must be terminated with kLoopEnd.
// Starts a loop that must be terminated with kLoopEnd, with depth of up to 4.
kLoopStart = 7,
// Continues or breaks out of a loop started with kLoopStart.
// Continues or breaks out of a loop started with kLoopStart. According to the
// IPR2015-00325 sequencer specification, the incrementing of the loop
// iteration happens before the count and the predicated break checks.
kLoopEnd = 8,
// Conditionally calls a function.
// Conditionally calls a function, with depth of up to 4.
// A return address is pushed to the stack to be used by a kReturn.
kCondCall = 9,
// Returns from the current function as called by kCondCall.
@ -118,11 +144,21 @@ enum class ControlFlowOpcode : uint32_t {
kCondJmp = 11,
// Allocates output values.
kAlloc = 12,
// Conditionally executes based on the current predicate.
// Optionally resets the predicate value.
// Conditionally executes based on a bool const.
// PredicateClean = true.
// This is cexec with a bool constant (kCondExec, can be verified by comparing
// the XNA disassembler output with cexec containing and not containing a setp
// instruction), not a kCondExecPred. kCondExec doesn't have a predicate clean
// field (the space is occupied by the bool constant index), while
// kCondExecPred itself does. This is unlike what the IPR2015-00325 sequencer
// specification says about the Conditional_Execute_Predicates_No_Stall
// instruction using this opcode (in the specification, this is described as
// behaving like kCondExecPred with PredicateClean = false, but the
// specification is likely highly outdated - it doesn't even have predicate
// clean fields in exec instructions overall).
kCondExecPredClean = 13,
// Conditionally executes based on the current predicate then ends execution.
// Optionally resets the predicate value.
// Conditionally executes based on a bool const then ends execution.
// PredicateClean = true.
kCondExecPredCleanEnd = 14,
// Hints that no more vertex fetches will be performed.
kMarkVsFetchDone = 15,
@ -150,9 +186,9 @@ constexpr bool DoesControlFlowOpcodeEndShader(ControlFlowOpcode opcode) {
opcode == ControlFlowOpcode::kCondExecPredCleanEnd;
}
// Returns true if the given control flow opcode resets the predicate prior to
// execution.
constexpr bool DoesControlFlowOpcodeCleanPredicate(ControlFlowOpcode opcode) {
// See the description of ControlFlowOpcode::kCondExecPredClean.
constexpr bool DoesControlFlowCondExecHaveCleanPredicate(
ControlFlowOpcode opcode) {
return opcode == ControlFlowOpcode::kCondExecPredClean ||
opcode == ControlFlowOpcode::kCondExecPredCleanEnd;
}
@ -193,31 +229,39 @@ struct ControlFlowExecInstruction {
uint32_t count() const { return count_; }
// Sequence bits, 2 per instruction.
// [0] - ALU (0) or fetch (1), [1] - serialize.
uint32_t sequence() const { return serialize_; }
// Whether to reset the current predicate.
bool clean() const { return clean_ == 1; }
uint32_t sequence() const { return sequence_; }
bool is_predicate_clean() const { return is_predicate_clean_ == 1; }
// ?
bool is_yield() const { return is_yeild_ == 1; }
bool is_yield() const { return is_yield_ == 1; }
private:
// Word 0: (32 bits)
uint32_t address_ : 12;
uint32_t count_ : 3;
uint32_t is_yeild_ : 1;
uint32_t serialize_ : 12;
uint32_t is_yield_ : 1;
uint32_t sequence_ : 12;
uint32_t vc_hi_ : 4; // Vertex cache?
// Word 1: (16 bits)
uint32_t vc_lo_ : 2;
uint32_t : 7;
uint32_t clean_ : 1;
// According to the description of Conditional_Execute_Predicates_No_Stall in
// the IPR2015-00325 sequencer specification, the sequencer's control flow
// logic will not wait for the predicate to be updated (apparently after this
// exec). The compiler specifies PredicateClean=false for the exec if the
// instructions inside it modify the predicate (but if the predicate set is
// only a refinement of the current predicate, like in case of a nested `if`,
// PredicateClean=true may still be set according to the IPR2015-00325
// sequencer specification, because the optimization would still work).
uint32_t is_predicate_clean_ : 1;
uint32_t : 1;
AddressingMode address_mode_ : 1;
ControlFlowOpcode opcode_ : 4;
};
static_assert_size(ControlFlowExecInstruction, sizeof(uint32_t) * 2);
// Instruction data for ControlFlowOpcode::kCondExec and kCondExecEnd.
// Instruction data for ControlFlowOpcode::kCondExec, kCondExecEnd,
// kCondExecPredClean and kCondExecPredCleanEnd.
struct ControlFlowCondExecInstruction {
ControlFlowOpcode opcode() const { return opcode_; }
AddressingMode addressing_mode() const { return address_mode_; }
@ -227,20 +271,20 @@ struct ControlFlowCondExecInstruction {
uint32_t count() const { return count_; }
// Sequence bits, 2 per instruction.
// [0] - ALU (0) or fetch (1), [1] - serialize.
uint32_t sequence() const { return serialize_; }
uint32_t sequence() const { return sequence_; }
// Constant index used as the conditional.
uint32_t bool_address() const { return bool_address_; }
// Required condition value of the comparison (true or false).
bool condition() const { return condition_ == 1; }
// ?
bool is_yield() const { return is_yeild_ == 1; }
bool is_yield() const { return is_yield_ == 1; }
private:
// Word 0: (32 bits)
uint32_t address_ : 12;
uint32_t count_ : 3;
uint32_t is_yeild_ : 1;
uint32_t serialize_ : 12;
uint32_t is_yield_ : 1;
uint32_t sequence_ : 12;
uint32_t vc_hi_ : 4; // Vertex cache?
// Word 1: (16 bits)
@ -252,8 +296,7 @@ struct ControlFlowCondExecInstruction {
};
static_assert_size(ControlFlowCondExecInstruction, sizeof(uint32_t) * 2);
// Instruction data for ControlFlowOpcode::kCondExecPred, kCondExecPredEnd,
// kCondExecPredClean, kCondExecPredCleanEnd.
// Instruction data for ControlFlowOpcode::kCondExecPred and kCondExecPredEnd.
struct ControlFlowCondExecPredInstruction {
ControlFlowOpcode opcode() const { return opcode_; }
AddressingMode addressing_mode() const { return address_mode_; }
@ -263,26 +306,25 @@ struct ControlFlowCondExecPredInstruction {
uint32_t count() const { return count_; }
// Sequence bits, 2 per instruction.
// [0] - ALU (0) or fetch (1), [1] - serialize.
uint32_t sequence() const { return serialize_; }
// Whether to reset the current predicate.
bool clean() const { return clean_ == 1; }
uint32_t sequence() const { return sequence_; }
bool is_predicate_clean() const { return is_predicate_clean_ == 1; }
// Required condition value of the comparison (true or false).
bool condition() const { return condition_ == 1; }
// ?
bool is_yield() const { return is_yeild_ == 1; }
bool is_yield() const { return is_yield_ == 1; }
private:
// Word 0: (32 bits)
uint32_t address_ : 12;
uint32_t count_ : 3;
uint32_t is_yeild_ : 1;
uint32_t serialize_ : 12;
uint32_t is_yield_ : 1;
uint32_t sequence_ : 12;
uint32_t vc_hi_ : 4; // Vertex cache?
// Word 1: (16 bits)
uint32_t vc_lo_ : 2;
uint32_t : 7;
uint32_t clean_ : 1;
uint32_t is_predicate_clean_ : 1;
uint32_t condition_ : 1;
AddressingMode address_mode_ : 1;
ControlFlowOpcode opcode_ : 4;
@ -293,12 +335,13 @@ static_assert_size(ControlFlowCondExecPredInstruction, sizeof(uint32_t) * 2);
struct ControlFlowLoopStartInstruction {
ControlFlowOpcode opcode() const { return opcode_; }
AddressingMode addressing_mode() const { return address_mode_; }
// Target address to jump to when skipping the loop.
// Target address to jump to when skipping the loop (normally points to the
// instruction right after the `endloop` instruction).
uint32_t address() const { return address_; }
// Whether to reuse the current aL instead of resetting it to the loop start.
bool is_repeat() const { return is_repeat_; }
// Integer constant register that holds the loop parameters.
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
// Integer constant register that holds the loop parameters
// (xenos::LoopConstant).
uint32_t loop_id() const { return loop_id_; }
private:
@ -320,12 +363,14 @@ static_assert_size(ControlFlowLoopStartInstruction, sizeof(uint32_t) * 2);
struct ControlFlowLoopEndInstruction {
ControlFlowOpcode opcode() const { return opcode_; }
AddressingMode addressing_mode() const { return address_mode_; }
// Target address of the start of the loop body.
// Target address of the start of the loop body (normally points to the
// instruction right after the `loop` instruction).
uint32_t address() const { return address_; }
// Integer constant register that holds the loop parameters.
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
// Integer constant register that holds the loop parameters
// (xenos::LoopConstant).
uint32_t loop_id() const { return loop_id_; }
// Break from the loop if the predicate matches the expected value.
// Break from the loop if the predicate in all 64 invocations matches the
// expected value.
bool is_predicated_break() const { return is_predicated_break_; }
// Required condition value of the comparison (true or false).
bool condition() const { return condition_ == 1; }

View File

@ -975,6 +975,21 @@ inline uint32_t GpuToCpu(uint32_t p) { return p; }
inline uint32_t CpuToGpu(uint32_t p) { return p & 0x1FFFFFFF; }
// XE_GPU_REG_SHADER_CONSTANT_LOOP_*
// One 32-bit loop constant, viewable either as the raw register `value` or as
// the bit fields describing the loop iteration count and the address register
// (aL) progression. The trailing `// +N` notes give each field's starting bit.
union alignas(uint32_t) LoopConstant {
// The whole constant as a single 32-bit word.
uint32_t value;
struct {
// Unsigned 8-bit loop count.
uint32_t count : 8; // +0
// Address (aL) start and step.
// The resulting aL is `iterator * step + start`, 10-bit, and has the real
// range of [-256, 256], according to the IPR2015-00325 sequencer
// specification.
// The start is an unsigned 8-bit field.
uint32_t start : 8; // +8
// The step is a signed 8-bit field (declared as int32_t, unlike the start).
int32_t step : 8; // +16
// Only 24 of the 32 bits are named; the remaining 8 bits have no field here.
};
};
// NOTE(review): static_assert_size is a project macro not visible here -
// presumably it verifies that the union is exactly one 32-bit word; confirm.
static_assert_size(LoopConstant, sizeof(uint32_t));
// SQ_TEX_VTX_INVALID/VALID_TEXTURE/BUFFER
enum class FetchConstantType : uint32_t {
kInvalidTexture,