[DXBC] All prologues and epilogues to new DXBC code

This commit is contained in:
Triang3l 2020-04-26 13:51:25 +03:00
parent 1799585e92
commit 96a61bc623
2 changed files with 133 additions and 334 deletions

View File

@ -634,6 +634,12 @@ void DxbcShaderTranslator::StartPixelShader() {
return;
}
if (!edram_rov_used_ && writes_depth()) {
// Initialize the depth output if used, which must be written to regardless
// of the taken execution path.
DxbcOpMov(DxbcDest::ODepth(), DxbcSrc::LF(0.0f));
}
uint32_t interpolator_count = std::min(kInterpolatorCount, register_count());
if (interpolator_count != 0) {
// Copy interpolants to GPRs.
@ -901,333 +907,136 @@ void DxbcShaderTranslator::StartTranslation() {
}
// Start the main loop (for jumping to labels by setting pc and continuing).
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_LOOP) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
DxbcOpLoop();
// Switch and the first label (pc == 0).
if (UseSwitchForControlFlow()) {
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_SWITCH) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(system_temp_ps_pc_p0_a0_);
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_CASE) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(0);
++stat_.instruction_count;
++stat_.static_flow_control_count;
DxbcOpSwitch(DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kYYYY));
DxbcOpCase(DxbcSrc::LU(0));
} else {
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) |
ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(
D3D10_SB_INSTRUCTION_TEST_ZERO) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(system_temp_ps_pc_p0_a0_);
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
DxbcOpIf(false, DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kYYYY));
}
}
void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
// Get what we need to do with the position.
uint32_t ndc_control_temp = PushSystemTemp();
uint32_t temp = PushSystemTemp();
DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001));
DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX));
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(ndc_control_temp);
shader_code_.push_back(EncodeVectorReplicatedOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_Flags_Vec);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(kSysFlag_XYDividedByW);
shader_code_.push_back(kSysFlag_ZDividedByW);
shader_code_.push_back(kSysFlag_WNotReciprocal);
shader_code_.push_back(kSysFlag_ReverseZ);
++stat_.instruction_count;
++stat_.uint_instruction_count;
DxbcSrc flags_src(DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp));
// Revert getting the reciprocal of W and dividing XY by W if needed.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should enable
// Check if the shader already returns W, not 1/W, and if it doesn't, turn 1/W
// into W.
DxbcOpAnd(temp_x_dest, flags_src, DxbcSrc::LU(kSysFlag_WNotReciprocal));
DxbcOpIf(false, temp_x_src);
DxbcOpRcp(DxbcDest::R(system_temp_position_, 0b1000),
DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW));
DxbcOpEndIf();
// Check if the shader returns XY/W rather than XY, and if it does, revert
// that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
uint32_t w_format_temp = PushSystemTemp();
// If the shader has returned 1/W, restore W. First take the reciprocal, which
// may be either W (what we need) or 1/W, depending on the vertex W format.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_RCP) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(w_format_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.float_instruction_count;
// Then, if the shader returns 1/W (vtx_w0_fmt is 0), write 1/(1/W) to the
// position.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1));
shader_code_.push_back(ndc_control_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(w_format_temp);
++stat_.instruction_count;
++stat_.movc_instruction_count;
// Multiply XYZ by W in case the shader returns XYZ/W and we'll need to
// restore XYZ.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1));
shader_code_.push_back(w_format_temp);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.float_instruction_count;
// If vtx_xy_fmt and/or vtx_z_fmt are 1, XY and/or Z are pre-divided by W.
// Restore them in this case.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b00010000, 1));
shader_code_.push_back(ndc_control_temp);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(w_format_temp);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
// Release w_format_temp.
PopSystemTemp();
DxbcOpAnd(temp_x_dest, flags_src, DxbcSrc::LU(kSysFlag_XYDividedByW));
DxbcOpIf(true, temp_x_src);
DxbcOpMul(DxbcDest::R(system_temp_position_, 0b0011),
DxbcSrc::R(system_temp_position_),
DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW));
DxbcOpEndIf();
// Check if the shader returns Z/W rather than Z, and if it does, revert that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
DxbcOpAnd(temp_x_dest, flags_src, DxbcSrc::LU(kSysFlag_ZDividedByW));
DxbcOpIf(true, temp_x_src);
DxbcOpMul(DxbcDest::R(system_temp_position_, 0b0100),
DxbcSrc::R(system_temp_position_, DxbcSrc::kZZZZ),
DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW));
DxbcOpEndIf();
// Zero-initialize SV_ClipDistance# (for user clip planes) and SV_CullDistance
// (for vertex kill) in case they're not needed.
DxbcOpMov(DxbcDest::O(uint32_t(InOutRegister::kVSDSOutClipDistance0123)),
DxbcSrc::LF(0.0f));
DxbcOpMov(DxbcDest::O(
uint32_t(InOutRegister::kVSDSOutClipDistance45AndCullDistance),
0b0111),
DxbcSrc::LF(0.0f));
// Clip against user clip planes.
// Not possible to handle UCP_CULL_ONLY_ENA with the same shader though, since
// there can be only 8 SV_ClipDistance + SV_CullDistance values at most, but
// 12 would be needed.
uint32_t ucp_dot_temp = PushSystemTemp();
uint32_t ucp_enabled_temp = PushSystemTemp();
system_constants_used_ |= (1ull << kSysConst_UserClipPlanes_Index) |
(1ull << kSysConst_Flags_Index);
for (uint32_t i = 0; i < 2; ++i) {
uint32_t ucp_count = i ? 2 : 4;
uint32_t ucp_mask = (1 << ucp_count) - 1;
for (uint32_t j = 0; j < ucp_count; ++j) {
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DP4) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 << j, 1));
shader_code_.push_back(ucp_dot_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_UserClipPlanes_Vec + i * 4 + j);
++stat_.instruction_count;
++stat_.float_instruction_count;
system_constants_used_ |= 1ull << kSysConst_UserClipPlanes_Index;
for (uint32_t i = 0; i < 6; ++i) {
// Check if the clip plane is enabled - this `if` is needed, as opposed to
// just zeroing the clip planes in the constants, so Infinity and NaN in the
// position won't have any effect caused by this if clip planes are
// disabled.
DxbcOpAnd(temp_x_dest, flags_src,
DxbcSrc::LU(kSysFlag_UserClipPlane0 << i));
DxbcOpIf(true, temp_x_src);
DxbcOpDP4(DxbcDest::O(
uint32_t(InOutRegister::kVSDSOutClipDistance0123) + (i >> 2),
1 << (i & 3)),
DxbcSrc::R(system_temp_position_),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_UserClipPlanes_Vec + i));
DxbcOpEndIf();
}
// Using movc rather than zeroing the planes in the constants because dp4
// would handle Infinity and NaN in an unexpected way.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, ucp_mask, 1));
shader_code_.push_back(ucp_enabled_temp);
shader_code_.push_back(EncodeVectorReplicatedOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_Flags_Vec);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
for (uint32_t j = 0; j < ucp_count; ++j) {
shader_code_.push_back(kSysFlag_UserClipPlane0 << (i * 4 + j));
}
for (uint32_t j = ucp_count; j < 4; ++j) {
shader_code_.push_back(0);
}
++stat_.instruction_count;
++stat_.uint_instruction_count;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, ucp_mask, 1));
shader_code_.push_back(uint32_t(InOutRegister::kVSDSOutClipDistance0123) +
i);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(ucp_enabled_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(ucp_dot_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
shader_code_.push_back(0);
shader_code_.push_back(0);
shader_code_.push_back(0);
++stat_.instruction_count;
++stat_.movc_instruction_count;
}
// Release ucp_dot_temp and ucp_enabled_temp.
PopSystemTemp(2);
// Apply scale for drawing without a viewport, and also remap from OpenGL
// Z clip space to Direct3D if needed. Also, if the vertex shader is
// multipass, the NDC scale constant can be used to set position to NaN to
// kill all primitives.
system_constants_used_ |= 1ull << kSysConst_NDCScale_Index;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER,
kSysConst_NDCScale_Comp | ((kSysConst_NDCScale_Comp + 1) << 2) |
((kSysConst_NDCScale_Comp + 2) << 4),
3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_NDCScale_Vec);
++stat_.instruction_count;
++stat_.float_instruction_count;
DxbcOpMul(DxbcDest::R(system_temp_position_, 0b0111),
DxbcSrc::R(system_temp_position_),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_NDCScale_Vec,
kSysConst_NDCScale_Comp * 0b010101 + 0b100100));
// Reverse Z (Z = W - Z) if the viewport depth is inverted.
uint32_t reverse_z_temp = PushSystemTemp();
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(reverse_z_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1) |
ENCODE_D3D10_SB_OPERAND_EXTENDED(1));
shader_code_.push_back(
ENCODE_D3D10_SB_EXTENDED_OPERAND_MODIFIER(D3D10_SB_OPERAND_MODIFIER_NEG));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.float_instruction_count;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(ndc_control_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(reverse_z_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
// Release reverse_z_temp.
PopSystemTemp();
// Release ndc_control_temp.
PopSystemTemp();
DxbcOpAnd(temp_x_dest, flags_src, DxbcSrc::LU(kSysFlag_ReverseZ));
DxbcOpIf(true, temp_x_src);
DxbcOpAdd(DxbcDest::R(system_temp_position_, 0b0100),
DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW),
-DxbcSrc::R(system_temp_position_, DxbcSrc::kZZZZ));
DxbcOpEndIf();
// Apply offset (multiplied by W) for drawing without a viewport and for half
// pixel offset.
system_constants_used_ |= 1ull << kSysConst_NDCOffset_Index;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAD) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER,
kSysConst_NDCOffset_Comp | ((kSysConst_NDCOffset_Comp + 1) << 2) |
((kSysConst_NDCOffset_Comp + 2) << 4),
3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_NDCOffset_Vec);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1));
shader_code_.push_back(system_temp_position_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.float_instruction_count;
DxbcOpMAd(DxbcDest::R(system_temp_position_, 0b0111),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_NDCOffset_Vec,
kSysConst_NDCOffset_Comp * 0b010101 + 0b100100),
DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW),
DxbcSrc::R(system_temp_position_));
// Write Z and W of the position to a separate attribute so ROV output can get
// per-sample depth.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b0011, 1));
shader_code_.push_back(uint32_t(InOutRegister::kVSDSOutClipSpaceZW));
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b11111110, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.mov_instruction_count;
DxbcOpMov(DxbcDest::O(uint32_t(InOutRegister::kVSDSOutClipSpaceZW), 0b0011),
DxbcSrc::R(system_temp_position_, 0b1110));
// Initialize SV_CullDistance.
DxbcOpMov(DxbcDest::O(
uint32_t(InOutRegister::kVSDSOutClipDistance45AndCullDistance),
0b0100),
DxbcSrc::LF(0.0f));
// Assuming SV_CullDistance was zeroed earlier in this function.
// Kill the primitive if needed - check if the shader wants to kill.
// TODO(Triang3l): Find if the condition is actually the flag being non-zero.
uint32_t kill_temp = PushSystemTemp();
DxbcOpNE(
DxbcDest::R(kill_temp, 0b0001),
temp_x_dest,
DxbcSrc::R(system_temp_point_size_edge_flag_kill_vertex_, DxbcSrc::kZZZZ),
DxbcSrc::LF(0.0f));
DxbcOpIf(true, DxbcSrc::R(kill_temp, DxbcSrc::kXXXX));
DxbcOpIf(true, temp_x_src);
{
// Extract the killing condition.
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
DxbcOpAnd(DxbcDest::R(kill_temp, 0b0001),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcOpAnd(temp_x_dest, flags_src,
DxbcSrc::LU(kSysFlag_KillIfAnyVertexKilled_Shift));
DxbcOpIf(true, DxbcSrc::R(kill_temp, DxbcSrc::kXXXX));
// Release kill_temp.
PopSystemTemp();
DxbcOpIf(true, temp_x_src);
{
// Kill the primitive if any vertex is killed - write NaN to position.
DxbcOpMov(DxbcDest::R(system_temp_position_, 0b1000),
@ -1248,16 +1057,8 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
DxbcOpEndIf();
// Write the position to the output.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1));
shader_code_.push_back(uint32_t(InOutRegister::kVSDSOutPosition));
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_position_);
++stat_.instruction_count;
++stat_.mov_instruction_count;
DxbcOpMov(DxbcDest::O(uint32_t(InOutRegister::kVSDSOutPosition)),
DxbcSrc::R(system_temp_position_));
// Zero the point coordinate (will be set in the geometry shader if needed)
// and write the point size.
@ -1268,6 +1069,9 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
DxbcDest::O(uint32_t(InOutRegister::kVSDSOutPointParameters), 0b0100),
DxbcSrc::R(system_temp_point_size_edge_flag_kill_vertex_,
DxbcSrc::kXXXX));
// Release temp.
PopSystemTemp();
}
void DxbcShaderTranslator::CompleteShaderCode() {
@ -1277,28 +1081,14 @@ void DxbcShaderTranslator::CompleteShaderCode() {
CloseExecConditionals();
// Close the last label and the switch.
if (UseSwitchForControlFlow()) {
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_BREAK) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDSWITCH) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
DxbcOpBreak();
DxbcOpEndSwitch();
} else {
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
DxbcOpEndIf();
}
// End the main loop.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_BREAK) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDLOOP) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
DxbcOpBreak();
DxbcOpEndLoop();
// Release the following system temporary values so epilogue can reuse them:
// - system_temp_pv_.
@ -1339,10 +1129,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
}
// Return from `main`.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_RET) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
++stat_.instruction_count;
++stat_.static_flow_control_count;
DxbcOpRet();
// Write subroutines - can only do this immediately after `ret`. They still
// need the global system temps, and can't allocate their own temps (since
@ -4261,20 +4048,6 @@ void DxbcShaderTranslator::WriteShaderCode() {
stat_.temp_array_count += register_count();
}
// Initialize the depth output if used, which must be initialized on every
// execution path.
if (!edram_rov_used_ && IsDxbcPixelShader() && writes_depth()) {
shader_object_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4));
shader_object_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0));
shader_object_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_object_.push_back(0);
++stat_.instruction_count;
++stat_.mov_instruction_count;
}
// Write the translated shader code.
size_t code_size_dwords = shader_code_.size();
// So [] won't crash in case the size is zero somehow.

View File

@ -1082,6 +1082,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
kDefault = 10,
kDiscard = 13,
kDiv = 14,
kDP4 = 17,
kElse = 18,
kEndIf = 21,
kEndLoop = 22,
@ -1101,6 +1102,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
kIShL = 41,
kIToF = 43,
kLabel = 44,
kLoop = 48,
kLT = 49,
kMAd = 50,
kMin = 51,
@ -1129,6 +1131,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
kDerivRTXFine = 123,
kDerivRTYCoarse = 124,
kDerivRTYFine = 125,
kRcp = 129,
kF32ToF16 = 130,
kF16ToF32 = 131,
kFirstBitHi = 135,
@ -1286,6 +1289,19 @@ class DxbcShaderTranslator : public ShaderTranslator {
DxbcEmitAluOp(DxbcOpcode::kDiv, 0b00, dest, src0, src1, saturate);
++stat_.float_instruction_count;
}
// Appends a DP4 instruction to shader_code_: the opcode token, then the
// destination, then both sources. Each source is sized and written with the
// 0b1111 mask (presumably meaning all four components are read - confirm
// against DxbcSrc). Increments the total and float instruction counters.
void DxbcOpDP4(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1,
               bool saturate = false) {
  constexpr uint32_t kSourceMask = 0b1111;
  // Total length of the three operands as reported by the operands
  // themselves; the extra 1 reserved below is for the opcode token.
  const uint32_t operand_length = dest.GetLength() +
                                  src0.GetLength(kSourceMask) +
                                  src1.GetLength(kSourceMask);
  shader_code_.reserve(shader_code_.size() + 1 + operand_length);
  shader_code_.push_back(
      DxbcOpcodeToken(DxbcOpcode::kDP4, operand_length, saturate));
  dest.Write(shader_code_);
  src0.Write(shader_code_, false, kSourceMask);
  src1.Write(shader_code_, false, kSourceMask);
  stat_.instruction_count += 1;
  stat_.float_instruction_count += 1;
}
void DxbcOpElse() {
shader_code_.push_back(DxbcOpcodeToken(DxbcOpcode::kElse, 0));
++stat_.instruction_count;
@ -1378,6 +1394,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
label.Write(shader_code_, true, 0b0000);
// Doesn't count towards stat_.instruction_count.
}
// Appends an operand-less `loop` opcode token to shader_code_ and records it
// in the statistics as one instruction and one dynamic flow control
// instruction.
void DxbcOpLoop() {
  const auto loop_token = DxbcOpcodeToken(DxbcOpcode::kLoop, 0);
  shader_code_.push_back(loop_token);
  stat_.instruction_count += 1;
  stat_.dynamic_flow_control_count += 1;
}
void DxbcOpLT(const DxbcDest& dest, const DxbcSrc& src0,
const DxbcSrc& src1) {
DxbcEmitAluOp(DxbcOpcode::kLT, 0b00, dest, src0, src1);
@ -1521,6 +1542,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
DxbcEmitAluOp(DxbcOpcode::kDerivRTYFine, 0b0, dest, src, saturate);
++stat_.float_instruction_count;
}
// Emits an RCP instruction with one source through the shared ALU emission
// helper, optionally saturating the result, and counts it as a float
// instruction.
void DxbcOpRcp(const DxbcDest& dest, const DxbcSrc& src,
               bool saturate = false) {
  DxbcEmitAluOp(DxbcOpcode::kRcp, 0b0, dest, src, saturate);
  stat_.float_instruction_count += 1;
}
void DxbcOpF32ToF16(const DxbcDest& dest, const DxbcSrc& src) {
DxbcEmitAluOp(DxbcOpcode::kF32ToF16, 0b0, dest, src);
++stat_.conversion_instruction_count;