[D3D12] ROV: Track which RTs and components have been actually written

This commit is contained in:
Triang3l 2018-10-18 14:54:33 +03:00
parent 1860bc6a59
commit f48ea20880
4 changed files with 186 additions and 61 deletions

View File

@ -1887,26 +1887,19 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
80;
dirty |= system_constants_.edram_pitch_tiles != edram_pitch_tiles;
system_constants_.edram_pitch_tiles = edram_pitch_tiles;
static const uint32_t kRTFormatAllComponentsMask[16] = {
0b1111, 0b1111, 0b1111, 0b1111, 0b0011, 0b1111, 0b0011, 0b1111,
0b0000, 0b0000, 0b1111, 0b0000, 0b1111, 0b0000, 0b0001, 0b0011,
};
uint32_t rt_mask_all = kRTFormatAllComponentsMask[uint32_t(color_format)];
uint32_t rt_mask = (rb_color_mask >> (i * 4)) & rt_mask_all;
uint32_t rt_flags =
DxbcShaderTranslator::GetColorFormatRTFlags(color_format);
// Exclude unused components from the write mask.
uint32_t rt_mask =
(rb_color_mask >> (i * 4)) & 0xF &
~(rt_flags >> DxbcShaderTranslator::kRTFlag_FormatUnusedR_Shift);
if (rt_mask != 0) {
rt_flags |= DxbcShaderTranslator::kRTFlag_Used |
(rt_mask << DxbcShaderTranslator::kRTFlag_WriteR_Shift);
if (rt_mask != rt_mask_all) {
rt_flags |= DxbcShaderTranslator::kRTFlag_Load;
}
rt_flags |= rt_mask << DxbcShaderTranslator::kRTFlag_WriteR_Shift;
uint32_t blend_x, blend_y;
if (colorcontrol_blend_enable &&
DxbcShaderTranslator::GetBlendConstants(blend_control, blend_x,
blend_y)) {
rt_flags |= DxbcShaderTranslator::kRTFlag_Load |
DxbcShaderTranslator::kRTFlag_Blend;
rt_flags |= DxbcShaderTranslator::kRTFlag_Blend;
uint32_t rt_pair_index = i >> 1;
uint32_t rt_pair_comp = (i & 1) << 1;
if (system_constants_

View File

@ -83,29 +83,33 @@ uint32_t DxbcShaderTranslator::GetColorFormatRTFlags(
// k_2_10_10_10_FLOAT
kRTFlag_FormatFloat10,
// k_16_16
kRTFlag_FormatFixed,
kRTFlag_FormatFixed | kRTFlag_FormatUnusedB | kRTFlag_FormatUnusedA,
// k_16_16_16_16
kRTFlag_FormatFixed,
// k_16_16_FLOAT
kRTFlag_FormatFloat16,
kRTFlag_FormatFloat16 | kRTFlag_FormatUnusedB | kRTFlag_FormatUnusedA,
// k_16_16_16_16_FLOAT
kRTFlag_FormatFloat16,
// Unused
0,
kRTFlag_FormatUnusedR | kRTFlag_FormatUnusedG | kRTFlag_FormatUnusedB |
kRTFlag_FormatUnusedA,
// Unused
0,
kRTFlag_FormatUnusedR | kRTFlag_FormatUnusedG | kRTFlag_FormatUnusedB |
kRTFlag_FormatUnusedA,
// k_2_10_10_10_AS_16_16_16_16
kRTFlag_FormatFixed,
// Unused.
0,
kRTFlag_FormatUnusedR | kRTFlag_FormatUnusedG | kRTFlag_FormatUnusedB |
kRTFlag_FormatUnusedA,
// k_2_10_10_10_FLOAT_AS_16_16_16_16
kRTFlag_FormatFloat10,
// Unused.
0,
kRTFlag_FormatUnusedR | kRTFlag_FormatUnusedG | kRTFlag_FormatUnusedB |
kRTFlag_FormatUnusedA,
// k_32_FLOAT
0,
kRTFlag_FormatUnusedG | kRTFlag_FormatUnusedB | kRTFlag_FormatUnusedA,
// k_32_32_FLOAT
0,
kRTFlag_FormatUnusedB | kRTFlag_FormatUnusedA,
};
return kRTFormatFlags[uint32_t(format)];
}
@ -1054,10 +1058,15 @@ void DxbcShaderTranslator::StartTranslation() {
} else if (IsDXBCPixelShader()) {
if (!is_depth_only_pixel_shader_) {
for (uint32_t i = 0; i < 4; ++i) {
system_temp_color_[i] = PushSystemTemp(true);
// In the ROV path, no need to initialize the colors because original
// values will be kept for the unwritten components.
system_temp_color_[i] = PushSystemTemp(!edram_rov_used_);
}
}
if (edram_rov_used_) {
if (!is_depth_only_pixel_shader_) {
system_temp_color_written_ = PushSystemTemp(true);
}
system_temp_depth_ = PushSystemTemp();
}
}
@ -3242,6 +3251,14 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV_StoreColor(
}
void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
bool color_targets_written;
if (is_depth_only_pixel_shader_) {
color_targets_written = false;
} else {
color_targets_written = writes_color_target(0) || writes_color_target(1) ||
writes_color_target(2) || writes_color_target(3);
}
// ***************************************************************************
// Calculate the offsets of the samples in the EDRAM.
// ***************************************************************************
@ -3399,7 +3416,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Calculate the address in the EDRAM buffer.
if (!is_depth_only_pixel_shader_) {
if (color_targets_written) {
// 1a) Get dword offset within the tile to edram_coord_low_temp.x.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
@ -3437,7 +3454,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
++stat_.instruction_count;
++stat_.uint_instruction_count;
if (!is_depth_only_pixel_shader_) {
if (color_targets_written) {
// 2a) Combine the tile offset and the offset within the tile to
// edram_coord_low_temp.x.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) |
@ -3481,7 +3498,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
uint32_t edram_coord_high_temp = 0;
if (!is_depth_only_pixel_shader_) {
if (color_targets_written) {
edram_coord_high_temp = PushSystemTemp();
// Get which render targets are 64bpp, as log2 of dword count per pixel.
@ -3990,16 +4007,16 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// Write to color render targets.
// ***************************************************************************
if (!is_depth_only_pixel_shader_) {
if (color_targets_written) {
system_constants_used_ |= 1ull << kSysConst_EDRAMRTFlags_Index;
// Get what render targets need to be written to.
uint32_t rt_used_temp = PushSystemTemp();
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) |
// Mask disabled color writes.
uint32_t rt_write_masks_temp = PushSystemTemp();
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_used_temp);
shader_code_.push_back(rt_write_masks_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
@ -4007,33 +4024,27 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(kRTFlag_Used);
shader_code_.push_back(kRTFlag_Used);
shader_code_.push_back(kRTFlag_Used);
shader_code_.push_back(kRTFlag_Used);
shader_code_.push_back(kRTFlag_WriteR_Shift);
shader_code_.push_back(kRTFlag_WriteR_Shift);
shader_code_.push_back(kRTFlag_WriteR_Shift);
shader_code_.push_back(kRTFlag_WriteR_Shift);
++stat_.instruction_count;
++stat_.uint_instruction_count;
// Get what render targets need to be read (for write masks and blending).
uint32_t rt_load_temp = PushSystemTemp();
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_load_temp);
shader_code_.push_back(system_temp_color_written_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec);
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_color_written_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(kRTFlag_Load);
shader_code_.push_back(kRTFlag_Load);
shader_code_.push_back(kRTFlag_Load);
shader_code_.push_back(kRTFlag_Load);
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(rt_write_masks_temp);
++stat_.instruction_count;
++stat_.uint_instruction_count;
// Release rt_write_masks_temp.
PopSystemTemp();
// Get what render targets need blending (if only write mask is used and no
// blending, skip blending).
@ -4057,7 +4068,92 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
++stat_.instruction_count;
++stat_.uint_instruction_count;
// Get what render targets need to be read (for write mask and blending).
uint32_t rt_overwritten_temp = PushSystemTemp();
// First, ignore components that don't exist in the render target at all -
// treat them as overwritten.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_UBFE) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_overwritten_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(4);
shader_code_.push_back(4);
shader_code_.push_back(4);
shader_code_.push_back(4);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(kRTFlag_FormatUnusedR_Shift);
shader_code_.push_back(kRTFlag_FormatUnusedR_Shift);
shader_code_.push_back(kRTFlag_FormatUnusedR_Shift);
shader_code_.push_back(kRTFlag_FormatUnusedR_Shift);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec);
++stat_.instruction_count;
++stat_.uint_instruction_count;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_overwritten_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_color_written_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(rt_overwritten_temp);
++stat_.instruction_count;
++stat_.uint_instruction_count;
// Then, check if the write mask + unused components is 1111 - if yes (and
// not blending), the pixel will be totally overwritten and no need to load
// the old pixel value.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_overwritten_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(rt_overwritten_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0b1111);
shader_code_.push_back(0b1111);
shader_code_.push_back(0b1111);
shader_code_.push_back(0b1111);
++stat_.instruction_count;
++stat_.int_instruction_count;
// Force load the previous pixel if blending.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(rt_overwritten_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(rt_blend_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
shader_code_.push_back(0);
shader_code_.push_back(0);
shader_code_.push_back(0);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(rt_overwritten_temp);
++stat_.instruction_count;
++stat_.movc_instruction_count;
for (uint32_t i = 0; i < 4; ++i) {
if (!writes_color_target(i)) {
continue;
}
// Check if the render target needs to be written to.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) |
ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(
@ -4065,7 +4161,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1));
shader_code_.push_back(rt_used_temp);
shader_code_.push_back(system_temp_color_written_);
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
@ -4077,11 +4173,11 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// write mask.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) |
ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(
D3D10_SB_INSTRUCTION_TEST_NONZERO) |
D3D10_SB_INSTRUCTION_TEST_ZERO) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1));
shader_code_.push_back(rt_load_temp);
shader_code_.push_back(rt_overwritten_temp);
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
uint32_t dest_color_temp = PushSystemTemp();
@ -4165,12 +4261,12 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
++stat_.instruction_count;
}
// Release rt_used_temp, rt_load_temp and rt_blend_temp.
PopSystemTemp(3);
// Release rt_blend_temp and rt_overwritten_temp.
PopSystemTemp(2);
}
// Release edram_coord_low_temp and, if used, edram_coord_high_temp.
PopSystemTemp(is_depth_only_pixel_shader_ ? 1 : 2);
PopSystemTemp(color_targets_written ? 2 : 1);
}
void DxbcShaderTranslator::CompletePixelShader() {
@ -4476,6 +4572,10 @@ void DxbcShaderTranslator::CompleteShaderCode() {
if (edram_rov_used_) {
// Release system_temp_depth_.
PopSystemTemp();
if (!is_depth_only_pixel_shader_) {
// Release system_temp_color_written_.
PopSystemTemp();
}
}
if (!is_depth_only_pixel_shader_) {
// Release system_temp_color_.
@ -5460,6 +5560,30 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
}
PopSystemTemp(2);
}
if (edram_rov_used_ &&
result.storage_target == InstructionStorageTarget::kColorTarget) {
// For ROV output, mark that the color has been written to.
// According to:
// https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color
// if a color target has not been written to - including due to flow
// control - the render target must not be modified (the unwritten components
// of a written target are undefined, but let's keep the original value in
// this case).
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 1 << uint32_t(result.storage_index), 1));
shader_code_.push_back(system_temp_color_written_);
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_TEMP, uint32_t(result.storage_index), 1));
shader_code_.push_back(system_temp_color_written_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(swizzle_mask | constant_mask);
++stat_.instruction_count;
++stat_.uint_instruction_count;
}
}
void DxbcShaderTranslator::ClosePredicate() {

View File

@ -77,17 +77,19 @@ class DxbcShaderTranslator : public ShaderTranslator {
};
enum : uint32_t {
// Whether the write mask is non-zero.
kRTFlag_Used_Shift,
// Whether the render target needs to be merged with another (if the write
// mask is not 1111, or 11 for 16_16, or 1 for 32_FLOAT, or blending is
// enabled and it's not no-op).
kRTFlag_Load_Shift,
kRTFlag_Blend_Shift,
kRTFlag_WriteR_Shift,
kRTFlag_WriteG_Shift,
kRTFlag_WriteB_Shift,
kRTFlag_WriteA_Shift,
kRTFlag_Blend_Shift,
// Whether the component does not exist in the render target format.
kRTFlag_FormatUnusedR_Shift,
kRTFlag_FormatUnusedG_Shift,
kRTFlag_FormatUnusedB_Shift,
kRTFlag_FormatUnusedA_Shift,
// Whether the format is fixed-point and needs to be converted to integer
// (k_8_8_8_8, k_2_10_10_10, k_16_16, k_16_16_16_16).
kRTFlag_FormatFixed_Shift,
@ -97,13 +99,15 @@ class DxbcShaderTranslator : public ShaderTranslator {
// f16tof32/f32tof16 is needed.
kRTFlag_FormatFloat16_Shift,
kRTFlag_Used = 1u << kRTFlag_Used_Shift,
kRTFlag_Load = 1u << kRTFlag_Load_Shift,
kRTFlag_Blend = 1u << kRTFlag_Blend_Shift,
kRTFlag_WriteR = 1u << kRTFlag_WriteR_Shift,
kRTFlag_WriteG = 1u << kRTFlag_WriteG_Shift,
kRTFlag_WriteB = 1u << kRTFlag_WriteB_Shift,
kRTFlag_WriteA = 1u << kRTFlag_WriteA_Shift,
kRTFlag_Blend = 1u << kRTFlag_Blend_Shift,
kRTFlag_FormatUnusedR = 1u << kRTFlag_FormatUnusedR_Shift,
kRTFlag_FormatUnusedG = 1u << kRTFlag_FormatUnusedG_Shift,
kRTFlag_FormatUnusedB = 1u << kRTFlag_FormatUnusedB_Shift,
kRTFlag_FormatUnusedA = 1u << kRTFlag_FormatUnusedA_Shift,
kRTFlag_FormatFixed = 1u << kRTFlag_FormatFixed_Shift,
kRTFlag_FormatFloat10 = 1u << kRTFlag_FormatFloat10_Shift,
kRTFlag_FormatFloat16 = 1u << kRTFlag_FormatFloat16_Shift,
@ -919,6 +923,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Color outputs in pixel shaders (because of exponent bias, alpha test and
// remapping).
uint32_t system_temp_color_[4];
// Whether the color output has been written in the execution path (ROV only).
uint32_t system_temp_color_written_;
// Depth output in pixel shader, and 3 dwords usable as scratch for operations
// related to depth. Currently only used for ROV depth.
// TODO(Triang3l): Reduce depth to 24-bit in pixel shaders when using a DSV

View File

@ -53,6 +53,8 @@ class ShaderTranslator {
bool uses_register_dynamic_addressing() const {
return uses_register_dynamic_addressing_;
}
// Reports whether the current shader writes to color target `i` on at least
// one execution path. The value is read from writes_color_targets_[i]; the
// index is not range-checked here.
bool writes_color_target(int i) const {
  const bool target_written = writes_color_targets_[i];
  return target_written;
}
// A list of all vertex bindings, populated before translation occurs.
const std::vector<Shader::VertexBinding>& vertex_bindings() const {
return vertex_bindings_;