[D3D12] ROV: Disable RTs when using ROV

This commit is contained in:
Triang3l 2018-10-10 16:37:35 +03:00
parent 6d2e74325c
commit 67e5cb8681
4 changed files with 107 additions and 74 deletions

View File

@ -46,7 +46,8 @@ PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor,
// Zero out tessellation, stream output, blend state and formats for render
// targets 4+, node mask, cached PSO, flags and other things.
std::memset(&update_desc_, 0, sizeof(update_desc_));
update_desc_.BlendState.IndependentBlendEnable = TRUE;
update_desc_.BlendState.IndependentBlendEnable =
edram_rov_used_ ? FALSE : TRUE;
update_desc_.SampleMask = UINT_MAX;
update_desc_.SampleDesc.Count = 1;
}
@ -353,6 +354,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages(
PipelineCache::UpdateStatus PipelineCache::UpdateBlendStateAndRenderTargets(
D3D12Shader* pixel_shader,
const RenderTargetCache::PipelineRenderTarget render_targets[4]) {
if (edram_rov_used_) {
return current_pipeline_ == nullptr ? UpdateStatus::kMismatch
: UpdateStatus::kCompatible;
}
auto& regs = update_blend_state_and_render_targets_regs_;
bool dirty = current_pipeline_ == nullptr;
@ -624,6 +630,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizerState(
PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState(
DXGI_FORMAT format) {
if (edram_rov_used_) {
return current_pipeline_ == nullptr ? UpdateStatus::kMismatch
: UpdateStatus::kCompatible;
}
auto& regs = update_depth_stencil_state_regs_;
bool dirty = current_pipeline_ == nullptr;

View File

@ -352,6 +352,10 @@ void RenderTargetCache::BeginFrame() {
}
bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) {
if (IsROVUsedForEDRAM()) {
return true;
}
// There are two kinds of render target binding updates in this implementation
// in case something has been changed - full and partial.
//

View File

@ -62,8 +62,8 @@ using namespace ucode;
// second buffer in the descriptor array at b2, which is assigned to CB1, the
// index would be CB1[3][0].
DxbcShaderTranslator::DxbcShaderTranslator(bool edram_rovs_used)
: edram_rovs_used_(edram_rovs_used) {
DxbcShaderTranslator::DxbcShaderTranslator(bool edram_rov_used)
: edram_rov_used_(edram_rov_used) {
// Don't allocate again and again for the first shader.
shader_code_.reserve(8192);
shader_object_.reserve(16384);
@ -892,6 +892,75 @@ void DxbcShaderTranslator::CompleteVertexShader() {
++stat_.mov_instruction_count;
}
void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
// Remap guest render target indices to host since because on the host, the
// indices of the bound render targets are consecutive. This is done using 16
// movc instructions because indexable temps are known to be causing
// performance issues on some Nvidia GPUs. In the map, the components are host
// render target indices, and the values are the guest ones.
uint32_t remap_movc_mask_temp = PushSystemTemp();
uint32_t remap_movc_target_temp = PushSystemTemp();
system_constants_used_ |= 1u << kSysConst_ColorOutputMap_Index;
// Host RT i, guest RT j.
for (uint32_t i = 0; i < 4; ++i) {
// mask = map.iiii == (0, 1, 2, 3)
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(remap_movc_mask_temp);
shader_code_.push_back(EncodeVectorReplicatedOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, i, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_ColorOutputMap_Vec);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
shader_code_.push_back(1);
shader_code_.push_back(2);
shader_code_.push_back(3);
++stat_.instruction_count;
++stat_.int_instruction_count;
for (uint32_t j = 0; j < 4; ++j) {
// If map.i == j, move guest color j to the temporary host color.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(remap_movc_target_temp);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1));
shader_code_.push_back(remap_movc_mask_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_color_[j]);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(remap_movc_target_temp);
++stat_.instruction_count;
++stat_.movc_instruction_count;
}
// Write the remapped color to host render target i.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1));
shader_code_.push_back(i);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(remap_movc_target_temp);
++stat_.instruction_count;
++stat_.mov_instruction_count;
}
// Free the temporary registers used for remapping.
PopSystemTemp(2);
}
void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// TODO(Triang3l): Write the output to the EDRAM rasterizer-ordered view.
}
void DxbcShaderTranslator::CompletePixelShader() {
// Alpha test.
// Check if alpha test is enabled (if the constant is not 0).
@ -1126,68 +1195,12 @@ void DxbcShaderTranslator::CompletePixelShader() {
// Release gamma_toggle_temp and gamma_pieces_temp.
PopSystemTemp(2);
// Remap guest render target indices to host since because on the host, the
// indices of the bound render targets are consecutive. This is done using 16
// movc instructions because indexable temps are known to be causing
// performance issues on some Nvidia GPUs. In the map, the components are host
// render target indices, and the values are the guest ones.
uint32_t remap_movc_mask_temp = PushSystemTemp();
uint32_t remap_movc_target_temp = PushSystemTemp();
system_constants_used_ |= 1u << kSysConst_ColorOutputMap_Index;
// Host RT i, guest RT j.
for (uint32_t i = 0; i < 4; ++i) {
// mask = map.iiii == (0, 1, 2, 3)
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(remap_movc_mask_temp);
shader_code_.push_back(EncodeVectorReplicatedOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, i, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants));
shader_code_.push_back(kSysConst_ColorOutputMap_Vec);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
shader_code_.push_back(1);
shader_code_.push_back(2);
shader_code_.push_back(3);
++stat_.instruction_count;
++stat_.int_instruction_count;
for (uint32_t j = 0; j < 4; ++j) {
// If map.i == j, move guest color j to the temporary host color.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(remap_movc_target_temp);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1));
shader_code_.push_back(remap_movc_mask_temp);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_color_[j]);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(remap_movc_target_temp);
++stat_.instruction_count;
++stat_.movc_instruction_count;
}
// Write the remapped color to host render target i.
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1));
shader_code_.push_back(i);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(remap_movc_target_temp);
++stat_.instruction_count;
++stat_.mov_instruction_count;
// Write the values to the render targets.
if (edram_rov_used_) {
CompletePixelShader_WriteToROV();
} else {
CompletePixelShader_WriteToRTVs();
}
// Free the temporary registers used for remapping.
PopSystemTemp(2);
}
void DxbcShaderTranslator::CompleteShaderCode() {
@ -8389,16 +8402,19 @@ void DxbcShaderTranslator::WriteShaderCode() {
shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_IS_FRONT_FACE));
++stat_.dcl_count;
// Color output.
for (uint32_t i = 0; i < 4; ++i) {
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_object_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1));
shader_object_.push_back(i);
++stat_.dcl_count;
if (!edram_rov_used_) {
for (uint32_t i = 0; i < 4; ++i) {
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_object_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1));
shader_object_.push_back(i);
++stat_.dcl_count;
}
}
// Depth output.
// TODO(Triang3l): Do something with this for ROV.
if (writes_depth_) {
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) |

View File

@ -23,7 +23,7 @@ namespace gpu {
// Generates shader model 5_1 byte code (for Direct3D 12).
class DxbcShaderTranslator : public ShaderTranslator {
public:
DxbcShaderTranslator(bool edram_rovs_used);
DxbcShaderTranslator(bool edram_rov_used);
~DxbcShaderTranslator() override;
// Constant buffer bindings in space 0.
@ -305,6 +305,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Writing the epilogue.
void CompleteVertexShader();
void CompletePixelShader_WriteToRTVs();
void CompletePixelShader_WriteToROV();
void CompletePixelShader();
void CompleteShaderCode();
@ -432,7 +434,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
std::vector<uint32_t> shader_object_;
// Whether the output merger should be emulated in pixel shaders.
bool edram_rovs_used_;
bool edram_rov_used_;
// Data types used in constants buffers. Listed in dependency order.
enum class RdefTypeIndex {