From 67e5cb8681d187ce59285ecf4349229a27179812 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 10 Oct 2018 16:37:35 +0300 Subject: [PATCH] [D3D12] ROV: Disable RTs when using ROV --- src/xenia/gpu/d3d12/pipeline_cache.cc | 13 +- src/xenia/gpu/d3d12/render_target_cache.cc | 4 + src/xenia/gpu/dxbc_shader_translator.cc | 158 ++++++++++++--------- src/xenia/gpu/dxbc_shader_translator.h | 6 +- 4 files changed, 107 insertions(+), 74 deletions(-) diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 860de9aa9..04a6eb11d 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -46,7 +46,8 @@ PipelineCache::PipelineCache(D3D12CommandProcessor* command_processor, // Zero out tessellation, stream output, blend state and formats for render // targets 4+, node mask, cached PSO, flags and other things. std::memset(&update_desc_, 0, sizeof(update_desc_)); - update_desc_.BlendState.IndependentBlendEnable = TRUE; + update_desc_.BlendState.IndependentBlendEnable = + edram_rov_used_ ? FALSE : TRUE; update_desc_.SampleMask = UINT_MAX; update_desc_.SampleDesc.Count = 1; } @@ -353,6 +354,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( PipelineCache::UpdateStatus PipelineCache::UpdateBlendStateAndRenderTargets( D3D12Shader* pixel_shader, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { + if (edram_rov_used_) { + return current_pipeline_ == nullptr ? UpdateStatus::kMismatch + : UpdateStatus::kCompatible; + } + auto& regs = update_blend_state_and_render_targets_regs_; bool dirty = current_pipeline_ == nullptr; @@ -624,6 +630,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizerState( PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState( DXGI_FORMAT format) { + if (edram_rov_used_) { + return current_pipeline_ == nullptr ? 
UpdateStatus::kMismatch + : UpdateStatus::kCompatible; + } + auto& regs = update_depth_stencil_state_regs_; bool dirty = current_pipeline_ == nullptr; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index d2fa952ab..94fd99ef6 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -352,6 +352,10 @@ void RenderTargetCache::BeginFrame() { } bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { + if (IsROVUsedForEDRAM()) { + return true; + } + // There are two kinds of render target binding updates in this implementation // in case something has been changed - full and partial. // diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 23d480d16..52da6a7ae 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -62,8 +62,8 @@ using namespace ucode; // second buffer in the descriptor array at b2, which is assigned to CB1, the // index would be CB1[3][0]. -DxbcShaderTranslator::DxbcShaderTranslator(bool edram_rovs_used) - : edram_rovs_used_(edram_rovs_used) { +DxbcShaderTranslator::DxbcShaderTranslator(bool edram_rov_used) + : edram_rov_used_(edram_rov_used) { // Don't allocate again and again for the first shader. shader_code_.reserve(8192); shader_object_.reserve(16384); @@ -892,6 +892,75 @@ void DxbcShaderTranslator::CompleteVertexShader() { ++stat_.mov_instruction_count; } +void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { + // Remap guest render target indices to host ones because on the host, the + // indices of the bound render targets are consecutive. This is done using 16 + // movc instructions because indexable temps are known to be causing + // performance issues on some Nvidia GPUs. In the map, the components are host + // render target indices, and the values are the guest ones. 
+ uint32_t remap_movc_mask_temp = PushSystemTemp(); + uint32_t remap_movc_target_temp = PushSystemTemp(); + system_constants_used_ |= 1u << kSysConst_ColorOutputMap_Index; + // Host RT i, guest RT j. + for (uint32_t i = 0; i < 4; ++i) { + // mask = map.iiii == (0, 1, 2, 3) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(remap_movc_mask_temp); + shader_code_.push_back(EncodeVectorReplicatedOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, i, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_ColorOutputMap_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(0); + shader_code_.push_back(1); + shader_code_.push_back(2); + shader_code_.push_back(3); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + for (uint32_t j = 0; j < 4; ++j) { + // If map.i == j, move guest color j to the temporary host color. 
+ shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(remap_movc_target_temp); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1)); + shader_code_.push_back(remap_movc_mask_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(system_temp_color_[j]); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(remap_movc_target_temp); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + } + // Write the remapped color to host render target i. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1)); + shader_code_.push_back(i); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(remap_movc_target_temp); + ++stat_.instruction_count; + ++stat_.mov_instruction_count; + } + // Free the temporary registers used for remapping. + PopSystemTemp(2); +} + +void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { + // TODO(Triang3l): Write the output to the EDRAM rasterizer-ordered view. +} + void DxbcShaderTranslator::CompletePixelShader() { // Alpha test. // Check if alpha test is enabled (if the constant is not 0). @@ -1126,68 +1195,12 @@ void DxbcShaderTranslator::CompletePixelShader() { // Release gamma_toggle_temp and gamma_pieces_temp. PopSystemTemp(2); - // Remap guest render target indices to host since because on the host, the - // indices of the bound render targets are consecutive. 
This is done using 16 - // movc instructions because indexable temps are known to be causing - // performance issues on some Nvidia GPUs. In the map, the components are host - // render target indices, and the values are the guest ones. - uint32_t remap_movc_mask_temp = PushSystemTemp(); - uint32_t remap_movc_target_temp = PushSystemTemp(); - system_constants_used_ |= 1u << kSysConst_ColorOutputMap_Index; - // Host RT i, guest RT j. - for (uint32_t i = 0; i < 4; ++i) { - // mask = map.iiii == (0, 1, 2, 3) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(remap_movc_mask_temp); - shader_code_.push_back(EncodeVectorReplicatedOperand( - D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, i, 3)); - shader_code_.push_back(cbuffer_index_system_constants_); - shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); - shader_code_.push_back(kSysConst_ColorOutputMap_Vec); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(1); - shader_code_.push_back(2); - shader_code_.push_back(3); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - for (uint32_t j = 0; j < 4; ++j) { - // If map.i == j, move guest color j to the temporary host color. 
- shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(remap_movc_target_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1)); - shader_code_.push_back(remap_movc_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_color_[j]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(remap_movc_target_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } - // Write the remapped color to host render target i. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1)); - shader_code_.push_back(i); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(remap_movc_target_temp); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; + // Write the values to the render targets. + if (edram_rov_used_) { + CompletePixelShader_WriteToROV(); + } else { + CompletePixelShader_WriteToRTVs(); } - // Free the temporary registers used for remapping. - PopSystemTemp(2); } void DxbcShaderTranslator::CompleteShaderCode() { @@ -8389,16 +8402,19 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_IS_FRONT_FACE)); ++stat_.dcl_count; // Color output. 
- for (uint32_t i = 0; i < 4; ++i) { - shader_object_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_object_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1)); - shader_object_.push_back(i); - ++stat_.dcl_count; + if (!edram_rov_used_) { + for (uint32_t i = 0; i < 4; ++i) { + shader_object_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_object_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, 0b1111, 1)); + shader_object_.push_back(i); + ++stat_.dcl_count; + } } // Depth output. + // TODO(Triang3l): Do something with this for ROV. if (writes_depth_) { shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 0595b6ed4..82130ea3a 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -23,7 +23,7 @@ namespace gpu { // Generates shader model 5_1 byte code (for Direct3D 12). class DxbcShaderTranslator : public ShaderTranslator { public: - DxbcShaderTranslator(bool edram_rovs_used); + DxbcShaderTranslator(bool edram_rov_used); ~DxbcShaderTranslator() override; // Constant buffer bindings in space 0. @@ -305,6 +305,8 @@ class DxbcShaderTranslator : public ShaderTranslator { // Writing the epilogue. void CompleteVertexShader(); + void CompletePixelShader_WriteToRTVs(); + void CompletePixelShader_WriteToROV(); void CompletePixelShader(); void CompleteShaderCode(); @@ -432,7 +434,7 @@ class DxbcShaderTranslator : public ShaderTranslator { std::vector shader_object_; // Whether the output merger should be emulated in pixel shaders. - bool edram_rovs_used_; + bool edram_rov_used_; // Data types used in constants buffers. Listed in dependency order. enum class RdefTypeIndex {