From 66a37c0cc392c8ac3f4762cc81bd70b292369203 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 17 Nov 2018 17:15:15 +0300 Subject: [PATCH] [D3D12] ROV: Aliasing and bounds checking, retc instead of discard --- .../gpu/d3d12/d3d12_command_processor.cc | 154 ++++++++++++------ src/xenia/gpu/d3d12/d3d12_command_processor.h | 2 +- src/xenia/gpu/dxbc_shader_translator.cc | 72 +++++++- 3 files changed, 177 insertions(+), 51 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index f097df267..e233e68b8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1223,7 +1223,7 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, // Update system constants before uploading them. UpdateSystemConstantValues( indexed ? index_buffer_info->endianness : Endian::kUnspecified, - pipeline_render_targets); + color_mask, pipeline_render_targets); // Update constant buffers, descriptors and root parameters. if (!UpdateBindings(command_list, vertex_shader, pixel_shader, @@ -1607,7 +1607,7 @@ void D3D12CommandProcessor::UpdateFixedFunctionState( } void D3D12CommandProcessor::UpdateSystemConstantValues( - Endian index_endian, + Endian index_endian, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { auto& regs = *register_file_; @@ -1629,7 +1629,71 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t rb_surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; uint32_t rb_colorcontrol = regs[XE_GPU_REG_RB_COLORCONTROL].u32; uint32_t rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].u32; - uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + + // Get the color info register values for each render target, and also put + // some safety measures for the ROV path - disable fully aliased render + // targets. Also, for ROV, exclude components that don't exist in the format + // from the write mask. + uint32_t color_infos[4], rov_color_format_rt_flags[4]; + for (uint32_t i = 0; i < 4; ++i) { + uint32_t color_info; + switch (i) { + case 1: + color_info = regs[XE_GPU_REG_RB_COLOR1_INFO].u32; + break; + case 2: + color_info = regs[XE_GPU_REG_RB_COLOR2_INFO].u32; + break; + case 3: + color_info = regs[XE_GPU_REG_RB_COLOR3_INFO].u32; + break; + default: + color_info = regs[XE_GPU_REG_RB_COLOR_INFO].u32; + } + color_infos[i] = color_info; + + if (IsROVUsedForEDRAM()) { + ColorRenderTargetFormat color_format = + RenderTargetCache::GetBaseColorFormat( + ColorRenderTargetFormat((color_info >> 16) & 0xF)); + uint32_t rt_flags = + DxbcShaderTranslator::GetColorFormatRTFlags(color_format); + rov_color_format_rt_flags[i] = rt_flags; + + // Exclude unused components from the write mask. + color_mask &= + ~(((rt_flags >> DxbcShaderTranslator::kRTFlag_FormatUnusedR_Shift) & + 0xF) + << (i * 4)); + + // Disable the render target if it has the same EDRAM base as another one + // (with a smaller index - assume it's more important). + if (color_mask & (0xF << (i * 4))) { + uint32_t edram_base = color_info & 0xFFF; + for (uint32_t j = 0; j < i; ++j) { + if ((color_mask & (0xF << (j * 4))) && + edram_base == (color_infos[j] & 0xFFF)) { + color_mask &= ~(uint32_t(0xF << (i * 4))); + break; + } + } + } + } + } + + // Disable depth and stencil if it aliases a color render target (for + // instance, during the XBLA logo in Banjo-Kazooie, though depth writing is + // already disabled there). + if (IsROVUsedForEDRAM() && (rb_depthcontrol & (0x1 | 0x2))) { + uint32_t edram_base_depth = rb_depth_info & 0xFFF; + for (uint32_t i = 0; i < 4; ++i) { + if ((color_mask & (0xF << (i * 4))) && + edram_base_depth == (color_infos[i] & 0xFFF)) { + rb_depthcontrol &= ~(uint32_t(0x1 | 0x2)); + break; + } + } + } bool dirty = false; @@ -1674,32 +1738,30 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t(ColorRenderTargetFormat::k_8_8_8_8_GAMMA)) { flags |= DxbcShaderTranslator::kSysFlag_Color3Gamma; } - if (IsROVUsedForEDRAM()) { - if (rb_depthcontrol & (0x1 | 0x2)) { - flags |= DxbcShaderTranslator::kSysFlag_DepthStencil; - if (DepthRenderTargetFormat((rb_depth_info >> 16) & 0x1) == - DepthRenderTargetFormat::kD24FS8) { - flags |= DxbcShaderTranslator::kSysFlag_DepthFloat24; + if (IsROVUsedForEDRAM() && (rb_depthcontrol & (0x1 | 0x2))) { + flags |= DxbcShaderTranslator::kSysFlag_DepthStencil; + if (DepthRenderTargetFormat((rb_depth_info >> 16) & 0x1) == + DepthRenderTargetFormat::kD24FS8) { + flags |= DxbcShaderTranslator::kSysFlag_DepthFloat24; + } + if (rb_depthcontrol & 0x2) { + flags |= ((rb_depthcontrol >> 4) & 0x7) + << DxbcShaderTranslator::kSysFlag_DepthPassIfLess_Shift; + if (rb_depthcontrol & 0x4) { + flags |= DxbcShaderTranslator::kSysFlag_DepthWriteMask | + DxbcShaderTranslator::kSysFlag_DepthStencilWrite; } - if (rb_depthcontrol & 0x2) { - flags |= ((rb_depthcontrol >> 4) & 0x7) - << DxbcShaderTranslator::kSysFlag_DepthPassIfLess_Shift; - if (rb_depthcontrol & 0x4) { - flags |= DxbcShaderTranslator::kSysFlag_DepthWriteMask | - DxbcShaderTranslator::kSysFlag_DepthStencilWrite; - } - } else { - // In case stencil is used without depth testing - always pass, and - // don't modify the stored depth. - flags |= DxbcShaderTranslator::kSysFlag_DepthPassIfLess | - DxbcShaderTranslator::kSysFlag_DepthPassIfEqual | - DxbcShaderTranslator::kSysFlag_DepthPassIfGreater; - } - if (rb_depthcontrol & 0x1) { - flags |= DxbcShaderTranslator::kSysFlag_StencilTest; - if (rb_stencilrefmask & (0xFF << 16)) { - flags |= DxbcShaderTranslator::kSysFlag_DepthStencilWrite; - } + } else { + // In case stencil is used without depth testing - always pass, and + // don't modify the stored depth. + flags |= DxbcShaderTranslator::kSysFlag_DepthPassIfLess | + DxbcShaderTranslator::kSysFlag_DepthPassIfEqual | + DxbcShaderTranslator::kSysFlag_DepthPassIfGreater; + } + if (rb_depthcontrol & 0x1) { + flags |= DxbcShaderTranslator::kSysFlag_StencilTest; + if (rb_stencilrefmask & (0xFF << 16)) { + flags |= DxbcShaderTranslator::kSysFlag_DepthStencilWrite; } } } @@ -1860,25 +1922,32 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_test != alpha_test; system_constants_.alpha_test = alpha_test; - // Color exponent bias and output index mapping or ROV writing. + // EDRAM pitch for ROV writing. + if (IsROVUsedForEDRAM()) { + uint32_t edram_pitch_tiles = ((std::min(rb_surface_info & 0x3FFFu, 2560u) * + (msaa_samples >= MsaaSamples::k4X ? 2 : 1)) + + 79) / + 80; + dirty |= system_constants_.edram_pitch_tiles != edram_pitch_tiles; + system_constants_.edram_pitch_tiles = edram_pitch_tiles; + } + + // Color exponent bias and output index mapping or ROV render target writing. bool colorcontrol_blend_enable = (rb_colorcontrol & 0x20) == 0; for (uint32_t i = 0; i < 4; ++i) { - uint32_t color_info, blend_control; + uint32_t color_info = color_infos[i]; + uint32_t blend_control; switch (i) { case 1: - color_info = regs[XE_GPU_REG_RB_COLOR1_INFO].u32; blend_control = regs[XE_GPU_REG_RB_BLENDCONTROL_1].u32; break; case 2: - color_info = regs[XE_GPU_REG_RB_COLOR2_INFO].u32; blend_control = regs[XE_GPU_REG_RB_BLENDCONTROL_2].u32; break; case 3: - color_info = regs[XE_GPU_REG_RB_COLOR3_INFO].u32; blend_control = regs[XE_GPU_REG_RB_BLENDCONTROL_3].u32; break; default: - color_info = regs[XE_GPU_REG_RB_COLOR_INFO].u32; blend_control = regs[XE_GPU_REG_RB_BLENDCONTROL_0].u32; } // Exponent bias is in bits 20:25 of RB_COLOR_INFO. @@ -1907,19 +1976,10 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t edram_base_dwords = (color_info & 0xFFF) * 1280; dirty |= system_constants_.edram_base_dwords[i] != edram_base_dwords; system_constants_.edram_base_dwords[i] = edram_base_dwords; - uint32_t edram_pitch_tiles = - ((std::min(rb_surface_info & 0x3FFFu, 2560u) * - (msaa_samples >= MsaaSamples::k4X ? 2 : 1)) + - 79) / - 80; - dirty |= system_constants_.edram_pitch_tiles != edram_pitch_tiles; - system_constants_.edram_pitch_tiles = edram_pitch_tiles; - uint32_t rt_flags = - DxbcShaderTranslator::GetColorFormatRTFlags(color_format); - // Exclude unused components from the write mask. - uint32_t rt_mask = - (rb_color_mask >> (i * 4)) & 0xF & - ~(rt_flags >> DxbcShaderTranslator::kRTFlag_FormatUnusedR_Shift); + uint32_t rt_flags = rov_color_format_rt_flags[i]; + // Unused components already excluded from the write mask when color infos + // were obtained, and fully aliased render targets were already skipped. + uint32_t rt_mask = (color_mask >> (i * 4)) & 0xF; if (rt_mask != 0) { rt_flags |= rt_mask << DxbcShaderTranslator::kRTFlag_WriteR_Shift; uint32_t blend_x, blend_y; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 6406d8c3d..d8aba2380 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -197,7 +197,7 @@ class D3D12CommandProcessor : public CommandProcessor { void UpdateFixedFunctionState(ID3D12GraphicsCommandList* command_list); void UpdateSystemConstantValues( - Endian index_endian, + Endian index_endian, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]); bool UpdateBindings(ID3D12GraphicsCommandList* command_list, const D3D12Shader* vertex_shader, diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index b00b82793..70eaa7f53 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -4111,6 +4111,37 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.int_instruction_count; + // Prevent going out of EDRAM bounds. + uint32_t depth_stencil_bound_check_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(depth_stencil_bound_check_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(1280 * 2048); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_stencil_bound_check_temp); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + // Release depth_stencil_bound_check_temp. + PopSystemTemp(); + // Enter the depth/stencil test if needed. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( @@ -4805,13 +4836,14 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // Discard the pixel if depth test failed and no stencil testing. shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DISCARD) | + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_RETC) | ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(D3D10_SB_INSTRUCTION_TEST_ZERO) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); shader_code_.push_back(depth_stencil_test_result_temp); ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; // Stencil test done. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | @@ -4894,13 +4926,14 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // writing because stencil may be modified even if the depth/stencil test // fails. shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DISCARD) | + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_RETC) | ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(D3D10_SB_INSTRUCTION_TEST_ZERO) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); shader_code_.push_back(depth_stencil_test_result_temp); ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; // Release depth_stencil_test_result_temp. PopSystemTemp(); @@ -4920,8 +4953,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { if (color_targets_written) { system_constants_used_ |= 1ull << kSysConst_EDRAMRTFlags_Index; - // Mask disabled color writes. uint32_t rt_write_masks_temp = PushSystemTemp(); + + // Mask disabled color writes. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); shader_code_.push_back( @@ -4953,6 +4987,38 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { shader_code_.push_back(rt_write_masks_temp); ++stat_.instruction_count; ++stat_.uint_instruction_count; + + // Prevent going out of EDRAM bounds. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(rt_write_masks_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(edram_coord_low_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(1280 * 2048); + shader_code_.push_back(1280 * 2048); + shader_code_.push_back(1280 * 2048); + shader_code_.push_back(1280 * 2048); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(system_temp_color_written_); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(system_temp_color_written_); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(rt_write_masks_temp); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + // Release rt_write_masks_temp. PopSystemTemp();