diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 4214a0084..e6a530d58 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1793,7 +1793,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_test != alpha_test; system_constants_.alpha_test = alpha_test; - // Color exponent bias and output index mapping. + // Color exponent bias and output index mapping or ROV writing. + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; for (uint32_t i = 0; i < 4; ++i) { uint32_t color_info; switch (i) { @@ -1841,9 +1842,59 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( 80; dirty |= system_constants_.edram_pitch_tiles != edram_pitch_tiles; system_constants_.edram_pitch_tiles = edram_pitch_tiles; + static const uint32_t kRTFormatFlags[16] = { + // k_8_8_8_8 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_8_8_8_8_GAMMA + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_2_10_10_10 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_2_10_10_10_FLOAT + DxbcShaderTranslator::kRTFlag_FormatFloat10, + // k_16_16 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_16_16_16_16 + DxbcShaderTranslator::kRTFlag_Format64bpp | + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_16_16_FLOAT + DxbcShaderTranslator::kRTFlag_FormatFloat16, + // k_16_16_16_16_FLOAT + DxbcShaderTranslator::kRTFlag_Format64bpp | + DxbcShaderTranslator::kRTFlag_FormatFloat16, + // Unused + 0, + // Unused + 0, + // k_2_10_10_10_AS_16_16_16_16 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // Unused. + 0, + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + DxbcShaderTranslator::kRTFlag_FormatFloat10, + // Unused. + 0, + // k_32_FLOAT + 0, + // k_32_32_FLOAT + DxbcShaderTranslator::kRTFlag_Format64bpp, + }; + static const uint32_t kRTFormatAllComponentsMask[16] = { + 0b1111, 0b1111, 0b1111, 0b1111, 0b0011, 0b1111, 0b0011, 0b1111, + 0b0000, 0b0000, 0b1111, 0b0000, 0b1111, 0b0000, 0b0001, 0b0011, + }; + uint32_t rt_mask_all = kRTFormatAllComponentsMask[uint32_t(color_format)]; + uint32_t rt_mask = (rb_color_mask >> (i * 4)) & rt_mask_all; + uint32_t rt_flags = kRTFormatFlags[uint32_t(color_format)]; + if (rt_mask != 0) { + rt_flags |= DxbcShaderTranslator::kRTFlag_Used; + if (rt_mask != rt_mask_all) { + rt_flags |= DxbcShaderTranslator::kRTFlag_LoadingNeeded; + } + } + dirty |= system_constants_.edram_rt_flags[i] != rt_flags; + system_constants_.edram_rt_flags[i] = rt_flags; if (system_constants_color_formats_[i] != color_format) { dirty = true; - uint32_t rt_flags = 0; // Initialize min/max to Infinity. uint32_t color_min = 0xFF800000u, alpha_min = 0xFF800000u; uint32_t color_max = 0x7F800000u, alpha_max = 0x7F800000u; @@ -1851,7 +1902,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( switch (color_format) { case ColorRenderTargetFormat::k_8_8_8_8: case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 8; system_constants_.edram_rt_pack_width_low[i][1] = 8; system_constants_.edram_rt_pack_width_low[i][2] = 8; @@ -1866,7 +1916,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_2_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 10; system_constants_.edram_rt_pack_width_low[i][1] = 10; system_constants_.edram_rt_pack_width_low[i][2] = 10; @@ -1883,7 +1932,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_2_10_10_10_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFloat10; system_constants_.edram_rt_pack_width_low[i][0] = 10; system_constants_.edram_rt_pack_width_low[i][1] = 10; system_constants_.edram_rt_pack_width_low[i][2] = 10; @@ -1900,7 +1948,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_16_16: case ColorRenderTargetFormat::k_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 16; system_constants_.edram_rt_pack_width_low[i][1] = 16; system_constants_.edram_rt_pack_width_low[i][2] = 0; @@ -1918,7 +1965,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_16_16_FLOAT: case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFloat16; system_constants_.edram_rt_pack_width_low[i][0] = 16; system_constants_.edram_rt_pack_width_low[i][1] = 16; system_constants_.edram_rt_pack_width_low[i][2] = 0; @@ -1945,7 +1991,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( assert_always(); break; } - system_constants_.edram_rt_flags[i] = rt_flags; uint32_t rt_pair_index = i >> 1; uint32_t rt_pair_comp = (i & 1) << 1; system_constants_ diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index f71e09589..32b775ad9 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1355,14 +1355,80 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.int_instruction_count; // *************************************************************************** - // Test pixel writing. + // Write to color render targets. // *************************************************************************** - CompletePixelShader_WriteToROV_StoreColor(edram_coord_temp, 0, - system_temp_color_[0]); + system_constants_used_ |= 1ull << kSysConst_EDRAMRTFlags_Index; - // Release edram_coord_temp. - PopSystemTemp(); + // Get what render targets need to be written to. + uint32_t rt_used_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(rt_used_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Get what render targets need to be read (for write masks and blending). + uint32_t rt_loading_needed_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(rt_loading_needed_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + for (uint32_t i = 0; i < 4; ++i) { + // In case of overlap, the render targets with the lower index have higher + // priority since they usually have the most important value. + uint32_t rt_index = 3 - i; + + // Check if the render target needs to be written to. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | + ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( + D3D10_SB_INSTRUCTION_TEST_NONZERO) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, rt_index, 1)); + shader_code_.push_back(rt_used_temp); + ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; + + CompletePixelShader_WriteToROV_StoreColor(edram_coord_temp, rt_index, + system_temp_color_[rt_index]); + + // Close the check whether the RT is used. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; + } + + // Release rt_used_temp, rt_loading_needed_temp and edram_coord_temp. + PopSystemTemp(3); } void DxbcShaderTranslator::CompletePixelShader() { diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index edb3a52b7..3aea91d05 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -46,9 +46,14 @@ class DxbcShaderTranslator : public ShaderTranslator { }; enum : uint32_t { + // Whether the write mask is non-zero. kRTFlag_Used = 1, + // Whether the render target needs to be merged with another (if the write + // mask is not 1111, or 11 for 16_16, or 1 for 32_FLOAT, or blending is + // enabled and it's not no-op). + kRTFlag_LoadingNeeded = kRTFlag_Used << 1, // Whether the format is represented by 2 dwords. - kRTFlag_Format64bpp = kRTFlag_Used << 1, + kRTFlag_Format64bpp = kRTFlag_LoadingNeeded << 1, // Whether the format is fixed-point and needs to be converted to integer // (k_8_8_8_8, k_2_10_10_10, k_16_16, k_16_16_16_16). kRTFlag_FormatFixed = kRTFlag_Format64bpp << 1,