From 10a3dc6d940e5ebdf323aea0e942216a3783b5e3 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 12 Oct 2018 20:06:40 +0300 Subject: [PATCH] [D3D12] ROV: 32-bit loading, except for 7e3 --- .../gpu/d3d12/d3d12_command_processor.cc | 75 +- .../shaders/dxbc/primitive_point_list_gs.cso | Bin 6700 -> 6828 bytes .../shaders/dxbc/primitive_point_list_gs.h | 900 +++++++++--------- .../shaders/dxbc/primitive_point_list_gs.txt | 14 +- src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli | 14 +- src/xenia/gpu/dxbc_shader_translator.cc | 257 ++++- src/xenia/gpu/dxbc_shader_translator.h | 31 +- 7 files changed, 809 insertions(+), 482 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 4214a0084..a43429eda 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1793,7 +1793,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_test != alpha_test; system_constants_.alpha_test = alpha_test; - // Color exponent bias and output index mapping. + // Color exponent bias and output index mapping or ROV writing. + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; for (uint32_t i = 0; i < 4; ++i) { uint32_t color_info; switch (i) { @@ -1841,9 +1842,60 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( 80; dirty |= system_constants_.edram_pitch_tiles != edram_pitch_tiles; system_constants_.edram_pitch_tiles = edram_pitch_tiles; + static const uint32_t kRTFormatFlags[16] = { + // k_8_8_8_8 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_8_8_8_8_GAMMA + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_2_10_10_10 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_2_10_10_10_FLOAT + DxbcShaderTranslator::kRTFlag_FormatFloat10, + // k_16_16 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_16_16_16_16 + DxbcShaderTranslator::kRTFlag_Format64bpp | + DxbcShaderTranslator::kRTFlag_FormatFixed, + // k_16_16_FLOAT + DxbcShaderTranslator::kRTFlag_FormatFloat16, + // k_16_16_16_16_FLOAT + DxbcShaderTranslator::kRTFlag_Format64bpp | + DxbcShaderTranslator::kRTFlag_FormatFloat16, + // Unused + 0, + // Unused + 0, + // k_2_10_10_10_AS_16_16_16_16 + DxbcShaderTranslator::kRTFlag_FormatFixed, + // Unused. + 0, + // k_2_10_10_10_FLOAT_AS_16_16_16_16 + DxbcShaderTranslator::kRTFlag_FormatFloat10, + // Unused. + 0, + // k_32_FLOAT + 0, + // k_32_32_FLOAT + DxbcShaderTranslator::kRTFlag_Format64bpp, + }; + static const uint32_t kRTFormatAllComponentsMask[16] = { + 0b1111, 0b1111, 0b1111, 0b1111, 0b0011, 0b1111, 0b0011, 0b1111, + 0b0000, 0b0000, 0b1111, 0b0000, 0b1111, 0b0000, 0b0001, 0b0011, + }; + uint32_t rt_mask_all = kRTFormatAllComponentsMask[uint32_t(color_format)]; + uint32_t rt_mask = (rb_color_mask >> (i * 4)) & rt_mask_all; + uint32_t rt_flags = kRTFormatFlags[uint32_t(color_format)]; + if (rt_mask != 0) { + rt_flags |= DxbcShaderTranslator::kRTFlag_Used; + if (rt_mask != rt_mask_all) { + rt_flags |= DxbcShaderTranslator::kRTFlag_LoadingNeeded; + } + } + dirty |= system_constants_.edram_rt_flags[i] != rt_flags; + system_constants_.edram_rt_flags[i] = rt_flags; if (system_constants_color_formats_[i] != color_format) { dirty = true; - uint32_t rt_flags = 0; + uint32_t color_mask = UINT32_MAX, alpha_mask = UINT32_MAX; // Initialize min/max to Infinity. uint32_t color_min = 0xFF800000u, alpha_min = 0xFF800000u; uint32_t color_max = 0x7F800000u, alpha_max = 0x7F800000u; @@ -1851,7 +1903,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( switch (color_format) { case ColorRenderTargetFormat::k_8_8_8_8: case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 8; system_constants_.edram_rt_pack_width_low[i][1] = 8; system_constants_.edram_rt_pack_width_low[i][2] = 8; @@ -1860,13 +1911,13 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.edram_rt_pack_offset_low[i][1] = 8; system_constants_.edram_rt_pack_offset_low[i][2] = 16; system_constants_.edram_rt_pack_offset_low[i][3] = 24; + color_mask = alpha_mask = 255; color_min = alpha_min = 0; color_max = alpha_max = 0x3F800000; color_store_scale = alpha_store_scale = 255.0f; break; case ColorRenderTargetFormat::k_2_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 10; system_constants_.edram_rt_pack_width_low[i][1] = 10; system_constants_.edram_rt_pack_width_low[i][2] = 10; @@ -1875,6 +1926,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.edram_rt_pack_offset_low[i][1] = 10; system_constants_.edram_rt_pack_offset_low[i][2] = 20; system_constants_.edram_rt_pack_offset_low[i][3] = 30; + color_mask = 1023; + alpha_mask = 3; color_min = alpha_min = 0; color_max = alpha_max = 0x3F800000; // 1023.0. @@ -1883,7 +1936,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_2_10_10_10_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFloat10; system_constants_.edram_rt_pack_width_low[i][0] = 10; system_constants_.edram_rt_pack_width_low[i][1] = 10; system_constants_.edram_rt_pack_width_low[i][2] = 10; @@ -1892,6 +1944,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.edram_rt_pack_offset_low[i][1] = 10; system_constants_.edram_rt_pack_offset_low[i][2] = 20; system_constants_.edram_rt_pack_offset_low[i][3] = 30; + color_mask = 1023; + alpha_mask = 3; color_min = alpha_min = 0; // 31.875. color_max = 0x41FF0000; @@ -1900,7 +1954,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_16_16: case ColorRenderTargetFormat::k_16_16_16_16: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFixed; system_constants_.edram_rt_pack_width_low[i][0] = 16; system_constants_.edram_rt_pack_width_low[i][1] = 16; system_constants_.edram_rt_pack_width_low[i][2] = 0; @@ -1910,6 +1963,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.edram_rt_pack_offset_low[i][2] = 0; system_constants_.edram_rt_pack_offset_low[i][3] = 0; // TODO(Triang3l): 64bpp variant. + // Color and alpha mask UINT32_MAX because the format is signed. // -32.0. color_min = alpha_min = 0xC2000000u; // 32.0. @@ -1918,7 +1972,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( break; case ColorRenderTargetFormat::k_16_16_FLOAT: case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: - rt_flags |= DxbcShaderTranslator::kRTFlag_FormatFloat16; system_constants_.edram_rt_pack_width_low[i][0] = 16; system_constants_.edram_rt_pack_width_low[i][1] = 16; system_constants_.edram_rt_pack_width_low[i][2] = 0; @@ -1928,6 +1981,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.edram_rt_pack_offset_low[i][2] = 0; system_constants_.edram_rt_pack_offset_low[i][3] = 0; // TODO(Triang3l): 64bpp variant. + color_mask = alpha_mask = 0xFFFF; break; case ColorRenderTargetFormat::k_32_FLOAT: case ColorRenderTargetFormat::k_32_32_FLOAT: @@ -1945,9 +1999,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( assert_always(); break; } - system_constants_.edram_rt_flags[i] = rt_flags; uint32_t rt_pair_index = i >> 1; uint32_t rt_pair_comp = (i & 1) << 1; + system_constants_ + .edram_load_mask_rt01_rt23[rt_pair_index][rt_pair_comp] = + color_mask; + system_constants_ + .edram_load_mask_rt01_rt23[rt_pair_index][rt_pair_comp + 1] = + alpha_mask; system_constants_ .edram_store_min_rt01_rt23[rt_pair_index][rt_pair_comp] = color_min; system_constants_ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso index 1ea05e426ba3b81203e1a3ea7d0c9732c8e0d0c3..85476b9965de25803a2c187521b3441357b1edf7 100644 GIT binary patch delta 926 zcmZY8OGpAi7zgkfNq4n~U5OG2ilQz>Y+)b;CE61TDk40!+5@7Jqz6TZuudI>l<6AO zB`+R=u0>sghz>yy=s^fiD4Y_g}Hd+ zixDN+MAGr+Dz(Crg#v5>%Egr)AnsVO+&&6afcyfFIZOPd(jJ9YW0v3aB%^oGT4-VI zJG9wQ`<lP`9A-P>-P7(1DyH<#u4ukz!R7P(L)^FFzUk8886k zkH_O7)4~rlj9FeCGTWk6D*B8RC~tQZ8WVIHdMM})^hD4j=%t{S&|5)Yp>KlzLDPcP zSV8zS6eJ!FnfKcbRRquj28>M_&@wbouh{av-! z&Dtt702N)>FltS8s6=y;8qkQOGtjlRBDHsjLgT&aPC%2;PQT)2JbSj*zkm!!iwzFC*HE@289uqe+v3}?hbqS*ur)aT`J=MR k5`0LFvI5^w=WXu_KT~g5hKIC8TSYA&YSXOzrhmBp0gQ--> 1; + uint32_t rt_pair_swizzle = rt_index & 1 ? 0b11101010 : 0b01000000; + + // Extract the needed flags. + uint32_t flags_temp = PushSystemTemp(); + system_constants_used_ |= 1ull << kSysConst_EDRAMRTFlags_Index; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(flags_temp); + shader_code_.push_back(EncodeVectorReplicatedOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, rt_index, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kRTFlag_Format64bpp); + shader_code_.push_back(kRTFlag_FormatFixed); + shader_code_.push_back(kRTFlag_FormatFloat10); + shader_code_.push_back(kRTFlag_FormatFloat16); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Load the low 32 bits. + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_LD_UAV_TYPED) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(edram_dword_offset_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW, kSwizzleXYZW, 2)); + shader_code_.push_back(0); + shader_code_.push_back(0); + ++stat_.instruction_count; + ++stat_.texture_load_instructions; + + // Unpack the low 32 bits, as signed because of k_16_16 and k_16_16_16_16 + // (will be masked later if needed). + system_constants_used_ |= ((1ull << kSysConst_EDRAMRTPackWidthLowRT0_Index) | + (1ull << kSysConst_EDRAMRTPackOffsetLowRT0_Index)) + << rt_index; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_IBFE) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(13)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTPackWidthLowRT0_Vec + rt_index); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTPackOffsetLowRT0_Vec + rt_index); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXXXX, 1)); + shader_code_.push_back(target_temp); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // Mask the components to differentiate between signed and unsigned. + system_constants_used_ |= (1ull << kSysConst_EDRAMLoadMaskRT01_Index) + << rt_pair_index; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, rt_pair_swizzle, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMLoadMaskRT01_Vec + rt_pair_index); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // TODO(Triang3l): 64bpp loading and unpacking. + + // Convert from fixed-point. + uint32_t fixed_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ITOF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(fixed_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(target_temp); + ++stat_.instruction_count; + ++stat_.conversion_instruction_count; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(flags_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(fixed_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(target_temp); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + // Release fixed_temp. + PopSystemTemp(); + + // TODO(Triang3l): Convert from 7e3. + + // Convert from 16-bit float. + uint32_t f16_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_F16TOF32) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(f16_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(target_temp); + ++stat_.instruction_count; + ++stat_.conversion_instruction_count; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(target_temp); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(flags_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(f16_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(target_temp); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + // Release f16_temp. + PopSystemTemp(); + + // Release flags_temp. + PopSystemTemp(); +} + void DxbcShaderTranslator::CompletePixelShader_WriteToROV_StoreColor( uint32_t edram_dword_offset_temp, uint32_t rt_index, uint32_t source_and_scratch_temp) { @@ -1355,14 +1515,80 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.int_instruction_count; // *************************************************************************** - // Test pixel writing. + // Write to color render targets. // *************************************************************************** - CompletePixelShader_WriteToROV_StoreColor(edram_coord_temp, 0, - system_temp_color_[0]); + system_constants_used_ |= 1ull << kSysConst_EDRAMRTFlags_Index; - // Release edram_coord_temp. - PopSystemTemp(); + // Get what render targets need to be written to. + uint32_t rt_used_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(rt_used_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + shader_code_.push_back(kRTFlag_Used); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Get what render targets need to be read (for write masks and blending). + uint32_t rt_loading_needed_temp = PushSystemTemp(); + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(rt_loading_needed_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSwizzleXYZW, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMRTFlags_Vec); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + shader_code_.push_back(kRTFlag_LoadingNeeded); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + for (uint32_t i = 0; i < 4; ++i) { + // In case of overlap, the render targets with the lower index have higher + // priority since they usually have the most important value. + uint32_t rt_index = 3 - i; + + // Check if the render target needs to be written to. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | + ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( + D3D10_SB_INSTRUCTION_TEST_NONZERO) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, rt_index, 1)); + shader_code_.push_back(rt_used_temp); + ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; + + CompletePixelShader_WriteToROV_StoreColor(edram_coord_temp, rt_index, + system_temp_color_[rt_index]); + + // Close the check whether the RT is used. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; + } + + // Release rt_used_temp, rt_loading_needed_temp and edram_coord_temp. + PopSystemTemp(3); } void DxbcShaderTranslator::CompletePixelShader() { @@ -1503,8 +1729,9 @@ void DxbcShaderTranslator::CompletePixelShader() { ++stat_.float_instruction_count; } - // Convert to gamma space (likely needs to be done after the exponent bias - // since gamma is a property of the storage format). + // Convert to gamma space. + // TODO(Triang3l): Do it after blending for ROV. + // https://steamcdn-a.akamaihd.net/apps/valve/2008/GDC2008_PostProcessingInTheOrangeBox.pdf // Get which render targets need the conversion. uint32_t gamma_toggle_temp = PushSystemTemp(); uint32_t gamma_pieces_temp = PushSystemTemp(); @@ -7898,17 +8125,21 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator:: // vec4 17 {"xe_edram_rt_pack_offset_low_rt3", RdefTypeIndex::kUint4, 272, 16}, // vec4 18 - {"xe_edram_store_min_rt01", RdefTypeIndex::kFloat4, 288, 16}, + {"xe_edram_rt_unpack_mask_low_rt01", RdefTypeIndex::kUint4, 288, 16}, // vec4 19 - {"xe_edram_store_min_rt23", RdefTypeIndex::kFloat4, 304, 16}, + {"xe_edram_rt_unpack_mask_low_rt23", RdefTypeIndex::kUint4, 304, 16}, // vec4 20 - {"xe_edram_store_max_rt01", RdefTypeIndex::kFloat4, 320, 16}, + {"xe_edram_store_min_rt01", RdefTypeIndex::kFloat4, 320, 16}, // vec4 21 - {"xe_edram_store_max_rt23", RdefTypeIndex::kFloat4, 336, 16}, + {"xe_edram_store_min_rt23", RdefTypeIndex::kFloat4, 336, 16}, // vec4 22 - {"xe_edram_store_scale_rt01", RdefTypeIndex::kFloat4, 352, 16}, + {"xe_edram_store_max_rt01", RdefTypeIndex::kFloat4, 352, 16}, // vec4 23 - {"xe_edram_store_scale_rt23", RdefTypeIndex::kFloat4, 368, 16}, + {"xe_edram_store_max_rt23", RdefTypeIndex::kFloat4, 368, 16}, + // vec4 24 + {"xe_edram_store_scale_rt01", RdefTypeIndex::kFloat4, 384, 16}, + // vec4 25 + {"xe_edram_store_scale_rt23", RdefTypeIndex::kFloat4, 400, 16}, }; void DxbcShaderTranslator::WriteResourceDefinitions() { diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index edb3a52b7..c9b787981 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -46,9 +46,14 @@ class DxbcShaderTranslator : public ShaderTranslator { }; enum : uint32_t { + // Whether the write mask is non-zero. kRTFlag_Used = 1, + // Whether the render target needs to be merged with another (if the write + // mask is not 1111, or 11 for 16_16, or 1 for 32_FLOAT, or blending is + // enabled and it's not no-op). + kRTFlag_LoadingNeeded = kRTFlag_Used << 1, // Whether the format is represented by 2 dwords. - kRTFlag_Format64bpp = kRTFlag_Used << 1, + kRTFlag_Format64bpp = kRTFlag_LoadingNeeded << 1, // Whether the format is fixed-point and needs to be converted to integer // (k_8_8_8_8, k_2_10_10_10, k_16_16, k_16_16_16_16). kRTFlag_FormatFixed = kRTFlag_Format64bpp << 1, @@ -118,18 +123,25 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t edram_rt_pack_offset_low[4][4]; // vec4 18:19 + // Format info - mask of color and alpha after unpacking, but before float + // conversion. Primarily to differentiate between signed and unsigned + // formats because ibfe is used for both since k_16_16 and k_16_16_16_16 are + // signed. + uint32_t edram_load_mask_rt01_rt23[2][4]; + + // vec4 20:21 // Format info - minimum color and alpha values (as float, before // conversion) writable to the each render target. Integer so it's easier to // write infinity. uint32_t edram_store_min_rt01_rt23[2][4]; - // vec4 20:21 + // vec4 22:23 // Format info - maximum color and alpha values (as float, before // conversion) writable to the each render target. Integer so it's easier to // write infinity. uint32_t edram_store_max_rt01_rt23[2][4]; - // vec4 22:23 + // vec4 24:25 // Format info - scale to apply to the color and the alpha of each render // target before packing. float edram_store_scale_rt01_rt23[2][4]; @@ -302,9 +314,15 @@ class DxbcShaderTranslator : public ShaderTranslator { kSysConst_EDRAMRTPackOffsetLowRT3_Vec = kSysConst_EDRAMRTPackOffsetLowRT2_Vec + 1, - kSysConst_EDRAMStoreMinRT01_Index = + kSysConst_EDRAMLoadMaskRT01_Index = kSysConst_EDRAMRTPackOffsetLowRT3_Index + 1, - kSysConst_EDRAMStoreMinRT01_Vec = kSysConst_EDRAMRTPackOffsetLowRT3_Vec + 1, + kSysConst_EDRAMLoadMaskRT01_Vec = kSysConst_EDRAMRTPackOffsetLowRT3_Vec + 1, + + kSysConst_EDRAMLoadMaskRT23_Index = kSysConst_EDRAMLoadMaskRT01_Index + 1, + kSysConst_EDRAMLoadMaskRT23_Vec = kSysConst_EDRAMLoadMaskRT01_Vec + 1, + + kSysConst_EDRAMStoreMinRT01_Index = kSysConst_EDRAMLoadMaskRT23_Index + 1, + kSysConst_EDRAMStoreMinRT01_Vec = kSysConst_EDRAMLoadMaskRT23_Vec + 1, kSysConst_EDRAMStoreMinRT23_Index = kSysConst_EDRAMStoreMinRT01_Index + 1, kSysConst_EDRAMStoreMinRT23_Vec = kSysConst_EDRAMStoreMinRT01_Vec + 1, @@ -421,6 +439,9 @@ class DxbcShaderTranslator : public ShaderTranslator { // Writing the epilogue. void CompleteVertexShader(); void CompletePixelShader_WriteToRTVs(); + void CompletePixelShader_WriteToROV_LoadColor( + uint32_t edram_dword_offset_temp, uint32_t rt_index, + uint32_t target_index); void CompletePixelShader_WriteToROV_StoreColor( uint32_t edram_dword_offset_temp, uint32_t rt_index, uint32_t source_and_scratch_temp);