From e455467c32ba668acaf9c14fb7752117197ae6d0 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 15 Oct 2018 22:13:06 +0300 Subject: [PATCH] [D3D12] ROV: Non-working depth testing code --- .../gpu/d3d12/d3d12_command_processor.cc | 26 +- .../shaders/dxbc/primitive_point_list_gs.cso | Bin 6844 -> 6908 bytes .../shaders/dxbc/primitive_point_list_gs.h | 843 ++++++++--------- .../shaders/dxbc/primitive_point_list_gs.txt | 1 + src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli | 1 + src/xenia/gpu/dxbc_shader_translator.cc | 877 ++++++++++++++++-- src/xenia/gpu/dxbc_shader_translator.h | 84 +- 7 files changed, 1316 insertions(+), 516 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index df917a532..410ec2ca3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1589,6 +1589,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t vgt_indx_offset = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; uint32_t pa_cl_vte_cntl = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; + uint32_t rb_depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32; uint32_t pa_cl_clip_cntl = regs[XE_GPU_REG_PA_CL_CLIP_CNTL].u32; uint32_t pa_su_vtx_cntl = regs[XE_GPU_REG_PA_SU_VTX_CNTL].u32; uint32_t pa_su_point_size = regs[XE_GPU_REG_PA_SU_POINT_SIZE].u32; @@ -1642,6 +1643,25 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t(ColorRenderTargetFormat::k_8_8_8_8_GAMMA)) { flags |= DxbcShaderTranslator::kSysFlag_Color3Gamma; } + if (render_target_cache_->IsROVUsedForEDRAM()) { + uint32_t rb_depthcontrol = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; + if (rb_depthcontrol & 0x2) { + if (DepthRenderTargetFormat(rb_depth_info) == + DepthRenderTargetFormat::kD24FS8) { + flags |= DxbcShaderTranslator::kSysFlag_DepthFloat24; + } + // Read depth/stencil if depth comparison function is not "always". + uint32_t depth_comparison = (rb_depthcontrol >> 4) & 0x7; + flags |= depth_comparison + << DxbcShaderTranslator::kSysFlag_DepthPassIfLess_Shift; + if (depth_comparison != 0x7) { + flags |= DxbcShaderTranslator::kSysFlag_DepthStencilRead; + } + if (rb_depthcontrol & 0x4) { + flags |= DxbcShaderTranslator::kSysFlag_DepthStencilWrite; + } + } + } dirty |= system_constants_.flags != flags; system_constants_.flags = flags; @@ -1905,8 +1925,12 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } } - // Blend constant for ROV blending. + // Depth testing and blend constant for ROV blending. if (render_target_cache_->IsROVUsedForEDRAM()) { + uint32_t depth_base_dwords = + (regs[XE_GPU_REG_RB_DEPTH_INFO].u32 & 0xFFF) * 1280; + dirty |= system_constants_.edram_depth_base_dwords != depth_base_dwords; + system_constants_.edram_depth_base_dwords = depth_base_dwords; dirty |= system_constants_.edram_blend_constant[0] != regs[XE_GPU_REG_RB_BLEND_RED].f32; system_constants_.edram_blend_constant[0] = diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso index ef07065585e3c60231c15b9eb986179cfe9dc495..2a0fef903cedabc6208af783a26b4435e9020436 100644 GIT binary patch delta 833 zcmZY7%PT}t90%~<8ScZgnY%EB#?+9FG|807qlxRW93&PpgFB^!@g}8N_ycA-TZ*#Q zNaVFpva(@kfk@dY8!1`%o;x#T?mcz7-}5=|+qn$<27>xXXV>*Xwf^J1*l*bGIHAoF znGw-%C6R=0GwMSzQM#1quAC?eAJDsdwo}0>HlG@rtgFR{Bx+hr%NDS=Cs1^Va*8#zM_@G?xiv}QrCGGz}&G zTIiFukeniEJ4jx<8Yj9$$=vJ^>bI(N0}Yw;pOC_4`a+7DsSGbPZl*eG>_M~? zwnX6~R8tNSG=_~WND;L`brm5XDT03%(qli>&1W4my!r-57SGLcuKZuQi5b{11l4 z+McV}EYm;^ZMXuII&lk#^V%y@z+NE#fDeW5c>{D}(cJTeV-4s6wpPj}k&v;LN~f5s zTIC?r3*{~7Fk`Bwp?-}XL4z8-f-W=^x5f^4&}gSx%Fr0J*)4x8@;fjAPKh>SGSPDT2WILDX*prq@tQ0knYt~MS51#FVd@;`nm}D;z{-(5&TbWXUSn` zZ>I`zsEfTB+`=xBmnX|UQn0m>eHt9*Xsht<> 23 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(23); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // z = 113 - (f32 >> 23) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(113); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1) | + ENCODE_D3D10_SB_OPERAND_EXTENDED(1)); + shader_code_.push_back( + ENCODE_D3D10_SB_EXTENDED_OPERAND_MODIFIER(D3D10_SB_OPERAND_MODIFIER_NEG)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // y = ((f32 & 0x7FFFFF) | 0x800000) >> (113 - (f32 >> 23)) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Check if the number is too small to be represented as normalized 20e4. + // z = f32 < 0x38800000 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(0x38800000); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Bias the exponent. + // f32 += 0xC8000000 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(0xC8000000u); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // Replace the number in f32 with a denormalized one if needed. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + + // Build the 20e4 number. + // y = f32 >> 3 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(3); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // y = (f32 >> 3) & 1 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(1); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // f24 = f32 + 3 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(3); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // f24 = f32 + 3 + ((f32 >> 3) & 1) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // f24 = (f32 + 3 + ((f32 >> 3) & 1)) >> 3 + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(3); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // f24 = ((f32 + 3 + ((f32 >> 3) & 1)) >> 3) & 0xFFFFFF + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(0xFFFFFF); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // *************************************************************************** + // 20e4 conversion ends here. + // *************************************************************************** + + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ELSE) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; + + // *************************************************************************** + // Unorm24 conversion begins here. + // *************************************************************************** + + // Multiply by float(0xFFFFFF). + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(0x4B7FFFFF); + ++stat_.instruction_count; + ++stat_.float_instruction_count; + + // Convert to fixed-point. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_FTOU) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.conversion_instruction_count; + + // *************************************************************************** + // Unorm24 conversion ends here. + // *************************************************************************** + + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; +} + void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { // Remap guest render target indices to host since because on the host, the // indices of the bound render targets are consecutive. This is done using 16 @@ -2477,7 +2802,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV_StoreColor( ++stat_.instruction_count; ++stat_.uint_instruction_count; - // Check if the numbers are too small to be represented as a normalized 7e3. + // Check if the numbers are too small to be represented as normalized 7e3. // t2 = f32 < 0x3E800000 shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); @@ -2884,9 +3209,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.int_instruction_count; - // Calculate the address in the EDRAM buffer. - - // 1) Multiply tile Y index by the pitch and add X tile index to it to + // Multiply tile Y index by the pitch and add X tile index to it to // edram_coord_low_temp.z. system_constants_used_ |= 1ull << kSysConst_EDRAMPitchTiles_Index; shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | @@ -2909,9 +3232,61 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.uint_instruction_count; - // TODO(Triang3l): For depth, swap 40-column groups into Y. + // Swap 40 sample columns within the tile for the depth buffer into + // system_temp_depth_.w - shaders uploading depth to the EDRAM by aliasing a + // color render target expect this. - // 2) Get dword offset within the tile to edram_coord_low_temp.x. + // 1) Check in which half of the tile the sample is. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(edram_coord_low_temp); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(40); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 2) Get the value to add to the tile-relative X sample index. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(40); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(uint32_t(-40)); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + + // 3) Actually swap the 40 sample columns. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(edram_coord_low_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // Calculate the address in the EDRAM buffer. + + // 1a) Get dword offset within the tile to edram_coord_low_temp.x. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( @@ -2929,7 +3304,25 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.uint_instruction_count; - // 3) Combine the tile offset and the offset within the tile to + // 1b) Do the same for depth/stencil to system_temp_depth_.w. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(edram_coord_low_temp); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(80); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 2a) Combine the tile offset and the offset within the tile to // edram_coord_low_temp.x. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); @@ -2948,6 +3341,24 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.uint_instruction_count; + // 2b) Do the same for depth/stencil to system_temp_depth_.w. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); + shader_code_.push_back(edram_coord_low_temp); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(1280); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + // Adjust the offsets for 64 bits per pixel. uint32_t edram_coord_high_temp = PushSystemTemp(); @@ -3000,7 +3411,6 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.int_instruction_count; // Add the EDRAM bases for each render target. - // TODO(Triang3l): Do this for depth to a separate register. system_constants_used_ |= 1ull << kSysConst_EDRAMBaseDwords_Index; shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); @@ -3018,6 +3428,25 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.int_instruction_count; + // Add the EDRAM base for depth. + system_constants_used_ |= 1ull << kSysConst_EDRAMDepthBaseDwords_Index; + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, + kSysConst_EDRAMDepthBaseDwords_Comp, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_EDRAMDepthBaseDwords_Vec); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + // Get the offsets of the upper 32 bits. shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IADD) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); @@ -3033,6 +3462,282 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { ++stat_.instruction_count; ++stat_.int_instruction_count; + // *************************************************************************** + // Do depth/stencil testing. This must be done before the color writing, so + // discard happens before the write, and also because in case the EDRAM base + // of the depth buffer is for some reason the same as of some color buffer, + // the color buffer should win - otherwise the Xbox Live Arcade splash screen + // is missing in Banjo-Kazooie. + // TODO(Triang3l): Do depth/stencil before the translated shader if possible. + // *************************************************************************** + + // Convert the depth to the target format - won't modify the W value. No need + // to do this in an if - if the value is not needed, the command processor can + // specify that the format is unorm24 - the conversion is much easier this way + // than for float24, only 2 instructions. + CompletePixelShader_DepthTo24Bit(); + + uint32_t depth_flags_temp = PushSystemTemp(); + system_constants_used_ |= 1ull << kSysConst_Flags_Index; + + // Extract 0 or 0xFFFFFFFF for whether depth/stencil needs to be read and for + // the comparison results when the test should pass to depth_flags_temp. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_IBFE) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(depth_flags_temp); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(1); + shader_code_.push_back(1); + shader_code_.push_back(1); + shader_code_.push_back(1); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(kSysFlag_DepthStencilRead_Shift); + shader_code_.push_back(kSysFlag_DepthPassIfLess_Shift); + shader_code_.push_back(kSysFlag_DepthPassIfEqual_Shift); + shader_code_.push_back(kSysFlag_DepthPassIfGreater_Shift); + shader_code_.push_back(EncodeVectorReplicatedOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_Flags_Vec); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // Check if operations involving the previous depth/stencil value need to be + // done. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | + ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( + D3D10_SB_INSTRUCTION_TEST_NONZERO) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_flags_temp); + ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; + + // Load the previous combined depth/stencil value into system_temp_depth_.y. + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_LD_UAV_TYPED) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW, kSwizzleXYZW, 2)); + shader_code_.push_back(0); + shader_code_.push_back(0); + ++stat_.instruction_count; + ++stat_.texture_load_instructions; + + // Separate the previous depth/stencil into depth and stencil to + // system_temp_depth_.yz. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_UBFE) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0110, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(0); + shader_code_.push_back(24); + shader_code_.push_back(8); + shader_code_.push_back(0); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(0); + shader_code_.push_back(8); + shader_code_.push_back(0); + shader_code_.push_back(0); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Do depth/stencil testing. + uint32_t depth_stencil_test_temp = PushSystemTemp(); + + // First, the depth test. + // New depth in system_temp_depth_.x, old depth in system_temp_depth_.y. + + // 1) Less/greater. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ULT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1010, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b01000000, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b00000100, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 2) Equal. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.int_instruction_count; + + // 3) Compare the results with the expected. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1110, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(depth_flags_temp); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 4) Start combining the results into system_temp_depth_.x - using .x + // specifically to keep the value because the stencil test also depends on the + // result of the depth test. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); + shader_code_.push_back(depth_stencil_test_temp); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 5) Finish combining the results into system_temp_depth_.w. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_stencil_test_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(depth_stencil_test_temp); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // 6) Discard the pixel if depth test failed. + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DISCARD) | + ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN(D3D10_SB_INSTRUCTION_TEST_ZERO) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_stencil_test_temp); + ++stat_.instruction_count; + + // TODO(Triang3l): Preserve the original depth if the depth write mask is + // false. + + // TODO(Triang3l): Do stencil testing. + + // Release depth_stencil_test_temp. + PopSystemTemp(); + + // Operations involving the previous value done. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; + + // Get the bit to check if need to write the new depth/stencil value. + // The write masks of depth specifically and stencil specifically are handled + // in the depth/stencil test code. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(depth_flags_temp); + shader_code_.push_back(EncodeVectorSelectOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3)); + shader_code_.push_back(cbuffer_index_system_constants_); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_Flags_Vec); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(kSysFlag_DepthStencilWrite); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Actually check if need to write the new depth/stencil value. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | + ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( + D3D10_SB_INSTRUCTION_TEST_NONZERO) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(depth_flags_temp); + ++stat_.instruction_count; + ++stat_.dynamic_flow_control_count; + + // Shift the depth into the higher bits. + // TODO(Triang3l): UMAD the new stencil there when stencil is added. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ISHL) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); + shader_code_.push_back(8); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // Write the new depth/stencil value. + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_STORE_UAV_TYPED) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); + shader_code_.push_back(EncodeVectorMaskedOperand( + D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW, 0b1111, 2)); + shader_code_.push_back(0); + shader_code_.push_back(0); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_depth_); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(system_temp_depth_); + ++stat_.instruction_count; + ++stat_.c_texture_store_instructions; + + // Done writing to the depth/stencil buffer. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ++stat_.instruction_count; + + // Release depth_flags_temp. + PopSystemTemp(); + // *************************************************************************** // Write to color render targets. // *************************************************************************** @@ -3462,6 +4167,10 @@ void DxbcShaderTranslator::CompleteShaderCode() { // Release system_temp_position_. PopSystemTemp(); } else if (is_pixel_shader()) { + if (edram_rov_used_) { + // Release system_temp_depth_. + PopSystemTemp(); + } // Release system_temp_color_. PopSystemTemp(4); } @@ -4214,11 +4923,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, break; case InstructionStorageTarget::kDepth: writes_depth_ = true; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4) | saturate_bit); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); + if (edram_rov_used_) { + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | + ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(system_temp_depth_); + } else { + shader_code_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | + ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4)); + shader_code_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); + } break; default: assert_unhandled_case(result.storage_target); @@ -9686,6 +10406,7 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator:: // vec4 5 {"xe_alpha_test_range", RdefTypeIndex::kFloat2, 80, 8}, {"xe_edram_pitch_tiles", RdefTypeIndex::kUint, 88, 4}, + {"xe_edram_depth_base_dwords", RdefTypeIndex::kUint, 92, 4}, // vec4 6 {"xe_color_exp_bias", RdefTypeIndex::kFloat4, 96, 16}, // vec4 7 @@ -10288,15 +11009,16 @@ void DxbcShaderTranslator::WriteInputSignature() { shader_object_.push_back(kPSInPointParametersRegister); shader_object_.push_back(0x7 | (0x3 << 8)); - // Position (only XY needed). Always used because ps_param_gen is handled - // dynamically and because this is needed for ROV storing. + // Position (only XY needed for ps_param_gen, but XYZ needed for ROV). + // Always used because ps_param_gen is handled dynamically and because this + // is needed for ROV storing. shader_object_.push_back(0); shader_object_.push_back(0); // D3D_NAME_POSITION. shader_object_.push_back(1); shader_object_.push_back(3); shader_object_.push_back(kPSInPositionRegister); - shader_object_.push_back(0xF | (0x3 << 8)); + shader_object_.push_back(0xF | ((edram_rov_used_ ? 0x7 : 0x3) << 8)); // Is front face. Always used because ps_param_gen is handled dynamically. shader_object_.push_back(0); @@ -10395,53 +11117,60 @@ void DxbcShaderTranslator::WriteOutputSignature() { new_offset += AppendString(shader_object_, "SV_Position"); } else { assert_true(is_pixel_shader()); - // Color render targets, optionally depth. - shader_object_.push_back(4 + (writes_depth_ ? 1 : 0)); - // Unknown. - shader_object_.push_back(8); + if (edram_rov_used_) { + // No outputs - only ROV read/write. + shader_object_.push_back(0); + // Unknown. + shader_object_.push_back(8); + } else { + // Color render targets, optionally depth. + shader_object_.push_back(4 + (writes_depth_ ? 1 : 0)); + // Unknown. + shader_object_.push_back(8); - // Color render targets. - for (uint32_t i = 0; i < 4; ++i) { - // Reserve space for the semantic name (SV_Target). - shader_object_.push_back(0); - shader_object_.push_back(i); - // D3D_NAME_UNDEFINED for some reason - this is correct. - shader_object_.push_back(0); - shader_object_.push_back(3); - // Register must match the render target index. - shader_object_.push_back(i); - // All are used because X360 RTs are dynamically remapped to D3D12 RTs to - // make the indices consecutive. - shader_object_.push_back(0xF); - } + // Color render targets. + for (uint32_t i = 0; i < 4; ++i) { + // Reserve space for the semantic name (SV_Target). + shader_object_.push_back(0); + shader_object_.push_back(i); + // D3D_NAME_UNDEFINED for some reason - this is correct. + shader_object_.push_back(0); + shader_object_.push_back(3); + // Register must match the render target index. + shader_object_.push_back(i); + // All are used because X360 RTs are dynamically remapped to D3D12 RTs + // to make the indices consecutive. + shader_object_.push_back(0xF); + } - // Depth. - if (writes_depth_) { - // Reserve space for the semantic name (SV_Depth). - shader_object_.push_back(0); - shader_object_.push_back(0); - shader_object_.push_back(0); - shader_object_.push_back(3); - shader_object_.push_back(0xFFFFFFFFu); - shader_object_.push_back(0x1 | (0xE << 8)); - } + // Depth. + if (writes_depth_) { + // Reserve space for the semantic name (SV_Depth). + shader_object_.push_back(0); + shader_object_.push_back(0); + shader_object_.push_back(0); + shader_object_.push_back(3); + shader_object_.push_back(0xFFFFFFFFu); + shader_object_.push_back(0x1 | (0xE << 8)); + } - // Write the semantic names. - new_offset = (uint32_t(shader_object_.size()) - chunk_position_dwords) * - sizeof(uint32_t); - for (uint32_t i = 0; i < 4; ++i) { - uint32_t color_name_position_dwords = chunk_position_dwords + - signature_position_dwords + - i * signature_size_dwords; - shader_object_[color_name_position_dwords] = new_offset; - } - new_offset += AppendString(shader_object_, "SV_Target"); - if (writes_depth_) { - uint32_t depth_name_position_dwords = chunk_position_dwords + - signature_position_dwords + - 4 * signature_size_dwords; - shader_object_[depth_name_position_dwords] = new_offset; - new_offset += AppendString(shader_object_, "SV_Depth"); + // Write the semantic names. + new_offset = (uint32_t(shader_object_.size()) - chunk_position_dwords) * + sizeof(uint32_t); + for (uint32_t i = 0; i < 4; ++i) { + uint32_t color_name_position_dwords = chunk_position_dwords + + signature_position_dwords + + i * signature_size_dwords; + shader_object_[color_name_position_dwords] = new_offset; + } + new_offset += AppendString(shader_object_, "SV_Target"); + if (writes_depth_) { + uint32_t depth_name_position_dwords = chunk_position_dwords + + signature_position_dwords + + 4 * signature_size_dwords; + shader_object_[depth_name_position_dwords] = new_offset; + new_offset += AppendString(shader_object_, "SV_Depth"); + } } } } @@ -10688,14 +11417,15 @@ void DxbcShaderTranslator::WriteShaderCode() { EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_INPUT, 0b0011, 1)); shader_object_.push_back(kPSInPointParametersRegister); ++stat_.dcl_count; - // Position input (only XY needed). + // Position input (only XY needed for ps_param_gen, but for ROV access, XYZ + // are needed). shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INPUT_PS_SIV) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4) | ENCODE_D3D10_SB_INPUT_INTERPOLATION_MODE( D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE)); - shader_object_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_INPUT, 0b0011, 1)); + shader_object_.push_back(EncodeVectorMaskedOperand( + D3D10_SB_OPERAND_TYPE_INPUT, edram_rov_used_ ? 0b0111 : 0b0011, 1)); shader_object_.push_back(kPSInPositionRegister); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_POSITION)); ++stat_.dcl_count; @@ -10712,8 +11442,8 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(kPSInFrontFaceRegister); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_IS_FRONT_FACE)); ++stat_.dcl_count; - // Color output. if (!edram_rov_used_) { + // Color output. for (uint32_t i = 0; i < 4; ++i) { shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | @@ -10723,16 +11453,15 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(i); ++stat_.dcl_count; } - } - // Depth output. - // TODO(Triang3l): Do something with this for ROV. - if (writes_depth_) { - shader_object_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(2)); - shader_object_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); - ++stat_.dcl_count; + // Depth output. + if (writes_depth_) { + shader_object_.push_back( + ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(2)); + shader_object_.push_back( + EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); + ++stat_.dcl_count; + } } } diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 7859cc5a8..028ef106d 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -35,38 +35,66 @@ class DxbcShaderTranslator : public ShaderTranslator { }; enum : uint32_t { - kSysFlag_XYDividedByW = 1, - kSysFlag_ZDividedByW = kSysFlag_XYDividedByW << 1, - kSysFlag_WNotReciprocal = kSysFlag_ZDividedByW << 1, - kSysFlag_ReverseZ = kSysFlag_WNotReciprocal << 1, - kSysFlag_Color0Gamma = kSysFlag_ReverseZ << 1, - kSysFlag_Color1Gamma = kSysFlag_Color0Gamma << 1, - kSysFlag_Color2Gamma = kSysFlag_Color1Gamma << 1, - kSysFlag_Color3Gamma = kSysFlag_Color2Gamma << 1, + kSysFlag_XYDividedByW_Shift, + kSysFlag_ZDividedByW_Shift, + kSysFlag_WNotReciprocal_Shift, + kSysFlag_ReverseZ_Shift, + kSysFlag_DepthStencilRead_Shift, + // Depth/stencil testing not done if DepthStencilRead is disabled, but + // writing may still be done. + kSysFlag_DepthPassIfLess_Shift, + kSysFlag_DepthPassIfEqual_Shift, + kSysFlag_DepthPassIfGreater_Shift, + // This doesn't include depth/stencil masks - only reflects the fact that + // the new value must be written. + kSysFlag_DepthStencilWrite_Shift, + // If don't need to read or write the depth component of the depth/stencil + // buffer, better disable kSysFlag_DepthFloat24 because float->unorm is + // easier to perform than float32->float24. + kSysFlag_DepthFloat24_Shift, + kSysFlag_Color0Gamma_Shift, + kSysFlag_Color1Gamma_Shift, + kSysFlag_Color2Gamma_Shift, + kSysFlag_Color3Gamma_Shift, + + kSysFlag_XYDividedByW = 1u << kSysFlag_XYDividedByW_Shift, + kSysFlag_ZDividedByW = 1u << kSysFlag_ZDividedByW_Shift, + kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift, + kSysFlag_ReverseZ = 1u << kSysFlag_ReverseZ_Shift, + kSysFlag_DepthStencilRead = 1u << kSysFlag_DepthStencilRead_Shift, + kSysFlag_DepthPassIfLess = 1u << kSysFlag_DepthPassIfLess_Shift, + kSysFlag_DepthPassIfEqual = 1u << kSysFlag_DepthPassIfEqual_Shift, + kSysFlag_DepthPassIfGreater = 1u << kSysFlag_DepthPassIfGreater_Shift, + kSysFlag_DepthStencilWrite = 1u << kSysFlag_DepthStencilWrite_Shift, + kSysFlag_DepthFloat24 = 1u << kSysFlag_DepthFloat24_Shift, + kSysFlag_Color0Gamma = 1u << kSysFlag_Color0Gamma_Shift, + kSysFlag_Color1Gamma = 1u << kSysFlag_Color1Gamma_Shift, + kSysFlag_Color2Gamma = 1u << kSysFlag_Color2Gamma_Shift, + kSysFlag_Color3Gamma = 1u << kSysFlag_Color3Gamma_Shift, }; enum : uint32_t { // Whether the write mask is non-zero. - kRTFlag_Used_Shift = 0, - kRTFlag_Used = 1u << kRTFlag_Used_Shift, + kRTFlag_Used_Shift, // Whether the render target needs to be merged with another (if the write // mask is not 1111, or 11 for 16_16, or 1 for 32_FLOAT, or blending is // enabled and it's not no-op). - kRTFlag_Load_Shift = kRTFlag_Used_Shift + 1, - kRTFlag_Load = 1u << kRTFlag_Load_Shift, - kRTFlag_Blend_Shift = kRTFlag_Load_Shift + 1, - kRTFlag_Blend = 1u << kRTFlag_Blend_Shift, - + kRTFlag_Load_Shift, + kRTFlag_Blend_Shift, // Whether the format is fixed-point and needs to be converted to integer // (k_8_8_8_8, k_2_10_10_10, k_16_16, k_16_16_16_16). - kRTFlag_FormatFixed_Shift = kRTFlag_Blend_Shift + 1, - kRTFlag_FormatFixed = 1u << kRTFlag_FormatFixed_Shift, + kRTFlag_FormatFixed_Shift, // Whether the format is k_2_10_10_10_FLOAT and 7e3 conversion is needed. - kRTFlag_FormatFloat10_Shift = kRTFlag_FormatFixed_Shift + 1, - kRTFlag_FormatFloat10 = 1u << kRTFlag_FormatFloat10_Shift, + kRTFlag_FormatFloat10_Shift, // Whether the format is k_16_16_FLOAT or k_16_16_16_16_FLOAT and // f16tof32/f32tof16 is needed. - kRTFlag_FormatFloat16_Shift = kRTFlag_FormatFloat10_Shift + 1, + kRTFlag_FormatFloat16_Shift, + + kRTFlag_Used = 1u << kRTFlag_Used_Shift, + kRTFlag_Load = 1u << kRTFlag_Load_Shift, + kRTFlag_Blend = 1u << kRTFlag_Blend_Shift, + kRTFlag_FormatFixed = 1u << kRTFlag_FormatFixed_Shift, + kRTFlag_FormatFloat10 = 1u << kRTFlag_FormatFloat10_Shift, kRTFlag_FormatFloat16 = 1u << kRTFlag_FormatFloat16_Shift, }; @@ -220,7 +248,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // The range is floats as uints so it's easier to pass infinity. uint32_t alpha_test_range[2]; uint32_t edram_pitch_tiles; - uint32_t padding_5; + uint32_t edram_depth_base_dwords; // vec4 6 float color_exp_bias[4]; @@ -420,8 +448,11 @@ class DxbcShaderTranslator : public ShaderTranslator { kSysConst_EDRAMPitchTiles_Index = kSysConst_AlphaTestRange_Index + 1, kSysConst_EDRAMPitchTiles_Vec = kSysConst_AlphaTestRange_Vec, kSysConst_EDRAMPitchTiles_Comp = 2, + kSysConst_EDRAMDepthBaseDwords_Index = kSysConst_EDRAMPitchTiles_Index + 1, + kSysConst_EDRAMDepthBaseDwords_Vec = kSysConst_AlphaTestRange_Vec, + kSysConst_EDRAMDepthBaseDwords_Comp = 3, - kSysConst_ColorExpBias_Index = kSysConst_EDRAMPitchTiles_Index + 1, + kSysConst_ColorExpBias_Index = kSysConst_EDRAMDepthBaseDwords_Index + 1, kSysConst_ColorExpBias_Vec = kSysConst_AlphaTestRange_Vec + 1, kSysConst_ColorOutputMap_Index = kSysConst_ColorExpBias_Index + 1, @@ -588,6 +619,10 @@ class DxbcShaderTranslator : public ShaderTranslator { // Writing the epilogue. void CompleteVertexShader(); + // Converts the depth in system_temp_depth_.x to 24-bit unorm or float, + // depending on the flag value. Uses system_temp_depth_.yz as scratch - w not + // touched. + void CompletePixelShader_DepthTo24Bit(); void CompletePixelShader_WriteToRTVs(); // Extracts widths and offsets of the components in the lower or the upper // dword of a pixel from the format constants, for use as ibfe and bfi @@ -850,6 +885,11 @@ class DxbcShaderTranslator : public ShaderTranslator { // Color outputs in pixel shaders (because of exponent bias, alpha test and // remapping). uint32_t system_temp_color_[4]; + // Depth output in pixel shader, and 3 dwords usable as scratch for operations + // related to depth. Currently only used for ROV depth. + // TODO(Triang3l): Reduce depth to 24-bit in pixel shaders when using a DSV + // for accuracy. + uint32_t system_temp_depth_; // Whether a predicate `if` is open. bool cf_currently_predicated_;