From 0fbf0eec9deb8caa4ba094241abc3d6379247f08 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 5 Jul 2020 22:50:24 +0300 Subject: [PATCH] [DXBC] ROV: Inline 24-bit depth conversion --- src/xenia/gpu/dxbc_shader_translator.cc | 7 -- src/xenia/gpu/dxbc_shader_translator.h | 18 +++--- src/xenia/gpu/dxbc_shader_translator_om.cc | 75 ++++++++++++---------- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 716e5a9a0..69e1af1dd 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -812,15 +812,11 @@ void DxbcShaderTranslator::StartPixelShader() { void DxbcShaderTranslator::StartTranslation() { // Allocate labels and registers for subroutines. - label_rov_depth_to_24bit_ = UINT32_MAX; label_rov_depth_stencil_sample_ = UINT32_MAX; std::memset(label_rov_color_sample_, 0xFF, sizeof(label_rov_color_sample_)); uint32_t label_index = 0; system_temps_subroutine_count_ = 0; if (IsDxbcPixelShader() && edram_rov_used_) { - label_rov_depth_to_24bit_ = label_index++; - system_temps_subroutine_count_ = - std::max((uint32_t)1, system_temps_subroutine_count_); label_rov_depth_stencil_sample_ = label_index++; system_temps_subroutine_count_ = std::max((uint32_t)2, system_temps_subroutine_count_); @@ -1157,9 +1153,6 @@ void DxbcShaderTranslator::CompleteShaderCode() { // need the global system temps, and can't allocate their own temps (since // they may be called from anywhere and don't know anything about the caller's // register allocation). - if (label_rov_depth_to_24bit_ != UINT32_MAX) { - CompleteShaderCode_ROV_DepthTo24BitSubroutine(); - } if (label_rov_depth_stencil_sample_ != UINT32_MAX) { CompleteShaderCode_ROV_DepthStencilSampleSubroutine(); } diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 34286535c..014b5c138 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -2166,6 +2166,14 @@ class DxbcShaderTranslator : public ShaderTranslator { bool ROV_IsDepthStencilEarly() const { return !is_depth_only_pixel_shader_ && !writes_depth(); } + // Converts the depth value to 24-bit (storing the result in bits 0:23 and + // zeros in 24:31, not creating room for stencil - since this may be involved + // in comparisons) according to the format specified in the system constants. + // Source and destination may be the same, temporary must be different than + // both. + void ROV_DepthTo24Bit(uint32_t d24_temp, uint32_t d24_temp_component, + uint32_t d32_temp, uint32_t d32_temp_component, + uint32_t temp_temp, uint32_t temp_temp_component); // Does all the depth/stencil-related things, including or not including // writing based on whether it's late, or on whether it's safe to do it early. // Updates system_temp_rov_params_ result and coverage if allowed and safe, @@ -2231,15 +2239,6 @@ class DxbcShaderTranslator : public ShaderTranslator { void CompletePixelShader_WriteToROV(); void CompletePixelShader(); - // Writes a function that converts depth to 24 bits, putting it in 0:23, not - // creating space for stencil (ROV only). - // Input: - // - system_temps_subroutine_[0].x - Z/W + polygon offset at sample. - // Output: - // - system_temps_subroutine_[0].x - 24-bit depth. - // Local temps: - // - system_temps_subroutine_[0].y. - void CompleteShaderCode_ROV_DepthTo24BitSubroutine(); // Writes a function that does early (or both early and late, when not // separating) depth/stencil testing for one sample (ROV only). // Input: @@ -2496,7 +2495,6 @@ class DxbcShaderTranslator : public ShaderTranslator { // Subroutine labels. D3D10_SB_OPCODE_LABEL is not counted as an instruction // in STAT. - uint32_t label_rov_depth_to_24bit_; uint32_t label_rov_depth_stencil_sample_; uint32_t label_rov_color_sample_[4]; diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index f709ff794..cf9a9464b 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -9,6 +9,7 @@ #include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/base/assert.h" #include "xenia/base/math.h" namespace xe { @@ -386,13 +387,12 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { DxbcOpIf(true, DxbcSrc::R(temp1, DxbcSrc::kXXXX)); if (writes_depth()) { - // Convert the shader-generated depth to 24-bit - move the 32-bit depth to - // the conversion subroutine's argument. - DxbcOpMov(DxbcDest::R(system_temps_subroutine_, 0b0001), - DxbcSrc::R(system_temp_rov_depth_stencil_, DxbcSrc::kXXXX)); - // Convert the shader-generated depth to 24-bit. - DxbcOpCall(DxbcSrc::Label(label_rov_depth_to_24bit_)); - // Store a copy of the depth in temp1.x to reload later. + // Convert the shader-generated depth to 24-bit to + // system_temps_subroutine_[0].x, using temp1.x as temporary. + ROV_DepthTo24Bit(system_temps_subroutine_, 0, + system_temp_rov_depth_stencil_, 0, temp1, 0); + // Store a copy of the depth in temp1.x to reload to + // system_temps_subroutine_[0].x for samples other than the first one. // temp1.x = 24-bit oDepth DxbcOpMov(DxbcDest::R(temp1, 0b0001), DxbcSrc::R(system_temps_subroutine_, DxbcSrc::kXXXX)); @@ -573,9 +573,10 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { DxbcOpMin(DxbcDest::R(system_temps_subroutine_, 0b0001), DxbcSrc::R(system_temps_subroutine_, DxbcSrc::kXXXX), DxbcSrc::R(temp1, DxbcSrc::kYYYY), true); - // Convert the depth to 24-bit - takes system_temps_subroutine_[0].x, - // returns also in system_temps_subroutine_[0].x. - DxbcOpCall(DxbcSrc::Label(label_rov_depth_to_24bit_)); + // Convert the depth in system_temps_subroutine_[0].x to 24-bit, using + // system_temps_subroutine_[0].y as a temporary. + ROV_DepthTo24Bit(system_temps_subroutine_, 0, system_temps_subroutine_, 0, + system_temps_subroutine_, 1); } // Perform depth/stencil test for the sample, get the result in bits 4 @@ -1868,15 +1869,23 @@ void DxbcShaderTranslator::CompletePixelShader() { } } -void DxbcShaderTranslator::CompleteShaderCode_ROV_DepthTo24BitSubroutine() { - DxbcOpLabel(DxbcSrc::Label(label_rov_depth_to_24bit_)); +void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, + uint32_t d24_temp_component, + uint32_t d32_temp, + uint32_t d32_temp_component, + uint32_t temp_temp, + uint32_t temp_temp_component) { + assert_true(temp_temp != d24_temp || + temp_temp_component != d24_temp_component); + assert_true(temp_temp != d32_temp || + temp_temp_component != d32_temp_component); + // Source and destination may be the same. + DxbcDest d24_dest(DxbcDest::R(d24_temp, 1 << d24_temp_component)); + DxbcSrc d24_src(DxbcSrc::R(d24_temp).Select(d24_temp_component)); + DxbcSrc d32_src(DxbcSrc::R(d32_temp).Select(d32_temp_component)); + DxbcDest temp_dest(DxbcDest::R(temp_temp, 1 << temp_temp_component)); + DxbcSrc temp_src(DxbcSrc::R(temp_temp).Select(temp_temp_component)); - DxbcDest depth_dest(DxbcDest::R(system_temps_subroutine_, 0b0001)); - DxbcSrc depth_src(DxbcSrc::R(system_temps_subroutine_, DxbcSrc::kXXXX)); - DxbcDest temp_dest(DxbcDest::R(system_temps_subroutine_, 0b0010)); - DxbcSrc temp_src(DxbcSrc::R(system_temps_subroutine_, DxbcSrc::kYYYY)); - - // Extract the depth format to Y. Take 1 SGPR. system_constants_used_ |= 1ull << kSysConst_Flags_Index; DxbcOpAnd(temp_dest, DxbcSrc::CB(cbuffer_index_system_constants_, @@ -1884,7 +1893,7 @@ void DxbcShaderTranslator::CompleteShaderCode_ROV_DepthTo24BitSubroutine() { kSysConst_Flags_Vec) .Select(kSysConst_Flags_Comp), DxbcSrc::LU(kSysFlag_ROVDepthFloat24)); - // Convert according to the format. Release 1 SGPR. + // Convert according to the format. DxbcOpIf(true, temp_src); { // 20e4 conversion, using 1 VGPR. @@ -1894,12 +1903,12 @@ void DxbcShaderTranslator::CompleteShaderCode_ROV_DepthTo24BitSubroutine() { // Check if the number is too small to be represented as normalized 20e4. // temp = f32 < 2^-14 - DxbcOpULT(temp_dest, depth_src, DxbcSrc::LU(0x38800000)); + DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000)); // Handle denormalized numbers separately. DxbcOpIf(true, temp_src); { // temp = f32 >> 23 - DxbcOpUShR(temp_dest, depth_src, DxbcSrc::LU(23)); + DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23)); // temp = 113 - (f32 >> 23) DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src); // Don't allow the shift to overflow, since in DXBC the lower 5 bits of @@ -1907,11 +1916,11 @@ void DxbcShaderTranslator::CompleteShaderCode_ROV_DepthTo24BitSubroutine() { // temp = min(113 - (f32 >> 23), 24) DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24)); // biased_f32 = (f32 & 0x7FFFFF) | 0x800000 - DxbcOpBFI(depth_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), - depth_src); + DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), + d32_src); // biased_f32 = // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) - DxbcOpUShR(depth_dest, depth_src, temp_src); + DxbcOpUShR(d24_dest, d24_src, temp_src); } // Not denormalized? DxbcOpElse(); @@ -1919,37 +1928,35 @@ void DxbcShaderTranslator::CompleteShaderCode_ROV_DepthTo24BitSubroutine() { // Bias the exponent. // biased_f32 = f32 + (-112 << 23) // (left shift of a negative value is undefined behavior) - DxbcOpIAdd(depth_dest, depth_src, DxbcSrc::LU(0xC8000000u)); + DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u)); } // Close the denormal check. DxbcOpEndIf(); // Build the 20e4 number. // temp = (biased_f32 >> 3) & 1 - DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), depth_src); + DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src); // f24 = biased_f32 + 3 - DxbcOpIAdd(depth_dest, depth_src, DxbcSrc::LU(3)); + DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3)); // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) - DxbcOpIAdd(depth_dest, depth_src, temp_src); + DxbcOpIAdd(d24_dest, d24_src, temp_src); // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF - DxbcOpUBFE(depth_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), depth_src); + DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src); } DxbcOpElse(); { // Unorm24 conversion. // Multiply by float(0xFFFFFF). - DxbcOpMul(depth_dest, depth_src, DxbcSrc::LF(16777215.0f)); + DxbcOpMul(d24_dest, d32_src, DxbcSrc::LF(16777215.0f)); // Round to the nearest even integer. This seems to be the correct way: // rounding towards zero gives 0xFF instead of 0x100 in clear shaders in, // for instance, Halo 3, but other clear shaders in it are also broken if // 0.5 is added before ftou instead of round_ne. - DxbcOpRoundNE(depth_dest, depth_src); + DxbcOpRoundNE(d24_dest, d24_src); // Convert to fixed-point. - DxbcOpFToU(depth_dest, depth_src); + DxbcOpFToU(d24_dest, d24_src); } DxbcOpEndIf(); - - DxbcOpRet(); } void DxbcShaderTranslator::