From 56f72da1371f5a18af362a8aa012adb2e3bd2232 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 7 Jun 2022 21:26:34 +0300 Subject: [PATCH 1/2] [GPU] More exact PWL texture/RT gamma conversion --- src/xenia/gpu/dxbc_shader_translator.cc | 173 ++++++++++++------ src/xenia/gpu/dxbc_shader_translator.h | 26 ++- src/xenia/gpu/dxbc_shader_translator_fetch.cc | 4 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 24 ++- src/xenia/gpu/xenos.cc | 85 +++++++++ src/xenia/gpu/xenos.h | 3 + 6 files changed, 238 insertions(+), 77 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 513e46887..1cf525b4c 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -212,63 +212,124 @@ void DxbcShaderTranslator::PopSystemTemp(uint32_t count) { system_temp_count_current_ -= std::min(count, system_temp_count_current_); } -void DxbcShaderTranslator::ConvertPWLGamma( - bool to_gamma, int32_t source_temp, uint32_t source_temp_component, - uint32_t target_temp, uint32_t target_temp_component, uint32_t piece_temp, - uint32_t piece_temp_component, uint32_t accumulator_temp, - uint32_t accumulator_temp_component) { - assert_true(source_temp != target_temp || - source_temp_component != target_temp_component || - ((target_temp != accumulator_temp || - target_temp_component != accumulator_temp_component) && - (target_temp != piece_temp || - target_temp_component != piece_temp_component))); - assert_true(piece_temp != source_temp || - piece_temp_component != source_temp_component); - assert_true(accumulator_temp != source_temp || - accumulator_temp_component != source_temp_component); - assert_true(piece_temp != accumulator_temp || - piece_temp_component != accumulator_temp_component); +void DxbcShaderTranslator::PWLGammaToLinear( + uint32_t target_temp, uint32_t target_temp_component, uint32_t source_temp, + uint32_t source_temp_component, bool source_pre_saturated, uint32_t temp1, + uint32_t 
temp1_component, uint32_t temp2, uint32_t temp2_component) { + // The source is needed only once to begin building the result, so it can be + // the same as the destination. + assert_true(temp1 != target_temp || temp1_component != target_temp_component); + assert_true(temp1 != source_temp || temp1_component != source_temp_component); + assert_true(temp2 != target_temp || temp2_component != target_temp_component); + assert_true(temp2 != source_temp || temp2_component != source_temp_component); + assert_true(temp1 != temp2 || temp1_component != temp2_component); + dxbc::Dest target_dest( + dxbc::Dest::R(target_temp, UINT32_C(1) << target_temp_component)); + dxbc::Src target_src(dxbc::Src::R(target_temp).Select(target_temp_component)); dxbc::Src source_src(dxbc::Src::R(source_temp).Select(source_temp_component)); - dxbc::Dest piece_dest(dxbc::Dest::R(piece_temp, 1 << piece_temp_component)); - dxbc::Src piece_src(dxbc::Src::R(piece_temp).Select(piece_temp_component)); - dxbc::Dest accumulator_dest( - dxbc::Dest::R(accumulator_temp, 1 << accumulator_temp_component)); - dxbc::Src accumulator_src( - dxbc::Src::R(accumulator_temp).Select(accumulator_temp_component)); - // For each piece: - // 1) Calculate how far we are on it. Multiply by 1/width, subtract - // start/width and saturate. - // 2) Add the contribution of the piece - multiply the position on the piece - // by its slope*width and accumulate. - // Piece 1. - a_.OpMul(piece_dest, source_src, - dxbc::Src::LF(to_gamma ? (1.0f / 0.0625f) : (1.0f / 0.25f)), true); - a_.OpMul(accumulator_dest, piece_src, - dxbc::Src::LF(to_gamma ? (4.0f * 0.0625f) : (0.25f * 0.25f))); - // Piece 2. - a_.OpMAd(piece_dest, source_src, - dxbc::Src::LF(to_gamma ? (1.0f / 0.0625f) : (1.0f / 0.125f)), - dxbc::Src::LF(to_gamma ? (-0.0625f / 0.0625f) : (-0.25f / 0.125f)), - true); - a_.OpMAd(accumulator_dest, piece_src, - dxbc::Src::LF(to_gamma ? (2.0f * 0.0625f) : (0.5f * 0.125f)), - accumulator_src); - // Piece 3. 
- a_.OpMAd(piece_dest, source_src, - dxbc::Src::LF(to_gamma ? (1.0f / 0.375f) : (1.0f / 0.375f)), - dxbc::Src::LF(to_gamma ? (-0.125f / 0.375f) : (-0.375f / 0.375f)), - true); - a_.OpMAd(accumulator_dest, piece_src, - dxbc::Src::LF(to_gamma ? (1.0f * 0.375f) : (1.0f * 0.375f)), - accumulator_src); - // Piece 4. - a_.OpMAd(piece_dest, source_src, - dxbc::Src::LF(to_gamma ? (1.0f / 0.5f) : (1.0f / 0.25f)), - dxbc::Src::LF(to_gamma ? (-0.5f / 0.5f) : (-0.75f / 0.25f)), true); - a_.OpMAd(dxbc::Dest::R(target_temp, 1 << target_temp_component), piece_src, - dxbc::Src::LF(to_gamma ? (0.5f * 0.5f) : (2.0f * 0.25f)), - accumulator_src); + dxbc::Dest temp1_dest(dxbc::Dest::R(temp1, UINT32_C(1) << temp1_component)); + dxbc::Src temp1_src(dxbc::Src::R(temp1).Select(temp1_component)); + dxbc::Dest temp2_dest(dxbc::Dest::R(temp2, UINT32_C(1) << temp2_component)); + dxbc::Src temp2_src(dxbc::Src::R(temp2).Select(temp2_component)); + + // Get the scale (into temp1) and the offset (into temp2) for the piece. + // Using `source >= threshold` comparisons because the input might have not + // been saturated yet, and thus it may be NaN - since it will be saturated to + // 0 later, the 0...64/255 case should be selected for it. + a_.OpGE(temp2_dest, source_src, dxbc::Src::LF(96.0f / 255.0f)); + a_.OpIf(true, temp2_src); + // [96/255 ... 1 + a_.OpGE(temp2_dest, source_src, dxbc::Src::LF(192.0f / 255.0f)); + a_.OpMovC(temp1_dest, temp2_src, dxbc::Src::LF(8.0f / 1024.0f), + dxbc::Src::LF(4.0f / 1024.0f)); + a_.OpMovC(temp2_dest, temp2_src, dxbc::Src::LF(-1024.0f), + dxbc::Src::LF(-256.0f)); + a_.OpElse(); + // 0 ... 96/255) + a_.OpGE(temp2_dest, source_src, dxbc::Src::LF(64.0f / 255.0f)); + a_.OpMovC(temp1_dest, temp2_src, dxbc::Src::LF(2.0f / 1024.0f), + dxbc::Src::LF(1.0f / 1024.0f)); + a_.OpMovC(temp2_dest, temp2_src, dxbc::Src::LF(-64.0f), dxbc::Src::LF(0.0f)); + a_.OpEndIf(); + + if (!source_pre_saturated) { + // Saturate the input, and flush NaN to 0. 
+ a_.OpMov(target_dest, source_src, true); + } + // linear = gamma * (255 * 1024) * scale + offset + // As both 1024 and the scale are powers of 2, and 1024 * scale is not smaller + // than 1, it's not important if it's (gamma * 255) * 1024 * scale, + // (gamma * 255 * 1024) * scale, gamma * 255 * (1024 * scale), or + // gamma * (255 * 1024 * scale) - or the option chosen here, as long as + // 1024 is applied before the scale since the scale is < 1 (specifically at + // least 1/1024), and it may make very small values denormal. + a_.OpMul(target_dest, source_pre_saturated ? source_src : target_src, + dxbc::Src::LF(255.0f * 1024.0f)); + a_.OpMAd(target_dest, target_src, temp1_src, temp2_src); + // linear += trunc(linear * scale) + a_.OpMul(temp1_dest, target_src, temp1_src); + a_.OpRoundZ(temp1_dest, temp1_src); + a_.OpAdd(target_dest, target_src, temp1_src); + // linear *= 1/1023 + a_.OpMul(target_dest, target_src, dxbc::Src::LF(1.0f / 1023.0f)); +} + +void DxbcShaderTranslator::PreSaturatedLinearToPWLGamma( + uint32_t target_temp, uint32_t target_temp_component, uint32_t source_temp, + uint32_t source_temp_component, uint32_t temp_or_target, + uint32_t temp_or_target_component, uint32_t temp_non_target, + uint32_t temp_non_target_component) { + // The source may be the same as the target, but in this case it can't also be + // used as a temporary variable. 
+ assert_true(target_temp != source_temp || + target_temp_component != source_temp_component || + target_temp != temp_or_target || + target_temp_component != temp_or_target_component); + assert_true(temp_or_target != source_temp || + temp_or_target_component != source_temp_component); + assert_true(temp_non_target != target_temp || + temp_non_target_component != target_temp_component); + assert_true(temp_non_target != source_temp || + temp_non_target_component != source_temp_component); + assert_true(temp_or_target != temp_non_target || + temp_or_target_component != temp_non_target_component); + dxbc::Dest target_dest( + dxbc::Dest::R(target_temp, UINT32_C(1) << target_temp_component)); + dxbc::Src target_src(dxbc::Src::R(target_temp).Select(target_temp_component)); + dxbc::Src source_src(dxbc::Src::R(source_temp).Select(source_temp_component)); + dxbc::Dest temp_or_target_dest( + dxbc::Dest::R(temp_or_target, UINT32_C(1) << temp_or_target_component)); + dxbc::Src temp_or_target_src( + dxbc::Src::R(temp_or_target).Select(temp_or_target_component)); + dxbc::Dest temp_non_target_dest( + dxbc::Dest::R(temp_non_target, UINT32_C(1) << temp_non_target_component)); + dxbc::Src temp_non_target_src( + dxbc::Src::R(temp_non_target).Select(temp_non_target_component)); + + // Get the scale (into temp_or_target) and the offset (into temp_non_target) + // for the piece. + a_.OpGE(temp_non_target_dest, source_src, dxbc::Src::LF(128.0f / 1023.0f)); + a_.OpIf(true, temp_non_target_src); + // [128/1023 ... 1 + a_.OpGE(temp_non_target_dest, source_src, dxbc::Src::LF(512.0f / 1023.0f)); + a_.OpMovC(temp_or_target_dest, temp_non_target_src, + dxbc::Src::LF(1023.0f / 8.0f), dxbc::Src::LF(1023.0f / 4.0f)); + a_.OpMovC(temp_non_target_dest, temp_non_target_src, + dxbc::Src::LF(128.0f / 255.0f), dxbc::Src::LF(64.0f / 255.0f)); + a_.OpElse(); + // 0 ... 
128/1023) + a_.OpGE(temp_non_target_dest, source_src, dxbc::Src::LF(64.0f / 1023.0f)); + a_.OpMovC(temp_or_target_dest, temp_non_target_src, + dxbc::Src::LF(1023.0f / 2.0f), dxbc::Src::LF(1023.0f)); + a_.OpMovC(temp_non_target_dest, temp_non_target_src, + dxbc::Src::LF(32.0f / 255.0f), dxbc::Src::LF(0.0f)); + a_.OpEndIf(); + + // gamma = trunc(linear * scale) * (1.0 / 255.0) + offset + a_.OpMul(target_dest, source_src, temp_or_target_src); + a_.OpRoundZ(target_dest, target_src); + a_.OpMAd(target_dest, target_src, dxbc::Src::LF(1.0f / 255.0f), + temp_non_target_src); } void DxbcShaderTranslator::RemapAndConvertVertexIndices( diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 726f96cc2..6b78310e8 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -664,15 +664,23 @@ class DxbcShaderTranslator : public ShaderTranslator { // Frees the last allocated internal r# registers for later reuse. void PopSystemTemp(uint32_t count = 1); - // Converts one scalar to or from PWL gamma, using 1 temporary scalar. - // The target may be the same as any of the source, the piece temporary or the - // accumulator, but not two or three of these. - // The piece and the accumulator can't be the same as source or as each other. - void ConvertPWLGamma(bool to_gamma, int32_t source_temp, - uint32_t source_temp_component, uint32_t target_temp, - uint32_t target_temp_component, uint32_t piece_temp, - uint32_t piece_temp_component, uint32_t accumulator_temp, - uint32_t accumulator_temp_component); + // Converts one scalar from piecewise linear gamma to linear. The target may + // be the same as the source, the temporary variables must be different. If + // the source is not pre-saturated, saturation will be done internally. 
+ void PWLGammaToLinear(uint32_t target_temp, uint32_t target_temp_component, + uint32_t source_temp, uint32_t source_temp_component, + bool source_pre_saturated, uint32_t temp1, + uint32_t temp1_component, uint32_t temp2, + uint32_t temp2_component); + // Converts one scalar, which must be saturated before calling this function, + // from linear to piecewise linear gamma. The target may be the same as either + // the source or temp_or_target, but not both (and temp_or_target may + // not be the same as the source). temp_non_target must differ from all three. + void PreSaturatedLinearToPWLGamma( + uint32_t target_temp, uint32_t target_temp_component, + uint32_t source_temp, uint32_t source_temp_component, + uint32_t temp_or_target, uint32_t temp_or_target_component, + uint32_t temp_non_target, uint32_t temp_non_target_component); bool IsSampleRate() const { assert_true(is_pixel_shader()); diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 4a84119f8..7716c4a26 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -2103,8 +2103,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( a_.OpIf(false, dxbc::Src::R(gamma_temp, dxbc::Src::kXXXX)); } // Convert from piecewise linear. - ConvertPWLGamma(false, system_temp_result_, i, system_temp_result_, i, - gamma_temp, 0, gamma_temp, 1); + PWLGammaToLinear(system_temp_result_, i, system_temp_result_, i, false, + gamma_temp, 0, gamma_temp, 1); if (gamma_render_target_as_srgb_) { a_.OpElse(); // Convert from sRGB.
diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 685911285..eb5a8bd38 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -1384,8 +1384,8 @@ void DxbcShaderTranslator::ROV_UnpackColor( dxbc::Src::LF(1.0f / 255.0f)); if (i) { for (uint32_t j = 0; j < 3; ++j) { - ConvertPWLGamma(false, color_temp, j, color_temp, j, temp1, - temp1_component, temp2, temp2_component); + PWLGammaToLinear(color_temp, j, color_temp, j, true, temp1, + temp1_component, temp2, temp2_component); } } a_.OpBreak(); @@ -1537,8 +1537,9 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( : xenos::ColorRenderTargetFormat::k_8_8_8_8))); for (uint32_t j = 0; j < 4; ++j) { if (i && j < 3) { - ConvertPWLGamma(true, color_temp, j, temp1, temp1_component, temp1, - temp1_component, temp2, temp2_component); + PreSaturatedLinearToPWLGamma(temp1, temp1_component, color_temp, j, + temp1, temp1_component, temp2, + temp2_component); // Denormalize and add 0.5 for rounding. a_.OpMAd(temp1_dest, temp1_src, dxbc::Src::LF(255.0f), dxbc::Src::LF(0.5f)); @@ -1863,10 +1864,10 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { if (!(shader_writes_color_targets & (1 << i))) { continue; } + uint32_t system_temp_color = system_temps_color_[i]; // Apply the exponent bias after alpha to coverage because it needs the - // unbiased alpha from the shader - a_.OpMul(dxbc::Dest::R(system_temps_color_[i]), - dxbc::Src::R(system_temps_color_[i]), + // unbiased alpha from the shader. 
+ a_.OpMul(dxbc::Dest::R(system_temp_color), dxbc::Src::R(system_temp_color), LoadSystemConstant( SystemConstants::Index::kColorExpBias, offsetof(SystemConstants, color_exp_bias) + sizeof(float) * i, @@ -1878,14 +1879,17 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { a_.OpAnd(dxbc::Dest::R(gamma_temp, 0b0001), LoadFlagsSystemConstant(), dxbc::Src::LU(kSysFlag_ConvertColor0ToGamma << i)); a_.OpIf(true, dxbc::Src::R(gamma_temp, dxbc::Src::kXXXX)); + // Saturate before the gamma conversion. + a_.OpMov(dxbc::Dest::R(system_temp_color, 0b0111), + dxbc::Src::R(system_temp_color), true); for (uint32_t j = 0; j < 3; ++j) { - ConvertPWLGamma(true, system_temps_color_[i], j, system_temps_color_[i], - j, gamma_temp, 0, gamma_temp, 1); + PreSaturatedLinearToPWLGamma(system_temp_color, j, system_temp_color, j, + gamma_temp, 0, gamma_temp, 1); } a_.OpEndIf(); } // Copy the color from a readable temp register to an output register. - a_.OpMov(dxbc::Dest::O(i), dxbc::Src::R(system_temps_color_[i])); + a_.OpMov(dxbc::Dest::O(i), dxbc::Src::R(system_temp_color)); } // Release gamma_temp. PopSystemTemp(); diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index 4d4a279b5..397ba3b24 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -17,6 +17,91 @@ namespace xe { namespace gpu { namespace xenos { +// Based on X360GammaToLinear and X360LinearToGamma from the Source Engine, with +// additional logic from Direct3D 9 code in game executable disassembly, located +// via the floating-point constants involved. +// https://github.com/ValveSoftware/source-sdk-2013/blob/master/mp/src/mathlib/color_conversion.cpp#L329 +// These are provided here in part as a reference for shader translators. + +float PWLGammaToLinear(float gamma) { + // Not found in game executables, so just using the logic similar to that in + // the Source Engine. 
+ gamma = xe::saturate_unsigned(gamma); + float scale, offset; + // While the compiled code for linear to gamma conversion uses `vcmpgtfp + // constant, value` comparison (constant > value, or value < constant), it's + // preferable to use `value >= constant` condition for the higher pieces, as + // it will never pass for NaN, and in case of NaN, the 0...64/255 case will be + // selected regardless of whether it's saturated before or after the + // comparisons (always pre-saturating here, but shader translators may choose + // to saturate later for convenience), as saturation will flush NaN to 0. + if (gamma >= 96.0f / 255.0f) { + if (gamma >= 192.0f / 255.0f) { + scale = 8.0f / 1024.0f; + offset = -1024.0f; + } else { + scale = 4.0f / 1024.0f; + offset = -256.0f; + } + } else { + if (gamma >= 64.0f / 255.0f) { + scale = 2.0f / 1024.0f; + offset = -64.0f; + } else { + scale = 1.0f / 1024.0f; + offset = 0.0f; + // No `floor` term in this case in the Source Engine, but for the largest + // value, 1.0, `floor(255.0f * (1.0f / 1024.0f))` is 0 anyway. + } + } + // Though in the Source Engine, the 1/1024 multiplication is done for the + // truncated part specifically, pre-baking it into the scale is lossless - + // both 1024 and `scale` are powers of 2. + float linear = gamma * ((255.0f * 1024.0f) * scale) + offset; + // For consistency with linear to gamma, and because it's more logical here + // (0 rather than 1 at -epsilon), using `trunc` instead of `floor`. + linear += std::trunc(linear * scale); + linear *= 1.0f / 1023.0f; + // Clamping is not necessary (1 * (255 * 8) - 1024 + 7 is exactly 1023). 
+ return linear; +} + +float LinearToPWLGamma(float linear) { + linear = xe::saturate_unsigned(linear); + float scale, offset; + // While the compiled code uses `vcmpgtfp constant, value` comparison + // (constant > value, or value < constant), it's preferable to use `value >= + // constant` condition for the higher pieces, as it will never pass for NaN, + // and in case of NaN, the 0...64/1023 case will be selected regardless of + // whether it's saturated before or after the comparisons (always + // pre-saturating here, but shader translators may choose to saturate later + // for convenience), as saturation will flush NaN to 0. + if (linear >= 128.0f / 1023.0f) { + if (linear >= 512.0f / 1023.0f) { + scale = 1023.0f / 8.0f; + offset = 128.0f / 255.0f; + } else { + scale = 1023.0f / 4.0f; + offset = 64.0f / 255.0f; + } + } else { + if (linear >= 64.0f / 1023.0f) { + scale = 1023.0f / 2.0f; + offset = 32.0f / 255.0f; + } else { + scale = 1023.0f; + offset = 0.0f; + } + } + // The truncation isn't in X360LinearToGamma in the Source Engine, but is + // there in Direct3D 9 disassembly (the `vrfiz` instructions). + // It also prevents conversion of 1.0 to 1.0034313725490196078431372549016 + // that's handled via clamping in the Source Engine. + // 127.875 (1023 / 8) is truncated to 127, which, after scaling, becomes + // 127 / 255, and when 128 / 255 is added, the result is 1. 
+ return std::trunc(linear * scale) * (1.0f / 255.0f) + offset; +} + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp float Float7e3To32(uint32_t f10) { diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 97c83639e..c4e0870d6 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -327,6 +327,9 @@ enum class DepthRenderTargetFormat : uint32_t { const char* GetDepthRenderTargetFormatName(DepthRenderTargetFormat format); +float PWLGammaToLinear(float gamma); +float LinearToPWLGamma(float linear); + // Converts Xenos floating-point 7e3 color value in bits 0:9 (not clamping) to // an IEEE-754 32-bit floating-point number. float Float7e3To32(uint32_t f10); From 78d1eb8bf82ec621587712c44f7c0f7c6c9ad54c Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 9 Jun 2022 21:34:21 +0300 Subject: [PATCH 2/2] [GPU] TextureCache::GetActiveTextureHostSwizzle --- src/xenia/gpu/texture_cache.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index b2c5b1d60..48a54da38 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -98,8 +98,11 @@ class TextureCache { // "ActiveTexture" means as of the latest RequestTextures call. - // Returns the post-swizzle signedness of a currently bound texture (must be - // called after RequestTextures). + uint8_t GetActiveTextureHostSwizzle(uint32_t fetch_constant_index) const { + const TextureBinding* binding = + GetValidTextureBinding(fetch_constant_index); + return binding ? binding->host_swizzle : xenos::XE_GPU_TEXTURE_SWIZZLE_0000; + } uint8_t GetActiveTextureSwizzledSigns(uint32_t fetch_constant_index) const { const TextureBinding* binding = GetValidTextureBinding(fetch_constant_index);