From 1b18e02fe0be80cf0acc3fad81382f5de39a6d4a Mon Sep 17 00:00:00 2001 From: refractionpcsx2 Date: Tue, 16 Aug 2022 10:11:17 +0100 Subject: [PATCH] GS: Add constant adjustment in blend mix when reverse subtracting. --- bin/resources/shaders/dx11/tfx.fx | 22 ++++++++++++---------- bin/resources/shaders/opengl/tfx_fs.glsl | 20 +++++++++++--------- bin/resources/shaders/vulkan/tfx.glsl | 22 ++++++++++++---------- pcsx2/GS/Renderers/Common/GSDevice.h | 2 +- pcsx2/GS/Renderers/HW/GSRendererHW.cpp | 2 +- pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm | 2 +- pcsx2/GS/Renderers/Metal/tfx.metal | 24 +++++++++++++----------- 7 files changed, 51 insertions(+), 43 deletions(-) diff --git a/bin/resources/shaders/dx11/tfx.fx b/bin/resources/shaders/dx11/tfx.fx index 17c990f1e0..9babd0e450 100644 --- a/bin/resources/shaders/dx11/tfx.fx +++ b/bin/resources/shaders/dx11/tfx.fx @@ -776,20 +776,22 @@ void ps_blend(inout float4 Color, inout float As, float2 pos_xy) // As/Af clamp alpha for Blend mix // We shouldn't clamp blend mix with clr1 as we want alpha higher - if (PS_BLEND_MIX && PS_CLR_HW != 1) + if (PS_BLEND_MIX > 0 && PS_CLR_HW != 1) C = min(C, 1.0f); if (PS_BLEND_A == PS_BLEND_B) Color.rgb = D; - else if (PS_BLEND_MIX) - // In blend_mix, HW adds on some alpha factor * dst. - // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. - // Instead, apply an offset to convert HW's round to a floor. - // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. - // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 - // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. - // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). - Color.rgb = ((A - B) * C + D) - (124.0f/256.0f); + // In blend_mix, HW adds on some alpha factor * dst. + // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. + // Instead, apply an offset to convert HW's round to a floor. + // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. + // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 + // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. + // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). + else if (PS_BLEND_MIX == 2) + Color.rgb = ((A - B) * C + D) + (124.0f/256.0f); + else if (PS_BLEND_MIX == 1) + Color.rgb = ((A - B) * C + D) - (124.0f/256.0f); else Color.rgb = trunc(((A - B) * C) + D); diff --git a/bin/resources/shaders/opengl/tfx_fs.glsl b/bin/resources/shaders/opengl/tfx_fs.glsl index f8d964e22b..b27aa0acf7 100644 --- a/bin/resources/shaders/opengl/tfx_fs.glsl +++ b/bin/resources/shaders/opengl/tfx_fs.glsl @@ -755,20 +755,22 @@ void ps_blend(inout vec4 Color, inout float As) // As/Af clamp alpha for Blend mix // We shouldn't clamp blend mix with clr1 as we want alpha higher -#if PS_BLEND_MIX && PS_CLR_HW != 1 +#if PS_BLEND_MIX > 0 && PS_CLR_HW != 1 C = min(C, 1.0f); #endif #if PS_BLEND_A == PS_BLEND_B Color.rgb = D; -#elif PS_BLEND_MIX - // In blend_mix, HW adds on some alpha factor * dst. - // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. - // Instead, apply an offset to convert HW's round to a floor. - // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. - // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 - // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. - // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). +// In blend_mix, HW adds on some alpha factor * dst. +// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. +// Instead, apply an offset to convert HW's round to a floor. +// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. +// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 +// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. +// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). +#elif PS_BLEND_MIX == 2 + Color.rgb = ((A - B) * C + D) + (124.0f/256.0f); +#elif PS_BLEND_MIX == 1 Color.rgb = ((A - B) * C + D) - (124.0f/256.0f); #else Color.rgb = trunc((A - B) * C + D); diff --git a/bin/resources/shaders/vulkan/tfx.glsl b/bin/resources/shaders/vulkan/tfx.glsl index 2ba3f747fa..ff8c87e7f6 100644 --- a/bin/resources/shaders/vulkan/tfx.glsl +++ b/bin/resources/shaders/vulkan/tfx.glsl @@ -1054,21 +1054,23 @@ void ps_blend(inout vec4 Color, inout float As) // As/Af clamp alpha for Blend mix // We shouldn't clamp blend mix with clr1 as we want alpha higher - #if PS_BLEND_MIX && PS_CLR_HW != 1 + #if PS_BLEND_MIX > 0 && PS_CLR_HW != 1 C = min(C, 1.0f); #endif #if PS_BLEND_A == PS_BLEND_B Color.rgb = D; - #elif PS_BLEND_MIX - // In blend_mix, HW adds on some alpha factor * dst. - // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. - // Instead, apply an offset to convert HW's round to a floor. - // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. - // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 - // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. - // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). - Color.rgb = ((A - B) * C + D) - (124.0f/256.0f); + // In blend_mix, HW adds on some alpha factor * dst. + // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. + // Instead, apply an offset to convert HW's round to a floor. + // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. + // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 + // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. + // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). + #elif PS_BLEND_MIX == 2 + Color.rgb = ((A - B) * C + D) + (124.0f/256.0f); + #elif PS_BLEND_MIX == 1 + Color.rgb = ((A - B) * C + D) - (124.0f/256.0f); #else Color.rgb = trunc((A - B) * C + D); #endif diff --git a/pcsx2/GS/Renderers/Common/GSDevice.h b/pcsx2/GS/Renderers/Common/GSDevice.h index d77da8423d..99a1e32433 100644 --- a/pcsx2/GS/Renderers/Common/GSDevice.h +++ b/pcsx2/GS/Renderers/Common/GSDevice.h @@ -268,7 +268,7 @@ struct alignas(16) GSHWDrawConfig u32 clr_hw : 3; u32 hdr : 1; u32 colclip : 1; - u32 blend_mix : 1; + u32 blend_mix : 2; u32 pabe : 1; u32 no_color : 1; // disables color output entirely (depth only) u32 no_color1 : 1; // disables second color output (when unnecessary) diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index 98efae9e1d..eb2d3813f4 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -2712,7 +2712,7 @@ void GSRendererHW::EmulateBlending(bool& DATE_PRIMID, bool& DATE_BARRIER, bool& { // For mixed blend, the source blend is done in the shader (so we use CONST_ONE as a factor). m_conf.blend = {true, GSDevice::CONST_ONE, blend.dst, blend.op, m_conf.ps.blend_c == 2, ALPHA.FIX}; - m_conf.ps.blend_mix = 1; + m_conf.ps.blend_mix = (blend.op == GSDevice::OP_REV_SUBTRACT) ? 2 : 1; // Elide DSB colour output if not used by dest. m_conf.ps.no_color1 |= !GSDevice::IsDualSourceBlendFactor(blend.dst); diff --git a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm index 5ed2fa824d..a364392323 100644 --- a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm +++ b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm @@ -1210,7 +1210,7 @@ void GSDeviceMTL::MRESetHWPipelineState(GSHWDrawConfig::VSSelector vssel, GSHWDr setFnConstantI(m_fn_constants, pssel.clr_hw, GSMTLConstantIndex_PS_CLR_HW); setFnConstantB(m_fn_constants, pssel.hdr, GSMTLConstantIndex_PS_HDR); setFnConstantB(m_fn_constants, pssel.colclip, GSMTLConstantIndex_PS_COLCLIP); - setFnConstantB(m_fn_constants, pssel.blend_mix, GSMTLConstantIndex_PS_BLEND_MIX); + setFnConstantI(m_fn_constants, pssel.blend_mix, GSMTLConstantIndex_PS_BLEND_MIX); setFnConstantB(m_fn_constants, pssel.fixed_one_a, GSMTLConstantIndex_PS_FIXED_ONE_A); setFnConstantB(m_fn_constants, pssel.pabe, GSMTLConstantIndex_PS_PABE); setFnConstantB(m_fn_constants, pssel.no_color, GSMTLConstantIndex_PS_NO_COLOR); diff --git a/pcsx2/GS/Renderers/Metal/tfx.metal b/pcsx2/GS/Renderers/Metal/tfx.metal index 350979279a..5443517964 100644 --- a/pcsx2/GS/Renderers/Metal/tfx.metal +++ b/pcsx2/GS/Renderers/Metal/tfx.metal @@ -48,7 +48,7 @@ constant uint PS_BLEND_D [[function_constant(GSMTLConstantIndex_PS_BL constant uint PS_CLR_HW [[function_constant(GSMTLConstantIndex_PS_CLR_HW)]]; constant bool PS_HDR [[function_constant(GSMTLConstantIndex_PS_HDR)]]; constant bool PS_COLCLIP [[function_constant(GSMTLConstantIndex_PS_COLCLIP)]]; -constant bool PS_BLEND_MIX [[function_constant(GSMTLConstantIndex_PS_BLEND_MIX)]]; +constant uint PS_BLEND_MIX [[function_constant(GSMTLConstantIndex_PS_BLEND_MIX)]]; constant bool PS_FIXED_ONE_A [[function_constant(GSMTLConstantIndex_PS_FIXED_ONE_A)]]; constant bool PS_PABE [[function_constant(GSMTLConstantIndex_PS_PABE)]]; constant bool PS_NO_COLOR [[function_constant(GSMTLConstantIndex_PS_NO_COLOR)]]; @@ -708,7 +708,7 @@ struct PSMain // Warning: normally blending equation is mult(A, B) = A * B >> 7. GPU have the full accuracy // GS: Color = 1, Alpha = 255 => output 1 // GPU: Color = 1/255, Alpha = 255/255 * 255/128 => output 1.9921875 - if (PS_DFMT == FMT_16 && (PS_HDR || !PS_BLEND_MIX)) + if (PS_DFMT == FMT_16 && (PS_HDR || PS_BLEND_MIX == 0)) // In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania C.rgb = float3(short3(C.rgb) & 0xF8); else if (PS_COLCLIP && !PS_HDR) @@ -745,19 +745,21 @@ struct PSMain // As/Af clamp alpha for Blend mix // We shouldn't clamp blend mix with clr1 as we want alpha higher - if (PS_BLEND_MIX && PS_CLR_HW != 1) + if (PS_BLEND_MIX > 0 && PS_CLR_HW != 1) C = min(C, 1.f); if (PS_BLEND_A == PS_BLEND_B) Color.rgb = D; - else if (PS_BLEND_MIX) - // In blend_mix, HW adds on some alpha factor * dst. - // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. - // Instead, apply an offset to convert HW's round to a floor. - // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. - // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 - // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. - // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). + // In blend_mix, HW adds on some alpha factor * dst. + // Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation. + // Instead, apply an offset to convert HW's round to a floor. + // Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision. + // But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399 + // Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause. + // 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256). + else if (PS_BLEND_MIX == 2) + Color.rgb = ((A - B) * C + D) + (124.f/256.f); + else if (PS_BLEND_MIX == 1) Color.rgb = ((A - B) * C + D) - (124.f/256.f); else Color.rgb = trunc((A - B) * C + D);