GS: Add constant adjustment in blend mix when reverse subtracting.

This commit is contained in:
refractionpcsx2 2022-08-16 10:11:17 +01:00
parent 2e63a4c037
commit 1b18e02fe0
7 changed files with 51 additions and 43 deletions

View File

@ -776,20 +776,22 @@ void ps_blend(inout float4 Color, inout float As, float2 pos_xy)
// As/Af clamp alpha for Blend mix
// We shouldn't clamp blend mix with clr1 as we want alpha higher
if (PS_BLEND_MIX && PS_CLR_HW != 1)
if (PS_BLEND_MIX > 0 && PS_CLR_HW != 1)
C = min(C, 1.0f);
if (PS_BLEND_A == PS_BLEND_B)
Color.rgb = D;
else if (PS_BLEND_MIX)
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
Color.rgb = ((A - B) * C + D) - (124.0f/256.0f);
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
else if (PS_BLEND_MIX == 2)
Color.rgb = ((A - B) * C + D) + (124.0f/256.0f);
else if (PS_BLEND_MIX == 1)
Color.rgb = ((A - B) * C + D) - (124.0f/256.0f);
else
Color.rgb = trunc(((A - B) * C) + D);

View File

@ -755,20 +755,22 @@ void ps_blend(inout vec4 Color, inout float As)
// As/Af clamp alpha for Blend mix
// We shouldn't clamp blend mix with clr1 as we want alpha higher
#if PS_BLEND_MIX && PS_CLR_HW != 1
#if PS_BLEND_MIX > 0 && PS_CLR_HW != 1
C = min(C, 1.0f);
#endif
#if PS_BLEND_A == PS_BLEND_B
Color.rgb = D;
#elif PS_BLEND_MIX
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
#elif PS_BLEND_MIX == 2
Color.rgb = ((A - B) * C + D) + (124.0f/256.0f);
#elif PS_BLEND_MIX == 1
Color.rgb = ((A - B) * C + D) - (124.0f/256.0f);
#else
Color.rgb = trunc((A - B) * C + D);

View File

@ -1054,21 +1054,23 @@ void ps_blend(inout vec4 Color, inout float As)
// As/Af clamp alpha for Blend mix
// We shouldn't clamp blend mix with clr1 as we want alpha higher
#if PS_BLEND_MIX && PS_CLR_HW != 1
#if PS_BLEND_MIX > 0 && PS_CLR_HW != 1
C = min(C, 1.0f);
#endif
#if PS_BLEND_A == PS_BLEND_B
Color.rgb = D;
#elif PS_BLEND_MIX
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
Color.rgb = ((A - B) * C + D) - (124.0f/256.0f);
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
#elif PS_BLEND_MIX == 2
Color.rgb = ((A - B) * C + D) + (124.0f/256.0f);
#elif PS_BLEND_MIX == 1
Color.rgb = ((A - B) * C + D) - (124.0f/256.0f);
#else
Color.rgb = trunc((A - B) * C + D);
#endif

View File

@ -268,7 +268,7 @@ struct alignas(16) GSHWDrawConfig
u32 clr_hw : 3;
u32 hdr : 1;
u32 colclip : 1;
u32 blend_mix : 1;
u32 blend_mix : 2;
u32 pabe : 1;
u32 no_color : 1; // disables color output entirely (depth only)
u32 no_color1 : 1; // disables second color output (when unnecessary)

View File

@ -2712,7 +2712,7 @@ void GSRendererHW::EmulateBlending(bool& DATE_PRIMID, bool& DATE_BARRIER, bool&
{
// For mixed blend, the source blend is done in the shader (so we use CONST_ONE as a factor).
m_conf.blend = {true, GSDevice::CONST_ONE, blend.dst, blend.op, m_conf.ps.blend_c == 2, ALPHA.FIX};
m_conf.ps.blend_mix = 1;
m_conf.ps.blend_mix = (blend.op == GSDevice::OP_REV_SUBTRACT) ? 2 : 1;
// Elide DSB colour output if not used by dest.
m_conf.ps.no_color1 |= !GSDevice::IsDualSourceBlendFactor(blend.dst);

View File

@ -1210,7 +1210,7 @@ void GSDeviceMTL::MRESetHWPipelineState(GSHWDrawConfig::VSSelector vssel, GSHWDr
setFnConstantI(m_fn_constants, pssel.clr_hw, GSMTLConstantIndex_PS_CLR_HW);
setFnConstantB(m_fn_constants, pssel.hdr, GSMTLConstantIndex_PS_HDR);
setFnConstantB(m_fn_constants, pssel.colclip, GSMTLConstantIndex_PS_COLCLIP);
setFnConstantB(m_fn_constants, pssel.blend_mix, GSMTLConstantIndex_PS_BLEND_MIX);
setFnConstantI(m_fn_constants, pssel.blend_mix, GSMTLConstantIndex_PS_BLEND_MIX);
setFnConstantB(m_fn_constants, pssel.fixed_one_a, GSMTLConstantIndex_PS_FIXED_ONE_A);
setFnConstantB(m_fn_constants, pssel.pabe, GSMTLConstantIndex_PS_PABE);
setFnConstantB(m_fn_constants, pssel.no_color, GSMTLConstantIndex_PS_NO_COLOR);

View File

@ -48,7 +48,7 @@ constant uint PS_BLEND_D [[function_constant(GSMTLConstantIndex_PS_BL
constant uint PS_CLR_HW [[function_constant(GSMTLConstantIndex_PS_CLR_HW)]];
constant bool PS_HDR [[function_constant(GSMTLConstantIndex_PS_HDR)]];
constant bool PS_COLCLIP [[function_constant(GSMTLConstantIndex_PS_COLCLIP)]];
constant bool PS_BLEND_MIX [[function_constant(GSMTLConstantIndex_PS_BLEND_MIX)]];
constant uint PS_BLEND_MIX [[function_constant(GSMTLConstantIndex_PS_BLEND_MIX)]];
constant bool PS_FIXED_ONE_A [[function_constant(GSMTLConstantIndex_PS_FIXED_ONE_A)]];
constant bool PS_PABE [[function_constant(GSMTLConstantIndex_PS_PABE)]];
constant bool PS_NO_COLOR [[function_constant(GSMTLConstantIndex_PS_NO_COLOR)]];
@ -708,7 +708,7 @@ struct PSMain
// Warning: normally blending equation is mult(A, B) = A * B >> 7. GPU have the full accuracy
// GS: Color = 1, Alpha = 255 => output 1
// GPU: Color = 1/255, Alpha = 255/255 * 255/128 => output 1.9921875
if (PS_DFMT == FMT_16 && (PS_HDR || !PS_BLEND_MIX))
if (PS_DFMT == FMT_16 && (PS_HDR || PS_BLEND_MIX == 0))
// In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania
C.rgb = float3(short3(C.rgb) & 0xF8);
else if (PS_COLCLIP && !PS_HDR)
@ -745,19 +745,21 @@ struct PSMain
// As/Af clamp alpha for Blend mix
// We shouldn't clamp blend mix with clr1 as we want alpha higher
if (PS_BLEND_MIX && PS_CLR_HW != 1)
if (PS_BLEND_MIX > 0 && PS_CLR_HW != 1)
C = min(C, 1.f);
if (PS_BLEND_A == PS_BLEND_B)
Color.rgb = D;
else if (PS_BLEND_MIX)
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
// In blend_mix, HW adds on some alpha factor * dst.
// Truncating here wouldn't quite get the right result because it prevents the <1 bit here from combining with a <1 bit in dst to form a ≥1 amount that pushes over the truncation.
// Instead, apply an offset to convert HW's round to a floor.
// Since alpha is in 1/128 increments, subtracting (0.5 - 0.5/128 == 127/256) would get us what we want if GPUs blended in full precision.
// But they don't. Details here: https://github.com/PCSX2/pcsx2/pull/6809#issuecomment-1211473399
// Based on the scripts at the above link, the ideal choice for Intel GPUs is 126/256, AMD 120/256. Nvidia is a lost cause.
// 124/256 seems like a reasonable compromise, providing the correct answer 99.3% of the time on Intel (vs 99.6% for 126/256), and 97% of the time on AMD (vs 97.4% for 120/256).
else if (PS_BLEND_MIX == 2)
Color.rgb = ((A - B) * C + D) + (124.f/256.f);
else if (PS_BLEND_MIX == 1)
Color.rgb = ((A - B) * C + D) - (124.f/256.f);
else
Color.rgb = trunc((A - B) * C + D);