GS/HW: Allow blending on normal shuffles

2024-01-07 10:39:13 +00:00 · 2024-01-07 10:39:13 +00:00 · 01842a3c6b
parent 4cd385dbff
commit 01842a3c6b
6 changed files with 271 additions and 151 deletions
--- a/bin/resources/shaders/dx11/tfx.fx
+++ b/bin/resources/shaders/dx11/tfx.fx
@ -742,6 +742,25 @@ float4 ps_color(PS_INPUT input)
 	float4 T = sample_color(st, input.t.w);
 #endif

+	if (PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC)
+	{
+		uint4 denorm_c_before = uint4(T);
+		if (PS_READ_BA)
+		{
+			T.r = float((denorm_c_before.b << 3) & 0xF8);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
+			T.b = float((denorm_c_before.a << 1) & 0xF8);
+			T.a = float(denorm_c_before.a & 0x80);
+		}
+		else
+		{
+			T.r = float((denorm_c_before.r << 3) & 0xF8);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38) | ((denorm_c_before.g << 6) & 0xC0));
+			T.b = float((denorm_c_before.g << 1) & 0xF8);
+			T.a = float(denorm_c_before.g & 0x80);
+		}
+	}
+
 	float4 C = tfx(T, input.c);

 	atst(C);
@ -925,48 +944,6 @@ PS_OUTPUT ps_main(PS_INPUT input)
 			discard;
 	}

-	if (PS_SHUFFLE)
-	{
-		uint4 denorm_c = uint4(C);
-		uint2 denorm_TA = uint2(float2(TA.xy) * 255.0f + 0.5f);
-
-		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
-		if (PS_SHUFFLE_SAME)
-		{
-			if (PS_READ_BA)
-				C = (float4)(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
-			else
-				C.ga = C.rg;
-		}
-		// Copy of a 16bit source in to this target
-		else if (PS_READ16_SRC)
-		{
-			C.rb = (float2)float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5));
-			if (denorm_c.a & 0x80u)
-				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u));
-			else
-				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u));
-		}
-		// Write RB part. Mask will take care of the correct destination
-		else if (PS_READ_BA)
-		{
-			C.rb = C.bb;
-			if (denorm_c.a & 0x80u)
-				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		}
-		else
-		{
-			C.rb = C.rr;
-			if (denorm_c.g & 0x80u)
-				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
-
-			else
-				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		}
-	}
-
 	// Must be done before alpha correction

 	// AA (Fixed one) will output a coverage of 1.0 as alpha
@ -1023,6 +1000,63 @@ PS_OUTPUT ps_main(PS_INPUT input)

 	ps_blend(C, alpha_blend, input.p.xy);

+	if (PS_SHUFFLE)
+	{
+		if (!PS_SHUFFLE_SAME && !PS_READ16_SRC)
+		{
+			uint4 denorm_c_after = uint4(C);
+			if (PS_READ_BA)
+			{
+				C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			}
+			else
+			{
+				C.r = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.g = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			}
+		}
+
+		uint4 denorm_c = uint4(C);
+		uint2 denorm_TA = uint2(float2(TA.xy) * 255.0f + 0.5f);
+
+		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
+		if (PS_SHUFFLE_SAME)
+		{
+			if (PS_READ_BA)
+				C = (float4)(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
+			else
+				C.ga = C.rg;
+		}
+		// Copy of a 16bit source in to this target
+		else if (PS_READ16_SRC)
+		{
+			C.rb = (float2)float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5));
+			if (denorm_c.a & 0x80u)
+				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u));
+			else
+				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u));
+		}
+		// Write RB part. Mask will take care of the correct destination
+		else if (PS_READ_BA)
+		{
+			C.rb = C.bb;
+			if (denorm_c.a & 0x80u)
+				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		}
+		else
+		{
+			C.rb = C.rr;
+			if (denorm_c.g & 0x80u)
+				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
+
+			else
+				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		}
+	}
+
 	ps_dither(C.rgb, input.p.xy);

 	// Color clamp/wrap needs to be done after sw blending and dithering
--- a/bin/resources/shaders/opengl/tfx_fs.glsl
+++ b/bin/resources/shaders/opengl/tfx_fs.glsl
@ -687,6 +687,21 @@ vec4 ps_color()
 	vec4 T = sample_color(st);
 #endif

+	#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_before = uvec4(T); // Unpack the 5:5:5:1 value held in two 8-bit channels into R/G/B/A before tfx() runs; masks are unsigned to match the uvec4 operands.
+		#if PS_READ_BA
+			T.r = float((denorm_c_before.b << 3) & 0xF8u);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38u) | ((denorm_c_before.a << 6) & 0xC0u));
+			T.b = float((denorm_c_before.a << 1) & 0xF8u);
+			T.a = float(denorm_c_before.a & 0x80u);
+		#else
+			T.r = float((denorm_c_before.r << 3) & 0xF8u);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38u) | ((denorm_c_before.g << 6) & 0xC0u));
+			T.b = float((denorm_c_before.g << 1) & 0xF8u);
+			T.a = float(denorm_c_before.g & 0x80u);
+		#endif
+	#endif
+	
 	vec4 C = tfx(T, PSin.c);

 	atst(C);
@ -937,7 +952,56 @@ void ps_main()

 	vec4 C = ps_color();

+	// Must be done before alpha correction
+
+	// AA (Fixed one) will output a coverage of 1.0 as alpha
+#if PS_FIXED_ONE_A
+	C.a = 128.0f;
+#endif
+
+#if SW_AD_TO_HW
+	vec4 RT = trunc(fetch_rt() * 255.0f + 0.1f);
+	vec4 alpha_blend = vec4(RT.a / 128.0f);
+#else
+	vec4 alpha_blend = vec4(C.a / 128.0f);
+#endif
+
+	// Correct the ALPHA value based on the output format
+#if (PS_DST_FMT == FMT_16)
+	float A_one = 128.0f; // alpha output will be 0x80
+	C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;
+#elif (PS_DST_FMT == FMT_32) && (PS_FBA != 0)
+	if(C.a < 128.0f) C.a += 128.0f;
+#endif
+
+	// Get first primitive that will write a failing alpha value
+#if PS_DATE == 1
+	// DATM == 0
+	// Pixels with alpha equal to 1 will fail (128-255)
+	SV_Target0 = (C.a > 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
+	return;
+#elif PS_DATE == 2
+	// DATM == 1
+	// Pixels with alpha equal to 0 will fail (0-127)
+	SV_Target0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
+	return;
+#endif
+
+	ps_blend(C, alpha_blend);
+
+
 #if PS_SHUFFLE
+	#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_after = uvec4(C); // Repack the blended R/G/B/A back into the two 8-bit channels it was unpacked from; masks are unsigned to match the uvec4 operands.
+		#if PS_READ_BA
+			C.b = float(((denorm_c_after.r >> 3) & 0x1Fu) | ((denorm_c_after.g << 2) & 0xE0u));
+			C.a = float(((denorm_c_after.g >> 6) & 0x3u) | ((denorm_c_after.b >> 1) & 0x7Cu) | (denorm_c_after.a & 0x80u));
+		#else
+			C.r = float(((denorm_c_after.r >> 3) & 0x1Fu) | ((denorm_c_after.g << 2) & 0xE0u));
+			C.g = float(((denorm_c_after.g >> 6) & 0x3u) | ((denorm_c_after.b >> 1) & 0x7Cu) | (denorm_c_after.a & 0x80u));
+		#endif
+	#endif
+	
 	uvec4 denorm_c = uvec4(C);
 	uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);

@ -991,43 +1055,6 @@ void ps_main()
 #endif // PS_SHUFFLE_SAME
 #endif // PS_SHUFFLE

-	// Must be done before alpha correction
-
-	// AA (Fixed one) will output a coverage of 1.0 as alpha
-#if PS_FIXED_ONE_A
-	C.a = 128.0f;
-#endif
-
-#if SW_AD_TO_HW
-	vec4 RT = trunc(fetch_rt() * 255.0f + 0.1f);
-	vec4 alpha_blend = vec4(RT.a / 128.0f);
-#else
-	vec4 alpha_blend = vec4(C.a / 128.0f);
-#endif
-
-	// Correct the ALPHA value based on the output format
-#if (PS_DST_FMT == FMT_16)
-	float A_one = 128.0f; // alpha output will be 0x80
-	C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;
-#elif (PS_DST_FMT == FMT_32) && (PS_FBA != 0)
-	if(C.a < 128.0f) C.a += 128.0f;
-#endif
-
-	// Get first primitive that will write a failling alpha value
-#if PS_DATE == 1
-	// DATM == 0
-	// Pixel with alpha equal to 1 will failed (128-255)
-	SV_Target0 = (C.a > 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
-	return;
-#elif PS_DATE == 2
-	// DATM == 1
-	// Pixel with alpha equal to 0 will failed (0-127)
-	SV_Target0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
-	return;
-#endif
-
-	ps_blend(C, alpha_blend);
-
 	ps_dither(C.rgb);

 	// Color clamp/wrap needs to be done after sw blending and dithering
--- a/bin/resources/shaders/vulkan/tfx.glsl
+++ b/bin/resources/shaders/vulkan/tfx.glsl
@ -933,6 +933,21 @@ vec4 ps_color()
 	vec4 T = sample_color(st);
 #endif

+	#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_before = uvec4(T); // Unpack the 5:5:5:1 value held in two 8-bit channels into R/G/B/A before tfx() runs; masks are unsigned to match the uvec4 operands.
+		#if PS_READ_BA
+			T.r = float((denorm_c_before.b << 3) & 0xF8u);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38u) | ((denorm_c_before.a << 6) & 0xC0u));
+			T.b = float((denorm_c_before.a << 1) & 0xF8u);
+			T.a = float(denorm_c_before.a & 0x80u);
+		#else
+			T.r = float((denorm_c_before.r << 3) & 0xF8u);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38u) | ((denorm_c_before.g << 6) & 0xC0u));
+			T.b = float((denorm_c_before.g << 1) & 0xF8u);
+			T.a = float(denorm_c_before.g & 0x80u);
+		#endif
+	#endif
+	
 	vec4 C = tfx(T, vsIn.c);

 	atst(C);
@ -1184,40 +1199,6 @@ void main()

 	vec4 C = ps_color();

-	#if PS_SHUFFLE
-		uvec4 denorm_c = uvec4(C);
-		uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
-		
-		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
-		#if PS_SHUFFLE_SAME
-			#if (PS_READ_BA)
-				C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
-			#else
-				C.ga = C.rg;
-			#endif
-		// Copy of a 16bit source in to this target
-		#elif PS_READ16_SRC
-			C.rb = vec2(float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5)));
-			if ((denorm_c.a & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
-		// Write RB part. Mask will take care of the correct destination
-		#elif PS_READ_BA
-			C.rb = C.bb;
-			if ((denorm_c.a & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		#else
-			C.rb = C.rr;
-			if ((denorm_c.g & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		#endif // PS_SHUFFLE_SAME
-	#endif // PS_SHUFFLE
-
 	// Must be done before alpha correction

 	// AA (Fixed one) will output a coverage of 1.0 as alpha
@ -1254,9 +1235,53 @@ void main()
 	o_col0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);

 #else
-
 	ps_blend(C, alpha_blend);

+#if PS_SHUFFLE
+		#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
+			uvec4 denorm_c_after = uvec4(C); // Repack the blended R/G/B/A back into the two 8-bit channels it was unpacked from; masks are unsigned to match the uvec4 operands.
+			#if PS_READ_BA
+				C.b = float(((denorm_c_after.r >> 3) & 0x1Fu) | ((denorm_c_after.g << 2) & 0xE0u));
+				C.a = float(((denorm_c_after.g >> 6) & 0x3u) | ((denorm_c_after.b >> 1) & 0x7Cu) | (denorm_c_after.a & 0x80u));
+			#else
+				C.r = float(((denorm_c_after.r >> 3) & 0x1Fu) | ((denorm_c_after.g << 2) & 0xE0u));
+				C.g = float(((denorm_c_after.g >> 6) & 0x3u) | ((denorm_c_after.b >> 1) & 0x7Cu) | (denorm_c_after.a & 0x80u));
+			#endif
+		#endif
+
+		uvec4 denorm_c = uvec4(C);
+		uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
+		
+		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
+		#if PS_SHUFFLE_SAME
+			#if (PS_READ_BA)
+				C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
+			#else
+				C.ga = C.rg;
+			#endif
+		// Copy of a 16bit source in to this target
+		#elif PS_READ16_SRC
+			C.rb = vec2(float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5)));
+			if ((denorm_c.a & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
+		// Write RB part. Mask will take care of the correct destination
+		#elif PS_READ_BA
+			C.rb = C.bb;
+			if ((denorm_c.a & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		#else
+			C.rb = C.rr;
+			if ((denorm_c.g & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		#endif // PS_SHUFFLE_SAME
+	#endif // PS_SHUFFLE
+
 	ps_dither(C.rgb);

 	// Color clamp/wrap needs to be done after sw blending and dithering
--- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
+++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
@ -5169,7 +5169,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 	}

 	bool blending_alpha_pass = false;
-	if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && (m_conf.colormask.wrgba & 0x7))
+	if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) || (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle)))
 	{
 		EmulateBlending(blend_alpha_min, blend_alpha_max, DATE_PRIMID, DATE_BARRIER, blending_alpha_pass);
 	}
--- a/pcsx2/GS/Renderers/Metal/tfx.metal
+++ b/pcsx2/GS/Renderers/Metal/tfx.metal
@ -807,6 +807,25 @@ struct PSMain
 		else
 			T = sample_color(st);

+		if (PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC)
+		{
+			uint4 denorm_c_before = uint4(T);
+			if (PS_READ_BA)
+			{
+				T.r = float((denorm_c_before.b << 3) & 0xF8);
+				T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
+				T.b = float((denorm_c_before.a << 1) & 0xF8);
+				T.a = float(denorm_c_before.a & 0x80);
+			}
+			else
+			{
+				T.r = float((denorm_c_before.r << 3) & 0xF8);
+				T.g = float(((denorm_c_before.r >> 2) & 0x38) | ((denorm_c_before.g << 6) & 0xC0));
+				T.b = float((denorm_c_before.g << 1) & 0xF8);
+				T.a = float(denorm_c_before.g & 0x80);
+			}
+		}
+	
 		float4 C = tfx(T, IIP ? in.c : in.fc);
 		if (!atst(C))
 			discard_fragment();
@ -1005,41 +1024,6 @@ struct PSMain

 		float4 C = ps_color();

-		if (PS_SHUFFLE)
-		{
-			uint4 denorm_c = uint4(C);
-			uint2 denorm_TA = uint2(cb.ta * 255.5f);
-
-			// Special case for 32bit input and 16bit output, shuffle used by The Godfather
-			if (PS_SHUFFLE_SAME)
-			{
-				if (PS_READ_BA)
-					C = (denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80);
-				else
-					C.ga = C.rg;
-			}
-			// Copy of a 16bit source in to this target
-			else if (PS_READ16_SRC)
-			{
-				C.rb = (denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5);
-				if (denorm_c.a & 0x80)
-					C.ga = (denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80);
-				else
-					C.ga = (denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80);
-			}
-			// Write RB part. Mask will take care of the correct destination
-			else if (PS_READ_BA)
-			{
-				C.rb = C.bb;	
-				C.ga = (denorm_c.a & 0x7F) | (denorm_c.a & 0x80 ? denorm_TA.y & 0x80 : denorm_TA.x & 0x80);
-			}
-			else
-			{
-				C.rb = C.rr;
-				C.ga = (denorm_c.g & 0x7F) | (denorm_c.g & 0x80 ? denorm_TA.y & 0x80 : denorm_TA.x & 0x80);
-			}
-		}
-
 		// Must be done before alpha correction

 		// AA (Fixed one) will output a coverage of 1.0 as alpha
@ -1077,6 +1061,56 @@ struct PSMain

 		ps_blend(C, alpha_blend);

+		if (PS_SHUFFLE)
+		{
+			if (!PS_SHUFFLE_SAME && !PS_READ16_SRC)
+			{
+				uint4 denorm_c_after = uint4(C);
+				if (PS_READ_BA)
+				{
+					C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+					C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+				}
+				else
+				{
+					C.r = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+					C.g = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+				}
+			}
+
+			uint4 denorm_c = uint4(C);
+			uint2 denorm_TA = uint2(cb.ta * 255.5f);
+
+			// Special case for 32bit input and 16bit output, shuffle used by The Godfather
+			if (PS_SHUFFLE_SAME)
+			{
+				if (PS_READ_BA)
+					C = (denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80);
+				else
+					C.ga = C.rg;
+			}
+			// Copy of a 16bit source in to this target
+			else if (PS_READ16_SRC)
+			{
+				C.rb = (denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5);
+				if (denorm_c.a & 0x80)
+					C.ga = (denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80);
+				else
+					C.ga = (denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80);
+			}
+			// Write RB part. Mask will take care of the correct destination
+			else if (PS_READ_BA)
+			{
+				C.rb = C.bb;	
+				C.ga = (denorm_c.a & 0x7F) | (denorm_c.a & 0x80 ? denorm_TA.y & 0x80 : denorm_TA.x & 0x80);
+			}
+			else
+			{
+				C.rb = C.rr;
+				C.ga = (denorm_c.g & 0x7F) | (denorm_c.g & 0x80 ? denorm_TA.y & 0x80 : denorm_TA.x & 0x80);
+			}
+		}
+		
 		ps_dither(C);

 		// Color clamp/wrap needs to be done after sw blending and dithering
--- a/pcsx2/ShaderCacheVersion.h
+++ b/pcsx2/ShaderCacheVersion.h
@ -3,4 +3,4 @@

 /// Version number for GS and other shaders. Increment whenever any of the contents of the
 /// shaders change, to invalidate the cache.
-static constexpr u32 SHADER_CACHE_VERSION = 37;
+static constexpr u32 SHADER_CACHE_VERSION = 38;