From d31bd97d5952e115f430bacc47c405b7ce9fb5b5 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 26 May 2015 14:59:07 +0200
Subject: [PATCH 1/4] gsdx-ogl: add a variable to select FB output

The output format is one of 32 bits, 24 bits or 16 bits.
---
 plugins/GSdx/GSDeviceOGL.cpp   | 1 +
 plugins/GSdx/GSDeviceOGL.h     | 4 +++-
 plugins/GSdx/GSRendererOGL.cpp | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp
index 6b5044603b..23c13de24a 100644
--- a/plugins/GSdx/GSDeviceOGL.cpp
+++ b/plugins/GSdx/GSDeviceOGL.cpp
@@ -627,6 +627,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
 		+ format("#define PS_WMT %d\n", sel.wmt)
 		+ format("#define PS_FMT %d\n", sel.fmt)
 		+ format("#define PS_IFMT %d\n", sel.ifmt)
+		+ format("#define PS_DFMT %d\n", sel.dfmt)
 		+ format("#define PS_AEM %d\n", sel.aem)
 		+ format("#define PS_TFX %d\n", sel.tfx)
 		+ format("#define PS_TCC %d\n", sel.tcc)
diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h
index c46df0dbb9..88b7a25231 100644
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@@ -321,7 +321,9 @@ class GSDeviceOGL : public GSDevice
 
 				// Word 2
 				uint32 blend:8;
-				uint32 _free2:24;
+				uint32 dfmt:2;
+
+				uint32 _free2:22;
 			};
 
 			uint64 key;
diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 0cffc776cc..1d1bf27721 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -249,6 +249,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 	GSDeviceOGL::OMColorMaskSelector om_csel;
 	GSDeviceOGL::OMDepthStencilSelector om_dssel;
 
+	// Format of the output
+	ps_sel.dfmt = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
+
 	// Blend
 
 	if (!IsOpaque())

From 9ee3a173d0a51d1401bd2775293c7b755c417730 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 26 May 2015 15:36:48 +0200
Subject: [PATCH 2/4] gsdx-ogl: use a local ALPHA register

This makes it easy to tune the parameters to support the 24-bit format.
---
 plugins/GSdx/GSDeviceOGL.h      |  4 ++--
 plugins/GSdx/GSRendererOGL.cpp  | 30 ++++++++++++++++--------------
 plugins/GSdx/GSTextureFXOGL.cpp |  8 ++++----
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h
index 88b7a25231..64f42e9f80 100644
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@@ -619,7 +619,7 @@ class GSDeviceOGL : public GSDevice
 	GLuint CreateSampler(bool bilinear, bool tau, bool tav);
 	GLuint CreateSampler(PSSamplerSelector sel);
 	GSDepthStencilOGL* CreateDepthStencil(OMDepthStencilSelector dssel);
-	GSBlendStateOGL* CreateBlend(OMBlendSelector bsel, uint8 afix);
+	GSBlendStateOGL* CreateBlend(OMBlendSelector bsel, float afix);
 
 
 	void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
@@ -628,7 +628,7 @@ class GSDeviceOGL : public GSDevice
 	void SetupPS(PSSelector sel);
 	void SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb);
 	void SetupSampler(PSSamplerSelector ssel);
-	void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix, bool sw_blending =  false);
+	void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix, bool sw_blending =  false);
 	GLuint GetSamplerID(PSSamplerSelector ssel);
 	GLuint GetPaletteSamplerID();
 
diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 1d1bf27721..42b5f4261f 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -252,16 +252,19 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 	// Format of the output
 	ps_sel.dfmt = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
 
+	GIFRegALPHA ALPHA = context->ALPHA;
+	float afix = (float)context->ALPHA.FIX / 0x80;
+
 	// Blend
 
 	if (!IsOpaque())
 	{
 		om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS;
 
-		om_bsel.a = context->ALPHA.A;
-		om_bsel.b = context->ALPHA.B;
-		om_bsel.c = context->ALPHA.C;
-		om_bsel.d = context->ALPHA.D;
+		om_bsel.a = ALPHA.A;
+		om_bsel.b = ALPHA.B;
+		om_bsel.c = ALPHA.C;
+		om_bsel.d = ALPHA.D;
 
 		if (env.PABE.PABE)
 		{
@@ -439,25 +442,25 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 
 	bool colclip_wrap = env.COLCLAMP.CLAMP == 0 && !tex && PRIM->PRIM != GS_POINTLIST && !m_accurate_colclip;
 	bool acc_colclip_wrap = env.COLCLAMP.CLAMP == 0 && m_accurate_colclip;
-	if (context->ALPHA.A == context->ALPHA.B) { // Optimize-away colclip
+	if (ALPHA.A == ALPHA.B) { // Optimize-away colclip
 		// No addition neither substraction so no risk of overflow the [0:255] range.
 		colclip_wrap = false;
 		acc_colclip_wrap = false;
 #ifdef ENABLE_OGL_DEBUG
 		if (colclip_wrap || acc_colclip_wrap) {
 			const char *col[3] = {"Cs", "Cd", "0"};
-			GL_INS("COLCLIP: DISABLED: blending is a plain copy of %s", col[context->ALPHA.D]);
+			GL_INS("COLCLIP: DISABLED: blending is a plain copy of %s", col[ALPHA.D]);
 		}
 #endif
 	}
 	if (colclip_wrap) {
 		ps_sel.colclip = 1;
-		GL_INS("COLCLIP ENABLED (blending is %d/%d/%d/%d)", context->ALPHA.A, context->ALPHA.B, context->ALPHA.C, context->ALPHA.D);
+		GL_INS("COLCLIP ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D);
 	} else if (acc_colclip_wrap) {
-			ps_sel.colclip = 3;
-			GL_INS("COLCLIP SW ENABLED (blending is %d/%d/%d/%d)", context->ALPHA.A, context->ALPHA.B, context->ALPHA.C, context->ALPHA.D);
-	} else if (env.COLCLAMP.CLAMP == 0 && (context->ALPHA.A != context->ALPHA.B)) {
-			GL_INS("COLCLIP NOT SUPPORTED (blending is %d/%d/%d/%d)", context->ALPHA.A, context->ALPHA.B, context->ALPHA.C, context->ALPHA.D);
+		ps_sel.colclip = 3;
+		GL_INS("COLCLIP SW ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D);
+	} else if (env.COLCLAMP.CLAMP == 0 && (ALPHA.A != ALPHA.B)) {
+		GL_INS("COLCLIP NOT SUPPORTED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D);
 	}
 
 	ps_sel.fba = context->FBA.FBA;
@@ -613,8 +616,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		dev->PSSetShaderResource(3, rt);
 
 		// Require the fix alpha vlaue
-		if (context->ALPHA.C == 2) {
-			ps_cb.AlphaCoeff = GSVector4((float)(int)context->ALPHA.FIX / 0x80);
+		if (ALPHA.C == 2) {
+			ps_cb.AlphaCoeff = GSVector4(afix);
 		}
 
 		// No need to flush for every primitive
@@ -632,7 +635,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 	dev->SetupPS(ps_sel);
 
 	// rs
-	uint8 afix = context->ALPHA.FIX;
 
 	GSVector4i scissor = GSVector4i(GSVector4(rtscale).xyxy() * context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
 
diff --git a/plugins/GSdx/GSTextureFXOGL.cpp b/plugins/GSdx/GSTextureFXOGL.cpp
index 4f0fb08247..7e3ddb09a7 100644
--- a/plugins/GSdx/GSTextureFXOGL.cpp
+++ b/plugins/GSdx/GSTextureFXOGL.cpp
@@ -100,7 +100,7 @@ GSDepthStencilOGL* GSDeviceOGL::CreateDepthStencil(OMDepthStencilSelector dssel)
 	return dss;
 }
 
-GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, uint8 afix)
+GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, float afix)
 {
 	GSBlendStateOGL* bs = new GSBlendStateOGL();
 
@@ -119,7 +119,7 @@ GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, uint8 afix)
 					bs->SetRGB(m_blendMapD3D9[i].op, m_blendMapD3D9[i].src, GL_ONE);
 			}
 
-			const string afixstr = format("%d >> 7", afix);
+			const string afixstr = format("%f", afix);
 			const char *col[3] = {"Cs", "Cd", "0"};
 			const char *alpha[3] = {"As", "Ad", afixstr.c_str()};
 			fprintf(stderr, "Impossible blend for D3D: (%s - %s) * %s + %s\n", col[bsel.a], col[bsel.b], alpha[bsel.c], col[bsel.d]);
@@ -235,7 +235,7 @@ GLuint GSDeviceOGL::GetPaletteSamplerID()
 	return m_palette_ss;
 }
 
-void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix, bool sw_blending)
+void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix, bool sw_blending)
 {
 	GSDepthStencilOGL* dss = m_om_dss[dssel];
 
@@ -267,5 +267,5 @@ void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, ui
 	// *************************************************************
 	// Dynamic
 	// *************************************************************
-	OMSetBlendState(bs, (float)(int)afix / 0x80);
+	OMSetBlendState(bs, afix);
 }

From 419dfe054464eeacadd1fb725c1ca8a1687571d4 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 26 May 2015 16:16:36 +0200
Subject: [PATCH 3/4] glsl: redo color/alpha management correction

Please test it!

GS supports 3 formats for the output:

32 bits: normal case
=> no change

24 bits: like 32 bits but without alpha channel
=> mask alpha channel (ie don't write it anymore)
=> Always uses 1.0f as blending coefficient

16 bits: RGB5A1, emulated by a 32-bit OpenGL texture. I think it would be more correct to use
a real 16-bit GL texture. Unfortunately, that would cost several (slow) target conversions.
Anyway, as the current solution:
=> apply a mask of 0xF8 on the color when SW blending is used (improves the Castlevania shadows)
Unfortunately, the normal blending mode still uses the full range of colors!

This commit also corrects a couple of blending factors. On the PS2, 128/255 is equivalent to 1.0f, whereas on the GPU 1.0f is 255/255. So the blending factor must be 255/128 instead of 2.

Note: disable the CRC hack and enable accurate_colclip to see the Castlevania shadows ^^
(issue #380).
Note2: the SW renderer is darker on Castlevania. I don't know why; it may be linked to the poorly emulated 16-bit format.
---
 plugins/GSdx/GSRendererOGL.cpp    |  7 ++++++
 plugins/GSdx/res/glsl/tfx_fs.glsl | 37 ++++++++++++++++++++++++-------
 plugins/GSdx/res/glsl_source.h    | 37 ++++++++++++++++++++++++-------
 3 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 42b5f4261f..030961b540 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -288,6 +288,13 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 	}
 
 	om_csel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
+	if (ps_sel.dfmt == 1) {
+		// The 24-bit format has no alpha channel, so use a fixed factor of 1.0f as an equivalent
+		ALPHA.C = 2;
+		afix = 1.0f;
+		// Disable writing of the alpha channel
+		om_csel.wa = 0;
+	}
 
 	if (DATE) {
 		if (GLLoader::found_GL_ARB_texture_barrier && !PrimitiveOverlap()) {
diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index a36fcb1444..c0f8e00056 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -404,8 +404,13 @@ vec4 ps_color()
 void ps_blend(inout vec4 c, in float As)
 {
 	vec4 rt = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0);
+#if PS_DFMT == FMT_24
+	float Ad = 1.0f;
+#else
+	// FIXME FMT_16 case
 	// FIXME Ad or Ad * 2?
-	float Ad = rt.a;
+	float Ad = rt.a * 255.0f / 128.0f;
+#endif
 	// Let the compiler do its jobs !
 	vec3 Cd = rt.rgb;
 	vec3 Cs = c.rgb;
@@ -640,12 +645,26 @@ void ps_blend(inout vec4 c, in float As)
 
 #endif
 
-#if PS_COLCLIP == 3
+	// FIXME dithering
+
+	// Correct the Color value based on the output format
+#if PS_COLCLIP != 3
+	// Standard Clamp
+	c.rgb = clamp(c.rgb, vec3(0.0f), vec3(1.0f));
+#endif
+
+#if PS_DFMT == FMT_16
+	// In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania
+
+	// Basically we want to do 'c.rgb &= 0xF8' in denormalized mode
+	c.rgb = vec3(uvec3((c.rgb * 255.0f) + 256.5f) & uvec3(0xF8)) / 255.0f;
+#elif PS_COLCLIP == 3
+	// Basically we want to do 'c.rgb &= 0xFF' in denormalized mode
 	c.rgb = vec3(uvec3((c.rgb * 255.0f) + 256.5f) & uvec3(0xFF)) / 255.0f;
+#endif
 
 	// Don't compile => unable to find compatible overloaded function "mod(vec3)"
 	//c.rgb = mod((c.rgb * 255.0f) + 256.5f) / 255.0f;
-#endif
 }
 
 void ps_main()
@@ -700,14 +719,16 @@ void ps_main()
 	c.a = 0.5f;
 #endif
 
-	float alpha = c.a * 2.0;
+	// Must be done before alpha correction
+	float alpha = c.a * 255.0f / 128.0f;
 
-#if (PS_AOUT != 0) // 16 bit output
+	// Correct the ALPHA value based on the output format
+	// FIXME add support of alpha mask to replace properly PS_AOUT
+#if (PS_DFMT == FMT_16) || (PS_AOUT)
 	float a = 128.0f / 255.0; // alpha output will be 0x80
-
 	c.a = (PS_FBA != 0) ? a : step(0.5, c.a) * a;
-#elif (PS_FBA != 0)
-	if(c.a < 0.5) c.a += 0.5;
+#elif (PS_DFMT == FMT_32) && (PS_FBA != 0)
+	if(c.a < 0.5) c.a += 128.0f/255.0f;
 #endif
 
 	// Get first primitive that will write a failling alpha value
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index 8a1fa11d27..1a6cc4dacd 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -1157,8 +1157,13 @@ static const char* tfx_fs_all_glsl =
 	"void ps_blend(inout vec4 c, in float As)\n"
 	"{\n"
 	"	vec4 rt = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0);\n"
+	"#if PS_DFMT == FMT_24\n"
+	"	float Ad = 1.0f;\n"
+	"#else\n"
+	"	// FIXME FMT_16 case\n"
 	"	// FIXME Ad or Ad * 2?\n"
-	"	float Ad = rt.a;\n"
+	"	float Ad = rt.a * 255.0f / 128.0f;\n"
+	"#endif\n"
 	"	// Let the compiler do its jobs !\n"
 	"	vec3 Cd = rt.rgb;\n"
 	"	vec3 Cs = c.rgb;\n"
@@ -1393,12 +1398,26 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"#endif\n"
 	"\n"
-	"#if PS_COLCLIP == 3\n"
+	"	// FIXME dithering\n"
+	"\n"
+	"	// Correct the Color value based on the output format\n"
+	"#if PS_COLCLIP != 3\n"
+	"	// Standard Clamp\n"
+	"	c.rgb = clamp(c.rgb, vec3(0.0f), vec3(1.0f));\n"
+	"#endif\n"
+	"\n"
+	"#if PS_DFMT == FMT_16\n"
+	"	// In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania\n"
+	"\n"
+	"	// Basically we want to do 'c.rgb &= 0xF8' in denormalized mode\n"
+	"	c.rgb = vec3(uvec3((c.rgb * 255.0f) + 256.5f) & uvec3(0xF8)) / 255.0f;\n"
+	"#elif PS_COLCLIP == 3\n"
+	"	// Basically we want to do 'c.rgb &= 0xFF' in denormalized mode\n"
 	"	c.rgb = vec3(uvec3((c.rgb * 255.0f) + 256.5f) & uvec3(0xFF)) / 255.0f;\n"
+	"#endif\n"
 	"\n"
 	"	// Don't compile => unable to find compatible overloaded function \"mod(vec3)\"\n"
 	"	//c.rgb = mod((c.rgb * 255.0f) + 256.5f) / 255.0f;\n"
-	"#endif\n"
 	"}\n"
 	"\n"
 	"void ps_main()\n"
@@ -1453,14 +1472,16 @@ static const char* tfx_fs_all_glsl =
 	"	c.a = 0.5f;\n"
 	"#endif\n"
 	"\n"
-	"	float alpha = c.a * 2.0;\n"
+	"	// Must be done before alpha correction\n"
+	"	float alpha = c.a * 255.0f / 128.0f;\n"
 	"\n"
-	"#if (PS_AOUT != 0) // 16 bit output\n"
+	"	// Correct the ALPHA value based on the output format\n"
+	"	// FIXME add support of alpha mask to replace properly PS_AOUT\n"
+	"#if (PS_DFMT == FMT_16) || (PS_AOUT)\n"
 	"	float a = 128.0f / 255.0; // alpha output will be 0x80\n"
-	"\n"
 	"	c.a = (PS_FBA != 0) ? a : step(0.5, c.a) * a;\n"
-	"#elif (PS_FBA != 0)\n"
-	"	if(c.a < 0.5) c.a += 0.5;\n"
+	"#elif (PS_DFMT == FMT_32) && (PS_FBA != 0)\n"
+	"	if(c.a < 0.5) c.a += 128.0f/255.0f;\n"
 	"#endif\n"
 	"\n"
 	"	// Get first primitive that will write a failling alpha value\n"

From c43ddaec4f765bddd2775a2d64571a32c52c6ad8 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 26 May 2015 17:03:13 +0200
Subject: [PATCH 4/4] gsdx: add Castlevania hack explanation

---
 plugins/GSdx/GSState.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index 16fbd40b49..c55bdf73f3 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -4314,6 +4314,18 @@ bool GSC_Castlevania(const GSFrameInfo& fi, int& skip)
 {
 	if(skip == 0)
 	{
+		// This hack removes the shadows and the globally darker image
+		// I think there are 2 issues in GSdx
+		//
+		// 1/ colclip is potentially not supported correctly.
+		//
+		// 2/ use of a 32-bit format to emulate a 16-bit format
+		// For example, if you blend the value 4 onto a dark destination pixel 64 times
+		//
+		// FMT32: 4*64 = 256 <= white pixels
+		//
+		// FMT16: the output of blending will always be 0 because the 3 LSBs of the color are dropped.
+		//		  Therefore the pixel remains dark !!!
 		if(fi.TME && fi.FBP == 0 && fi.TBP0 && fi.TPSM == 10 && fi.FBMSK == 0xFFFFFF)
 		{
 			skip = 2;