From 78dd9577174857f1218b2b87457fa1470a6d7140 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 14 Aug 2015 17:53:41 +0200
Subject: [PATCH 1/4] gsdx-ogl: use normalized index coordinate for palette
 texture

In palette mode, 90% of texture accesses are done in 8 bits.
So let's keep this path as light as possible. It reduces GPU load.
---
 plugins/GSdx/res/glsl/tfx_fs.glsl | 18 +++++++++++-------
 plugins/GSdx/res/glsl_source.h    | 18 +++++++++++-------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index af71932801..df482558ed 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -80,9 +80,9 @@ vec4 sample_c(vec2 uv)
 	return texture(TextureSampler, uv);
 }
 
-vec4 sample_p(uint idx)
+vec4 sample_p(float idx)
 {
-	return texelFetch(PaletteSampler, ivec2(idx, 0u), 0);
+	return texture(PaletteSampler, vec2(idx, 0.0f));
 }
 
 vec4 wrapuv(vec4 uv)
@@ -149,7 +149,7 @@ mat4 sample_4c(vec4 uv)
 	return c;
 }
 
-uvec4 sample_4_index(vec4 uv)
+vec4 sample_4_index(vec4 uv)
 {
 	vec4 c;
 
@@ -169,18 +169,22 @@ uvec4 sample_4_index(vec4 uv)
 
 #if PS_IFMT == 1
 	// 4HH
-	return i >> 4u;
+	return vec4(i >> 4u) / 255.0f;
+
 #elif PS_IFMT == 2
 	// 4HL
-	return i & 0xFu;
+	return vec4(i & 0xFu) / 255.0f;
+
 #else
+	// Most of texture will hit this code so keep normalized float value
+
 	// 8 bits
-	return i;
+	return c;
 #endif
 
 }
 
-mat4 sample_4p(uvec4 u)
+mat4 sample_4p(vec4 u)
 {
 	mat4 c;
 
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index 27cb31c417..3139c86853 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -939,9 +939,9 @@ static const char* tfx_fs_all_glsl =
 	"	return texture(TextureSampler, uv);\n"
 	"}\n"
 	"\n"
-	"vec4 sample_p(uint idx)\n"
+	"vec4 sample_p(float idx)\n"
 	"{\n"
-	"	return texelFetch(PaletteSampler, ivec2(idx, 0u), 0);\n"
+	"	return texture(PaletteSampler, vec2(idx, 0.0f));\n"
 	"}\n"
 	"\n"
 	"vec4 wrapuv(vec4 uv)\n"
@@ -1008,7 +1008,7 @@ static const char* tfx_fs_all_glsl =
 	"	return c;\n"
 	"}\n"
 	"\n"
-	"uvec4 sample_4_index(vec4 uv)\n"
+	"vec4 sample_4_index(vec4 uv)\n"
 	"{\n"
 	"	vec4 c;\n"
 	"\n"
@@ -1028,18 +1028,22 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"#if PS_IFMT == 1\n"
 	"	// 4HH\n"
-	"	return i >> 4u;\n"
+	"	return vec4(i >> 4u) / 255.0f;\n"
+	"\n"
 	"#elif PS_IFMT == 2\n"
 	"	// 4HL\n"
-	"	return i & 0xFu;\n"
+	"	return vec4(i & 0xFu) / 255.0f;\n"
+	"\n"
 	"#else\n"
+	"	// Most of texture will hit this code so keep normalized float value\n"
+	"\n"
 	"	// 8 bits\n"
-	"	return i;\n"
+	"	return c;\n"
 	"#endif\n"
 	"\n"
 	"}\n"
 	"\n"
-	"mat4 sample_4p(uvec4 u)\n"
+	"mat4 sample_4p(vec4 u)\n"
 	"{\n"
 	"	mat4 c;\n"
 	"\n"

From 53d1fdd8f1327eaa81336938ede853879f1ee2b5 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 14 Aug 2015 20:14:36 +0200
Subject: [PATCH 2/4] glsl:debug: disable fst when testing texturing shader

Reduce clutter in ASM dump
---
 plugins/GSdx/GSDeviceOGL.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp
index 2c8e511439..9777b96228 100644
--- a/plugins/GSdx/GSDeviceOGL.cpp
+++ b/plugins/GSdx/GSDeviceOGL.cpp
@@ -826,6 +826,7 @@ void GSDeviceOGL::SelfShaderTest()
 							sel.atst = 1;
 							sel.tfx = 1;
 							sel.tcc = 1;
+							sel.fst = 1;
 
 							sel.ltf = ltf;
 							sel.aem = aem;

From c5a786ed2c27ed0669f9518ce9e7a6e499a09d0c Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 14 Aug 2015 20:57:45 +0200
Subject: [PATCH 3/4] gsdx-ogl: remove support WMS/T == 2 in hardware unit

I think the behavior was wrong because only the first texel coordinate was clamped.

Besides, we can't interpolate if AEM hasn't been applied yet.
---
 plugins/GSdx/GSRendererOGL.cpp    |  2 +-
 plugins/GSdx/res/glsl/tfx_fs.glsl | 38 ++++++++++++++-----------------
 plugins/GSdx/res/glsl_source.h    | 38 ++++++++++++++-----------------
 3 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 043684b00c..1b9e122954 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -781,7 +781,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[m_context->TEX0.PSM];
 		const GSLocalMemory::psm_t &cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[m_context->TEX0.CPSM] : psm;
 		bool bilinear = m_filter == 2 ? m_vt.IsLinear() : m_filter != 0;
-		bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && m_context->CLAMP.WMS < 3 && m_context->CLAMP.WMT < 3;
+		bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && m_context->CLAMP.WMS < 2 && m_context->CLAMP.WMT < 2;
 		// Don't force extra filtering on sprite (it creates various upscaling issue)
 		bilinear &= !((m_vt.m_primclass == GS_SPRITE_CLASS) && m_userhacks_round_sprite_offset && !m_vt.IsLinear());
 
diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index df482558ed..beac76b0f1 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -63,16 +63,25 @@ layout(std140, binding = 21) uniform cb21
 {
 	vec3 FogColor;
 	float AREF;
+
 	vec4 WH;
+
 	vec2 MinF;
 	vec2 TA;
+
 	uvec4 MskFix;
+
 	uvec4 FbMask;
-	vec3 _not_yet_used;
+
+	vec3 _pad1;
 	float Af;
+
 	vec4 HalfTexel;
+
 	vec4 MinMax;
+
 	vec2 TC_OffsetHack;
+	vec2 _pad2;
 };
 
 vec4 sample_c(vec2 uv)
@@ -85,7 +94,7 @@ vec4 sample_p(float idx)
 	return texture(PaletteSampler, vec2(idx, 0.0f));
 }
 
-vec4 wrapuv(vec4 uv)
+vec4 clamp_wrap_uv(vec4 uv)
 {
 	vec4 uv_out = uv;
 
@@ -120,21 +129,6 @@ vec4 wrapuv(vec4 uv)
 	return uv_out;
 }
 
-vec2 clampuv(vec2 uv)
-{
-	vec2 uv_out = uv;
-
-#if (PS_WMS == 2) && (PS_WMT == 2)
-	uv_out = clamp(uv, MinF, MinMax.zw);
-#elif PS_WMS == 2
-	uv_out.x = clamp(uv.x, MinF.x, MinMax.z);
-#elif PS_WMT == 2
-	uv_out.y = clamp(uv.y, MinF.y, MinMax.w);
-#endif
-
-	return uv_out;
-}
-
 mat4 sample_4c(vec4 uv)
 {
 	mat4 c;
@@ -211,10 +205,12 @@ vec4 sample_color(vec2 st, float q)
 	mat4 c;
 	vec2 dd;
 
-#if (PS_LTF == 0 && PS_FMT <= FMT_16 && PS_WMS < 3 && PS_WMT < 3)
-	c[0] = sample_c(clampuv(st));
+    // FIXME I'm not sure this condition is useful (I think code will be optimized)
+#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
+	// No software LTF and pure 32 bits RGBA texure without special texture wrapping
+	c[0] = sample_c(st);
 #ifdef TEX_COORD_DEBUG
-	c[0].rg = clampuv(st).xy;
+	c[0].rg = st.xy;
 #endif
 
 #else
@@ -230,7 +226,7 @@ vec4 sample_color(vec2 st, float q)
 		uv = st.xyxy;
 	}
 
-	uv = wrapuv(uv);
+	uv = clamp_wrap_uv(uv);
 
 	if((PS_FMT & FMT_PAL) != 0)
 	{
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index 3139c86853..413c7bd8fd 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -922,16 +922,25 @@ static const char* tfx_fs_all_glsl =
 	"{\n"
 	"	vec3 FogColor;\n"
 	"	float AREF;\n"
+	"\n"
 	"	vec4 WH;\n"
+	"\n"
 	"	vec2 MinF;\n"
 	"	vec2 TA;\n"
+	"\n"
 	"	uvec4 MskFix;\n"
+	"\n"
 	"	uvec4 FbMask;\n"
-	"	vec3 _not_yet_used;\n"
+	"\n"
+	"	vec3 _pad1;\n"
 	"	float Af;\n"
+	"\n"
 	"	vec4 HalfTexel;\n"
+	"\n"
 	"	vec4 MinMax;\n"
+	"\n"
 	"	vec2 TC_OffsetHack;\n"
+	"	vec2 _pad2;\n"
 	"};\n"
 	"\n"
 	"vec4 sample_c(vec2 uv)\n"
@@ -944,7 +953,7 @@ static const char* tfx_fs_all_glsl =
 	"	return texture(PaletteSampler, vec2(idx, 0.0f));\n"
 	"}\n"
 	"\n"
-	"vec4 wrapuv(vec4 uv)\n"
+	"vec4 clamp_wrap_uv(vec4 uv)\n"
 	"{\n"
 	"	vec4 uv_out = uv;\n"
 	"\n"
@@ -979,21 +988,6 @@ static const char* tfx_fs_all_glsl =
 	"	return uv_out;\n"
 	"}\n"
 	"\n"
-	"vec2 clampuv(vec2 uv)\n"
-	"{\n"
-	"	vec2 uv_out = uv;\n"
-	"\n"
-	"#if (PS_WMS == 2) && (PS_WMT == 2)\n"
-	"	uv_out = clamp(uv, MinF, MinMax.zw);\n"
-	"#elif PS_WMS == 2\n"
-	"	uv_out.x = clamp(uv.x, MinF.x, MinMax.z);\n"
-	"#elif PS_WMT == 2\n"
-	"	uv_out.y = clamp(uv.y, MinF.y, MinMax.w);\n"
-	"#endif\n"
-	"\n"
-	"	return uv_out;\n"
-	"}\n"
-	"\n"
 	"mat4 sample_4c(vec4 uv)\n"
 	"{\n"
 	"	mat4 c;\n"
@@ -1070,10 +1064,12 @@ static const char* tfx_fs_all_glsl =
 	"	mat4 c;\n"
 	"	vec2 dd;\n"
 	"\n"
-	"#if (PS_LTF == 0 && PS_FMT <= FMT_16 && PS_WMS < 3 && PS_WMT < 3)\n"
-	"	c[0] = sample_c(clampuv(st));\n"
+	"    // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
+	"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
+	"	// No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
+	"	c[0] = sample_c(st);\n"
 	"#ifdef TEX_COORD_DEBUG\n"
-	"	c[0].rg = clampuv(st).xy;\n"
+	"	c[0].rg = st.xy;\n"
 	"#endif\n"
 	"\n"
 	"#else\n"
@@ -1089,7 +1085,7 @@ static const char* tfx_fs_all_glsl =
 	"		uv = st.xyxy;\n"
 	"	}\n"
 	"\n"
-	"	uv = wrapuv(uv);\n"
+	"	uv = clamp_wrap_uv(uv);\n"
 	"\n"
 	"	if((PS_FMT & FMT_PAL) != 0)\n"
 	"	{\n"

From 37f9bcf9cb5bb6372acbe4394586fc99e301bc79 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 14 Aug 2015 23:53:01 +0200
Subject: [PATCH 4/4] gsdx-ogl: reduce state change

* don't dirty aref when a fog color is uploaded
* only enable sampler clamping in CLAMP mode (REGION_CLAMP is handled in the shader)

v2: fix SSE2/3 compilation
---
 plugins/GSdx/GSRendererOGL.cpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 1b9e122954..63d6cb81c1 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -760,7 +760,13 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 	{
 		ps_sel.fog = 1;
 
-		ps_cb.FogColor_AREF = GSVector4::rgba32(m_env.FOGCOL.u32[0]);
+		GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]);
+#if _M_SSE >= 0x401
+		// Blend AREF to avoid to load a random value for alpha (dirty cache)
+		ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF);
+#else
+		ps_cb.FogColor_AREF = fc;
+#endif
 	}
 
 	if (m_context->TEST.ATE)
@@ -831,13 +837,17 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 
 		if (PRIM->FST)
 		{
+			// FIXME move it in the ps_cb
 			vs_cb.TextureScale = GSVector4(1.0f / 16) / WH.xyxy();
 			ps_sel.fst = 1;
 		}
 
 		ps_cb.WH = WH;
 		ps_cb.HalfTexel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw();
-		ps_cb.MskFix = GSVector4i(m_context->CLAMP.MINU, m_context->CLAMP.MINV, m_context->CLAMP.MAXU, m_context->CLAMP.MAXV);
+		if ((m_context->CLAMP.WMS | m_context->CLAMP.WMT) > 1) {
+			ps_cb.MskFix = GSVector4i(m_context->CLAMP.MINU, m_context->CLAMP.MINV, m_context->CLAMP.MAXU, m_context->CLAMP.MAXV);
+			ps_cb.MinMax = GSVector4(ps_cb.MskFix) / WH.xyxy();
+		}
 
 		// TC Offset Hack
 		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
@@ -849,8 +859,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_cb.MinMax = clamp / WH.xyxy();
 		ps_cb.MinF_TA = (clamp + 0.5f).xyxy(ta) / WH.xyxy(GSVector4(255, 255));
 
-		ps_ssel.tau = (m_context->CLAMP.WMS + 3) >> 1;
-		ps_ssel.tav = (m_context->CLAMP.WMT + 3) >> 1;
+		// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
+		ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);
+		ps_ssel.tav = (m_context->CLAMP.WMT != CLAMP_CLAMP);
 		ps_ssel.ltf = bilinear && simple_sample;
 
 		// Setup Texture ressources