diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp
index 0fef4ec759..618d8f2a6b 100644
--- a/plugins/GSdx/GSDeviceOGL.cpp
+++ b/plugins/GSdx/GSDeviceOGL.cpp
@@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
 	std::string macro = format("#define PS_FST %d\n", sel.fst)
 		+ format("#define PS_WMS %d\n", sel.wms)
 		+ format("#define PS_WMT %d\n", sel.wmt)
-		+ format("#define PS_FMT %d\n", sel.fmt)
-		+ format("#define PS_IFMT %d\n", sel.ifmt)
+		+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
 		+ format("#define PS_DFMT %d\n", sel.dfmt)
 		+ format("#define PS_AEM %d\n", sel.aem)
 		+ format("#define PS_TFX %d\n", sel.tfx)
@@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest()
 	PRINT_TEST("Tfx/Tcc");
 
 	// Test: Texture Sampling
-	for (int fmt = 0; fmt < 8; fmt++) {
+	for (int fmt = 0; fmt < 16; fmt++) {
 		if ((fmt & 3) == 3) continue;
 
 		for (int ltf = 0; ltf < 2; ltf++) {
 			for (int aem = 0; aem < 2; aem++) {
-				for (int ifmt = 0; ifmt < 3; ifmt++) {
-					for (int wms = 1; wms < 4; wms++) {
-						for (int wmt = 1; wmt < 4; wmt++) {
-							PSSelector sel;
-							sel.atst = 1;
-							sel.tfx = 1;
-							sel.tcc = 1;
-							sel.fst = 1;
+				for (int wms = 1; wms < 4; wms++) {
+					for (int wmt = 1; wmt < 4; wmt++) {
+						PSSelector sel;
+						sel.atst = 1;
+						sel.tfx  = 1;
+						sel.tcc  = 1;
+						sel.fst = 1;
 
-							sel.ltf = ltf;
-							sel.aem = aem;
-							sel.fmt = fmt;
-							sel.ifmt = ifmt;
-							sel.wms = wms;
-							sel.wmt = wmt;
-							std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
-									ltf, aem, fmt, ifmt, wms, wmt);
-							RUN_TEST;
-						}
+						sel.ltf     = ltf;
+						sel.aem     = aem;
+						sel.tex_fmt = fmt;
+						sel.wms     = wms;
+						sel.wmt     = wmt;
+						std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
+								ltf, aem, fmt, wms, wmt);
+						RUN_TEST;
 					}
 				}
 			}
diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h
index 04bd033905..d170650dfa 100644
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
 			{
 				// *** Word 1
 				// Format
-				uint32 fmt:3;
-				uint32 ifmt:2;
+				uint32 tex_fmt:4;
 				uint32 dfmt:2;
 				// Alpha extension/Correction
 				uint32 aem:1;
@@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
 				uint32 write_rg:1;
 				uint32 fbmask:1;
 
-				uint32 _free1:1;
+				uint32 _free1:2;
 
 				// *** Word 2
 				// Blend and Colclip
diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 506498075f..bb3a8c7b0d 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.wms = m_context->CLAMP.WMS;
 		ps_sel.wmt = m_context->CLAMP.WMT;
 
+		// Performance note:
+		// 1/ Don't set 0 as it is the default value
+		// 2/ Only keep aem when it is useful (avoid useless shader permutation)
 		if (ps_sel.shuffle) {
-			ps_sel.fmt = 0;
-		} else if (tex->m_palette) {
-			ps_sel.fmt = cpsm.fmt | 4;
-			ps_sel.ifmt = !tex->m_target ? 0
-				: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
-				: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
-				: 0;
+			// Force a 32 bits access (normally shuffle is done on 16 bits)
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			ps_sel.aem     = m_env.TEXA.AEM;
+			ASSERT(tex->m_target);
 
-			// In standard mode palette is only used when alpha channel of the RT is
-			// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
-			// It is a very bad idea to force bilinear filtering on it.
-			if (tex->m_target)
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// FIXME: it is likely a bad idea to do the bilinear interpolation here
+			// bilinear &= m_vt.IsLinear();
+
+		} else if (tex->m_target) {
+			// Use an old target. AEM and index aren't resolved yet, so it must be
+			// done on the GPU
+
+			// Select the 32/24/16 bits color (AEM)
+			ps_sel.tex_fmt = cpsm.fmt;
+			ps_sel.aem     = m_env.TEXA.AEM;
+
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// Select the index format
+			if (tex->m_palette) {
+				// FIXME Potentially improve fmt field in GSLocalMemory
+				if (m_context->TEX0.PSM == PSM_PSMT4HL)
+					ps_sel.tex_fmt |= 1 << 2;
+				else if (m_context->TEX0.PSM == PSM_PSMT4HH)
+					ps_sel.tex_fmt |= 2 << 2;
+				else
+					ps_sel.tex_fmt |= 3 << 2;
+
+				// Alpha channel of the RT is reinterpreted as an index. Star
+				// Ocean 3 uses it to emulate a stencil buffer.  It is a very
+				// bad idea to force bilinear filtering on it.
 				bilinear &= m_vt.IsLinear();
+			}
+
+		} else if (tex->m_palette) {
+			// Use a standard 8 bits texture. AEM is already done on the CLUT,
+			// therefore you only need to set the index
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
+
+			// Note: 4 bits indexes are converted to 8 bits
+			ps_sel.tex_fmt = 3 << 2;
 
-			//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
 		} else {
-			ps_sel.fmt = cpsm.fmt;
+			// Standard texture. Both index and AEM expansion were already done by the CPU.
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
 		}
-		ps_sel.aem = m_env.TEXA.AEM;
 
 		if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
 			// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
@@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
 		ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
 
-		GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
-		ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));
 
 		// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
 		ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);
diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index 754ffbf9b0..67c71fba46 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -6,7 +6,9 @@
 #define FMT_32 0
 #define FMT_24 1
 #define FMT_16 2
-#define FMT_PAL 4 /* flag bit */
+
+#define PS_PAL_FMT (PS_TEX_FMT >> 2)
+#define PS_AEM_FMT (PS_TEX_FMT & 3)
 
 // APITRACE_DEBUG enables forced pixel output to easily detect
 // the fragment computed by primitive
@@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)
 
     uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value
 
-#if PS_IFMT == 1
-    // 4HH
-    return vec4(i >> 4u) / 255.0f;
-
-#elif PS_IFMT == 2
-    // 4HL
+#if PS_PAL_FMT == 1
+	// 4HL
     return vec4(i & 0xFu) / 255.0f;
 
+#elif PS_PAL_FMT == 2
+	// 4HH
+    return vec4(i >> 4u) / 255.0f;
+
 #else
     // Most of texture will hit this code so keep normalized float value
 
@@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
     vec2 dd;
 
     // FIXME I'm not sure this condition is useful (I think code will be optimized)
-#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
+#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
     // No software LTF and pure 32 bits RGBA texure without special texture wrapping
     c[0] = sample_c(st);
 #ifdef TEX_COORD_DEBUG
@@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)
 
     uv = clamp_wrap_uv(uv);
 
-    if((PS_FMT & FMT_PAL) != 0)
-    {
-        c = sample_4p(sample_4_index(uv));
-    }
-    else
-    {
-        c = sample_4c(uv);
-    }
+#if PS_PAL_FMT != 0
+    c = sample_4p(sample_4_index(uv));
+#else
+    c = sample_4c(uv);
+#endif
+
 #ifdef TEX_COORD_DEBUG
     c[0].rg = uv.xy;
     c[1].rg = uv.xy;
@@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q)
 
 #endif
 
-    // PERF: see the impact of the exansion before/after the interpolation
-    for (int i = 0; i < 4; i++)
-    {
-        // PERF note: using dot product reduces by 1 the number of instruction
-        // but I'm not sure it is equivalent neither faster.
+	// PERF note: using dot product reduces by 1 the number of instruction
+	// but I'm not sure it is equivalent neither faster.
+	for (int i = 0; i < 4; i++)
+	{
         //float sum = dot(c[i].rgb, vec3(1.0f));
-#if ((PS_FMT & ~FMT_PAL) == FMT_24)
-        c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
-        //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
-#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
-        c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
-        //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#if (PS_AEM_FMT == FMT_24)
+		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
+		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#elif (PS_AEM_FMT == FMT_16)
+		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
+		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
 #endif
     }
 
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index fc9ecc217b..ccffb134a8 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
 	"#define FMT_32 0\n"
 	"#define FMT_24 1\n"
 	"#define FMT_16 2\n"
-	"#define FMT_PAL 4 /* flag bit */\n"
+	"\n"
+	"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
+	"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
 	"\n"
 	"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
 	"// the fragment computed by primitive\n"
@@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
 	"\n"
-	"#if PS_IFMT == 1\n"
-	"    // 4HH\n"
-	"    return vec4(i >> 4u) / 255.0f;\n"
-	"\n"
-	"#elif PS_IFMT == 2\n"
-	"    // 4HL\n"
+	"#if PS_PAL_FMT == 1\n"
+	"	// 4HL\n"
 	"    return vec4(i & 0xFu) / 255.0f;\n"
 	"\n"
+	"#elif PS_PAL_FMT == 2\n"
+	"	// 4HH\n"
+	"    return vec4(i >> 4u) / 255.0f;\n"
+	"\n"
 	"#else\n"
 	"    // Most of texture will hit this code so keep normalized float value\n"
 	"\n"
@@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
 	"    vec2 dd;\n"
 	"\n"
 	"    // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
-	"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
+	"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
 	"    // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
 	"    c[0] = sample_c(st);\n"
 	"#ifdef TEX_COORD_DEBUG\n"
@@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uv = clamp_wrap_uv(uv);\n"
 	"\n"
-	"    if((PS_FMT & FMT_PAL) != 0)\n"
-	"    {\n"
-	"        c = sample_4p(sample_4_index(uv));\n"
-	"    }\n"
-	"    else\n"
-	"    {\n"
-	"        c = sample_4c(uv);\n"
-	"    }\n"
+	"#if PS_PAL_FMT != 0\n"
+	"    c = sample_4p(sample_4_index(uv));\n"
+	"#else\n"
+	"    c = sample_4c(uv);\n"
+	"#endif\n"
+	"\n"
 	"#ifdef TEX_COORD_DEBUG\n"
 	"    c[0].rg = uv.xy;\n"
 	"    c[1].rg = uv.xy;\n"
@@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"#endif\n"
 	"\n"
-	"    // PERF: see the impact of the exansion before/after the interpolation\n"
-	"    for (int i = 0; i < 4; i++)\n"
-	"    {\n"
-	"        // PERF note: using dot product reduces by 1 the number of instruction\n"
-	"        // but I'm not sure it is equivalent neither faster.\n"
+	"	// PERF note: using dot product reduces by 1 the number of instruction\n"
+	"	// but I'm not sure it is equivalent neither faster.\n"
+	"	for (int i = 0; i < 4; i++)\n"
+	"	{\n"
 	"        //float sum = dot(c[i].rgb, vec3(1.0f));\n"
-	"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
-	"        c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
-	"        //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
-	"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
-	"        c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
-	"        //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#if (PS_AEM_FMT == FMT_24)\n"
+	"		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#elif (PS_AEM_FMT == FMT_16)\n"
+	"		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
 	"#endif\n"
 	"    }\n"
 	"\n"