diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp
index 0fef4ec759..618d8f2a6b 100644
--- a/plugins/GSdx/GSDeviceOGL.cpp
+++ b/plugins/GSdx/GSDeviceOGL.cpp
@@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
 	std::string macro = format("#define PS_FST %d\n", sel.fst)
 		+ format("#define PS_WMS %d\n", sel.wms)
 		+ format("#define PS_WMT %d\n", sel.wmt)
-		+ format("#define PS_FMT %d\n", sel.fmt)
-		+ format("#define PS_IFMT %d\n", sel.ifmt)
+		+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
 		+ format("#define PS_DFMT %d\n", sel.dfmt)
 		+ format("#define PS_AEM %d\n", sel.aem)
 		+ format("#define PS_TFX %d\n", sel.tfx)
@@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest()
 	PRINT_TEST("Tfx/Tcc");
 
 	// Test: Texture Sampling
-	for (int fmt = 0; fmt < 8; fmt++) {
+	for (int fmt = 0; fmt < 16; fmt++) {
 		if ((fmt & 3) == 3) continue;
 
 		for (int ltf = 0; ltf < 2; ltf++) {
 			for (int aem = 0; aem < 2; aem++) {
-				for (int ifmt = 0; ifmt < 3; ifmt++) {
-					for (int wms = 1; wms < 4; wms++) {
-						for (int wmt = 1; wmt < 4; wmt++) {
-							PSSelector sel;
-							sel.atst = 1;
-							sel.tfx = 1;
-							sel.tcc = 1;
-							sel.fst = 1;
+				for (int wms = 1; wms < 4; wms++) {
+					for (int wmt = 1; wmt < 4; wmt++) {
+						PSSelector sel;
+						sel.atst = 1;
+						sel.tfx  = 1;
+						sel.tcc  = 1;
+						sel.fst = 1;
 
-							sel.ltf = ltf;
-							sel.aem = aem;
-							sel.fmt = fmt;
-							sel.ifmt = ifmt;
-							sel.wms = wms;
-							sel.wmt = wmt;
-							std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
-									ltf, aem, fmt, ifmt, wms, wmt);
-							RUN_TEST;
-						}
+						sel.ltf     = ltf;
+						sel.aem     = aem;
+						sel.tex_fmt = fmt;
+						sel.wms     = wms;
+						sel.wmt     = wmt;
+						std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
+								ltf, aem, fmt, wms, wmt);
+						RUN_TEST;
 					}
 				}
 			}
diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h
index 04bd033905..d170650dfa 100644
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
 			{
 				// *** Word 1
 				// Format
-				uint32 fmt:3;
-				uint32 ifmt:2;
+				uint32 tex_fmt:4;
 				uint32 dfmt:2;
 				// Alpha extension/Correction
 				uint32 aem:1;
@@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
 				uint32 write_rg:1;
 				uint32 fbmask:1;
 
-				uint32 _free1:1;
+				uint32 _free1:2;
 
 				// *** Word 2
 				// Blend and Colclip
diff --git a/plugins/GSdx/GSRendererHW.cpp b/plugins/GSdx/GSRendererHW.cpp
index 4efa2eb041..355aff723b 100644
--- a/plugins/GSdx/GSRendererHW.cpp
+++ b/plugins/GSdx/GSRendererHW.cpp
@@ -396,6 +396,7 @@ void GSRendererHW::Draw()
 			return;
 		}
 
+		// FIXME: Could be removed on OpenGL
 		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
 		{
 			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp
index 506498075f..bb3a8c7b0d 100644
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.wms = m_context->CLAMP.WMS;
 		ps_sel.wmt = m_context->CLAMP.WMT;
 
+		// Performance note:
+		// 1/ Don't set 0 as it is the default value
+		// 2/ Only keep aem when it is useful (avoid useless shader permutation)
 		if (ps_sel.shuffle) {
-			ps_sel.fmt = 0;
-		} else if (tex->m_palette) {
-			ps_sel.fmt = cpsm.fmt | 4;
-			ps_sel.ifmt = !tex->m_target ? 0
-				: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
-				: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
-				: 0;
+			// Force a 32 bits access (normally shuffle is done on 16 bits)
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			ps_sel.aem     = m_env.TEXA.AEM;
+			ASSERT(tex->m_target);
 
-			// In standard mode palette is only used when alpha channel of the RT is
-			// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
-			// It is a very bad idea to force bilinear filtering on it.
-			if (tex->m_target)
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// FIXME: it is likely a bad idea to do the bilinear interpolation here
+			// bilinear &= m_vt.IsLinear();
+
+		} else if (tex->m_target) {
+			// Use an old target. AEM and index aren't resolved, so it must be done
+			// on the GPU
+
+			// Select the 32/24/16 bits color (AEM)
+			ps_sel.tex_fmt = cpsm.fmt;
+			ps_sel.aem     = m_env.TEXA.AEM;
+
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// Select the index format
+			if (tex->m_palette) {
+				// FIXME Potentially improve fmt field in GSLocalMemory
+				if (m_context->TEX0.PSM == PSM_PSMT4HL)
+					ps_sel.tex_fmt |= 1 << 2;
+				else if (m_context->TEX0.PSM == PSM_PSMT4HH)
+					ps_sel.tex_fmt |= 2 << 2;
+				else
+					ps_sel.tex_fmt |= 3 << 2;
+
+				// Alpha channel of the RT is reinterpreted as an index. Star
+				// Ocean 3 uses it to emulate a stencil buffer.  It is a very
+				// bad idea to force bilinear filtering on it.
 				bilinear &= m_vt.IsLinear();
+			}
+
+		} else if (tex->m_palette) {
+			// Use a standard 8 bits texture. AEM is already done on the CLUT.
+			// Therefore you only need to set the index
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
+
+			// Note 4 bits indexes are converted to 8 bits
+			ps_sel.tex_fmt = 3 << 2;
 
-			//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
 		} else {
-			ps_sel.fmt = cpsm.fmt;
+			// Standard texture. Both index and AEM expansion were already done by the CPU.
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
 		}
-		ps_sel.aem = m_env.TEXA.AEM;
 
 		if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
 			// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
@@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
 		ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
 
-		GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
-		ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));
 
 		// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
 		ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);
diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp
index e8144130d2..c31c6213cd 100644
--- a/plugins/GSdx/GSTextureCache.cpp
+++ b/plugins/GSdx/GSTextureCache.cpp
@@ -22,9 +22,13 @@
 #include "stdafx.h"
 #include "GSTextureCache.h"
 
+bool s_IS_OPENGL = false;
+
 GSTextureCache::GSTextureCache(GSRenderer* r)
 	: m_renderer(r)
 {
+	s_IS_OPENGL = (theApp.GetConfig("Renderer", 12) == 12);
+
 	m_spritehack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_SpriteHack", 0) : 0;
 	UserHacks_HalfPixelOffset = !!theApp.GetConfig("UserHacks", 0) && !!theApp.GetConfig("UserHacks_HalfPixelOffset", 0);
 
@@ -72,12 +76,18 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
 	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
 	//const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[TEX0.CPSM] : psm;
 
-	GIFRegTEXA plainTEXA;
+	// Until DX is fixed, only OpenGL reads the CLUT with the real TEXA
+	if (s_IS_OPENGL) {
+		if(psm.pal > 0)
+			m_renderer->m_mem.m_clut.Read32(TEX0, TEXA);
+	} else {
+		GIFRegTEXA plainTEXA;
 
-	plainTEXA.AEM = 1;
-	plainTEXA.TA0 = 0;
-	plainTEXA.TA1 = 0x80;
-	m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
+		plainTEXA.AEM = 1;
+		plainTEXA.TA0 = 0;
+		plainTEXA.TA1 = 0x80;
+		m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
+	}
 
 	const uint32* clut = m_renderer->m_mem.m_clut;
 
@@ -85,26 +95,27 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
 
 	list<Source*>& m = m_src.m_map[TEX0.TBP0 >> 5];
 
+
 	for(list<Source*>::iterator i = m.begin(); i != m.end(); i++)
 	{
 		Source* s = *i;
 
-		if(((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
-		{
+		if (((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
 			continue;
-		}
 
-		// Special check for palette texture (psm.pal > 0)
-		//
-		// if m_paltex is enabled
-		// 1/ s->m_palette must always be defined
-		// 2/ Clut is useless (will be uploaded again at the end of the function)
-		//
-		// if m_paltex is disabled
-		// 1/ Clut must match if m_palette is NULL
-		if(s->m_palette == NULL && psm.pal > 0 && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
-		{
-			continue;
+		// Targets are converted (AEM & palette) on the fly by the GPU. They don't need an extra check
+		if (!s->m_target) {
+			// We request a palette texture (psm.pal). If the texture was
+			// converted by the CPU (s->m_palette == NULL), we need to ensure
+			// palette content is the same.
+			// Note: content of the palette will be uploaded at the end of the function
+			if (psm.pal > 0 && s->m_palette == NULL && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
+				continue;
+
+			// We request a 24/16 bit RGBA texture. Alpha expansion was done by
+			// the CPU.  We need to check that TEXA is identical
+			if (psm.pal == 0 && psm.fmt > 0 && s->m_TEXA.u64 != TEXA.u64)
+				continue;
 		}
 
 		m.splice(m.begin(), m, i);
@@ -147,7 +158,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
 				uint32 t_psm = (t->m_dirty_alpha) ? t->m_TEX0.PSM & ~0x1 : t->m_TEX0.PSM;
 
 				if (GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t_psm)) {
-					if (!IsOpenGL() && (psm == PSM_PSMT8)) {
+					if (!s_IS_OPENGL && (psm == PSM_PSMT8)) {
 						// OpenGL can convert the texture directly in the GPU. Not sure we want to keep this
 						// code for DX. It fixes effect but it is slow (MGS3)
 
@@ -324,7 +335,7 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
 			//
 			// From a performance point of view, it might cost a little on big upscaling
 			// but normally few RT are miss so it must remain reasonable.
-			if (IsOpenGL()) {
+			if (s_IS_OPENGL) {
 				switch (type) {
 					case RenderTarget: m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); break;
 					case DepthStencil: m_renderer->m_dev->ClearDepth(dst->m_texture, 0); break;
@@ -863,7 +874,7 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con
 		// TODO: clean up this mess
 
 		int shader = dst->m_type != RenderTarget ? ShaderConvert_FLOAT32_TO_RGBA8 : ShaderConvert_COPY;
-		bool is_8bits = TEX0.PSM == PSM_PSMT8 && IsOpenGL();
+		bool is_8bits = TEX0.PSM == PSM_PSMT8 && s_IS_OPENGL;
 
 		if (is_8bits) {
 			GL_INS("Reading RT as a packed-indexed 8 bits format");
@@ -1417,9 +1428,14 @@ void GSTextureCache::Source::Flush(uint32 count)
 
 	GIFRegTEXA plainTEXA;
 
-	plainTEXA.AEM = 1;
-	plainTEXA.TA0 = 0;
-	plainTEXA.TA1 = 0x80;
+	// Until DX is fixed, only OpenGL uses the source's own TEXA
+	if (s_IS_OPENGL) {
+		plainTEXA = m_TEXA;
+	} else {
+		plainTEXA.AEM = 1;
+		plainTEXA.TA0 = 0;
+		plainTEXA.TA1 = 0x80;
+	}
 
 	if(m_palette)
 	{
diff --git a/plugins/GSdx/GSTextureCache.h b/plugins/GSdx/GSTextureCache.h
index 98dc674f42..0c4cea8111 100644
--- a/plugins/GSdx/GSTextureCache.h
+++ b/plugins/GSdx/GSTextureCache.h
@@ -129,7 +129,6 @@ protected:
 #endif
 
 	virtual bool CanConvertDepth() { return m_can_convert_depth; }
-	virtual bool IsOpenGL() { return false; }
 
 public:
 	GSTextureCache(GSRenderer* r);
diff --git a/plugins/GSdx/GSTextureCacheOGL.h b/plugins/GSdx/GSTextureCacheOGL.h
index 4e241d4be4..f3f1216ae0 100644
--- a/plugins/GSdx/GSTextureCacheOGL.h
+++ b/plugins/GSdx/GSTextureCacheOGL.h
@@ -32,8 +32,6 @@ protected:
 
 	void Read(Target* t, const GSVector4i& r);
 
-	virtual bool IsOpenGL() { return true; }
-
 public:
 	GSTextureCacheOGL(GSRenderer* r);
 };
diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index 754ffbf9b0..67c71fba46 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -6,7 +6,9 @@
 #define FMT_32 0
 #define FMT_24 1
 #define FMT_16 2
-#define FMT_PAL 4 /* flag bit */
+
+#define PS_PAL_FMT (PS_TEX_FMT >> 2)
+#define PS_AEM_FMT (PS_TEX_FMT & 3)
 
 // APITRACE_DEBUG enables forced pixel output to easily detect
 // the fragment computed by primitive
@@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)
 
     uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value
 
-#if PS_IFMT == 1
-    // 4HH
-    return vec4(i >> 4u) / 255.0f;
-
-#elif PS_IFMT == 2
-    // 4HL
+#if PS_PAL_FMT == 1
+	// 4HL
     return vec4(i & 0xFu) / 255.0f;
 
+#elif PS_PAL_FMT == 2
+	// 4HH
+    return vec4(i >> 4u) / 255.0f;
+
 #else
     // Most of texture will hit this code so keep normalized float value
 
@@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
     vec2 dd;
 
     // FIXME I'm not sure this condition is useful (I think code will be optimized)
-#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
+#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
     // No software LTF and pure 32 bits RGBA texure without special texture wrapping
     c[0] = sample_c(st);
 #ifdef TEX_COORD_DEBUG
@@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)
 
     uv = clamp_wrap_uv(uv);
 
-    if((PS_FMT & FMT_PAL) != 0)
-    {
-        c = sample_4p(sample_4_index(uv));
-    }
-    else
-    {
-        c = sample_4c(uv);
-    }
+#if PS_PAL_FMT != 0
+    c = sample_4p(sample_4_index(uv));
+#else
+    c = sample_4c(uv);
+#endif
+
 #ifdef TEX_COORD_DEBUG
     c[0].rg = uv.xy;
     c[1].rg = uv.xy;
@@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q)
 
 #endif
 
-    // PERF: see the impact of the exansion before/after the interpolation
-    for (int i = 0; i < 4; i++)
-    {
-        // PERF note: using dot product reduces by 1 the number of instruction
-        // but I'm not sure it is equivalent neither faster.
+	// PERF note: using dot product reduces by 1 the number of instruction
+	// but I'm not sure it is equivalent neither faster.
+	for (int i = 0; i < 4; i++)
+	{
         //float sum = dot(c[i].rgb, vec3(1.0f));
-#if ((PS_FMT & ~FMT_PAL) == FMT_24)
-        c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
-        //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
-#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
-        c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
-        //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#if (PS_AEM_FMT == FMT_24)
+		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
+		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#elif (PS_AEM_FMT == FMT_16)
+		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
+		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
 #endif
     }
 
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index fc9ecc217b..ccffb134a8 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
 	"#define FMT_32 0\n"
 	"#define FMT_24 1\n"
 	"#define FMT_16 2\n"
-	"#define FMT_PAL 4 /* flag bit */\n"
+	"\n"
+	"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
+	"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
 	"\n"
 	"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
 	"// the fragment computed by primitive\n"
@@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
 	"\n"
-	"#if PS_IFMT == 1\n"
-	"    // 4HH\n"
-	"    return vec4(i >> 4u) / 255.0f;\n"
-	"\n"
-	"#elif PS_IFMT == 2\n"
-	"    // 4HL\n"
+	"#if PS_PAL_FMT == 1\n"
+	"	// 4HL\n"
 	"    return vec4(i & 0xFu) / 255.0f;\n"
 	"\n"
+	"#elif PS_PAL_FMT == 2\n"
+	"	// 4HH\n"
+	"    return vec4(i >> 4u) / 255.0f;\n"
+	"\n"
 	"#else\n"
 	"    // Most of texture will hit this code so keep normalized float value\n"
 	"\n"
@@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
 	"    vec2 dd;\n"
 	"\n"
 	"    // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
-	"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
+	"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
 	"    // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
 	"    c[0] = sample_c(st);\n"
 	"#ifdef TEX_COORD_DEBUG\n"
@@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uv = clamp_wrap_uv(uv);\n"
 	"\n"
-	"    if((PS_FMT & FMT_PAL) != 0)\n"
-	"    {\n"
-	"        c = sample_4p(sample_4_index(uv));\n"
-	"    }\n"
-	"    else\n"
-	"    {\n"
-	"        c = sample_4c(uv);\n"
-	"    }\n"
+	"#if PS_PAL_FMT != 0\n"
+	"    c = sample_4p(sample_4_index(uv));\n"
+	"#else\n"
+	"    c = sample_4c(uv);\n"
+	"#endif\n"
+	"\n"
 	"#ifdef TEX_COORD_DEBUG\n"
 	"    c[0].rg = uv.xy;\n"
 	"    c[1].rg = uv.xy;\n"
@@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"#endif\n"
 	"\n"
-	"    // PERF: see the impact of the exansion before/after the interpolation\n"
-	"    for (int i = 0; i < 4; i++)\n"
-	"    {\n"
-	"        // PERF note: using dot product reduces by 1 the number of instruction\n"
-	"        // but I'm not sure it is equivalent neither faster.\n"
+	"	// PERF note: using dot product reduces by 1 the number of instruction\n"
+	"	// but I'm not sure it is equivalent neither faster.\n"
+	"	for (int i = 0; i < 4; i++)\n"
+	"	{\n"
 	"        //float sum = dot(c[i].rgb, vec3(1.0f));\n"
-	"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
-	"        c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
-	"        //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
-	"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
-	"        c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
-	"        //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#if (PS_AEM_FMT == FMT_24)\n"
+	"		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#elif (PS_AEM_FMT == FMT_16)\n"
+	"		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
 	"#endif\n"
 	"    }\n"
 	"\n"