diff --git a/plugins/GSdx/res/glsl/convert.glsl b/plugins/GSdx/res/glsl/convert.glsl
index f7027a0a2f..f3ef9887b6 100644
--- a/plugins/GSdx/res/glsl/convert.glsl
+++ b/plugins/GSdx/res/glsl/convert.glsl
@@ -135,6 +135,7 @@ void ps_main1()
 	// shift Alpha: -7 + 15
     highp uvec4 i = uvec4(c * vec4(1/8.0f, 4.0f, 128.0f, 256.0f)); // Shift value
 
+	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
     SV_Target1 = (i.r & uint(0x001f)) | (i.g & uint(0x03e0)) | (i.b & uint(0x7c00)) | (i.a & uint(0x8000));
 
 #else
@@ -146,6 +147,7 @@ void ps_main1()
 
 	highp uvec4 i = uvec4(c * vec4(uint(0x001f), uint(0x03e0), uint(0x7c00), uint(0x8000)));
 
+	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
     SV_Target1 = (i.x & uint(0x001f)) | (i.y & uint(0x03e0)) | (i.z & uint(0x7c00)) | (i.w & uint(0x8000));
 #endif
 
diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl
index 14916a2d5e..27f5065d0c 100644
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -1,6 +1,7 @@
 //#version 420 // Keep it for text editor detection
 
-// note lerp => mix
+// Required for bit operations
+//#extension GL_ARB_gpu_shader5 : enable
 
 #define FMT_32 0
 #define FMT_24 1
@@ -159,7 +160,8 @@ mat4 sample_4c(vec4 uv)
 {
 	mat4 c;
 
-	// FIXME investigate texture gather (filtering impact?)
+    // Note: texture gather can't be used because of special clamping/wrapping
+    // Also it doesn't support lod
 	c[0] = sample_c(uv.xy);
 	c[1] = sample_c(uv.zy);
 	c[2] = sample_c(uv.xw);
@@ -177,7 +179,8 @@ uvec4 sample_4_index(vec4 uv)
 	//
 	// Or we have an old RT (ie RGBA8) that contains index (4/8) in the alpha channel
 
-	// FIXME investigate texture gather (filtering impact?)
+    // Note: texture gather can't be used because of special clamping/wrapping
+    // Also it doesn't support lod
 	c.x = sample_c(uv.xy).a;
 	c.y = sample_c(uv.zy).a;
 	c.z = sample_c(uv.xw).a;
@@ -266,10 +269,15 @@ vec4 sample_color(vec2 st, float q)
 	// PERF: see the impact of the exansion before/after the interpolation
 	for (int i = 0; i < 4; i++)
 	{
+        // PERF note: using a dot product reduces the number of instructions by 1,
+        // but I'm not sure it is equivalent, nor that it is faster.
+        //float sum = dot(c[i].rgb, vec3(1.0f));
 #if ((PS_FMT & ~FMT_PAL) == FMT_24)
 		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
+		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
 #elif ((PS_FMT & ~FMT_PAL) == FMT_16)
 		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
+		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
 #endif
 	}
 
@@ -540,6 +548,11 @@ void ps_main()
 	// Note: GLSL 4.50/GL_EXT_shader_integer_mix support a mix instruction to select a component\n"
 	// However Nvidia emulate it with an if (at least on kepler arch) ...\n"
 #if PS_READ_BA
+	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
+	// uint my_ta = (bool(bitfieldExtract(denorm_c.a, 7, 1))) ? denorm_TA.y : denorm_TA.x;
+	// denorm_c.a = bitfieldInsert(denorm_c.a, bitfieldExtract(my_ta, 7, 1), 7, 1);
+	// c.ga = vec2(float(denorm_c.a)/ 255.0f);
+
 	if (bool(denorm_c.a & 0x80u))
 		c.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)) / 255.0f);
 	else
diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h
index 06ec947f1d..b6b220dd09 100644
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -160,6 +160,7 @@ static const char* convert_glsl =
 	"	// shift Alpha: -7 + 15\n"
 	"    highp uvec4 i = uvec4(c * vec4(1/8.0f, 4.0f, 128.0f, 256.0f)); // Shift value\n"
 	"\n"
+	"	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below\n"
 	"    SV_Target1 = (i.r & uint(0x001f)) | (i.g & uint(0x03e0)) | (i.b & uint(0x7c00)) | (i.a & uint(0x8000));\n"
 	"\n"
 	"#else\n"
@@ -171,6 +172,7 @@ static const char* convert_glsl =
 	"\n"
 	"	highp uvec4 i = uvec4(c * vec4(uint(0x001f), uint(0x03e0), uint(0x7c00), uint(0x8000)));\n"
 	"\n"
+	"	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below\n"
 	"    SV_Target1 = (i.x & uint(0x001f)) | (i.y & uint(0x03e0)) | (i.z & uint(0x7c00)) | (i.w & uint(0x8000));\n"
 	"#endif\n"
 	"\n"
@@ -861,7 +863,8 @@ static const char* tfx_vgs_glsl =
 static const char* tfx_fs_all_glsl =
 	"//#version 420 // Keep it for text editor detection\n"
 	"\n"
-	"// note lerp => mix\n"
+	"// Required for bit operations\n"
+	"//#extension GL_ARB_gpu_shader5 : enable\n"
 	"\n"
 	"#define FMT_32 0\n"
 	"#define FMT_24 1\n"
@@ -1020,7 +1023,8 @@ static const char* tfx_fs_all_glsl =
 	"{\n"
 	"	mat4 c;\n"
 	"\n"
-	"	// FIXME investigate texture gather (filtering impact?)\n"
+	"    // Note: texture gather can't be used because of special clamping/wrapping\n"
+	"    // Also it doesn't support lod\n"
 	"	c[0] = sample_c(uv.xy);\n"
 	"	c[1] = sample_c(uv.zy);\n"
 	"	c[2] = sample_c(uv.xw);\n"
@@ -1038,7 +1042,8 @@ static const char* tfx_fs_all_glsl =
 	"	//\n"
 	"	// Or we have an old RT (ie RGBA8) that contains index (4/8) in the alpha channel\n"
 	"\n"
-	"	// FIXME investigate texture gather (filtering impact?)\n"
+	"    // Note: texture gather can't be used because of special clamping/wrapping\n"
+	"    // Also it doesn't support lod\n"
 	"	c.x = sample_c(uv.xy).a;\n"
 	"	c.y = sample_c(uv.zy).a;\n"
 	"	c.z = sample_c(uv.xw).a;\n"
@@ -1127,10 +1132,15 @@ static const char* tfx_fs_all_glsl =
 	"	// PERF: see the impact of the exansion before/after the interpolation\n"
 	"	for (int i = 0; i < 4; i++)\n"
 	"	{\n"
+	"        // PERF note: using a dot product reduces the number of instructions by 1,\n"
+	"        // but I'm not sure it is equivalent, nor that it is faster.\n"
+	"        //float sum = dot(c[i].rgb, vec3(1.0f));\n"
 	"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
 	"		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
 	"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
 	"		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
 	"#endif\n"
 	"	}\n"
 	"\n"
@@ -1401,6 +1411,11 @@ static const char* tfx_fs_all_glsl =
 	"	// Note: GLSL 4.50/GL_EXT_shader_integer_mix support a mix instruction to select a component\\n\"\n"
 	"	// However Nvidia emulate it with an if (at least on kepler arch) ...\\n\"\n"
 	"#if PS_READ_BA\n"
+	"	// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below\n"
+	"	// uint my_ta = (bool(bitfieldExtract(denorm_c.a, 7, 1))) ? denorm_TA.y : denorm_TA.x;\n"
+	"	// denorm_c.a = bitfieldInsert(denorm_c.a, bitfieldExtract(my_ta, 7, 1), 7, 1);\n"
+	"	// c.ga = vec2(float(denorm_c.a)/ 255.0f);\n"
+	"\n"
 	"	if (bool(denorm_c.a & 0x80u))\n"
 	"		c.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)) / 255.0f);\n"
 	"	else\n"