gsdx-ogl: wipeout subroutine code

The code was completely bitrotten. It was only a partial test (and yet already 500 lines). Shaders are getting more and more complex, and multithreading support greatly reduces the cost of a shader switch.
2015-07-17 18:16:35 +02:00 · 2015-07-17 18:16:35 +02:00 · b4c04ed00a
parent e3751f6cd9
commit b4c04ed00a
14 changed files with 9 additions and 814 deletions
--- a/linux_various/glsl2h.pl
+++ b/linux_various/glsl2h.pl
@ -42,7 +42,7 @@ my $gsdx_out = File::Spec->catdir($gsdx_path, "glsl_source.h");

 # Just a hack to reuse glsl2h function easily
 $gsdx_path = File::Spec->catdir(dirname(abs_path($0)), "..", "plugins", "GSdx", "res", "glsl");
-my @tfx_res = qw/tfx_fs.glsl tfx_fs_subroutine.glsl/;
+my @tfx_res = qw/tfx_fs.glsl/;
 my $tfx_all = File::Spec->catdir($gsdx_path, "tfx_fs_all.glsl");

 my @gsdx_res = qw/convert.glsl interlace.glsl merge.glsl shadeboost.glsl tfx_vgs.glsl tfx_fs_all.glsl fxaa.fx/;
--- a/plugins/GSdx/GLLoader.cpp
+++ b/plugins/GSdx/GLLoader.cpp
@ -87,7 +87,6 @@ PFNGLFLUSHMAPPEDBUFFERRANGEPROC        gl_FlushMappedBufferRange            = NU
 PFNGLBLENDEQUATIONSEPARATEPROC         gl_BlendEquationSeparate             = NULL;
 PFNGLBLENDFUNCSEPARATEPROC             gl_BlendFuncSeparate                 = NULL;
 // GL4.0
-PFNGLUNIFORMSUBROUTINESUIVPROC         gl_UniformSubroutinesuiv             = NULL;
 // GL4.1
 PFNGLBINDPROGRAMPIPELINEPROC           gl_BindProgramPipeline               = NULL;
 PFNGLGENPROGRAMPIPELINESPROC           gl_GenProgramPipelines               = NULL;
@ -339,13 +338,12 @@ namespace GLLoader {
 	bool found_GL_ARB_draw_buffers_blend = false; // DX10 GPU limited driver on windows!

 	// Note: except Apple, all drivers support explicit uniform location
-	bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture
+	bool found_GL_ARB_explicit_uniform_location = false; // need by bindless texture
 	// GL4 hardware
 	bool found_GL_ARB_buffer_storage = false;
 	bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
 	bool found_GL_ARB_gpu_shader5 = false;
 	bool found_GL_ARB_shader_image_load_store = false; // GLES3.1
-	bool found_GL_ARB_shader_subroutine = false;
 	bool found_GL_ARB_bindless_texture = false; // GL5 GPU?
 	bool found_GL_ARB_texture_barrier = false; // Well maybe supported by older hardware I don't know

@ -450,19 +448,6 @@ namespace GLLoader {
 					if (!fglrx_buggy_driver && !mesa_amd_buggy_driver && !intel_buggy_driver) found_GL_ARB_separate_shader_objects = true;
 					else fprintf(stderr, "Buggy driver detected, GL_ARB_separate_shader_objects will be disabled\n");
 				}
-#if 0
-				// Erratum: on nvidia implementation, gain is very nice : 42.5 fps => 46.5 fps
-				//
-				// Strangely it doesn't provide the speed boost as expected.
-				// Note: only atst/colclip was replaced with subroutine for the moment. It replace 2000 program switch on
-				// colin mcrae 3 by 2100 uniform, but code is slower!
-				//
-				// Current hypothesis: the validation of useprogram is done in the "driver thread" whereas the extra function calls
-				// are done on the overloaded main threads.
-				// Apitrace profiling shows faster GPU draw times
-
-				if (ext.compare("GL_ARB_shader_subroutine") == 0) found_GL_ARB_shader_subroutine = true;
-#endif
 				// GL4.2
 				if (ext.compare("GL_ARB_shading_language_420pack") == 0) found_GL_ARB_shading_language_420pack = true;
 				if (ext.compare("GL_ARB_texture_storage") == 0) found_GL_ARB_texture_storage = true;
@ -495,7 +480,6 @@ namespace GLLoader {
 		status &= status_and_override(found_GL_ARB_draw_buffers_blend, "GL_ARB_draw_buffers_blend");
 		// GL4.1
 		status &= status_and_override(found_GL_ARB_separate_shader_objects, "GL_ARB_separate_shader_objects");
-		status &= status_and_override(found_GL_ARB_shader_subroutine, "GL_ARB_shader_subroutine");
 		// GL4.2
 		status &= status_and_override(found_GL_ARB_shader_image_load_store, "GL_ARB_shader_image_load_store");
 		status &= status_and_override(found_GL_ARB_shading_language_420pack, "GL_ARB_shading_language_420pack", true);
--- a/plugins/GSdx/GLLoader.h
+++ b/plugins/GSdx/GLLoader.h
@ -270,7 +270,6 @@ extern   PFNGLFLUSHMAPPEDBUFFERRANGEPROC        gl_FlushMappedBufferRange;
 extern   PFNGLBLENDEQUATIONSEPARATEPROC         gl_BlendEquationSeparate;
 extern   PFNGLBLENDFUNCSEPARATEPROC             gl_BlendFuncSeparate;
 // GL4.0
-extern   PFNGLUNIFORMSUBROUTINESUIVPROC         gl_UniformSubroutinesuiv;
 // GL4.1
 extern   PFNGLBINDPROGRAMPIPELINEPROC           gl_BindProgramPipeline;
 extern   PFNGLDELETEPROGRAMPIPELINESPROC        gl_DeleteProgramPipelines;
@ -361,7 +360,6 @@ namespace GLLoader {
 	extern bool found_GL_ARB_shader_image_load_store;
 	extern bool found_GL_ARB_clear_texture;
 	extern bool found_GL_ARB_buffer_storage;
-	extern bool found_GL_ARB_shader_subroutine;
 	extern bool found_GL_ARB_bindless_texture;
 	extern bool found_GL_ARB_explicit_uniform_location;
 	extern bool found_GL_ARB_clip_control;
--- a/plugins/GSdx/GLState.cpp
+++ b/plugins/GSdx/GLState.cpp
@ -58,8 +58,6 @@ namespace GLState {
 	GLuint vs;
 	GLuint program;
 	bool dirty_prog;
-	bool dirty_subroutine_vs;
-	bool dirty_subroutine_ps;
 #if 0
 	struct {
 		GSVertexBufferStateOGL* vb;
@ -105,8 +103,6 @@ namespace GLState {
 		vs = 0;
 		program = 0;
 		dirty_prog = false;
-		dirty_subroutine_vs = false;
-		dirty_subroutine_ps = false;
 		dirty_ressources = false;
 	}
 }
--- a/plugins/GSdx/GLState.h
+++ b/plugins/GSdx/GLState.h
@ -58,8 +58,6 @@ namespace GLState {
 	extern GLuint vs;
 	extern GLuint program; // monolith program (when sso isn't supported)
 	extern bool dirty_prog;
-	extern bool dirty_subroutine_vs;
-	extern bool dirty_subroutine_ps;
 	extern bool dirty_ressources;

 	extern void Clear();
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@ -229,7 +229,6 @@ class GSDeviceOGL : public GSDevice
 			{
 				uint32 wildhack:1;
 				uint32 bppz:2;
-				// Next param will be handle by subroutine
 				uint32 tme:1;
 				uint32 fst:1;

@ -338,7 +337,6 @@ class GSDeviceOGL : public GSDevice
 				uint32 tcoffsethack:1;
 				//uint32 point_sampler:1; Not tested, so keep the bit for blend
 				uint32 iip:1;
-				// Next param will be handle by subroutine (broken currently)
 				uint32 colclip:2;
 				uint32 atst:3;

--- a/plugins/GSdx/GSShaderOGL.cpp
+++ b/plugins/GSdx/GSShaderOGL.cpp
@ -24,14 +24,8 @@
 #include "GLState.h"

 GSShaderOGL::GSShaderOGL(bool debug) :
-	m_debug_shader(debug),
-	m_vs_sub_count(0),
-	m_ps_sub_count(0)
+	m_debug_shader(debug)
 {
-
-	memset(&m_vs_sub, 0, countof(m_vs_sub)*sizeof(m_vs_sub[0]));
-	memset(&m_ps_sub, 0, countof(m_ps_sub)*sizeof(m_ps_sub[0]));
-
 	m_single_prog.clear();
 	if (GLLoader::found_GL_ARB_separate_shader_objects) {
 		gl_GenProgramPipelines(1, &m_pipeline);
@ -48,41 +42,17 @@ GSShaderOGL::~GSShaderOGL()
 	m_single_prog.clear();
 }

-void GSShaderOGL::VS(GLuint s, GLuint sub_count)
+void GSShaderOGL::VS(GLuint s)
 {
 	if (GLState::vs != s)
 	{
-		m_vs_sub_count = sub_count;
-
 		GLState::vs = s;
 		GLState::dirty_prog = true;
-		GLState::dirty_subroutine_vs = true;
 		if (GLLoader::found_GL_ARB_separate_shader_objects)
 			gl_UseProgramStages(m_pipeline, GL_VERTEX_SHADER_BIT, s);
 	}
 }

-void GSShaderOGL::VS_subroutine(GLuint *sub)
-{
-	if (!(m_vs_sub[0] == sub[0])) {
-		m_vs_sub[0] = sub[0];
-		GLState::dirty_subroutine_vs = true;
-	}
-}
-
-void GSShaderOGL::PS_subroutine(GLuint *sub)
-{
-	// FIXME could be more efficient with GSvector
-	if (!(m_ps_sub[0] == sub[0] && m_ps_sub[1] == sub[1] && m_ps_sub[2] == sub[2] && m_ps_sub[3] == sub[3] && m_ps_sub[4] == sub[4])) {
-		m_ps_sub[0] = sub[0];
-		m_ps_sub[1] = sub[1];
-		m_ps_sub[2] = sub[2];
-		m_ps_sub[3] = sub[3];
-		m_ps_sub[4] = sub[4];
-		GLState::dirty_subroutine_ps = true;
-	}
-}
-
 void GSShaderOGL::PS_ressources(GLuint64 handle[2])
 {
 	if (handle[0] != GLState::tex_handle[0] || handle[1] != GLState::tex_handle[1]) {
@ -92,7 +62,7 @@ void GSShaderOGL::PS_ressources(GLuint64 handle[2])
 	}
 }

-void GSShaderOGL::PS(GLuint s, GLuint sub_count)
+void GSShaderOGL::PS(GLuint s)
 {
 #ifdef _DEBUG
 	if (true)
@ -100,12 +70,9 @@ void GSShaderOGL::PS(GLuint s, GLuint sub_count)
 	if (GLState::ps != s)
 #endif
 	{
-		m_ps_sub_count = sub_count;
-
 		// In debug always sets the program. It allow to replace the program in apitrace easily.
 		GLState::ps = s;
 		GLState::dirty_prog = true;
-		GLState::dirty_subroutine_ps = true;
 		GLState::dirty_ressources = true;
 		if (GLLoader::found_GL_ARB_separate_shader_objects) {
 			gl_UseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s);
@ -142,21 +109,6 @@ void GSShaderOGL::SetupRessources()
 	}
 }

-void GSShaderOGL::SetupSubroutineUniform()
-{
-	if (!GLLoader::found_GL_ARB_shader_subroutine) return;
-
-	if (GLState::dirty_subroutine_vs && m_vs_sub_count) {
-		gl_UniformSubroutinesuiv(GL_VERTEX_SHADER, m_vs_sub_count,  m_vs_sub);
-		GLState::dirty_subroutine_vs = false;
-	}
-
-	if (GLState::dirty_subroutine_ps && m_ps_sub_count) {
-		gl_UniformSubroutinesuiv(GL_FRAGMENT_SHADER, m_ps_sub_count,  m_ps_sub);
-		GLState::dirty_subroutine_ps = false;
-	}
-}
-
 bool GSShaderOGL::ValidateShader(GLuint s)
 {
 	if (!m_debug_shader) return true;
@ -243,8 +195,6 @@ void GSShaderOGL::UseProgram()

 	if (GLState::dirty_prog) {
 		if (!GLLoader::found_GL_ARB_separate_shader_objects) {
-			GLState::dirty_subroutine_vs = true;
-			GLState::dirty_subroutine_ps = true;
 			GLState::dirty_ressources = true;

 			hash_map<uint64, GLuint >::iterator it;
@ -277,8 +227,6 @@ void GSShaderOGL::UseProgram()

 	SetupRessources();

-	SetupSubroutineUniform();
-
 	GLState::dirty_prog = false;

 	GL_POP();
@ -294,11 +242,6 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co
 		// Need GL version 410
 		header += "#extension GL_ARB_separate_shader_objects: require\n";
 	}
-	if (GLLoader::found_GL_ARB_shader_subroutine && GLLoader::found_GL_ARB_explicit_uniform_location) {
-		// Need GL version 400
-		header += "#define SUBROUTINE_GL40 1\n";
-		header += "#extension GL_ARB_shader_subroutine: require\n";
-	}
 	if (GLLoader::found_GL_ARB_explicit_uniform_location) {
 		// Need GL version 430
 		header += "#extension GL_ARB_explicit_uniform_location: require\n";
--- a/plugins/GSdx/GSShaderOGL.h
+++ b/plugins/GSdx/GSShaderOGL.h
@ -25,13 +25,7 @@ class GSShaderOGL {
 	GLuint m_pipeline;
 	hash_map<uint64, GLuint > m_single_prog;
 	const bool m_debug_shader;
-	GLuint m_vs_sub_count;
-	GLuint m_ps_sub_count;

-	GLuint m_vs_sub[1];
-	GLuint m_ps_sub[5];
-
-	void SetupSubroutineUniform();
 	void SetupRessources();

 	bool ValidateShader(GLuint p);
@ -46,11 +40,9 @@ class GSShaderOGL {
 	~GSShaderOGL();

 	void GS(GLuint s);
-	void PS(GLuint s, GLuint sub_count = 0);
-	void PS_subroutine(GLuint *sub);
+	void PS(GLuint s);
 	void PS_ressources(GLuint64 handle[2]);
-	void VS(GLuint s, GLuint sub_count = 0);
-	void VS_subroutine(GLuint *sub);
+	void VS(GLuint s);

 	void UseProgram();

--- a/plugins/GSdx/GSTextureFXOGL.cpp
+++ b/plugins/GSdx/GSTextureFXOGL.cpp
@ -143,16 +143,7 @@ void GSDeviceOGL::SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer*

 void GSDeviceOGL::SetupVS(VSSelector sel)
 {
-	if (GLLoader::found_GL_ARB_shader_subroutine) {
-		GLuint sub[1];
-		sub[0] = sel.tme ? 1 + (uint32)sel.fst : 0;
-		m_shader->VS_subroutine(sub);
-		// Handle by subroutine useless now
-		sel.tme = 0;
-		sel.fst = 0;
-	}
-
-	m_shader->VS(m_vs[sel], 1);
+	m_shader->VS(m_vs[sel]);
 }

 void GSDeviceOGL::SetupGS(GSSelector sel)
@ -162,38 +153,6 @@ void GSDeviceOGL::SetupGS(GSSelector sel)

 void GSDeviceOGL::SetupPS(PSSelector sel)
 {
-	if (GLLoader::found_GL_ARB_shader_subroutine) {
-		GLuint tfx = sel.tfx > 3 ? 19 : 11 + (uint32)sel.tfx + (uint32)sel.tcc*4;
-
-		GLuint colclip = 8 + (uint32)sel.colclip;
-
-		GLuint clamp = 
-			(sel.wms == 2 && sel.wmt == 2) ? 20 :
-			(sel.wms == 2)                 ? 21 :
-			(sel.wmt == 2)                 ? 22 : 23;
-
-		GLuint wrap = 
-			(sel.wms == 2 && sel.wmt == 2) ? 24 :
-			(sel.wms == 3 && sel.wmt == 3) ? 25 :
-			(sel.wms == 2 && sel.wmt == 3) ? 26 :
-			(sel.wms == 3 && sel.wmt == 2) ? 27 :
-			(sel.wms == 2)                 ? 28 :
-			(sel.wmt == 3)                 ? 29 :
-			(sel.wms == 3)                 ? 30 :
-			(sel.wmt == 2)                 ? 31 : 32;
-
-		GLuint sub[5] = {sel.atst, colclip, tfx, clamp, wrap};
-
-		m_shader->PS_subroutine(sub);
-		// Handle by subroutine useless now
-		sel.atst = 0;
-		sel.colclip = 0;
-		sel.tfx = 0;
-		sel.tcc = 0;
-		// sel.wms = 0;
-		// sel.wmt = 0;
-	}
-
 	// *************************************************************
 	// Static
 	// *************************************************************
@ -210,7 +169,7 @@ void GSDeviceOGL::SetupPS(PSSelector sel)
 	// *************************************************************
 	// Dynamic
 	// *************************************************************
-	m_shader->PS(ps, 3);
+	m_shader->PS(ps);
 }

 void GSDeviceOGL::SetupSampler(PSSamplerSelector ssel)
--- a/plugins/GSdx/GSWnd.cpp
+++ b/plugins/GSdx/GSWnd.cpp
@ -80,7 +80,6 @@ void GSWndGL::PopulateGlFunction()
 	*(void**)&(gl_ClientWaitSync) = GetProcAddress("glClientWaitSync");
 	*(void**)&(gl_FlushMappedBufferRange) = GetProcAddress("glFlushMappedBufferRange");
 	// GL4.0
-	*(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv", true);
 	*(void**)&(gl_BlendEquationSeparateiARB) = GetProcAddress("glBlendEquationSeparateiARB", true);
 	*(void**)&(gl_BlendFuncSeparateiARB) = GetProcAddress("glBlendFuncSeparateiARB", true);
 	// GL4.1
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@ -80,19 +80,6 @@ layout(std140, binding = 21) uniform cb21
 	vec2 TC_OffsetHack;
 };

-#ifdef SUBROUTINE_GL40
-// Function pointer type + the functionn pointer variable
-subroutine void AlphaTestType(vec4 c);
-layout(location = 0) subroutine uniform AlphaTestType atst;
-
-subroutine vec4 TfxType(vec4 t, vec4 c);
-layout(location = 2) subroutine uniform TfxType tfx;
-
-subroutine void ColClipType(inout vec4 c);
-layout(location = 1) subroutine uniform ColClipType colclip;
-#endif
-
-
 vec4 sample_c(vec2 uv)
 {
 	// FIXME: check the issue on openGL
@ -291,7 +278,6 @@ vec4 sample_color(vec2 st, float q)
 }

 // FIXME Precompute the factor 255/128 in VS
-#ifndef SUBROUTINE_GL40
 vec4 tfx(vec4 t, vec4 c)
 {
 	vec4 c_out = c;
@ -319,9 +305,7 @@ vec4 tfx(vec4 t, vec4 c)

 	return c_out;
 }
-#endif

-#ifndef SUBROUTINE_GL40
 void atst(vec4 c)
 {
 	float a = trunc(c.a * 255.0 + 0.01);
@ -350,9 +334,7 @@ void atst(vec4 c)
 		discard;
 #endif
 }
-#endif

-#ifndef SUBROUTINE_GL40
 void colclip(inout vec4 c)
 {
 #if (PS_COLCLIP == 2)
@ -363,7 +345,6 @@ void colclip(inout vec4 c)
 	c.rgb *= vec3(factor);
 #endif
 }
-#endif

 void fog(inout vec4 c, float f)
 {
--- a/plugins/GSdx/res/glsl/tfx_fs_subroutine.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs_subroutine.glsl
@ -1,285 +0,0 @@
-//#version 420 // Keep it for text editor detection
-
-// Subroutine of standard fs function (I don't know if it will be ever used one day)
-
-// FIXME crash nvidia
-#if 0
-// Function pointer type
-subroutine vec4 WrapType(vec4 uv);
-
-// a function pointer variable
-layout(location = 4) subroutine uniform WrapType wrapuv;
-
-layout(index = 24) subroutine(WrapType)
-vec4 wrapuv_wms_wmt_2(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out = clamp(uv, MinMax.xyxy, MinMax.zwzw);
-    return uv_out;
-}
-
-layout(index = 25) subroutine(WrapType)
-vec4 wrapuv_wms_wmt3(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out = vec4((ivec4(uv * WH.xyxy) & ivec4(MskFix.xyxy)) | ivec4(MskFix.zwzw)) / WH.xyxy;
-    return uv_out;
-}
-
-layout(index = 26) subroutine(WrapType)
-vec4 wrapuv_wms2_wmt3(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);
-    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;
-    return uv_out;
-}
-
-layout(index = 27) subroutine(WrapType)
-vec4 wrapuv_wms3_wmt2(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;
-    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);
-    return uv_out;
-}
-
-layout(index = 28) subroutine(WrapType)
-vec4 wrapuv_wms2_wmtx(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);
-    return uv_out;
-}
-
-layout(index = 29) subroutine(WrapType)
-vec4 wrapuv_wmsx_wmt3(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;
-    return uv_out;
-}
-
-layout(index = 30) subroutine(WrapType)
-vec4 wrapuv_wms3_wmtx(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;
-    return uv_out;
-}
-
-layout(index = 31) subroutine(WrapType)
-vec4 wrapuv_wmsx_wmt2(vec4 uv)
-{
-    vec4 uv_out = uv;
-    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);
-    return uv_out;
-}
-
-layout(index = 32) subroutine(WrapType)
-vec4 wrapuv_dummy(vec4 uv)
-{
-    return uv;
-}
-#endif
-
-// FIXME crash nvidia
-#if 0
-// Function pointer type
-subroutine vec2 ClampType(vec2 uv);
-
-// a function pointer variable
-layout(location = 3) subroutine uniform ClampType clampuv;
-
-layout(index = 20) subroutine(ClampType)
-vec2 clampuv_wms2_wmt2(vec2 uv)
-{
-    return clamp(uv, MinF, MinMax.zw);
-}
-
-layout(index = 21) subroutine(ClampType)
-vec2 clampuv_wms2(vec2 uv)
-{
-    vec2 uv_out = uv;
-    uv_out.x = clamp(uv.x, MinF.x, MinMax.z);
-    return uv_out;
-}
-
-layout(index = 22) subroutine(ClampType)
-vec2 clampuv_wmt2(vec2 uv)
-{
-    vec2 uv_out = uv;
-    uv_out.y = clamp(uv.y, MinF.y, MinMax.w);
-    return uv_out;
-}
-
-layout(index = 23) subroutine(ClampType)
-vec2 clampuv_dummy(vec2 uv)
-{
-    return uv;
-}
-#endif
-
-#ifdef SUBROUTINE_GL40
-layout(index = 11) subroutine(TfxType)
-vec4 tfx_0_tcc_0(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f;
-    return c_out;
-}
-
-layout(index = 12) subroutine(TfxType)
-vec4 tfx_1_tcc_0(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = t.rgb;
-    return c_out;
-}
-
-layout(index = 13) subroutine(TfxType)
-vec4 tfx_2_tcc_0(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;
-    return c_out;
-}
-
-layout(index = 14) subroutine(TfxType)
-vec4 tfx_3_tcc_0(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;
-    return c_out;
-}
-
-layout(index = 15) subroutine(TfxType)
-vec4 tfx_0_tcc_1(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out = c * t * 255.0f / 128.0f;
-    return c_out;
-}
-
-layout(index = 16) subroutine(TfxType)
-vec4 tfx_1_tcc_1(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out = t;
-    return c_out;
-}
-
-layout(index = 17) subroutine(TfxType)
-vec4 tfx_2_tcc_1(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;
-    c_out.a += t.a;
-    return c_out;
-}
-
-layout(index = 18) subroutine(TfxType)
-vec4 tfx_3_tcc_1(vec4 t, vec4 c)
-{
-    vec4 c_out = c;
-    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;
-    c_out.a = t.a;
-    return c_out;
-}
-
-layout(index = 19) subroutine(TfxType)
-vec4 tfx_dummy(vec4 t, vec4 c)
-{
-    return c;
-}
-#endif
-
-#ifdef SUBROUTINE_GL40
-layout(index = 0) subroutine(AlphaTestType)
-void atest_never(vec4 c)
-{
-    discard;
-}
-
-layout(index = 1) subroutine(AlphaTestType)
-void atest_always(vec4 c)
-{
-    // Nothing to do
-}
-
-layout(index = 2) subroutine(AlphaTestType)
-void atest_l(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if (PS_SPRITEHACK == 0)
-        if ((AREF - a - 0.5f) < 0.0f)
-            discard;
-}
-
-layout(index = 3) subroutine(AlphaTestType)
-void atest_le(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if ((AREF - a + 0.5f) < 0.0f)
-        discard;
-}
-
-layout(index = 4) subroutine(AlphaTestType)
-void atest_e(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if ((0.5f - abs(a - AREF)) < 0.0f)
-        discard;
-}
-
-layout(index = 5) subroutine(AlphaTestType)
-void atest_ge(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if ((a-AREF + 0.5f) < 0.0f)
-        discard;
-}
-
-layout(index = 6) subroutine(AlphaTestType)
-void atest_g(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if ((a-AREF - 0.5f) < 0.0f)
-        discard;
-}
-
-layout(index = 7) subroutine(AlphaTestType)
-void atest_ne(vec4 c)
-{
-    float a = trunc(c.a * 255.0 + 0.01);
-    if ((abs(a - AREF) - 0.5f) < 0.0f)
-        discard;
-}
-#endif
-
-#ifdef SUBROUTINE_GL40
-layout(index = 8) subroutine(ColClipType)
-void colclip_0(inout vec4 c)
-{
-	// nothing to do
-}
-
-layout(index = 9) subroutine(ColClipType)
-void colclip_1(inout vec4 c)
-{
-	// FIXME !!!!
-	//c.rgb *= c.rgb < 128./255;
-	bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
-	c.rgb *= vec3(factor);
-}
-
-layout(index = 10) subroutine(ColClipType)
-void colclip_2(inout vec4 c)
-{
-	c.rgb = 256.0f/255.0f - c.rgb;
-	// FIXME !!!!
-	//c.rgb *= c.rgb < 128./255;
-	bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
-	c.rgb *= vec3(factor);
-}
-#endif
--- a/plugins/GSdx/res/glsl/tfx_vgs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_vgs.glsl
@ -42,36 +42,6 @@ const float exp_min32 = exp2(-32.0f);
 const float exp_min31 = exp2(-31.0f);
 #endif

-#ifdef SUBROUTINE_GL40
-// Function pointer type
-subroutine void TextureCoordType(void);
-
-// a function pointer variable
-layout(location = 0) subroutine uniform TextureCoordType texture_coord;
-
-layout(index = 0) subroutine(TextureCoordType)
-void tme_0()
-{
-    VSout_t.xy = vec2(0.0f, 0.0f);
-    VSout_t.w = 1.0f;
-}
-
-layout(index = 1) subroutine(TextureCoordType)
-void tme_1_fst_0()
-{
-    VSout_t.xy = i_st;
-    VSout_t.w = i_q;
-}
-
-layout(index = 2) subroutine(TextureCoordType)
-void tme_1_fst_1()
-{
-    VSout_t.xy = vec2(i_uv) * TextureScale;
-    VSout_t.w = 1.0f;
-}
-
-#else
-
 void texture_coord()
 {
    if(VS_TME != 0)
@ -98,8 +68,6 @@ void texture_coord()
    }
 }

-#endif
-
 void vs_main()
 {
    highp uint z;
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@ -650,36 +650,6 @@ static const char* tfx_vgs_glsl =
 	"const float exp_min31 = exp2(-31.0f);\n"
 	"#endif\n"
 	"\n"
-	"#ifdef SUBROUTINE_GL40\n"
-	"// Function pointer type\n"
-	"subroutine void TextureCoordType(void);\n"
-	"\n"
-	"// a function pointer variable\n"
-	"layout(location = 0) subroutine uniform TextureCoordType texture_coord;\n"
-	"\n"
-	"layout(index = 0) subroutine(TextureCoordType)\n"
-	"void tme_0()\n"
-	"{\n"
-	"    VSout_t.xy = vec2(0.0f, 0.0f);\n"
-	"    VSout_t.w = 1.0f;\n"
-	"}\n"
-	"\n"
-	"layout(index = 1) subroutine(TextureCoordType)\n"
-	"void tme_1_fst_0()\n"
-	"{\n"
-	"    VSout_t.xy = i_st;\n"
-	"    VSout_t.w = i_q;\n"
-	"}\n"
-	"\n"
-	"layout(index = 2) subroutine(TextureCoordType)\n"
-	"void tme_1_fst_1()\n"
-	"{\n"
-	"    VSout_t.xy = vec2(i_uv) * TextureScale;\n"
-	"    VSout_t.w = 1.0f;\n"
-	"}\n"
-	"\n"
-	"#else\n"
-	"\n"
 	"void texture_coord()\n"
 	"{\n"
 	"    if(VS_TME != 0)\n"
@ -706,8 +676,6 @@ static const char* tfx_vgs_glsl =
 	"    }\n"
 	"}\n"
 	"\n"
-	"#endif\n"
-	"\n"
 	"void vs_main()\n"
 	"{\n"
 	"    highp uint z;\n"
@ -973,19 +941,6 @@ static const char* tfx_fs_all_glsl =
 	"	vec2 TC_OffsetHack;\n"
 	"};\n"
 	"\n"
-	"#ifdef SUBROUTINE_GL40\n"
-	"// Function pointer type + the functionn pointer variable\n"
-	"subroutine void AlphaTestType(vec4 c);\n"
-	"layout(location = 0) subroutine uniform AlphaTestType atst;\n"
-	"\n"
-	"subroutine vec4 TfxType(vec4 t, vec4 c);\n"
-	"layout(location = 2) subroutine uniform TfxType tfx;\n"
-	"\n"
-	"subroutine void ColClipType(inout vec4 c);\n"
-	"layout(location = 1) subroutine uniform ColClipType colclip;\n"
-	"#endif\n"
-	"\n"
-	"\n"
 	"vec4 sample_c(vec2 uv)\n"
 	"{\n"
 	"	// FIXME: check the issue on openGL\n"
@ -1184,7 +1139,6 @@ static const char* tfx_fs_all_glsl =
 	"}\n"
 	"\n"
 	"// FIXME Precompute the factor 255/128 in VS\n"
-	"#ifndef SUBROUTINE_GL40\n"
 	"vec4 tfx(vec4 t, vec4 c)\n"
 	"{\n"
 	"	vec4 c_out = c;\n"
@ -1212,9 +1166,7 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"	return c_out;\n"
 	"}\n"
-	"#endif\n"
 	"\n"
-	"#ifndef SUBROUTINE_GL40\n"
 	"void atst(vec4 c)\n"
 	"{\n"
 	"	float a = trunc(c.a * 255.0 + 0.01);\n"
@ -1243,9 +1195,7 @@ static const char* tfx_fs_all_glsl =
 	"		discard;\n"
 	"#endif\n"
 	"}\n"
-	"#endif\n"
 	"\n"
-	"#ifndef SUBROUTINE_GL40\n"
 	"void colclip(inout vec4 c)\n"
 	"{\n"
 	"#if (PS_COLCLIP == 2)\n"
@ -1256,7 +1206,6 @@ static const char* tfx_fs_all_glsl =
 	"	c.rgb *= vec3(factor);\n"
 	"#endif\n"
 	"}\n"
-	"#endif\n"
 	"\n"
 	"void fog(inout vec4 c, float f)\n"
 	"{\n"
@ -1516,291 +1465,6 @@ static const char* tfx_fs_all_glsl =
 	"}\n"
 	"\n"
 	"#endif\n"
-	"//#version 420 // Keep it for text editor detection\n"
-	"\n"
-	"// Subroutine of standard fs function (I don't know if it will be ever used one day)\n"
-	"\n"
-	"// FIXME crash nvidia\n"
-	"#if 0\n"
-	"// Function pointer type\n"
-	"subroutine vec4 WrapType(vec4 uv);\n"
-	"\n"
-	"// a function pointer variable\n"
-	"layout(location = 4) subroutine uniform WrapType wrapuv;\n"
-	"\n"
-	"layout(index = 24) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms_wmt_2(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out = clamp(uv, MinMax.xyxy, MinMax.zwzw);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 25) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms_wmt3(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out = vec4((ivec4(uv * WH.xyxy) & ivec4(MskFix.xyxy)) | ivec4(MskFix.zwzw)) / WH.xyxy;\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 26) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms2_wmt3(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);\n"
-	"    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 27) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms3_wmt2(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;\n"
-	"    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 28) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms2_wmtx(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 29) subroutine(WrapType)\n"
-	"vec4 wrapuv_wmsx_wmt3(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 30) subroutine(WrapType)\n"
-	"vec4 wrapuv_wms3_wmtx(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 31) subroutine(WrapType)\n"
-	"vec4 wrapuv_wmsx_wmt2(vec4 uv)\n"
-	"{\n"
-	"    vec4 uv_out = uv;\n"
-	"    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 32) subroutine(WrapType)\n"
-	"vec4 wrapuv_dummy(vec4 uv)\n"
-	"{\n"
-	"    return uv;\n"
-	"}\n"
-	"#endif\n"
-	"\n"
-	"// FIXME crash nvidia\n"
-	"#if 0\n"
-	"// Function pointer type\n"
-	"subroutine vec2 ClampType(vec2 uv);\n"
-	"\n"
-	"// a function pointer variable\n"
-	"layout(location = 3) subroutine uniform ClampType clampuv;\n"
-	"\n"
-	"layout(index = 20) subroutine(ClampType)\n"
-	"vec2 clampuv_wms2_wmt2(vec2 uv)\n"
-	"{\n"
-	"    return clamp(uv, MinF, MinMax.zw);\n"
-	"}\n"
-	"\n"
-	"layout(index = 21) subroutine(ClampType)\n"
-	"vec2 clampuv_wms2(vec2 uv)\n"
-	"{\n"
-	"    vec2 uv_out = uv;\n"
-	"    uv_out.x = clamp(uv.x, MinF.x, MinMax.z);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 22) subroutine(ClampType)\n"
-	"vec2 clampuv_wmt2(vec2 uv)\n"
-	"{\n"
-	"    vec2 uv_out = uv;\n"
-	"    uv_out.y = clamp(uv.y, MinF.y, MinMax.w);\n"
-	"    return uv_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 23) subroutine(ClampType)\n"
-	"vec2 clampuv_dummy(vec2 uv)\n"
-	"{\n"
-	"    return uv;\n"
-	"}\n"
-	"#endif\n"
-	"\n"
-	"#ifdef SUBROUTINE_GL40\n"
-	"layout(index = 11) subroutine(TfxType)\n"
-	"vec4 tfx_0_tcc_0(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 12) subroutine(TfxType)\n"
-	"vec4 tfx_1_tcc_0(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = t.rgb;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 13) subroutine(TfxType)\n"
-	"vec4 tfx_2_tcc_0(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 14) subroutine(TfxType)\n"
-	"vec4 tfx_3_tcc_0(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 15) subroutine(TfxType)\n"
-	"vec4 tfx_0_tcc_1(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out = c * t * 255.0f / 128.0f;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 16) subroutine(TfxType)\n"
-	"vec4 tfx_1_tcc_1(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out = t;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 17) subroutine(TfxType)\n"
-	"vec4 tfx_2_tcc_1(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;\n"
-	"    c_out.a += t.a;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 18) subroutine(TfxType)\n"
-	"vec4 tfx_3_tcc_1(vec4 t, vec4 c)\n"
-	"{\n"
-	"    vec4 c_out = c;\n"
-	"    c_out.rgb = c.rgb * t.rgb * 255.0f / 128.0f + c.a;\n"
-	"    c_out.a = t.a;\n"
-	"    return c_out;\n"
-	"}\n"
-	"\n"
-	"layout(index = 19) subroutine(TfxType)\n"
-	"vec4 tfx_dummy(vec4 t, vec4 c)\n"
-	"{\n"
-	"    return c;\n"
-	"}\n"
-	"#endif\n"
-	"\n"
-	"#ifdef SUBROUTINE_GL40\n"
-	"layout(index = 0) subroutine(AlphaTestType)\n"
-	"void atest_never(vec4 c)\n"
-	"{\n"
-	"    discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 1) subroutine(AlphaTestType)\n"
-	"void atest_always(vec4 c)\n"
-	"{\n"
-	"    // Nothing to do\n"
-	"}\n"
-	"\n"
-	"layout(index = 2) subroutine(AlphaTestType)\n"
-	"void atest_l(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if (PS_SPRITEHACK == 0)\n"
-	"        if ((AREF - a - 0.5f) < 0.0f)\n"
-	"            discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 3) subroutine(AlphaTestType)\n"
-	"void atest_le(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if ((AREF - a + 0.5f) < 0.0f)\n"
-	"        discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 4) subroutine(AlphaTestType)\n"
-	"void atest_e(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if ((0.5f - abs(a - AREF)) < 0.0f)\n"
-	"        discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 5) subroutine(AlphaTestType)\n"
-	"void atest_ge(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if ((a-AREF + 0.5f) < 0.0f)\n"
-	"        discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 6) subroutine(AlphaTestType)\n"
-	"void atest_g(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if ((a-AREF - 0.5f) < 0.0f)\n"
-	"        discard;\n"
-	"}\n"
-	"\n"
-	"layout(index = 7) subroutine(AlphaTestType)\n"
-	"void atest_ne(vec4 c)\n"
-	"{\n"
-	"    float a = trunc(c.a * 255.0 + 0.01);\n"
-	"    if ((abs(a - AREF) - 0.5f) < 0.0f)\n"
-	"        discard;\n"
-	"}\n"
-	"#endif\n"
-	"\n"
-	"#ifdef SUBROUTINE_GL40\n"
-	"layout(index = 8) subroutine(ColClipType)\n"
-	"void colclip_0(inout vec4 c)\n"
-	"{\n"
-	"	// nothing to do\n"
-	"}\n"
-	"\n"
-	"layout(index = 9) subroutine(ColClipType)\n"
-	"void colclip_1(inout vec4 c)\n"
-	"{\n"
-	"	// FIXME !!!!\n"
-	"	//c.rgb *= c.rgb < 128./255;\n"
-	"	bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
-	"	c.rgb *= vec3(factor);\n"
-	"}\n"
-	"\n"
-	"layout(index = 10) subroutine(ColClipType)\n"
-	"void colclip_2(inout vec4 c)\n"
-	"{\n"
-	"	c.rgb = 256.0f/255.0f - c.rgb;\n"
-	"	// FIXME !!!!\n"
-	"	//c.rgb *= c.rgb < 128./255;\n"
-	"	bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
-	"	c.rgb *= vec3(factor);\n"
-	"}\n"
-	"#endif\n"
 	;

 static const char* fxaa_fx =