diff --git a/plugins/GSdx/GLLoader.cpp b/plugins/GSdx/GLLoader.cpp index 46436c6c83..70eac560bc 100644 --- a/plugins/GSdx/GLLoader.cpp +++ b/plugins/GSdx/GLLoader.cpp @@ -81,7 +81,9 @@ PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages = NULL; PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL; PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL; PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL; -// GL 4.1 +// GL4.0 +PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL; +// GL4.1 PFNGLBINDPROGRAMPIPELINEPROC gl_BindProgramPipeline = NULL; PFNGLGENPROGRAMPIPELINESPROC gl_GenProgramPipelines = NULL; PFNGLDELETEPROGRAMPIPELINESPROC gl_DeleteProgramPipelines = NULL; @@ -122,9 +124,10 @@ namespace GLLoader { bool found_GL_ARB_clear_texture = false; // Don't know if GL3 hardawe can support it bool found_GL_ARB_buffer_storage = false; // GL4 hardware - bool found_GL_ARB_copy_image = false; + bool found_GL_ARB_copy_image = false; // Not sure actually bool found_GL_ARB_gpu_shader5 = false; bool found_GL_ARB_shader_image_load_store = false; + bool found_GL_ARB_shader_subroutine = false; // Mandatory for FULL GL (but optional for GLES) bool found_GL_ARB_multi_bind = false; // Not yet. Wait Mesa & AMD drivers @@ -221,6 +224,17 @@ namespace GLLoader { if (ext.compare("GL_ARB_copy_image") == 0) found_GL_ARB_copy_image = true; if (ext.compare("GL_ARB_gpu_shader5") == 0) found_GL_ARB_gpu_shader5 = true; if (ext.compare("GL_ARB_shader_image_load_store") == 0) found_GL_ARB_shader_image_load_store = true; +#if 0 + // Strangely it doesn't provide the speed boost as expected. + // Note: only atst/colclip was replaced with subroutine for the moment. It replace 2000 program switch on + // colin mcrae 3 by 2100 uniform, but code is slower! + // + // Current hypothesis: the validation of useprogram is done in the "driver thread" whereas the extra function calls + // are done on the overloaded main threads. 
+ // Apitrace profiling shows faster GPU draw times + + if (ext.compare("GL_ARB_shader_subroutine") == 0) found_GL_ARB_shader_subroutine = true; +#endif #ifdef GL44 // Need to debug the code first if (ext.compare("GL_ARB_clear_texture") == 0) found_GL_ARB_clear_texture = true; if (ext.compare("GL_ARB_multi_bind") == 0) found_GL_ARB_multi_bind = true; @@ -242,6 +256,7 @@ namespace GLLoader { status &= status_and_override(found_GL_ARB_shader_image_load_store,"GL_ARB_shader_image_load_store"); status &= status_and_override(found_GL_ARB_clear_texture,"GL_ARB_clear_texture"); status &= status_and_override(found_GL_ARB_buffer_storage,"GL_ARB_buffer_storage"); + status &= status_and_override(found_GL_ARB_shader_subroutine,"GL_ARB_shader_subroutine"); status &= status_and_override(found_GL_ARB_texture_storage, "GL_ARB_texture_storage", true); status &= status_and_override(found_GL_ARB_shading_language_420pack,"GL_ARB_shading_language_420pack"); diff --git a/plugins/GSdx/GLLoader.h b/plugins/GSdx/GLLoader.h index 6d1cb477e7..4248145734 100644 --- a/plugins/GSdx/GLLoader.h +++ b/plugins/GSdx/GLLoader.h @@ -134,6 +134,8 @@ extern PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages; extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer; extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer; extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData; +// GL4.0 +extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv; // GL4.1 extern PFNGLBINDPROGRAMPIPELINEPROC gl_BindProgramPipeline; extern PFNGLDELETEPROGRAMPIPELINESPROC gl_DeleteProgramPipelines; @@ -254,4 +256,5 @@ namespace GLLoader { extern bool found_GL_ARB_clear_texture; extern bool found_GL_ARB_multi_bind; extern bool found_GL_ARB_buffer_storage; + extern bool found_GL_ARB_shader_subroutine; } diff --git a/plugins/GSdx/GLState.cpp b/plugins/GSdx/GLState.cpp index a85143425b..766efc3af5 100644 --- a/plugins/GSdx/GLState.cpp +++ b/plugins/GSdx/GLState.cpp @@ -62,6 +62,8 @@ namespace GLState { GLuint gs = 0; GLuint vs = 
0; GLuint program = 0; + bool dirty_prog = false; + bool dirty_subroutine_ps = false; #if 0 struct { GSVertexBufferStateOGL* vb; @@ -112,5 +114,7 @@ namespace GLState { gs = 0; vs = 0; program = 0; + dirty_prog = false; + dirty_subroutine_ps = false; } } diff --git a/plugins/GSdx/GLState.h b/plugins/GSdx/GLState.h index 29f9270109..7d0a215ded 100644 --- a/plugins/GSdx/GLState.h +++ b/plugins/GSdx/GLState.h @@ -64,6 +64,8 @@ namespace GLState { extern GLuint gs; extern GLuint vs; extern GLuint program; // monolith program (when sso isn't supported) + extern bool dirty_prog; + extern bool dirty_subroutine_ps; extern void Clear(); } diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp index d32d465c96..bb99c9b19c 100644 --- a/plugins/GSdx/GSDeviceOGL.cpp +++ b/plugins/GSdx/GSDeviceOGL.cpp @@ -168,7 +168,7 @@ bool GSDeviceOGL::Create(GSWnd* wnd) // **************************************************************** // Various object // **************************************************************** - m_shader = new GSShaderOGL(!!theApp.GetConfig("debug_ogl_shader", 1), GLLoader::found_GL_ARB_separate_shader_objects, GLLoader::found_GL_ARB_shading_language_420pack); + m_shader = new GSShaderOGL(!!theApp.GetConfig("debug_ogl_shader", 1)); gl_GenFramebuffers(1, &m_fbo); gl_GenFramebuffers(1, &m_fbo_read); @@ -572,6 +572,7 @@ void GSDeviceOGL::Barrier(GLbitfield b) //#endif } +/* Note: must be here because tfx_glsl is static */ GLuint GSDeviceOGL::CompileVS(VSSelector sel) { std::string macro = format("#define VS_BPPZ %d\n", sel.bppz) @@ -582,6 +583,7 @@ GLuint GSDeviceOGL::CompileVS(VSSelector sel) return m_shader->Compile("tfx.glsl", "vs_main", GL_VERTEX_SHADER, tfx_glsl, macro); } +/* Note: must be here because tfx_glsl is static */ GLuint GSDeviceOGL::CompileGS(GSSelector sel) { // Easy case @@ -598,6 +600,7 @@ GLuint GSDeviceOGL::CompileGS(GSSelector sel) #endif } +/* Note: must be here because tfx_glsl is static */ GLuint 
GSDeviceOGL::CompilePS(PSSelector sel) { std::string macro = format("#define PS_FST %d\n", sel.fst) @@ -720,6 +723,14 @@ void GSDeviceOGL::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, GSVector2i ds = dt->GetSize(); + // WARNING: setup of the program must be done first. So you can setup + // 1/ subroutine uniform + // 2/ bindless texture uniform + // 3/ others uniform? + m_shader->VS(m_convert.vs); + m_shader->GS(0); + m_shader->PS(ps); + // ************************************ // om // ************************************ @@ -764,32 +775,17 @@ void GSDeviceOGL::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, {GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(flip_sr.x, flip_sr.w)}, {GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(flip_sr.z, flip_sr.w)}, }; - //fprintf(stderr, "A:%fx%f B:%fx%f\n", left, top, bottom, right); - //fprintf(stderr, "SR: %f %f %f %f\n", sr.x, sr.y, sr.z, sr.w); IASetVertexState(m_vb_sr); IASetVertexBuffer(vertices, 4); IASetPrimitiveTopology(GL_TRIANGLE_STRIP); // ************************************ - // vs + // Texture // ************************************ - m_shader->VS(m_convert.vs); - - // ************************************ - // gs - // ************************************ - - m_shader->GS(0); - - // ************************************ - // ps - // ************************************ - - PSSetShaderResource(0, static_cast(st)->GetID()); + PSSetShaderResource(static_cast(st)->GetID()); PSSetSamplerState(linear ? m_convert.ln : m_convert.pt); - m_shader->PS(ps); // ************************************ // Draw @@ -886,6 +882,14 @@ void GSDeviceOGL::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* ver ClearStencil(ds, 0); + // WARNING: setup of the program must be done first. So you can setup + // 1/ subroutine uniform + // 2/ bindless texture uniform + // 3/ others uniform? + m_shader->VS(m_convert.vs); + m_shader->GS(0); + m_shader->PS(m_convert.ps[datm ? 
2 : 3]); + // om OMSetDepthStencilState(m_date.dss, 1); @@ -898,19 +902,11 @@ void GSDeviceOGL::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* ver IASetVertexBuffer(vertices, 4); IASetPrimitiveTopology(GL_TRIANGLE_STRIP); - // vs - m_shader->VS(m_convert.vs); + // Texture - // gs - - m_shader->GS(0); - - // ps - - PSSetShaderResource(0, static_cast(rt)->GetID()); + PSSetShaderResource(static_cast(rt)->GetID()); PSSetSamplerState(m_convert.pt); - m_shader->PS(m_convert.ps[datm ? 2 : 3]); // @@ -966,16 +962,16 @@ void GSDeviceOGL::IASetPrimitiveTopology(GLenum topology) m_state.vb->SetTopology(topology); } -void GSDeviceOGL::PSSetShaderResource(const int i, GLuint sr) +void GSDeviceOGL::PSSetShaderResource(GLuint sr) { - if (GLState::tex_unit[i] != sr) { - GLState::tex_unit[i] = sr; + if (GLState::tex_unit[0] != sr) { + GLState::tex_unit[0] = sr; if (GLLoader::found_GL_ARB_multi_bind) { GLuint textures[1] = {sr}; - gl_BindTextures(i, 1, textures); + gl_BindTextures(0, 1, textures); } else { - gl_ActiveTexture(GL_TEXTURE0 + i); + gl_ActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, sr); // Get back to the expected active texture unit @@ -987,8 +983,21 @@ void GSDeviceOGL::PSSetShaderResource(const int i, GLuint sr) void GSDeviceOGL::PSSetShaderResources(GLuint tex[2]) { if (GLState::tex_unit[0] != tex[0] || GLState::tex_unit[1] != tex[1]) { - GLuint textures[2] = {tex[0], tex[1]}; - gl_BindTextures(0, 2, textures); + GLState::tex_unit[0] = tex[0]; + GLState::tex_unit[1] = tex[1]; + + if (GLLoader::found_GL_ARB_multi_bind) { + gl_BindTextures(0, 2, tex); + } else { + gl_ActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, tex[0]); + + gl_ActiveTexture(GL_TEXTURE0 + 1); + glBindTexture(GL_TEXTURE_2D, tex[1]); + + // Get back to the expected active texture unit + gl_ActiveTexture(GL_TEXTURE0 + 3); + } } } diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h index ab2e596e61..2b0987f1a5 100644 --- a/plugins/GSdx/GSDeviceOGL.h +++ 
b/plugins/GSdx/GSDeviceOGL.h @@ -606,10 +606,9 @@ class GSDeviceOGL : public GSDevice void IASetIndexBuffer(const void* index, size_t count); void IASetVertexState(GSVertexBufferStateOGL* vb = NULL); - void PSSetShaderResource(const int i, GLuint sr); + void PSSetShaderResource(GLuint sr); void PSSetShaderResources(GLuint tex[2]); void PSSetSamplerState(GLuint ss); - void PSSetSamplerStates(const int count, const GLuint* samplers); void OMSetDepthStencilState(GSDepthStencilOGL* dss, uint8 sref); void OMSetBlendState(GSBlendStateOGL* bs, float bf); @@ -627,9 +626,10 @@ class GSDeviceOGL : public GSDevice void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim); - void SetupVS(VSSelector sel, const VSConstantBuffer* cb); + void SetupVS(VSSelector sel); void SetupGS(GSSelector sel); - void SetupPS(PSSelector sel, const PSConstantBuffer* cb); + void SetupPS(PSSelector sel); + void SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb); void SetupSampler(PSSelector sel, PSSamplerSelector ssel); void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix); diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 11b5ac3a8e..cb8aae54f6 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -163,6 +163,7 @@ void GSRendererOGL::SetupIA() dev->IASetVertexState(); if(UserHacks_WildHack && !isPackedUV_HackFlag) { + // FIXME: why not put it on the Vertex shader if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertex), m_vertex.next)) { GSVector4i::storent(ptr, m_vertex.buff, sizeof(GSVertex) * m_vertex.next); @@ -441,6 +442,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour om_dssel.alpha_stencil = 1; } + // By default don't use texture + ps_sel.tfx = 4; + if(tex) { const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[context->TEX0.PSM]; @@ -456,6 +460,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, 
GSTextureCache::Sour ps_sel.tcc = context->TEX0.TCC; ps_sel.ltf = bilinear && !simple_sample; ps_sel.spritehack = tex->m_spritehack_t; + // FIXME the ati is currently disabled on the shader. I need to find a .gs to test that we got same + // bug on opengl ps_sel.point_sampler = !(bilinear && simple_sample); int w = tex->m_texture->GetWidth(); @@ -491,23 +497,30 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_ssel.tau = (context->CLAMP.WMS + 3) >> 1; ps_ssel.tav = (context->CLAMP.WMT + 3) >> 1; ps_ssel.ltf = bilinear && simple_sample; - - dev->SetupSampler(ps_sel, ps_ssel); - if (tex->m_palette) { - if (GLLoader::found_GL_ARB_multi_bind) { - GLuint textures[2] = {static_cast(tex->m_texture)->GetID(), static_cast(tex->m_palette)->GetID()}; - dev->PSSetShaderResources(textures); - } else { - dev->PSSetShaderResource(1, static_cast(tex->m_palette)->GetID()); - dev->PSSetShaderResource(0, static_cast(tex->m_texture)->GetID()); - } - } else { - dev->PSSetShaderResource(0, static_cast(tex->m_texture)->GetID()); - } } - else - { - ps_sel.tfx = 4; + + // WARNING: setup of the program must be done first. So you can setup + // 1/ subroutine uniform + // 2/ bindless texture uniform + // 3/ others uniform? 
+ dev->SetupVS(vs_sel); + dev->SetupGS(gs_sel); + dev->SetupPS(ps_sel); + + // Note: bindless texture will use uniform so it must be done after the program setup + if(tex) { + if (tex->m_palette) { + // 2 textures (main + palette) + dev->SetupSampler(ps_sel, ps_ssel); + + GLuint textures[2] = {static_cast(tex->m_texture)->GetID(), static_cast(tex->m_palette)->GetID()}; + dev->PSSetShaderResources(textures); + } else if (tex->m_texture) { + // Only main texture + dev->SetupSampler(ps_sel, ps_ssel); + + dev->PSSetShaderResource(static_cast(tex->m_texture)->GetID()); + } } // rs @@ -521,9 +534,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour SetupIA(); dev->SetupOM(om_dssel, om_bsel, afix); - dev->SetupVS(vs_sel, &vs_cb); - dev->SetupGS(gs_sel); - dev->SetupPS(ps_sel, &ps_cb); + dev->SetupCB(&vs_cb, &ps_cb); if (advance_DATE) { // Create an r32ui image that will contain primitive ID @@ -539,7 +550,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour dev->OMSetWriteBuffer(); ps_sel.date = 3; - dev->SetupPS(ps_sel, &ps_cb); + dev->SetupPS(ps_sel); // Be sure that first pass is finished ! 
dev->Barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); @@ -558,7 +569,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_selneg.colclip = 2; dev->SetupOM(om_dssel, om_bselneg, afix); - dev->SetupPS(ps_selneg, &ps_cb); + dev->SetupPS(ps_selneg); dev->DrawIndexedPrimitive(); dev->SetupOM(om_dssel, om_bsel, afix); @@ -573,7 +584,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_sel.atst = iatst[ps_sel.atst]; - dev->SetupPS(ps_sel, &ps_cb); + dev->SetupPS(ps_sel); bool z = om_dssel.zwe; bool r = om_bsel.wr; @@ -583,11 +594,11 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour switch(context->TEST.AFAIL) { - case AFAIL_KEEP: z = r = g = b = a = false; break; // none - case AFAIL_FB_ONLY: z = false; break; // rgba - case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z - case AFAIL_RGB_ONLY: z = a = false; break; // rgb - default: __assume(0); + case AFAIL_KEEP: z = r = g = b = a = false; break; // none + case AFAIL_FB_ONLY: z = false; break; // rgba + case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z + case AFAIL_RGB_ONLY: z = a = false; break; // rgb + default: __assume(0); } if(z || r || g || b || a) @@ -611,7 +622,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_selneg.colclip = 2; dev->SetupOM(om_dssel, om_bselneg, afix); - dev->SetupPS(ps_selneg, &ps_cb); + dev->SetupPS(ps_selneg); dev->DrawIndexedPrimitive(); } diff --git a/plugins/GSdx/GSShaderOGL.cpp b/plugins/GSdx/GSShaderOGL.cpp index e217de63aa..61a298f500 100644 --- a/plugins/GSdx/GSShaderOGL.cpp +++ b/plugins/GSdx/GSShaderOGL.cpp @@ -23,14 +23,16 @@ #include "GSShaderOGL.h" #include "GLState.h" -GSShaderOGL::GSShaderOGL(bool debug, bool sso, bool glsl420) : +GSShaderOGL::GSShaderOGL(bool debug) : m_debug_shader(debug), - m_sso(sso), - m_glsl420(glsl420) + m_sub_count(0) { + + memset(&m_ps_sub, 0, countof(m_ps_sub)*sizeof(GLuint)); + m_single_prog.clear(); 
#ifndef ENABLE_GLES - if (sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { gl_GenProgramPipelines(1, &m_pipeline); gl_BindProgramPipeline(m_pipeline); } @@ -40,7 +42,7 @@ GSShaderOGL::GSShaderOGL(bool debug, bool sso, bool glsl420) : GSShaderOGL::~GSShaderOGL() { #ifndef ENABLE_GLES - if (m_sso) + if (GLLoader::found_GL_ARB_separate_shader_objects) gl_DeleteProgramPipelines(1, &m_pipeline); #endif @@ -53,21 +55,35 @@ void GSShaderOGL::VS(GLuint s) if (GLState::vs != s) { GLState::vs = s; + GLState::dirty_prog = true; #ifndef ENABLE_GLES - if (m_sso) + if (GLLoader::found_GL_ARB_separate_shader_objects) gl_UseProgramStages(m_pipeline, GL_VERTEX_SHADER_BIT, s); #endif } } -void GSShaderOGL::PS(GLuint s) +void GSShaderOGL::PS_subroutine(GLuint *sub) +{ + if (!(m_ps_sub[0] == sub[0] && m_ps_sub[1] == sub[1])) { + m_ps_sub[0] = sub[0]; + m_ps_sub[1] = sub[1]; + GLState::dirty_subroutine_ps = true; + } +} + +void GSShaderOGL::PS(GLuint s, GLuint sub_count) { if (GLState::ps != s) { + m_sub_count = sub_count; + GLState::ps = s; + GLState::dirty_prog = true; #ifndef ENABLE_GLES - if (m_sso) + if (GLLoader::found_GL_ARB_separate_shader_objects) { gl_UseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s); + } #endif } } @@ -77,8 +93,9 @@ void GSShaderOGL::GS(GLuint s) if (GLState::gs != s) { GLState::gs = s; + GLState::dirty_prog = true; #ifndef ENABLE_GLES - if (m_sso) + if (GLLoader::found_GL_ARB_separate_shader_objects) gl_UseProgramStages(m_pipeline, GL_GEOMETRY_SHADER_BIT, s); #endif } @@ -97,7 +114,7 @@ void GSShaderOGL::SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding) { GLint loc = gl_GetUniformLocation(prog, name); if (loc != -1) { - if (m_sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { #ifndef ENABLE_GLES gl_ProgramUniform1i(prog, loc, binding); #endif @@ -109,9 +126,9 @@ void GSShaderOGL::SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding) void GSShaderOGL::SetupUniform() { - if (m_glsl420) return; + if 
(GLLoader::found_GL_ARB_shading_language_420pack) return; - if (m_sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { SetUniformBinding(GLState::vs, "cb20", 20); SetUniformBinding(GLState::ps, "cb21", 21); @@ -138,6 +155,17 @@ void GSShaderOGL::SetupUniform() } } +void GSShaderOGL::SetSubroutineUniform() +{ + if (!GLLoader::found_GL_ARB_shader_subroutine) return; + if (m_sub_count == 0) return; + + if (GLState::dirty_subroutine_ps || GLState::dirty_prog) + gl_UniformSubroutinesuiv(GL_FRAGMENT_SHADER, m_sub_count, m_ps_sub); + + GLState::dirty_subroutine_ps = false; +} + bool GSShaderOGL::ValidateShader(GLuint s) { if (!m_debug_shader) return true; @@ -223,37 +251,44 @@ GLuint GSShaderOGL::LinkNewProgram() void GSShaderOGL::UseProgram() { - hash_map::iterator it; - if (!m_sso) { - // Note: shader are integer lookup pointer. They start from 1 and incr - // every time you create a new shader OR a new program. - // Note2: vs & gs are precompiled at startup. FGLRX and radeon got value < 128. - // We migth be able to pack the value in a 32bits int - // I would need to check the behavior on Nvidia (pause/resume). - uint64 sel = (uint64)GLState::vs << 40 | (uint64)GLState::gs << 20 | GLState::ps; - it = m_single_prog.find(sel); - if (it == m_single_prog.end()) { - GLState::program = LinkNewProgram(); - m_single_prog[sel] = GLState::program; + if (GLState::dirty_prog) { + if (!GLLoader::found_GL_ARB_separate_shader_objects) { + hash_map::iterator it; + // Note: shader are integer lookup pointer. They start from 1 and incr + // every time you create a new shader OR a new program. + // Note2: vs & gs are precompiled at startup. FGLRX and radeon got value < 128. + // We migth be able to pack the value in a 32bits int + // I would need to check the behavior on Nvidia (pause/resume). 
+ uint64 sel = (uint64)GLState::vs << 40 | (uint64)GLState::gs << 20 | GLState::ps; + it = m_single_prog.find(sel); + if (it == m_single_prog.end()) { + GLState::program = LinkNewProgram(); + m_single_prog[sel] = GLState::program; - ValidateProgram(GLState::program); + ValidateProgram(GLState::program); - gl_UseProgram(GLState::program); - // warning it must be done after the "setup" of the program - SetupUniform(); - } else { - GLuint prog = it->second; - if (prog != GLState::program) { - GLState::program = prog; gl_UseProgram(GLState::program); + + // warning it must be done after the "setup" of the program + SetupUniform(); + } else { + GLuint prog = it->second; + if (prog != GLState::program) { + GLState::program = prog; + gl_UseProgram(GLState::program); + } } + + } else { + ValidatePipeline(m_pipeline); + + SetupUniform(); } - - } else { - ValidatePipeline(m_pipeline); - - SetupUniform(); } + + SetSubroutineUniform(); + + GLState::dirty_prog = false; } std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, const std::string& macro) @@ -266,13 +301,13 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co } else { header = "#version 330 core\n"; } - if (m_glsl420) { + if (GLLoader::found_GL_ARB_shading_language_420pack) { // Need GL version 420 header += "#extension GL_ARB_shading_language_420pack: require\n"; } else { header += "#define DISABLE_GL42\n"; } - if (m_sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { // Need GL version 410 header += "#extension GL_ARB_separate_shader_objects : require\n"; } else { @@ -284,6 +319,10 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co // Need version 140 header += "#extension GL_ARB_uniform_buffer_object : require\n"; } + if (GLLoader::found_GL_ARB_shader_subroutine) { + // Need GL version 400 + header += "#define SUBROUTINE_GL40 1\n"; + } #ifdef ENABLE_OGL_STENCIL_DEBUG header += "#define ENABLE_OGL_STENCIL_DEBUG 1\n"; 
#endif @@ -351,7 +390,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent sources[0] = header.append(glsl_h_code).c_str(); #endif - if (m_sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { #ifndef ENABLE_GLES program = gl_CreateShaderProgramv(type, shader_nb, sources); #endif @@ -362,7 +401,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent } bool status; - if (m_sso) + if (GLLoader::found_GL_ARB_separate_shader_objects) status = ValidateProgram(program); else status = ValidateShader(program); @@ -378,7 +417,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent void GSShaderOGL::Delete(GLuint s) { - if (m_sso) { + if (GLLoader::found_GL_ARB_separate_shader_objects) { gl_DeleteProgram(s); } else { gl_DeleteShader(s); diff --git a/plugins/GSdx/GSShaderOGL.h b/plugins/GSdx/GSShaderOGL.h index de5920e244..7423a41b72 100644 --- a/plugins/GSdx/GSShaderOGL.h +++ b/plugins/GSdx/GSShaderOGL.h @@ -25,10 +25,11 @@ class GSShaderOGL { GLuint m_pipeline; hash_map m_single_prog; const bool m_debug_shader; - const bool m_sso; - const bool m_glsl420; + GLuint m_sub_count; + GLuint m_ps_sub[2]; + void SetSubroutineUniform(); void SetupUniform(); void SetUniformBinding(GLuint prog, GLchar* name, GLuint binding); void SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding); @@ -41,11 +42,12 @@ class GSShaderOGL { GLuint LinkNewProgram(); public: - GSShaderOGL(bool debug, bool sso, bool glsl420); + GSShaderOGL(bool debug); ~GSShaderOGL(); void GS(GLuint s); - void PS(GLuint s); + void PS(GLuint s, GLuint sub_count = 0); + void PS_subroutine(GLuint *sub); void VS(GLuint s); void UseProgram(); diff --git a/plugins/GSdx/GSTextureFXOGL.cpp b/plugins/GSdx/GSTextureFXOGL.cpp index 3cf9431c2d..b58392a02c 100644 --- a/plugins/GSdx/GSTextureFXOGL.cpp +++ b/plugins/GSdx/GSTextureFXOGL.cpp @@ -133,14 +133,22 @@ GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, uint8 
afix) return bs; } -void GSDeviceOGL::SetupVS(VSSelector sel, const VSConstantBuffer* cb) +void GSDeviceOGL::SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb) +{ + if(m_vs_cb_cache.Update(vs_cb)) { + m_vs_cb->upload(vs_cb); + } + + if(m_ps_cb_cache.Update(ps_cb)) { + m_ps_cb->upload(ps_cb); + } + +} + +void GSDeviceOGL::SetupVS(VSSelector sel) { GLuint vs = m_vs[sel]; - if(m_vs_cb_cache.Update(cb)) { - m_vs_cb->upload(cb); - } - m_shader->VS(vs); } @@ -151,8 +159,16 @@ void GSDeviceOGL::SetupGS(GSSelector sel) m_shader->GS(gs); } -void GSDeviceOGL::SetupPS(PSSelector sel, const PSConstantBuffer* cb) +void GSDeviceOGL::SetupPS(PSSelector sel) { + if (GLLoader::found_GL_ARB_shader_subroutine) { + GLuint sub[2] = {sel.atst, (uint32)sel.colclip + 8}; + m_shader->PS_subroutine(sub); + // atst/colclip are now selected via the subroutine uniforms (tfx.glsl indices 0-7 and 8-10, hence the +8), so clear them from the static selector + sel.atst = 0; + sel.colclip = 0; + } + // ************************************************************* // Static // ************************************************************* @@ -169,11 +185,7 @@ void GSDeviceOGL::SetupPS(PSSelector sel, const PSConstantBuffer* cb) // ************************************************************* // Dynamic // ************************************************************* - if(m_ps_cb_cache.Update(cb)) { - m_ps_cb->upload(cb); - } - - m_shader->PS(ps); + m_shader->PS(ps, 2); } void GSDeviceOGL::SetupSampler(PSSelector sel, PSSamplerSelector ssel) diff --git a/plugins/GSdx/GSTextureOGL.cpp b/plugins/GSdx/GSTextureOGL.cpp index 9f41dc768f..61fdc29d3b 100644 --- a/plugins/GSdx/GSTextureOGL.cpp +++ b/plugins/GSdx/GSTextureOGL.cpp @@ -177,7 +177,6 @@ GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read) break; default: break; } - } GSTextureOGL::~GSTextureOGL() @@ -210,7 +209,6 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch) EnableUnit(); -#if 1 PboPool::BindPbo(); glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); @@ -230,47 +228,19 @@ bool 
GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch) PboPool::UnbindPbo(); return true; -#else + +#if 0 // pitch is in byte wherease GL_UNPACK_ROW_LENGTH is in pixel glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift); -#ifdef _LINUX - if (GLLoader::fglrx_buggy_driver && !GLLoader::in_replayer) { - // FIXME : it crash on colin mcrae rally 3 (others game too) when the texture is small - //if ((pitch >> 2) == 32 || r.width() < 32 || r.height() < 32) { - if ((r.width() < 32) || (pitch == 128 && r.width() == 32)) { -#ifdef ENABLE_OGL_DEBUG - fprintf(stderr, "Skip Texture %dx%d with a pitch of %d pixel. Type %x\n", m_size.x, m_size.y, pitch >>2, m_format); - fprintf(stderr, "Box (%d,%d)x(%d,%d)\n", r.x, r.y, r.width(), r.height()); -#endif - - // FIXME useful? - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); // Restore default behavior - return false; - } - } -#endif - glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, data); // FIXME useful? 
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); // Restore default behavior return true; -#if 0 - if(m_dev && m_texture) - { - D3D11_BOX box = {r.left, r.top, 0, r.right, r.bottom, 1}; - - m_ctx->UpdateSubresource(m_texture, 0, &box, data, pitch, 0); - - return true; - } - - return false; -#endif #endif } diff --git a/plugins/GSdx/GSWnd.cpp b/plugins/GSdx/GSWnd.cpp index 7bcddf9ed9..d61b2aea90 100644 --- a/plugins/GSdx/GSWnd.cpp +++ b/plugins/GSdx/GSWnd.cpp @@ -82,6 +82,8 @@ void GSWndGL::PopulateGlFunction() *(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer"); *(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer"); *(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData"); + // GL4.0 + *(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv"); // GL4.1 *(void**)&(gl_BindProgramPipeline) = GetProcAddress("glBindProgramPipeline"); *(void**)&(gl_DeleteProgramPipelines) = GetProcAddress("glDeleteProgramPipelines"); diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h index 657b0a742a..7faf30caed 100644 --- a/plugins/GSdx/res/glsl_source.h +++ b/plugins/GSdx/res/glsl_source.h @@ -1166,6 +1166,77 @@ static const char* tfx_glsl = "#endif\n" "}\n" "\n" + "// Note layout stuff might require gl4.3\n" + "#ifdef SUBROUTINE_GL40\n" + "// Function pointer type\n" + "subroutine void AlphaTestType(vec4 c);\n" + "\n" + "// a function pointer variable\n" + "layout(location = 0) subroutine uniform AlphaTestType atst;\n" + "\n" + "// The function attached to AlphaTestType\n" + "layout(index = 0) subroutine(AlphaTestType)\n" + "void atest_never(vec4 c)\n" + "{\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 1) subroutine(AlphaTestType)\n" + "void atest_always(vec4 c)\n" + "{\n" + " // Nothing to do\n" + "}\n" + "\n" + "layout(index = 2) subroutine(AlphaTestType)\n" + "void atest_l(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if (PS_SPRITEHACK == 0)\n" + " if ((AREF 
- a - 0.5f) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 3) subroutine(AlphaTestType)\n" + "void atest_le(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if ((AREF - a + 0.5f) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 4) subroutine(AlphaTestType)\n" + "void atest_e(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if ((0.5f - abs(a - AREF)) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 5) subroutine(AlphaTestType)\n" + "void atest_ge(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if ((a-AREF + 0.5f) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 6) subroutine(AlphaTestType)\n" + "void atest_g(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if ((a-AREF - 0.5f) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "layout(index = 7) subroutine(AlphaTestType)\n" + "void atest_ne(vec4 c)\n" + "{\n" + " float a = trunc(c.a * 255.0 + 0.01);\n" + " if ((abs(a - AREF) - 0.5f) < 0.0f)\n" + " discard;\n" + "}\n" + "\n" + "#else\n" "void atst(vec4 c)\n" "{\n" " float a = trunc(c.a * 255.0 + 0.01);\n" @@ -1210,16 +1281,64 @@ static const char* tfx_glsl = " discard;\n" " }\n" "}\n" + "#endif\n" "\n" - "vec4 fog(vec4 c, float f)\n" + "// Note layout stuff might require gl4.3\n" + "#ifdef SUBROUTINE_GL40\n" + "// Function pointer type\n" + "subroutine void ColClipType(inout vec4 c);\n" + "\n" + "// a function pointer variable\n" + "layout(location = 1) subroutine uniform ColClipType colclip;\n" + "\n" + "layout(index = 8) subroutine(ColClipType)\n" + "void colclip_0(inout vec4 c)\n" + "{\n" + " // nothing to do\n" + "}\n" + "\n" + "layout(index = 9) subroutine(ColClipType)\n" + "void colclip_1(inout vec4 c)\n" + "{\n" + " // FIXME !!!!\n" + " //c.rgb *= c.rgb < 128./255;\n" + " bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n" + " c.rgb *= vec3(factor);\n" + "}\n" + "\n" + "layout(index = 10) subroutine(ColClipType)\n" + "void 
colclip_2(inout vec4 c)\n" + "{\n" + " c.rgb = 256.0f/255.0f - c.rgb;\n" + " // FIXME !!!!\n" + " //c.rgb *= c.rgb < 128./255;\n" + " bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n" + " c.rgb *= vec3(factor);\n" + "}\n" + "\n" + "#else\n" + "void colclip(inout vec4 c)\n" + "{\n" + " if (PS_COLCLIP == 2)\n" + " {\n" + " c.rgb = 256.0f/255.0f - c.rgb;\n" + " }\n" + " if (PS_COLCLIP > 0)\n" + " {\n" + " // FIXME !!!!\n" + " //c.rgb *= c.rgb < 128./255;\n" + " bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n" + " c.rgb *= vec3(factor);\n" + " }\n" + "}\n" + "#endif\n" + "\n" + "void fog(inout vec4 c, float f) // inout: writes to c must reach the caller\n" "{\n" - " vec4 c_out = c;\n" " if(PS_FOG != 0)\n" " {\n" - " c_out.rgb = mix(FogColor, c.rgb, f);\n" + " c.rgb = mix(FogColor, c.rgb, f);\n" " }\n" - "\n" - " return c_out;\n" "}\n" "\n" "vec4 ps_color()\n" @@ -1232,19 +1351,9 @@ static const char* tfx_glsl = "\n" " atst(c);\n" "\n" - " c = fog(c, PSin_t.z);\n" + " fog(c, PSin_t.z);\n" "\n" - " if (PS_COLCLIP == 2)\n" - " {\n" - " c.rgb = 256.0f/255.0f - c.rgb;\n" - " }\n" - " if (PS_COLCLIP > 0)\n" - " {\n" - " // FIXME !!!!\n" - " //c.rgb *= c.rgb < 128./255;\n" - " bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n" - " c.rgb *= vec3(factor);\n" - " }\n" + " colclip(c);\n" "\n" " if(PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes\n" " {\n" diff --git a/plugins/GSdx/res/tfx.glsl b/plugins/GSdx/res/tfx.glsl index d34b975bf5..5a11a5d6e8 100644 --- a/plugins/GSdx/res/tfx.glsl +++ b/plugins/GSdx/res/tfx.glsl @@ -656,6 +656,77 @@ void datst() #endif } +// Note layout stuff might require gl4.3 +#ifdef SUBROUTINE_GL40 +// Function pointer type +subroutine void AlphaTestType(vec4 c); + +// a function pointer variable +layout(location = 0) subroutine uniform AlphaTestType atst; + +// The function attached to AlphaTestType +layout(index = 0) subroutine(AlphaTestType) +void atest_never(vec4 c) +{ + discard; +} + +layout(index = 1) 
subroutine(AlphaTestType) +void atest_always(vec4 c) +{ + // Nothing to do +} + +layout(index = 2) subroutine(AlphaTestType) +void atest_l(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if (PS_SPRITEHACK == 0) + if ((AREF - a - 0.5f) < 0.0f) + discard; +} + +layout(index = 3) subroutine(AlphaTestType) +void atest_le(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if ((AREF - a + 0.5f) < 0.0f) + discard; +} + +layout(index = 4) subroutine(AlphaTestType) +void atest_e(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if ((0.5f - abs(a - AREF)) < 0.0f) + discard; +} + +layout(index = 5) subroutine(AlphaTestType) +void atest_ge(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if ((a-AREF + 0.5f) < 0.0f) + discard; +} + +layout(index = 6) subroutine(AlphaTestType) +void atest_g(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if ((a-AREF - 0.5f) < 0.0f) + discard; +} + +layout(index = 7) subroutine(AlphaTestType) +void atest_ne(vec4 c) +{ + float a = trunc(c.a * 255.0 + 0.01); + if ((abs(a - AREF) - 0.5f) < 0.0f) + discard; +} + +#else void atst(vec4 c) { float a = trunc(c.a * 255.0 + 0.01); @@ -700,16 +771,64 @@ void atst(vec4 c) discard; } } +#endif -vec4 fog(vec4 c, float f) +// Note layout stuff might require gl4.3 +#ifdef SUBROUTINE_GL40 +// Function pointer type +subroutine void ColClipType(inout vec4 c); + +// a function pointer variable +layout(location = 1) subroutine uniform ColClipType colclip; + +layout(index = 8) subroutine(ColClipType) +void colclip_0(inout vec4 c) +{ + // nothing to do +} + +layout(index = 9) subroutine(ColClipType) +void colclip_1(inout vec4 c) +{ + // FIXME !!!! + //c.rgb *= c.rgb < 128./255; + bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f); + c.rgb *= vec3(factor); +} + +layout(index = 10) subroutine(ColClipType) +void colclip_2(inout vec4 c) +{ + c.rgb = 256.0f/255.0f - c.rgb; + // FIXME !!!! 
+ //c.rgb *= c.rgb < 128./255; + bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f); + c.rgb *= vec3(factor); +} + +#else +void colclip(inout vec4 c) +{ + if (PS_COLCLIP == 2) + { + c.rgb = 256.0f/255.0f - c.rgb; + } + if (PS_COLCLIP > 0) + { + // FIXME !!!! + //c.rgb *= c.rgb < 128./255; + bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f); + c.rgb *= vec3(factor); + } +} +#endif + +void fog(inout vec4 c, float f) // inout: writes to c must reach the caller { - vec4 c_out = c; if(PS_FOG != 0) { - c_out.rgb = mix(FogColor, c.rgb, f); + c.rgb = mix(FogColor, c.rgb, f); } - - return c_out; } vec4 ps_color() @@ -722,19 +841,9 @@ vec4 ps_color() atst(c); - c = fog(c, PSin_t.z); + fog(c, PSin_t.z); - if (PS_COLCLIP == 2) - { - c.rgb = 256.0f/255.0f - c.rgb; - } - if (PS_COLCLIP > 0) - { - // FIXME !!!! - //c.rgb *= c.rgb < 128./255; - bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f); - c.rgb *= vec3(factor); - } + colclip(c); if(PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes {