gsdx ogl: Test the ARB_shader_subroutine GL4.0 extension

The idea was to replace shader program switches with function-pointer (subroutine) calls
inside the shaders, at least for the parameters that often change between draw calls. So far
I have only ported atst and colclip. Unfortunately the code is "slower" (on GSdx standalone).
For the moment, keep the code but leave it disabled.

If I understand correctly, program validation is done in the "driver thread",
whereas the additional calls are made in the already overloaded MTGS thread. Apitrace
profiling shows faster GPU draw calls. Another possibility is that the driver still
needs to validate the draw call because of other state changes.

Here are some stats on colin3 (90 frames):
without subroutine: UseProgram 125246
with subroutine: UseProgram 2906, subroutine 125945 => 3605 extra calls of overhead (not
all parameters are ported to subroutines yet)



git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5715 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2013-08-10 19:43:59 +00:00
parent c9755361ec
commit 0f603a98d5
14 changed files with 479 additions and 192 deletions

View File

@ -81,7 +81,9 @@ PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages = NULL;
PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL;
PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL;
PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL;
// GL 4.1
// GL4.0
PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL;
// GL4.1
PFNGLBINDPROGRAMPIPELINEPROC gl_BindProgramPipeline = NULL;
PFNGLGENPROGRAMPIPELINESPROC gl_GenProgramPipelines = NULL;
PFNGLDELETEPROGRAMPIPELINESPROC gl_DeleteProgramPipelines = NULL;
@ -122,9 +124,10 @@ namespace GLLoader {
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 hardawe can support it
bool found_GL_ARB_buffer_storage = false;
// GL4 hardware
bool found_GL_ARB_copy_image = false;
bool found_GL_ARB_copy_image = false; // Not sure actually
bool found_GL_ARB_gpu_shader5 = false;
bool found_GL_ARB_shader_image_load_store = false;
bool found_GL_ARB_shader_subroutine = false;
// Mandatory for FULL GL (but optional for GLES)
bool found_GL_ARB_multi_bind = false; // Not yet. Wait Mesa & AMD drivers
@ -221,6 +224,17 @@ namespace GLLoader {
if (ext.compare("GL_ARB_copy_image") == 0) found_GL_ARB_copy_image = true;
if (ext.compare("GL_ARB_gpu_shader5") == 0) found_GL_ARB_gpu_shader5 = true;
if (ext.compare("GL_ARB_shader_image_load_store") == 0) found_GL_ARB_shader_image_load_store = true;
#if 0
// Strangely, it doesn't provide the expected speed boost.
// Note: only atst/colclip have been replaced with subroutines for the moment. It replaces 2000 program switches on
// colin mcrae 3 with 2100 uniform updates, but the code is slower!
//
// Current hypothesis: the validation of useprogram is done in the "driver thread" whereas the extra function calls
// are done on the overloaded main thread.
// Apitrace profiling shows faster GPU draw times
if (ext.compare("GL_ARB_shader_subroutine") == 0) found_GL_ARB_shader_subroutine = true;
#endif
#ifdef GL44 // Need to debug the code first
if (ext.compare("GL_ARB_clear_texture") == 0) found_GL_ARB_clear_texture = true;
if (ext.compare("GL_ARB_multi_bind") == 0) found_GL_ARB_multi_bind = true;
@ -242,6 +256,7 @@ namespace GLLoader {
status &= status_and_override(found_GL_ARB_shader_image_load_store,"GL_ARB_shader_image_load_store");
status &= status_and_override(found_GL_ARB_clear_texture,"GL_ARB_clear_texture");
status &= status_and_override(found_GL_ARB_buffer_storage,"GL_ARB_buffer_storage");
status &= status_and_override(found_GL_ARB_shader_subroutine,"GL_ARB_shader_subroutine");
status &= status_and_override(found_GL_ARB_texture_storage, "GL_ARB_texture_storage", true);
status &= status_and_override(found_GL_ARB_shading_language_420pack,"GL_ARB_shading_language_420pack");

View File

@ -134,6 +134,8 @@ extern PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages;
extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer;
extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer;
extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData;
// GL4.0
extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv;
// GL4.1
extern PFNGLBINDPROGRAMPIPELINEPROC gl_BindProgramPipeline;
extern PFNGLDELETEPROGRAMPIPELINESPROC gl_DeleteProgramPipelines;
@ -254,4 +256,5 @@ namespace GLLoader {
extern bool found_GL_ARB_clear_texture;
extern bool found_GL_ARB_multi_bind;
extern bool found_GL_ARB_buffer_storage;
extern bool found_GL_ARB_shader_subroutine;
}

View File

@ -62,6 +62,8 @@ namespace GLState {
GLuint gs = 0;
GLuint vs = 0;
GLuint program = 0;
bool dirty_prog = false;
bool dirty_subroutine_ps = false;
#if 0
struct {
GSVertexBufferStateOGL* vb;
@ -112,5 +114,7 @@ namespace GLState {
gs = 0;
vs = 0;
program = 0;
dirty_prog = false;
dirty_subroutine_ps = false;
}
}

View File

@ -64,6 +64,8 @@ namespace GLState {
extern GLuint gs;
extern GLuint vs;
extern GLuint program; // monolith program (when sso isn't supported)
extern bool dirty_prog;
extern bool dirty_subroutine_ps;
extern void Clear();
}

View File

@ -168,7 +168,7 @@ bool GSDeviceOGL::Create(GSWnd* wnd)
// ****************************************************************
// Various object
// ****************************************************************
m_shader = new GSShaderOGL(!!theApp.GetConfig("debug_ogl_shader", 1), GLLoader::found_GL_ARB_separate_shader_objects, GLLoader::found_GL_ARB_shading_language_420pack);
m_shader = new GSShaderOGL(!!theApp.GetConfig("debug_ogl_shader", 1));
gl_GenFramebuffers(1, &m_fbo);
gl_GenFramebuffers(1, &m_fbo_read);
@ -572,6 +572,7 @@ void GSDeviceOGL::Barrier(GLbitfield b)
//#endif
}
/* Note: must be here because tfx_glsl is static */
GLuint GSDeviceOGL::CompileVS(VSSelector sel)
{
std::string macro = format("#define VS_BPPZ %d\n", sel.bppz)
@ -582,6 +583,7 @@ GLuint GSDeviceOGL::CompileVS(VSSelector sel)
return m_shader->Compile("tfx.glsl", "vs_main", GL_VERTEX_SHADER, tfx_glsl, macro);
}
/* Note: must be here because tfx_glsl is static */
GLuint GSDeviceOGL::CompileGS(GSSelector sel)
{
// Easy case
@ -598,6 +600,7 @@ GLuint GSDeviceOGL::CompileGS(GSSelector sel)
#endif
}
/* Note: must be here because tfx_glsl is static */
GLuint GSDeviceOGL::CompilePS(PSSelector sel)
{
std::string macro = format("#define PS_FST %d\n", sel.fst)
@ -720,6 +723,14 @@ void GSDeviceOGL::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt,
GSVector2i ds = dt->GetSize();
// WARNING: setup of the program must be done first. So you can setup
// 1/ subroutine uniform
// 2/ bindless texture uniform
// 3/ others uniform?
m_shader->VS(m_convert.vs);
m_shader->GS(0);
m_shader->PS(ps);
// ************************************
// om
// ************************************
@ -764,32 +775,17 @@ void GSDeviceOGL::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt,
{GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(flip_sr.x, flip_sr.w)},
{GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(flip_sr.z, flip_sr.w)},
};
//fprintf(stderr, "A:%fx%f B:%fx%f\n", left, top, bottom, right);
//fprintf(stderr, "SR: %f %f %f %f\n", sr.x, sr.y, sr.z, sr.w);
IASetVertexState(m_vb_sr);
IASetVertexBuffer(vertices, 4);
IASetPrimitiveTopology(GL_TRIANGLE_STRIP);
// ************************************
// vs
// Texture
// ************************************
m_shader->VS(m_convert.vs);
// ************************************
// gs
// ************************************
m_shader->GS(0);
// ************************************
// ps
// ************************************
PSSetShaderResource(0, static_cast<GSTextureOGL*>(st)->GetID());
PSSetShaderResource(static_cast<GSTextureOGL*>(st)->GetID());
PSSetSamplerState(linear ? m_convert.ln : m_convert.pt);
m_shader->PS(ps);
// ************************************
// Draw
@ -886,6 +882,14 @@ void GSDeviceOGL::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* ver
ClearStencil(ds, 0);
// WARNING: setup of the program must be done first. So you can setup
// 1/ subroutine uniform
// 2/ bindless texture uniform
// 3/ others uniform?
m_shader->VS(m_convert.vs);
m_shader->GS(0);
m_shader->PS(m_convert.ps[datm ? 2 : 3]);
// om
OMSetDepthStencilState(m_date.dss, 1);
@ -898,19 +902,11 @@ void GSDeviceOGL::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* ver
IASetVertexBuffer(vertices, 4);
IASetPrimitiveTopology(GL_TRIANGLE_STRIP);
// vs
m_shader->VS(m_convert.vs);
// Texture
// gs
m_shader->GS(0);
// ps
PSSetShaderResource(0, static_cast<GSTextureOGL*>(rt)->GetID());
PSSetShaderResource(static_cast<GSTextureOGL*>(rt)->GetID());
PSSetSamplerState(m_convert.pt);
m_shader->PS(m_convert.ps[datm ? 2 : 3]);
//
@ -966,16 +962,16 @@ void GSDeviceOGL::IASetPrimitiveTopology(GLenum topology)
m_state.vb->SetTopology(topology);
}
void GSDeviceOGL::PSSetShaderResource(const int i, GLuint sr)
void GSDeviceOGL::PSSetShaderResource(GLuint sr)
{
if (GLState::tex_unit[i] != sr) {
GLState::tex_unit[i] = sr;
if (GLState::tex_unit[0] != sr) {
GLState::tex_unit[0] = sr;
if (GLLoader::found_GL_ARB_multi_bind) {
GLuint textures[1] = {sr};
gl_BindTextures(i, 1, textures);
gl_BindTextures(0, 1, textures);
} else {
gl_ActiveTexture(GL_TEXTURE0 + i);
gl_ActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, sr);
// Get back to the expected active texture unit
@ -987,8 +983,21 @@ void GSDeviceOGL::PSSetShaderResource(const int i, GLuint sr)
void GSDeviceOGL::PSSetShaderResources(GLuint tex[2])
{
if (GLState::tex_unit[0] != tex[0] || GLState::tex_unit[1] != tex[1]) {
GLuint textures[2] = {tex[0], tex[1]};
gl_BindTextures(0, 2, textures);
GLState::tex_unit[0] = tex[0];
GLState::tex_unit[1] = tex[1];
if (GLLoader::found_GL_ARB_multi_bind) {
gl_BindTextures(0, 2, tex);
} else {
gl_ActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex[0]);
gl_ActiveTexture(GL_TEXTURE0 + 1);
glBindTexture(GL_TEXTURE_2D, tex[1]);
// Get back to the expected active texture unit
gl_ActiveTexture(GL_TEXTURE0 + 3);
}
}
}

View File

@ -606,10 +606,9 @@ class GSDeviceOGL : public GSDevice
void IASetIndexBuffer(const void* index, size_t count);
void IASetVertexState(GSVertexBufferStateOGL* vb = NULL);
void PSSetShaderResource(const int i, GLuint sr);
void PSSetShaderResource(GLuint sr);
void PSSetShaderResources(GLuint tex[2]);
void PSSetSamplerState(GLuint ss);
void PSSetSamplerStates(const int count, const GLuint* samplers);
void OMSetDepthStencilState(GSDepthStencilOGL* dss, uint8 sref);
void OMSetBlendState(GSBlendStateOGL* bs, float bf);
@ -627,9 +626,10 @@ class GSDeviceOGL : public GSDevice
void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
void SetupVS(VSSelector sel);
void SetupGS(GSSelector sel);
void SetupPS(PSSelector sel, const PSConstantBuffer* cb);
void SetupPS(PSSelector sel);
void SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb);
void SetupSampler(PSSelector sel, PSSamplerSelector ssel);
void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix);

View File

@ -163,6 +163,7 @@ void GSRendererOGL::SetupIA()
dev->IASetVertexState();
if(UserHacks_WildHack && !isPackedUV_HackFlag) {
// FIXME: why not put it on the Vertex shader
if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertex), m_vertex.next))
{
GSVector4i::storent(ptr, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
@ -441,6 +442,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
om_dssel.alpha_stencil = 1;
}
// By default don't use texture
ps_sel.tfx = 4;
if(tex)
{
const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[context->TEX0.PSM];
@ -456,6 +460,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.tcc = context->TEX0.TCC;
ps_sel.ltf = bilinear && !simple_sample;
ps_sel.spritehack = tex->m_spritehack_t;
// FIXME the ati is currently disabled on the shader. I need to find a .gs to test that we got same
// bug on opengl
ps_sel.point_sampler = !(bilinear && simple_sample);
int w = tex->m_texture->GetWidth();
@ -491,23 +497,30 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_ssel.tau = (context->CLAMP.WMS + 3) >> 1;
ps_ssel.tav = (context->CLAMP.WMT + 3) >> 1;
ps_ssel.ltf = bilinear && simple_sample;
dev->SetupSampler(ps_sel, ps_ssel);
if (tex->m_palette) {
if (GLLoader::found_GL_ARB_multi_bind) {
GLuint textures[2] = {static_cast<GSTextureOGL*>(tex->m_texture)->GetID(), static_cast<GSTextureOGL*>(tex->m_palette)->GetID()};
dev->PSSetShaderResources(textures);
} else {
dev->PSSetShaderResource(1, static_cast<GSTextureOGL*>(tex->m_palette)->GetID());
dev->PSSetShaderResource(0, static_cast<GSTextureOGL*>(tex->m_texture)->GetID());
}
} else {
dev->PSSetShaderResource(0, static_cast<GSTextureOGL*>(tex->m_texture)->GetID());
}
}
else
{
ps_sel.tfx = 4;
// WARNING: setup of the program must be done first. So you can setup
// 1/ subroutine uniform
// 2/ bindless texture uniform
// 3/ others uniform?
dev->SetupVS(vs_sel);
dev->SetupGS(gs_sel);
dev->SetupPS(ps_sel);
// Note: bindless texture will use uniform so it must be done after the program setup
if(tex) {
if (tex->m_palette) {
// 2 textures (main + palette)
dev->SetupSampler(ps_sel, ps_ssel);
GLuint textures[2] = {static_cast<GSTextureOGL*>(tex->m_texture)->GetID(), static_cast<GSTextureOGL*>(tex->m_palette)->GetID()};
dev->PSSetShaderResources(textures);
} else if (tex->m_texture) {
// Only main texture
dev->SetupSampler(ps_sel, ps_ssel);
dev->PSSetShaderResource(static_cast<GSTextureOGL*>(tex->m_texture)->GetID());
}
}
// rs
@ -521,9 +534,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
SetupIA();
dev->SetupOM(om_dssel, om_bsel, afix);
dev->SetupVS(vs_sel, &vs_cb);
dev->SetupGS(gs_sel);
dev->SetupPS(ps_sel, &ps_cb);
dev->SetupCB(&vs_cb, &ps_cb);
if (advance_DATE) {
// Create an r32ui image that will contain primitive ID
@ -539,7 +550,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
dev->OMSetWriteBuffer();
ps_sel.date = 3;
dev->SetupPS(ps_sel, &ps_cb);
dev->SetupPS(ps_sel);
// Be sure that first pass is finished !
dev->Barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
@ -558,7 +569,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_selneg.colclip = 2;
dev->SetupOM(om_dssel, om_bselneg, afix);
dev->SetupPS(ps_selneg, &ps_cb);
dev->SetupPS(ps_selneg);
dev->DrawIndexedPrimitive();
dev->SetupOM(om_dssel, om_bsel, afix);
@ -573,7 +584,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.atst = iatst[ps_sel.atst];
dev->SetupPS(ps_sel, &ps_cb);
dev->SetupPS(ps_sel);
bool z = om_dssel.zwe;
bool r = om_bsel.wr;
@ -583,11 +594,11 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
switch(context->TEST.AFAIL)
{
case AFAIL_KEEP: z = r = g = b = a = false; break; // none
case AFAIL_FB_ONLY: z = false; break; // rgba
case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z
case AFAIL_RGB_ONLY: z = a = false; break; // rgb
default: __assume(0);
case AFAIL_KEEP: z = r = g = b = a = false; break; // none
case AFAIL_FB_ONLY: z = false; break; // rgba
case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z
case AFAIL_RGB_ONLY: z = a = false; break; // rgb
default: __assume(0);
}
if(z || r || g || b || a)
@ -611,7 +622,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_selneg.colclip = 2;
dev->SetupOM(om_dssel, om_bselneg, afix);
dev->SetupPS(ps_selneg, &ps_cb);
dev->SetupPS(ps_selneg);
dev->DrawIndexedPrimitive();
}

View File

@ -23,14 +23,16 @@
#include "GSShaderOGL.h"
#include "GLState.h"
GSShaderOGL::GSShaderOGL(bool debug, bool sso, bool glsl420) :
GSShaderOGL::GSShaderOGL(bool debug) :
m_debug_shader(debug),
m_sso(sso),
m_glsl420(glsl420)
m_sub_count(0)
{
memset(&m_ps_sub, 0, countof(m_ps_sub)*sizeof(GLuint));
m_single_prog.clear();
#ifndef ENABLE_GLES
if (sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
gl_GenProgramPipelines(1, &m_pipeline);
gl_BindProgramPipeline(m_pipeline);
}
@ -40,7 +42,7 @@ GSShaderOGL::GSShaderOGL(bool debug, bool sso, bool glsl420) :
GSShaderOGL::~GSShaderOGL()
{
#ifndef ENABLE_GLES
if (m_sso)
if (GLLoader::found_GL_ARB_separate_shader_objects)
gl_DeleteProgramPipelines(1, &m_pipeline);
#endif
@ -53,21 +55,35 @@ void GSShaderOGL::VS(GLuint s)
if (GLState::vs != s)
{
GLState::vs = s;
GLState::dirty_prog = true;
#ifndef ENABLE_GLES
if (m_sso)
if (GLLoader::found_GL_ARB_separate_shader_objects)
gl_UseProgramStages(m_pipeline, GL_VERTEX_SHADER_BIT, s);
#endif
}
}
void GSShaderOGL::PS(GLuint s)
void GSShaderOGL::PS_subroutine(GLuint *sub)
{
	// Remember the two requested fragment-shader subroutine indices.
	// The dirty flag is raised only when at least one of them differs
	// from the values recorded by the previous call.
	const bool unchanged = (m_ps_sub[0] == sub[0]) && (m_ps_sub[1] == sub[1]);
	if (unchanged)
		return;

	m_ps_sub[0] = sub[0];
	m_ps_sub[1] = sub[1];
	GLState::dirty_subroutine_ps = true;
}
void GSShaderOGL::PS(GLuint s, GLuint sub_count)
{
if (GLState::ps != s)
{
m_sub_count = sub_count;
GLState::ps = s;
GLState::dirty_prog = true;
#ifndef ENABLE_GLES
if (m_sso)
if (GLLoader::found_GL_ARB_separate_shader_objects) {
gl_UseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s);
}
#endif
}
}
@ -77,8 +93,9 @@ void GSShaderOGL::GS(GLuint s)
if (GLState::gs != s)
{
GLState::gs = s;
GLState::dirty_prog = true;
#ifndef ENABLE_GLES
if (m_sso)
if (GLLoader::found_GL_ARB_separate_shader_objects)
gl_UseProgramStages(m_pipeline, GL_GEOMETRY_SHADER_BIT, s);
#endif
}
@ -97,7 +114,7 @@ void GSShaderOGL::SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding)
{
GLint loc = gl_GetUniformLocation(prog, name);
if (loc != -1) {
if (m_sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
#ifndef ENABLE_GLES
gl_ProgramUniform1i(prog, loc, binding);
#endif
@ -109,9 +126,9 @@ void GSShaderOGL::SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding)
void GSShaderOGL::SetupUniform()
{
if (m_glsl420) return;
if (GLLoader::found_GL_ARB_shading_language_420pack) return;
if (m_sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
SetUniformBinding(GLState::vs, "cb20", 20);
SetUniformBinding(GLState::ps, "cb21", 21);
@ -138,6 +155,17 @@ void GSShaderOGL::SetupUniform()
}
}
void GSShaderOGL::SetSubroutineUniform()
{
	// Nothing to do when the extension flag is off or when the current
	// pixel shader was registered without any subroutine uniform.
	if (!GLLoader::found_GL_ARB_shader_subroutine || m_sub_count == 0)
		return;

	// Send the recorded indices when either the indices themselves or the
	// program/stage selection were flagged dirty.
	const bool need_upload = GLState::dirty_subroutine_ps || GLState::dirty_prog;
	if (need_upload) {
		gl_UniformSubroutinesuiv(GL_FRAGMENT_SHADER, m_sub_count, m_ps_sub);
	}

	// Only the subroutine flag is reset here; dirty_prog is left untouched.
	GLState::dirty_subroutine_ps = false;
}
bool GSShaderOGL::ValidateShader(GLuint s)
{
if (!m_debug_shader) return true;
@ -223,37 +251,44 @@ GLuint GSShaderOGL::LinkNewProgram()
void GSShaderOGL::UseProgram()
{
hash_map<uint64, GLuint >::iterator it;
if (!m_sso) {
// Note: shader are integer lookup pointer. They start from 1 and incr
// every time you create a new shader OR a new program.
// Note2: vs & gs are precompiled at startup. FGLRX and radeon got value < 128.
// We migth be able to pack the value in a 32bits int
// I would need to check the behavior on Nvidia (pause/resume).
uint64 sel = (uint64)GLState::vs << 40 | (uint64)GLState::gs << 20 | GLState::ps;
it = m_single_prog.find(sel);
if (it == m_single_prog.end()) {
GLState::program = LinkNewProgram();
m_single_prog[sel] = GLState::program;
if (GLState::dirty_prog) {
if (!GLLoader::found_GL_ARB_separate_shader_objects) {
hash_map<uint64, GLuint >::iterator it;
// Note: shader are integer lookup pointer. They start from 1 and incr
// every time you create a new shader OR a new program.
// Note2: vs & gs are precompiled at startup. FGLRX and radeon got value < 128.
// We migth be able to pack the value in a 32bits int
// I would need to check the behavior on Nvidia (pause/resume).
uint64 sel = (uint64)GLState::vs << 40 | (uint64)GLState::gs << 20 | GLState::ps;
it = m_single_prog.find(sel);
if (it == m_single_prog.end()) {
GLState::program = LinkNewProgram();
m_single_prog[sel] = GLState::program;
ValidateProgram(GLState::program);
ValidateProgram(GLState::program);
gl_UseProgram(GLState::program);
// warning it must be done after the "setup" of the program
SetupUniform();
} else {
GLuint prog = it->second;
if (prog != GLState::program) {
GLState::program = prog;
gl_UseProgram(GLState::program);
// warning it must be done after the "setup" of the program
SetupUniform();
} else {
GLuint prog = it->second;
if (prog != GLState::program) {
GLState::program = prog;
gl_UseProgram(GLState::program);
}
}
} else {
ValidatePipeline(m_pipeline);
SetupUniform();
}
} else {
ValidatePipeline(m_pipeline);
SetupUniform();
}
SetSubroutineUniform();
GLState::dirty_prog = false;
}
std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, const std::string& macro)
@ -266,13 +301,13 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co
} else {
header = "#version 330 core\n";
}
if (m_glsl420) {
if (GLLoader::found_GL_ARB_shading_language_420pack) {
// Need GL version 420
header += "#extension GL_ARB_shading_language_420pack: require\n";
} else {
header += "#define DISABLE_GL42\n";
}
if (m_sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
// Need GL version 410
header += "#extension GL_ARB_separate_shader_objects : require\n";
} else {
@ -284,6 +319,10 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co
// Need version 140
header += "#extension GL_ARB_uniform_buffer_object : require\n";
}
if (GLLoader::found_GL_ARB_shader_subroutine) {
// Need GL version 400
header += "#define SUBROUTINE_GL40 1\n";
}
#ifdef ENABLE_OGL_STENCIL_DEBUG
header += "#define ENABLE_OGL_STENCIL_DEBUG 1\n";
#endif
@ -351,7 +390,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent
sources[0] = header.append(glsl_h_code).c_str();
#endif
if (m_sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
#ifndef ENABLE_GLES
program = gl_CreateShaderProgramv(type, shader_nb, sources);
#endif
@ -362,7 +401,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent
}
bool status;
if (m_sso)
if (GLLoader::found_GL_ARB_separate_shader_objects)
status = ValidateProgram(program);
else
status = ValidateShader(program);
@ -378,7 +417,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent
void GSShaderOGL::Delete(GLuint s)
{
if (m_sso) {
if (GLLoader::found_GL_ARB_separate_shader_objects) {
gl_DeleteProgram(s);
} else {
gl_DeleteShader(s);

View File

@ -25,10 +25,11 @@ class GSShaderOGL {
GLuint m_pipeline;
hash_map<uint64, GLuint > m_single_prog;
const bool m_debug_shader;
const bool m_sso;
const bool m_glsl420;
GLuint m_sub_count;
GLuint m_ps_sub[2];
void SetSubroutineUniform();
void SetupUniform();
void SetUniformBinding(GLuint prog, GLchar* name, GLuint binding);
void SetSamplerBinding(GLuint prog, GLchar* name, GLuint binding);
@ -41,11 +42,12 @@ class GSShaderOGL {
GLuint LinkNewProgram();
public:
GSShaderOGL(bool debug, bool sso, bool glsl420);
GSShaderOGL(bool debug);
~GSShaderOGL();
void GS(GLuint s);
void PS(GLuint s);
void PS(GLuint s, GLuint sub_count = 0);
void PS_subroutine(GLuint *sub);
void VS(GLuint s);
void UseProgram();

View File

@ -133,14 +133,22 @@ GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, uint8 afix)
return bs;
}
void GSDeviceOGL::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
void GSDeviceOGL::SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb)
{
	// Vertex-shader constants: upload only when the cache's Update() reports true.
	const bool vs_needs_upload = m_vs_cb_cache.Update(vs_cb);
	if (vs_needs_upload) {
		m_vs_cb->upload(vs_cb);
	}

	// Pixel-shader constants: same policy, handled after the vertex-shader buffer.
	const bool ps_needs_upload = m_ps_cb_cache.Update(ps_cb);
	if (ps_needs_upload) {
		m_ps_cb->upload(ps_cb);
	}
}
void GSDeviceOGL::SetupVS(VSSelector sel)
{
GLuint vs = m_vs[sel];
if(m_vs_cb_cache.Update(cb)) {
m_vs_cb->upload(cb);
}
m_shader->VS(vs);
}
@ -151,8 +159,16 @@ void GSDeviceOGL::SetupGS(GSSelector sel)
m_shader->GS(gs);
}
void GSDeviceOGL::SetupPS(PSSelector sel, const PSConstantBuffer* cb)
void GSDeviceOGL::SetupPS(PSSelector sel)
{
if (GLLoader::found_GL_ARB_shader_subroutine) {
GLuint sub[2] = {sel.atst, (uint32)sel.colclip + 8};
m_shader->PS_subroutine(sub);
// Handle by subroutine useless now
sel.atst = 0;
sel.colclip = 0;
}
// *************************************************************
// Static
// *************************************************************
@ -169,11 +185,7 @@ void GSDeviceOGL::SetupPS(PSSelector sel, const PSConstantBuffer* cb)
// *************************************************************
// Dynamic
// *************************************************************
if(m_ps_cb_cache.Update(cb)) {
m_ps_cb->upload(cb);
}
m_shader->PS(ps);
m_shader->PS(ps, 2);
}
void GSDeviceOGL::SetupSampler(PSSelector sel, PSSamplerSelector ssel)

View File

@ -177,7 +177,6 @@ GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read)
break;
default: break;
}
}
GSTextureOGL::~GSTextureOGL()
@ -210,7 +209,6 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
EnableUnit();
#if 1
PboPool::BindPbo();
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
@ -230,47 +228,19 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
PboPool::UnbindPbo();
return true;
#else
#if 0
// pitch is in byte wherease GL_UNPACK_ROW_LENGTH is in pixel
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift);
#ifdef _LINUX
if (GLLoader::fglrx_buggy_driver && !GLLoader::in_replayer) {
// FIXME : it crash on colin mcrae rally 3 (others game too) when the texture is small
//if ((pitch >> 2) == 32 || r.width() < 32 || r.height() < 32) {
if ((r.width() < 32) || (pitch == 128 && r.width() == 32)) {
#ifdef ENABLE_OGL_DEBUG
fprintf(stderr, "Skip Texture %dx%d with a pitch of %d pixel. Type %x\n", m_size.x, m_size.y, pitch >>2, m_format);
fprintf(stderr, "Box (%d,%d)x(%d,%d)\n", r.x, r.y, r.width(), r.height());
#endif
// FIXME useful?
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); // Restore default behavior
return false;
}
}
#endif
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, data);
// FIXME useful?
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); // Restore default behavior
return true;
#if 0
if(m_dev && m_texture)
{
D3D11_BOX box = {r.left, r.top, 0, r.right, r.bottom, 1};
m_ctx->UpdateSubresource(m_texture, 0, &box, data, pitch, 0);
return true;
}
return false;
#endif
#endif
}

View File

@ -82,6 +82,8 @@ void GSWndGL::PopulateGlFunction()
*(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer");
*(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer");
*(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData");
// GL4.0
*(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv");
// GL4.1
*(void**)&(gl_BindProgramPipeline) = GetProcAddress("glBindProgramPipeline");
*(void**)&(gl_DeleteProgramPipelines) = GetProcAddress("glDeleteProgramPipelines");

View File

@ -1166,6 +1166,77 @@ static const char* tfx_glsl =
"#endif\n"
"}\n"
"\n"
"// Note layout stuff might require gl4.3\n"
"#ifdef SUBROUTINE_GL40\n"
"// Function pointer type\n"
"subroutine void AlphaTestType(vec4 c);\n"
"\n"
"// a function pointer variable\n"
"layout(location = 0) subroutine uniform AlphaTestType atst;\n"
"\n"
"// The function attached to AlphaTestType\n"
"layout(index = 0) subroutine(AlphaTestType)\n"
"void atest_never(vec4 c)\n"
"{\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 1) subroutine(AlphaTestType)\n"
"void atest_always(vec4 c)\n"
"{\n"
" // Nothing to do\n"
"}\n"
"\n"
"layout(index = 2) subroutine(AlphaTestType)\n"
"void atest_l(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if (PS_SPRITEHACK == 0)\n"
" if ((AREF - a - 0.5f) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 3) subroutine(AlphaTestType)\n"
"void atest_le(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if ((AREF - a + 0.5f) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 4) subroutine(AlphaTestType)\n"
"void atest_e(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if ((0.5f - abs(a - AREF)) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 5) subroutine(AlphaTestType)\n"
"void atest_ge(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if ((a-AREF + 0.5f) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 6) subroutine(AlphaTestType)\n"
"void atest_g(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if ((a-AREF - 0.5f) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"layout(index = 7) subroutine(AlphaTestType)\n"
"void atest_ne(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
" if ((abs(a - AREF) - 0.5f) < 0.0f)\n"
" discard;\n"
"}\n"
"\n"
"#else\n"
"void atst(vec4 c)\n"
"{\n"
" float a = trunc(c.a * 255.0 + 0.01);\n"
@ -1210,16 +1281,64 @@ static const char* tfx_glsl =
" discard;\n"
" }\n"
"}\n"
"#endif\n"
"\n"
"vec4 fog(vec4 c, float f)\n"
"// Note layout stuff might require gl4.3\n"
"#ifdef SUBROUTINE_GL40\n"
"// Function pointer type\n"
"subroutine void ColClipType(inout vec4 c);\n"
"\n"
"// a function pointer variable\n"
"layout(location = 1) subroutine uniform ColClipType colclip;\n"
"\n"
"layout(index = 8) subroutine(ColClipType)\n"
"void colclip_0(inout vec4 c)\n"
"{\n"
" // nothing to do\n"
"}\n"
"\n"
"layout(index = 9) subroutine(ColClipType)\n"
"void colclip_1(inout vec4 c)\n"
"{\n"
" // FIXME !!!!\n"
" //c.rgb *= c.rgb < 128./255;\n"
" bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
" c.rgb *= vec3(factor);\n"
"}\n"
"\n"
"layout(index = 10) subroutine(ColClipType)\n"
"void colclip_2(inout vec4 c)\n"
"{\n"
" c.rgb = 256.0f/255.0f - c.rgb;\n"
" // FIXME !!!!\n"
" //c.rgb *= c.rgb < 128./255;\n"
" bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
" c.rgb *= vec3(factor);\n"
"}\n"
"\n"
"#else\n"
"void colclip(inout vec4 c)\n"
"{\n"
" if (PS_COLCLIP == 2)\n"
" {\n"
" c.rgb = 256.0f/255.0f - c.rgb;\n"
" }\n"
" if (PS_COLCLIP > 0)\n"
" {\n"
" // FIXME !!!!\n"
" //c.rgb *= c.rgb < 128./255;\n"
" bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
" c.rgb *= vec3(factor);\n"
" }\n"
"}\n"
"#endif\n"
"\n"
"void fog(vec4 c, float f)\n"
"{\n"
" vec4 c_out = c;\n"
" if(PS_FOG != 0)\n"
" {\n"
" c_out.rgb = mix(FogColor, c.rgb, f);\n"
" c.rgb = mix(FogColor, c.rgb, f);\n"
" }\n"
"\n"
" return c_out;\n"
"}\n"
"\n"
"vec4 ps_color()\n"
@ -1232,19 +1351,9 @@ static const char* tfx_glsl =
"\n"
" atst(c);\n"
"\n"
" c = fog(c, PSin_t.z);\n"
" fog(c, PSin_t.z);\n"
"\n"
" if (PS_COLCLIP == 2)\n"
" {\n"
" c.rgb = 256.0f/255.0f - c.rgb;\n"
" }\n"
" if (PS_COLCLIP > 0)\n"
" {\n"
" // FIXME !!!!\n"
" //c.rgb *= c.rgb < 128./255;\n"
" bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);\n"
" c.rgb *= vec3(factor);\n"
" }\n"
" colclip(c);\n"
"\n"
" if(PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes\n"
" {\n"

View File

@ -656,6 +656,77 @@ void datst()
#endif
}
// Note layout stuff might require gl4.3
#ifdef SUBROUTINE_GL40
// Function pointer type
subroutine void AlphaTestType(vec4 c);
// a function pointer variable
layout(location = 0) subroutine uniform AlphaTestType atst;
// The function attached to AlphaTestType
layout(index = 0) subroutine(AlphaTestType)
void atest_never(vec4 c)
{
discard;
}
layout(index = 1) subroutine(AlphaTestType)
void atest_always(vec4 c)
{
// Nothing to do
}
layout(index = 2) subroutine(AlphaTestType)
void atest_l(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if (PS_SPRITEHACK == 0)
if ((AREF - a - 0.5f) < 0.0f)
discard;
}
layout(index = 3) subroutine(AlphaTestType)
void atest_le(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if ((AREF - a + 0.5f) < 0.0f)
discard;
}
layout(index = 4) subroutine(AlphaTestType)
void atest_e(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if ((0.5f - abs(a - AREF)) < 0.0f)
discard;
}
layout(index = 5) subroutine(AlphaTestType)
void atest_ge(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if ((a-AREF + 0.5f) < 0.0f)
discard;
}
layout(index = 6) subroutine(AlphaTestType)
void atest_g(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if ((a-AREF - 0.5f) < 0.0f)
discard;
}
layout(index = 7) subroutine(AlphaTestType)
void atest_ne(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
if ((abs(a - AREF) - 0.5f) < 0.0f)
discard;
}
#else
void atst(vec4 c)
{
float a = trunc(c.a * 255.0 + 0.01);
@ -700,16 +771,64 @@ void atst(vec4 c)
discard;
}
}
#endif
vec4 fog(vec4 c, float f)
// Note layout stuff might require gl4.3
#ifdef SUBROUTINE_GL40
// Function pointer type
subroutine void ColClipType(inout vec4 c);
// a function pointer variable
layout(location = 1) subroutine uniform ColClipType colclip;
layout(index = 8) subroutine(ColClipType)
void colclip_0(inout vec4 c)
{
// nothing to do
}
layout(index = 9) subroutine(ColClipType)
void colclip_1(inout vec4 c)
{
// FIXME !!!!
//c.rgb *= c.rgb < 128./255;
bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
c.rgb *= vec3(factor);
}
layout(index = 10) subroutine(ColClipType)
void colclip_2(inout vec4 c)
{
c.rgb = 256.0f/255.0f - c.rgb;
// FIXME !!!!
//c.rgb *= c.rgb < 128./255;
bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
c.rgb *= vec3(factor);
}
#else
void colclip(inout vec4 c)
{
if (PS_COLCLIP == 2)
{
c.rgb = 256.0f/255.0f - c.rgb;
}
if (PS_COLCLIP > 0)
{
// FIXME !!!!
//c.rgb *= c.rgb < 128./255;
bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
c.rgb *= vec3(factor);
}
}
#endif
void fog(vec4 c, float f)
{
vec4 c_out = c;
if(PS_FOG != 0)
{
c_out.rgb = mix(FogColor, c.rgb, f);
c.rgb = mix(FogColor, c.rgb, f);
}
return c_out;
}
vec4 ps_color()
@ -722,19 +841,9 @@ vec4 ps_color()
atst(c);
c = fog(c, PSin_t.z);
fog(c, PSin_t.z);
if (PS_COLCLIP == 2)
{
c.rgb = 256.0f/255.0f - c.rgb;
}
if (PS_COLCLIP > 0)
{
// FIXME !!!!
//c.rgb *= c.rgb < 128./255;
bvec3 factor = bvec3(128.0f/255.0f, 128.0f/255.0f, 128.0f/255.0f);
c.rgb *= vec3(factor);
}
colclip(c);
if(PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes
{