gsdx ogl: the proof of concept commit

* GL_ARB_shader_subroutine for perf
fix for nvidia => add missing shader declaration. Nvidia got +4fps on colin3 :) 

For the moment only 2 PS parameters are supported. The code needs to be extended to support other games that often
switch shader programs (like Xenosaga).
Requires GL4-class hardware and the option override_GL_ARB_shader_subroutine = 1
Note: strangely on AMD linux it is slower!

* GL_ARB_shader_image_load_store for accuracy (Date)
Use a signed integer texture and reenable color buffer writing

Current status: Amagami_transparency.gs & P3_battle_shadows.gs are now working on Nvidia with a small perf impact.
Current implementation detail:
1/ setup the standard stencil as before
2/ on the remaining pixels, draw once to compute the first primitive that will write a failing alpha value.
3/ final draw based on primitive id of step 2

Note: I think we would get bad behavior if the depth test & mask are enabled on steps 2/3
Note2: in my limited test case the perf impact was on the CPU. It would be possible to merge steps 1 & 2 to nullify it (could
even be faster actually); however, it would require more GPU power.

Again, this requires GL4-class hardware and the option UserHacks_DateGL4 = 1



git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5725 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2013-08-28 08:44:16 +00:00
parent 452cf72ddf
commit e01c6cd9ce
11 changed files with 67 additions and 73 deletions

View File

@ -132,6 +132,7 @@ namespace GLLoader {
bool found_only_gl30 = false; // Drop it when mesa support GLSL330
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
bool found_GL_ARB_buffer_storage = false;
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine
// GL4 hardware
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
bool found_GL_ARB_gpu_shader5 = false;
@ -233,6 +234,8 @@ namespace GLLoader {
if (ext.compare("GL_ARB_gpu_shader5") == 0) found_GL_ARB_gpu_shader5 = true;
if (ext.compare("GL_ARB_shader_image_load_store") == 0) found_GL_ARB_shader_image_load_store = true;
#if 0
// Erratum: on nvidia implementation, gain is very nice : 42.5 fps => 46.5 fps
//
// Strangely it doesn't provide the speed boost as expected.
// Note: only atst/colclip was replaced with subroutine for the moment. It replace 2000 program switch on
// colin mcrae 3 by 2100 uniform, but code is slower!
@ -243,6 +246,7 @@ namespace GLLoader {
if (ext.compare("GL_ARB_shader_subroutine") == 0) found_GL_ARB_shader_subroutine = true;
#endif
if (ext.compare("GL_ARB_explicit_uniform_location") == 0) found_GL_ARB_explicit_uniform_location = true;
#ifdef GL44 // Need to debug the code first
if (ext.compare("GL_ARB_clear_texture") == 0) found_GL_ARB_clear_texture = true;
if (ext.compare("GL_ARB_multi_bind") == 0) found_GL_ARB_multi_bind = true;
@ -268,6 +272,7 @@ namespace GLLoader {
status &= status_and_override(found_GL_ARB_clear_texture,"GL_ARB_clear_texture");
status &= status_and_override(found_GL_ARB_buffer_storage,"GL_ARB_buffer_storage");
status &= status_and_override(found_GL_ARB_shader_subroutine,"GL_ARB_shader_subroutine");
status &= status_and_override(found_GL_ARB_explicit_uniform_location,"GL_ARB_explicit_uniform_location");
status &= status_and_override(found_GL_ARB_texture_storage, "GL_ARB_texture_storage", true);
status &= status_and_override(found_GL_ARB_shading_language_420pack,"GL_ARB_shading_language_420pack");

View File

@ -279,4 +279,5 @@ namespace GLLoader {
extern bool found_GL_ARB_buffer_storage;
extern bool found_GL_ARB_shader_subroutine;
extern bool found_GL_ARB_bindless_texture;
extern bool found_GL_ARB_explicit_uniform_location;
}

View File

@ -826,7 +826,8 @@ EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
{
string s = "GSdx";
if(s_gs != NULL) // TODO: this gets called from a different thread concurrently with GSOpen (on linux)
// TODO: this gets called from a different thread concurrently with GSOpen (on linux)
if(s_gs == NULL) return;
if(s_gs->m_GStitleInfoBuffer[0])
{

View File

@ -108,6 +108,7 @@ GSDeviceOGL::~GSDeviceOGL()
delete m_ps_cb;
gl_DeleteSamplers(1, &m_palette_ss);
delete m_vb;
m_shader->Delete(m_apitrace);
for (uint32 key = 0; key < VSSelector::size(); key++) m_shader->Delete(m_vs[key]);
m_shader->Delete(m_gs);
@ -296,10 +297,11 @@ bool GSDeviceOGL::Create(GSWnd* wnd)
m_date.dss->SetStencil(GL_ALWAYS, GL_REPLACE);
m_date.bs = new GSBlendStateOGL();
#ifndef ENABLE_OGL_STENCIL_DEBUG
// Only keep stencil data
m_date.bs->SetMask(false, false, false, false);
#endif
// FIXME impact image load?
//#ifndef ENABLE_OGL_STENCIL_DEBUG
// // Only keep stencil data
// m_date.bs->SetMask(false, false, false, false);
//#endif
// ****************************************************************
// HW renderer shader
@ -538,9 +540,9 @@ void GSDeviceOGL::InitPrimDateTexture(int w, int h)
{
// Create a texture to avoid the useless clean@0
if (m_date.t == NULL)
m_date.t = CreateTexture(w, h, GL_R32UI);
m_date.t = CreateTexture(w, h, GL_R32I);
ClearRenderTarget_ui(m_date.t, 0xFFFFFFFF);
ClearRenderTarget_ui(m_date.t, 0x0FFFFFFF);
#ifdef ENABLE_OGL_STENCIL_DEBUG
gl_ActiveTexture(GL_TEXTURE0 + 5);
@ -557,9 +559,9 @@ void GSDeviceOGL::BindDateTexture()
// TODO: multibind?
// GLuint textures[1] = {static_cast<GSTextureOGL*>(m_date.t)->GetID()};
// gl_BindImageTextures(0, 1, textures);
//gl_BindImageTexture(0, 0, 0, true, 0, GL_READ_WRITE, GL_R32UI);
//gl_BindImageTexture(0, 0, 0, true, 0, GL_READ_WRITE, GL_R32I);
gl_BindImageTexture(0, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32UI);
gl_BindImageTexture(0, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32I);
}
void GSDeviceOGL::RecycleDateTexture()

View File

@ -509,6 +509,7 @@ class GSDeviceOGL : public GSDevice
GSDepthStencilOGL* m_om_dss[1<<6];
hash_map<uint32, GLuint > m_ps;
hash_map<uint32, GSBlendStateOGL* > m_om_bs;
GLuint m_apitrace;
GLuint m_palette_ss;
GLuint m_rt_ss;

View File

@ -268,10 +268,11 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
// TODO
//if (UserHacks_DateGL4 && DATE && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) {
if (UserHacks_DateGL4 && DATE) {
if (UserHacks_DateGL4 && DATE && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) {
//if (!(context->FBA.FBA && context->TEST.DATM == 1))
advance_DATE = true;
//advance_DATE = true;
advance_DATE = GLLoader::found_GL_ARB_shader_image_load_store;
}
// vs
@ -500,8 +501,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
dev->SetupCB(&vs_cb, &ps_cb);
if (advance_DATE) {
// Create an r32ui image that will contain primitive ID
// Note: do it at the beginning because the clean will dirty the state
// Create an r32i image that will contain primitive ID
// Note: do it at the beginning because the clean will dirty the FBO state
//dev->InitPrimDateTexture(rtsize.x, rtsize.y);
// Don't write anything on the color buffer

View File

@ -89,6 +89,7 @@ void GSShaderOGL::PS(GLuint s, GLuint sub_count)
GLState::ps = s;
GLState::dirty_prog = true;
GLState::dirty_subroutine_ps = true;
#ifndef ENABLE_GLES
if (GLLoader::found_GL_ARB_separate_shader_objects) {
gl_UseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s);
@ -278,10 +279,10 @@ GLuint GSShaderOGL::LinkNewProgram()
void GSShaderOGL::UseProgram()
{
if (GLState::dirty_prog) {
GLState::dirty_subroutine_ps = true;
GLState::dirty_ressources = true;
if (!GLLoader::found_GL_ARB_separate_shader_objects) {
GLState::dirty_subroutine_ps = true;
GLState::dirty_ressources = true;
hash_map<uint64, GLuint >::iterator it;
// Note: shader are integer lookup pointer. They start from 1 and incr
// every time you create a new shader OR a new program.
@ -340,19 +341,22 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co
}
if (GLLoader::found_GL_ARB_separate_shader_objects) {
// Need GL version 410
header += "#extension GL_ARB_separate_shader_objects : require\n";
header += "#extension GL_ARB_separate_shader_objects: require\n";
} else {
header += "#define DISABLE_SSO\n";
}
if (GLLoader::found_only_gl30) {
// Need version 330
header += "#extension GL_ARB_explicit_attrib_location : require\n";
header += "#extension GL_ARB_explicit_attrib_location: require\n";
// Need version 140
header += "#extension GL_ARB_uniform_buffer_object : require\n";
header += "#extension GL_ARB_uniform_buffer_object: require\n";
}
if (GLLoader::found_GL_ARB_shader_subroutine) {
if (GLLoader::found_GL_ARB_shader_subroutine && GLLoader::found_GL_ARB_explicit_uniform_location) {
// Need GL version 400
header += "#define SUBROUTINE_GL40 1\n";
header += "#extension GL_ARB_shader_subroutine: require\n";
// Need GL version 430
header += "#extension GL_ARB_explicit_uniform_location: require\n";
}
#ifdef ENABLE_OGL_STENCIL_DEBUG
header += "#define ENABLE_OGL_STENCIL_DEBUG 1\n";
@ -413,7 +417,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent
std::string header = GenGlslHeader(entry, type, macro_sel);
int shader_nb = 1;
#if 0
#if 1
sources[0] = header.c_str();
sources[1] = glsl_h_code;
shader_nb++;

View File

@ -56,6 +56,9 @@ void GSDeviceOGL::CreateTextureFX()
for (uint32 key = 0; key < OMDepthStencilSelector::size(); key++)
m_om_dss[key] = CreateDepthStencil(OMDepthStencilSelector(key));
// Help to debug FS in apitrace
m_apitrace = CompilePS(PSSelector());
}
GSDepthStencilOGL* GSDeviceOGL::CreateDepthStencil(OMDepthStencilSelector dssel)

View File

@ -204,9 +204,9 @@ GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read)
// Bunch of constant parameter
switch (m_format) {
case GL_R32UI:
case GL_R32I:
m_int_format = GL_RED_INTEGER;
m_int_type = GL_UNSIGNED_INT;
m_int_type = GL_INT;
m_int_alignment = 4;
m_int_shift = 2;
break;
@ -559,7 +559,7 @@ void GSTextureOGL::SaveRaw(const string& fn, const void* image, uint32 pitch)
for(int h = m_size.y; h > 0; h--) {
for (int w = m_size.x; w > 0; w--, data += 1) {
if (*data == 0xffffffff)
if (*data > 0xffffff)
fprintf(fp, "");
else {
fprintf(fp, "%x", *data);
@ -594,11 +594,11 @@ bool GSTextureOGL::Save(const string& fn, bool dds)
glReadPixels(0, 0, m_size.x, m_size.y, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, image);
gl_BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
} else if(m_format == GL_R32UI) {
} else if(m_format == GL_R32I) {
gl_ActiveTexture(GL_TEXTURE0 + 6);
glBindTexture(GL_TEXTURE_2D, m_texture_id);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_INT, image);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, image);
SaveRaw(fn, image, pitch);
// Not supported in Save function

View File

@ -817,7 +817,7 @@ static const char* tfx_glsl =
"#ifndef DISABLE_GL42_image\n"
"#if PS_DATE > 0\n"
"// FIXME how to declare memory access\n"
"layout(r32ui, binding = 0) coherent uniform uimage2D img_prim_min;\n"
"layout(r32i, binding = 0) coherent uniform iimage2D img_prim_min;\n"
"#endif\n"
"#else\n"
"// use basic stencil\n"
@ -1316,6 +1316,16 @@ static const char* tfx_glsl =
"#if !GL_ES\n"
"void ps_main()\n"
"{\n"
"#if PS_DATE == 3 && !defined(DISABLE_GL42_image)\n"
" int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));\n"
" // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update\n"
" // the bad alpha value so we must keep it.\n"
"\n"
" if (gl_PrimitiveID > stencil_ceil) {\n"
" discard;\n"
" }\n"
"#endif\n"
"\n"
" vec4 c = ps_color();\n"
"\n"
" float alpha = c.a * 2.0;\n"
@ -1347,33 +1357,11 @@ static const char* tfx_glsl =
" }\n"
"#endif\n"
"\n"
" // TODO\n"
" // warning non uniform flow ???\n"
"#if PS_DATE == 3 && !defined(DISABLE_GL42_image)\n"
" uint stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));\n"
" // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update\n"
" // the bad alpha value so we must keep it.\n"
"#if 0\n"
" if (stencil_ceil > 0)\n"
" c = vec4(1.0, 0.0, 0.0, 1.0);\n"
" else\n"
" c = vec4(0.0, 1.0, 0.0, 1.0);\n"
"#endif\n"
"\n"
"#if 1\n"
" if (gl_PrimitiveID > stencil_ceil) {\n"
" discard;\n"
" }\n"
"#endif\n"
"\n"
"#endif\n"
"\n"
"\n"
"#if (PS_DATE == 2 || PS_DATE == 1) && !defined(DISABLE_GL42_image)\n"
" // Don't write anything on the framebuffer\n"
" // Note: you can't use discard because it will also drop\n"
" // image operation\n"
" // Note2: output will be disabled too in opengl\n"
"#else\n"
" SV_Target0 = c;\n"
" SV_Target1 = vec4(alpha, alpha, alpha, alpha);\n"

View File

@ -307,7 +307,7 @@ layout(binding = 1) uniform sampler2D PaletteSampler;
#ifndef DISABLE_GL42_image
#if PS_DATE > 0
// FIXME how to declare memory access
layout(r32ui, binding = 0) coherent uniform uimage2D img_prim_min;
layout(r32i, binding = 0) coherent uniform iimage2D img_prim_min;
#endif
#else
// use basic stencil
@ -806,6 +806,16 @@ void ps_main()
#if !GL_ES
void ps_main()
{
#if PS_DATE == 3 && !defined(DISABLE_GL42_image)
int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
// the bad alpha value so we must keep it.
if (gl_PrimitiveID > stencil_ceil) {
discard;
}
#endif
vec4 c = ps_color();
float alpha = c.a * 2.0;
@ -837,33 +847,11 @@ void ps_main()
}
#endif
// TODO
// warning non uniform flow ???
#if PS_DATE == 3 && !defined(DISABLE_GL42_image)
uint stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
// the bad alpha value so we must keep it.
#if 0
if (stencil_ceil > 0)
c = vec4(1.0, 0.0, 0.0, 1.0);
else
c = vec4(0.0, 1.0, 0.0, 1.0);
#endif
#if 1
if (gl_PrimitiveID > stencil_ceil) {
discard;
}
#endif
#endif
#if (PS_DATE == 2 || PS_DATE == 1) && !defined(DISABLE_GL42_image)
// Don't write anything on the framebuffer
// Note: you can't use discard because it will also drop
// image operation
// Note2: output will be disabled too in opengl
#else
SV_Target0 = c;
SV_Target1 = vec4(alpha, alpha, alpha, alpha);