mirror of https://github.com/PCSX2/pcsx2.git
gsdx ogl: the proof of concept commit
* GL_ARB_shader_subroutine, for a perf fix on Nvidia => add the missing shader declaration. Nvidia got +4fps on colin3 :) For the moment only 2 PS parameters are supported. The code needs to be extended to support other games that often switch shader programs (like xenosaga). Requires GL4-class hardware and the option override_GL_ARB_shader_subroutine = 1. Note: strangely, on AMD Linux it is slower! * GL_ARB_shader_image_load_store, for accuracy (Date): use a signed integer texture and re-enable color buffer writing. Current status: Amagami_transparency.gs & P3_battle_shadows.gs are now working on Nvidia with a small perf impact. Current implementation details: 1/ set up the standard stencil as before 2/ on the remaining pixels, draw once to compute the first primitive that will write a failing alpha value 3/ do the final draw based on the primitive ID from step 2. Note: I think we would get bad behavior if depth test & mask are enabled on steps 2/3. Note2: in my limited test case the perf impact was on the CPU. It would be possible to merge steps 1 & 2 to nullify it (it could even be faster, actually); however, it would require more GPU power. Again, this requires GL4-class hardware and the option UserHacks_DateGL4 = 1. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5725 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
452cf72ddf
commit
e01c6cd9ce
|
@ -132,6 +132,7 @@ namespace GLLoader {
|
|||
bool found_only_gl30 = false; // Drop it when mesa support GLSL330
|
||||
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
|
||||
bool found_GL_ARB_buffer_storage = false;
|
||||
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine
|
||||
// GL4 hardware
|
||||
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
|
||||
bool found_GL_ARB_gpu_shader5 = false;
|
||||
|
@ -233,6 +234,8 @@ namespace GLLoader {
|
|||
if (ext.compare("GL_ARB_gpu_shader5") == 0) found_GL_ARB_gpu_shader5 = true;
|
||||
if (ext.compare("GL_ARB_shader_image_load_store") == 0) found_GL_ARB_shader_image_load_store = true;
|
||||
#if 0
|
||||
// Erratum: on nvidia implementation, gain is very nice : 42.5 fps => 46.5 fps
|
||||
//
|
||||
// Strangely it doesn't provide the speed boost as expected.
|
||||
// Note: only atst/colclip was replaced with subroutine for the moment. It replace 2000 program switch on
|
||||
// colin mcrae 3 by 2100 uniform, but code is slower!
|
||||
|
@ -243,6 +246,7 @@ namespace GLLoader {
|
|||
|
||||
if (ext.compare("GL_ARB_shader_subroutine") == 0) found_GL_ARB_shader_subroutine = true;
|
||||
#endif
|
||||
if (ext.compare("GL_ARB_explicit_uniform_location") == 0) found_GL_ARB_explicit_uniform_location = true;
|
||||
#ifdef GL44 // Need to debug the code first
|
||||
if (ext.compare("GL_ARB_clear_texture") == 0) found_GL_ARB_clear_texture = true;
|
||||
if (ext.compare("GL_ARB_multi_bind") == 0) found_GL_ARB_multi_bind = true;
|
||||
|
@ -268,6 +272,7 @@ namespace GLLoader {
|
|||
status &= status_and_override(found_GL_ARB_clear_texture,"GL_ARB_clear_texture");
|
||||
status &= status_and_override(found_GL_ARB_buffer_storage,"GL_ARB_buffer_storage");
|
||||
status &= status_and_override(found_GL_ARB_shader_subroutine,"GL_ARB_shader_subroutine");
|
||||
status &= status_and_override(found_GL_ARB_explicit_uniform_location,"GL_ARB_explicit_uniform_location");
|
||||
|
||||
status &= status_and_override(found_GL_ARB_texture_storage, "GL_ARB_texture_storage", true);
|
||||
status &= status_and_override(found_GL_ARB_shading_language_420pack,"GL_ARB_shading_language_420pack");
|
||||
|
|
|
@ -279,4 +279,5 @@ namespace GLLoader {
|
|||
extern bool found_GL_ARB_buffer_storage;
|
||||
extern bool found_GL_ARB_shader_subroutine;
|
||||
extern bool found_GL_ARB_bindless_texture;
|
||||
extern bool found_GL_ARB_explicit_uniform_location;
|
||||
}
|
||||
|
|
|
@ -826,7 +826,8 @@ EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
|
|||
{
|
||||
string s = "GSdx";
|
||||
|
||||
if(s_gs != NULL) // TODO: this gets called from a different thread concurrently with GSOpen (on linux)
|
||||
// TODO: this gets called from a different thread concurrently with GSOpen (on linux)
|
||||
if(s_gs == NULL) return;
|
||||
|
||||
if(s_gs->m_GStitleInfoBuffer[0])
|
||||
{
|
||||
|
|
|
@ -108,6 +108,7 @@ GSDeviceOGL::~GSDeviceOGL()
|
|||
delete m_ps_cb;
|
||||
gl_DeleteSamplers(1, &m_palette_ss);
|
||||
delete m_vb;
|
||||
m_shader->Delete(m_apitrace);
|
||||
|
||||
for (uint32 key = 0; key < VSSelector::size(); key++) m_shader->Delete(m_vs[key]);
|
||||
m_shader->Delete(m_gs);
|
||||
|
@ -296,10 +297,11 @@ bool GSDeviceOGL::Create(GSWnd* wnd)
|
|||
m_date.dss->SetStencil(GL_ALWAYS, GL_REPLACE);
|
||||
|
||||
m_date.bs = new GSBlendStateOGL();
|
||||
#ifndef ENABLE_OGL_STENCIL_DEBUG
|
||||
// Only keep stencil data
|
||||
m_date.bs->SetMask(false, false, false, false);
|
||||
#endif
|
||||
// FIXME impact image load?
|
||||
//#ifndef ENABLE_OGL_STENCIL_DEBUG
|
||||
// // Only keep stencil data
|
||||
// m_date.bs->SetMask(false, false, false, false);
|
||||
//#endif
|
||||
|
||||
// ****************************************************************
|
||||
// HW renderer shader
|
||||
|
@ -538,9 +540,9 @@ void GSDeviceOGL::InitPrimDateTexture(int w, int h)
|
|||
{
|
||||
// Create a texture to avoid the useless clean@0
|
||||
if (m_date.t == NULL)
|
||||
m_date.t = CreateTexture(w, h, GL_R32UI);
|
||||
m_date.t = CreateTexture(w, h, GL_R32I);
|
||||
|
||||
ClearRenderTarget_ui(m_date.t, 0xFFFFFFFF);
|
||||
ClearRenderTarget_ui(m_date.t, 0x0FFFFFFF);
|
||||
|
||||
#ifdef ENABLE_OGL_STENCIL_DEBUG
|
||||
gl_ActiveTexture(GL_TEXTURE0 + 5);
|
||||
|
@ -557,9 +559,9 @@ void GSDeviceOGL::BindDateTexture()
|
|||
// TODO: multibind?
|
||||
// GLuint textures[1] = {static_cast<GSTextureOGL*>(m_date.t)->GetID()};
|
||||
// gl_BindImageTextures(0, 1, textures);
|
||||
//gl_BindImageTexture(0, 0, 0, true, 0, GL_READ_WRITE, GL_R32UI);
|
||||
//gl_BindImageTexture(0, 0, 0, true, 0, GL_READ_WRITE, GL_R32I);
|
||||
|
||||
gl_BindImageTexture(0, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32UI);
|
||||
gl_BindImageTexture(0, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32I);
|
||||
}
|
||||
|
||||
void GSDeviceOGL::RecycleDateTexture()
|
||||
|
|
|
@ -509,6 +509,7 @@ class GSDeviceOGL : public GSDevice
|
|||
GSDepthStencilOGL* m_om_dss[1<<6];
|
||||
hash_map<uint32, GLuint > m_ps;
|
||||
hash_map<uint32, GSBlendStateOGL* > m_om_bs;
|
||||
GLuint m_apitrace;
|
||||
|
||||
GLuint m_palette_ss;
|
||||
GLuint m_rt_ss;
|
||||
|
|
|
@ -268,10 +268,11 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
|
|||
om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
|
||||
|
||||
// TODO
|
||||
//if (UserHacks_DateGL4 && DATE && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) {
|
||||
if (UserHacks_DateGL4 && DATE) {
|
||||
if (UserHacks_DateGL4 && DATE && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) {
|
||||
//if (!(context->FBA.FBA && context->TEST.DATM == 1))
|
||||
advance_DATE = true;
|
||||
|
||||
//advance_DATE = true;
|
||||
advance_DATE = GLLoader::found_GL_ARB_shader_image_load_store;
|
||||
}
|
||||
|
||||
// vs
|
||||
|
@ -500,8 +501,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
|
|||
dev->SetupCB(&vs_cb, &ps_cb);
|
||||
|
||||
if (advance_DATE) {
|
||||
// Create an r32ui image that will contain primitive ID
|
||||
// Note: do it at the beginning because the clean will dirty the state
|
||||
// Create an r32i image that will contain primitive ID
|
||||
// Note: do it at the beginning because the clean will dirty the FBO state
|
||||
//dev->InitPrimDateTexture(rtsize.x, rtsize.y);
|
||||
|
||||
// Don't write anything on the color buffer
|
||||
|
|
|
@ -89,6 +89,7 @@ void GSShaderOGL::PS(GLuint s, GLuint sub_count)
|
|||
|
||||
GLState::ps = s;
|
||||
GLState::dirty_prog = true;
|
||||
GLState::dirty_subroutine_ps = true;
|
||||
#ifndef ENABLE_GLES
|
||||
if (GLLoader::found_GL_ARB_separate_shader_objects) {
|
||||
gl_UseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s);
|
||||
|
@ -278,10 +279,10 @@ GLuint GSShaderOGL::LinkNewProgram()
|
|||
void GSShaderOGL::UseProgram()
|
||||
{
|
||||
if (GLState::dirty_prog) {
|
||||
GLState::dirty_subroutine_ps = true;
|
||||
GLState::dirty_ressources = true;
|
||||
|
||||
if (!GLLoader::found_GL_ARB_separate_shader_objects) {
|
||||
GLState::dirty_subroutine_ps = true;
|
||||
GLState::dirty_ressources = true;
|
||||
|
||||
hash_map<uint64, GLuint >::iterator it;
|
||||
// Note: shader are integer lookup pointer. They start from 1 and incr
|
||||
// every time you create a new shader OR a new program.
|
||||
|
@ -340,19 +341,22 @@ std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, co
|
|||
}
|
||||
if (GLLoader::found_GL_ARB_separate_shader_objects) {
|
||||
// Need GL version 410
|
||||
header += "#extension GL_ARB_separate_shader_objects : require\n";
|
||||
header += "#extension GL_ARB_separate_shader_objects: require\n";
|
||||
} else {
|
||||
header += "#define DISABLE_SSO\n";
|
||||
}
|
||||
if (GLLoader::found_only_gl30) {
|
||||
// Need version 330
|
||||
header += "#extension GL_ARB_explicit_attrib_location : require\n";
|
||||
header += "#extension GL_ARB_explicit_attrib_location: require\n";
|
||||
// Need version 140
|
||||
header += "#extension GL_ARB_uniform_buffer_object : require\n";
|
||||
header += "#extension GL_ARB_uniform_buffer_object: require\n";
|
||||
}
|
||||
if (GLLoader::found_GL_ARB_shader_subroutine) {
|
||||
if (GLLoader::found_GL_ARB_shader_subroutine && GLLoader::found_GL_ARB_explicit_uniform_location) {
|
||||
// Need GL version 400
|
||||
header += "#define SUBROUTINE_GL40 1\n";
|
||||
header += "#extension GL_ARB_shader_subroutine: require\n";
|
||||
// Need GL version 430
|
||||
header += "#extension GL_ARB_explicit_uniform_location: require\n";
|
||||
}
|
||||
#ifdef ENABLE_OGL_STENCIL_DEBUG
|
||||
header += "#define ENABLE_OGL_STENCIL_DEBUG 1\n";
|
||||
|
@ -413,7 +417,7 @@ GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& ent
|
|||
|
||||
std::string header = GenGlslHeader(entry, type, macro_sel);
|
||||
int shader_nb = 1;
|
||||
#if 0
|
||||
#if 1
|
||||
sources[0] = header.c_str();
|
||||
sources[1] = glsl_h_code;
|
||||
shader_nb++;
|
||||
|
|
|
@ -56,6 +56,9 @@ void GSDeviceOGL::CreateTextureFX()
|
|||
|
||||
for (uint32 key = 0; key < OMDepthStencilSelector::size(); key++)
|
||||
m_om_dss[key] = CreateDepthStencil(OMDepthStencilSelector(key));
|
||||
|
||||
// Help to debug FS in apitrace
|
||||
m_apitrace = CompilePS(PSSelector());
|
||||
}
|
||||
|
||||
GSDepthStencilOGL* GSDeviceOGL::CreateDepthStencil(OMDepthStencilSelector dssel)
|
||||
|
|
|
@ -204,9 +204,9 @@ GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read)
|
|||
|
||||
// Bunch of constant parameter
|
||||
switch (m_format) {
|
||||
case GL_R32UI:
|
||||
case GL_R32I:
|
||||
m_int_format = GL_RED_INTEGER;
|
||||
m_int_type = GL_UNSIGNED_INT;
|
||||
m_int_type = GL_INT;
|
||||
m_int_alignment = 4;
|
||||
m_int_shift = 2;
|
||||
break;
|
||||
|
@ -559,7 +559,7 @@ void GSTextureOGL::SaveRaw(const string& fn, const void* image, uint32 pitch)
|
|||
|
||||
for(int h = m_size.y; h > 0; h--) {
|
||||
for (int w = m_size.x; w > 0; w--, data += 1) {
|
||||
if (*data == 0xffffffff)
|
||||
if (*data > 0xffffff)
|
||||
fprintf(fp, "");
|
||||
else {
|
||||
fprintf(fp, "%x", *data);
|
||||
|
@ -594,11 +594,11 @@ bool GSTextureOGL::Save(const string& fn, bool dds)
|
|||
glReadPixels(0, 0, m_size.x, m_size.y, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, image);
|
||||
|
||||
gl_BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
|
||||
} else if(m_format == GL_R32UI) {
|
||||
} else if(m_format == GL_R32I) {
|
||||
gl_ActiveTexture(GL_TEXTURE0 + 6);
|
||||
glBindTexture(GL_TEXTURE_2D, m_texture_id);
|
||||
|
||||
glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_INT, image);
|
||||
glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, image);
|
||||
SaveRaw(fn, image, pitch);
|
||||
|
||||
// Not supported in Save function
|
||||
|
|
|
@ -817,7 +817,7 @@ static const char* tfx_glsl =
|
|||
"#ifndef DISABLE_GL42_image\n"
|
||||
"#if PS_DATE > 0\n"
|
||||
"// FIXME how to declare memory access\n"
|
||||
"layout(r32ui, binding = 0) coherent uniform uimage2D img_prim_min;\n"
|
||||
"layout(r32i, binding = 0) coherent uniform iimage2D img_prim_min;\n"
|
||||
"#endif\n"
|
||||
"#else\n"
|
||||
"// use basic stencil\n"
|
||||
|
@ -1316,6 +1316,16 @@ static const char* tfx_glsl =
|
|||
"#if !GL_ES\n"
|
||||
"void ps_main()\n"
|
||||
"{\n"
|
||||
"#if PS_DATE == 3 && !defined(DISABLE_GL42_image)\n"
|
||||
" int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));\n"
|
||||
" // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update\n"
|
||||
" // the bad alpha value so we must keep it.\n"
|
||||
"\n"
|
||||
" if (gl_PrimitiveID > stencil_ceil) {\n"
|
||||
" discard;\n"
|
||||
" }\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
" vec4 c = ps_color();\n"
|
||||
"\n"
|
||||
" float alpha = c.a * 2.0;\n"
|
||||
|
@ -1347,33 +1357,11 @@ static const char* tfx_glsl =
|
|||
" }\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
" // TODO\n"
|
||||
" // warning non uniform flow ???\n"
|
||||
"#if PS_DATE == 3 && !defined(DISABLE_GL42_image)\n"
|
||||
" uint stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));\n"
|
||||
" // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update\n"
|
||||
" // the bad alpha value so we must keep it.\n"
|
||||
"#if 0\n"
|
||||
" if (stencil_ceil > 0)\n"
|
||||
" c = vec4(1.0, 0.0, 0.0, 1.0);\n"
|
||||
" else\n"
|
||||
" c = vec4(0.0, 1.0, 0.0, 1.0);\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"#if 1\n"
|
||||
" if (gl_PrimitiveID > stencil_ceil) {\n"
|
||||
" discard;\n"
|
||||
" }\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"#if (PS_DATE == 2 || PS_DATE == 1) && !defined(DISABLE_GL42_image)\n"
|
||||
" // Don't write anything on the framebuffer\n"
|
||||
" // Note: you can't use discard because it will also drop\n"
|
||||
" // image operation\n"
|
||||
" // Note2: output will be disabled too in opengl\n"
|
||||
"#else\n"
|
||||
" SV_Target0 = c;\n"
|
||||
" SV_Target1 = vec4(alpha, alpha, alpha, alpha);\n"
|
||||
|
|
|
@ -307,7 +307,7 @@ layout(binding = 1) uniform sampler2D PaletteSampler;
|
|||
#ifndef DISABLE_GL42_image
|
||||
#if PS_DATE > 0
|
||||
// FIXME how to declare memory access
|
||||
layout(r32ui, binding = 0) coherent uniform uimage2D img_prim_min;
|
||||
layout(r32i, binding = 0) coherent uniform iimage2D img_prim_min;
|
||||
#endif
|
||||
#else
|
||||
// use basic stencil
|
||||
|
@ -806,6 +806,16 @@ void ps_main()
|
|||
#if !GL_ES
|
||||
void ps_main()
|
||||
{
|
||||
#if PS_DATE == 3 && !defined(DISABLE_GL42_image)
|
||||
int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));
|
||||
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
|
||||
// the bad alpha value so we must keep it.
|
||||
|
||||
if (gl_PrimitiveID > stencil_ceil) {
|
||||
discard;
|
||||
}
|
||||
#endif
|
||||
|
||||
vec4 c = ps_color();
|
||||
|
||||
float alpha = c.a * 2.0;
|
||||
|
@ -837,33 +847,11 @@ void ps_main()
|
|||
}
|
||||
#endif
|
||||
|
||||
// TODO
|
||||
// warning non uniform flow ???
|
||||
#if PS_DATE == 3 && !defined(DISABLE_GL42_image)
|
||||
uint stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy));
|
||||
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
|
||||
// the bad alpha value so we must keep it.
|
||||
#if 0
|
||||
if (stencil_ceil > 0)
|
||||
c = vec4(1.0, 0.0, 0.0, 1.0);
|
||||
else
|
||||
c = vec4(0.0, 1.0, 0.0, 1.0);
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
if (gl_PrimitiveID > stencil_ceil) {
|
||||
discard;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if (PS_DATE == 2 || PS_DATE == 1) && !defined(DISABLE_GL42_image)
|
||||
// Don't write anything on the framebuffer
|
||||
// Note: you can't use discard because it will also drop
|
||||
// image operation
|
||||
// Note2: output will be disabled too in opengl
|
||||
#else
|
||||
SV_Target0 = c;
|
||||
SV_Target1 = vec4(alpha, alpha, alpha, alpha);
|
||||
|
|
Loading…
Reference in New Issue