gsdx-ogl: improve date performance for GL45

If there is no overlap, it is allowed to directly read from the render target.

On SotC testcase with 6x scaling: 30fps -> 40fps

Note: it requires GL_ARB_texture_barrier extension so be sure to have a recent driver

Note2: it requires a lot of testing too

Open question: in the case of complex date (written alpha),
would it be faster to split the draw call into multiple calls with no
primitive overlap?
This commit is contained in:
Gregory Hainaut 2015-04-24 20:53:59 +02:00
parent 795ae50ecd
commit c207632e49
8 changed files with 69 additions and 25 deletions

View File

@ -54,8 +54,8 @@ namespace GLState {
GLuint rt = 0; GLuint rt = 0;
GLuint ds = 0; GLuint ds = 0;
GLuint tex_unit[2] = {0, 0}; GLuint tex_unit[4] = {0, 0, 0, 0};
GLuint64 tex_handle[2] = { 0, 0}; GLuint64 tex_handle[4] = { 0, 0, 0, 0};
bool dirty_ressources = false; bool dirty_ressources = false;
GLuint ps = 0; GLuint ps = 0;
@ -106,10 +106,10 @@ namespace GLState {
rt = 0; rt = 0;
ds = 0; ds = 0;
tex_unit[0] = 0; for (int i = 0; i < 4; i++) {
tex_unit[1] = 0; tex_unit[i] = 0;
tex_handle[0] = 0; tex_handle[i] = 0;
tex_handle[1] = 0; }
ps = 0; ps = 0;
gs = 0; gs = 0;

View File

@ -56,8 +56,8 @@ namespace GLState {
extern GLuint rt; // render target extern GLuint rt; // render target
extern GLuint ds; // Depth-Stencil extern GLuint ds; // Depth-Stencil
extern GLuint tex_unit[2]; // shader input texture extern GLuint tex_unit[4]; // shader input texture
extern GLuint64 tex_handle[2]; // shader input texture extern GLuint64 tex_handle[4]; // shader input texture
extern GLuint ps; extern GLuint ps;
extern GLuint gs; extern GLuint gs;

View File

@ -579,7 +579,6 @@ void GSDeviceOGL::InitPrimDateTexture(GSTexture* rt)
#ifndef ENABLE_GLES #ifndef ENABLE_GLES
gl_BindImageTexture(2, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32I); gl_BindImageTexture(2, static_cast<GSTextureOGL*>(m_date.t)->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32I);
#endif #endif
gl_BindTextureUnit(3, static_cast<GSTextureOGL*>(rt)->GetID());
} }
void GSDeviceOGL::RecycleDateTexture() void GSDeviceOGL::RecycleDateTexture()

View File

@ -372,7 +372,7 @@ class GSDeviceOGL : public GSDevice
uint32 clr1:1; uint32 clr1:1;
uint32 fba:1; uint32 fba:1;
uint32 aout:1; uint32 aout:1;
uint32 date:2; uint32 date:3;
uint32 spritehack:1; uint32 spritehack:1;
uint32 tcoffsethack:1; uint32 tcoffsethack:1;
uint32 point_sampler:1; uint32 point_sampler:1;

View File

@ -151,6 +151,39 @@ void GSRendererOGL::SetupIA()
dev->IASetPrimitiveTopology(t); dev->IASetPrimitiveTopology(t);
} }
bool GSRendererOGL::PrimitiveOverlap()
{
if (m_vertex.next < 4)
return false;
if (m_vt.m_primclass != GS_SPRITE_CLASS)
return true;
// Check intersection of sprite primitive only
size_t count = m_vertex.next;
GSVertex* v = &m_vertex.buff[0];
for(size_t i = 0; i < count; i += 2) {
// Very bad code
GSVector4i vi(v[i].XYZ.X, v[i].XYZ.Y, v[i+1].XYZ.X, v[i+1].XYZ.Y);
for (size_t j = i+2; j < count; j += 2) {
GSVector4i vj(v[j].XYZ.X, v[j].XYZ.Y, v[j+1].XYZ.X, v[j+1].XYZ.Y);
GSVector4i inter = vi.rintersect(vj);
if (!inter.rempty()) {
//fprintf(stderr, "Overlap found between %d and %d (draw of %d vertices)\n", i, j, count);
//vi.print();
//vj.print();
//inter.print();
//exit(0);
return true;
}
}
}
//fprintf(stderr, "Yes, code can be optimized (draw of %d vertices)\n", count);
return false;
}
void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
{ {
GSDrawingEnvironment& env = m_env; GSDrawingEnvironment& env = m_env;
@ -160,7 +193,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
const GSVector2& rtscale = rt->GetScale(); const GSVector2& rtscale = rt->GetScale();
bool DATE = m_context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; bool DATE = m_context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
bool advance_DATE = false; bool DATE_GL42 = false;
bool DATE_GL45 = false;
ASSERT(m_dev != NULL); ASSERT(m_dev != NULL);
@ -208,14 +242,21 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask(); om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
if (DATE && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) { if (DATE) {
advance_DATE = GLLoader::found_GL_ARB_shader_image_load_store && !UserHacks_AlphaStencil; if (gl_TextureBarrier && !PrimitiveOverlap()) {
DATE_GL45 = true;
DATE = false;
} else if (om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == ATST_ALWAYS)) {
DATE_GL42 = GLLoader::found_GL_ARB_shader_image_load_store && !UserHacks_AlphaStencil;
}
} }
// DATE // DATE
if(DATE) if (DATE_GL45) {
{ gl_TextureBarrier();
dev->PSSetShaderResource(3, rt);
} else if (DATE) {
// TODO: do I need to clamp the value (if yes how? rintersect with rt?) // TODO: do I need to clamp the value (if yes how? rintersect with rt?)
GSVector4 si = GSVector4(rtscale.x, rtscale.y); GSVector4 si = GSVector4(rtscale.x, rtscale.y);
GSVector4 o = GSVector4(-1.0f, 1.0f); // Round value GSVector4 o = GSVector4(-1.0f, 1.0f); // Round value
@ -227,8 +268,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
// Must be done here to avoid any GL state pertubation (clear function...) // Must be done here to avoid any GL state pertubation (clear function...)
// Create an r32ui image that will containt primitive ID // Create an r32ui image that will containt primitive ID
if (advance_DATE) { if (DATE_GL42) {
dev->InitPrimDateTexture(rt); dev->InitPrimDateTexture(rt);
dev->PSSetShaderResource(3, rt);
} else { } else {
GSVector4 s = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y); GSVector4 s = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y);
@ -349,9 +391,10 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
// GS_SPRITE_CLASS are already flat (either by CPU or the GS) // GS_SPRITE_CLASS are already flat (either by CPU or the GS)
ps_sel.iip = (m_vt.m_primclass == GS_SPRITE_CLASS) ? 1 : PRIM->IIP; ps_sel.iip = (m_vt.m_primclass == GS_SPRITE_CLASS) ? 1 : PRIM->IIP;
if(DATE) if (DATE_GL45) {
{ ps_sel.date = 5 + context->TEST.DATM;
if (advance_DATE) } else if(DATE) {
if (DATE_GL42)
ps_sel.date = 1 + context->TEST.DATM; ps_sel.date = 1 + context->TEST.DATM;
else else
om_dssel.date = 1; om_dssel.date = 1;
@ -497,7 +540,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
dev->SetupOM(om_dssel, om_bsel, afix); dev->SetupOM(om_dssel, om_bsel, afix);
dev->SetupCB(&vs_cb, &ps_cb, ps_sel.sprite ? &gs_cb : NULL); dev->SetupCB(&vs_cb, &ps_cb, ps_sel.sprite ? &gs_cb : NULL);
if (advance_DATE) { if (DATE_GL42) {
// Create an r32i image that will contain primitive ID // Create an r32i image that will contain primitive ID
// Note: do it at the beginning because the clean will dirty the FBO state // Note: do it at the beginning because the clean will dirty the FBO state
//dev->InitPrimDateTexture(rtsize.x, rtsize.y); //dev->InitPrimDateTexture(rtsize.x, rtsize.y);
@ -598,7 +641,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
} }
} }
} }
if (advance_DATE) if (DATE_GL42)
dev->RecycleDateTexture(); dev->RecycleDateTexture();
dev->EndScene(); dev->EndScene();

View File

@ -56,4 +56,6 @@ class GSRendererOGL : public GSRendererHW
void UpdateFBA(GSTexture* rt); void UpdateFBA(GSTexture* rt);
void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex); void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex);
bool PrimitiveOverlap();
}; };

View File

@ -438,13 +438,13 @@ void ps_main()
#if !pGL_ES #if !pGL_ES
void ps_main() void ps_main()
{ {
#if PS_DATE == 1 && !defined(DISABLE_GL42_image) #if (PS_DATE & 3) == 1 && !defined(DISABLE_GL42_image)
// DATM == 0 // DATM == 0
// Pixel with alpha equal to 1 will failed // Pixel with alpha equal to 1 will failed
float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a; float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;
if ((127.5f / 255.0f) < rt_a) // < 0x80 pass (== 0x80 should not pass) if ((127.5f / 255.0f) < rt_a) // < 0x80 pass (== 0x80 should not pass)
discard; discard;
#elif PS_DATE == 2 && !defined(DISABLE_GL42_image) #elif (PS_DATE & 3) == 2 && !defined(DISABLE_GL42_image)
// DATM == 1 // DATM == 1
// Pixel with alpha equal to 0 will failed // Pixel with alpha equal to 0 will failed
float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a; float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;

View File

@ -1189,13 +1189,13 @@ static const char* tfx_fs_all_glsl =
"#if !pGL_ES\n" "#if !pGL_ES\n"
"void ps_main()\n" "void ps_main()\n"
"{\n" "{\n"
"#if PS_DATE == 1 && !defined(DISABLE_GL42_image)\n" "#if (PS_DATE & 3) == 1 && !defined(DISABLE_GL42_image)\n"
" // DATM == 0\n" " // DATM == 0\n"
" // Pixel with alpha equal to 1 will failed\n" " // Pixel with alpha equal to 1 will failed\n"
" float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;\n" " float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;\n"
" if ((127.5f / 255.0f) < rt_a) // < 0x80 pass (== 0x80 should not pass)\n" " if ((127.5f / 255.0f) < rt_a) // < 0x80 pass (== 0x80 should not pass)\n"
" discard;\n" " discard;\n"
"#elif PS_DATE == 2 && !defined(DISABLE_GL42_image)\n" "#elif (PS_DATE & 3) == 2 && !defined(DISABLE_GL42_image)\n"
" // DATM == 1\n" " // DATM == 1\n"
" // Pixel with alpha equal to 0 will failed\n" " // Pixel with alpha equal to 0 will failed\n"
" float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;\n" " float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;\n"