gsdx ogl: accelerate special case of accurate date.

Games often use DATE to allow only a single pixel to pass. If this
use case is detected, the stencil buffer will be cleared after the first pixel
that passes both the depth and stencil tests.

It seems to reduce the load on the GPU.

Note: with the help of texture barriers, maybe we could implement the algorithm
in a single pass.
This commit is contained in:
Gregory Hainaut 2016-05-15 17:22:58 +02:00
parent 025be70c42
commit 3ab12cef2f
3 changed files with 25 additions and 6 deletions

View File

@ -699,7 +699,10 @@ GSDepthStencilOGL* GSDeviceOGL::CreateDepthStencil(OMDepthStencilSelector dssel)
if (dssel.date) if (dssel.date)
{ {
dss->EnableStencil(); dss->EnableStencil();
dss->SetStencil(GL_EQUAL, GL_KEEP); if (dssel.date_one)
dss->SetStencil(GL_EQUAL, GL_ZERO);
else
dss->SetStencil(GL_EQUAL, GL_KEEP);
} }
if(dssel.ztst != ZTST_ALWAYS || dssel.zwe) if(dssel.ztst != ZTST_ALWAYS || dssel.zwe)

View File

@ -336,8 +336,9 @@ class GSDeviceOGL final : public GSDevice
uint32 ztst:2; uint32 ztst:2;
uint32 zwe:1; uint32 zwe:1;
uint32 date:1; uint32 date:1;
uint32 date_one:1;
uint32 _free:28; uint32 _free:27;
}; };
uint32 key; uint32 key;
@ -441,7 +442,7 @@ class GSDeviceOGL final : public GSDevice
GLuint m_vs[1<<3]; GLuint m_vs[1<<3];
GLuint m_gs[1<<2]; GLuint m_gs[1<<2];
GLuint m_ps_ss[1<<4]; GLuint m_ps_ss[1<<4];
GSDepthStencilOGL* m_om_dss[1<<4]; GSDepthStencilOGL* m_om_dss[1<<5];
hash_map<uint64, GLuint > m_ps; hash_map<uint64, GLuint > m_ps;
GLuint m_apitrace; GLuint m_apitrace;

View File

@ -821,13 +821,28 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
DATE = false; DATE = false;
} else if (m_accurate_date && om_csel.wa /* FIXME Check the msb bit of the mask instead + the dfmt*/ } else if (m_accurate_date && om_csel.wa /* FIXME Check the msb bit of the mask instead + the dfmt*/
&& (!m_context->TEST.ATE || m_context->TEST.ATST == ATST_ALWAYS)) { && (!m_context->TEST.ATE || m_context->TEST.ATST == ATST_ALWAYS)) {
// texture barrier will split the draw call into n draw call. It is very efficient for // Performance note: check alpha range with GetAlphaMinMax()
// few primitive draws. Otherwise it sucks. // Note: all my dump are already above 120fps, but it seems to reduce GPU load
if ((m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 50) || (m_index.tail < 100)) { // with big upscaling
GetAlphaMinMax();
if (m_context->TEST.DATM && m_vt.m_alpha.max < 128) {
// Only first pixel (write 0) will pass (alpha is 1)
GL_PERF("Fast DATE with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max);
om_dssel.date_one = 1;
} else if (!m_context->TEST.DATM && m_vt.m_alpha.min >= 128) {
// Only first pixel (write 1) will pass (alpha is 0)
GL_PERF("Fast DATE with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max);
om_dssel.date_one = 1;
} else if ((m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 50) || (m_index.tail < 100)) {
// texture barrier will split the draw call into n draw call. It is very efficient for
// few primitive draws. Otherwise it sucks.
GL_PERF("Slower DATE with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max);
require_barrier = true; require_barrier = true;
DATE_GL45 = true; DATE_GL45 = true;
DATE = false; DATE = false;
} else { } else {
GL_PERF("Slow DATE with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max);
if (GLLoader::found_GL_ARB_shader_image_load_store) { if (GLLoader::found_GL_ARB_shader_image_load_store) {
DATE_GL42 = true; DATE_GL42 = true;
} else { } else {