From ae8df002af3a242460ad4de8b60d9fd5d16b5330 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Wed, 22 Jul 2015 09:17:48 +0200 Subject: [PATCH] gsdx-ogl: optimize Cs * As + Cd and Cs * Af + Cd blending Basically the code does the alpha multiplication in the shader; therefore the blend unit only does a pure addition. This way the multiplication is accurate and accurate_blending doesn't require a costly barrier. This code also avoids variable duplication to make the code better separated. Hopefully blending can be done in a separate function. It is preliminary work to support fast color clipping with HDR. v2: fix assertion compilation failure v3: fix regression in non-accurate mode v3: Cs * As/Af is not an accumulation. Those cases don't need the Cd addition and were already optimized anyway. Fix a regression on GoW2 --- plugins/GSdx/GSDeviceOGL.cpp | 1 + plugins/GSdx/GSDeviceOGL.h | 14 +++++--- plugins/GSdx/GSRendererOGL.cpp | 56 ++++++++++++++++++------------- plugins/GSdx/GSTextureFXOGL.cpp | 9 +++-- plugins/GSdx/res/glsl/tfx_fs.glsl | 3 ++ plugins/GSdx/res/glsl_source.h | 3 ++ 6 files changed, 54 insertions(+), 32 deletions(-) diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp index d26604bf17..a90811cca8 100644 --- a/plugins/GSdx/GSDeviceOGL.cpp +++ b/plugins/GSdx/GSDeviceOGL.cpp @@ -666,6 +666,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel) + format("#define PS_SHUFFLE %d\n", sel.shuffle) + format("#define PS_READ_BA %d\n", sel.read_ba) + format("#define PS_FBMASK %d\n", sel.fbmask) + + format("#define PS_BLEND_ACCU %d\n", sel.blend_accu) ; return m_shader->Compile("tfx.glsl", "ps_main", GL_FRAGMENT_SHADER, tfx_fs_all_glsl, macro); diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h index 16e1e6b9e7..291b831cdd 100644 --- a/plugins/GSdx/GSDeviceOGL.h +++ b/plugins/GSdx/GSDeviceOGL.h @@ -355,6 +355,7 @@ class GSDeviceOGL : public GSDevice uint32 blend_b:2; uint32 blend_c:2; uint32 blend_d:2; + uint32 blend_accu:1; uint32 
dfmt:2; uint32 _free2:21; @@ -461,8 +462,10 @@ class GSDeviceOGL : public GSDevice uint32 c:2; uint32 d:2; uint32 negative:1; + uint32 accu:1; + uint32 ps:1; - uint32 _free:22; + uint32 _free:20; }; struct @@ -470,15 +473,16 @@ class GSDeviceOGL : public GSDevice uint32 _abe:1; uint32 abcd:8; uint32 _negative:1; + uint32 _accu:1; + uint32 _ps:1; - uint32 _free2:22; + uint32 _free2:20; }; uint32 key; }; - // FIXME is the & useful ? - operator uint32() {return key & 0x3ff;} + operator uint32() {return key;} OMBlendSelector() : key(0) {} @@ -666,7 +670,7 @@ class GSDeviceOGL : public GSDevice void SetupPS(PSSelector sel); void SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb); void SetupSampler(PSSamplerSelector ssel); - void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix, bool sw_blending = false); + void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix); GLuint GetSamplerID(PSSamplerSelector ssel); GLuint GetPaletteSamplerID(); diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 4d21eb093b..2dd47de9a9 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -452,8 +452,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour const GIFRegALPHA& ALPHA = context->ALPHA; float afix = (float)context->ALPHA.FIX / 0x80; - bool sw_blending = false; - bool colclip_wrap = false; if (!IsOpaque() && rt) { @@ -471,7 +469,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour { // this works because with PABE alpha blending is on when alpha >= 0x80, but since the pixel shader // cannot output anything over 0x80 (== 1.0) blending with 0x80 or turning it off gives the same result - om_bsel.abe = 0; } else @@ -489,24 +486,34 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour int blend_sel = ((om_bsel.a * 3 + om_bsel.b) * 3 + om_bsel.c) * 3 + om_bsel.d; int blend_flag = 
GSDeviceOGL::m_blendMapD3D9[blend_sel].bogus; // SW Blend is (nearly) free. Let's use it. - int free_blend = m_sw_blending && ((blend_flag & NO_BAR) || (m_prim_overlap == PRIM_OVERLAP_NO)); + bool free_blend = (blend_flag & NO_BAR) || (m_prim_overlap == PRIM_OVERLAP_NO); + // We really need SW blending for this one, barely used + bool impossible_blend = (blend_flag & A_MAX); + // Do the multiplication in shader for blending accumulation: Cs*As + Cd or Cs*Af + Cd + ps_sel.blend_accu = m_sw_blending && ALPHA.A == 0 && ALPHA.B == 2 && ALPHA.C != 1 && ALPHA.D == 1; + om_bsel.accu = ps_sel.blend_accu; + + bool sw_blending_base = m_sw_blending && (free_blend || impossible_blend /*|| ps_sel.blend_accu*/); // Color clip - bool acc_colclip_wrap = false; + bool acc_colclip_wrap = false; if (env.COLCLAMP.CLAMP == 0) { - colclip_wrap = !tex && PRIM->PRIM != GS_POINTLIST; - acc_colclip_wrap = (m_sw_blending >= ACC_BLEND_CCLIP || free_blend); + // Not supported yet in colclip + om_bsel.accu = ps_sel.blend_accu = 0; + + acc_colclip_wrap = (m_sw_blending >= ACC_BLEND_CCLIP || sw_blending_base); if (acc_colclip_wrap) { - colclip_wrap = false; ps_sel.colclip = 3; GL_INS("COLCLIP SW ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D); - } else if (colclip_wrap) { + } else if (!PRIM->TME && PRIM->PRIM != GS_POINTLIST) { + // Standard (inaccurate) colclip ps_sel.colclip = 1; GL_INS("COLCLIP ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D); } + } else { + sw_blending_base |= m_sw_blending && ps_sel.blend_accu; } - bool impossible_blend = m_sw_blending && (blend_flag & A_MAX); bool all_blend_sw; switch (m_sw_blending) { case ACC_BLEND_ULTRA: all_blend_sw = true; break; @@ -516,8 +523,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour default: all_blend_sw = false; break; } - sw_blending = free_blend // Free case - || impossible_blend || all_blend_sw // Impossible blend or all + bool sw_blending = 
sw_blending_base // Free case or Impossible blend + || all_blend_sw // all blend || acc_colclip_wrap // accurate colclip || ps_sel.fbmask; // accurate fbmask @@ -536,6 +543,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour #endif if (sw_blending && om_bsel.abe) { // select a shader that support blending + om_bsel.ps = 1; ps_sel.blend_a = om_bsel.a; ps_sel.blend_b = om_bsel.b; ps_sel.blend_c = om_bsel.c; @@ -549,18 +557,18 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour } // No need to flush for every primitive - require_barrier |= !(blend_flag & NO_BAR); + require_barrier |= !(blend_flag & NO_BAR) && !ps_sel.blend_accu; } else { ps_sel.clr1 = om_bsel.IsCLR1(); + if (ps_sel.dfmt == 1 && ALPHA.C == 1) { + // 24 bits doesn't have an alpha channel so use 1.0f fix factor as equivalent + om_bsel.c = 2; + afix = 1.0f; + } } } if (ps_sel.dfmt == 1) { - if (ALPHA.C == 1 && !sw_blending) { - // 24 bits doesn't have an alpha channel so use 1.0f fix factor as equivalent - om_bsel.c = 2; - afix = 1.0f; - } // Disable writing of the alpha channel om_csel.wa = 0; } @@ -864,7 +872,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour GL_POP(); dev->OMSetColorMaskState(om_csel); - dev->SetupOM(om_dssel, om_bsel, afix, sw_blending); + dev->SetupOM(om_dssel, om_bsel, afix); dev->SetupCB(&vs_cb, &ps_cb); @@ -907,9 +915,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour { SendDraw(require_barrier); - if (colclip_wrap) + if (ps_sel.colclip == 1) { - ASSERT(!sw_blending); + ASSERT(!om_bsel.ps); GL_PUSH("COLCLIP"); GSDeviceOGL::OMBlendSelector om_bselneg(om_bsel); GSDeviceOGL::PSSelector ps_selneg(ps_sel); @@ -963,13 +971,13 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour om_csel.wa = a; dev->OMSetColorMaskState(om_csel); - dev->SetupOM(om_dssel, om_bsel, afix, sw_blending); + dev->SetupOM(om_dssel, om_bsel, afix); 
SendDraw(require_barrier); - if (colclip_wrap) + if (ps_sel.colclip == 1) { - ASSERT(!sw_blending); + ASSERT(!om_bsel.ps); GL_PUSH("COLCLIP"); GSDeviceOGL::OMBlendSelector om_bselneg(om_bsel); GSDeviceOGL::PSSelector ps_selneg(ps_sel); diff --git a/plugins/GSdx/GSTextureFXOGL.cpp b/plugins/GSdx/GSTextureFXOGL.cpp index f307381ac2..488ad6e0c2 100644 --- a/plugins/GSdx/GSTextureFXOGL.cpp +++ b/plugins/GSdx/GSTextureFXOGL.cpp @@ -104,7 +104,10 @@ GSBlendStateOGL* GSDeviceOGL::CreateBlend(OMBlendSelector bsel, float afix) { int i = ((bsel.a * 3 + bsel.b) * 3 + bsel.c) * 3 + bsel.d; - bs->SetRGB(m_blendMapD3D9[i].op, m_blendMapD3D9[i].src, m_blendMapD3D9[i].dst); + if (bsel.accu) + bs->SetRGB(GL_FUNC_ADD, GL_ONE, GL_ONE); + else + bs->SetRGB(m_blendMapD3D9[i].op, m_blendMapD3D9[i].src, m_blendMapD3D9[i].dst); if (m_blendMapD3D9[i].bogus & A_MAX) { if (!theApp.GetConfig("accurate_blending_unit", 1)) { @@ -187,13 +190,13 @@ GLuint GSDeviceOGL::GetPaletteSamplerID() return m_palette_ss; } -void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix, bool sw_blending) +void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, float afix) { GSDepthStencilOGL* dss = m_om_dss[dssel]; OMSetDepthStencilState(dss, 1); - if (sw_blending) { + if (bsel.ps && !bsel.accu) { if (GLState::blend) { GLState::blend = false; glDisable(GL_BLEND); diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl index f063db1ddb..794060a56e 100644 --- a/plugins/GSdx/res/glsl/tfx_fs.glsl +++ b/plugins/GSdx/res/glsl/tfx_fs.glsl @@ -453,6 +453,9 @@ void ps_blend(inout vec4 Color, float As) #if PS_BLEND_A == PS_BLEND_B Color.rgb = D; +#elif PS_BLEND_ACCU == 1 + // The D addition will be done in the blending unit + Color.rgb = trunc(A * C); #else Color.rgb = trunc((A - B) * C + D); #endif diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h index 9c9570327c..dbe510afdf 100644 --- 
a/plugins/GSdx/res/glsl_source.h +++ b/plugins/GSdx/res/glsl_source.h @@ -1316,6 +1316,9 @@ static const char* tfx_fs_all_glsl = "\n" "#if PS_BLEND_A == PS_BLEND_B\n" " Color.rgb = D;\n" + "#elif PS_BLEND_ACCU == 1\n" + " // The D addition will be done in the blending unit\n" + " Color.rgb = trunc(A * C);\n" "#else\n" " Color.rgb = trunc((A - B) * C + D);\n" "#endif\n"