From 78dd9577174857f1218b2b87457fa1470a6d7140 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 14 Aug 2015 17:53:41 +0200 Subject: [PATCH 1/4] gsdx-ogl: use normalized index coordinate for palette texture In palette mode, 90% of texture accesses are done in 8 bits. So let's keep this path as light as possible. It reduces GPU load. --- plugins/GSdx/res/glsl/tfx_fs.glsl | 18 +++++++++++------- plugins/GSdx/res/glsl_source.h | 18 +++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl index af71932801..df482558ed 100644 --- a/plugins/GSdx/res/glsl/tfx_fs.glsl +++ b/plugins/GSdx/res/glsl/tfx_fs.glsl @@ -80,9 +80,9 @@ vec4 sample_c(vec2 uv) return texture(TextureSampler, uv); } -vec4 sample_p(uint idx) +vec4 sample_p(float idx) { - return texelFetch(PaletteSampler, ivec2(idx, 0u), 0); + return texture(PaletteSampler, vec2(idx, 0.0f)); } vec4 wrapuv(vec4 uv) @@ -149,7 +149,7 @@ mat4 sample_4c(vec4 uv) return c; } -uvec4 sample_4_index(vec4 uv) +vec4 sample_4_index(vec4 uv) { vec4 c; @@ -169,18 +169,22 @@ uvec4 sample_4_index(vec4 uv) #if PS_IFMT == 1 // 4HH - return i >> 4u; + return vec4(i >> 4u) / 255.0f; + #elif PS_IFMT == 2 // 4HL - return i & 0xFu; + return vec4(i & 0xFu) / 255.0f; + #else + // Most of texture will hit this code so keep normalized float value + // 8 bits - return i; + return c; #endif } -mat4 sample_4p(uvec4 u) +mat4 sample_4p(vec4 u) { mat4 c; diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h index 27cb31c417..3139c86853 100644 --- a/plugins/GSdx/res/glsl_source.h +++ b/plugins/GSdx/res/glsl_source.h @@ -939,9 +939,9 @@ static const char* tfx_fs_all_glsl = " return texture(TextureSampler, uv);\n" "}\n" "\n" - "vec4 sample_p(uint idx)\n" + "vec4 sample_p(float idx)\n" "{\n" - " return texelFetch(PaletteSampler, ivec2(idx, 0u), 0);\n" + " return texture(PaletteSampler, vec2(idx, 0.0f));\n" "}\n" "\n" "vec4 wrapuv(vec4 
uv)\n" @@ -1008,7 +1008,7 @@ static const char* tfx_fs_all_glsl = " return c;\n" "}\n" "\n" - "uvec4 sample_4_index(vec4 uv)\n" + "vec4 sample_4_index(vec4 uv)\n" "{\n" " vec4 c;\n" "\n" @@ -1028,18 +1028,22 @@ static const char* tfx_fs_all_glsl = "\n" "#if PS_IFMT == 1\n" " // 4HH\n" - " return i >> 4u;\n" + " return vec4(i >> 4u) / 255.0f;\n" + "\n" "#elif PS_IFMT == 2\n" " // 4HL\n" - " return i & 0xFu;\n" + " return vec4(i & 0xFu) / 255.0f;\n" + "\n" "#else\n" + " // Most of texture will hit this code so keep normalized float value\n" + "\n" " // 8 bits\n" - " return i;\n" + " return c;\n" "#endif\n" "\n" "}\n" "\n" - "mat4 sample_4p(uvec4 u)\n" + "mat4 sample_4p(vec4 u)\n" "{\n" " mat4 c;\n" "\n" From 53d1fdd8f1327eaa81336938ede853879f1ee2b5 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 14 Aug 2015 20:14:36 +0200 Subject: [PATCH 2/4] glsl:debug: disable fst when testing texturing shader Reduce clutter in ASM dump --- plugins/GSdx/GSDeviceOGL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp index 2c8e511439..9777b96228 100644 --- a/plugins/GSdx/GSDeviceOGL.cpp +++ b/plugins/GSdx/GSDeviceOGL.cpp @@ -826,6 +826,7 @@ void GSDeviceOGL::SelfShaderTest() sel.atst = 1; sel.tfx = 1; sel.tcc = 1; + sel.fst = 1; sel.ltf = ltf; sel.aem = aem; From c5a786ed2c27ed0669f9518ce9e7a6e499a09d0c Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 14 Aug 2015 20:57:45 +0200 Subject: [PATCH 3/4] gsdx-ogl: remove support WMS/T == 2 in hardware unit I think behavior was wrong because only first texel coordinate was clamped. 
Besides, we can't interpolate if AEM isn't yet applied --- plugins/GSdx/GSRendererOGL.cpp | 2 +- plugins/GSdx/res/glsl/tfx_fs.glsl | 38 ++++++++++++++----------------- plugins/GSdx/res/glsl_source.h | 38 ++++++++++++++----------------- 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 043684b00c..1b9e122954 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -781,7 +781,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[m_context->TEX0.PSM]; const GSLocalMemory::psm_t &cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[m_context->TEX0.CPSM] : psm; bool bilinear = m_filter == 2 ? m_vt.IsLinear() : m_filter != 0; - bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && m_context->CLAMP.WMS < 3 && m_context->CLAMP.WMT < 3; + bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && m_context->CLAMP.WMS < 2 && m_context->CLAMP.WMT < 2; // Don't force extra filtering on sprite (it creates various upscaling issue) bilinear &= !((m_vt.m_primclass == GS_SPRITE_CLASS) && m_userhacks_round_sprite_offset && !m_vt.IsLinear()); diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl index df482558ed..beac76b0f1 100644 --- a/plugins/GSdx/res/glsl/tfx_fs.glsl +++ b/plugins/GSdx/res/glsl/tfx_fs.glsl @@ -63,16 +63,25 @@ layout(std140, binding = 21) uniform cb21 { vec3 FogColor; float AREF; + vec4 WH; + vec2 MinF; vec2 TA; + uvec4 MskFix; + uvec4 FbMask; - vec3 _not_yet_used; + + vec3 _pad1; float Af; + vec4 HalfTexel; + vec4 MinMax; + vec2 TC_OffsetHack; + vec2 _pad2; }; vec4 sample_c(vec2 uv) @@ -85,7 +94,7 @@ vec4 sample_p(float idx) return texture(PaletteSampler, vec2(idx, 0.0f)); } -vec4 wrapuv(vec4 uv) +vec4 clamp_wrap_uv(vec4 uv) { vec4 uv_out = uv; @@ -120,21 +129,6 @@ vec4 wrapuv(vec4 uv) return uv_out; } -vec2 clampuv(vec2 uv) -{ - vec2 uv_out = uv; -
-#if (PS_WMS == 2) && (PS_WMT == 2) - uv_out = clamp(uv, MinF, MinMax.zw); -#elif PS_WMS == 2 - uv_out.x = clamp(uv.x, MinF.x, MinMax.z); -#elif PS_WMT == 2 - uv_out.y = clamp(uv.y, MinF.y, MinMax.w); -#endif - - return uv_out; -} - mat4 sample_4c(vec4 uv) { mat4 c; @@ -211,10 +205,12 @@ vec4 sample_color(vec2 st, float q) mat4 c; vec2 dd; -#if (PS_LTF == 0 && PS_FMT <= FMT_16 && PS_WMS < 3 && PS_WMT < 3) - c[0] = sample_c(clampuv(st)); + // FIXME I'm not sure this condition is useful (I think code will be optimized) +#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2) + // No software LTF and pure 32 bits RGBA texure without special texture wrapping + c[0] = sample_c(st); #ifdef TEX_COORD_DEBUG - c[0].rg = clampuv(st).xy; + c[0].rg = st.xy; #endif #else @@ -230,7 +226,7 @@ vec4 sample_color(vec2 st, float q) uv = st.xyxy; } - uv = wrapuv(uv); + uv = clamp_wrap_uv(uv); if((PS_FMT & FMT_PAL) != 0) { diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h index 3139c86853..413c7bd8fd 100644 --- a/plugins/GSdx/res/glsl_source.h +++ b/plugins/GSdx/res/glsl_source.h @@ -922,16 +922,25 @@ static const char* tfx_fs_all_glsl = "{\n" " vec3 FogColor;\n" " float AREF;\n" + "\n" " vec4 WH;\n" + "\n" " vec2 MinF;\n" " vec2 TA;\n" + "\n" " uvec4 MskFix;\n" + "\n" " uvec4 FbMask;\n" - " vec3 _not_yet_used;\n" + "\n" + " vec3 _pad1;\n" " float Af;\n" + "\n" " vec4 HalfTexel;\n" + "\n" " vec4 MinMax;\n" + "\n" " vec2 TC_OffsetHack;\n" + " vec2 _pad2;\n" "};\n" "\n" "vec4 sample_c(vec2 uv)\n" @@ -944,7 +953,7 @@ static const char* tfx_fs_all_glsl = " return texture(PaletteSampler, vec2(idx, 0.0f));\n" "}\n" "\n" - "vec4 wrapuv(vec4 uv)\n" + "vec4 clamp_wrap_uv(vec4 uv)\n" "{\n" " vec4 uv_out = uv;\n" "\n" @@ -979,21 +988,6 @@ static const char* tfx_fs_all_glsl = " return uv_out;\n" "}\n" "\n" - "vec2 clampuv(vec2 uv)\n" - "{\n" - " vec2 uv_out = uv;\n" - "\n" - "#if (PS_WMS == 2) && (PS_WMT == 2)\n" - " uv_out = clamp(uv, MinF, MinMax.zw);\n" - 
"#elif PS_WMS == 2\n" - " uv_out.x = clamp(uv.x, MinF.x, MinMax.z);\n" - "#elif PS_WMT == 2\n" - " uv_out.y = clamp(uv.y, MinF.y, MinMax.w);\n" - "#endif\n" - "\n" - " return uv_out;\n" - "}\n" - "\n" "mat4 sample_4c(vec4 uv)\n" "{\n" " mat4 c;\n" @@ -1070,10 +1064,12 @@ static const char* tfx_fs_all_glsl = " mat4 c;\n" " vec2 dd;\n" "\n" - "#if (PS_LTF == 0 && PS_FMT <= FMT_16 && PS_WMS < 3 && PS_WMT < 3)\n" - " c[0] = sample_c(clampuv(st));\n" + " // FIXME I'm not sure this condition is useful (I think code will be optimized)\n" + "#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n" + " // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n" + " c[0] = sample_c(st);\n" "#ifdef TEX_COORD_DEBUG\n" - " c[0].rg = clampuv(st).xy;\n" + " c[0].rg = st.xy;\n" "#endif\n" "\n" "#else\n" @@ -1089,7 +1085,7 @@ static const char* tfx_fs_all_glsl = " uv = st.xyxy;\n" " }\n" "\n" - " uv = wrapuv(uv);\n" + " uv = clamp_wrap_uv(uv);\n" "\n" " if((PS_FMT & FMT_PAL) != 0)\n" " {\n" From 37f9bcf9cb5bb6372acbe4394586fc99e301bc79 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 14 Aug 2015 23:53:01 +0200 Subject: [PATCH 4/4] gsdx-ogl: reduce state change * don't dirty aref when a fog color is uploaded * only set clamp mode in clamp mode (region clamp is handled in shader) v2: fix SSE2/3 compilation --- plugins/GSdx/GSRendererOGL.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 1b9e122954..63d6cb81c1 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -760,7 +760,13 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour { ps_sel.fog = 1; - ps_cb.FogColor_AREF = GSVector4::rgba32(m_env.FOGCOL.u32[0]); + GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]); +#if _M_SSE >= 0x401 + // Blend AREF to avoid to load a random value for alpha (dirty cache) + 
ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF); +#else + ps_cb.FogColor_AREF = fc; +#endif } if (m_context->TEST.ATE) @@ -831,13 +837,17 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour if (PRIM->FST) { + // FIXME move it in the ps_cb vs_cb.TextureScale = GSVector4(1.0f / 16) / WH.xyxy(); ps_sel.fst = 1; } ps_cb.WH = WH; ps_cb.HalfTexel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw(); - ps_cb.MskFix = GSVector4i(m_context->CLAMP.MINU, m_context->CLAMP.MINV, m_context->CLAMP.MAXU, m_context->CLAMP.MAXV); + if ((m_context->CLAMP.WMS | m_context->CLAMP.WMT) > 1) { + ps_cb.MskFix = GSVector4i(m_context->CLAMP.MINU, m_context->CLAMP.MINV, m_context->CLAMP.MAXU, m_context->CLAMP.MAXV); + ps_cb.MinMax = GSVector4(ps_cb.MskFix) / WH.xyxy(); + } // TC Offset Hack ps_sel.tcoffsethack = !!UserHacks_TCOffset; @@ -849,8 +859,9 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_cb.MinMax = clamp / WH.xyxy(); ps_cb.MinF_TA = (clamp + 0.5f).xyxy(ta) / WH.xyxy(GSVector4(255, 255)); - ps_ssel.tau = (m_context->CLAMP.WMS + 3) >> 1; - ps_ssel.tav = (m_context->CLAMP.WMT + 3) >> 1; + // Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader + ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP); + ps_ssel.tav = (m_context->CLAMP.WMT != CLAMP_CLAMP); ps_ssel.ltf = bilinear && simple_sample; // Setup Texture ressources