diff --git a/plugins/GSdx/GSDeviceOGL.cpp b/plugins/GSdx/GSDeviceOGL.cpp index 0fef4ec759..618d8f2a6b 100644 --- a/plugins/GSdx/GSDeviceOGL.cpp +++ b/plugins/GSdx/GSDeviceOGL.cpp @@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel) std::string macro = format("#define PS_FST %d\n", sel.fst) + format("#define PS_WMS %d\n", sel.wms) + format("#define PS_WMT %d\n", sel.wmt) - + format("#define PS_FMT %d\n", sel.fmt) - + format("#define PS_IFMT %d\n", sel.ifmt) + + format("#define PS_TEX_FMT %d\n", sel.tex_fmt) + format("#define PS_DFMT %d\n", sel.dfmt) + format("#define PS_AEM %d\n", sel.aem) + format("#define PS_TFX %d\n", sel.tfx) @@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest() PRINT_TEST("Tfx/Tcc"); // Test: Texture Sampling - for (int fmt = 0; fmt < 8; fmt++) { + for (int fmt = 0; fmt < 16; fmt++) { if ((fmt & 3) == 3) continue; for (int ltf = 0; ltf < 2; ltf++) { for (int aem = 0; aem < 2; aem++) { - for (int ifmt = 0; ifmt < 3; ifmt++) { - for (int wms = 1; wms < 4; wms++) { - for (int wmt = 1; wmt < 4; wmt++) { - PSSelector sel; - sel.atst = 1; - sel.tfx = 1; - sel.tcc = 1; - sel.fst = 1; + for (int wms = 1; wms < 4; wms++) { + for (int wmt = 1; wmt < 4; wmt++) { + PSSelector sel; + sel.atst = 1; + sel.tfx = 1; + sel.tcc = 1; + sel.fst = 1; - sel.ltf = ltf; - sel.aem = aem; - sel.fmt = fmt; - sel.ifmt = ifmt; - sel.wms = wms; - sel.wmt = wmt; - std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm", - ltf, aem, fmt, ifmt, wms, wmt); - RUN_TEST; - } + sel.ltf = ltf; + sel.aem = aem; + sel.tex_fmt = fmt; + sel.wms = wms; + sel.wmt = wmt; + std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm", + ltf, aem, fmt, wms, wmt); + RUN_TEST; } } } diff --git a/plugins/GSdx/GSDeviceOGL.h b/plugins/GSdx/GSDeviceOGL.h index 04bd033905..d170650dfa 100644 --- a/plugins/GSdx/GSDeviceOGL.h +++ b/plugins/GSdx/GSDeviceOGL.h @@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice { // *** Word 1 // Format - uint32 fmt:3; - uint32 ifmt:2; + uint32 tex_fmt:4; uint32 dfmt:2; // Alpha extension/Correction uint32 aem:1; @@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice uint32 write_rg:1; uint32 fbmask:1; - uint32 _free1:1; + uint32 _free1:2; // *** Word 2 // Blend and Colclip diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 506498075f..bb3a8c7b0d 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_sel.wms = m_context->CLAMP.WMS; ps_sel.wmt = m_context->CLAMP.WMT; + // Performance note: + // 1/ Don't set 0 as it is the default value + // 2/ Only keep aem when it is useful (avoid useless shader permutation) if (ps_sel.shuffle) { - ps_sel.fmt = 0; - } else if (tex->m_palette) { - ps_sel.fmt = cpsm.fmt | 4; - ps_sel.ifmt = !tex->m_target ? 0 - : (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2 - : (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1 - : 0; + // Force a 32 bits access (normally shuffle is done on 16 bits) + // ps_sel.tex_fmt = 0; // removed as an optimization + ps_sel.aem = m_env.TEXA.AEM; + ASSERT(tex->m_target); - // In standard mode palette is only used when alpha channel of the RT is - // reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer. - // It is a very bad idea to force bilinear filtering on it. - if (tex->m_target) + GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff()); + ps_cb.MinF_TA = ta.xyxy() / 255.0f; + + // FIXME: it is likely a bad idea to do the bilinear interpolation here + // bilinear &= m_vt.IsLinear(); + + } else if (tex->m_target) { + // Use an old target. AEM and index aren't resolved it must be done + // on the GPU + + // Select the 32/24/16 bits color (AEM) + ps_sel.tex_fmt = cpsm.fmt; + ps_sel.aem = m_env.TEXA.AEM; + + GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff()); + ps_cb.MinF_TA = ta.xyxy() / 255.0f; + + // Select the index format + if (tex->m_palette) { + // FIXME Potentially improve fmt field in GSLocalMemory + if (m_context->TEX0.PSM == PSM_PSMT4HL) + ps_sel.tex_fmt |= 1 << 2; + else if (m_context->TEX0.PSM == PSM_PSMT4HH) + ps_sel.tex_fmt |= 2 << 2; + else + ps_sel.tex_fmt |= 3 << 2; + + // Alpha channel of the RT is reinterpreted as an index. Star + // Ocean 3 uses it to emulate a stencil buffer. It is a very + // bad idea to force bilinear filtering on it. bilinear &= m_vt.IsLinear(); + } + + } else if (tex->m_palette) { + // Use a standard 8 bits texture. AEM is already done on the CLUT + // Therefore you only need to set the index + // ps_sel.tex_fmt = 0; // removed as an optimization + // ps_sel.aem = 0; // removed as an optimization + + // Note 4 bits indexes are converted to 8 bits + ps_sel.tex_fmt = 3 << 2; - //GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt); } else { - ps_sel.fmt = cpsm.fmt; + // Standard texture. Both index and AEM expansion were already done by the CPU. + // ps_sel.tex_fmt = 0; // removed as an optimization + // ps_sel.aem = 0; // removed as an optimization } - ps_sel.aem = m_env.TEXA.AEM; if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) { // Micro optimization that reduces GPU load (removes 5 instructions on the FS program) @@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour ps_sel.tcoffsethack = !!UserHacks_TCOffset; ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy(); - GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff()); - ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255)); // Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP); diff --git a/plugins/GSdx/res/glsl/tfx_fs.glsl b/plugins/GSdx/res/glsl/tfx_fs.glsl index 754ffbf9b0..67c71fba46 100644 --- a/plugins/GSdx/res/glsl/tfx_fs.glsl +++ b/plugins/GSdx/res/glsl/tfx_fs.glsl @@ -6,7 +6,9 @@ #define FMT_32 0 #define FMT_24 1 #define FMT_16 2 -#define FMT_PAL 4 /* flag bit */ + +#define PS_PAL_FMT (PS_TEX_FMT >> 2) +#define PS_AEM_FMT (PS_TEX_FMT & 3) // APITRACE_DEBUG enables forced pixel output to easily detect // the fragment computed by primitive @@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv) uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value -#if PS_IFMT == 1 - // 4HH - return vec4(i >> 4u) / 255.0f; - -#elif PS_IFMT == 2 - // 4HL +#if PS_PAL_FMT == 1 + // 4HL return vec4(i & 0xFu) / 255.0f; +#elif PS_PAL_FMT == 2 + // 4HH + return vec4(i >> 4u) / 255.0f; + #else // Most of texture will hit this code so keep normalized float value @@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q) vec2 dd; // FIXME I'm not sure this condition is useful (I think code will be optimized) -#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2) +#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2) // No software LTF and pure 32 bits RGBA texure without special texture wrapping c[0] = sample_c(st); #ifdef TEX_COORD_DEBUG @@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q) uv = clamp_wrap_uv(uv); - if((PS_FMT & FMT_PAL) != 0) - { - c = sample_4p(sample_4_index(uv)); - } - else - { - c = sample_4c(uv); - } +#if PS_PAL_FMT != 0 + c = sample_4p(sample_4_index(uv)); +#else + c = sample_4c(uv); +#endif + #ifdef TEX_COORD_DEBUG c[0].rg = uv.xy; c[1].rg = uv.xy; @@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q) #endif - // PERF: see the impact of the exansion before/after the interpolation - for (int i = 0; i < 4; i++) - { - // PERF note: using dot product reduces by 1 the number of instruction - // but I'm not sure it is equivalent neither faster. + // PERF note: using dot product reduces by 1 the number of instruction + // but I'm not sure it is equivalent neither faster. + for (int i = 0; i < 4; i++) + { //float sum = dot(c[i].rgb, vec3(1.0f)); -#if ((PS_FMT & ~FMT_PAL) == FMT_24) - c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f; - //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f; -#elif ((PS_FMT & ~FMT_PAL) == FMT_16) - c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f; - //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f; +#if (PS_AEM_FMT == FMT_24) + c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f; + //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f; +#elif (PS_AEM_FMT == FMT_16) + c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f; + //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f; #endif } diff --git a/plugins/GSdx/res/glsl_source.h b/plugins/GSdx/res/glsl_source.h index fc9ecc217b..ccffb134a8 100644 --- a/plugins/GSdx/res/glsl_source.h +++ b/plugins/GSdx/res/glsl_source.h @@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl = "#define FMT_32 0\n" "#define FMT_24 1\n" "#define FMT_16 2\n" - "#define FMT_PAL 4 /* flag bit */\n" + "\n" + "#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n" + "#define PS_AEM_FMT (PS_TEX_FMT & 3)\n" "\n" "// APITRACE_DEBUG enables forced pixel output to easily detect\n" "// the fragment computed by primitive\n" @@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl = "\n" " uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n" "\n" - "#if PS_IFMT == 1\n" - " // 4HH\n" - " return vec4(i >> 4u) / 255.0f;\n" - "\n" - "#elif PS_IFMT == 2\n" - " // 4HL\n" + "#if PS_PAL_FMT == 1\n" + " // 4HL\n" " return vec4(i & 0xFu) / 255.0f;\n" "\n" + "#elif PS_PAL_FMT == 2\n" + " // 4HH\n" + " return vec4(i >> 4u) / 255.0f;\n" + "\n" "#else\n" " // Most of texture will hit this code so keep normalized float value\n" "\n" @@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl = " vec2 dd;\n" "\n" " // FIXME I'm not sure this condition is useful (I think code will be optimized)\n" - "#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n" + "#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n" " // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n" " c[0] = sample_c(st);\n" "#ifdef TEX_COORD_DEBUG\n" @@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl = "\n" " uv = clamp_wrap_uv(uv);\n" "\n" - " if((PS_FMT & FMT_PAL) != 0)\n" - " {\n" - " c = sample_4p(sample_4_index(uv));\n" - " }\n" - " else\n" - " {\n" - " c = sample_4c(uv);\n" - " }\n" + "#if PS_PAL_FMT != 0\n" + " c = sample_4p(sample_4_index(uv));\n" + "#else\n" + " c = sample_4c(uv);\n" + "#endif\n" + "\n" "#ifdef TEX_COORD_DEBUG\n" " c[0].rg = uv.xy;\n" " c[1].rg = uv.xy;\n" @@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl = "\n" "#endif\n" "\n" - " // PERF: see the impact of the exansion before/after the interpolation\n" - " for (int i = 0; i < 4; i++)\n" - " {\n" - " // PERF note: using dot product reduces by 1 the number of instruction\n" - " // but I'm not sure it is equivalent neither faster.\n" + " // PERF note: using dot product reduces by 1 the number of instruction\n" + " // but I'm not sure it is equivalent neither faster.\n" + " for (int i = 0; i < 4; i++)\n" + " {\n" " //float sum = dot(c[i].rgb, vec3(1.0f));\n" - "#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n" - " c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n" - " //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n" - "#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n" - " c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n" - " //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n" + "#if (PS_AEM_FMT == FMT_24)\n" + " c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n" + " //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n" + "#elif (PS_AEM_FMT == FMT_16)\n" + " c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n" + " //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n" "#endif\n" " }\n" "\n"