gsdx-ogl: properly redo the setup of the texture format

* Add a lengthy comment to explain the format
* Likely reduces the number of shader permutations
* Avoids slow AEM handling (on the GPU)

Expect regressions because the texture cache (TC) still needs some fixes

v2: fix palette mode
This commit is contained in:
Gregory Hainaut 2015-08-08 13:34:55 +02:00
parent 2a8bae187f
commit 78569ee833
5 changed files with 121 additions and 94 deletions

View File

@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
std::string macro = format("#define PS_FST %d\n", sel.fst)
+ format("#define PS_WMS %d\n", sel.wms)
+ format("#define PS_WMT %d\n", sel.wmt)
+ format("#define PS_FMT %d\n", sel.fmt)
+ format("#define PS_IFMT %d\n", sel.ifmt)
+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
+ format("#define PS_DFMT %d\n", sel.dfmt)
+ format("#define PS_AEM %d\n", sel.aem)
+ format("#define PS_TFX %d\n", sel.tfx)
@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest()
PRINT_TEST("Tfx/Tcc");
// Test: Texture Sampling
for (int fmt = 0; fmt < 8; fmt++) {
for (int fmt = 0; fmt < 16; fmt++) {
if ((fmt & 3) == 3) continue;
for (int ltf = 0; ltf < 2; ltf++) {
for (int aem = 0; aem < 2; aem++) {
for (int ifmt = 0; ifmt < 3; ifmt++) {
for (int wms = 1; wms < 4; wms++) {
for (int wmt = 1; wmt < 4; wmt++) {
PSSelector sel;
sel.atst = 1;
sel.tfx = 1;
sel.tcc = 1;
sel.fst = 1;
for (int wms = 1; wms < 4; wms++) {
for (int wmt = 1; wmt < 4; wmt++) {
PSSelector sel;
sel.atst = 1;
sel.tfx = 1;
sel.tcc = 1;
sel.fst = 1;
sel.ltf = ltf;
sel.aem = aem;
sel.fmt = fmt;
sel.ifmt = ifmt;
sel.wms = wms;
sel.wmt = wmt;
std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
ltf, aem, fmt, ifmt, wms, wmt);
RUN_TEST;
}
sel.ltf = ltf;
sel.aem = aem;
sel.tex_fmt = fmt;
sel.wms = wms;
sel.wmt = wmt;
std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
ltf, aem, fmt, wms, wmt);
RUN_TEST;
}
}
}

View File

@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
{
// *** Word 1
// Format
uint32 fmt:3;
uint32 ifmt:2;
uint32 tex_fmt:4;
uint32 dfmt:2;
// Alpha extension/Correction
uint32 aem:1;
@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
uint32 write_rg:1;
uint32 fbmask:1;
uint32 _free1:1;
uint32 _free1:2;
// *** Word 2
// Blend and Colclip

View File

@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.wms = m_context->CLAMP.WMS;
ps_sel.wmt = m_context->CLAMP.WMT;
// Performance note:
// 1/ Don't set 0 as it is the default value
// 2/ Only keep aem when it is useful (avoid useless shader permutation)
if (ps_sel.shuffle) {
ps_sel.fmt = 0;
} else if (tex->m_palette) {
ps_sel.fmt = cpsm.fmt | 4;
ps_sel.ifmt = !tex->m_target ? 0
: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
: 0;
// Force a 32 bits access (normally shuffle is done on 16 bits)
// ps_sel.tex_fmt = 0; // removed as an optimization
ps_sel.aem = m_env.TEXA.AEM;
ASSERT(tex->m_target);
// In standard mode palette is only used when alpha channel of the RT is
// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
// It is a very bad idea to force bilinear filtering on it.
if (tex->m_target)
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
// FIXME: it is likely a bad idea to do the bilinear interpolation here
// bilinear &= m_vt.IsLinear();
} else if (tex->m_target) {
// Use an old target. AEM and index aren't resolved it must be done
// on the GPU
// Select the 32/24/16 bits color (AEM)
ps_sel.tex_fmt = cpsm.fmt;
ps_sel.aem = m_env.TEXA.AEM;
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
// Select the index format
if (tex->m_palette) {
// FIXME Potentially improve fmt field in GSLocalMemory
if (m_context->TEX0.PSM == PSM_PSMT4HL)
ps_sel.tex_fmt |= 1 << 2;
else if (m_context->TEX0.PSM == PSM_PSMT4HH)
ps_sel.tex_fmt |= 2 << 2;
else
ps_sel.tex_fmt |= 3 << 2;
// Alpha channel of the RT is reinterpreted as an index. Star
// Ocean 3 uses it to emulate a stencil buffer. It is a very
// bad idea to force bilinear filtering on it.
bilinear &= m_vt.IsLinear();
}
} else if (tex->m_palette) {
// Use a standard 8 bits texture. AEM is already done on the CLUT
// Therefore you only need to set the index
// ps_sel.tex_fmt = 0; // removed as an optimization
// ps_sel.aem = 0; // removed as an optimization
// Note 4 bits indexes are converted to 8 bits
ps_sel.tex_fmt = 3 << 2;
//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
} else {
ps_sel.fmt = cpsm.fmt;
// Standard texture. Both index and AEM expansion were already done by the CPU.
// ps_sel.tex_fmt = 0; // removed as an optimization
// ps_sel.aem = 0; // removed as an optimization
}
ps_sel.aem = m_env.TEXA.AEM;
if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.tcoffsethack = !!UserHacks_TCOffset;
ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));
// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);

View File

@ -6,7 +6,9 @@
#define FMT_32 0
#define FMT_24 1
#define FMT_16 2
#define FMT_PAL 4 /* flag bit */
#define PS_PAL_FMT (PS_TEX_FMT >> 2)
#define PS_AEM_FMT (PS_TEX_FMT & 3)
// APITRACE_DEBUG enables forced pixel output to easily detect
// the fragment computed by primitive
@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)
uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value
#if PS_IFMT == 1
// 4HH
return vec4(i >> 4u) / 255.0f;
#elif PS_IFMT == 2
// 4HL
#if PS_PAL_FMT == 1
// 4HL
return vec4(i & 0xFu) / 255.0f;
#elif PS_PAL_FMT == 2
// 4HH
return vec4(i >> 4u) / 255.0f;
#else
// Most of texture will hit this code so keep normalized float value
@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
vec2 dd;
// FIXME I'm not sure this condition is useful (I think code will be optimized)
#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
// No software LTF and pure 32 bits RGBA texure without special texture wrapping
c[0] = sample_c(st);
#ifdef TEX_COORD_DEBUG
@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)
uv = clamp_wrap_uv(uv);
if((PS_FMT & FMT_PAL) != 0)
{
c = sample_4p(sample_4_index(uv));
}
else
{
c = sample_4c(uv);
}
#if PS_PAL_FMT != 0
c = sample_4p(sample_4_index(uv));
#else
c = sample_4c(uv);
#endif
#ifdef TEX_COORD_DEBUG
c[0].rg = uv.xy;
c[1].rg = uv.xy;
@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q)
#endif
// PERF: see the impact of the exansion before/after the interpolation
for (int i = 0; i < 4; i++)
{
// PERF note: using dot product reduces by 1 the number of instruction
// but I'm not sure it is equivalent neither faster.
// PERF note: using dot product reduces by 1 the number of instruction
// but I'm not sure it is equivalent neither faster.
for (int i = 0; i < 4; i++)
{
//float sum = dot(c[i].rgb, vec3(1.0f));
#if ((PS_FMT & ~FMT_PAL) == FMT_24)
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#if (PS_AEM_FMT == FMT_24)
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#elif (PS_AEM_FMT == FMT_16)
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#endif
}

View File

@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
"#define FMT_32 0\n"
"#define FMT_24 1\n"
"#define FMT_16 2\n"
"#define FMT_PAL 4 /* flag bit */\n"
"\n"
"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
"\n"
"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
"// the fragment computed by primitive\n"
@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
"\n"
" uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
"\n"
"#if PS_IFMT == 1\n"
" // 4HH\n"
" return vec4(i >> 4u) / 255.0f;\n"
"\n"
"#elif PS_IFMT == 2\n"
" // 4HL\n"
"#if PS_PAL_FMT == 1\n"
" // 4HL\n"
" return vec4(i & 0xFu) / 255.0f;\n"
"\n"
"#elif PS_PAL_FMT == 2\n"
" // 4HH\n"
" return vec4(i >> 4u) / 255.0f;\n"
"\n"
"#else\n"
" // Most of texture will hit this code so keep normalized float value\n"
"\n"
@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
" vec2 dd;\n"
"\n"
" // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
" // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
" c[0] = sample_c(st);\n"
"#ifdef TEX_COORD_DEBUG\n"
@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
"\n"
" uv = clamp_wrap_uv(uv);\n"
"\n"
" if((PS_FMT & FMT_PAL) != 0)\n"
" {\n"
" c = sample_4p(sample_4_index(uv));\n"
" }\n"
" else\n"
" {\n"
" c = sample_4c(uv);\n"
" }\n"
"#if PS_PAL_FMT != 0\n"
" c = sample_4p(sample_4_index(uv));\n"
"#else\n"
" c = sample_4c(uv);\n"
"#endif\n"
"\n"
"#ifdef TEX_COORD_DEBUG\n"
" c[0].rg = uv.xy;\n"
" c[1].rg = uv.xy;\n"
@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl =
"\n"
"#endif\n"
"\n"
" // PERF: see the impact of the exansion before/after the interpolation\n"
" for (int i = 0; i < 4; i++)\n"
" {\n"
" // PERF note: using dot product reduces by 1 the number of instruction\n"
" // but I'm not sure it is equivalent neither faster.\n"
" // PERF note: using dot product reduces by 1 the number of instruction\n"
" // but I'm not sure it is equivalent neither faster.\n"
" for (int i = 0; i < 4; i++)\n"
" {\n"
" //float sum = dot(c[i].rgb, vec3(1.0f));\n"
"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#if (PS_AEM_FMT == FMT_24)\n"
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#elif (PS_AEM_FMT == FMT_16)\n"
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#endif\n"
" }\n"
"\n"