mirror of https://github.com/PCSX2/pcsx2.git
gsdx-ogl: properly redo the setup of the texture format
* Add a lengthy comment to explain the format * Likely reduces the number of shader permutations * Avoids slow AEM (on GPU). Expect regressions because TC needs some fixes. v2: fix palette mode
This commit is contained in:
parent
2a8bae187f
commit
78569ee833
|
@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
|
|||
std::string macro = format("#define PS_FST %d\n", sel.fst)
|
||||
+ format("#define PS_WMS %d\n", sel.wms)
|
||||
+ format("#define PS_WMT %d\n", sel.wmt)
|
||||
+ format("#define PS_FMT %d\n", sel.fmt)
|
||||
+ format("#define PS_IFMT %d\n", sel.ifmt)
|
||||
+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
|
||||
+ format("#define PS_DFMT %d\n", sel.dfmt)
|
||||
+ format("#define PS_AEM %d\n", sel.aem)
|
||||
+ format("#define PS_TFX %d\n", sel.tfx)
|
||||
|
@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest()
|
|||
PRINT_TEST("Tfx/Tcc");
|
||||
|
||||
// Test: Texture Sampling
|
||||
for (int fmt = 0; fmt < 8; fmt++) {
|
||||
for (int fmt = 0; fmt < 16; fmt++) {
|
||||
if ((fmt & 3) == 3) continue;
|
||||
|
||||
for (int ltf = 0; ltf < 2; ltf++) {
|
||||
for (int aem = 0; aem < 2; aem++) {
|
||||
for (int ifmt = 0; ifmt < 3; ifmt++) {
|
||||
for (int wms = 1; wms < 4; wms++) {
|
||||
for (int wmt = 1; wmt < 4; wmt++) {
|
||||
PSSelector sel;
|
||||
sel.atst = 1;
|
||||
sel.tfx = 1;
|
||||
sel.tcc = 1;
|
||||
sel.fst = 1;
|
||||
for (int wms = 1; wms < 4; wms++) {
|
||||
for (int wmt = 1; wmt < 4; wmt++) {
|
||||
PSSelector sel;
|
||||
sel.atst = 1;
|
||||
sel.tfx = 1;
|
||||
sel.tcc = 1;
|
||||
sel.fst = 1;
|
||||
|
||||
sel.ltf = ltf;
|
||||
sel.aem = aem;
|
||||
sel.fmt = fmt;
|
||||
sel.ifmt = ifmt;
|
||||
sel.wms = wms;
|
||||
sel.wmt = wmt;
|
||||
std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
|
||||
ltf, aem, fmt, ifmt, wms, wmt);
|
||||
RUN_TEST;
|
||||
}
|
||||
sel.ltf = ltf;
|
||||
sel.aem = aem;
|
||||
sel.tex_fmt = fmt;
|
||||
sel.wms = wms;
|
||||
sel.wmt = wmt;
|
||||
std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
|
||||
ltf, aem, fmt, wms, wmt);
|
||||
RUN_TEST;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
|
|||
{
|
||||
// *** Word 1
|
||||
// Format
|
||||
uint32 fmt:3;
|
||||
uint32 ifmt:2;
|
||||
uint32 tex_fmt:4;
|
||||
uint32 dfmt:2;
|
||||
// Alpha extension/Correction
|
||||
uint32 aem:1;
|
||||
|
@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
|
|||
uint32 write_rg:1;
|
||||
uint32 fbmask:1;
|
||||
|
||||
uint32 _free1:1;
|
||||
uint32 _free1:2;
|
||||
|
||||
// *** Word 2
|
||||
// Blend and Colclip
|
||||
|
|
|
@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
|
|||
ps_sel.wms = m_context->CLAMP.WMS;
|
||||
ps_sel.wmt = m_context->CLAMP.WMT;
|
||||
|
||||
// Performance note:
|
||||
// 1/ Don't set 0 as it is the default value
|
||||
// 2/ Only keep aem when it is useful (avoid useless shader permutation)
|
||||
if (ps_sel.shuffle) {
|
||||
ps_sel.fmt = 0;
|
||||
} else if (tex->m_palette) {
|
||||
ps_sel.fmt = cpsm.fmt | 4;
|
||||
ps_sel.ifmt = !tex->m_target ? 0
|
||||
: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
|
||||
: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
|
||||
: 0;
|
||||
// Force a 32 bits access (normally shuffle is done on 16 bits)
|
||||
// ps_sel.tex_fmt = 0; // removed as an optimization
|
||||
ps_sel.aem = m_env.TEXA.AEM;
|
||||
ASSERT(tex->m_target);
|
||||
|
||||
// In standard mode palette is only used when alpha channel of the RT is
|
||||
// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
|
||||
// It is a very bad idea to force bilinear filtering on it.
|
||||
if (tex->m_target)
|
||||
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
|
||||
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
|
||||
|
||||
// FIXME: it is likely a bad idea to do the bilinear interpolation here
|
||||
// bilinear &= m_vt.IsLinear();
|
||||
|
||||
} else if (tex->m_target) {
|
||||
// Use an old target. AEM and index aren't resolved it must be done
|
||||
// on the GPU
|
||||
|
||||
// Select the 32/24/16 bits color (AEM)
|
||||
ps_sel.tex_fmt = cpsm.fmt;
|
||||
ps_sel.aem = m_env.TEXA.AEM;
|
||||
|
||||
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
|
||||
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
|
||||
|
||||
// Select the index format
|
||||
if (tex->m_palette) {
|
||||
// FIXME Potentially improve fmt field in GSLocalMemory
|
||||
if (m_context->TEX0.PSM == PSM_PSMT4HL)
|
||||
ps_sel.tex_fmt |= 1 << 2;
|
||||
else if (m_context->TEX0.PSM == PSM_PSMT4HH)
|
||||
ps_sel.tex_fmt |= 2 << 2;
|
||||
else
|
||||
ps_sel.tex_fmt |= 3 << 2;
|
||||
|
||||
// Alpha channel of the RT is reinterpreted as an index. Star
|
||||
// Ocean 3 uses it to emulate a stencil buffer. It is a very
|
||||
// bad idea to force bilinear filtering on it.
|
||||
bilinear &= m_vt.IsLinear();
|
||||
}
|
||||
|
||||
} else if (tex->m_palette) {
|
||||
// Use a standard 8 bits texture. AEM is already done on the CLUT
|
||||
// Therefore you only need to set the index
|
||||
// ps_sel.tex_fmt = 0; // removed as an optimization
|
||||
// ps_sel.aem = 0; // removed as an optimization
|
||||
|
||||
// Note 4 bits indexes are converted to 8 bits
|
||||
ps_sel.tex_fmt = 3 << 2;
|
||||
|
||||
//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
|
||||
} else {
|
||||
ps_sel.fmt = cpsm.fmt;
|
||||
// Standard texture. Both index and AEM expansion were already done by the CPU.
|
||||
// ps_sel.tex_fmt = 0; // removed as an optimization
|
||||
// ps_sel.aem = 0; // removed as an optimization
|
||||
}
|
||||
ps_sel.aem = m_env.TEXA.AEM;
|
||||
|
||||
if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
|
||||
// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
|
||||
|
@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
|
|||
ps_sel.tcoffsethack = !!UserHacks_TCOffset;
|
||||
ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
|
||||
|
||||
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
|
||||
ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));
|
||||
|
||||
// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
|
||||
ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);
|
||||
|
|
|
@ -6,7 +6,9 @@
|
|||
#define FMT_32 0
|
||||
#define FMT_24 1
|
||||
#define FMT_16 2
|
||||
#define FMT_PAL 4 /* flag bit */
|
||||
|
||||
#define PS_PAL_FMT (PS_TEX_FMT >> 2)
|
||||
#define PS_AEM_FMT (PS_TEX_FMT & 3)
|
||||
|
||||
// APITRACE_DEBUG enables forced pixel output to easily detect
|
||||
// the fragment computed by primitive
|
||||
|
@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)
|
|||
|
||||
uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value
|
||||
|
||||
#if PS_IFMT == 1
|
||||
// 4HH
|
||||
return vec4(i >> 4u) / 255.0f;
|
||||
|
||||
#elif PS_IFMT == 2
|
||||
// 4HL
|
||||
#if PS_PAL_FMT == 1
|
||||
// 4HL
|
||||
return vec4(i & 0xFu) / 255.0f;
|
||||
|
||||
#elif PS_PAL_FMT == 2
|
||||
// 4HH
|
||||
return vec4(i >> 4u) / 255.0f;
|
||||
|
||||
#else
|
||||
// Most of texture will hit this code so keep normalized float value
|
||||
|
||||
|
@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
|
|||
vec2 dd;
|
||||
|
||||
// FIXME I'm not sure this condition is useful (I think code will be optimized)
|
||||
#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
|
||||
#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
|
||||
// No software LTF and pure 32 bits RGBA texure without special texture wrapping
|
||||
c[0] = sample_c(st);
|
||||
#ifdef TEX_COORD_DEBUG
|
||||
|
@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)
|
|||
|
||||
uv = clamp_wrap_uv(uv);
|
||||
|
||||
if((PS_FMT & FMT_PAL) != 0)
|
||||
{
|
||||
c = sample_4p(sample_4_index(uv));
|
||||
}
|
||||
else
|
||||
{
|
||||
c = sample_4c(uv);
|
||||
}
|
||||
#if PS_PAL_FMT != 0
|
||||
c = sample_4p(sample_4_index(uv));
|
||||
#else
|
||||
c = sample_4c(uv);
|
||||
#endif
|
||||
|
||||
#ifdef TEX_COORD_DEBUG
|
||||
c[0].rg = uv.xy;
|
||||
c[1].rg = uv.xy;
|
||||
|
@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q)
|
|||
|
||||
#endif
|
||||
|
||||
// PERF: see the impact of the exansion before/after the interpolation
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
// PERF note: using dot product reduces by 1 the number of instruction
|
||||
// but I'm not sure it is equivalent neither faster.
|
||||
// PERF note: using dot product reduces by 1 the number of instruction
|
||||
// but I'm not sure it is equivalent neither faster.
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
//float sum = dot(c[i].rgb, vec3(1.0f));
|
||||
#if ((PS_FMT & ~FMT_PAL) == FMT_24)
|
||||
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
|
||||
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
|
||||
#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
|
||||
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
|
||||
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
|
||||
#if (PS_AEM_FMT == FMT_24)
|
||||
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
|
||||
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
|
||||
#elif (PS_AEM_FMT == FMT_16)
|
||||
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
|
||||
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
|
|||
"#define FMT_32 0\n"
|
||||
"#define FMT_24 1\n"
|
||||
"#define FMT_16 2\n"
|
||||
"#define FMT_PAL 4 /* flag bit */\n"
|
||||
"\n"
|
||||
"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
|
||||
"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
|
||||
"\n"
|
||||
"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
|
||||
"// the fragment computed by primitive\n"
|
||||
|
@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
|
|||
"\n"
|
||||
" uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
|
||||
"\n"
|
||||
"#if PS_IFMT == 1\n"
|
||||
" // 4HH\n"
|
||||
" return vec4(i >> 4u) / 255.0f;\n"
|
||||
"\n"
|
||||
"#elif PS_IFMT == 2\n"
|
||||
" // 4HL\n"
|
||||
"#if PS_PAL_FMT == 1\n"
|
||||
" // 4HL\n"
|
||||
" return vec4(i & 0xFu) / 255.0f;\n"
|
||||
"\n"
|
||||
"#elif PS_PAL_FMT == 2\n"
|
||||
" // 4HH\n"
|
||||
" return vec4(i >> 4u) / 255.0f;\n"
|
||||
"\n"
|
||||
"#else\n"
|
||||
" // Most of texture will hit this code so keep normalized float value\n"
|
||||
"\n"
|
||||
|
@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
|
|||
" vec2 dd;\n"
|
||||
"\n"
|
||||
" // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
|
||||
"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
|
||||
"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
|
||||
" // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
|
||||
" c[0] = sample_c(st);\n"
|
||||
"#ifdef TEX_COORD_DEBUG\n"
|
||||
|
@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
|
|||
"\n"
|
||||
" uv = clamp_wrap_uv(uv);\n"
|
||||
"\n"
|
||||
" if((PS_FMT & FMT_PAL) != 0)\n"
|
||||
" {\n"
|
||||
" c = sample_4p(sample_4_index(uv));\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" c = sample_4c(uv);\n"
|
||||
" }\n"
|
||||
"#if PS_PAL_FMT != 0\n"
|
||||
" c = sample_4p(sample_4_index(uv));\n"
|
||||
"#else\n"
|
||||
" c = sample_4c(uv);\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"#ifdef TEX_COORD_DEBUG\n"
|
||||
" c[0].rg = uv.xy;\n"
|
||||
" c[1].rg = uv.xy;\n"
|
||||
|
@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl =
|
|||
"\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
" // PERF: see the impact of the exansion before/after the interpolation\n"
|
||||
" for (int i = 0; i < 4; i++)\n"
|
||||
" {\n"
|
||||
" // PERF note: using dot product reduces by 1 the number of instruction\n"
|
||||
" // but I'm not sure it is equivalent neither faster.\n"
|
||||
" // PERF note: using dot product reduces by 1 the number of instruction\n"
|
||||
" // but I'm not sure it is equivalent neither faster.\n"
|
||||
" for (int i = 0; i < 4; i++)\n"
|
||||
" {\n"
|
||||
" //float sum = dot(c[i].rgb, vec3(1.0f));\n"
|
||||
"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
|
||||
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
|
||||
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
|
||||
"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
|
||||
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
|
||||
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
|
||||
"#if (PS_AEM_FMT == FMT_24)\n"
|
||||
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
|
||||
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
|
||||
"#elif (PS_AEM_FMT == FMT_16)\n"
|
||||
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
|
||||
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
|
||||
"#endif\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
|
|
Loading…
Reference in New Issue