Merge pull request #763 from PCSX2/gsdx-texture-format

Gsdx improves texture format setup
This commit is contained in:
Gregory Hainaut 2015-09-11 14:44:31 +02:00
commit 0a7eed686c
9 changed files with 163 additions and 122 deletions

View File

@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
std::string macro = format("#define PS_FST %d\n", sel.fst)
+ format("#define PS_WMS %d\n", sel.wms)
+ format("#define PS_WMT %d\n", sel.wmt)
+ format("#define PS_FMT %d\n", sel.fmt)
+ format("#define PS_IFMT %d\n", sel.ifmt)
+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
+ format("#define PS_DFMT %d\n", sel.dfmt)
+ format("#define PS_AEM %d\n", sel.aem)
+ format("#define PS_TFX %d\n", sel.tfx)
@ -812,30 +811,27 @@ void GSDeviceOGL::SelfShaderTest()
PRINT_TEST("Tfx/Tcc");
// Test: Texture Sampling
for (int fmt = 0; fmt < 8; fmt++) {
for (int fmt = 0; fmt < 16; fmt++) {
if ((fmt & 3) == 3) continue;
for (int ltf = 0; ltf < 2; ltf++) {
for (int aem = 0; aem < 2; aem++) {
for (int ifmt = 0; ifmt < 3; ifmt++) {
for (int wms = 1; wms < 4; wms++) {
for (int wmt = 1; wmt < 4; wmt++) {
PSSelector sel;
sel.atst = 1;
sel.tfx = 1;
sel.tcc = 1;
sel.fst = 1;
for (int wms = 1; wms < 4; wms++) {
for (int wmt = 1; wmt < 4; wmt++) {
PSSelector sel;
sel.atst = 1;
sel.tfx = 1;
sel.tcc = 1;
sel.fst = 1;
sel.ltf = ltf;
sel.aem = aem;
sel.fmt = fmt;
sel.ifmt = ifmt;
sel.wms = wms;
sel.wmt = wmt;
std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
ltf, aem, fmt, ifmt, wms, wmt);
RUN_TEST;
}
sel.ltf = ltf;
sel.aem = aem;
sel.tex_fmt = fmt;
sel.wms = wms;
sel.wmt = wmt;
std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
ltf, aem, fmt, wms, wmt);
RUN_TEST;
}
}
}

View File

@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
{
// *** Word 1
// Format
uint32 fmt:3;
uint32 ifmt:2;
uint32 tex_fmt:4;
uint32 dfmt:2;
// Alpha extension/Correction
uint32 aem:1;
@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
uint32 write_rg:1;
uint32 fbmask:1;
uint32 _free1:1;
uint32 _free1:2;
// *** Word 2
// Blend and Colclip

View File

@ -396,6 +396,7 @@ void GSRendererHW::Draw()
return;
}
// FIXME: Could be removed on openGL
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);

View File

@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.wms = m_context->CLAMP.WMS;
ps_sel.wmt = m_context->CLAMP.WMT;
// Performance note:
// 1/ Don't set 0 as it is the default value
// 2/ Only keep aem when it is useful (avoid useless shader permutation)
if (ps_sel.shuffle) {
ps_sel.fmt = 0;
} else if (tex->m_palette) {
ps_sel.fmt = cpsm.fmt | 4;
ps_sel.ifmt = !tex->m_target ? 0
: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
: 0;
// Force a 32 bits access (normally shuffle is done on 16 bits)
// ps_sel.tex_fmt = 0; // removed as an optimization
ps_sel.aem = m_env.TEXA.AEM;
ASSERT(tex->m_target);
// In standard mode palette is only used when alpha channel of the RT is
// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
// It is a very bad idea to force bilinear filtering on it.
if (tex->m_target)
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
// FIXME: it is likely a bad idea to do the bilinear interpolation here
// bilinear &= m_vt.IsLinear();
} else if (tex->m_target) {
// Use an old target. AEM and index aren't resolved, so it must be done
// on the GPU
// Select the 32/24/16 bits color (AEM)
ps_sel.tex_fmt = cpsm.fmt;
ps_sel.aem = m_env.TEXA.AEM;
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / 255.0f;
// Select the index format
if (tex->m_palette) {
// FIXME Potentially improve fmt field in GSLocalMemory
if (m_context->TEX0.PSM == PSM_PSMT4HL)
ps_sel.tex_fmt |= 1 << 2;
else if (m_context->TEX0.PSM == PSM_PSMT4HH)
ps_sel.tex_fmt |= 2 << 2;
else
ps_sel.tex_fmt |= 3 << 2;
// Alpha channel of the RT is reinterpreted as an index. Star
// Ocean 3 uses it to emulate a stencil buffer. It is a very
// bad idea to force bilinear filtering on it.
bilinear &= m_vt.IsLinear();
}
} else if (tex->m_palette) {
// Use a standard 8 bits texture. AEM is already done on the CLUT
// Therefore you only need to set the index
// ps_sel.tex_fmt = 0; // removed as an optimization
// ps_sel.aem = 0; // removed as an optimization
// Note 4 bits indexes are converted to 8 bits
ps_sel.tex_fmt = 3 << 2;
//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
} else {
ps_sel.fmt = cpsm.fmt;
// Standard texture. Both index and AEM expansion were already done by the CPU.
// ps_sel.tex_fmt = 0; // removed as an optimization
// ps_sel.aem = 0; // removed as an optimization
}
ps_sel.aem = m_env.TEXA.AEM;
if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
ps_sel.tcoffsethack = !!UserHacks_TCOffset;
ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));
// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);

View File

@ -22,9 +22,13 @@
#include "stdafx.h"
#include "GSTextureCache.h"
bool s_IS_OPENGL = false;
GSTextureCache::GSTextureCache(GSRenderer* r)
: m_renderer(r)
{
s_IS_OPENGL = (theApp.GetConfig("Renderer", 12) == 12);
m_spritehack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_SpriteHack", 0) : 0;
UserHacks_HalfPixelOffset = !!theApp.GetConfig("UserHacks", 0) && !!theApp.GetConfig("UserHacks_HalfPixelOffset", 0);
@ -72,12 +76,18 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
//const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[TEX0.CPSM] : psm;
GIFRegTEXA plainTEXA;
// Until DX is fixed
if (s_IS_OPENGL) {
if(psm.pal > 0)
m_renderer->m_mem.m_clut.Read32(TEX0, TEXA);
} else {
GIFRegTEXA plainTEXA;
plainTEXA.AEM = 1;
plainTEXA.TA0 = 0;
plainTEXA.TA1 = 0x80;
m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
plainTEXA.AEM = 1;
plainTEXA.TA0 = 0;
plainTEXA.TA1 = 0x80;
m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
}
const uint32* clut = m_renderer->m_mem.m_clut;
@ -85,26 +95,27 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
list<Source*>& m = m_src.m_map[TEX0.TBP0 >> 5];
for(list<Source*>::iterator i = m.begin(); i != m.end(); i++)
{
Source* s = *i;
if(((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
{
if (((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
continue;
}
// Special check for palette texture (psm.pal > 0)
//
// if m_paltex is enabled
// 1/ s->m_palette must always be defined
// 2/ Clut is useless (will be uploaded again at the end of the function)
//
// if m_paltex is disabled
// 1/ Clut must match if m_palette is NULL
if(s->m_palette == NULL && psm.pal > 0 && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
{
continue;
// Targets are converted (AEM & palette) on the fly by the GPU. They don't need an extra check
if (!s->m_target) {
// We request a palette texture (psm.pal). If the texture was
// converted by the CPU (s->m_palette == NULL), we need to ensure
// palette content is the same.
// Note: content of the palette will be uploaded at the end of the function
if (psm.pal > 0 && s->m_palette == NULL && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
continue;
// We request a 24/16 bit RGBA texture. Alpha expansion was done by
// the CPU. We need to check that TEXA is identical
if (psm.pal == 0 && psm.fmt > 0 && s->m_TEXA.u64 != TEXA.u64)
continue;
}
m.splice(m.begin(), m, i);
@ -147,7 +158,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
uint32 t_psm = (t->m_dirty_alpha) ? t->m_TEX0.PSM & ~0x1 : t->m_TEX0.PSM;
if (GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t_psm)) {
if (!IsOpenGL() && (psm == PSM_PSMT8)) {
if (!s_IS_OPENGL && (psm == PSM_PSMT8)) {
// OpenGL can convert the texture directly in the GPU. Not sure we want to keep this
// code for DX. It fixes effect but it is slow (MGS3)
@ -324,7 +335,7 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
//
// From a performance point of view, it might cost a little on big upscaling
// but normally few RTs are missed, so it should remain reasonable.
if (IsOpenGL()) {
if (s_IS_OPENGL) {
switch (type) {
case RenderTarget: m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); break;
case DepthStencil: m_renderer->m_dev->ClearDepth(dst->m_texture, 0); break;
@ -863,7 +874,7 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con
// TODO: clean up this mess
int shader = dst->m_type != RenderTarget ? ShaderConvert_FLOAT32_TO_RGBA8 : ShaderConvert_COPY;
bool is_8bits = TEX0.PSM == PSM_PSMT8 && IsOpenGL();
bool is_8bits = TEX0.PSM == PSM_PSMT8 && s_IS_OPENGL;
if (is_8bits) {
GL_INS("Reading RT as a packed-indexed 8 bits format");
@ -1417,9 +1428,14 @@ void GSTextureCache::Source::Flush(uint32 count)
GIFRegTEXA plainTEXA;
plainTEXA.AEM = 1;
plainTEXA.TA0 = 0;
plainTEXA.TA1 = 0x80;
// Until DX is fixed
if (s_IS_OPENGL) {
plainTEXA = m_TEXA;
} else {
plainTEXA.AEM = 1;
plainTEXA.TA0 = 0;
plainTEXA.TA1 = 0x80;
}
if(m_palette)
{

View File

@ -129,7 +129,6 @@ protected:
#endif
virtual bool CanConvertDepth() { return m_can_convert_depth; }
virtual bool IsOpenGL() { return false; }
public:
GSTextureCache(GSRenderer* r);

View File

@ -32,8 +32,6 @@ protected:
void Read(Target* t, const GSVector4i& r);
virtual bool IsOpenGL() { return true; }
public:
GSTextureCacheOGL(GSRenderer* r);
};

View File

@ -6,7 +6,9 @@
#define FMT_32 0
#define FMT_24 1
#define FMT_16 2
#define FMT_PAL 4 /* flag bit */
#define PS_PAL_FMT (PS_TEX_FMT >> 2)
#define PS_AEM_FMT (PS_TEX_FMT & 3)
// APITRACE_DEBUG enables forced pixel output to easily detect
// the fragment computed by primitive
@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)
uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value
#if PS_IFMT == 1
// 4HH
return vec4(i >> 4u) / 255.0f;
#elif PS_IFMT == 2
// 4HL
#if PS_PAL_FMT == 1
// 4HL
return vec4(i & 0xFu) / 255.0f;
#elif PS_PAL_FMT == 2
// 4HH
return vec4(i >> 4u) / 255.0f;
#else
// Most of texture will hit this code so keep normalized float value
@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
vec2 dd;
// FIXME I'm not sure this condition is useful (I think code will be optimized)
#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
// No software LTF and pure 32 bits RGBA texture without special texture wrapping
c[0] = sample_c(st);
#ifdef TEX_COORD_DEBUG
@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)
uv = clamp_wrap_uv(uv);
if((PS_FMT & FMT_PAL) != 0)
{
c = sample_4p(sample_4_index(uv));
}
else
{
c = sample_4c(uv);
}
#if PS_PAL_FMT != 0
c = sample_4p(sample_4_index(uv));
#else
c = sample_4c(uv);
#endif
#ifdef TEX_COORD_DEBUG
c[0].rg = uv.xy;
c[1].rg = uv.xy;
@ -246,18 +246,17 @@ vec4 sample_color(vec2 st, float q)
#endif
// PERF: see the impact of the expansion before/after the interpolation
for (int i = 0; i < 4; i++)
{
// PERF note: using a dot product reduces the number of instructions by 1,
// but I'm not sure it is equivalent, nor that it is faster.
// PERF note: using a dot product reduces the number of instructions by 1,
// but I'm not sure it is equivalent, nor that it is faster.
for (int i = 0; i < 4; i++)
{
//float sum = dot(c[i].rgb, vec3(1.0f));
#if ((PS_FMT & ~FMT_PAL) == FMT_24)
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#if (PS_AEM_FMT == FMT_24)
c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#elif (PS_AEM_FMT == FMT_16)
c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
#endif
}

View File

@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
"#define FMT_32 0\n"
"#define FMT_24 1\n"
"#define FMT_16 2\n"
"#define FMT_PAL 4 /* flag bit */\n"
"\n"
"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
"\n"
"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
"// the fragment computed by primitive\n"
@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
"\n"
" uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
"\n"
"#if PS_IFMT == 1\n"
" // 4HH\n"
" return vec4(i >> 4u) / 255.0f;\n"
"\n"
"#elif PS_IFMT == 2\n"
" // 4HL\n"
"#if PS_PAL_FMT == 1\n"
" // 4HL\n"
" return vec4(i & 0xFu) / 255.0f;\n"
"\n"
"#elif PS_PAL_FMT == 2\n"
" // 4HH\n"
" return vec4(i >> 4u) / 255.0f;\n"
"\n"
"#else\n"
" // Most of texture will hit this code so keep normalized float value\n"
"\n"
@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
" vec2 dd;\n"
"\n"
" // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
" // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
" c[0] = sample_c(st);\n"
"#ifdef TEX_COORD_DEBUG\n"
@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
"\n"
" uv = clamp_wrap_uv(uv);\n"
"\n"
" if((PS_FMT & FMT_PAL) != 0)\n"
" {\n"
" c = sample_4p(sample_4_index(uv));\n"
" }\n"
" else\n"
" {\n"
" c = sample_4c(uv);\n"
" }\n"
"#if PS_PAL_FMT != 0\n"
" c = sample_4p(sample_4_index(uv));\n"
"#else\n"
" c = sample_4c(uv);\n"
"#endif\n"
"\n"
"#ifdef TEX_COORD_DEBUG\n"
" c[0].rg = uv.xy;\n"
" c[1].rg = uv.xy;\n"
@ -1150,18 +1150,17 @@ static const char* tfx_fs_all_glsl =
"\n"
"#endif\n"
"\n"
" // PERF: see the impact of the exansion before/after the interpolation\n"
" for (int i = 0; i < 4; i++)\n"
" {\n"
" // PERF note: using dot product reduces by 1 the number of instruction\n"
" // but I'm not sure it is equivalent neither faster.\n"
" // PERF note: using dot product reduces by 1 the number of instruction\n"
" // but I'm not sure it is equivalent neither faster.\n"
" for (int i = 0; i < 4; i++)\n"
" {\n"
" //float sum = dot(c[i].rgb, vec3(1.0f));\n"
"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#if (PS_AEM_FMT == FMT_24)\n"
" c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#elif (PS_AEM_FMT == FMT_16)\n"
" c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
" //c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
"#endif\n"
" }\n"
"\n"