Merge pull request #763 from PCSX2/gsdx-texture-format

GSdx: improve texture format setup
2015-09-11 14:44:31 +02:00 · 2015-09-11 14:44:31 +02:00 · 0a7eed686c
parent 2a8bae187f cee0fde940
commit 0a7eed686c
9 changed files with 163 additions and 122 deletions
--- a/plugins/GSdx/GSDeviceOGL.cpp
+++ b/plugins/GSdx/GSDeviceOGL.cpp
@@ -651,8 +651,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
 	std::string macro = format("#define PS_FST %d\n", sel.fst)
 		+ format("#define PS_WMS %d\n", sel.wms)
 		+ format("#define PS_WMT %d\n", sel.wmt)
-		+ format("#define PS_FMT %d\n", sel.fmt)
-		+ format("#define PS_IFMT %d\n", sel.ifmt)
+		+ format("#define PS_TEX_FMT %d\n", sel.tex_fmt)
 		+ format("#define PS_DFMT %d\n", sel.dfmt)
 		+ format("#define PS_AEM %d\n", sel.aem)
 		+ format("#define PS_TFX %d\n", sel.tfx)
@@ -812,12 +811,11 @@ void GSDeviceOGL::SelfShaderTest()
 	PRINT_TEST("Tfx/Tcc");

 	// Test: Texture Sampling
-	for (int fmt = 0; fmt < 8; fmt++) {
+	for (int fmt = 0; fmt < 16; fmt++) {
 		if ((fmt & 3) == 3) continue;

 		for (int ltf = 0; ltf < 2; ltf++) {
 			for (int aem = 0; aem < 2; aem++) {
-				for (int ifmt = 0; ifmt < 3; ifmt++) {
 				for (int wms = 1; wms < 4; wms++) {
 					for (int wmt = 1; wmt < 4; wmt++) {
 						PSSelector sel;
@@ -828,19 +826,17 @@ void GSDeviceOGL::SelfShaderTest()

 						sel.ltf     = ltf;
 						sel.aem     = aem;
-							sel.fmt = fmt;
-							sel.ifmt = ifmt;
+						sel.tex_fmt = fmt;
 						sel.wms     = wms;
 						sel.wmt     = wmt;
-							std::string file = format("Shader_Ltf_%d__Aem_%d__Fmt_%d__Ifmt_%d__Wms_%d__Wmt_%d.glsl.asm",
-									ltf, aem, fmt, ifmt, wms, wmt);
+						std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
+								ltf, aem, fmt, wms, wmt);
 						RUN_TEST;
 					}
 				}
 			}
 		}
 	}
-	}
 	PRINT_TEST("Texture Sampling");

 	fprintf(stderr, "\nTotal %d\n", all);
--- a/plugins/GSdx/GSDeviceOGL.h
+++ b/plugins/GSdx/GSDeviceOGL.h
@@ -250,8 +250,7 @@ class GSDeviceOGL : public GSDevice
 			{
 				// *** Word 1
 				// Format
-				uint32 fmt:3;
-				uint32 ifmt:2;
+				uint32 tex_fmt:4;
 				uint32 dfmt:2;
 				// Alpha extension/Correction
 				uint32 aem:1;
@@ -276,7 +275,7 @@ class GSDeviceOGL : public GSDevice
 				uint32 write_rg:1;
 				uint32 fbmask:1;

-				uint32 _free1:1;
+				uint32 _free1:2;

 				// *** Word 2
 				// Blend and Colclip
--- a/plugins/GSdx/GSRendererHW.cpp
+++ b/plugins/GSdx/GSRendererHW.cpp
@@ -396,6 +396,7 @@ void GSRendererHW::Draw()
 			return;
 		}

+		// FIXME: Could be removed on openGL
 		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
 		{
 			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
--- a/plugins/GSdx/GSRendererOGL.cpp
+++ b/plugins/GSdx/GSRendererOGL.cpp
@@ -802,26 +802,62 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.wms = m_context->CLAMP.WMS;
 		ps_sel.wmt = m_context->CLAMP.WMT;

+		// Performance note:
+		// 1/ Don't set 0 as it is the default value
+		// 2/ Only keep aem when it is useful (avoid useless shader permutation)
 		if (ps_sel.shuffle) {
-			ps_sel.fmt = 0;
-		} else if (tex->m_palette) {
-			ps_sel.fmt = cpsm.fmt | 4;
-			ps_sel.ifmt = !tex->m_target ? 0
-				: (m_context->TEX0.PSM == PSM_PSMT4HL) ? 2
-				: (m_context->TEX0.PSM == PSM_PSMT4HH) ? 1
-				: 0;
-
-			// In standard mode palette is only used when alpha channel of the RT is
-			// reinterpreted as an index. Star Ocean 3 uses it to emulate a stencil buffer.
-			// It is a very bad idea to force bilinear filtering on it.
-			if (tex->m_target)
-				bilinear &= m_vt.IsLinear();
-
-			//GL_INS("Use palette with format %d and index format %d", ps_sel.fmt, ps_sel.ifmt);
-		} else {
-			ps_sel.fmt = cpsm.fmt;
-		}
+			// Force a 32 bits access (normally shuffle is done on 16 bits)
+			// ps_sel.tex_fmt = 0; // removed as an optimization
 			ps_sel.aem     = m_env.TEXA.AEM;
+			ASSERT(tex->m_target);
+
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// FIXME: it is likely a bad idea to do the bilinear interpolation here
+			// bilinear &= m_vt.IsLinear();
+
+		} else if (tex->m_target) {
+			// Use an old target. AEM and index aren't resolved yet, so it must
+			// be done on the GPU
+
+			// Select the 32/24/16 bits color (AEM)
+			ps_sel.tex_fmt = cpsm.fmt;
+			ps_sel.aem     = m_env.TEXA.AEM;
+
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ps_cb.MinF_TA = ta.xyxy() / 255.0f;
+
+			// Select the index format
+			if (tex->m_palette) {
+				// FIXME Potentially improve fmt field in GSLocalMemory
+				if (m_context->TEX0.PSM == PSM_PSMT4HL)
+					ps_sel.tex_fmt |= 1 << 2;
+				else if (m_context->TEX0.PSM == PSM_PSMT4HH)
+					ps_sel.tex_fmt |= 2 << 2;
+				else
+					ps_sel.tex_fmt |= 3 << 2;
+
+				// Alpha channel of the RT is reinterpreted as an index. Star
+				// Ocean 3 uses it to emulate a stencil buffer.  It is a very
+				// bad idea to force bilinear filtering on it.
+				bilinear &= m_vt.IsLinear();
+			}
+
+		} else if (tex->m_palette) {
+			// Use a standard 8 bits texture. AEM is already done on the CLUT
+			// Therefore you only need to set the index
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
+
+			// Note 4 bits indexes are converted to 8 bits
+			ps_sel.tex_fmt = 3 << 2;
+
+		} else {
+			// Standard texture. Both index and AEM expansion were already done by the CPU.
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
+		}

 		if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
 			// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
@@ -856,8 +892,6 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
 		ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();

-		GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
-		ps_cb.MinF_TA = ta.xyxy() / WH.xyxy(GSVector4(255, 255));

 		// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
 		ps_ssel.tau = (m_context->CLAMP.WMS != CLAMP_CLAMP);
--- a/plugins/GSdx/GSTextureCache.cpp
+++ b/plugins/GSdx/GSTextureCache.cpp
@@ -22,9 +22,13 @@
 #include "stdafx.h"
 #include "GSTextureCache.h"

+bool s_IS_OPENGL = false;
+
 GSTextureCache::GSTextureCache(GSRenderer* r)
 	: m_renderer(r)
 {
+	s_IS_OPENGL = (theApp.GetConfig("Renderer", 12) == 12);
+
 	m_spritehack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_SpriteHack", 0) : 0;
 	UserHacks_HalfPixelOffset = !!theApp.GetConfig("UserHacks", 0) && !!theApp.GetConfig("UserHacks_HalfPixelOffset", 0);

@@ -72,12 +76,18 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
 	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
 	//const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[TEX0.CPSM] : psm;

+	// Until DX is fixed
+	if (s_IS_OPENGL) {
+		if(psm.pal > 0)
+			m_renderer->m_mem.m_clut.Read32(TEX0, TEXA);
+	} else {
 		GIFRegTEXA plainTEXA;

 		plainTEXA.AEM = 1;
 		plainTEXA.TA0 = 0;
 		plainTEXA.TA1 = 0x80;
 		m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
+	}

 	const uint32* clut = m_renderer->m_mem.m_clut;

@@ -85,25 +95,26 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con

 	list<Source*>& m = m_src.m_map[TEX0.TBP0 >> 5];

+
 	for(list<Source*>::iterator i = m.begin(); i != m.end(); i++)
 	{
 		Source* s = *i;

-		if(((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
-		{
+		if (((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
 			continue;
-		}

-		// Special check for palette texture (psm.pal > 0)
-		//
-		// if m_paltex is enabled
-		// 1/ s->m_palette must always be defined
-		// 2/ Clut is useless (will be uploaded again at the end of the function)
-		//
-		// if m_paltex is disabled
-		// 1/ Clut must match if m_palette is NULL
-		if(s->m_palette == NULL && psm.pal > 0 && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
-		{
+		// Targets are converted (AEM & palette) on the fly by the GPU. They don't need an extra check
+		if (!s->m_target) {
+			// We request a palette texture (psm.pal). If the texture was
+			// converted by the CPU (s->m_palette == NULL), we need to ensure
+			// palette content is the same.
+			// Note: content of the palette will be uploaded at the end of the function
+			if (psm.pal > 0 && s->m_palette == NULL && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
+				continue;
+
+			// We request a 24/16 bit RGBA texture. Alpha expansion was done by
+			// the CPU.  We need to check that TEXA is identical
+			if (psm.pal == 0 && psm.fmt > 0 && s->m_TEXA.u64 != TEXA.u64)
 				continue;
 		}

@@ -147,7 +158,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
 				uint32 t_psm = (t->m_dirty_alpha) ? t->m_TEX0.PSM & ~0x1 : t->m_TEX0.PSM;

 				if (GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t_psm)) {
-					if (!IsOpenGL() && (psm == PSM_PSMT8)) {
+					if (!s_IS_OPENGL && (psm == PSM_PSMT8)) {
 						// OpenGL can convert the texture directly in the GPU. Not sure we want to keep this
 						// code for DX. It fixes effect but it is slow (MGS3)

@@ -324,7 +335,7 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
 			//
 			// From a performance point of view, it might cost a little on big upscaling
 			// but normally few RT are miss so it must remain reasonable.
-			if (IsOpenGL()) {
+			if (s_IS_OPENGL) {
 				switch (type) {
 					case RenderTarget: m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); break;
 					case DepthStencil: m_renderer->m_dev->ClearDepth(dst->m_texture, 0); break;
@@ -863,7 +874,7 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con
 		// TODO: clean up this mess

 		int shader = dst->m_type != RenderTarget ? ShaderConvert_FLOAT32_TO_RGBA8 : ShaderConvert_COPY;
-		bool is_8bits = TEX0.PSM == PSM_PSMT8 && IsOpenGL();
+		bool is_8bits = TEX0.PSM == PSM_PSMT8 && s_IS_OPENGL;

 		if (is_8bits) {
 			GL_INS("Reading RT as a packed-indexed 8 bits format");
@@ -1417,9 +1428,14 @@ void GSTextureCache::Source::Flush(uint32 count)

 	GIFRegTEXA plainTEXA;

+	// Until DX is fixed
+	if (s_IS_OPENGL) {
+		plainTEXA = m_TEXA;
+	} else {
 		plainTEXA.AEM = 1;
 		plainTEXA.TA0 = 0;
 		plainTEXA.TA1 = 0x80;
+	}

 	if(m_palette)
 	{
--- a/plugins/GSdx/GSTextureCache.h
+++ b/plugins/GSdx/GSTextureCache.h
@@ -129,7 +129,6 @@ protected:
 #endif

 	virtual bool CanConvertDepth() { return m_can_convert_depth; }
-	virtual bool IsOpenGL() { return false; }

 public:
 	GSTextureCache(GSRenderer* r);
--- a/plugins/GSdx/GSTextureCacheOGL.h
+++ b/plugins/GSdx/GSTextureCacheOGL.h
@@ -32,8 +32,6 @@ protected:

 	void Read(Target* t, const GSVector4i& r);

-	virtual bool IsOpenGL() { return true; }
-
 public:
 	GSTextureCacheOGL(GSRenderer* r);
 };
--- a/plugins/GSdx/res/glsl/tfx_fs.glsl
+++ b/plugins/GSdx/res/glsl/tfx_fs.glsl
@@ -6,7 +6,9 @@
 #define FMT_32 0
 #define FMT_24 1
 #define FMT_16 2
-#define FMT_PAL 4 /* flag bit */
+
+#define PS_PAL_FMT (PS_TEX_FMT >> 2)
+#define PS_AEM_FMT (PS_TEX_FMT & 3)

 // APITRACE_DEBUG enables forced pixel output to easily detect
 // the fragment computed by primitive
@@ -162,14 +164,14 @@ vec4 sample_4_index(vec4 uv)

    uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value

-#if PS_IFMT == 1
-    // 4HH
-    return vec4(i >> 4u) / 255.0f;
-
-#elif PS_IFMT == 2
+#if PS_PAL_FMT == 1
 	// 4HL
    return vec4(i & 0xFu) / 255.0f;

+#elif PS_PAL_FMT == 2
+	// 4HH
+    return vec4(i >> 4u) / 255.0f;
+
 #else
    // Most of texture will hit this code so keep normalized float value

@@ -207,7 +209,7 @@ vec4 sample_color(vec2 st, float q)
    vec2 dd;

    // FIXME I'm not sure this condition is useful (I think code will be optimized)
-#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)
+#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
    // No software LTF and pure 32 bits RGBA texure without special texture wrapping
    c[0] = sample_c(st);
 #ifdef TEX_COORD_DEBUG
@@ -229,14 +231,12 @@ vec4 sample_color(vec2 st, float q)

    uv = clamp_wrap_uv(uv);

-    if((PS_FMT & FMT_PAL) != 0)
-    {
+#if PS_PAL_FMT != 0
    c = sample_4p(sample_4_index(uv));
-    }
-    else
-    {
+#else
    c = sample_4c(uv);
-    }
+#endif
+
 #ifdef TEX_COORD_DEBUG
    c[0].rg = uv.xy;
    c[1].rg = uv.xy;
@@ -246,16 +246,15 @@ vec4 sample_color(vec2 st, float q)

 #endif

-    // PERF: see the impact of the exansion before/after the interpolation
-    for (int i = 0; i < 4; i++)
-    {
 	// PERF note: using dot product reduces by 1 the number of instruction
 	// but I'm not sure it is equivalent neither faster.
+	for (int i = 0; i < 4; i++)
+	{
        //float sum = dot(c[i].rgb, vec3(1.0f));
-#if ((PS_FMT & ~FMT_PAL) == FMT_24)
+#if (PS_AEM_FMT == FMT_24)
 		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
 		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
-#elif ((PS_FMT & ~FMT_PAL) == FMT_16)
+#elif (PS_AEM_FMT == FMT_16)
 		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
 		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
 #endif
--- a/plugins/GSdx/res/glsl_source.h
+++ b/plugins/GSdx/res/glsl_source.h
@@ -910,7 +910,9 @@ static const char* tfx_fs_all_glsl =
 	"#define FMT_32 0\n"
 	"#define FMT_24 1\n"
 	"#define FMT_16 2\n"
-	"#define FMT_PAL 4 /* flag bit */\n"
+	"\n"
+	"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n"
+	"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
 	"\n"
 	"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
 	"// the fragment computed by primitive\n"
@@ -1066,14 +1068,14 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
 	"\n"
-	"#if PS_IFMT == 1\n"
-	"    // 4HH\n"
-	"    return vec4(i >> 4u) / 255.0f;\n"
-	"\n"
-	"#elif PS_IFMT == 2\n"
+	"#if PS_PAL_FMT == 1\n"
 	"	// 4HL\n"
 	"    return vec4(i & 0xFu) / 255.0f;\n"
 	"\n"
+	"#elif PS_PAL_FMT == 2\n"
+	"	// 4HH\n"
+	"    return vec4(i >> 4u) / 255.0f;\n"
+	"\n"
 	"#else\n"
 	"    // Most of texture will hit this code so keep normalized float value\n"
 	"\n"
@@ -1111,7 +1113,7 @@ static const char* tfx_fs_all_glsl =
 	"    vec2 dd;\n"
 	"\n"
 	"    // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
-	"#if (PS_LTF == 0 && PS_FMT == FMT_32 && PS_WMS < 2 && PS_WMT < 2)\n"
+	"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
 	"    // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
 	"    c[0] = sample_c(st);\n"
 	"#ifdef TEX_COORD_DEBUG\n"
@@ -1133,14 +1135,12 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"    uv = clamp_wrap_uv(uv);\n"
 	"\n"
-	"    if((PS_FMT & FMT_PAL) != 0)\n"
-	"    {\n"
+	"#if PS_PAL_FMT != 0\n"
 	"    c = sample_4p(sample_4_index(uv));\n"
-	"    }\n"
-	"    else\n"
-	"    {\n"
+	"#else\n"
 	"    c = sample_4c(uv);\n"
-	"    }\n"
+	"#endif\n"
+	"\n"
 	"#ifdef TEX_COORD_DEBUG\n"
 	"    c[0].rg = uv.xy;\n"
 	"    c[1].rg = uv.xy;\n"
@@ -1150,16 +1150,15 @@ static const char* tfx_fs_all_glsl =
 	"\n"
 	"#endif\n"
 	"\n"
-	"    // PERF: see the impact of the exansion before/after the interpolation\n"
-	"    for (int i = 0; i < 4; i++)\n"
-	"    {\n"
 	"	// PERF note: using dot product reduces by 1 the number of instruction\n"
 	"	// but I'm not sure it is equivalent neither faster.\n"
+	"	for (int i = 0; i < 4; i++)\n"
+	"	{\n"
 	"        //float sum = dot(c[i].rgb, vec3(1.0f));\n"
-	"#if ((PS_FMT & ~FMT_PAL) == FMT_24)\n"
+	"#if (PS_AEM_FMT == FMT_24)\n"
 	"		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
 	"		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
-	"#elif ((PS_FMT & ~FMT_PAL) == FMT_16)\n"
+	"#elif (PS_AEM_FMT == FMT_16)\n"
 	"		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
 	"		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
 	"#endif\n"