gsdx: date/datm fix for 16-bit frame buffer in sw rendering mode

2015-08-02 20:21:49 +02:00 · 2015-08-02 20:21:49 +02:00 · a1a842b07f
parent 1f402b1b56
commit a1a842b07f
11 changed files with 599 additions and 289 deletions
--- a/plugins/GSdx/GPURenderer.cpp
+++ b/plugins/GSdx/GPURenderer.cpp
@ -113,13 +113,13 @@ bool GPURenderer::Merge()

 	GSVector2i s = st[0]->GetSize();

-	GSVector4 sRect[2];
-	GSVector4 dRect[2];
+	GSVector4 sr[2];
+	GSVector4 dr[2];

-	sRect[0] = GSVector4(0, 0, 1, 1);
-	dRect[0] = GSVector4(0, 0, s.x, s.y);
+	sr[0] = GSVector4(0, 0, 1, 1);
+	dr[0] = GSVector4(0, 0, s.x, s.y);

-	m_dev->Merge(st, sRect, dRect, s, 1, 1, GSVector4(0, 0, 0, 1));
+	m_dev->Merge(st, sr, dr, s, 1, 1, GSVector4(0, 0, 0, 1));

 	if(m_shadeboost)
 	{
--- a/plugins/GSdx/GSDrawScanline.cpp
+++ b/plugins/GSdx/GSDrawScanline.cpp
@ -1148,7 +1148,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.srl32(15) == GSVector8i::zero();
+						// test |= fd.srl32(15) == GSVector8i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector8i::zero();
 					}
 					else
 					{
@ -1159,7 +1160,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.sll32(16).sra32(31);
+						test |= fd.sll32(16).sra32(31); // == GSVector8i::xffffffff();
 					}
 					else
 					{
@ -2264,7 +2265,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.srl32(15) == GSVector4i::zero();
+						// test |= fd.srl32(15) == GSVector4i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector4i::zero();
 					}
 					else
 					{
@ -2275,7 +2277,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.sll32(16).sra32(31);
+						test |= fd.sll32(16).sra32(31); // == GSVector4i::xffffffff();
 					}
 					else
 					{
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
@ -1326,7 +1326,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(xmm0, xmm0);
-			vpsrld(xmm1, xmm6, 15);
+			//vpsrld(xmm1, xmm6, 15);
+			vpslld(xmm1, xmm6, 16);
+			vpsrad(xmm1, 31);
 			vpcmpeqd(xmm1, xmm0);
 		}
 		else
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
@ -2330,7 +2330,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(xmm0, xmm0);
-			vpsrld(xmm1, xmm2, 15);
+			//vpsrld(xmm1, xmm2, 15);
+			vpslld(xmm1, xmm2, 16);
+			vpsrad(xmm1, 31);
 			vpcmpeqd(xmm1, xmm0);
 		}
 		else
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
@ -2295,7 +2295,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(ymm0, ymm0);
-			vpsrld(ymm1, ymm2, 15);
+			//vpsrld(ymm1, ymm2, 15);
+			vpslld(ymm1, ymm2, 16);
+			vpsrad(ymm1, 31);
 			vpcmpeqd(ymm1, ymm0);
 		}
 		else
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
--- a/plugins/GSdx/GSRendererCL.h
+++ b/plugins/GSdx/GSRendererCL.h
@ -108,6 +108,7 @@ class GSRendererCL : public GSRenderer
 			uint32 noscissor:1; // 53
 			uint32 tpsm:4; // 54
 			uint32 aem:1; // 58
+			uint32 merged:1; // 59
 			// TODO
 		};

@ -148,6 +149,7 @@ class GSRendererCL : public GSRenderer
 	{
 		GSVector4i scissor;
 		GSVector4i dimx; // 4x4 signed char
+		TFXSelector sel;
 		uint32 fbp, zbp, bw;
 		uint32 fm, zm;
 		uint32 fog; // rgb
@ -172,6 +174,7 @@ class GSRendererCL : public GSRenderer
 		GSVector4i* src_pages; // read by any texture level
 		GSVector4i* dst_pages; // f/z writes to it
 		uint32 fbp, zbp, bw;
+		uint32 fpsm, zpsm, tpsm;
 #ifdef DEBUG
 		TFXParameter* pb;
 #endif
@ -227,16 +230,23 @@ class GSRendererCL : public GSRenderer

 	void Enqueue();
 	void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
-	void UpdateTextureCache(TFXJob* job);
+	void JoinTFX(std::list<shared_ptr<TFXJob>>& jobs);
+	bool UpdateTextureCache(TFXJob* job);
 	void InvalidateTextureCache(TFXJob* job);
+	void UsePages(uint32* pages);
+	void ReleasePages(uint32* pages);
+
+	static void CL_CALLBACK ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data);

 protected:
 	GSTexture* m_texture[2];
 	uint8* m_output;
 	
 	GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
-	GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?)
-	GSVector4i m_tmp_pages[4]; // TODO: this should be block level, too many overlaps inside pages with render targets
+	GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) // TODO: this should be block level, too many overlaps inside pages with render targets
+	GSVector4i m_tmp_pages[4];
+	uint32 m_tmp_pages2[MAX_PAGES + 1];
+	uint32 m_rw_pages_rendering[512]; // pages that are currently in-use

 	void Reset();
 	void VSync(int field);
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@ -66,6 +66,9 @@ GSState::GSState()
 	//s_dump = 1;
 	//s_save = 1;
 	//s_savez = 1;
+	//s_savet = 1;
+	//s_savef = 1;
+	//s_saven = 656;

 	UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;
 	m_crc_hack_level = theApp.GetConfig("crc_hack_level", 3);
--- a/plugins/GSdx/GSUtil.cpp
+++ b/plugins/GSdx/GSUtil.cpp
@ -224,7 +224,7 @@ bool GSUtil::CheckSSE()
 	return true;
 }

-#define OCL_PROGRAM_VERSION 1
+#define OCL_PROGRAM_VERSION 3

 #ifdef ENABLE_OPENCL
 void GSUtil::GetDeviceDescs(list<OCLDeviceDesc>& dl)
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@ -1,3 +1,9 @@
+#if defined(CL_VERSION_2_0)
+
+#error hello
+
+#endif
+
 #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency

 #ifdef cl_amd_printf
@ -26,6 +32,13 @@
 	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
 #endif

+#define TFX_ABA(sel) ((sel.x >> 24) & 3)
+#define TFX_ABB(sel) ((sel.x >> 26) & 3)
+#define TFX_ABC(sel) ((sel.x >> 28) & 3)
+#define TFX_ABD(sel) ((sel.x >> 30) & 3)
+#define TFX_WMS(sel) ((sel.y >>  8) & 3)
+#define TFX_WMT(sel) ((sel.y >> 10) & 3)
+
 typedef struct
 {
 	union {float4 p; struct {float x, y; uint z, f;};};
@ -35,9 +48,9 @@ typedef struct
 typedef struct
 {
 	gs_vertex v[3];
-	uint zmin;
+	uint zmin, zmax;
 	uint pb_index;
-	uint _pad[2];
+	uint _pad;
 } gs_prim;

 typedef struct
@ -60,6 +73,7 @@ typedef struct
 {
 	int4 scissor;
 	char dimx[4][4];
+	uint2 sel;
 	int fbp, zbp, bw;
 	uint fm, zm;
 	uchar4 fog; // rgb
@ -110,10 +124,10 @@ enum GS_TFX

 enum GS_CLAMP
 {
-	CLAMP_REPEAT		= 0,
-	CLAMP_CLAMP			= 1,
-	CLAMP_REGION_CLAMP	= 2,
-	CLAMP_REGION_REPEAT	= 3,
+	CLAMP_REGION_REPEAT	= 0,
+	CLAMP_REPEAT		= 1,
+	CLAMP_CLAMP			= 2,
+	CLAMP_REGION_CLAMP	= 3,
 };

 enum GS_ZTST
@ -603,6 +617,7 @@ __kernel void KERNEL_PRIM(
 		// only interpolate the relative to zmin and hopefully small values

 		uint zmin = min(min(v0->z, v1->z), v2->z);
+		uint zmax = max(max(v0->z, v1->z), v2->z);
 		
 		prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
 		prim->v[0].tc = v0->tc;
@ -612,6 +627,7 @@ __kernel void KERNEL_PRIM(
 		prim->v[2].tc = v2->tc;

 		prim->zmin = zmin;
+		prim->zmax = zmax;

 		float4 dp0 = v1->p - v0->p;
 		float4 dp1 = v0->p - v2->p;
@ -969,27 +985,34 @@ bool DestAlphaTest(uint fd)

 int Wrap(int a, int b, int c, int mode)
 {
-	switch(mode)
+	if(MERGED)
 	{
-	case CLAMP_REPEAT:
-		return a & b;
-	case CLAMP_CLAMP:
-		return clamp(a, 0, c);
-	case CLAMP_REGION_CLAMP:
-		return clamp(a, b, c);
-	case CLAMP_REGION_REPEAT:
-		return (a & b) | c;
+		return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0);
+	}
+	else
+	{
+		switch(mode)
+		{
+		case CLAMP_REGION_REPEAT:
+			return (a & b) | c;
+		case CLAMP_REPEAT:
+			return a & b;
+		case CLAMP_CLAMP:
+			return clamp(a, 0, c);
+		case CLAMP_REGION_CLAMP:
+			return clamp(a, b, c);
+		}
 	}
 }

-int4 AlphaBlend(int4 c, int afix, uint fd)
+int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel)
 {
 	if(FWRITE && (ABE || AA1))
 	{
 		int4 cs = c;
 		int4 cd;

-		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1)
+		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED)
 		{
 			if(is32bit(FPSM) || is24bit(FPSM))
 			{
@ -1007,50 +1030,69 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
 			}
 		}

-		if(ABA != ABB)
+		if(MERGED)
 		{
-			switch(ABA)
-			{
-			case 0: break; // c.xyz = cs.xyz;
-			case 1: c.xyz = cd.xyz; break;
-			case 2: c.xyz = 0; break;
-			}
+			int aba = TFX_ABA(sel);
+			int abb = TFX_ABB(sel);
+			int abc = TFX_ABC(sel);
+			int abd = TFX_ABD(sel);

-			switch(ABB)
-			{
-			case 0: c.xyz -= cs.xyz; break;
-			case 1: c.xyz -= cd.xyz; break;
-			case 2: break;
-			}
+			int ad = !is24bit(FPSM) ? cd.w : 0x80;

-			if(!(is24bit(FPSM) && ABC == 1))
-			{
-				int a = 0;
+			int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0;
+			int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0;
+			int C = abc == 0 ? cs.w : abc == 1 ? ad : afix;
+			int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0;

-				switch(ABC)
-				{
-				case 0: a = cs.w; break;
-				case 1: a = cd.w; break;
-				case 2: a = afix; break;
-				}
-
-				c.xyz = c.xyz * a >> 7;
-			}
-
-			switch(ABD)
-			{
-			case 0: c.xyz += cs.xyz; break;
-			case 1: c.xyz += cd.xyz; break;
-			case 2: break;
-			}
+			c.xyz = (mul24(A - B, C) >> 7) + D;
 		}
 		else
 		{
-			switch(ABD)
+			if(ABA != ABB)
 			{
-			case 0: break;
-			case 1: c.xyz = cd.xyz; break;
-			case 2: c.xyz = 0; break;
+				switch(ABA)
+				{
+				case 0: break; // c.xyz = cs.xyz;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
+
+				switch(ABB)
+				{
+				case 0: c.xyz -= cs.xyz; break;
+				case 1: c.xyz -= cd.xyz; break;
+				case 2: break;
+				}
+
+				if(!(is24bit(FPSM) && ABC == 1))
+				{
+					int a = 0;
+
+					switch(ABC)
+					{
+					case 0: a = cs.w; break;
+					case 1: a = cd.w; break;
+					case 2: a = afix; break;
+					}
+
+					c.xyz = c.xyz * a >> 7;
+				}
+
+				switch(ABD)
+				{
+				case 0: c.xyz += cs.xyz; break;
+				case 1: c.xyz += cd.xyz; break;
+				case 2: break;
+				}
+			}
+			else
+			{
+				switch(ABD)
+				{
+				case 0: break;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
 			}
 		}

@ -1150,8 +1192,6 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
 		if(!FST)
 		{
 			uv = convert_int2_rte(t.xy * native_recip(t.z));
-
-			if(LTF) uv -= 0x0008;
 		}
 		else
 		{
@ -1167,15 +1207,17 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
 			uv = convert_int2(t.xy); 
 		}

+		if(LTF) uv -= 0x0008;
+
 		int2 uvf = uv & 0x000f;

 		int2 uv0 = uv >> 4;
 		int2 uv1 = uv0 + 1;

-		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
-		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
-		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
-		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
+		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
+		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);

 		int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
 		int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
@ -1356,6 +1398,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 			{
 				// TODO: aa1: draw edge as a line

+				if(!ZTest(prim->zmax, zd))
+				{
+					continue;
+				}
+
 				__global gs_barycentric* b = &barycentric[prim_index + i];

 				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
@ -1420,8 +1467,6 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(

 			if(TFX != TFX_NONE)
 			{
-				tex = vm; // TODO: use the texture cache
-
 				ct = SampleTexture(tex, pb, t);
 			}

@ -1515,7 +1560,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(

 			// alpha blend

-			c = AlphaBlend(c, pb->afix, fd);
+			c = AlphaBlend(c, fd, pb->afix, pb->sel);

 			// write frame

--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@ -123,8 +123,15 @@ using namespace std;
 	#include <GL/wglext.h>
 	#include "GLLoader.h"

+	#if _MSC_VER >= 1800 
+	#include <unordered_map>
+	#include <unordered_set>
+	#define hash_map unordered_map
+	#define hash_set unordered_set
+	#else
 	#include <hash_map>
 	#include <hash_set>
+	#endif

 	using namespace stdext;