Modified for opencl 1.1. While it runs on nvidia cards now, you can't use its sdk to compile gsdx, cl.hpp is missing there. Intel or amd is ok.

2014-09-19 06:53:05 +02:00 · 2014-09-19 06:53:05 +02:00 · 9e20387595
parent 263c097d13
commit 9e20387595
5 changed files with 88 additions and 82 deletions
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
@ -328,8 +328,8 @@ void GSRendererCL::Draw()

 				std::vector<cl::Event> el(1);

-				m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]);
-				m_cl.wq->enqueueBarrierWithWaitList(&el, NULL);
+				m_cl.queue[2].enqueueMarker(&el[0]);
+				m_cl.wq->enqueueWaitForEvents(el);

 				// switch to the other queue/buffer (double buffering)

@ -404,15 +404,7 @@ void GSRendererCL::Draw()
 		job->pb_start = m_cl.pb.tail;

 #ifdef DEBUG
-		job->fbp = context->FRAME.Block();
-		job->fbw = context->FRAME.FBW;
-		job->fpsm = context->FRAME.PSM;
-		job->zbp = context->ZBUF.Block();
-		job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff;
-		job->tbw = PRIM->TME ? context->TEX0.TBW : 1;
-		job->tpsm = PRIM->TME ? context->TEX0.PSM : 0;
-		job->tw = PRIM->TME ? context->TEX0.TW : 0;
-		job->th = PRIM->TME ? context->TEX0.TH : 0;
+		job->param = pb;
 #endif

 		m_jobs.push_back(job);
@ -687,10 +679,10 @@ void GSRendererCL::Enqueue()

 		m_cl.Unmap();

-		std::vector<cl::Event> el2(1);
+		std::vector<cl::Event> el(1);

-		m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]);
-		m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL);
+		m_cl.wq->enqueueMarker(&el[0]);
+		m_cl.queue[2].enqueueWaitForEvents(el);

 		//

@ -813,6 +805,9 @@ void GSRendererCL::Enqueue()
 					{
 						ASSERT(prim_start < MAX_PRIM_COUNT);

+						// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same 
+						// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim
+
 						tfxcount++;

 						//if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); }
@ -1574,7 +1569,8 @@ GSRendererCL::CL::CL()
 			case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
 			}

-			if(strstr(version.c_str(), "OpenCL C 1.2") != NULL)
+			if(strstr(version.c_str(), "OpenCL C 1.1") != NULL
+			|| strstr(version.c_str(), "OpenCL C 1.2") != NULL)
 			{
 #ifdef IOCL_DEBUG
 				if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
@ -1646,7 +1642,7 @@ void GSRendererCL::CL::Map()

 	if(vb.head < vb.size)
 	{
-		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head);
+		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
 		vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
 		ASSERT(((size_t)vb.ptr & 15) == 0);
 		ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
@ -1654,14 +1650,13 @@ void GSRendererCL::CL::Map()

 	if(ib.head < ib.size)
 	{
-		ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head);
+		ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head);
 		ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
-		ASSERT(((size_t)ib.ptr & 15) == 0);
 	}

 	if(pb.head < pb.size)
 	{
-		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head);
+		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
 		pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
 		ASSERT(((size_t)pb.ptr & 15) == 0);
 		ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
@ -1681,6 +1676,7 @@ void GSRendererCL::CL::Unmap()

 static void AddDefs(ostringstream& opt)
 {
+	opt << "-cl-std=CL1.1 ";
 	opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
 	opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
 	opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
--- a/plugins/GSdx/GSRendererCL.h
+++ b/plugins/GSdx/GSRendererCL.h
@ -172,7 +172,7 @@ class GSRendererCL : public GSRenderer
 		GSVector4i* src_pages; // read by any texture level
 		GSVector4i* dst_pages; // f/z writes to it
 #ifdef DEBUG
-		uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th;
+		TFXParameter* param;
 #endif
 		TFXJob()
 			: src_pages(NULL)
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@ -1,4 +1,4 @@
-#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency
+#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency

 #ifndef CL_FLT_EPSILON
 #define CL_FLT_EPSILON 1.1920928955078125e-7
@ -1133,6 +1133,66 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p
 	return convert_int4(c);
 }

+int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
+{
+	int4 c;
+
+	if(0)//if(MMIN)
+	{
+		// TODO
+	}
+	else
+	{
+		int2 uv;
+
+		if(!FST)
+		{
+			uv = convert_int2_rte(t.xy * native_recip(t.z));
+
+			if(LTF) uv -= 0x0008;
+		}
+		else
+		{
+			// sfex capcom logo third drawing call at (0,223) calculated as:
+			// t0 + (p - p0) * (t - t0) / (p1 - p0)  
+			// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
+			// due to rounding errors (multiply-add instruction maybe):
+			// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
+			// TODO: sw renderer samples at 112 exactly, check which one is correct
+
+			// last line error in persona 3 movie clips if rounding is enabled
+
+			uv = convert_int2(t.xy); 
+		}
+
+		int2 uvf = uv & 0x000f;
+
+		int2 uv0 = uv >> 4;
+		int2 uv1 = uv0 + 1;
+
+		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
+		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
+		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
+		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
+					
+		int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
+		int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
+		int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
+		int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
+
+		if(LTF)
+		{
+			c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
+			c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
+			c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
+		}
+
+		c = c00;
+	}
+
+	return c;
+}
+
 // TODO: 2x2 MSAA idea
 // downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
 // hittest and ztest 2x2 (create write mask, only skip if all -1) 
@ -1249,8 +1309,6 @@ __kernel void KERNEL_TFX(

 	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);

-	__local gs_prim prim_cache;
-
 	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
 	{
 		while(bin_value != 0)
@ -1267,7 +1325,7 @@ __kernel void KERNEL_TFX(
 			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);

 			uint2 zf;
-			float4 t;
+			float3 t;
 			int4 c;

 			 // TODO: do not hittest if we know the tile is fully inside the prim
@ -1282,7 +1340,7 @@ __kernel void KERNEL_TFX(
 				}

 				zf = as_uint2(prim->v[0].p.zw);
-				t.xyz = prim->v[0].tc.xyz;
+				t = prim->v[0].tc.xyz;
 				c = convert_int4(prim->v[0].c);
 			}
 			else if(PRIM == GS_LINE_CLASS)
@ -1313,7 +1371,7 @@ __kernel void KERNEL_TFX(
 				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
 				zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);

-				t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
+				t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;

 				if(IIP)
 				{
@ -1361,62 +1419,9 @@ __kernel void KERNEL_TFX(

 			if(TFX != TFX_NONE)
 			{
-				// TODO
+				tex = vm; // TODO: use the texture cache

-				if(0)//if(MMIN)
-				{
-					// TODO
-				}
-				else
-				{
-					int2 uv;
-
-					if(!FST)
-					{
-						uv = convert_int2_rte(t.xy * native_recip(t.z));
-
-						if(LTF) uv -= 0x0008;
-					}
-					else
-					{
-						// sfex capcom logo third drawing call at (0,223) calculated as:
-						// t0 + (p - p0) * (t - t0) / (p1 - p0)  
-						// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
-						// due to rounding errors (multiply-add instruction maybe):
-						// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
-						// TODO: sw renderer samples at 112 exactly, check which one is correct
-
-						// last line error in persona 3 movie clips if rounding is enabled
-
-						uv = convert_int2(t.xy); 
-					}
-
-					int2 uvf = uv & 0x000f;
-
-					int2 uv0 = uv >> 4;
-					int2 uv1 = uv0 + 1;
-
-					uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
-					uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
-					uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
-					uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
-					
-					tex = vm; // TODO: use the texture cache
-
-					int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
-					int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
-					int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
-					int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
-
-					if(LTF)
-					{
-						c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
-						c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
-						c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
-					}
-
-					ct = c00;
-				}
+				ct = SampleTexture(tex, pb, t);
 			}

 			// alpha tfx
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@ -43,6 +43,11 @@
 #include <d3dx9.h>
 #include <comutil.h>
 #include "../../common/include/comptr.h"
+
+#include <CL/cl.h>
+#undef CL_VERSION_1_2
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define __CL_ENABLE_EXCEPTIONS
 #include <CL/cl.hpp>

 #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
--- a/plugins/GSdx/vsprops/common.props
+++ b/plugins/GSdx/vsprops/common.props
@ -8,7 +8,7 @@
  <ItemDefinitionGroup>
    <ClCompile>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <FloatingPointModel>Fast</FloatingPointModel>
      <RuntimeTypeInfo>false</RuntimeTypeInfo>
      <WarningLevel>Level4</WarningLevel>