From db7c26cde7134af84f4cd72d919c1397d91800bd Mon Sep 17 00:00:00 2001
From: gabest11 <gabest11@gmail.com>
Date: Mon, 15 Sep 2014 15:49:16 +0200
Subject: [PATCH] - Experimental OpenCL renderer (missing features: point,
 line, texture cache, mipmap, aa1, device selection). Needs any OpenCL SDK for
 the common headers and stub lib to compile, tested with AMD and Intel. Too
 bad it is not part of the Windows SDK yet. - Renumbered renderer ids,
 compatible with old numbering, but it does not follow the mod3 logic anymore.

---
 plugins/GSdx/GPURenderer.h               |    4 +-
 plugins/GSdx/GS.cpp                      |  162 +-
 plugins/GSdx/GSDevice9.h                 |    1 -
 plugins/GSdx/GSLocalMemory.cpp           |  134 +-
 plugins/GSdx/GSLocalMemory.h             |    2 +-
 plugins/GSdx/GSRendererCL.cpp            | 1780 ++++++++++++++++++++++
 plugins/GSdx/GSRendererCL.h              |  310 ++++
 plugins/GSdx/GSRendererSW.cpp            |   13 +-
 plugins/GSdx/GSSettingsDlg.cpp           |   13 +-
 plugins/GSdx/GSState.cpp                 |    9 +-
 plugins/GSdx/GSdx.cpp                    |   31 +-
 plugins/GSdx/GSdx.h                      |    3 +
 plugins/GSdx/GSdx.rc                     |   17 +-
 plugins/GSdx/GSdx_vs2013.vcxproj         |    3 +
 plugins/GSdx/GSdx_vs2013.vcxproj.filters |   11 +-
 plugins/GSdx/res/cs.fx                   |    4 +
 plugins/GSdx/res/tfx.cl                  | 1619 ++++++++++++++++++++
 plugins/GSdx/stdafx.h                    |    1 +
 plugins/GSdx/vsprops/common.props        |    8 +-
 plugins/GSdx/vsprops/x64.props           |    2 +-
 plugins/GSdx/vsprops/x86.props           |    2 +-
 21 files changed, 3947 insertions(+), 182 deletions(-)
 create mode 100644 plugins/GSdx/GSRendererCL.cpp
 create mode 100644 plugins/GSdx/GSRendererCL.h
 create mode 100644 plugins/GSdx/res/tfx.cl
diff --git a/plugins/GSdx/GPURenderer.h b/plugins/GSdx/GPURenderer.h
index 1a7b505ced..16568a0da2 100644
--- a/plugins/GSdx/GPURenderer.h
+++ b/plugins/GSdx/GPURenderer.h
@@ -123,13 +123,13 @@ protected:
 		int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
 		Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);
 
-		if (!vertices)
+		if(vertices == NULL)
 		{
 			printf("GSdx: failed to allocate %d bytes for verticles.\n", sizeof(Vertex) * maxcount);
 			throw GSDXError();
 		}
 
-		if (m_vertices != NULL)
+		if(m_vertices != NULL)
 		{
 			memcpy(vertices, m_vertices, sizeof(Vertex) * m_maxcount);
 			_aligned_free(m_vertices);
diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp
index 2f23f0903f..5d91813659 100644
--- a/plugins/GSdx/GS.cpp
+++ b/plugins/GSdx/GS.cpp
@@ -37,6 +37,7 @@
 #include "GSWndDX.h"
 #include "GSWndWGL.h"
 #include "GSRendererCS.h"
+#include "GSRendererCL.h"
 #include "GSSettingsDlg.h"
 
 static HRESULT s_hr = E_FAIL;
@@ -203,6 +204,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
 	}
 
 	GSWnd* wnd[2];
+
 	try
 	{
 		if(s_renderer != renderer)
@@ -216,78 +218,72 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
 			s_gs = NULL;
 		}
 
-		if(renderer == 15)
+		switch(renderer)
 		{
-			#ifdef _WINDOWS
-	
-			dev = new GSDevice11();
-
-			if(dev == NULL)
-			{
-				return -1;
-			}
-
-			delete s_gs;
-
-			s_gs = new GSRendererCS();
-
-			s_renderer = renderer;
-
-			#endif
+		default:
+#ifdef _WINDOWS
+		case 0: case 1: case 2: case 14:
+			dev = new GSDevice9(); 
+			break;
+		case 3: case 4: case 5: case 15:
+			dev = new GSDevice11(); 
+			break;
+#endif
+		case 9: case 10: case 11: case 16:
+			dev = new GSDeviceNull(); 
+			break;
+		case 12: case 13: case 17:
+			dev = new GSDeviceOGL(); 
+			break;
 		}
-		else
+
+		if(dev == NULL)
 		{
-			switch(renderer / 3)
+			return -1;
+		}
+
+		if(s_gs == NULL)
+		{
+			switch(renderer)
 			{
 			default:
-				#ifdef _WINDOWS
-				case 0: dev = new GSDevice9(); break;
-				case 1: dev = new GSDevice11(); break;
-				#endif
-				case 3: dev = new GSDeviceNull(); break;
-				case 4: dev = new GSDeviceOGL(); break;
-			}
-
-			if(dev == NULL)
-			{
-				return -1;
-			}
-
-			if(s_gs == NULL)
-			{
-				switch(renderer % 3)
-				{
-					default:
-					case 0:
-						switch(renderer) 
-						{ 
-							default:
 #ifdef _WINDOWS
-							case 0: s_gs = (GSRenderer*)new GSRendererDX9(); break;
-							case 3: s_gs = (GSRenderer*)new GSRendererDX11(); break;
+			case 0:
+				s_gs = (GSRenderer*)new GSRendererDX9();
+				break;
+			case 3: 
+				s_gs = (GSRenderer*)new GSRendererDX11(); 
+				break;
 #endif
-							case 12: s_gs = (GSRenderer*)new GSRendererOGL(); break;
-						}
-						break;
-					case 1:
-						s_gs = new GSRendererSW(threads);
-						break;
-					case 2:
-						s_gs = new GSRendererNull();
-						break;
-				}
-
-				s_renderer = renderer;
+			case 12: 
+				s_gs = (GSRenderer*)new GSRendererOGL(); 
+				break;
+			case 1: case 4: case 10: case 13:
+				s_gs = new GSRendererSW(threads);
+				break;
+			case 2: case 5: case 11:
+				s_gs = new GSRendererNull();
+				break;
+			case 14: case 15: case 16: case 17:
+				s_gs = new GSRendererCL();
+				break;
 			}
+
+			s_renderer = renderer;
 		}
 
 		if (s_gs->m_wnd == NULL)
 		{
 #ifdef _WINDOWS
-			if (renderer / 3 == 4)
+			switch(renderer)
+			{
+			case 12: case 13: case 17:
 				s_gs->m_wnd = new GSWndWGL();
-			else
+				break;
+			default:
 				s_gs->m_wnd = new GSWndDX();
+				break;
+			}
 #else
 #ifdef ENABLE_GLES
 			wnd[0] = NULL;
@@ -681,8 +677,10 @@ EXPORT_C GSkeyEvent(GSKeyEventData* e)
 {
 	try
 	{
-		if (gsopen_done)
+		if(gsopen_done)
+		{
 			s_gs->KeyEvent(e);
+		}
 	}
 	catch (GSDXRecoverableError)
 	{
@@ -1218,15 +1216,11 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 {
 	::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS);
 
-	FILE* file = fopen("c:\\temp1\\log.txt", "a");
-
-	fprintf(file, "-------------------------\n\n");
+	Console console("GSdx", true);
 
 	if(1)
 	{
-		GSLocalMemory * pMem = new GSLocalMemory();
-		GSLocalMemory& mem(*pMem);
-		
+		GSLocalMemory* mem = new GSLocalMemory();		
 
 		static struct {int psm; const char* name;} s_format[] =
 		{
@@ -1258,7 +1252,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 			int w = 1 << tbw;
 			int h = 1 << tbw;
 
-			fprintf(file, "%d x %d\n\n", w, h);
+			printf("%d x %d\n\n", w, h);
 
 			for(size_t i = 0; i < countof(s_format); i++)
 			{
@@ -1308,7 +1302,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 
 				clock_t start, end;
 
-				_ftprintf(file, _T("[%4s] "), s_format[i].name);
+				printf("[%4s] ", s_format[i].name);
 
 				start = clock();
 
@@ -1317,12 +1311,12 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 					int x = 0;
 					int y = 0;
 
-					(mem.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+					(mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
 				start = clock();
 
@@ -1331,25 +1325,25 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 					int x = 0;
 					int y = 0;
 
-					(mem.*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+					(mem->*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
-				const GSOffset* o = mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+				const GSOffset* o = mem->GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
 
 				start = clock();
 
 				for(int j = 0; j < n; j++)
 				{
-					(mem.*rtx)(o, r, ptr, w * 4, TEXA);
+					(mem->*rtx)(o, r, ptr, w * 4, TEXA);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
 				if(psm.pal > 0)
 				{
@@ -1357,32 +1351,30 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 
 					for(int j = 0; j < n; j++)
 					{
-						(mem.*rtxP)(o, r, ptr, w, TEXA);
+						(mem->*rtxP)(o, r, ptr, w, TEXA);
 					}
 
 					end = clock();
 
-					fprintf(file, "| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+					printf("| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 				}
 
-				fprintf(file, "\n");
-
-				fflush(file);
+				printf("\n");
 			}
 
-			fprintf(file, "\n");
+			printf("\n");
 		}
 
 		_aligned_free(ptr);
-		delete pMem;
+
+		delete mem;
 	}
 
 	//
 
 	if(0)
 	{
-		GSLocalMemory * pMem2 = new GSLocalMemory();
-		GSLocalMemory& mem2(*pMem2);
+		GSLocalMemory* mem = new GSLocalMemory();
 
 		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
 
@@ -1413,13 +1405,13 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 		int x = 0;
 		int y = 0;
 
-		(mem2.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
-		delete pMem2;
+		(mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+
+		delete mem;
 	}
 
 	//
 
-	fclose(file);
 	PostQuitMessage(0);
 }
 
diff --git a/plugins/GSdx/GSDevice9.h b/plugins/GSdx/GSDevice9.h
index fd32304c70..e7cb160135 100644
--- a/plugins/GSdx/GSDevice9.h
+++ b/plugins/GSdx/GSDevice9.h
@@ -173,7 +173,6 @@ public: // TODO
 	// Shaders...
 
 	hash_map<uint32, GSVertexShader9 > m_vs;
-	D3DXHANDLE m_vs_params;
 	hash_map<uint32, CComPtr<IDirect3DPixelShader9> > m_ps;
 	hash_map<uint32, Direct3DSamplerState9* > m_ps_ss;
 	hash_map<uint32, Direct3DDepthStencilState9* > m_om_dss;
diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp
index 10cb2433f5..757463d134 100644
--- a/plugins/GSdx/GSLocalMemory.cpp
+++ b/plugins/GSdx/GSLocalMemory.cpp
@@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@@ -801,27 +801,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			{
 			case PSM_PSMCT32:
 			case PSM_PSMZ32:
-				ReadColumn32(y, dst, buff, 32);
+				GSBlock::ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 4], 32);
-				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
 			case PSM_PSMZ16:
 			case PSM_PSMZ16S:
-				ReadColumn16(y, dst, buff, 32);
+				GSBlock::ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 2], 32);
-				WriteColumn16<32>(y, dst, buff, 32);
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
-				ReadColumn8(y, dst, buff, 16);
+				GSBlock::ReadColumn8(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
-				ReadColumn4(y, dst, buff, 16);
+				GSBlock::ReadColumn4(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@@ -888,27 +888,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			{
 			case PSM_PSMCT32:
 			case PSM_PSMZ32:
-				ReadColumn32(y, dst, buff, 32);
+				GSBlock::ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 4], 32);
-				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
 			case PSM_PSMZ16:
 			case PSM_PSMZ16S:
-				ReadColumn16(y, dst, buff, 32);
+				GSBlock::ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 2], 32);
-				WriteColumn16<32>(y, dst, buff, 32);
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
-				ReadColumn8(y, dst, buff, 16);
+				GSBlock::ReadColumn8(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
-				ReadColumn4(y, dst, buff, 16);
+				GSBlock::ReadColumn4(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@@ -1060,7 +1060,7 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, const uint8* src, int len, GI
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1094,7 +1094,7 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, const uint8* src, int len, GI
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1128,7 +1128,7 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1162,7 +1162,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1196,7 +1196,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw));
 			}
 		}
 
@@ -1612,7 +1612,7 @@ void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadBlock32(src, dst, dstpitch);
+		GSBlock::ReadBlock32(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1623,7 +1623,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 8, 8, 32)
 		{
-			ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1631,7 +1631,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 8, 8, 32)
 		{
-			ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1643,7 +1643,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 16, 8, 32)
 		{
-			ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1651,7 +1651,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 16, 8, 32)
 		{
-			ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1663,7 +1663,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r
 
 	FOREACH_BLOCK_START(r, 16, 16, 32)
 	{
-		ReadAndExpandBlock8_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock8_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1674,7 +1674,7 @@ void GSLocalMemory::ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r
 
 	FOREACH_BLOCK_START(r, 32, 16, 32)
 	{
-		ReadAndExpandBlock4_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1685,7 +1685,7 @@ void GSLocalMemory::ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock8H_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock8H_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1696,7 +1696,7 @@ void GSLocalMemory::ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1707,7 +1707,7 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1718,7 +1718,7 @@ void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, cons
 {
 	ALIGN_STACK(32);
 
-	ReadBlock32(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock32(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@@ -1727,11 +1727,11 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
 
 	if(TEXA.AEM)
 	{
-		ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 	else
 	{
-		ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 }
 
@@ -1741,11 +1741,11 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons
 
 	if(TEXA.AEM)
 	{
-		ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 	else
 	{
-		ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 }
 
@@ -1753,35 +1753,35 @@ void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 ///////////////////
@@ -1870,7 +1870,7 @@ void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 16, 16, 8)
 	{
-		ReadBlock8(src, dst, dstpitch);
+		GSBlock::ReadBlock8(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1879,7 +1879,7 @@ void GSLocalMemory::ReadTexture4P(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 32, 16, 8)
 	{
-		ReadBlock4P(src, dst, dstpitch);
+		GSBlock::ReadBlock4P(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1888,7 +1888,7 @@ void GSLocalMemory::ReadTexture8HP(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock8HP(src, dst, dstpitch);
+		GSBlock::ReadBlock8HP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1897,7 +1897,7 @@ void GSLocalMemory::ReadTexture4HLP(const GSOffset* RESTRICT o, const GSVector4i
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock4HLP(src, dst, dstpitch);
+		GSBlock::ReadBlock4HLP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1906,7 +1906,7 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock4HHP(src, dst, dstpitch);
+		GSBlock::ReadBlock4HHP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1915,35 +1915,35 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i
 
 void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ReadBlock8(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock8(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4P(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4P(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
 }
 
 //
diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h
index 67f2cf0c27..ea83bfd53b 100644
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@@ -76,7 +76,7 @@ struct GSPixelOffset4
 	uint32 fbp, zbp, fpsm, zpsm, bw;
 };
 
-class GSLocalMemory : public GSBlock
+class GSLocalMemory : public GSAlignedClass<32>
 {
 public:
 	typedef uint32 (*pixelAddress)(int x, int y, uint32 bp, uint32 bw);
diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp
new file mode 100644
index 0000000000..357a2a3e23
--- /dev/null
+++ b/plugins/GSdx/GSRendererCL.cpp
@@ -0,0 +1,1780 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererCL.h"
+
+#define LOG 0
+
+static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
+
+#define MAX_FRAME_SIZE 2048
+#define MAX_PRIM_COUNT 4096u
+#define MAX_PRIM_PER_BATCH_BITS 5
+#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS)
+#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH)
+#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT)
+#define BIN_SIZE_BITS 4
+#define BIN_SIZE (1u << BIN_SIZE_BITS)
+#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
+#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
+
+#if MAX_PRIM_PER_BATCH == 64u
+#define BIN_TYPE cl_ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+#define BIN_TYPE cl_uint
+#else
+#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+
+#pragma pack(push, 1)
+
+typedef struct
+{
+	GSVertexCL v[4];
+} gs_prim;
+
+typedef struct
+{
+	cl_float4 dx, dy;
+	cl_float4 zero;
+	cl_float4 reject_corner;
+} gs_barycentric;
+
+typedef struct
+{
+	cl_uint batch_counter;
+	cl_uint _pad[7];
+	struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
+	BIN_TYPE bin[MAX_BIN_COUNT];
+	cl_uchar4 bbox[MAX_PRIM_COUNT];
+	gs_prim prim[MAX_PRIM_COUNT];
+	gs_barycentric barycentric[MAX_PRIM_COUNT];
+} gs_env;
+
+#pragma pack(pop)
+
+GSRendererCL::GSRendererCL()
+	: m_vb_count(0)
+{
+	m_nativeres = true; // ignore ini, sw is always native
+
+	//s_dump = 1;
+	//s_save = 1;
+	//s_savez = 1;
+
+	// TODO: m_tc = new GSTextureCacheCL(this);
+
+	memset(m_texture, 0, sizeof(m_texture));
+
+	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
+
+	memset(m_rw_pages, 0, sizeof(m_rw_pages));
+	memset(m_tex_pages, 0, sizeof(m_tex_pages));
+
+	#define InitCVB(P) \
+		m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
+		m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer<P, 0, 1>; \
+		m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer<P, 1, 0>; \
+		m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer<P, 1, 1>; \
+
+	InitCVB(GS_POINT_CLASS);
+	InitCVB(GS_LINE_CLASS);
+	InitCVB(GS_TRIANGLE_CLASS);
+	InitCVB(GS_SPRITE_CLASS);
+
+	m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
+	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
+}
+
+GSRendererCL::~GSRendererCL()
+{
+	// TODO: delete m_tc;
+
+	for(size_t i = 0; i < countof(m_texture); i++)
+	{
+		delete m_texture[i];
+	}
+
+	_aligned_free(m_output);
+}
+
+void GSRendererCL::Reset()
+{
+	Sync(-1);
+
+	// TODO: m_tc->RemoveAll();
+
+	GSRenderer::Reset();
+}
+
+void GSRendererCL::VSync(int field)
+{
+	Sync(0); // IncAge might delete a cached texture in use
+
+	GSRenderer::VSync(field);
+
+	// TODO: m_tc->IncAge();
+
+	//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
+}
+
+void GSRendererCL::ResetDevice()
+{
+	for(size_t i = 0; i < countof(m_texture); i++)
+	{
+		delete m_texture[i];
+
+		m_texture[i] = NULL;
+	}
+}
+
+GSTexture* GSRendererCL::GetOutput(int i)
+{
+	Sync(1);
+
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	int w = DISPFB.FBW * 64;
+	int h = GetFrameRect(i).bottom;
+
+	// TODO: round up bottom
+
+	if(m_dev->ResizeTexture(&m_texture[i], w, h))
+	{
+		static int pitch = 1024 * 4;
+
+		GSVector4i r(0, 0, w, h);
+
+		const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
+
+		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
+
+		m_texture[i]->Update(r, m_output, pitch);
+
+		if(s_dump)
+		{
+			if(s_save && s_n >= s_saven)
+			{
+				m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM));
+			}
+
+			s_n++;
+		}
+	}
+
+	return m_texture[i];
+}
+
+const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f);
+
+template<uint32 primclass, uint32 tme, uint32 fst>
+void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
+{
+	GSVector4i o = (GSVector4i)m_context->XYOFFSET;
+	GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0);
+
+	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
+	{
+		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
+
+		#if _M_SSE >= 0x401
+
+		GSVector4i xyzuvf(src->m[1]);
+
+		GSVector4i xy = xyzuvf.upl16() - o;
+		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
+
+		#else
+
+		uint32 z = src->XYZ.Z;
+
+		GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o;
+		GSVector4i zf = GSVector4i((int)std::min<uint32>(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later
+
+		#endif
+
+		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
+
+		GSVector4 t = GSVector4::zero();
+
+		if(tme)
+		{
+			if(fst)
+			{
+				#if _M_SSE >= 0x401
+
+				t = GSVector4(xyzuvf.uph16());
+					
+				#else
+
+				t = GSVector4(GSVector4i::load(src->UV).upl16());
+
+				#endif
+			}
+			else
+			{
+				t = stcq.xyww() * st_scale;
+			}
+		}
+
+		dst->t = t.insert32<2, 3>(stcq);
+	}
+}
+
+void GSRendererCL::Draw()
+{
+	const GSDrawingContext* context = m_context;
+
+	GSVector4i scissor = GSVector4i(context->scissor.in);
+	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (example: single line 0,0->256,0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
+	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
+
+	GSVector4i rect = bbox.rintersect(scissor);
+
+	if(rect.rempty())
+	{
+		return;
+	}
+
+	if(s_dump)
+	{
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+
+		std::string s;
+
+		if(s_save && s_n >= s_saven && PRIM->TME)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
+
+			m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
+		}
+
+		s_n++;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+
+	try
+	{
+		size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
+		size_t ib_size = m_index.tail * sizeof(uint32);
+		size_t pb_size = sizeof(TFXParameter);
+
+		if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
+		{
+			if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size)
+			{
+				// buffer too small for even one batch, allow twice the size (at least 1 MB)
+
+				Sync(2); // must sync, reallocating the input buffers
+
+				m_cl.Unmap();
+
+				m_cl.vb.size = 0;
+				m_cl.ib.size = 0;
+
+				size_t size = std::max(vb_size * 2, 2u << 20);
+
+				printf("growing vertex/index buffer %d\n", size);
+
+				m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.size = size;
+
+				size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), 1u << 20); // worst case, three times the vertex count
+
+				ASSERT(size >= ib_size);
+
+				if(size < ib_size) size = ib_size; // should not happen
+
+				m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.size = size;
+			}
+			else
+			{
+				Enqueue();
+
+				m_cl.Unmap();
+
+				// make the write queue wait until the rendering queue is ready, it may still use the device buffers
+
+				std::vector<cl::Event> el(1);
+
+				m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]);
+				m_cl.wq->enqueueBarrierWithWaitList(&el, NULL);
+
+				// switch to the other queue/buffer (double buffering)
+
+				m_cl.wqidx = (m_cl.wqidx + 1) & 1;
+				m_cl.wq = &m_cl.queue[m_cl.wqidx];
+			}
+
+			m_cl.vb.head = m_cl.vb.tail = 0;
+			m_cl.ib.head = m_cl.ib.tail = 0;
+			m_cl.pb.head = m_cl.pb.tail = 0;
+
+			m_cl.Map();
+		}
+		else
+		{
+			// only allow batches of the same primclass in Enqueue
+
+			if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass)
+			{
+				Enqueue();
+			}
+		}
+
+		//
+
+		GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
+		uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
+		TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
+
+		pb->scissor = scissor;
+		pb->bbox = bbox;
+		pb->rect = rect;
+
+		(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? 
+
+		if(m_jobs.empty())
+		{
+			memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32));
+
+			m_vb_start = m_cl.vb.tail;
+		}
+		else
+		{
+			// TODO: SIMD
+
+			uint32 vb_count = m_vb_count;
+
+			for(size_t i = 0; i < m_index.tail; i++)
+			{
+				ib[i] = m_index.buff[i] + vb_count;
+			}
+		}
+
+		m_vb_count += m_vertex.next;
+
+		if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail))
+		{
+			return;
+		}
+
+		TFXJob job;
+
+		job.rect.x = rect.x;
+		job.rect.y = rect.y;
+		job.rect.z = rect.z;
+		job.rect.w = rect.w;
+		job.sel = pb->sel;
+		job.ib_start = m_cl.ib.tail;
+		job.ib_count = m_index.tail;
+		job.pb_start = m_cl.pb.tail;
+
+		m_jobs.push_back(job);
+
+		m_cl.vb.tail += vb_size;
+		m_cl.ib.tail += ib_size;
+		m_cl.pb.tail += pb_size;
+
+		// mark pages for writing
+
+		if(pb->sel.fb)
+		{
+			uint8 flag = pb->sel.fb;
+
+			const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages);
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				m_rw_pages[*p] |= flag;
+			}
+		}
+
+		if(pb->sel.zb)
+		{
+			uint8 flag = pb->sel.zb;
+
+			const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages);
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				m_rw_pages[*p] |= flag;
+			}
+		}
+
+		// don't buffer too much data, feed them to the device if there is enough
+
+		if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64)
+		{
+			Enqueue();
+		}
+
+		/*
+		// check if the texture is not part of a target currently in use
+
+		if(CheckSourcePages(data))
+		{
+		Sync(4);
+		}
+
+		// addref source and target pages
+
+		data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
+		*/
+
+		// update previously invalidated parts
+
+		//data->UpdateSource();
+		/*
+		if(LOG)
+		{
+		fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
+		sd->counter,
+		m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
+		m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
+		PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
+		PRIM->PRIM, sd->vertex_count, sd->index_count);
+
+		fflush(s_fp);
+		}
+		*/
+
+		//printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w);
+
+		/*
+		// invalidate new parts rendered onto
+
+		if(sd->global.sel.fwrite)
+		{
+			m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm);
+		}
+
+		if(sd->global.sel.zwrite)
+		{
+			m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm);
+		}
+		*/
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+
+		return;
+	}
+	catch(std::exception err)
+	{
+		printf("%s\n", err.what());
+
+		return;
+	}
+
+	if(s_dump)
+	{
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+		
+		std::string s;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+}
+
+void GSRendererCL::Sync(int reason)
+{
+	//printf("sync %d\n", reason);
+
+	GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
+
+	Enqueue();
+
+	m_cl.queue[2].finish();
+
+	memset(m_rw_pages, 0, sizeof(m_rw_pages));
+	memset(m_tex_pages, 0, sizeof(m_tex_pages));	
+
+	// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, 
+	// though it does not seem to be necessary even with GPU devices where it might be cached, 
+	// needs more testing...
+
+	//void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+	//m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr);
+}
+
+void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+	
+	GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
+
+	o->GetPages(r, m_tmp_pages);
+
+	//if(!synced)
+	{
+		for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+		{
+			if(m_rw_pages[*p] & 3) // rw
+			{
+				Sync(3);
+
+				break;
+			}
+		}
+	}
+
+	for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+	{
+		m_tex_pages[*p] = 1;
+	}
+}
+
+void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+	
+	//if(!synced)
+	{
+		GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
+
+		o->GetPages(r, m_tmp_pages);
+
+		for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+		{
+			if(m_rw_pages[*p] & 1) // w
+			{
+				Sync(4);
+
+				break;
+			}
+		}
+	}
+}
+/*
+bool GSRendererCL::CheckSourcePages(RasterizerData* data)
+{
+	// TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync
+	{
+		for(size_t i = 0; data->m_tex[i].t != NULL; i++)
+		{
+			data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages);
+
+			uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n;
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
+
+				if(m_fzb_pages[*p]) // currently being drawn to? => sync
+				{
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
+*/
+
+//#include "GSTextureCL.h"
+
+void GSRendererCL::Enqueue()
+{
+	if(m_jobs.empty()) return;
+
+	try
+	{
+		ASSERT(m_cl.vb.tail > m_cl.vb.head);
+		ASSERT(m_cl.ib.tail > m_cl.ib.head);
+		ASSERT(m_cl.pb.tail > m_cl.pb.head);
+
+		int primclass = m_jobs.front().sel.prim;
+
+		uint32 n;
+
+		switch(primclass)
+		{
+		case GS_POINT_CLASS: n = 1; break;
+		case GS_LINE_CLASS: n = 2; break;
+		case GS_TRIANGLE_CLASS: n = 3; break;
+		case GS_SPRITE_CLASS: n = 2; break;
+		default: __assume(0);
+		}
+
+		PrimSelector psel;
+
+		psel.key = 0;
+		psel.prim = primclass;
+
+		cl::Kernel& pk = m_cl.GetPrimKernel(psel);
+
+		pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]);
+		pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]);
+
+		TileSelector tsel;
+
+		tsel.key = 0;
+		tsel.prim = primclass;
+
+		tsel.mode = 0;
+
+		cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 1;
+
+		cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 2;
+
+		cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 3;
+
+		cl::Kernel& tk = m_cl.GetTileKernel(tsel);
+
+		tsel.key = 0;
+		tsel.clear = 1;
+
+		cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel);
+
+		//
+
+		m_cl.Unmap();
+
+		std::vector<cl::Event> el2(1);
+
+		m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]);
+		m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL);
+
+		//
+
+		cl_kernel tfx_prev = NULL;
+
+		auto head = m_jobs.begin();
+
+		while(head != m_jobs.end())
+		{
+			uint32 total_prim_count = 0;
+
+			auto next = head;
+
+			while(next != m_jobs.end())
+			{
+				auto job = next++;
+
+				uint32 cur_prim_count = job->ib_count / n;
+				uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0;
+
+				total_prim_count += cur_prim_count;
+
+				if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2)
+				{
+					uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);					
+
+					pk.setArg(3, (cl_uint)m_vb_start);
+					pk.setArg(4, (cl_uint)head->ib_start);
+
+					m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
+
+					if(0)
+					{
+						gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+						m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+					}
+
+					GSVector4i rect = GSVector4i::zero();
+
+					for(auto i = head; i != next; i++)
+					{
+						rect = rect.runion(GSVector4i::load<false>(&i->rect));
+					}
+
+					rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
+
+					int bin_w = rect.width();
+					int bin_h = rect.height();
+
+					uint32 batch_count = BATCH_COUNT(prim_count);
+					uint32 bin_count = bin_w * bin_h;
+
+					cl_uchar4 bin_dim;
+
+					bin_dim.s[0] = (cl_uchar)rect.x;
+					bin_dim.s[1] = (cl_uchar)rect.y;
+					bin_dim.s[2] = (cl_uchar)bin_w;
+					bin_dim.s[3] = (cl_uchar)bin_h;
+
+					if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area
+					{
+						m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange);
+
+						if(bin_count <= 32 && m_cl.WIs >= 256)
+						{
+							uint32 item_count;
+							uint32 group_count;
+							cl::Kernel* k;
+
+							if(bin_count <= 8)
+							{
+								item_count = std::min(prim_count, 32u);
+								group_count = ((prim_count + 31) >> 5) * item_count;
+								k = &tk_32;
+							}
+							else if(bin_count <= 16)
+							{
+								item_count = std::min(prim_count, 16u);
+								group_count = ((prim_count + 15) >> 4) * item_count;
+								k = &tk_16;
+							}
+							else
+							{
+								item_count = std::min(prim_count, 8u);
+								group_count = ((prim_count + 7) >> 3) * item_count;
+								k = &tk_8;
+							}
+
+							k->setArg(1, (cl_uint)prim_count);
+							k->setArg(2, (cl_uint)bin_count);
+							k->setArg(3, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count));
+						}
+						else
+						{
+							uint32 item_count = std::min(bin_count, m_cl.WIs);
+							uint32 group_count = batch_count * item_count;
+
+							tk.setArg(1, (cl_uint)prim_count);
+							tk.setArg(2, (cl_uint)batch_count);
+							tk.setArg(3, (cl_uint)bin_count);
+							tk.setArg(4, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
+						}
+
+						if(0)
+						{
+							gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+							m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+						}
+					}
+
+					//
+
+					uint32 prim_start = 0;
+
+					for(auto i = head; i != next; i++)
+					{
+						ASSERT(prim_start < MAX_PRIM_COUNT);
+
+						uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start);
+
+						// TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text),
+						// changed by tfx in the previous loop or marked by InvalidateVideoMem
+
+						// TODO: tile level z test
+
+						cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
+
+						if(tfx_prev != tfx())
+						{
+							tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
+
+							tfx_prev = tfx();
+						}
+
+						tfx.setArg(4, (cl_uint)i->pb_start);
+						tfx.setArg(5, (cl_uint)prim_start);
+						tfx.setArg(6, (cl_uint)prim_count_inner);
+						tfx.setArg(7, (cl_uint)batch_count);
+						tfx.setArg(8, (cl_uint)bin_count);
+						tfx.setArg(9, bin_dim);
+
+						//m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256));
+
+						//printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y);
+
+						GSVector4i r = GSVector4i::load<false>(&i->rect);
+
+						r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
+						/*
+						if(i->sel.IsSolidRect()) // TODO: simple mem fill
+							;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
+						else
+						*/
+						m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
+
+						// TODO: invalidate texture cache pages
+
+						prim_start += prim_count_inner;
+					}
+
+					//
+
+					if(total_prim_count > MAX_PRIM_COUNT)
+					{
+						prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
+
+						job->ib_start += prim_count * n * sizeof(uint32);
+						job->ib_count -= prim_count * n;
+
+						next = job; // try again for the reminder
+					}
+
+					break;
+				}
+			}
+
+			head = next;
+		}
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+	}
+
+	m_jobs.clear();
+
+	m_vb_count = 0;
+
+	m_cl.vb.head = m_cl.vb.tail;
+	m_cl.ib.head = m_cl.ib.tail;
+	m_cl.pb.head = m_cl.pb.tail;
+
+	m_cl.Map();
+}
+
+static int RemapPSM(int psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: psm = 0; break;
+	case PSM_PSMCT24: psm = 1; break;
+	case PSM_PSMCT16: psm = 2; break;
+	case PSM_PSMCT16S: psm = 3; break;
+	case PSM_PSMZ32: psm = 4; break;
+	case PSM_PSMZ24: psm = 5; break;
+	case PSM_PSMZ16: psm = 6; break;
+	case PSM_PSMZ16S: psm = 7; break;
+	case PSM_PSMT8: psm = 8; break;
+	case PSM_PSMT4: psm = 9; break;
+	case PSM_PSMT8H: psm = 10; break;
+	case PSM_PSMT4HL: psm = 11; break;
+	case PSM_PSMT4HH: psm = 12; break;
+	}
+
+	return psm;
+}
+
+bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
+{
+	const GSDrawingEnvironment& env = m_env;
+	const GSDrawingContext* context = m_context;
+	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
+
+	pb->sel.key = 0;
+
+	pb->sel.atst = ATST_ALWAYS;
+	pb->sel.tfx = TFX_NONE;
+	pb->sel.ababcd = 0xff;
+	pb->sel.prim = primclass;
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
+
+	if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
+	{
+		fm = 0xffffffff;
+		zm = 0xffffffff;
+	}
+
+	if(PRIM->TME)
+	{
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+	}
+
+	if(context->TEST.ATE)
+	{
+		if(!TryAlphaTest(fm, zm))
+		{
+			pb->sel.atst = context->TEST.ATST;
+			pb->sel.afail = context->TEST.AFAIL;
+			pb->aref = context->TEST.AREF;
+		}
+	}
+
+	bool fwrite;
+	bool zwrite;
+	
+	switch(context->FRAME.PSM)
+	{
+	default:
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+		fwrite = fm != 0xffffffff;
+		break;
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		fwrite = (fm & 0x00ffffff) != 0x00ffffff;
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8;
+		break;
+	}
+
+	switch(context->ZBUF.PSM)
+	{
+	default:
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+		zwrite = zm != 0xffffffff;
+		break;
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		zwrite = (zm & 0x00ffffff) != 0x00ffffff;
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		zm &= 0x80f8f8f8;
+		zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8;
+		break;
+	}
+
+	if(!fwrite && !zwrite) return false;
+
+	bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
+
+	pb->sel.fwrite = fwrite;
+	pb->sel.ftest = ftest;
+	pb->sel.zwrite = zwrite;
+	pb->sel.ztest = ztest;
+
+	if(fwrite || ftest)
+	{
+		pb->sel.fpsm = RemapPSM(context->FRAME.PSM);
+
+		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
+		{
+			pb->sel.iip = PRIM->IIP;
+		}
+
+		if(PRIM->TME)
+		{
+			pb->sel.tfx = context->TEX0.TFX;
+			pb->sel.tcc = context->TEX0.TCC;
+			pb->sel.fst = PRIM->FST;
+			pb->sel.ltf = m_vt.IsLinear();
+			pb->sel.tpsm = RemapPSM(context->TEX0.PSM);
+			pb->sel.aem = m_env.TEXA.AEM;
+
+			pb->tbp[0] = context->TEX0.TBP0;
+			pb->tbw[0] = context->TEX0.TBW;
+			pb->ta0 = m_env.TEXA.TA0;
+			pb->ta1 = m_env.TEXA.TA1;
+
+			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+			{
+				pb->sel.tlu = 1;
+
+				memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
+			}
+
+			pb->sel.wms = context->CLAMP.WMS;
+			pb->sel.wmt = context->CLAMP.WMT;
+
+			if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			{
+				// modulate does not do anything when vertex color is 0x80
+
+				pb->sel.tfx = TFX_DECAL;
+			}
+
+			// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
+
+			// TODO: if(t == NULL) {ASSERT(0); return false;}
+
+			GSVector4i r;
+
+			GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
+
+			// TODO: data->SetSource(t, r, 0);
+
+			// TODO: pb->sel.tw = t->m_tw - 3;
+
+			// TODO: store r to current job
+
+			if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
+			{
+				// TEX1.MMIN
+				// 000 p
+				// 001 l
+				// 010 p round
+				// 011 p tri
+				// 100 l round
+				// 101 l tri
+
+				if(m_vt.m_lod.x > 0)
+				{
+					pb->sel.ltf = context->TEX1.MMIN >> 2;
+				}
+				else
+				{
+					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
+				}
+
+				pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				pb->sel.lcm = context->TEX1.LCM;
+
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
+				int k = context->TEX1.K << 12;
+
+				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
+				{
+					k = (int)m_vt.m_lod.x << 16; // set lod to max level
+
+					pb->sel.lcm = 1; // lod is constant
+					pb->sel.mmin = 1; // tri-linear is meaningless
+				}
+
+				if(pb->sel.mmin == 2)
+				{
+					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
+				}
+
+				if(pb->sel.fst)
+				{
+					ASSERT(pb->sel.lcm == 1);
+					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
+
+					pb->sel.lcm = 1;
+				}
+
+				if(pb->sel.lcm)
+				{
+					int lod = std::max<int>(std::min<int>(k, mxl), 0);
+
+					if(pb->sel.mmin == 1)
+					{
+						lod = (lod + 0x8000) & 0xffff0000; // rounding
+					}
+
+					pb->lod = lod;
+
+					// TODO: lot to optimize when lod is constant
+				}
+				else
+				{
+					pb->mxl = mxl;
+					pb->l = (float)(-0x10000 << context->TEX1.L);
+					pb->k = (float)k;
+				}
+
+				GIFRegTEX0 MIP_TEX0 = context->TEX0;
+				GIFRegCLAMP MIP_CLAMP = context->CLAMP;
+
+				GSVector4 tmin = m_vt.m_min.t;
+				GSVector4 tmax = m_vt.m_max.t;
+
+				static int s_counter = 0;
+
+				for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++)
+				{
+					switch(i)
+					{
+					case 1:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP1;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW1;
+						break;
+					case 2:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP2;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW2;
+						break;
+					case 3:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP3;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW3;
+						break;
+					case 4:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP4;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW4;
+						break;
+					case 5:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP5;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW5;
+						break;
+					case 6:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP6;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW6;
+						break;
+					default:
+						__assume(0);
+					}
+
+					pb->tbp[i] = MIP_TEX0.TBP0;
+					pb->tbw[i] = MIP_TEX0.TBW;
+
+					if(MIP_TEX0.TW > 0) MIP_TEX0.TW--;
+					if(MIP_TEX0.TH > 0) MIP_TEX0.TH--;
+
+					MIP_CLAMP.MINU >>= 1;
+					MIP_CLAMP.MINV >>= 1;
+					MIP_CLAMP.MAXU >>= 1;
+					MIP_CLAMP.MAXV >>= 1;
+
+					m_vt.m_min.t *= 0.5f;
+					m_vt.m_max.t *= 0.5f;
+
+					// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3);
+
+					// TODO: if(t == NULL) {ASSERT(0); return false;}
+
+					GSVector4i r;
+
+					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
+
+					// TODO: data->SetSource(t, r, i);
+
+					// TODO: store r to current job
+				}
+
+				s_counter++;
+
+				m_vt.m_min.t = tmin;
+				m_vt.m_max.t = tmax;
+			}
+			else
+			{
+				if(pb->sel.fst == 0)
+				{
+					// skip per pixel division if q is constant
+
+					GSVertexCL* RESTRICT v = vertex;
+
+					if(m_vt.m_eq.q)
+					{
+						pb->sel.fst = 1;
+
+						const GSVector4& t = v[index[0]].t;
+
+						if(t.z != 1.0f)
+						{
+							GSVector4 w = t.zzzz().rcpnr();
+
+							for(int i = 0, j = vertex_count; i < j; i++)
+							{
+								GSVector4 t = v[i].t;
+
+								v[i].t = (t * w).xyzw(t);
+							}
+						}
+					}
+					else if(primclass == GS_SPRITE_CLASS)
+					{
+						pb->sel.fst = 1;
+
+						for(int i = 0, j = vertex_count; i < j; i += 2)
+						{
+							GSVector4 t0 = v[i + 0].t;
+							GSVector4 t1 = v[i + 1].t;
+
+							GSVector4 w = t1.zzzz().rcpnr();
+
+							v[i + 0].t = (t0 * w).xyzw(t0);
+							v[i + 1].t = (t1 * w).xyzw(t1);
+						}
+					}
+				}
+
+				if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel?
+				{
+					// if q is constant we can do the half pel shift for bilinear sampling on the vertices
+
+					// TODO: but not when mipmapping is used!!!
+
+					GSVector4 half(8.0f, 8.0f);
+
+					GSVertexCL* RESTRICT v = vertex;
+
+					for(int i = 0, j = vertex_count; i < j; i++)
+					{
+						GSVector4 t = v[i].t;
+
+						v[i].t = (t - half).xyzw(t);
+					}
+				}
+			}
+
+			int tw = 1 << context->TEX0.TW;
+			int th = 1 << context->TEX0.TH;
+
+			switch(context->CLAMP.WMS)
+			{
+			case CLAMP_REPEAT:
+				pb->minu = tw - 1;
+				pb->maxu = 0;
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minu = 0;
+				pb->maxu = tw - 1;
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minu = std::min((int)context->CLAMP.MINU, tw - 1);
+				pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1);
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minu = (int)context->CLAMP.MINU & (tw - 1);
+				pb->maxu = (int)context->CLAMP.MAXU & (tw - 1);
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+
+			switch(context->CLAMP.WMT)
+			{
+			case CLAMP_REPEAT:
+				pb->minv = th - 1;
+				pb->maxv = 0;
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minv = 0;
+				pb->maxv = th - 1;
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minv = std::min((int)context->CLAMP.MINV, th - 1);
+				pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127
+				pb->maxv = (int)context->CLAMP.MAXV & (th - 1);
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+		}
+
+		if(PRIM->FGE)
+		{
+			pb->sel.fge = 1;
+			pb->fog = env.FOGCOL.u32[0];
+		}
+
+		if(context->FRAME.PSM != PSM_PSMCT24)
+		{
+			pb->sel.date = context->TEST.DATE;
+			pb->sel.datm = context->TEST.DATM;
+		}
+
+		if(!IsOpaque())
+		{
+			pb->sel.abe = PRIM->ABE;
+			pb->sel.ababcd = context->ALPHA.u32[0];
+
+			if(env.PABE.PABE)
+			{
+				pb->sel.pabe = 1;
+			}
+
+			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
+			{
+				pb->sel.aa1 = 1;
+			}
+
+			pb->afix = context->ALPHA.FIX;
+		}
+
+		if(pb->sel.date
+		|| pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1
+		|| pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
+		|| (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0
+		|| (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask
+		|| (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0)
+		{
+			pb->sel.rfb = 1;
+		}
+
+		pb->sel.colclamp = env.COLCLAMP.CLAMP;
+		pb->sel.fba = context->FBA.FBA;
+
+		if(env.DTHE.DTHE)
+		{
+			pb->sel.dthe = 1;
+
+			GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
+			GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
+			GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16);
+			GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16);
+
+			pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3));
+		}
+	}
+
+	if(zwrite || ztest)
+	{
+		pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
+		pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+		pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
+	}
+
+	pb->fm = fm;
+	pb->zm = zm;
+
+	if((pb->sel.fpsm & 3) == 1)
+	{
+		pb->fm |= 0xff000000;
+	}
+	else if((pb->sel.fpsm & 3) >= 2)
+	{
+		uint32 rb = pb->fm & 0x00f800f8;
+		uint32 ga = pb->fm & 0x8000f800;
+
+		pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
+	}
+
+	if((pb->sel.zpsm & 3) == 1)
+	{
+		pb->zm |= 0xff000000;
+	}
+	else if((pb->sel.zpsm & 3) >= 2)
+	{
+		pb->zm |= 0xffff0000;
+	}
+
+	if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor)))
+	{
+		pb->sel.noscissor = 1;
+	}
+
+	pb->fbp = context->FRAME.Block();
+	pb->zbp = context->ZBUF.Block();
+	pb->bw = context->FRAME.FBW;
+
+	return true;
+}
+
+//////////
+
+//#define IOCL_DEBUG
+
+GSRendererCL::CL::CL()
+{
+	WIs = INT_MAX;
+
+	std::vector<cl::Platform> platforms;
+
+	cl::Platform::get(&platforms);
+
+	for(auto& p : platforms)
+	{
+		std::string platform_vendor = p.getInfo<CL_PLATFORM_VENDOR>();
+
+		std::vector<cl::Device> ds;
+
+		p.getDevices(CL_DEVICE_TYPE_ALL, &ds);
+
+		for(auto& device : ds)
+		{
+			std::string vendor = device.getInfo<CL_DEVICE_VENDOR>();
+			std::string name = device.getInfo<CL_DEVICE_NAME>();
+			std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
+
+			printf("%s %s %s", vendor.c_str(), name.c_str(), version.c_str());
+
+			cl_device_type type = device.getInfo<CL_DEVICE_TYPE>();
+
+			switch(type)
+			{
+			case CL_DEVICE_TYPE_GPU: printf(" GPU"); break;
+			case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
+			}
+
+			if(strstr(version.c_str(), "OpenCL C 1.2") != NULL)
+			{
+#ifdef IOCL_DEBUG
+				if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
+#else
+				//if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
+				if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL)
+#endif
+				{
+					devices.push_back(device);
+
+					WIs = std::min(WIs, device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+
+					printf(" *");
+				}
+			}
+
+			printf("\n");
+		}
+
+		if(!devices.empty()) break;
+	}
+
+	if(devices.empty())
+	{
+		throw new std::exception("OpenCL device not found");
+	}
+
+	context = cl::Context(devices);
+
+	queue[0] = cl::CommandQueue(context);
+	queue[1] = cl::CommandQueue(context);
+	queue[2] = cl::CommandQueue(context);
+
+	vector<unsigned char> buff;
+
+	if(theApp.LoadResource(IDR_TFX_CL, buff))
+	{
+		kernel_str = std::string((const char*)buff.data(), buff.size());
+	}
+
+	vb.head = vb.tail = vb.size = 0;
+	ib.head = ib.tail = ib.size = 0;
+	pb.head = pb.tail = pb.size = 0;
+
+	vb.mapped_ptr = vb.ptr = NULL;
+	ib.mapped_ptr = ib.ptr = NULL;
+	pb.mapped_ptr = pb.ptr = NULL;
+
+	pb.size = sizeof(TFXParameter) * 256;
+	pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
+	pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
+
+	env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env));
+
+	wqidx = 0;
+	wq = &queue[0];
+}
+
+GSRendererCL::CL::~CL()
+{
+	Unmap();
+}
+
+void GSRendererCL::CL::Map()
+{
+	Unmap();
+
+	if(vb.head < vb.size)
+	{
+		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head);
+		vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
+		ASSERT(((size_t)vb.ptr & 15) == 0);
+		ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
+	}
+
+	if(ib.head < ib.size)
+	{
+		ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head);
+		ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
+		ASSERT(((size_t)ib.ptr & 15) == 0);
+	}
+
+	if(pb.head < pb.size)
+	{
+		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head);
+		pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
+		ASSERT(((size_t)pb.ptr & 15) == 0);
+		ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
+	}
+}
+
+void GSRendererCL::CL::Unmap()
+{
+	if(vb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr);
+	if(ib.mapped_ptr != NULL) wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr);
+	if(pb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr);
+
+	vb.mapped_ptr = vb.ptr = NULL;
+	ib.mapped_ptr = ib.ptr = NULL;
+	pb.mapped_ptr = pb.ptr = NULL;
+}
+
+static void AddDefs(ostringstream& opt)
+{
+	opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
+	opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
+	opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
+	opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u ";
+	opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u ";
+	opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " ";
+	opt << "-D BIN_SIZE=" << BIN_SIZE << "u ";
+	opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u ";	
+	opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
+#ifdef IOCL_DEBUG
+	opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
+#endif
+}
+
+cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
+{
+	auto i = prim_map.find(sel);
+
+	if(i != prim_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "prim_%02x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_PRIM=" << entry << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	prim_map[sel] = k;
+
+	k.setArg(0, env);
+
+	return prim_map[sel];
+}
+
+cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel)
+{
+	auto i = tile_map.find(sel);
+
+	if(i != tile_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "tile_%02x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_TILE=" << entry << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+		opt << "-D MODE=" << sel.mode << " ";
+		opt << "-D CLEAR=" << sel.clear << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	tile_map[sel] = k;
+
+	k.setArg(0, env);
+
+	return tile_map[sel];
+}
+
+cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
+{
+	auto i = tfx_map.find(sel);
+
+	if(i != tfx_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "tfx_%016x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_TFX=" << entry << " ";
+		opt << "-D FPSM=" << sel.fpsm << " ";
+		opt << "-D ZPSM=" << sel.zpsm << " ";
+		opt << "-D ZTST=" << sel.ztst << " ";
+		opt << "-D ATST=" << sel.atst << " ";
+		opt << "-D AFAIL=" << sel.afail << " ";
+		opt << "-D IIP=" << sel.iip << " ";
+		opt << "-D TFX=" << sel.tfx << " ";
+		opt << "-D TCC=" << sel.tcc << " ";
+		opt << "-D FST=" << sel.fst << " ";
+		opt << "-D LTF=" << sel.ltf << " ";
+		opt << "-D TLU=" << sel.tlu << " ";
+		opt << "-D FGE=" << sel.fge << " ";
+		opt << "-D DATE=" << sel.date << " ";
+		opt << "-D ABE=" << sel.abe << " ";
+		opt << "-D ABA=" << sel.aba << " ";
+		opt << "-D ABB=" << sel.abb << " ";
+		opt << "-D ABC=" << sel.abc << " ";
+		opt << "-D ABD=" << sel.abd << " ";
+		opt << "-D PABE=" << sel.pabe << " ";
+		opt << "-D AA1=" << sel.aa1 << " ";
+		opt << "-D FWRITE=" << sel.fwrite << " ";
+		opt << "-D FTEST=" << sel.ftest << " ";
+		opt << "-D RFB=" << sel.rfb << " ";
+		opt << "-D ZWRITE=" << sel.zwrite << " ";
+		opt << "-D ZTEST=" << sel.ztest << " ";
+		opt << "-D ZOVERFLOW=" << sel.zoverflow << " ";
+		opt << "-D WMS=" << sel.wms << " ";
+		opt << "-D WMT=" << sel.wmt << " ";
+		opt << "-D DATM=" << sel.datm << " ";
+		opt << "-D COLCLAMP=" << sel.colclamp << " ";
+		opt << "-D FBA=" << sel.fba << " ";
+		opt << "-D DTHE=" << sel.dthe << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+		opt << "-D TW=" << sel.tw << " ";
+		opt << "-D LCM=" << sel.lcm << " ";
+		opt << "-D MMIN=" << sel.mmin << " ";
+		opt << "-D NOSCISSOR=" << sel.noscissor << " ";
+		opt << "-D TPSM=" << sel.tpsm << " ";
+		opt << "-D AEM=" << sel.aem << " ";
+		opt << "-D FB=" << sel.fb << " ";
+		opt << "-D ZB=" << sel.zb << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	tfx_map[sel] = k;
+
+	k.setArg(0, env);
+	k.setArg(1, vm);
+	k.setArg(2, tex);
+
+	return tfx_map[sel];
+}
diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h
new file mode 100644
index 0000000000..3ac008ceaa
--- /dev/null
+++ b/plugins/GSdx/GSRendererCL.h
@@ -0,0 +1,310 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+//#include "GSTextureCacheCL.h"
+
+__aligned(struct, 32) GSVertexCL
+{
+	GSVector4 p, t;
+};
+
+class GSRendererCL : public GSRenderer
+{
+	typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	ConvertVertexBufferPtr m_cvb[4][2][2];
+
+	template<uint32 primclass, uint32 tme, uint32 fst>
+	void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	union PrimSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union TileSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+			uint32 mode:2; // 2
+			uint32 clear:1; // 4
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union JobSelector
+	{
+		struct
+		{
+			uint32 dummy:1; // 0
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union TFXSelector
+	{
+		struct
+		{
+			uint32 fpsm:3; // 0
+			uint32 zpsm:3; // 3
+			uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g))
+			uint32 atst:3; // 8
+			uint32 afail:2; // 11
+			uint32 iip:1; // 13
+			uint32 tfx:3; // 14
+			uint32 tcc:1; // 17
+			uint32 fst:1; // 18
+			uint32 ltf:1; // 19
+			uint32 tlu:1; // 20
+			uint32 fge:1; // 21
+			uint32 date:1; // 22
+			uint32 abe:1; // 23
+			uint32 aba:2; // 24
+			uint32 abb:2; // 26
+			uint32 abc:2; // 28
+			uint32 abd:2; // 30
+
+			uint32 pabe:1; // 32
+			uint32 aa1:1; // 33
+			uint32 fwrite:1; // 34
+			uint32 ftest:1; // 35
+			uint32 rfb:1; // 36
+			uint32 zwrite:1; // 37
+			uint32 ztest:1; // 38
+			uint32 zoverflow:1; // 39 (z max >= 0x80000000)
+			uint32 wms:2; // 40
+			uint32 wmt:2; // 42
+			uint32 datm:1; // 44
+			uint32 colclamp:1; // 45
+			uint32 fba:1; // 46
+			uint32 dthe:1; // 47
+			uint32 prim:2; // 48
+			uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
+			uint32 lcm:1; // 53
+			uint32 mmin:2; // 54
+			uint32 noscissor:1; // 55
+			uint32 tpsm:4; // 56
+			uint32 aem:1; // 60
+			// TODO
+		};
+
+		struct
+		{
+			uint32 _pad1:24;
+			uint32 ababcd:8;
+			uint32 _pad2:2;
+			uint32 fb:2;
+			uint32 _pad3:1;
+			uint32 zb:2;
+		};
+
+		struct
+		{
+			uint32 lo;
+			uint32 hi;
+		};
+
+		uint64 key;
+
+		operator uint64() const { return key; }
+
+		bool IsSolidRect() const
+		{
+			return prim == GS_SPRITE_CLASS
+				&& iip == 0
+				&& tfx == TFX_NONE
+				&& abe == 0
+				&& ztst <= 1
+				&& atst <= 1
+				&& date == 0
+				&& fge == 0;
+		}
+	};
+
+	__aligned(struct, 32) TFXParameter
+	{
+		GSVector4i scissor;
+		GSVector4i bbox;
+		GSVector4i rect;
+		GSVector4i dimx; // 4x4 signed char
+		TFXSelector sel;
+		uint32 fbp, zbp, bw;
+		uint32 fm, zm;
+		uint32 fog; // rgb
+		uint8 aref, afix;
+		uint8 ta0, ta1;
+		uint32 tbp[7], tbw[7];
+		int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix
+		int lod; // lcm == 1
+		int mxl;
+		float l; // TEX1.L * -0x10000
+		float k; // TEX1.K * 0x10000
+		uint32 clut[256];
+	};
+
+	struct TFXJob
+	{
+		struct {int x, y, z, w;} rect;
+		TFXSelector sel;
+		uint32 ib_start, ib_count;
+		uint32 pb_start;
+	};
+
+	class CL
+	{
+		std::string kernel_str;
+		std::map<uint32, cl::Kernel> prim_map;
+		std::map<uint32, cl::Kernel> tile_map;
+		std::map<uint64, cl::Kernel> tfx_map;
+
+	public:
+		std::vector<cl::Device> devices;
+		cl::Context context;
+		cl::CommandQueue queue[3];
+		cl::Buffer vm;
+		cl::Buffer tex;
+		struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb;
+		cl::Buffer env;
+		cl::CommandQueue* wq;
+		int wqidx;
+		size_t WIs;
+
+	public:
+		CL();
+		virtual ~CL();
+
+		cl::Kernel& GetPrimKernel(const PrimSelector& sel);
+		cl::Kernel& GetTileKernel(const TileSelector& sel);
+		cl::Kernel& GetTFXKernel(const TFXSelector& sel);
+
+		void Map();
+		void Unmap();
+	};
+
+	CL m_cl;
+	std::list<TFXJob> m_jobs;
+	uint32 m_vb_start;
+	uint32 m_vb_count;
+
+	void Enqueue();
+
+	/*
+	class RasterizerData : public GSAlignedClass<32>
+	{
+		__aligned(struct, 16) TextureLevel
+		{
+			GSVector4i r;
+			// TODO: GSTextureCacheCL::Texture* t;
+		};
+
+	public:
+		GSRendererCL* m_parent;
+		const uint32* m_fb_pages;
+		const uint32* m_zb_pages;
+
+		//cl::Buffer m_vbuff;
+		//cl::Buffer m_ibuff;
+
+		// TODO: buffers
+		TextureLevel m_tex[7 + 1]; // NULL terminated
+		//cl::Buffer m_clut;
+		//cl::Buffer m_dimx;
+
+		// TODO: struct in a cl::Buffer
+		TFXSelector m_sel;
+		GSVector4i m_scissor;
+		GSVector4i m_bbox;
+		uint32 m_fm, m_zm;
+		int m_aref, m_afix;
+		uint32 m_fog; // rgb
+		int m_lod; // lcm == 1
+		int m_mxl;
+		float m_l; // TEX1.L * -0x10000
+		float m_k; // TEX1.K * 0x10000
+		// TODO: struct { GSVector4i min, max, minmax, mask, invmask; } t; // [u] x 4 [v] x 4
+
+		RasterizerData(GSRendererCL* parent)
+			: m_parent(parent)
+			, m_fb_pages(NULL)
+			, m_zb_pages(NULL)
+		{
+			m_sel.key = 0;
+		}
+
+		virtual ~RasterizerData()
+		{
+			// TODO: ReleasePages();
+		}
+
+		// TODO: void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm);
+		// TODO: void ReleasePages();
+
+		// TODO: void SetSource(GSTextureCacheCL::Texture* t, const GSVector4i& r, int level);
+		// TODO: void UpdateSource();
+	};
+	*/
+protected:
+//	GSTextureCacheCL* m_tc;
+	GSTexture* m_texture[2];
+	uint8* m_output;
+
+	uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read)
+	uint8 m_tex_pages[512];
+	uint32 m_tmp_pages[512 + 1];
+
+	void Reset();
+	void VSync(int field);
+	void ResetDevice();
+	GSTexture* GetOutput(int i);
+
+	void Draw();
+	void Sync(int reason);
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
+
+	void UsePages(const uint32* pages, int type);
+	void ReleasePages(const uint32* pages, int type);
+	
+	//bool CheckSourcePages(RasterizerData* data);
+
+	bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
+
+public:
+	GSRendererCL();
+	virtual ~GSRendererCL();
+};
diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp
index 30d848e4aa..71f42f961d 100644
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@@ -429,6 +429,15 @@ void GSRendererSW::Draw()
 
 	GSVector4i scissor = GSVector4i(context->scissor.in);
 	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (single line: 0, 0 - 256, 0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
 	GSVector4i r = bbox.rintersect(scissor);
 
 	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
@@ -973,7 +982,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 	gd.sel.zpsm = 3;
 	gd.sel.atst = ATST_ALWAYS;
 	gd.sel.tfx = TFX_NONE;
-	gd.sel.ababcd = 255;
+	gd.sel.ababcd = 0xff;
 	gd.sel.prim = primclass;
 
 	uint32 fm = context->FRAME.FBMSK;
@@ -1101,7 +1110,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 				gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
 				gd.sel.lcm = context->TEX1.LCM;
 
-				int mxl = (std::min<int>((int)context->TEX1.MXL, 6) << 16);
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
 				int k = context->TEX1.K << 12;
 
 				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp
index 89656965dd..41d66f24e9 100644
--- a/plugins/GSdx/GSSettingsDlg.cpp
+++ b/plugins/GSdx/GSSettingsDlg.cpp
@@ -329,16 +329,19 @@ void GSSettingsDlg::UpdateRenderers()
 	{
 		GSSetting r = theApp.m_gs_renderers[i];
 
-		if(i >= 3 && i <= 5)
+		if(r.id >= 3 && r.id <= 5 || r.id == 15)
 		{
 			if(level < D3D_FEATURE_LEVEL_10_0) continue;
 
-			r.name = std::string("Direct3D") + (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10");
+			r.name += (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10");
 		}
 
 		renderers.push_back(r);
-		if (r.id == renderer_setting)
+
+		if(r.id == renderer_setting)
+		{
 			renderer_sel = renderer_setting;
+		}
 	}
 
 	ComboBoxInit(IDC_RENDERER, renderers, renderer_sel);
@@ -607,13 +610,13 @@ bool GSHacksDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
 				break;
 			case IDC_SPRITEHACK:
 				helpstr = "Sprite Hack\n\nHelps getting rid of black inner lines in some filtered sprites."
-						  " Half option is the preferred one. Use it for Mana Khemia or Ar tonelico for example."
+						  " Half option is the preferred one. Use it for Mana Khemia or Ar Tonelico for example."
 						  " Full can be used for Tales of Destiny.";
 				break;
 			case IDC_WILDHACK:
 				helpstr = "WildArms\n\nLowers the GS precision to avoid gaps between pixels when"
 						  " upscaling. Full option fixes the text on WildArms games, while Half option might improve portraits"
-						  " in Ar tonelico.";
+						  " in Ar Tonelico.";
 				break;
 			case IDC_MSAACB:
 			case IDC_STATIC_MSAA:
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index 604836af03..731b2aa480 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -1551,7 +1551,8 @@ void GSState::Read(uint8* mem, int len)
 		return;
 	}
 
-	if (!m_init_read_fifo_supported) {
+	if(!m_init_read_fifo_supported)
+	{
 		if(m_tr.x == sx && m_tr.y == sy)
 		{
 			InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
@@ -2316,20 +2317,20 @@ void GSState::GrowVertexBuffer()
 	GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
 	uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3
 
-	if (!vertex || !index)
+	if(vertex == NULL || index == NULL)
 	{
 		printf("GSdx: failed to allocate %d bytes for verticles and %d for indices.\n", sizeof(GSVertex) * maxcount, sizeof(uint32) * maxcount * 3);
 		throw GSDXError();
 	}
 
-	if (m_vertex.buff != NULL)
+	if(m_vertex.buff != NULL)
 	{
 		memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail);
 
 		_aligned_free(m_vertex.buff);
 	}
 
-	if (m_index.buff != NULL)
+	if(m_index.buff != NULL)
 	{
 		memcpy(index, m_index.buff, sizeof(uint32) * m_index.tail);
 		
diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp
index 00fb984260..93d9642fc2 100644
--- a/plugins/GSdx/GSdx.cpp
+++ b/plugins/GSdx/GSdx.cpp
@@ -41,8 +41,29 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserv
 	return TRUE;
 }
 
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+	HRSRC hRsrc = FindResource((HMODULE)s_hModule, MAKEINTRESOURCE(id), type != NULL ? type : RT_RCDATA);
+	if(!hRsrc) return false;
+	HGLOBAL hGlobal = ::LoadResource((HMODULE)s_hModule, hRsrc);
+	if(!hGlobal) return false;
+	DWORD size = SizeofResource((HMODULE)s_hModule, hRsrc);
+	if(!size) return false;
+	buff.resize(size);
+	memcpy(buff.data(), LockResource(hGlobal), size);
+	return true;
+}
+
 #else
 
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+	printf("LoadResource not implemented\n");
+	return false;
+}
+
 size_t GSdxApp::GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName)
 {
 	BuildConfigurationMap(lpFileName);
@@ -108,10 +129,12 @@ GSdxApp::GSdxApp()
 
 	m_gs_renderers.push_back(GSSetting(0, "Direct3D9", "Hardware"));
 	m_gs_renderers.push_back(GSSetting(1, "Direct3D9", "Software"));
+	m_gs_renderers.push_back(GSSetting(14, "Direct3D9", "OpenCL"));
 	m_gs_renderers.push_back(GSSetting(2, "Direct3D9", "Null"));
-	m_gs_renderers.push_back(GSSetting(3, "Direct3D%d    ", "Hardware"));
-	m_gs_renderers.push_back(GSSetting(4, "Direct3D%d    ", "Software"));
-	m_gs_renderers.push_back(GSSetting(5, "Direct3D%d    ", "Null"));
+	m_gs_renderers.push_back(GSSetting(3, "Direct3D", "Hardware"));
+	m_gs_renderers.push_back(GSSetting(4, "Direct3D", "Software"));
+	m_gs_renderers.push_back(GSSetting(15, "Direct3D", "OpenCL"));
+	m_gs_renderers.push_back(GSSetting(5, "Direct3D", "Null"));
 #ifdef _LINUX
 	// note: SDL was removed. We keep those bits for compatibility of the renderer
 	// position in the linux dialog.
@@ -119,9 +142,11 @@ GSdxApp::GSdxApp()
 	m_gs_renderers.push_back(GSSetting(8, "SDL 1.3", "Null"));
 #endif
 	m_gs_renderers.push_back(GSSetting(10, "Null", "Software"));
+	m_gs_renderers.push_back(GSSetting(16, "Null", "OpenCL"));
 	m_gs_renderers.push_back(GSSetting(11, "Null", "Null"));
 	m_gs_renderers.push_back(GSSetting(12, "OpenGL", "Hardware"));
 	m_gs_renderers.push_back(GSSetting(13, "OpenGL", "Software"));
+	m_gs_renderers.push_back(GSSetting(17, "OpenGL", "OpenCL"));
 
 	m_gs_interlace.push_back(GSSetting(0, "None", ""));
 	m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth"));
diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h
index f89a3fa761..5ccb1e2c69 100644
--- a/plugins/GSdx/GSdx.h
+++ b/plugins/GSdx/GSdx.h
@@ -39,6 +39,7 @@ public:
 #ifdef _WINDOWS
  	HMODULE GetModuleHandle() {return (HMODULE)GetModuleHandlePtr();}
 #endif
+
 #ifdef _LINUX
 	void BuildConfigurationMap(const char* lpFileName);
 	void ReloadConfig();
@@ -48,6 +49,8 @@ public:
 	int GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName);
 #endif
 
+	bool LoadResource(int id, vector<unsigned char>& buff, const char* type = NULL);
+
 	string GetConfig(const char* entry, const char* value);
 	void SetConfig(const char* entry, const char* value);
 	int GetConfig(const char* entry, int value);
diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc
index 47ccc62aa0..87c0b6164d 100644
--- a/plugins/GSdx/GSdx.rc
+++ b/plugins/GSdx/GSdx.rc
@@ -51,9 +51,11 @@ BEGIN
     "#include ""res/tfx.fx""\r\n"
     "#include ""res/convert.fx""\r\n"
     "#include ""res/interlace.fx""\r\n"
-    "#include ""res/merge.fx""\r\0"
-    "#include ""res/fxaa.fx""\r\0"
-    "#include ""res/shadeboost.fx""\r\0"
+    "#include ""res/merge.fx""\r\n"
+    "#include ""res/fxaa.fx""\r\n"
+    "#include ""res/cs.fx""\r\n"
+    "#include ""res/shadeboost.fx""\r\n"
+    "#include ""res/tfx.cl""\r\n"
 END
 
 #endif    // APSTUDIO_INVOKED
@@ -64,13 +66,14 @@ END
 // RCDATA
 //
 
-IDR_CONVERT_FX          RCDATA                  "res\\convert.fx"
 IDR_TFX_FX              RCDATA                  "res\\tfx.fx"
-IDR_MERGE_FX            RCDATA                  "res\\merge.fx"
+IDR_CONVERT_FX          RCDATA                  "res\\convert.fx"
 IDR_INTERLACE_FX        RCDATA                  "res\\interlace.fx"
+IDR_MERGE_FX            RCDATA                  "res\\merge.fx"
 IDR_FXAA_FX             RCDATA                  "res\\fxaa.fx"
 IDR_CS_FX               RCDATA                  "res\\cs.fx"
 IDR_SHADEBOOST_FX       RCDATA                  "res\\shadeboost.fx"
+IDR_TFX_CL              RCDATA                  "res\\tfx.cl"
 
 /////////////////////////////////////////////////////////////////////////////
 //
@@ -394,6 +397,10 @@ END
 #include "res/convert.fx"
 #include "res/interlace.fx"
 #include "res/merge.fx"
+#include "res/fxaa.fx"
+#include "res/cs.fx"
+#include "res/shadeboost.fx"
+#include "res/tfx.cl"
 
 /////////////////////////////////////////////////////////////////////////////
 #endif    // not APSTUDIO_INVOKED
diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj b/plugins/GSdx/GSdx_vs2013.vcxproj
index f9ee445906..ef165a5839 100644
--- a/plugins/GSdx/GSdx_vs2013.vcxproj
+++ b/plugins/GSdx/GSdx_vs2013.vcxproj
@@ -687,6 +687,7 @@
       <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">AssemblyAndSourceCode</AssemblerOutput>
     </ClCompile>
     <ClCompile Include="GSRenderer.cpp" />
+    <ClCompile Include="GSRendererCL.cpp" />
     <ClCompile Include="GSRendererCS.cpp" />
     <ClCompile Include="GSRendererDX.cpp" />
     <ClCompile Include="GSRendererDX11.cpp" />
@@ -1970,6 +1971,7 @@
     <ClInclude Include="GSPerfMon.h" />
     <ClInclude Include="GSRasterizer.h" />
     <ClInclude Include="GSRenderer.h" />
+    <ClInclude Include="GSRendererCL.h" />
     <ClInclude Include="GSRendererCS.h" />
     <ClInclude Include="GSRendererDX.h" />
     <ClInclude Include="GSRendererDX11.h" />
@@ -2057,6 +2059,7 @@
     <None Include="res\interlace.fx" />
     <None Include="res\merge.fx" />
     <None Include="res\shadeboost.fx" />
+    <None Include="res\tfx.cl" />
     <None Include="res\tfx.fx" />
     <None Include="baseclasses\activex.rcv" />
     <None Include="baseclasses\activex.ver" />
diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj.filters b/plugins/GSdx/GSdx_vs2013.vcxproj.filters
index 1067b76f32..b0053934e9 100644
--- a/plugins/GSdx/GSdx_vs2013.vcxproj.filters
+++ b/plugins/GSdx/GSdx_vs2013.vcxproj.filters
@@ -348,6 +348,9 @@
     <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="GSRendererCL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="GLLoader.h">
@@ -707,6 +710,9 @@
     <ClInclude Include="targetver.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="GSRendererCL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="res\logo10.bmp">
@@ -737,10 +743,13 @@
     <None Include="res\fxaa.fx">
       <Filter>Shaders</Filter>
     </None>
+    <None Include="res\shadeboost.fx">
+      <Filter>Shaders</Filter>
+    </None>
     <None Include="res\cs.fx">
       <Filter>Shaders</Filter>
     </None>
-    <None Include="res\shadeboost.fx">
+    <None Include="res\tfx.cl">
       <Filter>Shaders</Filter>
     </None>
   </ItemGroup>
diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx
index fb63c0b012..c84211ba95 100644
--- a/plugins/GSdx/res/cs.fx
+++ b/plugins/GSdx/res/cs.fx
@@ -1,3 +1,5 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+
 #ifndef VS_TME
 #define VS_TME 1
 #define VS_FST 1
@@ -381,3 +383,5 @@ void ps_main1(GS_OUTPUT input)
 	WritePixel(addr.x, c, PS_FPSM);
 	WritePixel(addr.y, z, PS_ZPSM);
 }
+
+#endif
diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl
new file mode 100644
index 0000000000..d28622380e
--- /dev/null
+++ b/plugins/GSdx/res/tfx.cl
@@ -0,0 +1,1619 @@
+#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency
+
+#ifndef CL_FLT_EPSILON
+#define CL_FLT_EPSILON 1.1920928955078125e-7
+#endif
+
+#if MAX_PRIM_PER_BATCH == 64u
+	#define BIN_TYPE ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+	#define BIN_TYPE uint
+#else
+	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+
+typedef struct
+{
+	union {float4 p; struct {float x, y, z, f;};};
+	union {float4 tc; struct {float s, t, q; uchar4 c;};};
+} gs_vertex;
+
+typedef struct
+{
+	gs_vertex v[4];
+} gs_prim;
+
+typedef struct
+{
+	float4 dx, dy;
+	float4 zero;
+	float4 reject_corner;
+} gs_barycentric;
+
+typedef struct
+{
+	uint batch_counter;
+	uint _pad[7];
+	struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
+	BIN_TYPE bin[MAX_BIN_COUNT];
+	uchar4 bbox[MAX_PRIM_COUNT];
+	gs_prim prim[MAX_PRIM_COUNT];
+	gs_barycentric barycentric[MAX_PRIM_COUNT];
+} gs_env;
+
+typedef struct
+{
+	int4 scissor;
+	int4 bbox;
+	int4 rect;
+	char dimx[4][4];
+	ulong sel;
+	uint fbp, zbp, bw;
+	uint fm, zm;
+	uchar4 fog; // rgb
+	uchar aref, afix;
+	uchar ta0, ta1;
+	uint tbp[7], tbw[7];
+	int minu, maxu, minv, maxv;
+	int lod; // lcm == 1
+	int mxl;
+	float l; // TEX1.L * -0x10000
+	float k; // TEX1.K * 0x10000
+	uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother
+} gs_param;
+
+enum GS_PRIM_CLASS
+{
+	GS_POINT_CLASS,
+	GS_LINE_CLASS,
+	GS_TRIANGLE_CLASS,
+	GS_SPRITE_CLASS
+};
+
+enum GS_PSM_TARGET
+{
+	PSM_PSMCT32,
+	PSM_PSMCT24,
+	PSM_PSMCT16,
+	PSM_PSMCT16S,
+	PSM_PSMZ32,
+	PSM_PSMZ24,
+	PSM_PSMZ16,
+	PSM_PSMZ16S,
+	PSM_PSMT8,
+	PSM_PSMT4,
+	PSM_PSMT8H,
+	PSM_PSMT4HL,
+	PSM_PSMT4HH,
+};
+
+enum GS_TFX
+{
+	TFX_MODULATE	= 0,
+	TFX_DECAL		= 1,
+	TFX_HIGHLIGHT	= 2,
+	TFX_HIGHLIGHT2	= 3,
+	TFX_NONE		= 4,
+};
+
+enum GS_CLAMP
+{
+	CLAMP_REPEAT		= 0,
+	CLAMP_CLAMP			= 1,
+	CLAMP_REGION_CLAMP	= 2,
+	CLAMP_REGION_REPEAT	= 3,
+};
+
+enum GS_ZTST
+{
+	ZTST_NEVER		= 0,
+	ZTST_ALWAYS		= 1,
+	ZTST_GEQUAL		= 2,
+	ZTST_GREATER	= 3,
+};
+
+enum GS_ATST
+{
+	ATST_NEVER		= 0,
+	ATST_ALWAYS		= 1,
+	ATST_LESS		= 2,
+	ATST_LEQUAL		= 3,
+	ATST_EQUAL		= 4,
+	ATST_GEQUAL		= 5,
+	ATST_GREATER	= 6,
+	ATST_NOTEQUAL	= 7,
+};
+
+enum GS_AFAIL
+{
+	AFAIL_KEEP		= 0,
+	AFAIL_FB_ONLY	= 1,
+	AFAIL_ZB_ONLY	= 2,
+	AFAIL_RGB_ONLY	= 3,
+};
+
+__constant uchar blockTable32[4][8] =
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable32Z[4][8] =
+{
+	{ 24, 25, 28, 29,  8,  9, 12, 13},
+	{ 26, 27, 30, 31, 10, 11, 14, 15},
+	{ 16, 17, 20, 21,  0,  1,  4,  5},
+	{ 18, 19, 22, 23,  2,  3,  6,  7}
+};
+
+__constant uchar blockTable16[8][4] =
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar blockTable16S[8][4] =
+{
+	{  0,  2, 16, 18 },
+	{  1,  3, 17, 19 },
+	{  8, 10, 24, 26 },
+	{  9, 11, 25, 27 },
+	{  4,  6, 20, 22 },
+	{  5,  7, 21, 23 },
+	{ 12, 14, 28, 30 },
+	{ 13, 15, 29, 31 }
+};
+
+__constant uchar blockTable16Z[8][4] =
+{
+	{ 24, 26, 16, 18 },
+	{ 25, 27, 17, 19 },
+	{ 28, 30, 20, 22 },
+	{ 29, 31, 21, 23 },
+	{  8, 10,  0,  2 },
+	{  9, 11,  1,  3 },
+	{ 12, 14,  4,  6 },
+	{ 13, 15,  5,  7 }
+};
+
+__constant uchar blockTable16SZ[8][4] =
+{
+	{ 24, 26,  8, 10 },
+	{ 25, 27,  9, 11 },
+	{ 16, 18,  0,  2 },
+	{ 17, 19,  1,  3 },
+	{ 28, 30, 12, 14 },
+	{ 29, 31, 13, 15 },
+	{ 20, 22,  4,  6 },
+	{ 21, 23,  5,  7 }
+};
+
+__constant uchar blockTable8[4][8] =
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable4[8][4] =
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar columnTable32[8][8] =
+{
+	{  0,  1,  4,  5,  8,  9, 12, 13 },
+	{  2,  3,  6,  7, 10, 11, 14, 15 },
+	{ 16, 17, 20, 21, 24, 25, 28, 29 },
+	{ 18, 19, 22, 23, 26, 27, 30, 31 },
+	{ 32, 33, 36, 37, 40, 41, 44, 45 },
+	{ 34, 35, 38, 39, 42, 43, 46, 47 },
+	{ 48, 49, 52, 53, 56, 57, 60, 61 },
+	{ 50, 51, 54, 55, 58, 59, 62, 63 },
+};
+
+__constant uchar columnTable16[8][16] =
+{
+	{   0,   2,   8,  10,  16,  18,  24,  26,
+	    1,   3,   9,  11,  17,  19,  25,  27 },
+	{   4,   6,  12,  14,  20,  22,  28,  30,
+	    5,   7,  13,  15,  21,  23,  29,  31 },
+	{  32,  34,  40,  42,  48,  50,  56,  58,
+	   33,  35,  41,  43,  49,  51,  57,  59 },
+	{  36,  38,  44,  46,  52,  54,  60,  62,
+	   37,  39,  45,  47,  53,  55,  61,  63 },
+	{  64,  66,  72,  74,  80,  82,  88,  90,
+	   65,  67,  73,  75,  81,  83,  89,  91 },
+	{  68,  70,  76,  78,  84,  86,  92,  94,
+	   69,  71,  77,  79,  85,  87,  93,  95 },
+	{  96,  98, 104, 106, 112, 114, 120, 122,
+	   97,  99, 105, 107, 113, 115, 121, 123 },
+	{ 100, 102, 108, 110, 116, 118, 124, 126,
+	  101, 103, 109, 111, 117, 119, 125, 127 },
+};
+
+__constant uchar columnTable8[16][16] =
+{
+	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
+	    2,   6,  18,  22,  34,  38,  50,  54 },
+	{   8,  12,  24,  28,  40,  44,  56,  60,
+	   10,  14,  26,  30,  42,  46,  58,  62 },
+	{  33,  37,  49,  53,   1,   5,  17,  21,
+	   35,  39,  51,  55,   3,   7,  19,  23 },
+	{  41,  45,  57,  61,   9,  13,  25,  29,
+	   43,  47,  59,  63,  11,  15,  27,  31 },
+	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
+	   98, 102, 114, 118,  66,  70,  82,  86 },
+	{ 104, 108, 120, 124,  72,  76,  88,  92,
+	  106, 110, 122, 126,  74,  78,  90,  94 },
+	{  65,  69,  81,  85,  97, 101, 113, 117,
+	   67,  71,  83,  87,  99, 103, 115, 119 },
+	{  73,  77,  89,  93, 105, 109, 121, 125,
+	   75,  79,  91,  95, 107, 111, 123, 127 },
+	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
+	  130, 134, 146, 150, 162, 166, 178, 182 },
+	{ 136, 140, 152, 156, 168, 172, 184, 188,
+	  138, 142, 154, 158, 170, 174, 186, 190 },
+	{ 161, 165, 177, 181, 129, 133, 145, 149,
+	  163, 167, 179, 183, 131, 135, 147, 151 },
+	{ 169, 173, 185, 189, 137, 141, 153, 157,
+	  171, 175, 187, 191, 139, 143, 155, 159 },
+	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
+	  226, 230, 242, 246, 194, 198, 210, 214 },
+	{ 232, 236, 248, 252, 200, 204, 216, 220,
+	  234, 238, 250, 254, 202, 206, 218, 222 },
+	{ 193, 197, 209, 213, 225, 229, 241, 245,
+	  195, 199, 211, 215, 227, 231, 243, 247 },
+	{ 201, 205, 217, 221, 233, 237, 249, 253,
+	  203, 207, 219, 223, 235, 239, 251, 255 },
+};
+
+__constant ushort columnTable4[16][32] =
+{
+	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
+	    2,  10,  34,  42,  66,  74,  98, 106,
+	    4,  12,  36,  44,  68,  76, 100, 108,
+	    6,  14,  38,  46,  70,  78, 102, 110 },
+	{  16,  24,  48,  56,  80,  88, 112, 120,
+	   18,  26,  50,  58,  82,  90, 114, 122,
+	   20,  28,  52,  60,  84,  92, 116, 124,
+	   22,  30,  54,  62,  86,  94, 118, 126 },
+	{  65,  73,  97, 105,   1,   9,  33,  41,
+	   67,  75,  99, 107,   3,  11,  35,  43,
+	   69,  77, 101, 109,   5,  13,  37,  45,
+	   71,  79, 103, 111,   7,  15,  39,  47 },
+	{  81,  89, 113, 121,  17,  25,  49,  57,
+	   83,  91, 115, 123,  19,  27,  51,  59,
+	   85,  93, 117, 125,  21,  29,  53,  61,
+	   87,  95, 119, 127,  23,  31,  55,  63 },
+	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
+	  194, 202, 226, 234, 130, 138, 162, 170,
+	  196, 204, 228, 236, 132, 140, 164, 172,
+	  198, 206, 230, 238, 134, 142, 166, 174 },
+	{ 208, 216, 240, 248, 144, 152, 176, 184,
+	  210, 218, 242, 250, 146, 154, 178, 186,
+	  212, 220, 244, 252, 148, 156, 180, 188,
+	  214, 222, 246, 254, 150, 158, 182, 190 },
+	{ 129, 137, 161, 169, 193, 201, 225, 233,
+	  131, 139, 163, 171, 195, 203, 227, 235,
+	  133, 141, 165, 173, 197, 205, 229, 237,
+	  135, 143, 167, 175, 199, 207, 231, 239 },
+	{ 145, 153, 177, 185, 209, 217, 241, 249,
+	  147, 155, 179, 187, 211, 219, 243, 251,
+	  149, 157, 181, 189, 213, 221, 245, 253,
+	  151, 159, 183, 191, 215, 223, 247, 255 },
+	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
+	  258, 266, 290, 298, 322, 330, 354, 362,
+	  260, 268, 292, 300, 324, 332, 356, 364,
+	  262, 270, 294, 302, 326, 334, 358, 366 },
+	{ 272, 280, 304, 312, 336, 344, 368, 376,
+	  274, 282, 306, 314, 338, 346, 370, 378,
+	  276, 284, 308, 316, 340, 348, 372, 380,
+	  278, 286, 310, 318, 342, 350, 374, 382 },
+	{ 321, 329, 353, 361, 257, 265, 289, 297,
+	  323, 331, 355, 363, 259, 267, 291, 299,
+	  325, 333, 357, 365, 261, 269, 293, 301,
+	  327, 335, 359, 367, 263, 271, 295, 303 },
+	{ 337, 345, 369, 377, 273, 281, 305, 313,
+	  339, 347, 371, 379, 275, 283, 307, 315,
+	  341, 349, 373, 381, 277, 285, 309, 317,
+	  343, 351, 375, 383, 279, 287, 311, 319 },
+	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
+	  450, 458, 482, 490, 386, 394, 418, 426,
+	  452, 460, 484, 492, 388, 396, 420, 428,
+	  454, 462, 486, 494, 390, 398, 422, 430 },
+	{ 464, 472, 496, 504, 400, 408, 432, 440,
+	  466, 474, 498, 506, 402, 410, 434, 442,
+	  468, 476, 500, 508, 404, 412, 436, 444,
+	  470, 478, 502, 510, 406, 414, 438, 446 },
+	{ 385, 393, 417, 425, 449, 457, 481, 489,
+	  387, 395, 419, 427, 451, 459, 483, 491,
+	  389, 397, 421, 429, 453, 461, 485, 493,
+	  391, 399, 423, 431, 455, 463, 487, 495 },
+	{ 401, 409, 433, 441, 465, 473, 497, 505,
+	  403, 411, 435, 443, 467, 475, 499, 507,
+	  405, 413, 437, 445, 469, 477, 501, 509,
+	  407, 415, 439, 447, 471, 479, 503, 511 },
+};
+
+uint BlockNumber32(int x, int y, uint bp, uint bw)
+{
+	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
+}
+
+uint BlockNumber16(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber16S(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber32Z(int x, int y, uint bp, uint bw)
+{
+	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
+}
+
+uint BlockNumber16Z(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber16SZ(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber8(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
+}
+
+uint BlockNumber4(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
+}
+
+uint PixelAddress32(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+}
+
+uint PixelAddress16(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress16S(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress32Z(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+}
+
+uint PixelAddress16Z(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress16SZ(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress8(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15];
+}
+
+uint PixelAddress4(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31];
+}
+
+uint PixelAddress(int x, int y, uint bp, uint bw, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMT8H:
+	case PSM_PSMT4HL:
+	case PSM_PSMT4HH:
+		return PixelAddress32(x, y, bp, bw);
+	case PSM_PSMCT16: 
+		return PixelAddress16(x, y, bp, bw);
+	case PSM_PSMCT16S: 
+		return PixelAddress16S(x, y, bp, bw);
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return PixelAddress32Z(x, y, bp, bw);
+	case PSM_PSMZ16: 
+		return PixelAddress16Z(x, y, bp, bw);
+	case PSM_PSMZ16S: 
+		return PixelAddress16SZ(x, y, bp, bw);
+	case PSM_PSMT8:
+		return PixelAddress8(x, y, bp, bw);
+	case PSM_PSMT4:
+		return PixelAddress4(x, y, bp, bw);
+	}
+}
+
+uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm)
+{
+	// TODO: replace blockTable with a subset tileTable
+
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6];
+	case PSM_PSMCT16: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMCT16S: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6];
+	case PSM_PSMZ16: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMZ16S: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3];
+	}
+}
+
+uint TilePixelAddress(int x, int y, uint ba, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7];
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15];
+	}
+}
+
+uint ReadFrame(__global uchar* vm, uint addr, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return ((__global uint*)vm)[addr];
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		return ((__global ushort*)vm)[addr];
+	}
+}
+
+void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMZ32:
+	case PSM_PSMCT24: 
+	case PSM_PSMZ24: 
+		((__global uint*)vm)[addr] = value; 
+		break;
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		((__global ushort*)vm)[addr] = (ushort)value;
+		break;
+	}
+}
+
+bool is16bit(int psm)
+{
+	return psm < 8 && (psm & 3) >= 2;
+}
+
+bool is24bit(int psm)
+{
+	return psm < 8 && (psm & 3) == 1;
+}
+
+bool is32bit(int psm)
+{
+	return psm < 8 && (psm & 3) == 0;
+}
+
+#ifdef PRIM
+
+int GetVertexPerPrim(int prim_class)
+{
+	switch(prim_class)
+	{
+	default:
+	case GS_POINT_CLASS: return 1;
+	case GS_LINE_CLASS: return 2;
+	case GS_TRIANGLE_CLASS: return 3;
+	case GS_SPRITE_CLASS: return 2;
+	}
+}
+
+#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM)
+
+#endif
+
+#ifdef KERNEL_PRIM
+
+__kernel void KERNEL_PRIM(
+	__global gs_env* env,
+	__global uchar* vb_base, 
+	__global uchar* ib_base, 
+	uint vb_start,
+	uint ib_start)
+{
+	size_t prim_index = get_global_id(0);
+
+	__global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start);
+	__global uint* ib = (__global uint*)(ib_base + ib_start);
+	__global gs_prim* prim = &env->prim[prim_index];
+	
+	ib += prim_index * VERTEX_PER_PRIM;
+
+	int2 pmin, pmax;
+
+	if(PRIM == GS_POINT_CLASS)
+	{
+		pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy);
+	}
+	else if(PRIM == GS_LINE_CLASS)
+	{
+		int2 p0 = convert_int2_rte(vb[ib[0]].p.xy);
+		int2 p1 = convert_int2_rte(vb[ib[1]].p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+	}
+	else if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		__global gs_vertex* v0 = &vb[ib[0]];
+		__global gs_vertex* v1 = &vb[ib[1]];
+		__global gs_vertex* v2 = &vb[ib[2]];
+
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+		int2 p2 = convert_int2_rtp(v2->p.xy);
+
+		pmin = min(min(p0, p1), p2);
+		pmax = max(max(p0, p1), p2);
+
+		prim->v[0].p = v0->p;
+		prim->v[0].tc = v0->tc;
+		prim->v[1].p = v1->p;
+		prim->v[1].tc = v1->tc;
+		prim->v[2].p = v2->p;
+		prim->v[2].tc = v2->tc;
+
+		float4 dp0 = v1->p - v0->p;
+		float4 dp1 = v0->p - v2->p;
+		float4 dp2 = v2->p - v1->p;
+
+		float cp = dp0.x * dp1.y - dp0.y * dp1.x;
+
+		if(cp != 0.0f)
+		{
+			float cp_rcp = 1.0f / cp;// native_recip(cp);
+
+			float2 u = dp0.xy * cp_rcp;
+			float2 v = -dp1.xy * cp_rcp;
+
+			// v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0)
+
+			gs_barycentric b;
+
+			b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x);
+			b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y);
+
+			dp0.xy = dp0.xy * sign(cp);
+			dp1.xy = dp1.xy * sign(cp);
+			dp2.xy = dp2.xy * sign(cp);
+
+			b.zero.x = (dp1.y < 0 || dp1.y == 0 && dp1.x > 0) ? CL_FLT_EPSILON : 0;
+			b.zero.y = (dp0.y < 0 || dp0.y == 0 && dp0.x > 0) ? CL_FLT_EPSILON : 0;
+			b.zero.z = (dp2.y < 0 || dp2.y == 0 && dp2.x > 0) ? CL_FLT_EPSILON : 0;
+
+			// any barycentric(reject_corner) < 0, tile outside the triangle
+
+			b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE;
+			b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE;
+			b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE;
+
+			// TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed
+
+			env->barycentric[prim_index] = b;
+		}
+		else
+		{
+			// TODO: set b.zero to something that always fails the tests
+		}
+	}
+	else if(PRIM == GS_SPRITE_CLASS)
+	{
+		__global gs_vertex* v0 = &vb[ib[0]];
+		__global gs_vertex* v1 = &vb[ib[1]];
+
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+
+		int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0);
+
+		prim->v[0].p = select(v0->p, v1->p, mask); // pmin
+		prim->v[0].tc = select(v0->tc, v1->tc, mask);
+		prim->v[1].p = select(v1->p, v0->p, mask); // pmax
+		prim->v[1].tc = select(v1->tc, v0->tc, mask);
+		prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy);
+	}
+
+	int4 pminmax = (int4)(pmin, pmax);
+
+	env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS);
+}
+
+#endif
+
+#ifdef KERNEL_TILE
+
+int tile_in_triangle(float2 p, gs_barycentric b)
+{
+	float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz;
+
+	f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));
+
+	return all(f >= b.zero.xyz);
+}
+
+#if CLEAR == 1
+
+__kernel void KERNEL_TILE(__global gs_env* env)
+{
+	env->batch_counter = 0;
+	env->bounds[get_global_id(0)].first = -1;
+	env->bounds[get_global_id(0)].last = 0;
+}
+
+#elif MODE < 3
+
+#if MAX_PRIM_PER_BATCH != 32
+	#error "MAX_PRIM_PER_BATCH != 32"
+#endif
+
+#define MAX_PRIM_PER_GROUP (32u >> MODE)
+
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	uint batch_index = get_group_id(2) >> MODE;
+	uint prim_start = get_group_id(2) << (5 - MODE);
+	uint group_prim_index = get_local_id(2);
+	uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+	__global uchar4* bbox = &env->bbox[prim_start];
+	__global gs_barycentric* barycentric = &env->barycentric[prim_start];
+
+	__local uchar4 bbox_cache[MAX_PRIM_PER_GROUP];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP];
+	__local uint visible[8 << MODE];
+
+	if(get_local_id(2) == 0)
+	{
+		visible[bin_index] = 0;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP);
+
+	event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0);
+
+	wait_group_events(1, &e);
+
+	if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	if(group_prim_index < group_prim_count)
+	{
+		int x = bin_dim.x + get_local_id(0);
+		int y = bin_dim.y + get_local_id(1);
+
+		uchar4 r = bbox_cache[group_prim_index];
+
+		uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
+
+		if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+		{
+			test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
+		}
+
+		atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2)));
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_local_id(2) == 0)
+	{
+		#if MODE == 0
+		((__global uint*)&bin[bin_index])[0] = visible[bin_index];
+		#elif MODE == 1
+		((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index];
+		#elif MODE == 2
+		((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index];
+		#endif
+
+		if(visible[bin_index] != 0)
+		{
+			atomic_min(&env->bounds[bin_index].first, batch_index);
+			atomic_max(&env->bounds[bin_index].last, batch_index);
+		}
+	}
+}
+
+#elif MODE == 3
+
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint batch_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
+	__local uint batch_index;
+
+	size_t local_id = get_local_id(0);
+	size_t local_size = get_local_size(0);
+
+	while(1)
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if(local_id == 0)
+		{
+			batch_index = atomic_inc(&env->batch_counter);
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if(batch_index >= batch_count) 
+		{
+			break;
+		}
+
+		uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
+		
+		__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+		__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
+		__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
+
+		event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
+
+		wait_group_events(1, &e);
+
+		if(PRIM == GS_TRIANGLE_CLASS)
+		{
+			e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+			wait_group_events(1, &e);
+		}
+
+		for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
+		{
+			int y = bin_index / bin_dim.z;
+			int x = bin_index - y * bin_dim.z;
+
+			x += bin_dim.x;
+			y += bin_dim.y;
+
+			BIN_TYPE visible = 0;
+
+			for(uint i = 0; i < batch_prim_count; i++)
+			{
+				uchar4 r = bbox_cache[i];
+
+				BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
+
+				if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+				{
+					test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
+				}
+
+				visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
+			}
+
+			bin[bin_index] = visible;
+
+			if(visible != 0)
+			{
+				atomic_min(&env->bounds[bin_index].first, batch_index);
+				atomic_max(&env->bounds[bin_index].last, batch_index);
+			}
+		}
+	}
+}
+
+#endif
+
+#endif
+
+#ifdef KERNEL_TFX
+
+bool ZTest(uint zs, uint zd)
+{ 
+	if(ZTEST)
+	{
+		if(is24bit(ZPSM)) zd &= 0x00ffffff;
+
+		switch(ZTST)
+		{
+		case ZTST_NEVER:
+			return false;
+		case ZTST_ALWAYS:
+			return true;
+		case ZTST_GEQUAL:
+			return zs >= zd;
+		case ZTST_GREATER:
+			return zs > zd;
+		}
+	}
+
+	return true;
+}
+
+bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
+{
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		break;
+	case AFAIL_FB_ONLY:
+		if(!ZWRITE) return true;
+		break;
+	case AFAIL_ZB_ONLY:
+		if(!FWRITE) return true;
+		break;
+	case AFAIL_RGB_ONLY:
+		if(!ZWRITE && is24bit(FPSM)) return true;
+		break;
+	}
+
+	uint pass;
+	
+	switch(ATST)
+	{
+	case ATST_NEVER:
+		pass = false;
+		break;
+	case ATST_ALWAYS:
+		return true;
+	case ATST_LESS:
+		pass = alpha < aref;
+		break;
+	case ATST_LEQUAL:
+		pass = alpha <= aref;
+		break;
+	case ATST_EQUAL:
+		pass = alpha == aref;
+		break;
+	case ATST_GEQUAL:
+		pass = alpha >= aref;
+		break;
+	case ATST_GREATER:
+		pass = alpha > aref;
+		break;
+	case ATST_NOTEQUAL:
+		pass = alpha != aref;
+		break;
+	}
+
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		return pass;
+	case AFAIL_FB_ONLY:
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_ZB_ONLY:
+		*fm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_RGB_ONLY:
+		*fm |= pass ? 0 : 0xff000000;
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	}
+
+	return true;
+}
+
+bool DestAlphaTest(uint fd)
+{
+	if(DATE)
+	{
+		if(DATM)
+		{
+			if(is32bit(FPSM)) return (fd & 0x80000000) != 0;
+			if(is16bit(FPSM)) return (fd & 0x00008000) != 0;
+		}
+		else
+		{
+			if(is32bit(FPSM)) return (fd & 0x80000000) == 0;
+			if(is16bit(FPSM)) return (fd & 0x00008000) == 0;
+		}
+	}
+
+	return true;
+}
+
+int Wrap(int a, int b, int c, int mode)
+{
+	switch(mode)
+	{
+	case CLAMP_REPEAT:
+		return a & b;
+	case CLAMP_CLAMP:
+		return clamp(a, 0, c);
+	case CLAMP_REGION_CLAMP:
+		return clamp(a, b, c);
+	case CLAMP_REGION_REPEAT:
+		return (a & b) | c;
+	}
+}
+
+int4 AlphaBlend(int4 c, int afix, uint fd)
+{
+	if(FWRITE && (ABE || AA1))
+	{
+		int4 cs = c;
+		int4 cd;
+
+		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1)
+		{
+			if(is32bit(FPSM) || is24bit(FPSM))
+			{
+				cd.x = fd & 0xff;
+				cd.y = (fd >> 8) & 0xff;
+				cd.z = (fd >> 16) & 0xff;
+				cd.w = fd >> 24;
+			}
+			else if(is16bit(FPSM))
+			{
+				cd.x = (fd & 0x001f) << 3;
+				cd.y = (fd & 0x03e0) >> 2;
+				cd.z = (fd & 0x7c00) >> 7;
+				cd.w = (fd & 0x8000) >> 8;
+			}
+		}
+
+		if(ABA != ABB)
+		{
+			switch(ABA)
+			{
+			case 0: break; // c.xyz = cs.xyz;
+			case 1: c.xyz = cd.xyz; break;
+			case 2: c.xyz = 0; break;
+			}
+
+			switch(ABB)
+			{
+			case 0: c.xyz -= cs.xyz; break;
+			case 1: c.xyz -= cd.xyz; break;
+			case 2: break;
+			}
+
+			if(!(is24bit(FPSM) && ABC == 1))
+			{
+				int a = 0;
+
+				switch(ABC)
+				{
+				case 0: a = cs.w; break;
+				case 1: a = cd.w; break;
+				case 2: a = afix; break;
+				}
+
+				c.xyz = c.xyz * a >> 7;
+			}
+
+			switch(ABD)
+			{
+			case 0: c.xyz += cs.xyz; break;
+			case 1: c.xyz += cd.xyz; break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			switch(ABD)
+			{
+			case 0: break;
+			case 1: c.xyz = cd.xyz; break;
+			case 2: c.xyz = 0; break;
+			}
+		}
+
+		if(PABE)
+		{
+			c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24));
+		}
+	}
+
+	return c;
+}
+
+uchar4 Expand24To32(uint rgba, uchar ta0)
+{
+	uchar4 c;
+
+	c.x = rgba & 0xff;
+	c.y = (rgba >> 8) & 0xff;
+	c.z = (rgba >> 16) & 0xff;
+	c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0;
+
+	return c;
+}
+
+uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
+{
+	uchar4 c;
+
+	c.x = (rgba & 0x001f) << 3;
+	c.y = (rgba & 0x03e0) >> 2;
+	c.z = (rgba & 0x7c00) >> 7;
+	c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;
+
+	return c;
+}
+
+int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb)
+{
+	uchar4 c;
+
+	uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM);
+
+	__global ushort* vm16 = (__global ushort*)vm;
+	__global uint* vm32 = (__global uint*)vm;
+
+	switch(TPSM)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMZ32:
+		c = ((__global uchar4*)vm)[addr];
+		break;
+	case PSM_PSMCT24: 
+	case PSM_PSMZ24: 
+		c = Expand24To32(vm32[addr], pb->ta0);
+		break;
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		c = Expand16To32(vm16[addr], pb->ta0, pb->ta1);
+		break;
+	case PSM_PSMT8:
+		c = pb->clut[vm[addr]];
+		break;
+	case PSM_PSMT4:
+		c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f];
+		break;
+	case PSM_PSMT8H:
+		c = pb->clut[vm32[addr] >> 24];
+		break;
+	case PSM_PSMT4HL:
+		c = pb->clut[(vm32[addr] >> 24) & 0x0f];
+		break;
+	case PSM_PSMT4HH:
+		c = pb->clut[(vm32[addr] >> 28) & 0x0f];
+		break;
+	}
+
+	//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]);
+
+	return convert_int4(c);
+}
+
+__kernel void KERNEL_TFX(
+	__global gs_env* env,
+	__global uchar* vm,
+	__global uchar* tex,
+	__global uchar* pb_base, 
+	uint pb_start,
+	uint prim_start, 
+	uint prim_count,
+	uint batch_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way
+
+	uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x;
+	uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y;
+	uint bin_index = bin_y * bin_dim.z + bin_x;
+
+	uint batch_first = env->bounds[bin_index].first;
+	uint batch_last = env->bounds[bin_index].last;
+	uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS;
+
+	if(batch_last < batch_first)
+	{
+		return;
+	}
+
+	uint skip;
+	
+	if(batch_start < batch_first)
+	{
+		uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1));
+
+		if(n > prim_count) 
+		{
+			return;
+		}
+
+		skip = 0;
+		prim_count -= n;
+		batch_start = batch_first;
+	}
+	else
+	{
+		skip = prim_start & (MAX_PRIM_PER_BATCH - 1);
+		prim_count += skip;
+	}
+
+	if(batch_start > batch_last) 
+	{
+		return;
+	}
+	
+	prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS);
+
+	//
+
+	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);
+
+	uint x = get_global_id(0);
+	uint y = get_global_id(1);
+
+	int2 pi = (int2)(x, y);
+	float2 pf = convert_float2(pi);
+
+	if(!NOSCISSOR)
+	{
+		int4 scissor = pb->scissor;
+
+		if(!all((pi >= scissor.xy) & (pi < scissor.zw)))
+		{
+			return;
+		}
+	}
+
+	uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
+	uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
+
+	uint fd, zd;
+
+	if(RFB) 
+	{
+		fd = ReadFrame(vm, faddr, FPSM);
+	}
+
+	if(ZTEST)
+	{
+		zd = ReadFrame(vm, zaddr, ZPSM);
+	}
+/*
+	// TODO: lookup top left address of this tile + local offset
+	//
+	// 32bpp: 8x8 block size, 4 blocks, 1024 bytes
+	// 0 1
+	// 2 3
+	// 16bpp: 16x8 block size, 2 blocks, 512 bytes
+	// 0
+	// 1
+	// linear access in memory, this layout is the same for all formats
+
+	__local uint fbn, zbn;
+	__local uchar fb[1024], zb[1024];
+
+	if(get_local_id(0) == 0 && get_local_id(1) == 0)
+	{
+		fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
+		zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint faddr = TilePixelAddress(x, y, fbn, FPSM);
+	uint zaddr = TilePixelAddress(x, y, zbn, ZPSM);
+
+	if(RFB)
+	{
+		event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	if(ZTEST)
+	{
+		event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	// not sure if faster
+*/
+
+	// early destination alpha test
+
+	if(!DestAlphaTest(fd))
+	{
+		return;
+	}
+
+	//
+
+	uint fragments = 0;
+
+	//__local gs_prim p;
+
+	__global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case"
+	__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
+	__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];
+
+	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
+
+	__local gs_prim prim_cache;
+
+	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
+	{
+		while(bin_value != 0)
+		{
+			uint i = clz(bin_value);
+
+			if(prim_index + i >= prim_count)
+			{
+				break;
+			}
+
+			__global gs_prim* prim = &prim_base[prim_index + i];
+			
+			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
+
+			uint2 zf;
+			float4 t;
+			int4 c;
+
+			 // TODO: do not hittest if we know the tile is fully inside the prim
+
+			if(PRIM == GS_POINT_CLASS)
+			{
+				// TODO: distance.x < 0.5f || distance.y < 0.5f
+
+				continue;
+			}
+			else if(PRIM == GS_LINE_CLASS)
+			{
+				// TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f
+
+				continue;
+			}
+			else if(PRIM == GS_TRIANGLE_CLASS)
+			{
+				__global gs_barycentric* b = &barycentric[prim_index + i];
+
+				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
+
+				f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));
+
+				if(!all(f >= b->zero.xyz))
+				{
+					continue;
+				}
+
+				zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y);
+
+				t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
+
+				if(IIP)
+				{
+					float4 c0 = convert_float4(prim->v[0].c);
+					float4 c1 = convert_float4(prim->v[1].c);
+					float4 c2 = convert_float4(prim->v[2].c);
+
+					c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y);
+				}
+				else
+				{
+					c = convert_int4(prim->v[2].c);
+				}
+			}
+			else if(PRIM == GS_SPRITE_CLASS)
+			{
+				int2 tl = convert_int2_rtp(prim->v[0].p.xy);
+				int2 br = convert_int2_rtp(prim->v[1].p.xy);
+
+				if(!all((pi >= tl) & (pi < br)))
+				{
+					continue;
+				}
+
+				zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint
+				
+				t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy);
+				t.z = prim->v[0].tc.z;
+
+				c = convert_int4(prim->v[1].c);
+			}
+
+			// z test
+
+			uint zs = zf.x;
+
+			if(!ZTest(zs, zd))
+			{
+				continue;
+			}
+
+			// sample texture
+
+			int4 ct;
+
+			if(FB && TFX != TFX_NONE)
+			{
+				// TODO
+
+				if(0)//if(MMIN)
+				{
+					// TODO
+				}
+				else
+				{
+					int2 uv;
+
+					if(!FST)
+					{
+						uv = convert_int2(t.xy * (1.0f / t.z));
+
+						if(LTF) uv -= 0x0008;
+					}
+					else
+					{
+						uv = convert_int2(t.xy);
+					}
+
+					int2 uvf = uv & 0x000f;
+
+					int2 uv0 = uv >> 4;
+					int2 uv1 = uv0 + 1;
+
+					uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
+					uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
+					uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
+					uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
+					
+					tex = vm; // TODO: use the texture cache
+
+					int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
+					int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
+					int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
+					int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
+
+					if(LTF)
+					{
+						c00 = ((c01 - c00) * uvf.x >> 4) + c00;
+						c10 = ((c11 - c10) * uvf.x >> 4) + c10;
+						c00 = ((c10 - c00) * uvf.y >> 4) + c00;
+					}
+
+					ct = c00;
+				}
+			}
+
+			// alpha tfx
+
+			if(FB)
+			{
+				if(TCC)
+				{
+					switch(TFX)
+					{
+					case TFX_MODULATE:
+						c.w = clamp(ct.w * c.w >> 7, 0, 0xff);
+						break;
+					case TFX_DECAL:
+						c.w = ct.w;
+						break;
+					case TFX_HIGHLIGHT:
+						c.w = clamp(ct.w + c.w, 0, 0xff);
+						break;
+					case TFX_HIGHLIGHT2:
+						c.w = ct.w;
+						break;
+					}
+				}
+
+				if(AA1)
+				{
+					if(!ABE || c.w == 0x80)
+					{
+						// TODO: c.w = coverage; // coverage 0x80 at 100%
+					}
+				}
+			}
+
+			// read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway)
+
+			uint fm = pb->fm;
+			uint zm = pb->zm;
+
+			// alpha test
+
+			if(!AlphaTest(c.w, pb->aref, &fm, &zm))
+			{
+				continue;
+			}
+
+			// all tests done, we have a new output
+
+			fragments++;
+
+			// write z
+
+			if(ZWRITE)
+			{
+				zd = bitselect(zs, zd, zm);
+			}
+
+			// rgb tfx
+
+			if(FWRITE)
+			{
+				switch(TFX)
+				{
+				case TFX_MODULATE:
+					c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff);
+					break;
+				case TFX_DECAL:
+					c.xyz = ct.xyz;
+					break;
+				case TFX_HIGHLIGHT:
+				case TFX_HIGHLIGHT2:					
+					c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff);
+					break;
+				}
+			}
+
+			// fog
+
+			if(FWRITE && FGE)
+			{
+				int fog = (int)zf.y;
+
+				c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8);				
+			}
+
+			// alpha blend
+
+			c = AlphaBlend(c, pb->afix, fd);
+
+			// write frame
+
+			if(FWRITE)
+			{
+				if(DTHE && is16bit(FPSM))
+				{
+					// TODO: c += pb->dimx[y & 3]
+				}
+
+				c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff);
+				
+				if(FBA && !is24bit(FPSM))
+				{
+					c.w |= 0x80;
+				}
+
+				uint fs;
+
+				if(is32bit(FPSM))
+				{
+					fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is24bit(FPSM))
+				{
+					fs = (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is16bit(FPSM))
+				{
+					fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3);
+				}
+
+				fd = RFB ? bitselect(fs, fd, fm) : fs;
+
+				// dest alpha test for the next loop
+
+				if(!DestAlphaTest(fd))
+				{
+					prim_index = prim_count; // game over
+
+					break;
+				}
+			}
+		}
+
+		bin += bin_count;
+		bin_value = *bin;
+	}
+
+	if(fragments > 0)
+	{
+		// TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place)
+
+		// TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format)
+
+		if(ZWRITE)
+		{
+			WriteFrame(vm, zaddr, ZPSM, zd);
+		}
+
+		if(FWRITE)
+		{
+			WriteFrame(vm, faddr, FPSM, fd);
+			//WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments);
+		}
+	}
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index 02626f8dad..b9bfa19737 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -43,6 +43,7 @@
 #include <d3dx9.h>
 #include <comutil.h>
 #include "../../common/include/comptr.h"
+#include <CL/cl.hpp>
 
 #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
 #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO
diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props
index 71bf73f281..91c58981d0 100644
--- a/plugins/GSdx/vsprops/common.props
+++ b/plugins/GSdx/vsprops/common.props
@@ -8,22 +8,22 @@
   <ItemDefinitionGroup>
     <ClCompile>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <FloatingPointModel>Fast</FloatingPointModel>
       <RuntimeTypeInfo>false</RuntimeTypeInfo>
       <WarningLevel>Level4</WarningLevel>
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
       <DisableSpecificWarnings>4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-      <AdditionalIncludeDirectories>$(DXSDK_DIR)include;$(VTUNE_AMPLIFIER_XE_2013_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(DXSDK_DIR)include;$(INTELOCLSDKROOT)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
     <Link>
-      <AdditionalDependencies>d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;comsuppw.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;opencl.lib;comsuppw.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <DelayLoadDLLs>d3d9.dll;d3dx9_43.dll;d3d11.dll;d3dx11_43.dll;dxgi.dll;opengl32.dll;%(DelayLoadDLLs)</DelayLoadDLLs>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <SubSystem>Windows</SubSystem>
       <RandomizedBaseAddress>false</RandomizedBaseAddress>
-      <AdditionalLibraryDirectories>$(VTUNE_AMPLIFIER_XE_2013_DIR)lib32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(VTUNE_AMPLIFIER_XE_2015_DIR)lib32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
       <Command>.\postBuild.cmd "$(TargetPath)" "$(TargetName)" $(TargetExt) $(PcsxSubsection)</Command>
diff --git a/plugins/GSdx/vsprops/x64.props b/plugins/GSdx/vsprops/x64.props
index 947675c717..ab8253d819 100644
--- a/plugins/GSdx/vsprops/x64.props
+++ b/plugins/GSdx/vsprops/x64.props
@@ -5,7 +5,7 @@
   <PropertyGroup />
   <ItemDefinitionGroup>
     <Link>
-      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x64;$(INTELOCLSDKROOT)lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <ClCompile>
       <PreprocessorDefinitions>_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
diff --git a/plugins/GSdx/vsprops/x86.props b/plugins/GSdx/vsprops/x86.props
index ff76b9535d..56171d784c 100644
--- a/plugins/GSdx/vsprops/x86.props
+++ b/plugins/GSdx/vsprops/x86.props
@@ -5,7 +5,7 @@
   <PropertyGroup />
   <ItemDefinitionGroup>
     <Link>
-      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x86;$(INTELOCLSDKROOT)lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <ClCompile />
     <ClCompile>