diff --git a/plugins/GSdx/GPURenderer.h b/plugins/GSdx/GPURenderer.h
index 1a7b505ced..16568a0da2 100644
--- a/plugins/GSdx/GPURenderer.h
+++ b/plugins/GSdx/GPURenderer.h
@@ -123,13 +123,13 @@ protected:
 		int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
 		Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);
 
-		if (!vertices)
+		if(vertices == NULL)
 		{
 			printf("GSdx: failed to allocate %d bytes for verticles.\n", sizeof(Vertex) * maxcount);
 			throw GSDXError();
 		}
 
-		if (m_vertices != NULL)
+		if(m_vertices != NULL)
 		{
 			memcpy(vertices, m_vertices, sizeof(Vertex) * m_maxcount);
 			_aligned_free(m_vertices);
diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp
index 2f23f0903f..5d91813659 100644
--- a/plugins/GSdx/GS.cpp
+++ b/plugins/GSdx/GS.cpp
@@ -37,6 +37,7 @@
 #include "GSWndDX.h"
 #include "GSWndWGL.h"
 #include "GSRendererCS.h"
+#include "GSRendererCL.h"
 #include "GSSettingsDlg.h"
 
 static HRESULT s_hr = E_FAIL;
@@ -203,6 +204,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
 	}
 
 	GSWnd* wnd[2];
+
 	try
 	{
 		if(s_renderer != renderer)
@@ -216,78 +218,72 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
 			s_gs = NULL;
 		}
 
-		if(renderer == 15)
+		switch(renderer)
 		{
-			#ifdef _WINDOWS
-	
-			dev = new GSDevice11();
-
-			if(dev == NULL)
-			{
-				return -1;
-			}
-
-			delete s_gs;
-
-			s_gs = new GSRendererCS();
-
-			s_renderer = renderer;
-
-			#endif
+		default:
+#ifdef _WINDOWS
+		case 0: case 1: case 2: case 14:
+			dev = new GSDevice9(); 
+			break;
+		case 3: case 4: case 5: case 15:
+			dev = new GSDevice11(); 
+			break;
+#endif
+		case 9: case 10: case 11: case 16:
+			dev = new GSDeviceNull(); 
+			break;
+		case 12: case 13: case 17:
+			dev = new GSDeviceOGL(); 
+			break;
 		}
-		else
+
+		if(dev == NULL)
 		{
-			switch(renderer / 3)
+			return -1;
+		}
+
+		if(s_gs == NULL)
+		{
+			switch(renderer)
 			{
 			default:
-				#ifdef _WINDOWS
-				case 0: dev = new GSDevice9(); break;
-				case 1: dev = new GSDevice11(); break;
-				#endif
-				case 3: dev = new GSDeviceNull(); break;
-				case 4: dev = new GSDeviceOGL(); break;
-			}
-
-			if(dev == NULL)
-			{
-				return -1;
-			}
-
-			if(s_gs == NULL)
-			{
-				switch(renderer % 3)
-				{
-					default:
-					case 0:
-						switch(renderer) 
-						{ 
-							default:
 #ifdef _WINDOWS
-							case 0: s_gs = (GSRenderer*)new GSRendererDX9(); break;
-							case 3: s_gs = (GSRenderer*)new GSRendererDX11(); break;
+			case 0:
+				s_gs = (GSRenderer*)new GSRendererDX9();
+				break;
+			case 3: 
+				s_gs = (GSRenderer*)new GSRendererDX11(); 
+				break;
 #endif
-							case 12: s_gs = (GSRenderer*)new GSRendererOGL(); break;
-						}
-						break;
-					case 1:
-						s_gs = new GSRendererSW(threads);
-						break;
-					case 2:
-						s_gs = new GSRendererNull();
-						break;
-				}
-
-				s_renderer = renderer;
+			case 12: 
+				s_gs = (GSRenderer*)new GSRendererOGL(); 
+				break;
+			case 1: case 4: case 10: case 13:
+				s_gs = new GSRendererSW(threads);
+				break;
+			case 2: case 5: case 11:
+				s_gs = new GSRendererNull();
+				break;
+			case 14: case 15: case 16: case 17:
+				s_gs = new GSRendererCL();
+				break;
 			}
+
+			s_renderer = renderer;
 		}
 
 		if (s_gs->m_wnd == NULL)
 		{
 #ifdef _WINDOWS
-			if (renderer / 3 == 4)
+			switch(renderer)
+			{
+			case 12: case 13: case 17:
 				s_gs->m_wnd = new GSWndWGL();
-			else
+				break;
+			default:
 				s_gs->m_wnd = new GSWndDX();
+				break;
+			}
 #else
 #ifdef ENABLE_GLES
 			wnd[0] = NULL;
@@ -681,8 +677,10 @@ EXPORT_C GSkeyEvent(GSKeyEventData* e)
 {
 	try
 	{
-		if (gsopen_done)
+		if(gsopen_done)
+		{
 			s_gs->KeyEvent(e);
+		}
 	}
 	catch (GSDXRecoverableError)
 	{
@@ -1218,15 +1216,11 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 {
 	::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS);
 
-	FILE* file = fopen("c:\\temp1\\log.txt", "a");
-
-	fprintf(file, "-------------------------\n\n");
+	Console console("GSdx", true);
 
 	if(1)
 	{
-		GSLocalMemory * pMem = new GSLocalMemory();
-		GSLocalMemory& mem(*pMem);
-		
+		GSLocalMemory* mem = new GSLocalMemory();		
 
 		static struct {int psm; const char* name;} s_format[] =
 		{
@@ -1258,7 +1252,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 			int w = 1 << tbw;
 			int h = 1 << tbw;
 
-			fprintf(file, "%d x %d\n\n", w, h);
+			printf("%d x %d\n\n", w, h);
 
 			for(size_t i = 0; i < countof(s_format); i++)
 			{
@@ -1308,7 +1302,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 
 				clock_t start, end;
 
-				_ftprintf(file, _T("[%4s] "), s_format[i].name);
+				printf("[%4s] ", s_format[i].name);
 
 				start = clock();
 
@@ -1317,12 +1311,12 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 					int x = 0;
 					int y = 0;
 
-					(mem.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+					(mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
 				start = clock();
 
@@ -1331,25 +1325,25 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 					int x = 0;
 					int y = 0;
 
-					(mem.*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+					(mem->*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
-				const GSOffset* o = mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+				const GSOffset* o = mem->GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
 
 				start = clock();
 
 				for(int j = 0; j < n; j++)
 				{
-					(mem.*rtx)(o, r, ptr, w * 4, TEXA);
+					(mem->*rtx)(o, r, ptr, w * 4, TEXA);
 				}
 
 				end = clock();
 
-				fprintf(file, "%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+				printf("%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 
 				if(psm.pal > 0)
 				{
@@ -1357,32 +1351,30 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 
 					for(int j = 0; j < n; j++)
 					{
-						(mem.*rtxP)(o, r, ptr, w, TEXA);
+						(mem->*rtxP)(o, r, ptr, w, TEXA);
 					}
 
 					end = clock();
 
-					fprintf(file, "| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
+					printf("| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000));
 				}
 
-				fprintf(file, "\n");
-
-				fflush(file);
+				printf("\n");
 			}
 
-			fprintf(file, "\n");
+			printf("\n");
 		}
 
 		_aligned_free(ptr);
-		delete pMem;
+
+		delete mem;
 	}
 
 	//
 
 	if(0)
 	{
-		GSLocalMemory * pMem2 = new GSLocalMemory();
-		GSLocalMemory& mem2(*pMem2);
+		GSLocalMemory* mem = new GSLocalMemory();
 
 		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
 
@@ -1413,13 +1405,13 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 		int x = 0;
 		int y = 0;
 
-		(mem2.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
-		delete pMem2;
+		(mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+
+		delete mem;
 	}
 
 	//
 
-	fclose(file);
 	PostQuitMessage(0);
 }
 
diff --git a/plugins/GSdx/GSDevice9.h b/plugins/GSdx/GSDevice9.h
index fd32304c70..e7cb160135 100644
--- a/plugins/GSdx/GSDevice9.h
+++ b/plugins/GSdx/GSDevice9.h
@@ -173,7 +173,6 @@ public: // TODO
 	// Shaders...
 
 	hash_map<uint32, GSVertexShader9 > m_vs;
-	D3DXHANDLE m_vs_params;
 	hash_map<uint32, CComPtr<IDirect3DPixelShader9> > m_ps;
 	hash_map<uint32, Direct3DSamplerState9* > m_ps_ss;
 	hash_map<uint32, Direct3DDepthStencilState9* > m_om_dss;
diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp
index 10cb2433f5..757463d134 100644
--- a/plugins/GSdx/GSLocalMemory.cpp
+++ b/plugins/GSdx/GSLocalMemory.cpp
@@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@@ -801,27 +801,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			{
 			case PSM_PSMCT32:
 			case PSM_PSMZ32:
-				ReadColumn32(y, dst, buff, 32);
+				GSBlock::ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 4], 32);
-				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
 			case PSM_PSMZ16:
 			case PSM_PSMZ16S:
-				ReadColumn16(y, dst, buff, 32);
+				GSBlock::ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 2], 32);
-				WriteColumn16<32>(y, dst, buff, 32);
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
-				ReadColumn8(y, dst, buff, 16);
+				GSBlock::ReadColumn8(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
-				ReadColumn4(y, dst, buff, 16);
+				GSBlock::ReadColumn4(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@@ -888,27 +888,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			{
 			case PSM_PSMCT32:
 			case PSM_PSMZ32:
-				ReadColumn32(y, dst, buff, 32);
+				GSBlock::ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 4], 32);
-				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
 			case PSM_PSMZ16:
 			case PSM_PSMZ16S:
-				ReadColumn16(y, dst, buff, 32);
+				GSBlock::ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 2], 32);
-				WriteColumn16<32>(y, dst, buff, 32);
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
-				ReadColumn8(y, dst, buff, 16);
+				GSBlock::ReadColumn8(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
-				ReadColumn4(y, dst, buff, 16);
+				GSBlock::ReadColumn4(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<32>(y, dst, buff, 16);
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@@ -1060,7 +1060,7 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, const uint8* src, int len, GI
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1094,7 +1094,7 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, const uint8* src, int len, GI
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1128,7 +1128,7 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1162,7 +1162,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
 			}
 		}
 
@@ -1196,7 +1196,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw));
+				GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw));
 			}
 		}
 
@@ -1612,7 +1612,7 @@ void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadBlock32(src, dst, dstpitch);
+		GSBlock::ReadBlock32(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1623,7 +1623,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 8, 8, 32)
 		{
-			ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1631,7 +1631,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 8, 8, 32)
 		{
-			ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1643,7 +1643,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 16, 8, 32)
 		{
-			ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1651,7 +1651,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
 	{
 		FOREACH_BLOCK_START(r, 16, 8, 32)
 		{
-			ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
+			GSBlock::ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
 		}
 		FOREACH_BLOCK_END
 	}
@@ -1663,7 +1663,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r
 
 	FOREACH_BLOCK_START(r, 16, 16, 32)
 	{
-		ReadAndExpandBlock8_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock8_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1674,7 +1674,7 @@ void GSLocalMemory::ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r
 
 	FOREACH_BLOCK_START(r, 32, 16, 32)
 	{
-		ReadAndExpandBlock4_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1685,7 +1685,7 @@ void GSLocalMemory::ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock8H_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock8H_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1696,7 +1696,7 @@ void GSLocalMemory::ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1707,7 +1707,7 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i&
 
 	FOREACH_BLOCK_START(r, 8, 8, 32)
 	{
-		ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal);
+		GSBlock::ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1718,7 +1718,7 @@ void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, cons
 {
 	ALIGN_STACK(32);
 
-	ReadBlock32(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock32(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@@ -1727,11 +1727,11 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
 
 	if(TEXA.AEM)
 	{
-		ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 	else
 	{
-		ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 }
 
@@ -1741,11 +1741,11 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons
 
 	if(TEXA.AEM)
 	{
-		ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 	else
 	{
-		ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+		GSBlock::ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
 	}
 }
 
@@ -1753,35 +1753,35 @@ void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
+	GSBlock::ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 ///////////////////
@@ -1870,7 +1870,7 @@ void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 16, 16, 8)
 	{
-		ReadBlock8(src, dst, dstpitch);
+		GSBlock::ReadBlock8(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1879,7 +1879,7 @@ void GSLocalMemory::ReadTexture4P(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 32, 16, 8)
 	{
-		ReadBlock4P(src, dst, dstpitch);
+		GSBlock::ReadBlock4P(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1888,7 +1888,7 @@ void GSLocalMemory::ReadTexture8HP(const GSOffset* RESTRICT o, const GSVector4i&
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock8HP(src, dst, dstpitch);
+		GSBlock::ReadBlock8HP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1897,7 +1897,7 @@ void GSLocalMemory::ReadTexture4HLP(const GSOffset* RESTRICT o, const GSVector4i
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock4HLP(src, dst, dstpitch);
+		GSBlock::ReadBlock4HLP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1906,7 +1906,7 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i
 {
 	FOREACH_BLOCK_START(r, 8, 8, 8)
 	{
-		ReadBlock4HHP(src, dst, dstpitch);
+		GSBlock::ReadBlock4HHP(src, dst, dstpitch);
 	}
 	FOREACH_BLOCK_END
 }
@@ -1915,35 +1915,35 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i
 
 void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ReadBlock8(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock8(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4P(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4P(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
 	ALIGN_STACK(32);
 
-	ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
+	GSBlock::ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
 }
 
 //
diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h
index 67f2cf0c27..ea83bfd53b 100644
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@@ -76,7 +76,7 @@ struct GSPixelOffset4
 	uint32 fbp, zbp, fpsm, zpsm, bw;
 };
 
-class GSLocalMemory : public GSBlock
+class GSLocalMemory : public GSAlignedClass<32>
 {
 public:
 	typedef uint32 (*pixelAddress)(int x, int y, uint32 bp, uint32 bw);
diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp
new file mode 100644
index 0000000000..357a2a3e23
--- /dev/null
+++ b/plugins/GSdx/GSRendererCL.cpp
@@ -0,0 +1,1780 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererCL.h"
+
+#define LOG 0
+
+static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
+
+#define MAX_FRAME_SIZE 2048
+#define MAX_PRIM_COUNT 4096u
+#define MAX_PRIM_PER_BATCH_BITS 5
+#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS)
+#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH)
+#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT)
+#define BIN_SIZE_BITS 4
+#define BIN_SIZE (1u << BIN_SIZE_BITS)
+#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
+#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
+
+#if MAX_PRIM_PER_BATCH == 64u
+#define BIN_TYPE cl_ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+#define BIN_TYPE cl_uint
+#else
+#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+
+#pragma pack(push, 1)
+
+typedef struct
+{
+	GSVertexCL v[4];
+} gs_prim;
+
+typedef struct
+{
+	cl_float4 dx, dy;
+	cl_float4 zero;
+	cl_float4 reject_corner;
+} gs_barycentric;
+
+typedef struct
+{
+	cl_uint batch_counter;
+	cl_uint _pad[7];
+	struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
+	BIN_TYPE bin[MAX_BIN_COUNT];
+	cl_uchar4 bbox[MAX_PRIM_COUNT];
+	gs_prim prim[MAX_PRIM_COUNT];
+	gs_barycentric barycentric[MAX_PRIM_COUNT];
+} gs_env;
+
+#pragma pack(pop)
+
+GSRendererCL::GSRendererCL()
+	: m_vb_count(0)
+{
+	m_nativeres = true; // ignore ini, sw is always native
+
+	//s_dump = 1;
+	//s_save = 1;
+	//s_savez = 1;
+
+	// TODO: m_tc = new GSTextureCacheCL(this);
+
+	memset(m_texture, 0, sizeof(m_texture));
+
+	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
+
+	memset(m_rw_pages, 0, sizeof(m_rw_pages));
+	memset(m_tex_pages, 0, sizeof(m_tex_pages));
+
+	#define InitCVB(P) \
+		m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
+		m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer<P, 0, 1>; \
+		m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer<P, 1, 0>; \
+		m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer<P, 1, 1>; \
+
+	InitCVB(GS_POINT_CLASS);
+	InitCVB(GS_LINE_CLASS);
+	InitCVB(GS_TRIANGLE_CLASS);
+	InitCVB(GS_SPRITE_CLASS);
+
+	m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
+	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
+}
+
+GSRendererCL::~GSRendererCL()
+{
+	// TODO: delete m_tc;
+
+	for(size_t i = 0; i < countof(m_texture); i++)
+	{
+		delete m_texture[i];
+	}
+
+	_aligned_free(m_output);
+}
+
+void GSRendererCL::Reset()
+{
+	Sync(-1);
+
+	// TODO: m_tc->RemoveAll();
+
+	GSRenderer::Reset();
+}
+
+void GSRendererCL::VSync(int field)
+{
+	Sync(0); // IncAge might delete a cached texture in use
+
+	GSRenderer::VSync(field);
+
+	// TODO: m_tc->IncAge();
+
+	//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
+}
+
+void GSRendererCL::ResetDevice()
+{
+	for(size_t i = 0; i < countof(m_texture); i++)
+	{
+		delete m_texture[i];
+
+		m_texture[i] = NULL;
+	}
+}
+
+GSTexture* GSRendererCL::GetOutput(int i)
+{
+	Sync(1);
+
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	int w = DISPFB.FBW * 64;
+	int h = GetFrameRect(i).bottom;
+
+	// TODO: round up bottom
+
+	if(m_dev->ResizeTexture(&m_texture[i], w, h))
+	{
+		static int pitch = 1024 * 4;
+
+		GSVector4i r(0, 0, w, h);
+
+		const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
+
+		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
+
+		m_texture[i]->Update(r, m_output, pitch);
+
+		if(s_dump)
+		{
+			if(s_save && s_n >= s_saven)
+			{
+				m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM));
+			}
+
+			s_n++;
+		}
+	}
+
+	return m_texture[i];
+}
+
+const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f);
+
+template<uint32 primclass, uint32 tme, uint32 fst>
+void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
+{
+	GSVector4i o = (GSVector4i)m_context->XYOFFSET;
+	GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0);
+
+	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
+	{
+		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
+
+		#if _M_SSE >= 0x401
+
+		GSVector4i xyzuvf(src->m[1]);
+
+		GSVector4i xy = xyzuvf.upl16() - o;
+		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
+
+		#else
+
+		uint32 z = src->XYZ.Z;
+
+		GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o;
+		GSVector4i zf = GSVector4i((int)std::min<uint32>(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later
+
+		#endif
+
+		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
+
+		GSVector4 t = GSVector4::zero();
+
+		if(tme)
+		{
+			if(fst)
+			{
+				#if _M_SSE >= 0x401
+
+				t = GSVector4(xyzuvf.uph16());
+					
+				#else
+
+				t = GSVector4(GSVector4i::load(src->UV).upl16());
+
+				#endif
+			}
+			else
+			{
+				t = stcq.xyww() * st_scale;
+			}
+		}
+
+		dst->t = t.insert32<2, 3>(stcq);
+	}
+}
+
+void GSRendererCL::Draw()
+{
+	const GSDrawingContext* context = m_context;
+
+	GSVector4i scissor = GSVector4i(context->scissor.in);
+	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (example: single line 0,0->256,0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
+	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
+
+	GSVector4i rect = bbox.rintersect(scissor);
+
+	if(rect.rempty())
+	{
+		return;
+	}
+
+	if(s_dump)
+	{
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+
+		std::string s;
+
+		if(s_save && s_n >= s_saven && PRIM->TME)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
+
+			m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
+		}
+
+		s_n++;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+
+	try
+	{
+		size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
+		size_t ib_size = m_index.tail * sizeof(uint32);
+		size_t pb_size = sizeof(TFXParameter);
+
+		if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
+		{
+			if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size)
+			{
+				// buffer too small for even one batch, allow twice the size (at least 1 MB)
+
+				Sync(2); // must sync, reallocating the input buffers
+
+				m_cl.Unmap();
+
+				m_cl.vb.size = 0;
+				m_cl.ib.size = 0;
+
+				size_t size = std::max(vb_size * 2, 2u << 20);
+
+				printf("growing vertex/index buffer %d\n", size);
+
+				m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.size = size;
+
+				size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), 1u << 20); // worst case, three times the vertex count
+
+				ASSERT(size >= ib_size);
+
+				if(size < ib_size) size = ib_size; // should not happen
+
+				m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.size = size;
+			}
+			else
+			{
+				Enqueue();
+
+				m_cl.Unmap();
+
+				// make the write queue wait until the rendering queue is ready, it may still use the device buffers
+
+				std::vector<cl::Event> el(1);
+
+				m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]);
+				m_cl.wq->enqueueBarrierWithWaitList(&el, NULL);
+
+				// switch to the other queue/buffer (double buffering)
+
+				m_cl.wqidx = (m_cl.wqidx + 1) & 1;
+				m_cl.wq = &m_cl.queue[m_cl.wqidx];
+			}
+
+			m_cl.vb.head = m_cl.vb.tail = 0;
+			m_cl.ib.head = m_cl.ib.tail = 0;
+			m_cl.pb.head = m_cl.pb.tail = 0;
+
+			m_cl.Map();
+		}
+		else
+		{
+			// only allow batches of the same primclass in Enqueue
+
+			if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass)
+			{
+				Enqueue();
+			}
+		}
+
+		//
+
+		GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
+		uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
+		TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
+
+		pb->scissor = scissor;
+		pb->bbox = bbox;
+		pb->rect = rect;
+
+		(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? 
+
+		if(m_jobs.empty())
+		{
+			memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32));
+
+			m_vb_start = m_cl.vb.tail;
+		}
+		else
+		{
+			// TODO: SIMD
+
+			uint32 vb_count = m_vb_count;
+
+			for(size_t i = 0; i < m_index.tail; i++)
+			{
+				ib[i] = m_index.buff[i] + vb_count;
+			}
+		}
+
+		m_vb_count += m_vertex.next;
+
+		if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail))
+		{
+			return;
+		}
+
+		TFXJob job;
+
+		job.rect.x = rect.x;
+		job.rect.y = rect.y;
+		job.rect.z = rect.z;
+		job.rect.w = rect.w;
+		job.sel = pb->sel;
+		job.ib_start = m_cl.ib.tail;
+		job.ib_count = m_index.tail;
+		job.pb_start = m_cl.pb.tail;
+
+		m_jobs.push_back(job);
+
+		m_cl.vb.tail += vb_size;
+		m_cl.ib.tail += ib_size;
+		m_cl.pb.tail += pb_size;
+
+		// mark pages for writing
+
+		if(pb->sel.fb)
+		{
+			uint8 flag = pb->sel.fb;
+
+			const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages);
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				m_rw_pages[*p] |= flag;
+			}
+		}
+
+		if(pb->sel.zb)
+		{
+			uint8 flag = pb->sel.zb;
+
+			const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages);
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				m_rw_pages[*p] |= flag;
+			}
+		}
+
+		// don't buffer too much data, feed them to the device if there is enough
+
+		if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64)
+		{
+			Enqueue();
+		}
+
+		/*
+		// check if the texture is not part of a target currently in use
+
+		if(CheckSourcePages(data))
+		{
+		Sync(4);
+		}
+
+		// addref source and target pages
+
+		data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
+		*/
+
+		// update previously invalidated parts
+
+		//data->UpdateSource();
+		/*
+		if(LOG)
+		{
+		fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
+		sd->counter,
+		m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
+		m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
+		PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
+		PRIM->PRIM, sd->vertex_count, sd->index_count);
+
+		fflush(s_fp);
+		}
+		*/
+
+		//printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w);
+
+		/*
+		// invalidate new parts rendered onto
+
+		if(sd->global.sel.fwrite)
+		{
+			m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm);
+		}
+
+		if(sd->global.sel.zwrite)
+		{
+			m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm);
+		}
+		*/
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+
+		return;
+	}
+	catch(std::exception err)
+	{
+		printf("%s\n", err.what());
+
+		return;
+	}
+
+	if(s_dump)
+	{
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+		
+		std::string s;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+}
+
+void GSRendererCL::Sync(int reason)
+{
+	//printf("sync %d\n", reason);
+
+	GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
+
+	Enqueue();
+
+	m_cl.queue[2].finish();
+
+	memset(m_rw_pages, 0, sizeof(m_rw_pages));
+	memset(m_tex_pages, 0, sizeof(m_tex_pages));	
+
+	// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, 
+	// though it does not seem to be necessary even with GPU devices where it might be cached, 
+	// needs more testing...
+
+	//void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+	//m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr);
+}
+
+void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+	
+	GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
+
+	o->GetPages(r, m_tmp_pages);
+
+	//if(!synced)
+	{
+		for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+		{
+			if(m_rw_pages[*p] & 3) // rw
+			{
+				Sync(3);
+
+				break;
+			}
+		}
+	}
+
+	for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+	{
+		m_tex_pages[*p] = 1;
+	}
+}
+
+void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+	
+	//if(!synced)
+	{
+		GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
+
+		o->GetPages(r, m_tmp_pages);
+
+		for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
+		{
+			if(m_rw_pages[*p] & 1) // w
+			{
+				Sync(4);
+
+				break;
+			}
+		}
+	}
+}
+/*
+bool GSRendererCL::CheckSourcePages(RasterizerData* data)
+{
+	// TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync
+	{
+		for(size_t i = 0; data->m_tex[i].t != NULL; i++)
+		{
+			data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages);
+
+			uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n;
+
+			for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+			{
+				// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
+
+				if(m_fzb_pages[*p]) // currently being drawn to? => sync
+				{
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
+*/
+
+//#include "GSTextureCL.h"
+
+void GSRendererCL::Enqueue()
+{
+	if(m_jobs.empty()) return;
+
+	try
+	{
+		ASSERT(m_cl.vb.tail > m_cl.vb.head);
+		ASSERT(m_cl.ib.tail > m_cl.ib.head);
+		ASSERT(m_cl.pb.tail > m_cl.pb.head);
+
+		int primclass = m_jobs.front().sel.prim;
+
+		uint32 n;
+
+		switch(primclass)
+		{
+		case GS_POINT_CLASS: n = 1; break;
+		case GS_LINE_CLASS: n = 2; break;
+		case GS_TRIANGLE_CLASS: n = 3; break;
+		case GS_SPRITE_CLASS: n = 2; break;
+		default: __assume(0);
+		}
+
+		PrimSelector psel;
+
+		psel.key = 0;
+		psel.prim = primclass;
+
+		cl::Kernel& pk = m_cl.GetPrimKernel(psel);
+
+		pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]);
+		pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]);
+
+		TileSelector tsel;
+
+		tsel.key = 0;
+		tsel.prim = primclass;
+
+		tsel.mode = 0;
+
+		cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 1;
+
+		cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 2;
+
+		cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 3;
+
+		cl::Kernel& tk = m_cl.GetTileKernel(tsel);
+
+		tsel.key = 0;
+		tsel.clear = 1;
+
+		cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel);
+
+		//
+
+		m_cl.Unmap();
+
+		std::vector<cl::Event> el2(1);
+
+		m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]);
+		m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL);
+
+		//
+
+		cl_kernel tfx_prev = NULL;
+
+		auto head = m_jobs.begin();
+
+		while(head != m_jobs.end())
+		{
+			uint32 total_prim_count = 0;
+
+			auto next = head;
+
+			while(next != m_jobs.end())
+			{
+				auto job = next++;
+
+				uint32 cur_prim_count = job->ib_count / n;
+				uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0;
+
+				total_prim_count += cur_prim_count;
+
+				if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2)
+				{
+					uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);					
+
+					pk.setArg(3, (cl_uint)m_vb_start);
+					pk.setArg(4, (cl_uint)head->ib_start);
+
+					m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
+
+					if(0)
+					{
+						gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+						m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+					}
+
+					GSVector4i rect = GSVector4i::zero();
+
+					for(auto i = head; i != next; i++)
+					{
+						rect = rect.runion(GSVector4i::load<false>(&i->rect));
+					}
+
+					rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
+
+					int bin_w = rect.width();
+					int bin_h = rect.height();
+
+					uint32 batch_count = BATCH_COUNT(prim_count);
+					uint32 bin_count = bin_w * bin_h;
+
+					cl_uchar4 bin_dim;
+
+					bin_dim.s[0] = (cl_uchar)rect.x;
+					bin_dim.s[1] = (cl_uchar)rect.y;
+					bin_dim.s[2] = (cl_uchar)bin_w;
+					bin_dim.s[3] = (cl_uchar)bin_h;
+
+					if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area
+					{
+						m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange);
+
+						if(bin_count <= 32 && m_cl.WIs >= 256)
+						{
+							uint32 item_count;
+							uint32 group_count;
+							cl::Kernel* k;
+
+							if(bin_count <= 8)
+							{
+								item_count = std::min(prim_count, 32u);
+								group_count = ((prim_count + 31) >> 5) * item_count;
+								k = &tk_32;
+							}
+							else if(bin_count <= 16)
+							{
+								item_count = std::min(prim_count, 16u);
+								group_count = ((prim_count + 15) >> 4) * item_count;
+								k = &tk_16;
+							}
+							else
+							{
+								item_count = std::min(prim_count, 8u);
+								group_count = ((prim_count + 7) >> 3) * item_count;
+								k = &tk_8;
+							}
+
+							k->setArg(1, (cl_uint)prim_count);
+							k->setArg(2, (cl_uint)bin_count);
+							k->setArg(3, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count));
+						}
+						else
+						{
+							uint32 item_count = std::min(bin_count, m_cl.WIs);
+							uint32 group_count = batch_count * item_count;
+
+							tk.setArg(1, (cl_uint)prim_count);
+							tk.setArg(2, (cl_uint)batch_count);
+							tk.setArg(3, (cl_uint)bin_count);
+							tk.setArg(4, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
+						}
+
+						if(0)
+						{
+							gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+							m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+						}
+					}
+
+					//
+
+					uint32 prim_start = 0;
+
+					for(auto i = head; i != next; i++)
+					{
+						ASSERT(prim_start < MAX_PRIM_COUNT);
+
+						uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start);
+
+						// TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text),
+						// changed by tfx in the previous loop or marked by InvalidateVideoMem
+
+						// TODO: tile level z test
+
+						cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
+
+						if(tfx_prev != tfx())
+						{
+							tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
+
+							tfx_prev = tfx();
+						}
+
+						tfx.setArg(4, (cl_uint)i->pb_start);
+						tfx.setArg(5, (cl_uint)prim_start);
+						tfx.setArg(6, (cl_uint)prim_count_inner);
+						tfx.setArg(7, (cl_uint)batch_count);
+						tfx.setArg(8, (cl_uint)bin_count);
+						tfx.setArg(9, bin_dim);
+
+						//m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256));
+
+						//printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y);
+
+						GSVector4i r = GSVector4i::load<false>(&i->rect);
+
+						r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
+						/*
+						if(i->sel.IsSolidRect()) // TODO: simple mem fill
+							;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
+						else
+						*/
+						m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
+
+						// TODO: invalidate texture cache pages
+
+						prim_start += prim_count_inner;
+					}
+
+					//
+
+					if(total_prim_count > MAX_PRIM_COUNT)
+					{
+						prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
+
+						job->ib_start += prim_count * n * sizeof(uint32);
+						job->ib_count -= prim_count * n;
+
+						next = job; // try again for the reminder
+					}
+
+					break;
+				}
+			}
+
+			head = next;
+		}
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+	}
+
+	m_jobs.clear();
+
+	m_vb_count = 0;
+
+	m_cl.vb.head = m_cl.vb.tail;
+	m_cl.ib.head = m_cl.ib.tail;
+	m_cl.pb.head = m_cl.pb.tail;
+
+	m_cl.Map();
+}
+
+static int RemapPSM(int psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: psm = 0; break;
+	case PSM_PSMCT24: psm = 1; break;
+	case PSM_PSMCT16: psm = 2; break;
+	case PSM_PSMCT16S: psm = 3; break;
+	case PSM_PSMZ32: psm = 4; break;
+	case PSM_PSMZ24: psm = 5; break;
+	case PSM_PSMZ16: psm = 6; break;
+	case PSM_PSMZ16S: psm = 7; break;
+	case PSM_PSMT8: psm = 8; break;
+	case PSM_PSMT4: psm = 9; break;
+	case PSM_PSMT8H: psm = 10; break;
+	case PSM_PSMT4HL: psm = 11; break;
+	case PSM_PSMT4HH: psm = 12; break;
+	}
+
+	return psm;
+}
+
+bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
+{
+	const GSDrawingEnvironment& env = m_env;
+	const GSDrawingContext* context = m_context;
+	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
+
+	pb->sel.key = 0;
+
+	pb->sel.atst = ATST_ALWAYS;
+	pb->sel.tfx = TFX_NONE;
+	pb->sel.ababcd = 0xff;
+	pb->sel.prim = primclass;
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
+
+	if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
+	{
+		fm = 0xffffffff;
+		zm = 0xffffffff;
+	}
+
+	if(PRIM->TME)
+	{
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+	}
+
+	if(context->TEST.ATE)
+	{
+		if(!TryAlphaTest(fm, zm))
+		{
+			pb->sel.atst = context->TEST.ATST;
+			pb->sel.afail = context->TEST.AFAIL;
+			pb->aref = context->TEST.AREF;
+		}
+	}
+
+	bool fwrite;
+	bool zwrite;
+	
+	switch(context->FRAME.PSM)
+	{
+	default:
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+		fwrite = fm != 0xffffffff;
+		break;
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		fwrite = (fm & 0x00ffffff) != 0x00ffffff;
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8;
+		break;
+	}
+
+	switch(context->ZBUF.PSM)
+	{
+	default:
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+		zwrite = zm != 0xffffffff;
+		break;
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		zwrite = (zm & 0x00ffffff) != 0x00ffffff;
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		zm &= 0x80f8f8f8;
+		zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8;
+		break;
+	}
+
+	if(!fwrite && !zwrite) return false;
+
+	bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
+
+	pb->sel.fwrite = fwrite;
+	pb->sel.ftest = ftest;
+	pb->sel.zwrite = zwrite;
+	pb->sel.ztest = ztest;
+
+	if(fwrite || ftest)
+	{
+		pb->sel.fpsm = RemapPSM(context->FRAME.PSM);
+
+		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
+		{
+			pb->sel.iip = PRIM->IIP;
+		}
+
+		if(PRIM->TME)
+		{
+			pb->sel.tfx = context->TEX0.TFX;
+			pb->sel.tcc = context->TEX0.TCC;
+			pb->sel.fst = PRIM->FST;
+			pb->sel.ltf = m_vt.IsLinear();
+			pb->sel.tpsm = RemapPSM(context->TEX0.PSM);
+			pb->sel.aem = m_env.TEXA.AEM;
+
+			pb->tbp[0] = context->TEX0.TBP0;
+			pb->tbw[0] = context->TEX0.TBW;
+			pb->ta0 = m_env.TEXA.TA0;
+			pb->ta1 = m_env.TEXA.TA1;
+
+			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+			{
+				pb->sel.tlu = 1;
+
+				memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
+			}
+
+			pb->sel.wms = context->CLAMP.WMS;
+			pb->sel.wmt = context->CLAMP.WMT;
+
+			if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			{
+				// modulate does not do anything when vertex color is 0x80
+
+				pb->sel.tfx = TFX_DECAL;
+			}
+
+			// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
+
+			// TODO: if(t == NULL) {ASSERT(0); return false;}
+
+			GSVector4i r;
+
+			GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
+
+			// TODO: data->SetSource(t, r, 0);
+
+			// TODO: pb->sel.tw = t->m_tw - 3;
+
+			// TODO: store r to current job
+
+			if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
+			{
+				// TEX1.MMIN
+				// 000 p
+				// 001 l
+				// 010 p round
+				// 011 p tri
+				// 100 l round
+				// 101 l tri
+
+				if(m_vt.m_lod.x > 0)
+				{
+					pb->sel.ltf = context->TEX1.MMIN >> 2;
+				}
+				else
+				{
+					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
+				}
+
+				pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				pb->sel.lcm = context->TEX1.LCM;
+
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
+				int k = context->TEX1.K << 12;
+
+				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
+				{
+					k = (int)m_vt.m_lod.x << 16; // set lod to max level
+
+					pb->sel.lcm = 1; // lod is constant
+					pb->sel.mmin = 1; // tri-linear is meaningless
+				}
+
+				if(pb->sel.mmin == 2)
+				{
+					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
+				}
+
+				if(pb->sel.fst)
+				{
+					ASSERT(pb->sel.lcm == 1);
+					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
+
+					pb->sel.lcm = 1;
+				}
+
+				if(pb->sel.lcm)
+				{
+					int lod = std::max<int>(std::min<int>(k, mxl), 0);
+
+					if(pb->sel.mmin == 1)
+					{
+						lod = (lod + 0x8000) & 0xffff0000; // rounding
+					}
+
+					pb->lod = lod;
+
+					// TODO: lot to optimize when lod is constant
+				}
+				else
+				{
+					pb->mxl = mxl;
+					pb->l = (float)(-0x10000 << context->TEX1.L);
+					pb->k = (float)k;
+				}
+
+				GIFRegTEX0 MIP_TEX0 = context->TEX0;
+				GIFRegCLAMP MIP_CLAMP = context->CLAMP;
+
+				GSVector4 tmin = m_vt.m_min.t;
+				GSVector4 tmax = m_vt.m_max.t;
+
+				static int s_counter = 0;
+
+				for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++)
+				{
+					switch(i)
+					{
+					case 1:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP1;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW1;
+						break;
+					case 2:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP2;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW2;
+						break;
+					case 3:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP3;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW3;
+						break;
+					case 4:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP4;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW4;
+						break;
+					case 5:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP5;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW5;
+						break;
+					case 6:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP6;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW6;
+						break;
+					default:
+						__assume(0);
+					}
+
+					pb->tbp[i] = MIP_TEX0.TBP0;
+					pb->tbw[i] = MIP_TEX0.TBW;
+
+					if(MIP_TEX0.TW > 0) MIP_TEX0.TW--;
+					if(MIP_TEX0.TH > 0) MIP_TEX0.TH--;
+
+					MIP_CLAMP.MINU >>= 1;
+					MIP_CLAMP.MINV >>= 1;
+					MIP_CLAMP.MAXU >>= 1;
+					MIP_CLAMP.MAXV >>= 1;
+
+					m_vt.m_min.t *= 0.5f;
+					m_vt.m_max.t *= 0.5f;
+
+					// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3);
+
+					// TODO: if(t == NULL) {ASSERT(0); return false;}
+
+					GSVector4i r;
+
+					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
+
+					// TODO: data->SetSource(t, r, i);
+
+					// TODO: store r to current job
+				}
+
+				s_counter++;
+
+				m_vt.m_min.t = tmin;
+				m_vt.m_max.t = tmax;
+			}
+			else
+			{
+				if(pb->sel.fst == 0)
+				{
+					// skip per pixel division if q is constant
+
+					GSVertexCL* RESTRICT v = vertex;
+
+					if(m_vt.m_eq.q)
+					{
+						pb->sel.fst = 1;
+
+						const GSVector4& t = v[index[0]].t;
+
+						if(t.z != 1.0f)
+						{
+							GSVector4 w = t.zzzz().rcpnr();
+
+							for(int i = 0, j = vertex_count; i < j; i++)
+							{
+								GSVector4 t = v[i].t;
+
+								v[i].t = (t * w).xyzw(t);
+							}
+						}
+					}
+					else if(primclass == GS_SPRITE_CLASS)
+					{
+						pb->sel.fst = 1;
+
+						for(int i = 0, j = vertex_count; i < j; i += 2)
+						{
+							GSVector4 t0 = v[i + 0].t;
+							GSVector4 t1 = v[i + 1].t;
+
+							GSVector4 w = t1.zzzz().rcpnr();
+
+							v[i + 0].t = (t0 * w).xyzw(t0);
+							v[i + 1].t = (t1 * w).xyzw(t1);
+						}
+					}
+				}
+
+				if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel?
+				{
+					// if q is constant we can do the half pel shift for bilinear sampling on the vertices
+
+					// TODO: but not when mipmapping is used!!!
+
+					GSVector4 half(8.0f, 8.0f);
+
+					GSVertexCL* RESTRICT v = vertex;
+
+					for(int i = 0, j = vertex_count; i < j; i++)
+					{
+						GSVector4 t = v[i].t;
+
+						v[i].t = (t - half).xyzw(t);
+					}
+				}
+			}
+
+			int tw = 1 << context->TEX0.TW;
+			int th = 1 << context->TEX0.TH;
+
+			switch(context->CLAMP.WMS)
+			{
+			case CLAMP_REPEAT:
+				pb->minu = tw - 1;
+				pb->maxu = 0;
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minu = 0;
+				pb->maxu = tw - 1;
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minu = std::min((int)context->CLAMP.MINU, tw - 1);
+				pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1);
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minu = (int)context->CLAMP.MINU & (tw - 1);
+				pb->maxu = (int)context->CLAMP.MAXU & (tw - 1);
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+
+			switch(context->CLAMP.WMT)
+			{
+			case CLAMP_REPEAT:
+				pb->minv = th - 1;
+				pb->maxv = 0;
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minv = 0;
+				pb->maxv = th - 1;
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minv = std::min((int)context->CLAMP.MINV, th - 1);
+				pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127
+				pb->maxv = (int)context->CLAMP.MAXV & (th - 1);
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+		}
+
+		if(PRIM->FGE)
+		{
+			pb->sel.fge = 1;
+			pb->fog = env.FOGCOL.u32[0];
+		}
+
+		if(context->FRAME.PSM != PSM_PSMCT24)
+		{
+			pb->sel.date = context->TEST.DATE;
+			pb->sel.datm = context->TEST.DATM;
+		}
+
+		if(!IsOpaque())
+		{
+			pb->sel.abe = PRIM->ABE;
+			pb->sel.ababcd = context->ALPHA.u32[0];
+
+			if(env.PABE.PABE)
+			{
+				pb->sel.pabe = 1;
+			}
+
+			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
+			{
+				pb->sel.aa1 = 1;
+			}
+
+			pb->afix = context->ALPHA.FIX;
+		}
+
+		if(pb->sel.date
+		|| pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1
+		|| pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
+		|| (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0
+		|| (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask
+		|| (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0)
+		{
+			pb->sel.rfb = 1;
+		}
+
+		pb->sel.colclamp = env.COLCLAMP.CLAMP;
+		pb->sel.fba = context->FBA.FBA;
+
+		if(env.DTHE.DTHE)
+		{
+			pb->sel.dthe = 1;
+
+			GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
+			GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
+			GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16);
+			GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16);
+
+			pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3));
+		}
+	}
+
+	if(zwrite || ztest)
+	{
+		pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
+		pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+		pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
+	}
+
+	pb->fm = fm;
+	pb->zm = zm;
+
+	if((pb->sel.fpsm & 3) == 1)
+	{
+		pb->fm |= 0xff000000;
+	}
+	else if((pb->sel.fpsm & 3) >= 2)
+	{
+		uint32 rb = pb->fm & 0x00f800f8;
+		uint32 ga = pb->fm & 0x8000f800;
+
+		pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
+	}
+
+	if((pb->sel.zpsm & 3) == 1)
+	{
+		pb->zm |= 0xff000000;
+	}
+	else if((pb->sel.zpsm & 3) >= 2)
+	{
+		pb->zm |= 0xffff0000;
+	}
+
+	if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor)))
+	{
+		pb->sel.noscissor = 1;
+	}
+
+	pb->fbp = context->FRAME.Block();
+	pb->zbp = context->ZBUF.Block();
+	pb->bw = context->FRAME.FBW;
+
+	return true;
+}
+
+//////////
+
+//#define IOCL_DEBUG
+
+GSRendererCL::CL::CL()
+{
+	WIs = INT_MAX;
+
+	std::vector<cl::Platform> platforms;
+
+	cl::Platform::get(&platforms);
+
+	for(auto& p : platforms)
+	{
+		std::string platform_vendor = p.getInfo<CL_PLATFORM_VENDOR>();
+
+		std::vector<cl::Device> ds;
+
+		p.getDevices(CL_DEVICE_TYPE_ALL, &ds);
+
+		for(auto& device : ds)
+		{
+			std::string vendor = device.getInfo<CL_DEVICE_VENDOR>();
+			std::string name = device.getInfo<CL_DEVICE_NAME>();
+			std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
+
+			printf("%s %s %s", vendor.c_str(), name.c_str(), version.c_str());
+
+			cl_device_type type = device.getInfo<CL_DEVICE_TYPE>();
+
+			switch(type)
+			{
+			case CL_DEVICE_TYPE_GPU: printf(" GPU"); break;
+			case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
+			}
+
+			if(strstr(version.c_str(), "OpenCL C 1.2") != NULL)
+			{
+#ifdef IOCL_DEBUG
+				if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
+#else
+				//if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
+				if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL)
+#endif
+				{
+					devices.push_back(device);
+
+					WIs = std::min(WIs, device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+
+					printf(" *");
+				}
+			}
+
+			printf("\n");
+		}
+
+		if(!devices.empty()) break;
+	}
+
+	if(devices.empty())
+	{
+		throw new std::exception("OpenCL device not found");
+	}
+
+	context = cl::Context(devices);
+
+	queue[0] = cl::CommandQueue(context);
+	queue[1] = cl::CommandQueue(context);
+	queue[2] = cl::CommandQueue(context);
+
+	vector<unsigned char> buff;
+
+	if(theApp.LoadResource(IDR_TFX_CL, buff))
+	{
+		kernel_str = std::string((const char*)buff.data(), buff.size());
+	}
+
+	vb.head = vb.tail = vb.size = 0;
+	ib.head = ib.tail = ib.size = 0;
+	pb.head = pb.tail = pb.size = 0;
+
+	vb.mapped_ptr = vb.ptr = NULL;
+	ib.mapped_ptr = ib.ptr = NULL;
+	pb.mapped_ptr = pb.ptr = NULL;
+
+	pb.size = sizeof(TFXParameter) * 256;
+	pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
+	pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
+
+	env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env));
+
+	wqidx = 0;
+	wq = &queue[0];
+}
+
+GSRendererCL::CL::~CL()
+{
+	Unmap();
+}
+
+void GSRendererCL::CL::Map()
+{
+	Unmap();
+
+	if(vb.head < vb.size)
+	{
+		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head);
+		vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
+		ASSERT(((size_t)vb.ptr & 15) == 0);
+		ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
+	}
+
+	if(ib.head < ib.size)
+	{
+		ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head);
+		ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
+		ASSERT(((size_t)ib.ptr & 15) == 0);
+	}
+
+	if(pb.head < pb.size)
+	{
+		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head);
+		pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
+		ASSERT(((size_t)pb.ptr & 15) == 0);
+		ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
+	}
+}
+
+void GSRendererCL::CL::Unmap()
+{
+	if(vb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr);
+	if(ib.mapped_ptr != NULL) wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr);
+	if(pb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr);
+
+	vb.mapped_ptr = vb.ptr = NULL;
+	ib.mapped_ptr = ib.ptr = NULL;
+	pb.mapped_ptr = pb.ptr = NULL;
+}
+
+static void AddDefs(ostringstream& opt)
+{
+	opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
+	opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
+	opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
+	opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u ";
+	opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u ";
+	opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " ";
+	opt << "-D BIN_SIZE=" << BIN_SIZE << "u ";
+	opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u ";	
+	opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
+#ifdef IOCL_DEBUG
+	opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
+#endif
+}
+
+cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
+{
+	auto i = prim_map.find(sel);
+
+	if(i != prim_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "prim_%02x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_PRIM=" << entry << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	prim_map[sel] = k;
+
+	k.setArg(0, env);
+
+	return prim_map[sel];
+}
+
+cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel)
+{
+	auto i = tile_map.find(sel);
+
+	if(i != tile_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "tile_%02x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_TILE=" << entry << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+		opt << "-D MODE=" << sel.mode << " ";
+		opt << "-D CLEAR=" << sel.clear << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	tile_map[sel] = k;
+
+	k.setArg(0, env);
+
+	return tile_map[sel];
+}
+
+cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
+{
+	auto i = tfx_map.find(sel);
+
+	if(i != tfx_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	sprintf(entry, "tfx_%016x", sel);
+
+	cl::Program program = cl::Program(context, kernel_str);
+
+	try
+	{
+		ostringstream opt;
+
+		opt << "-D KERNEL_TFX=" << entry << " ";
+		opt << "-D FPSM=" << sel.fpsm << " ";
+		opt << "-D ZPSM=" << sel.zpsm << " ";
+		opt << "-D ZTST=" << sel.ztst << " ";
+		opt << "-D ATST=" << sel.atst << " ";
+		opt << "-D AFAIL=" << sel.afail << " ";
+		opt << "-D IIP=" << sel.iip << " ";
+		opt << "-D TFX=" << sel.tfx << " ";
+		opt << "-D TCC=" << sel.tcc << " ";
+		opt << "-D FST=" << sel.fst << " ";
+		opt << "-D LTF=" << sel.ltf << " ";
+		opt << "-D TLU=" << sel.tlu << " ";
+		opt << "-D FGE=" << sel.fge << " ";
+		opt << "-D DATE=" << sel.date << " ";
+		opt << "-D ABE=" << sel.abe << " ";
+		opt << "-D ABA=" << sel.aba << " ";
+		opt << "-D ABB=" << sel.abb << " ";
+		opt << "-D ABC=" << sel.abc << " ";
+		opt << "-D ABD=" << sel.abd << " ";
+		opt << "-D PABE=" << sel.pabe << " ";
+		opt << "-D AA1=" << sel.aa1 << " ";
+		opt << "-D FWRITE=" << sel.fwrite << " ";
+		opt << "-D FTEST=" << sel.ftest << " ";
+		opt << "-D RFB=" << sel.rfb << " ";
+		opt << "-D ZWRITE=" << sel.zwrite << " ";
+		opt << "-D ZTEST=" << sel.ztest << " ";
+		opt << "-D ZOVERFLOW=" << sel.zoverflow << " ";
+		opt << "-D WMS=" << sel.wms << " ";
+		opt << "-D WMT=" << sel.wmt << " ";
+		opt << "-D DATM=" << sel.datm << " ";
+		opt << "-D COLCLAMP=" << sel.colclamp << " ";
+		opt << "-D FBA=" << sel.fba << " ";
+		opt << "-D DTHE=" << sel.dthe << " ";
+		opt << "-D PRIM=" << sel.prim << " ";
+		opt << "-D TW=" << sel.tw << " ";
+		opt << "-D LCM=" << sel.lcm << " ";
+		opt << "-D MMIN=" << sel.mmin << " ";
+		opt << "-D NOSCISSOR=" << sel.noscissor << " ";
+		opt << "-D TPSM=" << sel.tpsm << " ";
+		opt << "-D AEM=" << sel.aem << " ";
+		opt << "-D FB=" << sel.fb << " ";
+		opt << "-D ZB=" << sel.zb << " ";
+
+		AddDefs(opt);
+
+		program.build(opt.str().c_str());
+	}
+	catch(cl::Error err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(auto device : devices)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw err;
+	}
+
+	cl::Kernel k(program, entry);
+
+	tfx_map[sel] = k;
+
+	k.setArg(0, env);
+	k.setArg(1, vm);
+	k.setArg(2, tex);
+
+	return tfx_map[sel];
+}
diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h
new file mode 100644
index 0000000000..3ac008ceaa
--- /dev/null
+++ b/plugins/GSdx/GSRendererCL.h
@@ -0,0 +1,310 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+//#include "GSTextureCacheCL.h"
+
+__aligned(struct, 32) GSVertexCL
+{
+	GSVector4 p, t;
+};
+
+class GSRendererCL : public GSRenderer
+{
+	typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	ConvertVertexBufferPtr m_cvb[4][2][2];
+
+	template<uint32 primclass, uint32 tme, uint32 fst>
+	void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	union PrimSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union TileSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+			uint32 mode:2; // 2
+			uint32 clear:1; // 4
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union JobSelector
+	{
+		struct
+		{
+			uint32 dummy:1; // 0
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	union TFXSelector
+	{
+		struct
+		{
+			uint32 fpsm:3; // 0
+			uint32 zpsm:3; // 3
+			uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g))
+			uint32 atst:3; // 8
+			uint32 afail:2; // 11
+			uint32 iip:1; // 13
+			uint32 tfx:3; // 14
+			uint32 tcc:1; // 17
+			uint32 fst:1; // 18
+			uint32 ltf:1; // 19
+			uint32 tlu:1; // 20
+			uint32 fge:1; // 21
+			uint32 date:1; // 22
+			uint32 abe:1; // 23
+			uint32 aba:2; // 24
+			uint32 abb:2; // 26
+			uint32 abc:2; // 28
+			uint32 abd:2; // 30
+
+			uint32 pabe:1; // 32
+			uint32 aa1:1; // 33
+			uint32 fwrite:1; // 34
+			uint32 ftest:1; // 35
+			uint32 rfb:1; // 36
+			uint32 zwrite:1; // 37
+			uint32 ztest:1; // 38
+			uint32 zoverflow:1; // 39 (z max >= 0x80000000)
+			uint32 wms:2; // 40
+			uint32 wmt:2; // 42
+			uint32 datm:1; // 44
+			uint32 colclamp:1; // 45
+			uint32 fba:1; // 46
+			uint32 dthe:1; // 47
+			uint32 prim:2; // 48
+			uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
+			uint32 lcm:1; // 53
+			uint32 mmin:2; // 54
+			uint32 noscissor:1; // 55
+			uint32 tpsm:4; // 56
+			uint32 aem:1; // 60
+			// TODO
+		};
+
+		struct
+		{
+			uint32 _pad1:24;
+			uint32 ababcd:8;
+			uint32 _pad2:2;
+			uint32 fb:2;
+			uint32 _pad3:1;
+			uint32 zb:2;
+		};
+
+		struct
+		{
+			uint32 lo;
+			uint32 hi;
+		};
+
+		uint64 key;
+
+		operator uint64() const { return key; }
+
+		bool IsSolidRect() const
+		{
+			return prim == GS_SPRITE_CLASS
+				&& iip == 0
+				&& tfx == TFX_NONE
+				&& abe == 0
+				&& ztst <= 1
+				&& atst <= 1
+				&& date == 0
+				&& fge == 0;
+		}
+	};
+
+	__aligned(struct, 32) TFXParameter
+	{
+		GSVector4i scissor;
+		GSVector4i bbox;
+		GSVector4i rect;
+		GSVector4i dimx; // 4x4 signed char
+		TFXSelector sel;
+		uint32 fbp, zbp, bw;
+		uint32 fm, zm;
+		uint32 fog; // rgb
+		uint8 aref, afix;
+		uint8 ta0, ta1;
+		uint32 tbp[7], tbw[7];
+		int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix
+		int lod; // lcm == 1
+		int mxl;
+		float l; // TEX1.L * -0x10000
+		float k; // TEX1.K * 0x10000
+		uint32 clut[256];
+	};
+
+	struct TFXJob
+	{
+		struct {int x, y, z, w;} rect;
+		TFXSelector sel;
+		uint32 ib_start, ib_count;
+		uint32 pb_start;
+	};
+
+	class CL
+	{
+		std::string kernel_str;
+		std::map<uint32, cl::Kernel> prim_map;
+		std::map<uint32, cl::Kernel> tile_map;
+		std::map<uint64, cl::Kernel> tfx_map;
+
+	public:
+		std::vector<cl::Device> devices;
+		cl::Context context;
+		cl::CommandQueue queue[3];
+		cl::Buffer vm;
+		cl::Buffer tex;
+		struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb;
+		cl::Buffer env;
+		cl::CommandQueue* wq;
+		int wqidx;
+		size_t WIs;
+
+	public:
+		CL();
+		virtual ~CL();
+
+		cl::Kernel& GetPrimKernel(const PrimSelector& sel);
+		cl::Kernel& GetTileKernel(const TileSelector& sel);
+		cl::Kernel& GetTFXKernel(const TFXSelector& sel);
+
+		void Map();
+		void Unmap();
+	};
+
+	CL m_cl;
+	std::list<TFXJob> m_jobs;
+	uint32 m_vb_start;
+	uint32 m_vb_count;
+
+	void Enqueue();
+
+	/*
+	class RasterizerData : public GSAlignedClass<32>
+	{
+		__aligned(struct, 16) TextureLevel
+		{
+			GSVector4i r;
+			// TODO: GSTextureCacheCL::Texture* t;
+		};
+
+	public:
+		GSRendererCL* m_parent;
+		const uint32* m_fb_pages;
+		const uint32* m_zb_pages;
+
+		//cl::Buffer m_vbuff;
+		//cl::Buffer m_ibuff;
+
+		// TODO: buffers
+		TextureLevel m_tex[7 + 1]; // NULL terminated
+		//cl::Buffer m_clut;
+		//cl::Buffer m_dimx;
+
+		// TODO: struct in a cl::Buffer
+		TFXSelector m_sel;
+		GSVector4i m_scissor;
+		GSVector4i m_bbox;
+		uint32 m_fm, m_zm;
+		int m_aref, m_afix;
+		uint32 m_fog; // rgb
+		int m_lod; // lcm == 1
+		int m_mxl;
+		float m_l; // TEX1.L * -0x10000
+		float m_k; // TEX1.K * 0x10000
+		// TODO: struct { GSVector4i min, max, minmax, mask, invmask; } t; // [u] x 4 [v] x 4
+
+		RasterizerData(GSRendererCL* parent)
+			: m_parent(parent)
+			, m_fb_pages(NULL)
+			, m_zb_pages(NULL)
+		{
+			m_sel.key = 0;
+		}
+
+		virtual ~RasterizerData()
+		{
+			// TODO: ReleasePages();
+		}
+
+		// TODO: void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm);
+		// TODO: void ReleasePages();
+
+		// TODO: void SetSource(GSTextureCacheCL::Texture* t, const GSVector4i& r, int level);
+		// TODO: void UpdateSource();
+	};
+	*/
+protected:
+//	GSTextureCacheCL* m_tc;
+	GSTexture* m_texture[2];
+	uint8* m_output;
+
+	uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read)
+	uint8 m_tex_pages[512];
+	uint32 m_tmp_pages[512 + 1];
+
+	void Reset();
+	void VSync(int field);
+	void ResetDevice();
+	GSTexture* GetOutput(int i);
+
+	void Draw();
+	void Sync(int reason);
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
+
+	void UsePages(const uint32* pages, int type);
+	void ReleasePages(const uint32* pages, int type);
+	
+	//bool CheckSourcePages(RasterizerData* data);
+
+	bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
+
+public:
+	GSRendererCL();
+	virtual ~GSRendererCL();
+};
diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp
index 30d848e4aa..71f42f961d 100644
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@@ -429,6 +429,15 @@ void GSRendererSW::Draw()
 
 	GSVector4i scissor = GSVector4i(context->scissor.in);
 	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (single line: 0, 0 - 256, 0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
 	GSVector4i r = bbox.rintersect(scissor);
 
 	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
@@ -973,7 +982,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 	gd.sel.zpsm = 3;
 	gd.sel.atst = ATST_ALWAYS;
 	gd.sel.tfx = TFX_NONE;
-	gd.sel.ababcd = 255;
+	gd.sel.ababcd = 0xff;
 	gd.sel.prim = primclass;
 
 	uint32 fm = context->FRAME.FBMSK;
@@ -1101,7 +1110,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 				gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
 				gd.sel.lcm = context->TEX1.LCM;
 
-				int mxl = (std::min<int>((int)context->TEX1.MXL, 6) << 16);
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
 				int k = context->TEX1.K << 12;
 
 				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp
index 89656965dd..41d66f24e9 100644
--- a/plugins/GSdx/GSSettingsDlg.cpp
+++ b/plugins/GSdx/GSSettingsDlg.cpp
@@ -329,16 +329,19 @@ void GSSettingsDlg::UpdateRenderers()
 	{
 		GSSetting r = theApp.m_gs_renderers[i];
 
-		if(i >= 3 && i <= 5)
+		if(r.id >= 3 && r.id <= 5 || r.id == 15)
 		{
 			if(level < D3D_FEATURE_LEVEL_10_0) continue;
 
-			r.name = std::string("Direct3D") + (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10");
+			r.name += (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10");
 		}
 
 		renderers.push_back(r);
-		if (r.id == renderer_setting)
+
+		if(r.id == renderer_setting)
+		{
 			renderer_sel = renderer_setting;
+		}
 	}
 
 	ComboBoxInit(IDC_RENDERER, renderers, renderer_sel);
@@ -607,13 +610,13 @@ bool GSHacksDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
 				break;
 			case IDC_SPRITEHACK:
 				helpstr = "Sprite Hack\n\nHelps getting rid of black inner lines in some filtered sprites."
-						  " Half option is the preferred one. Use it for Mana Khemia or Ar tonelico for example."
+						  " Half option is the preferred one. Use it for Mana Khemia or Ar Tonelico for example."
 						  " Full can be used for Tales of Destiny.";
 				break;
 			case IDC_WILDHACK:
 				helpstr = "WildArms\n\nLowers the GS precision to avoid gaps between pixels when"
 						  " upscaling. Full option fixes the text on WildArms games, while Half option might improve portraits"
-						  " in Ar tonelico.";
+						  " in Ar Tonelico.";
 				break;
 			case IDC_MSAACB:
 			case IDC_STATIC_MSAA:
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index 604836af03..731b2aa480 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -1551,7 +1551,8 @@ void GSState::Read(uint8* mem, int len)
 		return;
 	}
 
-	if (!m_init_read_fifo_supported) {
+	if(!m_init_read_fifo_supported)
+	{
 		if(m_tr.x == sx && m_tr.y == sy)
 		{
 			InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
@@ -2316,20 +2317,20 @@ void GSState::GrowVertexBuffer()
 	GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
 	uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3
 
-	if (!vertex || !index)
+	if(vertex == NULL || index == NULL)
 	{
 		printf("GSdx: failed to allocate %d bytes for verticles and %d for indices.\n", sizeof(GSVertex) * maxcount, sizeof(uint32) * maxcount * 3);
 		throw GSDXError();
 	}
 
-	if (m_vertex.buff != NULL)
+	if(m_vertex.buff != NULL)
 	{
 		memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail);
 
 		_aligned_free(m_vertex.buff);
 	}
 
-	if (m_index.buff != NULL)
+	if(m_index.buff != NULL)
 	{
 		memcpy(index, m_index.buff, sizeof(uint32) * m_index.tail);
 		
diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp
index 00fb984260..93d9642fc2 100644
--- a/plugins/GSdx/GSdx.cpp
+++ b/plugins/GSdx/GSdx.cpp
@@ -41,8 +41,29 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserv
 	return TRUE;
 }
 
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+	HRSRC hRsrc = FindResource((HMODULE)s_hModule, MAKEINTRESOURCE(id), type != NULL ? type : RT_RCDATA);
+	if(!hRsrc) return false;
+	HGLOBAL hGlobal = ::LoadResource((HMODULE)s_hModule, hRsrc);
+	if(!hGlobal) return false;
+	DWORD size = SizeofResource((HMODULE)s_hModule, hRsrc);
+	if(!size) return false;
+	buff.resize(size);
+	memcpy(buff.data(), LockResource(hGlobal), size);
+	return true;
+}
+
 #else
 
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+	printf("LoadResource not implemented\n");
+	return false;
+}
+
 size_t GSdxApp::GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName)
 {
 	BuildConfigurationMap(lpFileName);
@@ -108,10 +129,12 @@ GSdxApp::GSdxApp()
 
 	m_gs_renderers.push_back(GSSetting(0, "Direct3D9", "Hardware"));
 	m_gs_renderers.push_back(GSSetting(1, "Direct3D9", "Software"));
+	m_gs_renderers.push_back(GSSetting(14, "Direct3D9", "OpenCL"));
 	m_gs_renderers.push_back(GSSetting(2, "Direct3D9", "Null"));
-	m_gs_renderers.push_back(GSSetting(3, "Direct3D%d    ", "Hardware"));
-	m_gs_renderers.push_back(GSSetting(4, "Direct3D%d    ", "Software"));
-	m_gs_renderers.push_back(GSSetting(5, "Direct3D%d    ", "Null"));
+	m_gs_renderers.push_back(GSSetting(3, "Direct3D", "Hardware"));
+	m_gs_renderers.push_back(GSSetting(4, "Direct3D", "Software"));
+	m_gs_renderers.push_back(GSSetting(15, "Direct3D", "OpenCL"));
+	m_gs_renderers.push_back(GSSetting(5, "Direct3D", "Null"));
 #ifdef _LINUX
 	// note: SDL was removed. We keep those bits for compatibility of the renderer
 	// position in the linux dialog.
@@ -119,9 +142,11 @@ GSdxApp::GSdxApp()
 	m_gs_renderers.push_back(GSSetting(8, "SDL 1.3", "Null"));
 #endif
 	m_gs_renderers.push_back(GSSetting(10, "Null", "Software"));
+	m_gs_renderers.push_back(GSSetting(16, "Null", "OpenCL"));
 	m_gs_renderers.push_back(GSSetting(11, "Null", "Null"));
 	m_gs_renderers.push_back(GSSetting(12, "OpenGL", "Hardware"));
 	m_gs_renderers.push_back(GSSetting(13, "OpenGL", "Software"));
+	m_gs_renderers.push_back(GSSetting(17, "OpenGL", "OpenCL"));
 
 	m_gs_interlace.push_back(GSSetting(0, "None", ""));
 	m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth"));
diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h
index f89a3fa761..5ccb1e2c69 100644
--- a/plugins/GSdx/GSdx.h
+++ b/plugins/GSdx/GSdx.h
@@ -39,6 +39,7 @@ public:
 #ifdef _WINDOWS
  	HMODULE GetModuleHandle() {return (HMODULE)GetModuleHandlePtr();}
 #endif
+
 #ifdef _LINUX
 	void BuildConfigurationMap(const char* lpFileName);
 	void ReloadConfig();
@@ -48,6 +49,8 @@ public:
 	int GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName);
 #endif
 
+	bool LoadResource(int id, vector<unsigned char>& buff, const char* type = NULL);
+
 	string GetConfig(const char* entry, const char* value);
 	void SetConfig(const char* entry, const char* value);
 	int GetConfig(const char* entry, int value);
diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc
index 47ccc62aa0..87c0b6164d 100644
--- a/plugins/GSdx/GSdx.rc
+++ b/plugins/GSdx/GSdx.rc
@@ -51,9 +51,11 @@ BEGIN
     "#include ""res/tfx.fx""\r\n"
     "#include ""res/convert.fx""\r\n"
     "#include ""res/interlace.fx""\r\n"
-    "#include ""res/merge.fx""\r\0"
-    "#include ""res/fxaa.fx""\r\0"
-    "#include ""res/shadeboost.fx""\r\0"
+    "#include ""res/merge.fx""\r\n"
+    "#include ""res/fxaa.fx""\r\n"
+    "#include ""res/cs.fx""\r\n"
+    "#include ""res/shadeboost.fx""\r\n"
+    "#include ""res/tfx.cl""\r\n"
 END
 
 #endif    // APSTUDIO_INVOKED
@@ -64,13 +66,14 @@ END
 // RCDATA
 //
 
-IDR_CONVERT_FX          RCDATA                  "res\\convert.fx"
 IDR_TFX_FX              RCDATA                  "res\\tfx.fx"
-IDR_MERGE_FX            RCDATA                  "res\\merge.fx"
+IDR_CONVERT_FX          RCDATA                  "res\\convert.fx"
 IDR_INTERLACE_FX        RCDATA                  "res\\interlace.fx"
+IDR_MERGE_FX            RCDATA                  "res\\merge.fx"
 IDR_FXAA_FX             RCDATA                  "res\\fxaa.fx"
 IDR_CS_FX               RCDATA                  "res\\cs.fx"
 IDR_SHADEBOOST_FX       RCDATA                  "res\\shadeboost.fx"
+IDR_TFX_CL              RCDATA                  "res\\tfx.cl"
 
 /////////////////////////////////////////////////////////////////////////////
 //
@@ -394,6 +397,10 @@ END
 #include "res/convert.fx"
 #include "res/interlace.fx"
 #include "res/merge.fx"
+#include "res/fxaa.fx"
+#include "res/cs.fx"
+#include "res/shadeboost.fx"
+#include "res/tfx.cl"
 
 /////////////////////////////////////////////////////////////////////////////
 #endif    // not APSTUDIO_INVOKED
diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj b/plugins/GSdx/GSdx_vs2013.vcxproj
index f9ee445906..ef165a5839 100644
--- a/plugins/GSdx/GSdx_vs2013.vcxproj
+++ b/plugins/GSdx/GSdx_vs2013.vcxproj
@@ -687,6 +687,7 @@
       <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">AssemblyAndSourceCode</AssemblerOutput>
     </ClCompile>
     <ClCompile Include="GSRenderer.cpp" />
+    <ClCompile Include="GSRendererCL.cpp" />
     <ClCompile Include="GSRendererCS.cpp" />
     <ClCompile Include="GSRendererDX.cpp" />
     <ClCompile Include="GSRendererDX11.cpp" />
@@ -1970,6 +1971,7 @@
     <ClInclude Include="GSPerfMon.h" />
     <ClInclude Include="GSRasterizer.h" />
     <ClInclude Include="GSRenderer.h" />
+    <ClInclude Include="GSRendererCL.h" />
     <ClInclude Include="GSRendererCS.h" />
     <ClInclude Include="GSRendererDX.h" />
     <ClInclude Include="GSRendererDX11.h" />
@@ -2057,6 +2059,7 @@
     <None Include="res\interlace.fx" />
     <None Include="res\merge.fx" />
     <None Include="res\shadeboost.fx" />
+    <None Include="res\tfx.cl" />
     <None Include="res\tfx.fx" />
     <None Include="baseclasses\activex.rcv" />
     <None Include="baseclasses\activex.ver" />
diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj.filters b/plugins/GSdx/GSdx_vs2013.vcxproj.filters
index 1067b76f32..b0053934e9 100644
--- a/plugins/GSdx/GSdx_vs2013.vcxproj.filters
+++ b/plugins/GSdx/GSdx_vs2013.vcxproj.filters
@@ -348,6 +348,9 @@
     <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="GSRendererCL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="GLLoader.h">
@@ -707,6 +710,9 @@
     <ClInclude Include="targetver.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="GSRendererCL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="res\logo10.bmp">
@@ -737,10 +743,13 @@
     <None Include="res\fxaa.fx">
       <Filter>Shaders</Filter>
     </None>
+    <None Include="res\shadeboost.fx">
+      <Filter>Shaders</Filter>
+    </None>
     <None Include="res\cs.fx">
       <Filter>Shaders</Filter>
     </None>
-    <None Include="res\shadeboost.fx">
+    <None Include="res\tfx.cl">
       <Filter>Shaders</Filter>
     </None>
   </ItemGroup>
diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx
index fb63c0b012..c84211ba95 100644
--- a/plugins/GSdx/res/cs.fx
+++ b/plugins/GSdx/res/cs.fx
@@ -1,3 +1,5 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+
 #ifndef VS_TME
 #define VS_TME 1
 #define VS_FST 1
@@ -381,3 +383,5 @@ void ps_main1(GS_OUTPUT input)
 	WritePixel(addr.x, c, PS_FPSM);
 	WritePixel(addr.y, z, PS_ZPSM);
 }
+
+#endif
diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl
new file mode 100644
index 0000000000..d28622380e
--- /dev/null
+++ b/plugins/GSdx/res/tfx.cl
@@ -0,0 +1,1619 @@
+#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency
+
+#ifndef CL_FLT_EPSILON
+#define CL_FLT_EPSILON 1.1920928955078125e-7
+#endif
+
+#if MAX_PRIM_PER_BATCH == 64u
+	#define BIN_TYPE ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+	#define BIN_TYPE uint
+#else
+	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+
+typedef struct
+{
+	union {float4 p; struct {float x, y, z, f;};};
+	union {float4 tc; struct {float s, t, q; uchar4 c;};};
+} gs_vertex;
+
+typedef struct
+{
+	gs_vertex v[4];
+} gs_prim;
+
+typedef struct
+{
+	float4 dx, dy;
+	float4 zero;
+	float4 reject_corner;
+} gs_barycentric;
+
+typedef struct
+{
+	uint batch_counter;
+	uint _pad[7];
+	struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
+	BIN_TYPE bin[MAX_BIN_COUNT];
+	uchar4 bbox[MAX_PRIM_COUNT];
+	gs_prim prim[MAX_PRIM_COUNT];
+	gs_barycentric barycentric[MAX_PRIM_COUNT];
+} gs_env;
+
+typedef struct
+{
+	int4 scissor;
+	int4 bbox;
+	int4 rect;
+	char dimx[4][4];
+	ulong sel;
+	uint fbp, zbp, bw;
+	uint fm, zm;
+	uchar4 fog; // rgb
+	uchar aref, afix;
+	uchar ta0, ta1;
+	uint tbp[7], tbw[7];
+	int minu, maxu, minv, maxv;
+	int lod; // lcm == 1
+	int mxl;
+	float l; // TEX1.L * -0x10000
+	float k; // TEX1.K * 0x10000
+	uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother
+} gs_param;
+
+enum GS_PRIM_CLASS
+{
+	GS_POINT_CLASS,
+	GS_LINE_CLASS,
+	GS_TRIANGLE_CLASS,
+	GS_SPRITE_CLASS
+};
+
+enum GS_PSM_TARGET
+{
+	PSM_PSMCT32,
+	PSM_PSMCT24,
+	PSM_PSMCT16,
+	PSM_PSMCT16S,
+	PSM_PSMZ32,
+	PSM_PSMZ24,
+	PSM_PSMZ16,
+	PSM_PSMZ16S,
+	PSM_PSMT8,
+	PSM_PSMT4,
+	PSM_PSMT8H,
+	PSM_PSMT4HL,
+	PSM_PSMT4HH,
+};
+
+enum GS_TFX
+{
+	TFX_MODULATE	= 0,
+	TFX_DECAL		= 1,
+	TFX_HIGHLIGHT	= 2,
+	TFX_HIGHLIGHT2	= 3,
+	TFX_NONE		= 4,
+};
+
+enum GS_CLAMP
+{
+	CLAMP_REPEAT		= 0,
+	CLAMP_CLAMP			= 1,
+	CLAMP_REGION_CLAMP	= 2,
+	CLAMP_REGION_REPEAT	= 3,
+};
+
+enum GS_ZTST
+{
+	ZTST_NEVER		= 0,
+	ZTST_ALWAYS		= 1,
+	ZTST_GEQUAL		= 2,
+	ZTST_GREATER	= 3,
+};
+
+enum GS_ATST
+{
+	ATST_NEVER		= 0,
+	ATST_ALWAYS		= 1,
+	ATST_LESS		= 2,
+	ATST_LEQUAL		= 3,
+	ATST_EQUAL		= 4,
+	ATST_GEQUAL		= 5,
+	ATST_GREATER	= 6,
+	ATST_NOTEQUAL	= 7,
+};
+
+enum GS_AFAIL
+{
+	AFAIL_KEEP		= 0,
+	AFAIL_FB_ONLY	= 1,
+	AFAIL_ZB_ONLY	= 2,
+	AFAIL_RGB_ONLY	= 3,
+};
+
+__constant uchar blockTable32[4][8] =
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable32Z[4][8] =
+{
+	{ 24, 25, 28, 29,  8,  9, 12, 13},
+	{ 26, 27, 30, 31, 10, 11, 14, 15},
+	{ 16, 17, 20, 21,  0,  1,  4,  5},
+	{ 18, 19, 22, 23,  2,  3,  6,  7}
+};
+
+__constant uchar blockTable16[8][4] =
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar blockTable16S[8][4] =
+{
+	{  0,  2, 16, 18 },
+	{  1,  3, 17, 19 },
+	{  8, 10, 24, 26 },
+	{  9, 11, 25, 27 },
+	{  4,  6, 20, 22 },
+	{  5,  7, 21, 23 },
+	{ 12, 14, 28, 30 },
+	{ 13, 15, 29, 31 }
+};
+
+__constant uchar blockTable16Z[8][4] =
+{
+	{ 24, 26, 16, 18 },
+	{ 25, 27, 17, 19 },
+	{ 28, 30, 20, 22 },
+	{ 29, 31, 21, 23 },
+	{  8, 10,  0,  2 },
+	{  9, 11,  1,  3 },
+	{ 12, 14,  4,  6 },
+	{ 13, 15,  5,  7 }
+};
+
+__constant uchar blockTable16SZ[8][4] =
+{
+	{ 24, 26,  8, 10 },
+	{ 25, 27,  9, 11 },
+	{ 16, 18,  0,  2 },
+	{ 17, 19,  1,  3 },
+	{ 28, 30, 12, 14 },
+	{ 29, 31, 13, 15 },
+	{ 20, 22,  4,  6 },
+	{ 21, 23,  5,  7 }
+};
+
+__constant uchar blockTable8[4][8] =
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable4[8][4] =
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar columnTable32[8][8] =
+{
+	{  0,  1,  4,  5,  8,  9, 12, 13 },
+	{  2,  3,  6,  7, 10, 11, 14, 15 },
+	{ 16, 17, 20, 21, 24, 25, 28, 29 },
+	{ 18, 19, 22, 23, 26, 27, 30, 31 },
+	{ 32, 33, 36, 37, 40, 41, 44, 45 },
+	{ 34, 35, 38, 39, 42, 43, 46, 47 },
+	{ 48, 49, 52, 53, 56, 57, 60, 61 },
+	{ 50, 51, 54, 55, 58, 59, 62, 63 },
+};
+
+__constant uchar columnTable16[8][16] =
+{
+	{   0,   2,   8,  10,  16,  18,  24,  26,
+	    1,   3,   9,  11,  17,  19,  25,  27 },
+	{   4,   6,  12,  14,  20,  22,  28,  30,
+	    5,   7,  13,  15,  21,  23,  29,  31 },
+	{  32,  34,  40,  42,  48,  50,  56,  58,
+	   33,  35,  41,  43,  49,  51,  57,  59 },
+	{  36,  38,  44,  46,  52,  54,  60,  62,
+	   37,  39,  45,  47,  53,  55,  61,  63 },
+	{  64,  66,  72,  74,  80,  82,  88,  90,
+	   65,  67,  73,  75,  81,  83,  89,  91 },
+	{  68,  70,  76,  78,  84,  86,  92,  94,
+	   69,  71,  77,  79,  85,  87,  93,  95 },
+	{  96,  98, 104, 106, 112, 114, 120, 122,
+	   97,  99, 105, 107, 113, 115, 121, 123 },
+	{ 100, 102, 108, 110, 116, 118, 124, 126,
+	  101, 103, 109, 111, 117, 119, 125, 127 },
+};
+
+__constant uchar columnTable8[16][16] =
+{
+	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
+	    2,   6,  18,  22,  34,  38,  50,  54 },
+	{   8,  12,  24,  28,  40,  44,  56,  60,
+	   10,  14,  26,  30,  42,  46,  58,  62 },
+	{  33,  37,  49,  53,   1,   5,  17,  21,
+	   35,  39,  51,  55,   3,   7,  19,  23 },
+	{  41,  45,  57,  61,   9,  13,  25,  29,
+	   43,  47,  59,  63,  11,  15,  27,  31 },
+	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
+	   98, 102, 114, 118,  66,  70,  82,  86 },
+	{ 104, 108, 120, 124,  72,  76,  88,  92,
+	  106, 110, 122, 126,  74,  78,  90,  94 },
+	{  65,  69,  81,  85,  97, 101, 113, 117,
+	   67,  71,  83,  87,  99, 103, 115, 119 },
+	{  73,  77,  89,  93, 105, 109, 121, 125,
+	   75,  79,  91,  95, 107, 111, 123, 127 },
+	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
+	  130, 134, 146, 150, 162, 166, 178, 182 },
+	{ 136, 140, 152, 156, 168, 172, 184, 188,
+	  138, 142, 154, 158, 170, 174, 186, 190 },
+	{ 161, 165, 177, 181, 129, 133, 145, 149,
+	  163, 167, 179, 183, 131, 135, 147, 151 },
+	{ 169, 173, 185, 189, 137, 141, 153, 157,
+	  171, 175, 187, 191, 139, 143, 155, 159 },
+	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
+	  226, 230, 242, 246, 194, 198, 210, 214 },
+	{ 232, 236, 248, 252, 200, 204, 216, 220,
+	  234, 238, 250, 254, 202, 206, 218, 222 },
+	{ 193, 197, 209, 213, 225, 229, 241, 245,
+	  195, 199, 211, 215, 227, 231, 243, 247 },
+	{ 201, 205, 217, 221, 233, 237, 249, 253,
+	  203, 207, 219, 223, 235, 239, 251, 255 },
+};
+
+__constant ushort columnTable4[16][32] =
+{
+	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
+	    2,  10,  34,  42,  66,  74,  98, 106,
+	    4,  12,  36,  44,  68,  76, 100, 108,
+	    6,  14,  38,  46,  70,  78, 102, 110 },
+	{  16,  24,  48,  56,  80,  88, 112, 120,
+	   18,  26,  50,  58,  82,  90, 114, 122,
+	   20,  28,  52,  60,  84,  92, 116, 124,
+	   22,  30,  54,  62,  86,  94, 118, 126 },
+	{  65,  73,  97, 105,   1,   9,  33,  41,
+	   67,  75,  99, 107,   3,  11,  35,  43,
+	   69,  77, 101, 109,   5,  13,  37,  45,
+	   71,  79, 103, 111,   7,  15,  39,  47 },
+	{  81,  89, 113, 121,  17,  25,  49,  57,
+	   83,  91, 115, 123,  19,  27,  51,  59,
+	   85,  93, 117, 125,  21,  29,  53,  61,
+	   87,  95, 119, 127,  23,  31,  55,  63 },
+	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
+	  194, 202, 226, 234, 130, 138, 162, 170,
+	  196, 204, 228, 236, 132, 140, 164, 172,
+	  198, 206, 230, 238, 134, 142, 166, 174 },
+	{ 208, 216, 240, 248, 144, 152, 176, 184,
+	  210, 218, 242, 250, 146, 154, 178, 186,
+	  212, 220, 244, 252, 148, 156, 180, 188,
+	  214, 222, 246, 254, 150, 158, 182, 190 },
+	{ 129, 137, 161, 169, 193, 201, 225, 233,
+	  131, 139, 163, 171, 195, 203, 227, 235,
+	  133, 141, 165, 173, 197, 205, 229, 237,
+	  135, 143, 167, 175, 199, 207, 231, 239 },
+	{ 145, 153, 177, 185, 209, 217, 241, 249,
+	  147, 155, 179, 187, 211, 219, 243, 251,
+	  149, 157, 181, 189, 213, 221, 245, 253,
+	  151, 159, 183, 191, 215, 223, 247, 255 },
+	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
+	  258, 266, 290, 298, 322, 330, 354, 362,
+	  260, 268, 292, 300, 324, 332, 356, 364,
+	  262, 270, 294, 302, 326, 334, 358, 366 },
+	{ 272, 280, 304, 312, 336, 344, 368, 376,
+	  274, 282, 306, 314, 338, 346, 370, 378,
+	  276, 284, 308, 316, 340, 348, 372, 380,
+	  278, 286, 310, 318, 342, 350, 374, 382 },
+	{ 321, 329, 353, 361, 257, 265, 289, 297,
+	  323, 331, 355, 363, 259, 267, 291, 299,
+	  325, 333, 357, 365, 261, 269, 293, 301,
+	  327, 335, 359, 367, 263, 271, 295, 303 },
+	{ 337, 345, 369, 377, 273, 281, 305, 313,
+	  339, 347, 371, 379, 275, 283, 307, 315,
+	  341, 349, 373, 381, 277, 285, 309, 317,
+	  343, 351, 375, 383, 279, 287, 311, 319 },
+	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
+	  450, 458, 482, 490, 386, 394, 418, 426,
+	  452, 460, 484, 492, 388, 396, 420, 428,
+	  454, 462, 486, 494, 390, 398, 422, 430 },
+	{ 464, 472, 496, 504, 400, 408, 432, 440,
+	  466, 474, 498, 506, 402, 410, 434, 442,
+	  468, 476, 500, 508, 404, 412, 436, 444,
+	  470, 478, 502, 510, 406, 414, 438, 446 },
+	{ 385, 393, 417, 425, 449, 457, 481, 489,
+	  387, 395, 419, 427, 451, 459, 483, 491,
+	  389, 397, 421, 429, 453, 461, 485, 493,
+	  391, 399, 423, 431, 455, 463, 487, 495 },
+	{ 401, 409, 433, 441, 465, 473, 497, 505,
+	  403, 411, 435, 443, 467, 475, 499, 507,
+	  405, 413, 437, 445, 469, 477, 501, 509,
+	  407, 415, 439, 447, 471, 479, 503, 511 },
+};
+
+uint BlockNumber32(int x, int y, uint bp, uint bw)
+{
+	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
+}
+
+uint BlockNumber16(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber16S(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber32Z(int x, int y, uint bp, uint bw)
+{
+	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
+}
+
+uint BlockNumber16Z(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber16SZ(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
+}
+
+uint BlockNumber8(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
+}
+
+uint BlockNumber4(int x, int y, uint bp, uint bw)
+{
+	return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
+}
+
+uint PixelAddress32(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+}
+
+uint PixelAddress16(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress16S(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress32Z(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+}
+
+uint PixelAddress16Z(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress16SZ(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+}
+
+uint PixelAddress8(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15];
+}
+
+uint PixelAddress4(int x, int y, uint bp, uint bw)
+{
+	return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31];
+}
+
+uint PixelAddress(int x, int y, uint bp, uint bw, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMT8H:
+	case PSM_PSMT4HL:
+	case PSM_PSMT4HH:
+		return PixelAddress32(x, y, bp, bw);
+	case PSM_PSMCT16: 
+		return PixelAddress16(x, y, bp, bw);
+	case PSM_PSMCT16S: 
+		return PixelAddress16S(x, y, bp, bw);
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return PixelAddress32Z(x, y, bp, bw);
+	case PSM_PSMZ16: 
+		return PixelAddress16Z(x, y, bp, bw);
+	case PSM_PSMZ16S: 
+		return PixelAddress16SZ(x, y, bp, bw);
+	case PSM_PSMT8:
+		return PixelAddress8(x, y, bp, bw);
+	case PSM_PSMT4:
+		return PixelAddress4(x, y, bp, bw);
+	}
+}
+
+uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm)
+{
+	// TODO: replace blockTable with a subset tileTable
+
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6];
+	case PSM_PSMCT16: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMCT16S: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6];
+	case PSM_PSMZ16: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3];
+	case PSM_PSMZ16S: 
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3];
+	}
+}
+
+uint TilePixelAddress(int x, int y, uint ba, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7];
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15];
+	}
+}
+
+uint ReadFrame(__global uchar* vm, uint addr, uint psm)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMCT24: 
+	case PSM_PSMZ32: 
+	case PSM_PSMZ24: 
+		return ((__global uint*)vm)[addr];
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		return ((__global ushort*)vm)[addr];
+	}
+}
+
+void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value)
+{
+	switch(psm)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMZ32:
+	case PSM_PSMCT24: 
+	case PSM_PSMZ24: 
+		((__global uint*)vm)[addr] = value; 
+		break;
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		((__global ushort*)vm)[addr] = (ushort)value;
+		break;
+	}
+}
+
+bool is16bit(int psm)
+{
+	return psm < 8 && (psm & 3) >= 2;
+}
+
+bool is24bit(int psm)
+{
+	return psm < 8 && (psm & 3) == 1;
+}
+
+bool is32bit(int psm)
+{
+	return psm < 8 && (psm & 3) == 0;
+}
+
+#ifdef PRIM
+
+int GetVertexPerPrim(int prim_class)
+{
+	switch(prim_class)
+	{
+	default:
+	case GS_POINT_CLASS: return 1;
+	case GS_LINE_CLASS: return 2;
+	case GS_TRIANGLE_CLASS: return 3;
+	case GS_SPRITE_CLASS: return 2;
+	}
+}
+
+#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM)
+
+#endif
+
+#ifdef KERNEL_PRIM
+
+__kernel void KERNEL_PRIM(
+	__global gs_env* env,
+	__global uchar* vb_base, 
+	__global uchar* ib_base, 
+	uint vb_start,
+	uint ib_start)
+{
+	size_t prim_index = get_global_id(0);
+
+	__global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start);
+	__global uint* ib = (__global uint*)(ib_base + ib_start);
+	__global gs_prim* prim = &env->prim[prim_index];
+	
+	ib += prim_index * VERTEX_PER_PRIM;
+
+	int2 pmin, pmax;
+
+	if(PRIM == GS_POINT_CLASS)
+	{
+		pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy);
+	}
+	else if(PRIM == GS_LINE_CLASS)
+	{
+		int2 p0 = convert_int2_rte(vb[ib[0]].p.xy);
+		int2 p1 = convert_int2_rte(vb[ib[1]].p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+	}
+	else if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		__global gs_vertex* v0 = &vb[ib[0]];
+		__global gs_vertex* v1 = &vb[ib[1]];
+		__global gs_vertex* v2 = &vb[ib[2]];
+
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+		int2 p2 = convert_int2_rtp(v2->p.xy);
+
+		pmin = min(min(p0, p1), p2);
+		pmax = max(max(p0, p1), p2);
+
+		prim->v[0].p = v0->p;
+		prim->v[0].tc = v0->tc;
+		prim->v[1].p = v1->p;
+		prim->v[1].tc = v1->tc;
+		prim->v[2].p = v2->p;
+		prim->v[2].tc = v2->tc;
+
+		float4 dp0 = v1->p - v0->p;
+		float4 dp1 = v0->p - v2->p;
+		float4 dp2 = v2->p - v1->p;
+
+		float cp = dp0.x * dp1.y - dp0.y * dp1.x;
+
+		if(cp != 0.0f)
+		{
+			float cp_rcp = 1.0f / cp;// native_recip(cp);
+
+			float2 u = dp0.xy * cp_rcp;
+			float2 v = -dp1.xy * cp_rcp;
+
+			// v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0)
+
+			gs_barycentric b;
+
+			b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x);
+			b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y);
+
+			dp0.xy = dp0.xy * sign(cp);
+			dp1.xy = dp1.xy * sign(cp);
+			dp2.xy = dp2.xy * sign(cp);
+
+			b.zero.x = (dp1.y < 0 || dp1.y == 0 && dp1.x > 0) ? CL_FLT_EPSILON : 0;
+			b.zero.y = (dp0.y < 0 || dp0.y == 0 && dp0.x > 0) ? CL_FLT_EPSILON : 0;
+			b.zero.z = (dp2.y < 0 || dp2.y == 0 && dp2.x > 0) ? CL_FLT_EPSILON : 0;
+
+			// any barycentric(reject_corner) < 0, tile outside the triangle
+
+			b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE;
+			b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE;
+			b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE;
+
+			// TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed
+
+			env->barycentric[prim_index] = b;
+		}
+		else
+		{
+			// TODO: set b.zero to something that always fails the tests
+		}
+	}
+	else if(PRIM == GS_SPRITE_CLASS)
+	{
+		__global gs_vertex* v0 = &vb[ib[0]];
+		__global gs_vertex* v1 = &vb[ib[1]];
+
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+
+		int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0);
+
+		prim->v[0].p = select(v0->p, v1->p, mask); // pmin
+		prim->v[0].tc = select(v0->tc, v1->tc, mask);
+		prim->v[1].p = select(v1->p, v0->p, mask); // pmax
+		prim->v[1].tc = select(v1->tc, v0->tc, mask);
+		prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy);
+	}
+
+	int4 pminmax = (int4)(pmin, pmax);
+
+	env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS);
+}
+
+#endif
+
+#ifdef KERNEL_TILE
+
+int tile_in_triangle(float2 p, gs_barycentric b)
+{
+	float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz;
+
+	f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));
+
+	return all(f >= b.zero.xyz);
+}
+
+#if CLEAR == 1
+
+__kernel void KERNEL_TILE(__global gs_env* env)
+{
+	env->batch_counter = 0;
+	env->bounds[get_global_id(0)].first = -1;
+	env->bounds[get_global_id(0)].last = 0;
+}
+
+#elif MODE < 3
+
+#if MAX_PRIM_PER_BATCH != 32
+	#error "MAX_PRIM_PER_BATCH != 32"
+#endif
+
+#define MAX_PRIM_PER_GROUP (32u >> MODE)
+
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	uint batch_index = get_group_id(2) >> MODE;
+	uint prim_start = get_group_id(2) << (5 - MODE);
+	uint group_prim_index = get_local_id(2);
+	uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+	__global uchar4* bbox = &env->bbox[prim_start];
+	__global gs_barycentric* barycentric = &env->barycentric[prim_start];
+
+	__local uchar4 bbox_cache[MAX_PRIM_PER_GROUP];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP];
+	__local uint visible[8 << MODE];
+
+	if(get_local_id(2) == 0)
+	{
+		visible[bin_index] = 0;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP);
+
+	event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0);
+
+	wait_group_events(1, &e);
+
+	if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	if(group_prim_index < group_prim_count)
+	{
+		int x = bin_dim.x + get_local_id(0);
+		int y = bin_dim.y + get_local_id(1);
+
+		uchar4 r = bbox_cache[group_prim_index];
+
+		uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
+
+		if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+		{
+			test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
+		}
+
+		atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2)));
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_local_id(2) == 0)
+	{
+		#if MODE == 0
+		((__global uint*)&bin[bin_index])[0] = visible[bin_index];
+		#elif MODE == 1
+		((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index];
+		#elif MODE == 2
+		((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index];
+		#endif
+
+		if(visible[bin_index] != 0)
+		{
+			atomic_min(&env->bounds[bin_index].first, batch_index);
+			atomic_max(&env->bounds[bin_index].last, batch_index);
+		}
+	}
+}
+
+#elif MODE == 3
+
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint batch_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
+	__local uint batch_index;
+
+	size_t local_id = get_local_id(0);
+	size_t local_size = get_local_size(0);
+
+	while(1)
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if(local_id == 0)
+		{
+			batch_index = atomic_inc(&env->batch_counter);
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if(batch_index >= batch_count) 
+		{
+			break;
+		}
+
+		uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
+		
+		__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+		__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
+		__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
+
+		event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
+
+		wait_group_events(1, &e);
+
+		if(PRIM == GS_TRIANGLE_CLASS)
+		{
+			e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+			wait_group_events(1, &e);
+		}
+
+		for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
+		{
+			int y = bin_index / bin_dim.z;
+			int x = bin_index - y * bin_dim.z;
+
+			x += bin_dim.x;
+			y += bin_dim.y;
+
+			BIN_TYPE visible = 0;
+
+			for(uint i = 0; i < batch_prim_count; i++)
+			{
+				uchar4 r = bbox_cache[i];
+
+				BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
+
+				if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+				{
+					test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
+				}
+
+				visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
+			}
+
+			bin[bin_index] = visible;
+
+			if(visible != 0)
+			{
+				atomic_min(&env->bounds[bin_index].first, batch_index);
+				atomic_max(&env->bounds[bin_index].last, batch_index);
+			}
+		}
+	}
+}
+
+#endif
+
+#endif
+
+#ifdef KERNEL_TFX
+
+bool ZTest(uint zs, uint zd)
+{ 
+	if(ZTEST)
+	{
+		if(is24bit(ZPSM)) zd &= 0x00ffffff;
+
+		switch(ZTST)
+		{
+		case ZTST_NEVER:
+			return false;
+		case ZTST_ALWAYS:
+			return true;
+		case ZTST_GEQUAL:
+			return zs >= zd;
+		case ZTST_GREATER:
+			return zs > zd;
+		}
+	}
+
+	return true;
+}
+
+bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
+{
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		break;
+	case AFAIL_FB_ONLY:
+		if(!ZWRITE) return true;
+		break;
+	case AFAIL_ZB_ONLY:
+		if(!FWRITE) return true;
+		break;
+	case AFAIL_RGB_ONLY:
+		if(!ZWRITE && is24bit(FPSM)) return true;
+		break;
+	}
+
+	uint pass;
+	
+	switch(ATST)
+	{
+	case ATST_NEVER:
+		pass = false;
+		break;
+	case ATST_ALWAYS:
+		return true;
+	case ATST_LESS:
+		pass = alpha < aref;
+		break;
+	case ATST_LEQUAL:
+		pass = alpha <= aref;
+		break;
+	case ATST_EQUAL:
+		pass = alpha == aref;
+		break;
+	case ATST_GEQUAL:
+		pass = alpha >= aref;
+		break;
+	case ATST_GREATER:
+		pass = alpha > aref;
+		break;
+	case ATST_NOTEQUAL:
+		pass = alpha != aref;
+		break;
+	}
+
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		return pass;
+	case AFAIL_FB_ONLY:
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_ZB_ONLY:
+		*fm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_RGB_ONLY:
+		*fm |= pass ? 0 : 0xff000000;
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	}
+
+	return true;
+}
+
+bool DestAlphaTest(uint fd)
+{
+	if(DATE)
+	{
+		if(DATM)
+		{
+			if(is32bit(FPSM)) return (fd & 0x80000000) != 0;
+			if(is16bit(FPSM)) return (fd & 0x00008000) != 0;
+		}
+		else
+		{
+			if(is32bit(FPSM)) return (fd & 0x80000000) == 0;
+			if(is16bit(FPSM)) return (fd & 0x00008000) == 0;
+		}
+	}
+
+	return true;
+}
+
+int Wrap(int a, int b, int c, int mode)
+{
+	switch(mode)
+	{
+	case CLAMP_REPEAT:
+		return a & b;
+	case CLAMP_CLAMP:
+		return clamp(a, 0, c);
+	case CLAMP_REGION_CLAMP:
+		return clamp(a, b, c);
+	case CLAMP_REGION_REPEAT:
+		return (a & b) | c;
+	}
+}
+
+int4 AlphaBlend(int4 c, int afix, uint fd)
+{
+	if(FWRITE && (ABE || AA1))
+	{
+		int4 cs = c;
+		int4 cd;
+
+		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1)
+		{
+			if(is32bit(FPSM) || is24bit(FPSM))
+			{
+				cd.x = fd & 0xff;
+				cd.y = (fd >> 8) & 0xff;
+				cd.z = (fd >> 16) & 0xff;
+				cd.w = fd >> 24;
+			}
+			else if(is16bit(FPSM))
+			{
+				cd.x = (fd & 0x001f) << 3;
+				cd.y = (fd & 0x03e0) >> 2;
+				cd.z = (fd & 0x7c00) >> 7;
+				cd.w = (fd & 0x8000) >> 8;
+			}
+		}
+
+		if(ABA != ABB)
+		{
+			switch(ABA)
+			{
+			case 0: break; // c.xyz = cs.xyz;
+			case 1: c.xyz = cd.xyz; break;
+			case 2: c.xyz = 0; break;
+			}
+
+			switch(ABB)
+			{
+			case 0: c.xyz -= cs.xyz; break;
+			case 1: c.xyz -= cd.xyz; break;
+			case 2: break;
+			}
+
+			if(!(is24bit(FPSM) && ABC == 1))
+			{
+				int a = 0;
+
+				switch(ABC)
+				{
+				case 0: a = cs.w; break;
+				case 1: a = cd.w; break;
+				case 2: a = afix; break;
+				}
+
+				c.xyz = c.xyz * a >> 7;
+			}
+
+			switch(ABD)
+			{
+			case 0: c.xyz += cs.xyz; break;
+			case 1: c.xyz += cd.xyz; break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			switch(ABD)
+			{
+			case 0: break;
+			case 1: c.xyz = cd.xyz; break;
+			case 2: c.xyz = 0; break;
+			}
+		}
+
+		if(PABE)
+		{
+			c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24));
+		}
+	}
+
+	return c;
+}
+
+uchar4 Expand24To32(uint rgba, uchar ta0)
+{
+	uchar4 c;
+
+	c.x = rgba & 0xff;
+	c.y = (rgba >> 8) & 0xff;
+	c.z = (rgba >> 16) & 0xff;
+	c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0;
+
+	return c;
+}
+
+uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
+{
+	uchar4 c;
+
+	c.x = (rgba & 0x001f) << 3;
+	c.y = (rgba & 0x03e0) >> 2;
+	c.z = (rgba & 0x7c00) >> 7;
+	c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;
+
+	return c;
+}
+
+int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb)
+{
+	uchar4 c;
+
+	uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM);
+
+	__global ushort* vm16 = (__global ushort*)vm;
+	__global uint* vm32 = (__global uint*)vm;
+
+	switch(TPSM)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMZ32:
+		c = ((__global uchar4*)vm)[addr];
+		break;
+	case PSM_PSMCT24: 
+	case PSM_PSMZ24: 
+		c = Expand24To32(vm32[addr], pb->ta0);
+		break;
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		c = Expand16To32(vm16[addr], pb->ta0, pb->ta1);
+		break;
+	case PSM_PSMT8:
+		c = pb->clut[vm[addr]];
+		break;
+	case PSM_PSMT4:
+		c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f];
+		break;
+	case PSM_PSMT8H:
+		c = pb->clut[vm32[addr] >> 24];
+		break;
+	case PSM_PSMT4HL:
+		c = pb->clut[(vm32[addr] >> 24) & 0x0f];
+		break;
+	case PSM_PSMT4HH:
+		c = pb->clut[(vm32[addr] >> 28) & 0x0f];
+		break;
+	}
+
+	//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]);
+
+	return convert_int4(c);
+}
+
+__kernel void KERNEL_TFX(
+	__global gs_env* env,
+	__global uchar* vm,
+	__global uchar* tex,
+	__global uchar* pb_base, 
+	uint pb_start,
+	uint prim_start, 
+	uint prim_count,
+	uint batch_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way
+
+	uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x;
+	uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y;
+	uint bin_index = bin_y * bin_dim.z + bin_x;
+
+	uint batch_first = env->bounds[bin_index].first;
+	uint batch_last = env->bounds[bin_index].last;
+	uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS;
+
+	if(batch_last < batch_first)
+	{
+		return;
+	}
+
+	uint skip;
+	
+	if(batch_start < batch_first)
+	{
+		uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1));
+
+		if(n > prim_count) 
+		{
+			return;
+		}
+
+		skip = 0;
+		prim_count -= n;
+		batch_start = batch_first;
+	}
+	else
+	{
+		skip = prim_start & (MAX_PRIM_PER_BATCH - 1);
+		prim_count += skip;
+	}
+
+	if(batch_start > batch_last) 
+	{
+		return;
+	}
+	
+	prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS);
+
+	//
+
+	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);
+
+	uint x = get_global_id(0);
+	uint y = get_global_id(1);
+
+	int2 pi = (int2)(x, y);
+	float2 pf = convert_float2(pi);
+
+	if(!NOSCISSOR)
+	{
+		int4 scissor = pb->scissor;
+
+		if(!all((pi >= scissor.xy) & (pi < scissor.zw)))
+		{
+			return;
+		}
+	}
+
+	uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
+	uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
+
+	uint fd, zd;
+
+	if(RFB) 
+	{
+		fd = ReadFrame(vm, faddr, FPSM);
+	}
+
+	if(ZTEST)
+	{
+		zd = ReadFrame(vm, zaddr, ZPSM);
+	}
+/*
+	// TODO: lookup top left address of this tile + local offset
+	//
+	// 32bpp: 8x8 block size, 4 blocks, 1024 bytes
+	// 0 1
+	// 2 3
+	// 16bpp: 16x8 block size, 2 blocks, 512 bytes
+	// 0
+	// 1
+	// linear access in memory, this layout is the same for all formats
+
+	__local uint fbn, zbn;
+	__local uchar fb[1024], zb[1024];
+
+	if(get_local_id(0) == 0 && get_local_id(1) == 0)
+	{
+		fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
+		zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint faddr = TilePixelAddress(x, y, fbn, FPSM);
+	uint zaddr = TilePixelAddress(x, y, zbn, ZPSM);
+
+	if(RFB)
+	{
+		event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	if(ZTEST)
+	{
+		event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	// not sure if faster
+*/
+
+	// early destination alpha test
+
+	if(!DestAlphaTest(fd))
+	{
+		return;
+	}
+
+	//
+
+	uint fragments = 0;
+
+	//__local gs_prim p;
+
+	__global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case"
+	__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
+	__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];
+
+	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
+
+	__local gs_prim prim_cache;
+
+	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
+	{
+		while(bin_value != 0)
+		{
+			uint i = clz(bin_value);
+
+			if(prim_index + i >= prim_count)
+			{
+				break;
+			}
+
+			__global gs_prim* prim = &prim_base[prim_index + i];
+			
+			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
+
+			uint2 zf;
+			float4 t;
+			int4 c;
+
+			 // TODO: do not hittest if we know the tile is fully inside the prim
+
+			if(PRIM == GS_POINT_CLASS)
+			{
+				// TODO: distance.x < 0.5f || distance.y < 0.5f
+
+				continue;
+			}
+			else if(PRIM == GS_LINE_CLASS)
+			{
+				// TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f
+
+				continue;
+			}
+			else if(PRIM == GS_TRIANGLE_CLASS)
+			{
+				__global gs_barycentric* b = &barycentric[prim_index + i];
+
+				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
+
+				f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));
+
+				if(!all(f >= b->zero.xyz))
+				{
+					continue;
+				}
+
+				zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y);
+
+				t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
+
+				if(IIP)
+				{
+					float4 c0 = convert_float4(prim->v[0].c);
+					float4 c1 = convert_float4(prim->v[1].c);
+					float4 c2 = convert_float4(prim->v[2].c);
+
+					c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y);
+				}
+				else
+				{
+					c = convert_int4(prim->v[2].c);
+				}
+			}
+			else if(PRIM == GS_SPRITE_CLASS)
+			{
+				int2 tl = convert_int2_rtp(prim->v[0].p.xy);
+				int2 br = convert_int2_rtp(prim->v[1].p.xy);
+
+				if(!all((pi >= tl) & (pi < br)))
+				{
+					continue;
+				}
+
+				zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint
+				
+				t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy);
+				t.z = prim->v[0].tc.z;
+
+				c = convert_int4(prim->v[1].c);
+			}
+
+			// z test
+
+			uint zs = zf.x;
+
+			if(!ZTest(zs, zd))
+			{
+				continue;
+			}
+
+			// sample texture
+
+			int4 ct;
+
+			if(FB && TFX != TFX_NONE)
+			{
+				// TODO
+
+				if(0)//if(MMIN)
+				{
+					// TODO
+				}
+				else
+				{
+					int2 uv;
+
+					if(!FST)
+					{
+						uv = convert_int2(t.xy * (1.0f / t.z));
+
+						if(LTF) uv -= 0x0008;
+					}
+					else
+					{
+						uv = convert_int2(t.xy);
+					}
+
+					int2 uvf = uv & 0x000f;
+
+					int2 uv0 = uv >> 4;
+					int2 uv1 = uv0 + 1;
+
+					uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
+					uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
+					uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
+					uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
+					
+					tex = vm; // TODO: use the texture cache
+
+					int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
+					int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
+					int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
+					int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
+
+					if(LTF)
+					{
+						c00 = ((c01 - c00) * uvf.x >> 4) + c00;
+						c10 = ((c11 - c10) * uvf.x >> 4) + c10;
+						c00 = ((c10 - c00) * uvf.y >> 4) + c00;
+					}
+
+					ct = c00;
+				}
+			}
+
+			// alpha tfx
+
+			if(FB)
+			{
+				if(TCC)
+				{
+					switch(TFX)
+					{
+					case TFX_MODULATE:
+						c.w = clamp(ct.w * c.w >> 7, 0, 0xff);
+						break;
+					case TFX_DECAL:
+						c.w = ct.w;
+						break;
+					case TFX_HIGHLIGHT:
+						c.w = clamp(ct.w + c.w, 0, 0xff);
+						break;
+					case TFX_HIGHLIGHT2:
+						c.w = ct.w;
+						break;
+					}
+				}
+
+				if(AA1)
+				{
+					if(!ABE || c.w == 0x80)
+					{
+						// TODO: c.w = coverage; // coverage 0x80 at 100%
+					}
+				}
+			}
+
+			// read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway)
+
+			uint fm = pb->fm;
+			uint zm = pb->zm;
+
+			// alpha test
+
+			if(!AlphaTest(c.w, pb->aref, &fm, &zm))
+			{
+				continue;
+			}
+
+			// all tests done, we have a new output
+
+			fragments++;
+
+			// write z
+
+			if(ZWRITE)
+			{
+				zd = bitselect(zs, zd, zm);
+			}
+
+			// rgb tfx
+
+			if(FWRITE)
+			{
+				switch(TFX)
+				{
+				case TFX_MODULATE:
+					c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff);
+					break;
+				case TFX_DECAL:
+					c.xyz = ct.xyz;
+					break;
+				case TFX_HIGHLIGHT:
+				case TFX_HIGHLIGHT2:					
+					c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff);
+					break;
+				}
+			}
+
+			// fog
+
+			if(FWRITE && FGE)
+			{
+				int fog = (int)zf.y;
+
+				c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8);				
+			}
+
+			// alpha blend
+
+			c = AlphaBlend(c, pb->afix, fd);
+
+			// write frame
+
+			if(FWRITE)
+			{
+				if(DTHE && is16bit(FPSM))
+				{
+					// TODO: c += pb->dimx[y & 3]
+				}
+
+				c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff);
+				
+				if(FBA && !is24bit(FPSM))
+				{
+					c.w |= 0x80;
+				}
+
+				uint fs;
+
+				if(is32bit(FPSM))
+				{
+					fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is24bit(FPSM))
+				{
+					fs = (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is16bit(FPSM))
+				{
+					fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3);
+				}
+
+				fd = RFB ? bitselect(fs, fd, fm) : fs;
+
+				// dest alpha test for the next loop
+
+				if(!DestAlphaTest(fd))
+				{
+					prim_index = prim_count; // game over
+
+					break;
+				}
+			}
+		}
+
+		bin += bin_count;
+		bin_value = *bin;
+	}
+
+	if(fragments > 0)
+	{
+		// TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place)
+
+		// TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format)
+
+		if(ZWRITE)
+		{
+			WriteFrame(vm, zaddr, ZPSM, zd);
+		}
+
+		if(FWRITE)
+		{
+			WriteFrame(vm, faddr, FPSM, fd);
+			//WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments);
+		}
+	}
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index 02626f8dad..b9bfa19737 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -43,6 +43,7 @@
 #include <d3dx9.h>
 #include <comutil.h>
 #include "../../common/include/comptr.h"
+#include <CL/cl.hpp>
 
 #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
 #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO
diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props
index 71bf73f281..91c58981d0 100644
--- a/plugins/GSdx/vsprops/common.props
+++ b/plugins/GSdx/vsprops/common.props
@@ -8,22 +8,22 @@
   <ItemDefinitionGroup>
     <ClCompile>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <FloatingPointModel>Fast</FloatingPointModel>
       <RuntimeTypeInfo>false</RuntimeTypeInfo>
       <WarningLevel>Level4</WarningLevel>
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
       <DisableSpecificWarnings>4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-      <AdditionalIncludeDirectories>$(DXSDK_DIR)include;$(VTUNE_AMPLIFIER_XE_2013_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(DXSDK_DIR)include;$(INTELOCLSDKROOT)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
     <Link>
-      <AdditionalDependencies>d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;comsuppw.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;opencl.lib;comsuppw.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <DelayLoadDLLs>d3d9.dll;d3dx9_43.dll;d3d11.dll;d3dx11_43.dll;dxgi.dll;opengl32.dll;%(DelayLoadDLLs)</DelayLoadDLLs>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <SubSystem>Windows</SubSystem>
       <RandomizedBaseAddress>false</RandomizedBaseAddress>
-      <AdditionalLibraryDirectories>$(VTUNE_AMPLIFIER_XE_2013_DIR)lib32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(VTUNE_AMPLIFIER_XE_2015_DIR)lib32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
       <Command>.\postBuild.cmd "$(TargetPath)" "$(TargetName)" $(TargetExt) $(PcsxSubsection)</Command>
diff --git a/plugins/GSdx/vsprops/x64.props b/plugins/GSdx/vsprops/x64.props
index 947675c717..ab8253d819 100644
--- a/plugins/GSdx/vsprops/x64.props
+++ b/plugins/GSdx/vsprops/x64.props
@@ -5,7 +5,7 @@
   <PropertyGroup />
   <ItemDefinitionGroup>
     <Link>
-      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x64;$(INTELOCLSDKROOT)lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <ClCompile>
       <PreprocessorDefinitions>_WIN64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
diff --git a/plugins/GSdx/vsprops/x86.props b/plugins/GSdx/vsprops/x86.props
index ff76b9535d..56171d784c 100644
--- a/plugins/GSdx/vsprops/x86.props
+++ b/plugins/GSdx/vsprops/x86.props
@@ -5,7 +5,7 @@
   <PropertyGroup />
   <ItemDefinitionGroup>
     <Link>
-      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(DXSDK_DIR)Lib\x86;$(INTELOCLSDKROOT)lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
     <ClCompile />
     <ClCompile>