From a1a842b07fba7f2692f772b3f89b5c64115cd9c0 Mon Sep 17 00:00:00 2001
From: gabest11 <gabest11@gmail.com>
Date: Sun, 2 Aug 2015 20:21:49 +0200
Subject: [PATCH] gsdx: date/datm fix for 16-bit frame buffer in sw rendering
 mode

---
 plugins/GSdx/GPURenderer.cpp                  |  10 +-
 plugins/GSdx/GSDrawScanline.cpp               |  10 +-
 .../GSDrawScanlineCodeGenerator.x64.avx.cpp   |   4 +-
 .../GSDrawScanlineCodeGenerator.x86.avx.cpp   |   4 +-
 .../GSDrawScanlineCodeGenerator.x86.avx2.cpp  |   4 +-
 plugins/GSdx/GSRendererCL.cpp                 | 659 ++++++++++++------
 plugins/GSdx/GSRendererCL.h                   |  16 +-
 plugins/GSdx/GSState.cpp                      |   3 +
 plugins/GSdx/GSUtil.cpp                       |   2 +-
 plugins/GSdx/res/tfx.cl                       | 169 +++--
 plugins/GSdx/stdafx.h                         |   7 +
 11 files changed, 599 insertions(+), 289 deletions(-)

diff --git a/plugins/GSdx/GPURenderer.cpp b/plugins/GSdx/GPURenderer.cpp
index e92c5db51b..70062991c0 100644
--- a/plugins/GSdx/GPURenderer.cpp
+++ b/plugins/GSdx/GPURenderer.cpp
@@ -113,13 +113,13 @@ bool GPURenderer::Merge()
 
 	GSVector2i s = st[0]->GetSize();
 
-	GSVector4 sRect[2];
-	GSVector4 dRect[2];
+	GSVector4 sr[2];
+	GSVector4 dr[2];
 
-	sRect[0] = GSVector4(0, 0, 1, 1);
-	dRect[0] = GSVector4(0, 0, s.x, s.y);
+	sr[0] = GSVector4(0, 0, 1, 1);
+	dr[0] = GSVector4(0, 0, s.x, s.y);
 
-	m_dev->Merge(st, sRect, dRect, s, 1, 1, GSVector4(0, 0, 0, 1));
+	m_dev->Merge(st, sr, dr, s, 1, 1, GSVector4(0, 0, 0, 1));
 
 	if(m_shadeboost)
 	{
diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp
index 2604a16488..96616bb48b 100644
--- a/plugins/GSdx/GSDrawScanline.cpp
+++ b/plugins/GSdx/GSDrawScanline.cpp
@@ -1148,7 +1148,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.srl32(15) == GSVector8i::zero();
+						// test |= fd.srl32(15) == GSVector8i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector8i::zero();
 					}
 					else
 					{
@@ -1159,7 +1160,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.sll32(16).sra32(31);
+						test |= fd.sll32(16).sra32(31); // == GSVector8i::xffffffff();
 					}
 					else
 					{
@@ -2264,7 +2265,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.srl32(15) == GSVector4i::zero();
+						// test |= fd.srl32(15) == GSVector4i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector4i::zero();
 					}
 					else
 					{
@@ -2275,7 +2277,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
 				{
 					if(sel.fpsm == 2)
 					{
-						test |= fd.sll32(16).sra32(31);
+						test |= fd.sll32(16).sra32(31); // == GSVector4i::xffffffff();
 					}
 					else
 					{
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
index 56b82ca070..e3954a4129 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
@@ -1326,7 +1326,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(xmm0, xmm0);
-			vpsrld(xmm1, xmm6, 15);
+			//vpsrld(xmm1, xmm6, 15);
+			vpslld(xmm1, xmm6, 16);
+			vpsrad(xmm1, 31);
 			vpcmpeqd(xmm1, xmm0);
 		}
 		else
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
index 33872215c1..f1da5abe29 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
@@ -2330,7 +2330,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(xmm0, xmm0);
-			vpsrld(xmm1, xmm2, 15);
+			//vpsrld(xmm1, xmm2, 15);
+			vpslld(xmm1, xmm2, 16);
+			vpsrad(xmm1, 31);
 			vpcmpeqd(xmm1, xmm0);
 		}
 		else
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
index d616311a1e..2e3c974ea8 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
@@ -2295,7 +2295,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		if(m_sel.fpsm == 2)
 		{
 			vpxor(ymm0, ymm0);
-			vpsrld(ymm1, ymm2, 15);
+			//vpsrld(ymm1, ymm2, 15);
+			vpslld(ymm1, ymm2, 16);
+			vpsrad(ymm1, 31);
 			vpcmpeqd(ymm1, ymm0);
 		}
 		else
diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp
index ade70dae71..e3e4224803 100644
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
@@ -39,7 +39,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
 #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
 #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
 #define TFX_PARAM_SIZE 2048
-#define TFX_PROGRAM_VERSION 1
+#define TFX_MAX_PARAM_COUNT 256
 
 #if MAX_PRIM_PER_BATCH == 64u
 #define BIN_TYPE cl_ulong
@@ -91,6 +91,8 @@ GSRendererCL::GSRendererCL()
 		m_tc_pages[i] = GSVector4i::xffffffff();
 	}
 
+	memset(m_rw_pages_rendering, 0, sizeof(m_rw_pages_rendering));
+
 	#define InitCVB(P) \
 		m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
 		m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer<P, 0, 1>; \
@@ -106,7 +108,7 @@ GSRendererCL::GSRendererCL()
 	// mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be.
 
 	m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
-	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
+	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY, (size_t)m_mem.m_vmsize);
 }
 
 GSRendererCL::~GSRendererCL()
@@ -157,8 +159,6 @@ void GSRendererCL::ResetDevice()
 
 GSTexture* GSRendererCL::GetOutput(int i)
 {
-	Sync(1);
-
 	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
 
 	int w = DISPFB.FBW * 64;
@@ -174,6 +174,14 @@ GSTexture* GSRendererCL::GetOutput(int i)
 
 		const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
 
+		GIFRegBITBLTBUF BITBLTBUF;
+
+		BITBLTBUF.SBP = DISPFB.Block();
+		BITBLTBUF.SBW = DISPFB.FBW;
+		BITBLTBUF.SPSM = DISPFB.PSM;
+
+		InvalidateLocalMem(BITBLTBUF, r);
+
 		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
 
 		m_texture[i]->Update(r, m_output, pitch);
@@ -197,7 +205,7 @@ const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f);
 template<uint32 primclass, uint32 tme, uint32 fst>
 void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
 {
-	GSVector4i off = (GSVector4i)m_context->XYOFFSET;
+	GSVector4i o = (GSVector4i)m_context->XYOFFSET;
 	GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0);
 
 	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
@@ -206,7 +214,7 @@ void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex*
 
 		GSVector4i xyzuvf(src->m[1]);
 
-		dst->p = (GSVector4(xyzuvf.upl16() - off) * g_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints
+		dst->p = (GSVector4(xyzuvf.upl16() - o) * g_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints
 
 		GSVector4 t = GSVector4::zero();
 
@@ -387,7 +395,7 @@ void GSRendererCL::Draw()
 		{
 			// TODO: SIMD
 
-			ASSERT(m_pb_count < 256);
+			ASSERT(m_pb_count < TFX_MAX_PARAM_COUNT);
 
 			uint32 vb_count = m_vb_count | (m_pb_count << 24);
 
@@ -408,18 +416,22 @@ void GSRendererCL::Draw()
 
 		if(bbox.eq(bbox.rintersect(scissor)))
 		{
-			job->sel.noscissor = 1;
+			pb->sel.noscissor = 1;
 		}
 
 		job->rect.x = rect.x;
 		job->rect.y = rect.y;
 		job->rect.z = rect.z;
 		job->rect.w = rect.w;
+		job->sel = pb->sel;
 		job->ib_start = m_cl.ib.tail;
 		job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass);
 		job->fbp = pb->fbp;
 		job->zbp = pb->zbp;
 		job->bw = pb->bw;
+		job->fpsm = context->FRAME.PSM;
+		job->zpsm = context->ZBUF.PSM;
+		job->tpsm = context->TEX0.PSM;
 
 #ifdef DEBUG
 		job->pb = pb;
@@ -502,7 +514,7 @@ void GSRendererCL::Draw()
 
 		// don't buffer too much data, feed them to the device if there is enough
 
-		if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64)
+		if(m_pb_count >= TFX_MAX_PARAM_COUNT || m_vb_count >= 4096)
 		{
 			Enqueue();
 		}
@@ -564,6 +576,8 @@ void GSRendererCL::Sync(int reason)
 		m_rw_pages[1][i] = GSVector4i::zero();
 	}
 
+	for(int i = 0; i < MAX_PAGES; i++) ASSERT(m_rw_pages_rendering[i] == 0);
+
 	m_synced = true;
 }
 
@@ -571,27 +585,77 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 {
 	if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
 	
-	GSOffset* off = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
+	GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
 
-	off->GetPagesAsBits(r, m_tmp_pages);
+	o->GetPagesAsBits(r, m_tmp_pages);
 
 	if(!m_synced)
 	{
-		for(int i = 0; i < 4; i++)
+		int i = 0;
+
+		bool wait;
+
+		do
 		{
-			GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
+			wait = false;
 
-			if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
+			for(; i < 4; i++)
 			{
-				// TODO: an awesome idea to avoid this Sync
-				// - call Enqueue() to flush m_jobs 
-				// - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it
-				// - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written
+				GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
 
-				Sync(3); 
+				if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
+				{
+					// TODO: an awesome idea to avoid this Sync
+					// - call Enqueue() to flush m_jobs 
+					// - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it
+					// - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written
 
-				break;
+					//Sync(3);
+
+					Enqueue();
+
+					wait = true;
+
+					break;
+				}
 			}
+
+			_mm_pause();
+		}
+		while(wait);
+
+		if(!m_synced)
+		{
+			o->GetPages(r, m_tmp_pages2); // TODO: don't ask twice
+			
+			const uint32* p = m_tmp_pages2;
+
+			do
+			{
+				wait = false;
+
+				for(; *p != GSOffset::EOP; p++)
+				{
+					if(m_rw_pages_rendering[*p])
+					{
+						// Sync(5);
+
+						wait = true;
+
+						break;
+					}
+				}
+				/*
+				if(!m_synced)
+				{
+					void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+					m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr);
+				}
+				*/
+
+				_mm_pause();
+			}
+			while(wait);
 		}
 	}
 
@@ -607,9 +671,9 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 	
 	if(!m_synced)
 	{
-		GSOffset* off = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
+		GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
 
-		off->GetPagesAsBits(r, m_tmp_pages);
+		o->GetPagesAsBits(r, m_tmp_pages);
 
 		for(int i = 0; i < 4; i++)
 		{
@@ -622,13 +686,43 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 				break;
 			}
 		}
+
+		if(!m_synced)
+		{
+			o->GetPages(r, m_tmp_pages2); // TODO: don't ask twice
+
+			for(const uint32* p = m_tmp_pages2; *p != GSOffset::EOP; p++)
+			{
+				if(m_rw_pages_rendering[*p] & 0xffff0000)
+				{
+					Sync(6);
+
+					break;
+				}
+			}
+			/*
+			if(!m_synced)
+			{
+				void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+				m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr);
+			}
+			*/
+		}
 	}
 }
 
+typedef struct { GSRendererCL* r; uint32 pages[(MAX_PAGES + 1) * 2]; } cb_data;
+
 void GSRendererCL::Enqueue()
 {
 	if(m_jobs.empty()) return;
 
+	cb_data* data = new cb_data();
+
+	data->r = this;
+
+	UsePages(data->pages);
+
 	try
 	{
 		ASSERT(m_cl.vb.tail > m_cl.vb.head);
@@ -798,6 +892,8 @@ void GSRendererCL::Enqueue()
 
 					std::list<shared_ptr<TFXJob>> jobs(head, next);
 
+					JoinTFX(jobs);
+
 					EnqueueTFX(jobs, bin_count, bin_dim);
 
 					if(total_prim_count > MAX_PRIM_COUNT)
@@ -824,6 +920,19 @@ void GSRendererCL::Enqueue()
 		printf("%s (%d)\n", err.what(), err.err());
 	}
 
+	try
+	{
+		cl::Event e;
+		m_cl.queue[2].enqueueMarker(&e);
+		e.setCallback(CL_COMPLETE, ReleasePageEvent, data);
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+
+		delete [] data;
+	}
+
 	m_jobs.clear();
 
 	m_vb_count = 0;
@@ -837,79 +946,6 @@ void GSRendererCL::Enqueue()
 
 void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim)
 {
-	// join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages
-
-	//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
-
-	auto next = jobs.begin();
-
-	while(next != jobs.end())
-	{
-		auto prev = next++;
-
-		if(next == jobs.end())
-		{
-			break;
-		}
-
-		if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw)
-		{
-			if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL)
-			{
-				bool overlap = false;
-
-				for(int i = 0; i < 4; i++)
-				{
-					if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero()))
-					{
-						overlap = true;
-
-						break;
-					}
-				}
-
-				if(overlap)
-				{
-					continue;
-				}
-			}
-
-			if((*prev)->src_pages != NULL)
-			{
-				GSVector4i* src_pages = (*next)->GetSrcPages();
-
-				for(int i = 0; i < 4; i++)
-				{
-					src_pages[i] |= (*prev)->src_pages[i];
-				}
-			}
-
-			if((*prev)->dst_pages != NULL)
-			{
-				GSVector4i* dst_pages = (*next)->GetDstPages();
-
-				for(int i = 0; i < 4; i++)
-				{
-					dst_pages[i] |= (*prev)->dst_pages[i];
-				}
-			}
-
-			GSVector4i prev_rect = GSVector4i::load<false>(&(*prev)->rect);
-			GSVector4i next_rect = GSVector4i::load<false>(&(*next)->rect);
-
-			GSVector4i::store<false>(&(*next)->rect, prev_rect.runion(next_rect));
-
-			(*next)->prim_count += (*prev)->prim_count;
-			(*next)->ib_start = (*prev)->ib_start;
-
-			jobs.erase(prev);
-		}
-	}
-
-	//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
-
-	//
-
 	cl_kernel tfx_prev = NULL;
 
 	uint32 prim_start = 0;
@@ -920,14 +956,14 @@ void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_co
 
 		tfxcount++;
 
-		UpdateTextureCache(i.get());
-
 		uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start);
 
-		// TODO: tile level z test
-
 		cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
 
+		cl::Buffer* tex = UpdateTextureCache(i.get()) ? &m_cl.tex : &m_cl.vm;
+
+		tfx.setArg(2, sizeof(*tex), tex);
+
 		if(tfx_prev != tfx())
 		{
 			tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
@@ -954,15 +990,145 @@ void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_co
 
 		InvalidateTextureCache(i.get());
 
-		// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
-
 		prim_start += prim_count;
 	}
 }
 
-void GSRendererCL::UpdateTextureCache(TFXJob* job)
+void GSRendererCL::JoinTFX(std::list<shared_ptr<TFXJob>>& jobs)
 {
-	if(job->src_pages == NULL) return;
+	// join tfx kernel calls where the selector and fbp/zbp/bw/fpsm/zpsm are the same and src_pages != prev dst_pages
+
+	//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
+
+	tfxselcount += jobs.size();
+
+	auto next = jobs.begin();
+
+	while(next != jobs.end())
+	{
+		auto prev = next++;
+
+		if(next == jobs.end())
+		{
+			break;
+		}
+
+		TFXSelector prev_sel = (*prev)->sel;
+		TFXSelector next_sel = (*next)->sel;
+
+		prev_sel.ababcd = next_sel.ababcd = 0;
+		prev_sel.wms = next_sel.wms = 0;
+		prev_sel.wmt = next_sel.wmt = 0;
+		prev_sel.noscissor = next_sel.noscissor = prev_sel.noscissor | next_sel.noscissor;
+		prev_sel.merged = next_sel.merged = 0;
+
+		if(prev_sel != next_sel
+		|| (*prev)->fbp != (*next)->fbp
+		|| (*prev)->zbp != (*next)->zbp
+		|| (*prev)->bw != (*next)->bw
+		|| (*prev)->fpsm != (*next)->fpsm
+		|| (*prev)->zpsm != (*next)->zpsm)
+		{
+			continue;
+		}
+
+		if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL)
+		{
+			bool overlap = false;
+
+			for(int i = 0; i < 4; i++)
+			{
+				if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero()))
+				{
+					overlap = true;
+
+					break;
+				}
+			}
+
+			if(overlap)
+			{
+				continue;
+			}
+		}
+
+		if((*prev)->src_pages != NULL)
+		{
+			GSVector4i* src_pages = (*next)->GetSrcPages();
+
+			for(int i = 0; i < 4; i++)
+			{
+				src_pages[i] |= (*prev)->src_pages[i];
+			}
+		}
+
+		if((*prev)->dst_pages != NULL)
+		{
+			GSVector4i* dst_pages = (*next)->GetDstPages();
+
+			for(int i = 0; i < 4; i++)
+			{
+				dst_pages[i] |= (*prev)->dst_pages[i];
+			}
+		}
+
+		GSVector4i prev_rect = GSVector4i::load<false>(&(*prev)->rect);
+		GSVector4i next_rect = GSVector4i::load<false>(&(*next)->rect);
+
+		GSVector4i::store<false>(&(*next)->rect, prev_rect.runion(next_rect));
+
+		(*next)->prim_count += (*prev)->prim_count;
+		(*next)->ib_start = (*prev)->ib_start;
+
+		(*next)->sel = next_sel;
+		(*next)->sel.merged = 1;
+
+		jobs.erase(prev);
+
+		//if((*prev)->sel != (*next)->sel) printf("%d %016llx %016llx\n", jobs.size(), (*prev)->sel.key, (*next)->sel.key);
+	}
+
+	tfxdiffselcount += jobs.size();
+
+	//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
+}
+
+bool GSRendererCL::UpdateTextureCache(TFXJob* job)
+{
+	if(job->src_pages == NULL) return false;
+
+	bool overlap = false;
+	bool invalid = false;
+
+	if(job->dst_pages != NULL)
+	{
+		bool can_overlap = job->sel.fwrite && GSUtil::HasSharedBits(job->tpsm, job->fpsm) || job->sel.zwrite && GSUtil::HasSharedBits(job->tpsm, job->zpsm);
+
+		for(int i = 0; i < 4; i++)
+		{
+			if(!(job->src_pages[i] & job->dst_pages[i]).eq(GSVector4i::zero()))
+			{
+				overlap = can_overlap; // gow, re4
+			}
+
+			if(!(m_tc_pages[i] & job->src_pages[i]).eq(GSVector4i::zero()))
+			{
+				invalid = true;
+			}
+		}
+	}
+
+	if(!invalid)
+	{
+		return true; // all needed pages are valid in texture cache, use it
+	}
+
+	if(!overlap)
+	{
+		return false; // no overlap, but has invalid pages, don't use texture cache
+	}
+
+	// overlap && invalid, update and use texture cache
 
 	int count = 0;
 
@@ -972,19 +1138,17 @@ void GSRendererCL::UpdateTextureCache(TFXJob* job)
 
 		if(pages.eq(GSVector4i::zero())) continue;
 
-		size_t page_size = 8192;
+		m_tc_pages[i] &= ~job->src_pages[i];
 
-		// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
-
-		if(0) for(int j = 0; j < 4; j++)
+		for(int j = 0; j < 4; j++)
 		{
 			if(pages.u32[j] == 0) continue;
 
 			if(pages.u32[j] == 0xffffffff)
 			{
-				size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * page_size;
+				size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * PAGE_SIZE;
 
-				m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 32);
+				m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 32);
 
 				if(LOG) { fprintf(s_fp, "tc (%d x32)\n", offset >> 13); fflush(s_fp); }
 
@@ -1002,9 +1166,9 @@ void GSRendererCL::UpdateTextureCache(TFXJob* job)
 
 				if(b == 0xff)
 				{
-					size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * page_size;
+					size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * PAGE_SIZE;
 
-					m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 8);
+					m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 8);
 
 					if(LOG) { fprintf(s_fp, "tc (%d x8)\n", offset >> 13); fflush(s_fp); }
 
@@ -1018,9 +1182,9 @@ void GSRendererCL::UpdateTextureCache(TFXJob* job)
 				{
 					if(b & (1 << l))
 					{
-						size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * page_size;
+						size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * PAGE_SIZE;
 
-						m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size);
+						m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE);
 
 						if(LOG) { fprintf(s_fp, "tc (%d x1)\n", offset >> 13); fflush(s_fp); }
 
@@ -1030,23 +1194,100 @@ void GSRendererCL::UpdateTextureCache(TFXJob* job)
 				}
 			}
 		}
-
-		m_tc_pages[i] &= ~job->src_pages[i];
 	}
 
 	if(count > 0)
 	{
 		pageuploads += count;
 	}
+
+	return true;
 }
 
 void GSRendererCL::InvalidateTextureCache(TFXJob* job)
 {
 	if(job->dst_pages == NULL) return;
 
-	for(int j = 0; j < 4; j++)
+	for(int i = 0; i < 4; i++)
 	{
-		m_tc_pages[j] |= job->dst_pages[j];
+		m_tc_pages[i] |= job->dst_pages[i];
+	}
+}
+
+void GSRendererCL::UsePages(uint32* p)
+{
+	for(int l = 0; l < 2; l++)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			GSVector4i* v = &m_rw_pages[l][i];
+
+			if(v->eq(GSVector4i::zero())) continue;
+
+			for(int j = 0; j < 4; j++)
+			{
+				unsigned long index;
+				unsigned long mask = v->u32[j];
+
+				if(mask == 0) continue;
+
+				int o = (i << 7) | (j << 5);
+
+				if(mask == 0xffffffff)
+				{
+					for(int index = 0; index < 32; index++)
+					{
+						_InterlockedIncrement16((short*)&m_rw_pages_rendering[index | o] + l);
+
+						*p++ = index | o;
+					}
+				}
+				else
+				{
+					while(_BitScanForward(&index, mask))
+					{
+						mask &= ~(1 << index);
+
+						_InterlockedIncrement16((short*)&m_rw_pages_rendering[index | o] + l);
+
+						*p++ = index | o;
+					}
+				}
+			}
+
+			*v = GSVector4i::zero();
+		}
+
+		*p++ = GSOffset::EOP;
+	}
+}
+
+void GSRendererCL::ReleasePages(uint32* pages)
+{
+	const uint32* p = pages;
+
+	for(; *p != GSOffset::EOP; p++)
+	{
+		_InterlockedDecrement16((short*)&m_rw_pages_rendering[*p] + 0);
+	}
+
+	p++;
+
+	for(; *p != GSOffset::EOP; p++)
+	{
+		_InterlockedDecrement16((short*)&m_rw_pages_rendering[*p] + 1);
+	}
+}
+
+void CL_CALLBACK GSRendererCL::ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data)
+{
+	if(event_command_exec_status == CL_COMPLETE)
+	{
+		cb_data* data = (cb_data*)user_data;
+		
+		data->r->ReleasePages(data->pages);
+
+		delete data;
 	}
 }
 
@@ -1079,12 +1320,14 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	const GSDrawingContext* context = m_context;
 	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
 
-	job->sel.key = 0;
+	TFXSelector sel;
 
-	job->sel.atst = ATST_ALWAYS;
-	job->sel.tfx = TFX_NONE;
-	job->sel.ababcd = 0xff;
-	job->sel.prim = primclass;
+	sel.key = 0;
+
+	sel.atst = ATST_ALWAYS;
+	sel.tfx = TFX_NONE;
+	sel.ababcd = 0xff;
+	sel.prim = primclass;
 
 	uint32 fm = context->FRAME.FBMSK;
 	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
@@ -1107,9 +1350,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	{
 		if(!TryAlphaTest(fm, zm))
 		{
-			job->sel.atst = context->TEST.ATST;
-			job->sel.afail = context->TEST.AFAIL;
+			sel.atst = context->TEST.ATST;
+			sel.afail = context->TEST.AFAIL;
 			pb->aref = context->TEST.AREF;
+
+			switch(sel.atst)
+			{
+			case ATST_LESS:
+				sel.atst = ATST_LEQUAL;
+				pb->aref--;
+				break;
+			case ATST_GREATER:
+				sel.atst = ATST_GEQUAL;
+				pb->aref++;
+				break;
+			}
 		}
 	}
 
@@ -1137,31 +1392,31 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 	if(!fwrite && !zwrite) return false;
 
-	bool ftest = job->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+	bool ftest = sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
 	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
 
-	job->sel.fwrite = fwrite;
-	job->sel.ftest = ftest;
-	job->sel.zwrite = zwrite;
-	job->sel.ztest = ztest;
+	sel.fwrite = fwrite;
+	sel.ftest = ftest;
+	sel.zwrite = zwrite;
+	sel.ztest = ztest;
 
 	if(fwrite || ftest)
 	{
-		job->sel.fpsm = RemapPSM(context->FRAME.PSM);
+		sel.fpsm = RemapPSM(context->FRAME.PSM);
 
 		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
 		{
-			job->sel.iip = PRIM->IIP;
+			sel.iip = PRIM->IIP;
 		}
 
 		if(PRIM->TME)
 		{
-			job->sel.tfx = context->TEX0.TFX;
-			job->sel.tcc = context->TEX0.TCC;
-			job->sel.fst = PRIM->FST;
-			job->sel.ltf = m_vt.IsLinear();
-			job->sel.tpsm = RemapPSM(context->TEX0.PSM);
-			job->sel.aem = m_env.TEXA.AEM;
+			sel.tfx = context->TEX0.TFX;
+			sel.tcc = context->TEX0.TCC;
+			sel.fst = PRIM->FST;
+			sel.ltf = m_vt.IsLinear();
+			sel.tpsm = RemapPSM(context->TEX0.PSM);
+			sel.aem = m_env.TEXA.AEM;
 
 			pb->tbp[0] = context->TEX0.TBP0;
 			pb->tbw[0] = context->TEX0.TBW;
@@ -1170,30 +1425,30 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
 			{
-				job->sel.tlu = 1;
+				sel.tlu = 1;
 
 				memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
 			}
 
-			job->sel.wms = context->CLAMP.WMS;
-			job->sel.wmt = context->CLAMP.WMT;
+			sel.wms = ((uint32)context->CLAMP.WMS + 1) & 3;
+			sel.wmt = ((uint32)context->CLAMP.WMT + 1) & 3;
 
-			if(job->sel.tfx == TFX_MODULATE && job->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			if(sel.tfx == TFX_MODULATE && sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
 			{
 				// modulate does not do anything when vertex color is 0x80
 
-				job->sel.tfx = TFX_DECAL;
+				sel.tfx = TFX_DECAL;
 			}
 
 			GSVector4i r;
 
-			GetTextureMinMax(r, context->TEX0, context->CLAMP, job->sel.ltf);
+			GetTextureMinMax(r, context->TEX0, context->CLAMP, sel.ltf);
 
 			GSVector4i* src_pages = job->GetSrcPages();
 
-			GSOffset* off = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
+			GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
 			
-			off->GetPagesAsBits(r, m_tmp_pages);
+			o->GetPagesAsBits(r, m_tmp_pages);
 
 			for(int i = 0; i < 4; i++)
 			{
@@ -1212,15 +1467,15 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 				if(m_vt.m_lod.x > 0)
 				{
-					job->sel.ltf = context->TEX1.MMIN >> 2;
+					sel.ltf = context->TEX1.MMIN >> 2;
 				}
 				else
 				{
 					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
 				}
 
-				job->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
-				job->sel.lcm = context->TEX1.LCM;
+				sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				sel.lcm = context->TEX1.LCM;
 
 				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
 				int k = context->TEX1.K << 12;
@@ -1229,28 +1484,28 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 				{
 					k = (int)m_vt.m_lod.x << 16; // set lod to max level
 
-					job->sel.lcm = 1; // lod is constant
-					job->sel.mmin = 1; // tri-linear is meaningless
+					sel.lcm = 1; // lod is constant
+					sel.mmin = 1; // tri-linear is meaningless
 				}
 
-				if(job->sel.mmin == 2)
+				if(sel.mmin == 2)
 				{
 					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
 				}
 
-				if(job->sel.fst)
+				if(sel.fst)
 				{
-					ASSERT(job->sel.lcm == 1);
+					ASSERT(sel.lcm == 1);
 					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
 
-					job->sel.lcm = 1;
+					sel.lcm = 1;
 				}
 
-				if(job->sel.lcm)
+				if(sel.lcm)
 				{
 					int lod = std::max<int>(std::min<int>(k, mxl), 0);
 
-					if(job->sel.mmin == 1)
+					if(sel.mmin == 1)
 					{
 						lod = (lod + 0x8000) & 0xffff0000; // rounding
 					}
@@ -1322,11 +1577,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 					GSVector4i r;
 
-					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, job->sel.ltf);
+					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, sel.ltf);
 
-					GSOffset* off = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
+					GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
 					
-					off->GetPagesAsBits(r, m_tmp_pages);
+					o->GetPagesAsBits(r, m_tmp_pages);
 
 					for(int i = 0; i < 4; i++)
 					{
@@ -1341,7 +1596,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 			}
 			else
 			{
-				if(job->sel.fst == 0)
+				if(sel.fst == 0)
 				{
 					// skip per pixel division if q is constant
 
@@ -1349,7 +1604,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 					if(m_vt.m_eq.q)
 					{
-						job->sel.fst = 1;
+						sel.fst = 1;
 
 						const GSVector4& t = v[index[0]].t;
 
@@ -1367,7 +1622,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 					}
 					else if(primclass == GS_SPRITE_CLASS)
 					{
-						job->sel.fst = 1;
+						sel.fst = 1;
 
 						for(int i = 0, j = vertex_count; i < j; i += 2)
 						{
@@ -1381,24 +1636,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 						}
 					}
 				}
-
-				if(job->sel.ltf && job->sel.fst) // TODO: quite slow, do this in the prim kernel?
-				{
-					// if q is constant we can do the half pel shift for bilinear sampling on the vertices
-
-					// TODO: but not when mipmapping is used!!!
-
-					GSVector4 half(8.0f, 8.0f);
-
-					GSVertexCL* RESTRICT v = vertex;
-
-					for(int i = 0, j = vertex_count; i < j; i++)
-					{
-						GSVector4 t = v[i].t;
-
-						v[i].t = (t - half).xyzw(t);
-					}
-				}
 			}
 
 			int tw = 1 << context->TEX0.TW;
@@ -1459,58 +1696,58 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 		if(PRIM->FGE)
 		{
-			job->sel.fge = 1;
+			sel.fge = 1;
 			pb->fog = env.FOGCOL.u32[0];
 		}
 
 		if(context->FRAME.PSM != PSM_PSMCT24)
 		{
-			job->sel.date = context->TEST.DATE;
-			job->sel.datm = context->TEST.DATM;
+			sel.date = context->TEST.DATE;
+			sel.datm = context->TEST.DATM;
 		}
 
 		if(!IsOpaque())
 		{
-			job->sel.abe = PRIM->ABE;
-			job->sel.ababcd = context->ALPHA.u32[0];
+			sel.abe = PRIM->ABE;
+			sel.ababcd = context->ALPHA.u32[0];
 
 			if(env.PABE.PABE)
 			{
-				job->sel.pabe = 1;
+				sel.pabe = 1;
 			}
 
 			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
 			{
-				job->sel.aa1 = 1;
+				sel.aa1 = 1;
 			}
 
 			pb->afix = context->ALPHA.FIX;
 		}
 
-		if(job->sel.date || job->sel.aba == 1 || job->sel.abb == 1 || job->sel.abc == 1 || job->sel.abd == 1)
+		if(sel.date || sel.aba == 1 || sel.abb == 1 || sel.abc == 1 && (sel.fpsm & 3) != 1 || sel.abd == 1)
 		{
-			job->sel.rfb = 1;
+			sel.rfb = 1;
 		}
 		else
 		{
 			if(fwrite)
 			{
-				if(job->sel.atst != ATST_ALWAYS && job->sel.afail == AFAIL_RGB_ONLY
-				|| (job->sel.fpsm & 3) == 0 && fm != 0
-				|| (job->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
-				|| (job->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
+				if(sel.atst != ATST_ALWAYS && sel.afail == AFAIL_RGB_ONLY
+				|| (sel.fpsm & 3) == 0 && fm != 0
+				|| (sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
+				|| (sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
 				{
-					job->sel.rfb = 1;
+					sel.rfb = 1;
 				}
 			}
 		}
 
-		job->sel.colclamp = env.COLCLAMP.CLAMP;
-		job->sel.fba = context->FBA.FBA;
+		sel.colclamp = env.COLCLAMP.CLAMP;
+		sel.fba = context->FBA.FBA;
 
 		if(env.DTHE.DTHE)
 		{
-			job->sel.dthe = 1;
+			sel.dthe = 1;
 
 			GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
 			GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
@@ -1523,21 +1760,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 	if(zwrite || ztest)
 	{
-		job->sel.zpsm = RemapPSM(context->ZBUF.PSM);
-		job->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+		sel.zpsm = RemapPSM(context->ZBUF.PSM);
+		sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
 
 		if(ztest)
 		{
-			job->sel.rzb = 1;
+			sel.rzb = 1;
 		}
 		else
 		{
 			if(zwrite)
 			{
-				if(job->sel.atst != ATST_ALWAYS && (job->sel.afail == AFAIL_FB_ONLY || job->sel.afail == AFAIL_RGB_ONLY)
-				|| (job->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
+				if(sel.atst != ATST_ALWAYS && (sel.afail == AFAIL_FB_ONLY || sel.afail == AFAIL_RGB_ONLY)
+				|| (sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
 				{
-					job->sel.rzb = 1;
+					sel.rzb = 1;
 				}
 			}
 		}
@@ -1546,11 +1783,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	pb->fm = fm;
 	pb->zm = zm;
 
-	if((job->sel.fpsm & 3) == 1)
+	if((sel.fpsm & 3) == 1)
 	{
 		pb->fm |= 0xff000000;
 	}
-	else if((job->sel.fpsm & 3) >= 2)
+	else if((sel.fpsm & 3) >= 2)
 	{
 		uint32 rb = pb->fm & 0x00f800f8;
 		uint32 ga = pb->fm & 0x8000f800;
@@ -1558,11 +1795,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 		pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
 	}
 
-	if((job->sel.zpsm & 3) == 1)
+	if((sel.zpsm & 3) == 1)
 	{
 		pb->zm |= 0xff000000;
 	}
-	else if((job->sel.zpsm & 3) >= 2)
+	else if((sel.zpsm & 3) >= 2)
 	{
 		pb->zm |= 0xffff0000;
 	}
@@ -1571,6 +1808,8 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	pb->zbp = context->ZBUF.Block();
 	pb->bw = context->FRAME.FBW;
 
+	pb->sel = sel;
+
 	return true;
 }
 
@@ -1745,8 +1984,6 @@ void GSRendererCL::CL::Unmap()
 
 cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
 {
-	// TODO: cache binary on disk
-
 	cl::Program program;
 
 	if(version >= 120)
@@ -1850,7 +2087,7 @@ cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
 					fclose(f);
 				}
 
-				delete[] binaries[i];
+				delete [] binaries[i];
 			}
 		}
 		catch(cl::Error err)
@@ -1993,6 +2230,7 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
 	opt << "-D AEM=" << sel.aem << " ";
 	opt << "-D FB=" << sel.fb << " ";
 	opt << "-D ZB=" << sel.zb << " ";
+	opt << "-D MERGED=" << sel.merged << " ";
 
 	cl::Kernel k = Build(entry, opt);
 
@@ -2000,7 +2238,6 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
 
 	k.setArg(0, env);
 	k.setArg(1, vm);
-	k.setArg(2, tex);
 
 	return tfx_map[sel];
 }
diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h
index e0afe67a22..81ec47ba49 100644
--- a/plugins/GSdx/GSRendererCL.h
+++ b/plugins/GSdx/GSRendererCL.h
@@ -108,6 +108,7 @@ class GSRendererCL : public GSRenderer
 			uint32 noscissor:1; // 53
 			uint32 tpsm:4; // 54
 			uint32 aem:1; // 58
+			uint32 merged:1; // 59
 			// TODO
 		};
 
@@ -148,6 +149,7 @@ class GSRendererCL : public GSRenderer
 	{
 		GSVector4i scissor;
 		GSVector4i dimx; // 4x4 signed char
+		TFXSelector sel;
 		uint32 fbp, zbp, bw;
 		uint32 fm, zm;
 		uint32 fog; // rgb
@@ -172,6 +174,7 @@ class GSRendererCL : public GSRenderer
 		GSVector4i* src_pages; // read by any texture level
 		GSVector4i* dst_pages; // f/z writes to it
 		uint32 fbp, zbp, bw;
+		uint32 fpsm, zpsm, tpsm;
 #ifdef DEBUG
 		TFXParameter* pb;
 #endif
@@ -227,16 +230,23 @@ class GSRendererCL : public GSRenderer
 
 	void Enqueue();
 	void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
-	void UpdateTextureCache(TFXJob* job);
+	void JoinTFX(std::list<shared_ptr<TFXJob>>& jobs);
+	bool UpdateTextureCache(TFXJob* job);
 	void InvalidateTextureCache(TFXJob* job);
+	void UsePages(uint32* pages);
+	void ReleasePages(uint32* pages);
+
+	static void CL_CALLBACK ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data);
 
 protected:
 	GSTexture* m_texture[2];
 	uint8* m_output;
 	
 	GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
-	GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?)
-	GSVector4i m_tmp_pages[4]; // TODO: this should be block level, too many overlaps inside pages with render targets
+	GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) // TODO: this should be block level, too many overlaps inside pages with render targets
+	GSVector4i m_tmp_pages[4];
+	uint32 m_tmp_pages2[MAX_PAGES + 1];
+	uint32 m_rw_pages_rendering[512]; // pages that are currently in-use
 
 	void Reset();
 	void VSync(int field);
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index 0afa73e855..a6f1730f7e 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -66,6 +66,9 @@ GSState::GSState()
 	//s_dump = 1;
 	//s_save = 1;
 	//s_savez = 1;
+	//s_savet = 1;
+	//s_savef = 1;
+	//s_saven = 656;
 
 	UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;
 	m_crc_hack_level = theApp.GetConfig("crc_hack_level", 3);
diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp
index f29b22812a..02950970b5 100644
--- a/plugins/GSdx/GSUtil.cpp
+++ b/plugins/GSdx/GSUtil.cpp
@@ -224,7 +224,7 @@ bool GSUtil::CheckSSE()
 	return true;
 }
 
-#define OCL_PROGRAM_VERSION 1
+#define OCL_PROGRAM_VERSION 3
 
 #ifdef ENABLE_OPENCL
 void GSUtil::GetDeviceDescs(list<OCLDeviceDesc>& dl)
diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl
index 0ecd98f6db..91bebeed11 100644
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@@ -1,3 +1,9 @@
+#if defined(CL_VERSION_2_0)
+
+#error hello
+
+#endif
+
 #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
 
 #ifdef cl_amd_printf
@@ -26,6 +32,13 @@
 	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
 #endif
 
+#define TFX_ABA(sel) ((sel.x >> 24) & 3)
+#define TFX_ABB(sel) ((sel.x >> 26) & 3)
+#define TFX_ABC(sel) ((sel.x >> 28) & 3)
+#define TFX_ABD(sel) ((sel.x >> 30) & 3)
+#define TFX_WMS(sel) ((sel.y >>  8) & 3)
+#define TFX_WMT(sel) ((sel.y >> 10) & 3)
+
 typedef struct
 {
 	union {float4 p; struct {float x, y; uint z, f;};};
@@ -35,9 +48,9 @@ typedef struct
 typedef struct
 {
 	gs_vertex v[3];
-	uint zmin;
+	uint zmin, zmax;
 	uint pb_index;
-	uint _pad[2];
+	uint _pad;
 } gs_prim;
 
 typedef struct
@@ -60,6 +73,7 @@ typedef struct
 {
 	int4 scissor;
 	char dimx[4][4];
+	uint2 sel;
 	int fbp, zbp, bw;
 	uint fm, zm;
 	uchar4 fog; // rgb
@@ -110,10 +124,10 @@ enum GS_TFX
 
 enum GS_CLAMP
 {
-	CLAMP_REPEAT		= 0,
-	CLAMP_CLAMP			= 1,
-	CLAMP_REGION_CLAMP	= 2,
-	CLAMP_REGION_REPEAT	= 3,
+	CLAMP_REGION_REPEAT	= 0,
+	CLAMP_REPEAT		= 1,
+	CLAMP_CLAMP			= 2,
+	CLAMP_REGION_CLAMP	= 3,
 };
 
 enum GS_ZTST
@@ -603,6 +617,7 @@ __kernel void KERNEL_PRIM(
 		// only interpolate the relative to zmin and hopefully small values
 
 		uint zmin = min(min(v0->z, v1->z), v2->z);
+		uint zmax = max(max(v0->z, v1->z), v2->z);
 		
 		prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
 		prim->v[0].tc = v0->tc;
@@ -612,6 +627,7 @@ __kernel void KERNEL_PRIM(
 		prim->v[2].tc = v2->tc;
 
 		prim->zmin = zmin;
+		prim->zmax = zmax;
 
 		float4 dp0 = v1->p - v0->p;
 		float4 dp1 = v0->p - v2->p;
@@ -969,27 +985,34 @@ bool DestAlphaTest(uint fd)
 
 int Wrap(int a, int b, int c, int mode)
 {
-	switch(mode)
+	if(MERGED)
 	{
-	case CLAMP_REPEAT:
-		return a & b;
-	case CLAMP_CLAMP:
-		return clamp(a, 0, c);
-	case CLAMP_REGION_CLAMP:
-		return clamp(a, b, c);
-	case CLAMP_REGION_REPEAT:
-		return (a & b) | c;
+		return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0);
+	}
+	else
+	{
+		switch(mode)
+		{
+		case CLAMP_REGION_REPEAT:
+			return (a & b) | c;
+		case CLAMP_REPEAT:
+			return a & b;
+		case CLAMP_CLAMP:
+			return clamp(a, 0, c);
+		case CLAMP_REGION_CLAMP:
+			return clamp(a, b, c);
+		}
 	}
 }
 
-int4 AlphaBlend(int4 c, int afix, uint fd)
+int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel)
 {
 	if(FWRITE && (ABE || AA1))
 	{
 		int4 cs = c;
 		int4 cd;
 
-		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1)
+		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED)
 		{
 			if(is32bit(FPSM) || is24bit(FPSM))
 			{
@@ -1007,50 +1030,69 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
 			}
 		}
 
-		if(ABA != ABB)
+		if(MERGED)
 		{
-			switch(ABA)
-			{
-			case 0: break; // c.xyz = cs.xyz;
-			case 1: c.xyz = cd.xyz; break;
-			case 2: c.xyz = 0; break;
-			}
+			int aba = TFX_ABA(sel);
+			int abb = TFX_ABB(sel);
+			int abc = TFX_ABC(sel);
+			int abd = TFX_ABD(sel);
 
-			switch(ABB)
-			{
-			case 0: c.xyz -= cs.xyz; break;
-			case 1: c.xyz -= cd.xyz; break;
-			case 2: break;
-			}
+			int ad = !is24bit(FPSM) ? cd.w : 0x80;
 
-			if(!(is24bit(FPSM) && ABC == 1))
-			{
-				int a = 0;
+			int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0;
+			int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0;
+			int C = abc == 0 ? cs.w : abc == 1 ? ad : afix;
+			int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0;
 
-				switch(ABC)
-				{
-				case 0: a = cs.w; break;
-				case 1: a = cd.w; break;
-				case 2: a = afix; break;
-				}
-
-				c.xyz = c.xyz * a >> 7;
-			}
-
-			switch(ABD)
-			{
-			case 0: c.xyz += cs.xyz; break;
-			case 1: c.xyz += cd.xyz; break;
-			case 2: break;
-			}
+			c.xyz = (mul24(A - B, C) >> 7) + D;
 		}
 		else
 		{
-			switch(ABD)
+			if(ABA != ABB)
 			{
-			case 0: break;
-			case 1: c.xyz = cd.xyz; break;
-			case 2: c.xyz = 0; break;
+				switch(ABA)
+				{
+				case 0: break; // c.xyz = cs.xyz;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
+
+				switch(ABB)
+				{
+				case 0: c.xyz -= cs.xyz; break;
+				case 1: c.xyz -= cd.xyz; break;
+				case 2: break;
+				}
+
+				if(!(is24bit(FPSM) && ABC == 1))
+				{
+					int a = 0;
+
+					switch(ABC)
+					{
+					case 0: a = cs.w; break;
+					case 1: a = cd.w; break;
+					case 2: a = afix; break;
+					}
+
+					c.xyz = c.xyz * a >> 7;
+				}
+
+				switch(ABD)
+				{
+				case 0: c.xyz += cs.xyz; break;
+				case 1: c.xyz += cd.xyz; break;
+				case 2: break;
+				}
+			}
+			else
+			{
+				switch(ABD)
+				{
+				case 0: break;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
 			}
 		}
 
@@ -1150,8 +1192,6 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
 		if(!FST)
 		{
 			uv = convert_int2_rte(t.xy * native_recip(t.z));
-
-			if(LTF) uv -= 0x0008;
 		}
 		else
 		{
@@ -1167,15 +1207,17 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
 			uv = convert_int2(t.xy); 
 		}
 
+		if(LTF) uv -= 0x0008;
+
 		int2 uvf = uv & 0x000f;
 
 		int2 uv0 = uv >> 4;
 		int2 uv1 = uv0 + 1;
 
-		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
-		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
-		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
-		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
+		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
+		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
 
 		int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
 		int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
@@ -1356,6 +1398,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 			{
 				// TODO: aa1: draw edge as a line
 
+				if(!ZTest(prim->zmax, zd))
+				{
+					continue;
+				}
+
 				__global gs_barycentric* b = &barycentric[prim_index + i];
 
 				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
@@ -1420,8 +1467,6 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 
 			if(TFX != TFX_NONE)
 			{
-				tex = vm; // TODO: use the texture cache
-
 				ct = SampleTexture(tex, pb, t);
 			}
 
@@ -1515,7 +1560,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 
 			// alpha blend
 
-			c = AlphaBlend(c, pb->afix, fd);
+			c = AlphaBlend(c, fd, pb->afix, pb->sel);
 
 			// write frame
 
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index 72348dcd9d..7c94b4d67c 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -123,8 +123,15 @@ using namespace std;
 	#include <GL/wglext.h>
 	#include "GLLoader.h"
 
+	#if _MSC_VER >= 1800 
+	#include <unordered_map>
+	#include <unordered_set>
+	#define hash_map unordered_map
+	#define hash_set unordered_set
+	#else
 	#include <hash_map>
 	#include <hash_set>
+	#endif
 
 	using namespace stdext;