diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 77693579d6..ff7aa4d43f 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; #define BIN_SIZE (1u << BIN_SIZE_BITS) #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) +#define TFX_PARAM_SIZE 2048 #if MAX_PRIM_PER_BATCH == 64u #define BIN_TYPE cl_ulong @@ -72,6 +73,7 @@ typedef struct GSRendererCL::GSRendererCL() : m_vb_count(0) + , m_synced(true) { m_nativeres = true; // ignore ini, sw is always native @@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL() InitCVB(GS_TRIANGLE_CLASS); InitCVB(GS_SPRITE_CLASS); + // NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without + // mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be. 
+ m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize); } @@ -122,13 +127,17 @@ static int pageuploads = 0; static int pageuploadcount = 0; static int tfxcount = 0; static int64 tfxpixels = 0; +static int tfxselcount = 0; +static int tfxdiffselcount = 0; void GSRendererCL::VSync(int field) { GSRenderer::VSync(field); //printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels); + //printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount); pageuploads = pageuploadcount = tfxcount = tfxpixels = 0; + tfxselcount = tfxdiffselcount = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); } @@ -284,7 +293,9 @@ void GSRendererCL::Draw() { size_t vb_size = m_vertex.next * sizeof(GSVertexCL); size_t ib_size = m_index.tail * sizeof(uint32); - size_t pb_size = sizeof(TFXParameter); + size_t pb_size = TFX_PARAM_SIZE; + + ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE); if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size) { @@ -366,12 +377,16 @@ void GSRendererCL::Draw() m_vb_start = m_cl.vb.tail; m_vb_count = 0; + m_pb_start = m_cl.pb.tail; + m_pb_count = 0; } else { // TODO: SIMD - uint32 vb_count = m_vb_count; + ASSERT(m_pb_count < 256); + + uint32 vb_count = m_vb_count | (m_pb_count << 24); for(size_t i = 0; i < m_index.tail; i++) { @@ -398,21 +413,25 @@ void GSRendererCL::Draw() job->rect.z = rect.z; job->rect.w = rect.w; job->ib_start = m_cl.ib.tail; - job->ib_count = m_index.tail; - job->pb_start = m_cl.pb.tail; + job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass); + job->fbp = pb->fbp; + job->zbp = pb->zbp; + job->bw = pb->bw; #ifdef DEBUG - job->param = pb; + job->pb = pb; #endif - m_jobs.push_back(job); m_vb_count += m_vertex.next; + m_pb_count++; m_cl.vb.tail += vb_size; m_cl.ib.tail += ib_size; 
m_cl.pb.tail += pb_size; + m_synced = false; + // mark pages used in rendering as source or target if(job->sel.fwrite || job->sel.rfb) @@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason) m_rw_pages[1][i] = GSVector4i::zero(); } - // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, - // though it does not seem to be necessary even with GPU devices where it might be cached, - // needs more testing... - - //void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); - //m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr); + m_synced = true; } void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) @@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS o->GetPagesAsBits(r, m_tmp_pages); - //if(!synced) + if(!m_synced) { for(int i = 0; i < 4; i++) { @@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS { if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - //if(!synced) + if(!m_synced) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); @@ -620,16 +634,7 @@ void GSRendererCL::Enqueue() int primclass = m_jobs.front()->sel.prim; - uint32 n; - - switch(primclass) - { - case GS_POINT_CLASS: n = 1; break; - case GS_LINE_CLASS: n = 2; break; - case GS_TRIANGLE_CLASS: n = 3; break; - case GS_SPRITE_CLASS: n = 2; break; - default: __assume(0); - } + uint32 n = GSUtil::GetClassVertexCount(primclass); PrimSelector psel; @@ -678,8 +683,6 @@ void GSRendererCL::Enqueue() // - cl_kernel tfx_prev = NULL; - auto head = m_jobs.begin(); while(head != m_jobs.end()) @@ -692,8 +695,8 @@ void GSRendererCL::Enqueue() { auto job = next++; - uint32 cur_prim_count = (*job)->ib_count / n; - uint32 next_prim_count = next != m_jobs.end() ? 
(*next)->ib_count / n : 0; + uint32 cur_prim_count = (*job)->prim_count; + uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0; total_prim_count += cur_prim_count; @@ -775,9 +778,8 @@ void GSRendererCL::Enqueue() uint32 group_count = batch_count * item_count; tk.setArg(1, (cl_uint)prim_count); - tk.setArg(2, (cl_uint)batch_count); - tk.setArg(3, (cl_uint)bin_count); - tk.setArg(4, bin_dim); + tk.setArg(2, (cl_uint)bin_count); + tk.setArg(3, bin_dim); m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); } @@ -789,68 +791,20 @@ } } - // + std::list<shared_ptr<TFXJob>> jobs(head, next); - uint32 prim_start = 0; - - for(auto i = head; i != next; i++) - { - ASSERT(prim_start < MAX_PRIM_COUNT); - - // TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same - // move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim - - tfxcount++; - - UpdateTextureCache((*i).get()); - - uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - - // TODO: tile level z test - - cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel); - - if(tfx_prev != tfx()) - { - tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); - - tfx_prev = tfx(); - } - - tfx.setArg(4, (cl_uint)(*i)->pb_start); - tfx.setArg(5, (cl_uint)prim_start); - tfx.setArg(6, (cl_uint)prim_count_inner); - tfx.setArg(7, (cl_uint)batch_count); - tfx.setArg(8, (cl_uint)bin_count); - tfx.setArg(9, bin_dim); - - GSVector4i r = GSVector4i::load(&(*i)->rect); - - r = r.ralign(GSVector2i(8, 8)); - - m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); - - tfxpixels += r.width() * r.height(); - - InvalidateTextureCache((*i).get()); - - // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily - - prim_start += prim_count_inner; - } - 
- // + EnqueueTFX(jobs, bin_count, bin_dim); if(total_prim_count > MAX_PRIM_COUNT) { prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); (*job)->ib_start += prim_count * n * sizeof(uint32); - (*job)->ib_count -= prim_count * n; + (*job)->prim_count -= prim_count; next = job; // try again for the remainder - //printf("split %d\n", (*job)->ib_count / n); + //printf("split %d\n", (*job)->prim_count); } break; @@ -876,6 +830,131 @@ m_cl.Map(); } +void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim) +{ + // join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages + + //printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + auto next = jobs.begin(); + + while(next != jobs.end()) + { + auto prev = next++; + + if(next == jobs.end()) + { + break; + } + + if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw) + { + if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL) + { + bool overlap = false; + + for(int i = 0; i < 4; i++) + { + if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero())) + { + overlap = true; + + break; + } + } + + if(overlap) + { + continue; + } + } + + if((*prev)->src_pages != NULL) + { + GSVector4i* src_pages = (*next)->GetSrcPages(); + + for(int i = 0; i < 4; i++) + { + src_pages[i] |= (*prev)->src_pages[i]; + } + } + + if((*prev)->dst_pages != NULL) + { + GSVector4i* dst_pages = (*next)->GetDstPages(); + + for(int i = 0; i < 4; i++) + { + dst_pages[i] |= (*prev)->dst_pages[i]; + } + } + + GSVector4i prev_rect = GSVector4i::load(&(*prev)->rect); + GSVector4i next_rect = GSVector4i::load(&(*next)->rect); + + GSVector4i::store(&(*next)->rect, prev_rect.runion(next_rect)); + + (*next)->prim_count += (*prev)->prim_count; + (*next)->ib_start = 
(*prev)->ib_start; + + jobs.erase(prev); + } + } + + //printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + // + + cl_kernel tfx_prev = NULL; + + uint32 prim_start = 0; + + for(auto i : jobs) + { + ASSERT(prim_start < MAX_PRIM_COUNT); + + tfxcount++; + + UpdateTextureCache(i.get()); + + uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start); + + // TODO: tile level z test + + cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + + if(tfx_prev != tfx()) + { + tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); + tfx.setArg(4, (cl_uint)m_pb_start); + + tfx_prev = tfx(); + } + + tfx.setArg(5, (cl_uint)prim_start); + tfx.setArg(6, (cl_uint)prim_count); + tfx.setArg(7, (cl_uint)bin_count); + tfx.setArg(8, bin_dim); + tfx.setArg(9, i->fbp); + tfx.setArg(10, i->zbp); + tfx.setArg(11, i->bw); + + GSVector4i r = GSVector4i::load(&i->rect); + + r = r.ralign(GSVector2i(8, 8)); + + m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); + + tfxpixels += r.width() * r.height(); + + InvalidateTextureCache(i.get()); + + // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily + + prim_start += prim_count; + } +} + void GSRendererCL::UpdateTextureCache(TFXJob* job) { if(job->src_pages == NULL) return; @@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver return true; } -////////// +// + +GSRendererCL::TFXJob::TFXJob() + : src_pages(NULL) + , dst_pages(NULL) +{ +} + +GSRendererCL::TFXJob::~TFXJob() +{ + if(src_pages != NULL) _aligned_free(src_pages); + if(dst_pages != NULL) _aligned_free(dst_pages); +} + +GSVector4i* GSRendererCL::TFXJob::GetSrcPages() +{ + if(src_pages == NULL) + { + src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + src_pages[0] = GSVector4i::zero(); + src_pages[1] = 
GSVector4i::zero(); + src_pages[2] = GSVector4i::zero(); + src_pages[3] = GSVector4i::zero(); + } + + return src_pages; +} + +GSVector4i* GSRendererCL::TFXJob::GetDstPages() +{ + if(dst_pages == NULL) + { + dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + dst_pages[0] = GSVector4i::zero(); + dst_pages[1] = GSVector4i::zero(); + dst_pages[2] = GSVector4i::zero(); + dst_pages[3] = GSVector4i::zero(); + } + + return dst_pages; +} + +// //#define IOCL_DEBUG @@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL() ib.mapped_ptr = ib.ptr = NULL; pb.mapped_ptr = pb.ptr = NULL; - pb.size = sizeof(TFXParameter) * 256; + pb.size = TFX_PARAM_SIZE * 256; pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); @@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map() { Unmap(); + // TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+ + if(vb.head < vb.size) { vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); - ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); } if(ib.head < ib.size) @@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map() pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); - ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); } } @@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt) opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; + opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; #ifdef IOCL_DEBUG opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; #endif diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h 
index 59fd524943..fdfea3f6d3 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -164,55 +164,20 @@ class GSRendererCL : public GSRenderer { public: struct { int x, y, z, w; } rect; - TFXSelector sel; // uses primclass, solidrect only - uint32 ib_start, ib_count; - uint32 pb_start; + TFXSelector sel; + uint32 ib_start; + uint32 prim_count; GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it + uint32 fbp, zbp, bw; #ifdef DEBUG - TFXParameter* param; + TFXParameter* pb; #endif - TFXJob() - : src_pages(NULL) - , dst_pages(NULL) - { - } + TFXJob(); + virtual ~TFXJob(); - virtual ~TFXJob() - { - if(src_pages != NULL) _aligned_free(src_pages); - if(dst_pages != NULL) _aligned_free(dst_pages); - } - - GSVector4i* GetSrcPages() - { - if(src_pages == NULL) - { - src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - src_pages[0] = GSVector4i::zero(); - src_pages[1] = GSVector4i::zero(); - src_pages[2] = GSVector4i::zero(); - src_pages[3] = GSVector4i::zero(); - } - - return src_pages; - } - - GSVector4i* GetDstPages() - { - if(dst_pages == NULL) - { - dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - dst_pages[0] = GSVector4i::zero(); - dst_pages[1] = GSVector4i::zero(); - dst_pages[2] = GSVector4i::zero(); - dst_pages[3] = GSVector4i::zero(); - } - - return dst_pages; - } + GSVector4i* GetSrcPages(); + GSVector4i* GetDstPages(); }; class CL @@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer std::list<shared_ptr<TFXJob>> m_jobs; uint32 m_vb_start; uint32 m_vb_count; + uint32 m_pb_start; + uint32 m_pb_count; + bool m_synced; void Enqueue(); + void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim); void UpdateTextureCache(TFXJob* job); void InvalidateTextureCache(TFXJob* job); diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 533e51642a..7c0e239edc 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -94,6 +94,7 @@ 
static class GSUtilMaps public: uint8 PrimClassField[8]; uint8 VertexCountField[8]; + uint8 ClassVertexCountField[4]; uint32 CompatibleBitsField[64][2]; uint32 SharedBitsField[64][2]; @@ -117,6 +118,11 @@ public: VertexCountField[GS_SPRITE] = 2; VertexCountField[GS_INVALID] = 1; + ClassVertexCountField[GS_POINT_CLASS] = 1; + ClassVertexCountField[GS_LINE_CLASS] = 2; + ClassVertexCountField[GS_TRIANGLE_CLASS] = 3; + ClassVertexCountField[GS_SPRITE_CLASS] = 2; + memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField)); for(int i = 0; i < 64; i++) @@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim) return s_maps.VertexCountField[prim]; } +int GSUtil::GetClassVertexCount(uint32 primclass) +{ + return s_maps.ClassVertexCountField[primclass]; +} + const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm) { return s_maps.SharedBitsField[dpsm]; diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index b3697c0812..a4cad91014 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -30,6 +30,7 @@ public: static GS_PRIM_CLASS GetPrimClass(uint32 prim); static int GetVertexCount(uint32 prim); + static int GetClassVertexCount(uint32 primclass); static const uint32* HasSharedBitsPtr(uint32 dpsm); static bool HasSharedBits(uint32 spsm, const uint32* ptr); diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index f9bb6dac2b..65f7cfa05e 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -36,7 +36,10 @@ typedef struct typedef struct { - gs_vertex v[4]; + gs_vertex v[3]; + uint zmin; + uint pb_index; + uint _pad[2]; } gs_prim; typedef struct @@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM( ib += prim_index * VERTEX_PER_PRIM; + prim->pb_index = ib[0] >> 24; + + __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; + __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; + __global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff]; + int2 pmin, pmax; if(PRIM == GS_POINT_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - pmin = pmax = 
convert_int2_rte(v0->p.xy); prim->v[0].p = v0->p; @@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_LINE_CLASS) { - int2 p0 = convert_int2_rte(vb[ib[0]].p.xy); - int2 p1 = convert_int2_rte(vb[ib[1]].p.xy); + int2 p0 = convert_int2_rte(v0->p.xy); + int2 p1 = convert_int2_rte(v1->p.xy); pmin = min(p0, p1); pmax = max(p0, p1); } else if(PRIM == GS_TRIANGLE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - __global gs_vertex* v2 = &vb[ib[2]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); int2 p2 = convert_int2_rtp(v2->p.xy); @@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM( pmax = max(max(p0, p1), p2); // z needs special care, since it's a 32 bit unit, float cannot encode it exactly - // pass the minimum through the unused 4th padding vector - // only interpolate the relative and hopefully small values + // only interpolate the relative to zmin and hopefully small values uint zmin = min(min(v0->z, v1->z), v2->z); @@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM( prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); prim->v[2].tc = v2->tc; - prim->v[3].z = zmin; + prim->zmin = zmin; float4 dp0 = v1->p - v0->p; float4 dp1 = v0->p - v2->p; @@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_SPRITE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); @@ -785,7 +784,6 @@ __kernel void KERNEL_TILE( __kernel void KERNEL_TILE( __global gs_env* env, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { @@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( uint pb_start, uint prim_start, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim) + uchar4 bin_dim, + uint fbp, + uint zbp, + uint bw) { uint x = 
get_global_id(0); uint y = get_global_id(1); @@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( // - __global gs_param* pb = (__global gs_param*)(pb_base + pb_start); - int2 pi = (int2)(x, y); float2 pf = convert_float2(pi); - if(!NOSCISSOR) - { - if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) - { - return; - } - } - - int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); - int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + int faddr = PixelAddress(x, y, fbp, bw, FPSM); + int zaddr = PixelAddress(x, y, zbp, bw, ZPSM); uint fd, zd; // TODO: fd as int4 and only pack before writing out? @@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; + pb_base += pb_start; + BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) @@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( break; } - __global gs_prim* prim = &prim_base[prim_index + i]; - bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); + __global gs_prim* prim = &prim_base[prim_index + i]; + __global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE); + + if(!NOSCISSOR) + { + if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) + { + continue; + } + } + uint2 zf; float3 t; int4 c; @@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); - zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; + zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + 
prim->zmin; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; @@ -1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( } } - // read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway) + // read mask uint fm = pb->fm; uint zm = pb->zm;