joined some tfx kernel calls, general speed up in most games

2014-09-21 18:13:55 +02:00 · 2014-09-21 18:13:55 +02:00 · 6f5cd1cd4d
parent 3d2b0e3766
commit 6f5cd1cd4d
5 changed files with 271 additions and 165 deletions
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
 #define BIN_SIZE (1u << BIN_SIZE_BITS)
 #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
 #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
 #define TFX_PARAM_SIZE 2048
 #if MAX_PRIM_PER_BATCH == 64u
 #define BIN_TYPE cl_ulong
@ -72,6 +73,7 @@ typedef struct
 GSRendererCL::GSRendererCL()
 	: m_vb_count(0)
 	, m_synced(true)
 {
 	m_nativeres = true; // ignore ini, sw is always native
@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL()
 	InitCVB(GS_TRIANGLE_CLASS);
 	InitCVB(GS_SPRITE_CLASS);
 	// NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without 
 	// mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be.
 	m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
 	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
 }
@ -122,13 +127,17 @@ static int pageuploads = 0;
 static int pageuploadcount = 0;
 static int tfxcount = 0;
 static int64 tfxpixels = 0;
 static int tfxselcount = 0;
 static int tfxdiffselcount = 0;
 void GSRendererCL::VSync(int field)
 {
 	GSRenderer::VSync(field);
 	//printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels);
 	//printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount);
 	pageuploads = pageuploadcount = tfxcount = tfxpixels = 0;
 	tfxselcount = tfxdiffselcount = 0;
 	//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
 }
@ -284,7 +293,9 @@ void GSRendererCL::Draw()
 	{
 		size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
 		size_t ib_size = m_index.tail * sizeof(uint32);
-		size_t pb_size = sizeof(TFXParameter);
+		size_t pb_size = TFX_PARAM_SIZE;
 		ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE);
 		if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
 		{
@ -366,12 +377,16 @@ void GSRendererCL::Draw()
 			m_vb_start = m_cl.vb.tail;
 			m_vb_count = 0;
 			m_pb_start = m_cl.pb.tail;
 			m_pb_count = 0;
 		}
 		else
 		{
 			// TODO: SIMD
-			uint32 vb_count = m_vb_count;
+			ASSERT(m_pb_count < 256);
 			uint32 vb_count = m_vb_count | (m_pb_count << 24);
 			for(size_t i = 0; i < m_index.tail; i++)
 			{
@ -398,21 +413,25 @@ void GSRendererCL::Draw()
 		job->rect.z = rect.z;
 		job->rect.w = rect.w;
 		job->ib_start = m_cl.ib.tail;
-		job->ib_count = m_index.tail;
+		job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass);
-		job->pb_start = m_cl.pb.tail;
+		job->fbp = pb->fbp;
 		job->zbp = pb->zbp;
 		job->bw = pb->bw;
 #ifdef DEBUG
-		job->param = pb;
+		job->pb = pb;
 #endif
 		m_jobs.push_back(job);
 		m_vb_count += m_vertex.next;
 		m_pb_count++;
 		m_cl.vb.tail += vb_size;
 		m_cl.ib.tail += ib_size;
 		m_cl.pb.tail += pb_size;
 		m_synced = false;
 		// mark pages used in rendering as source or target
 		if(job->sel.fwrite || job->sel.rfb)
@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason)
 		m_rw_pages[1][i] = GSVector4i::zero();
 	}
-	// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, 
+	m_synced = true;
 	// though it does not seem to be necessary even with GPU devices where it might be cached, 
 	// needs more testing...
 	//void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
 	//m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr);
 }
 void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 	o->GetPagesAsBits(r, m_tmp_pages);
-	//if(!synced)
+	if(!m_synced)
 	{
 		for(int i = 0; i < 4; i++)
 		{
@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 {
 	if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
-	//if(!synced)
+	if(!m_synced)
 	{
 		GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
@ -620,16 +634,7 @@ void GSRendererCL::Enqueue()
 		int primclass = m_jobs.front()->sel.prim;
-		uint32 n;
+		uint32 n = GSUtil::GetClassVertexCount(primclass);
 		switch(primclass)
 		{
 		case GS_POINT_CLASS: n = 1; break;
 		case GS_LINE_CLASS: n = 2; break;
 		case GS_TRIANGLE_CLASS: n = 3; break;
 		case GS_SPRITE_CLASS: n = 2; break;
 		default: __assume(0);
 		}
 		PrimSelector psel;
@ -678,8 +683,6 @@ void GSRendererCL::Enqueue()
 		//
 		cl_kernel tfx_prev = NULL;
 		auto head = m_jobs.begin();
 		while(head != m_jobs.end())
@ -692,8 +695,8 @@ void GSRendererCL::Enqueue()
 			{
 				auto job = next++;
-				uint32 cur_prim_count = (*job)->ib_count / n;
+				uint32 cur_prim_count = (*job)->prim_count;
-				uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
+				uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0;
 				total_prim_count += cur_prim_count;
@ -775,9 +778,8 @@ void GSRendererCL::Enqueue()
 							uint32 group_count = batch_count * item_count;
 							tk.setArg(1, (cl_uint)prim_count);
-							tk.setArg(2, (cl_uint)batch_count);
+							tk.setArg(2, (cl_uint)bin_count);
-							tk.setArg(3, (cl_uint)bin_count);
+							tk.setArg(3, bin_dim);
 							tk.setArg(4, bin_dim);
 							m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
 						}
@ -789,68 +791,20 @@ void GSRendererCL::Enqueue()
 						}
 					}
-					//
+					std::list<shared_ptr<TFXJob>> jobs(head, next);
-					uint32 prim_start = 0;
+					EnqueueTFX(jobs, bin_count, bin_dim);
 					for(auto i = head; i != next; i++)
 					{
 						ASSERT(prim_start < MAX_PRIM_COUNT);
 						// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same 
 						// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim
 						tfxcount++;
 						UpdateTextureCache((*i).get());
 						uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
 						// TODO: tile level z test
 						cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);
 						if(tfx_prev != tfx())
 						{
 							tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
 							tfx_prev = tfx();
 						}
 						tfx.setArg(4, (cl_uint)(*i)->pb_start);
 						tfx.setArg(5, (cl_uint)prim_start);
 						tfx.setArg(6, (cl_uint)prim_count_inner);
 						tfx.setArg(7, (cl_uint)batch_count);
 						tfx.setArg(8, (cl_uint)bin_count);
 						tfx.setArg(9, bin_dim);
 						GSVector4i r = GSVector4i::load<false>(&(*i)->rect);
 						r = r.ralign<Align_Outside>(GSVector2i(8, 8));
 						m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));
 						tfxpixels += r.width() * r.height();
 						InvalidateTextureCache((*i).get());
 						// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
 						prim_start += prim_count_inner;
 					}
 					//
 					if(total_prim_count > MAX_PRIM_COUNT)
 					{
 						prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
 						(*job)->ib_start += prim_count * n * sizeof(uint32);
-						(*job)->ib_count -= prim_count * n;
+						(*job)->prim_count -= prim_count;
 						next = job; // try again for the remainder
-						//printf("split %d\n", (*job)->ib_count / n);
+						//printf("split %d\n", (*job)->prim_count);
 					}
 					break;
@ -876,6 +830,131 @@ void GSRendererCL::Enqueue()
 	m_cl.Map();
 }
 void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim)
 {
 	// join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages
 	//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
 	auto next = jobs.begin();
 	while(next != jobs.end())
 	{
 		auto prev = next++;
 		if(next == jobs.end())
 		{
 			break;
 		}
 		if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw)
 		{
 			if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL)
 			{
 				bool overlap = false;
 				for(int i = 0; i < 4; i++)
 				{
 					if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero()))
 					{
 						overlap = true;
 						break;
 					}
 				}
 				if(overlap)
 				{
 					continue;
 				}
 			}
 			if((*prev)->src_pages != NULL)
 			{
 				GSVector4i* src_pages = (*next)->GetSrcPages();
 				for(int i = 0; i < 4; i++)
 				{
 					src_pages[i] |= (*prev)->src_pages[i];
 				}
 			}
 			if((*prev)->dst_pages != NULL)
 			{
 				GSVector4i* dst_pages = (*next)->GetDstPages();
 				for(int i = 0; i < 4; i++)
 				{
 					dst_pages[i] |= (*prev)->dst_pages[i];
 				}
 			}
 			GSVector4i prev_rect = GSVector4i::load<false>(&(*prev)->rect);
 			GSVector4i next_rect = GSVector4i::load<false>(&(*next)->rect);
 			GSVector4i::store<false>(&(*next)->rect, prev_rect.runion(next_rect));
 			(*next)->prim_count += (*prev)->prim_count;
 			(*next)->ib_start = (*prev)->ib_start;
 			jobs.erase(prev);
 		}
 	}
 	//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
 	//
 	cl_kernel tfx_prev = NULL;
 	uint32 prim_start = 0;
 	for(auto i : jobs)
 	{
 		ASSERT(prim_start < MAX_PRIM_COUNT);
 		tfxcount++;
 		UpdateTextureCache(i.get());
 		uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start);
 		// TODO: tile level z test
 		cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
 		if(tfx_prev != tfx())
 		{
 			tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
 			tfx.setArg(4, (cl_uint)m_pb_start);
 			tfx_prev = tfx();
 		}
 		tfx.setArg(5, (cl_uint)prim_start);
 		tfx.setArg(6, (cl_uint)prim_count);
 		tfx.setArg(7, (cl_uint)bin_count);
 		tfx.setArg(8, bin_dim);
 		tfx.setArg(9, i->fbp);
 		tfx.setArg(10, i->zbp);
 		tfx.setArg(11, i->bw);
 		GSVector4i r = GSVector4i::load<false>(&i->rect);
 		r = r.ralign<Align_Outside>(GSVector2i(8, 8));
 		m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));
 		tfxpixels += r.width() * r.height();
 		InvalidateTextureCache(i.get());
 		// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
 		prim_start += prim_count;
 	}
 }
 void GSRendererCL::UpdateTextureCache(TFXJob* job)
 {
 	if(job->src_pages == NULL) return;
@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	return true;
 }
-//////////
+//
 GSRendererCL::TFXJob::TFXJob()
 	: src_pages(NULL)
 	, dst_pages(NULL)
 {
 }
 GSRendererCL::TFXJob::~TFXJob()
 {
 	if(src_pages != NULL) _aligned_free(src_pages);
 	if(dst_pages != NULL) _aligned_free(dst_pages);
 }
 GSVector4i* GSRendererCL::TFXJob::GetSrcPages()
 {
 	if(src_pages == NULL)
 	{
 		src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
 		src_pages[0] = GSVector4i::zero();
 		src_pages[1] = GSVector4i::zero();
 		src_pages[2] = GSVector4i::zero();
 		src_pages[3] = GSVector4i::zero();
 	}
 	return src_pages;
 }
 GSVector4i* GSRendererCL::TFXJob::GetDstPages()
 {
 	if(dst_pages == NULL)
 	{
 		dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
 		dst_pages[0] = GSVector4i::zero();
 		dst_pages[1] = GSVector4i::zero();
 		dst_pages[2] = GSVector4i::zero();
 		dst_pages[3] = GSVector4i::zero();
 	}
 	return dst_pages;
 }
 //
 //#define IOCL_DEBUG
@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL()
 	ib.mapped_ptr = ib.ptr = NULL;
 	pb.mapped_ptr = pb.ptr = NULL;
-	pb.size = sizeof(TFXParameter) * 256;
+	pb.size = TFX_PARAM_SIZE * 256;
 	pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
 	pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map()
 {
 	Unmap();
 	// TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+
 	if(vb.head < vb.size)
 	{
 		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
 		vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
 		ASSERT(((size_t)vb.ptr & 15) == 0);
 		ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
 	}
 	if(ib.head < ib.size)
@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map()
 		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
 		pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
 		ASSERT(((size_t)pb.ptr & 15) == 0);
 		ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
 	}
 }
@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt)
 	opt << "-D BIN_SIZE=" << BIN_SIZE << "u ";
 	opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u ";	
 	opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
 	opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u ";
 #ifdef IOCL_DEBUG
 	opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
 #endif
--- a/plugins/GSdx/GSRendererCL.h
+++ b/plugins/GSdx/GSRendererCL.h
@ -164,55 +164,20 @@ class GSRendererCL : public GSRenderer
 	{
 	public:
 		struct { int x, y, z, w; } rect;
-		TFXSelector sel; // uses primclass, solidrect only
+		TFXSelector sel;
-		uint32 ib_start, ib_count;
+		uint32 ib_start;
-		uint32 pb_start;
+		uint32 prim_count;
 		GSVector4i* src_pages; // read by any texture level
 		GSVector4i* dst_pages; // f/z writes to it
 		uint32 fbp, zbp, bw;
 #ifdef DEBUG
-		TFXParameter* param;
+		TFXParameter* pb;
 #endif
-		TFXJob()
+		TFXJob();
-			: src_pages(NULL)
+		virtual ~TFXJob();
 			, dst_pages(NULL)
 		{
 		}
-		virtual ~TFXJob()
+		GSVector4i* GetSrcPages();
-		{
+		GSVector4i* GetDstPages();
 			if(src_pages != NULL) _aligned_free(src_pages);
 			if(dst_pages != NULL) _aligned_free(dst_pages);
 		}
 		GSVector4i* GetSrcPages()
 		{
 			if(src_pages == NULL)
 			{
 				src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
 				src_pages[0] = GSVector4i::zero();
 				src_pages[1] = GSVector4i::zero();
 				src_pages[2] = GSVector4i::zero();
 				src_pages[3] = GSVector4i::zero();
 			}
 			return src_pages;
 		}
 		GSVector4i* GetDstPages()
 		{
 			if(dst_pages == NULL)
 			{
 				dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
 				dst_pages[0] = GSVector4i::zero();
 				dst_pages[1] = GSVector4i::zero();
 				dst_pages[2] = GSVector4i::zero();
 				dst_pages[3] = GSVector4i::zero();
 			}
 			return dst_pages;
 		}
 	};
 	class CL
@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer
 	std::list<shared_ptr<TFXJob>> m_jobs;
 	uint32 m_vb_start;
 	uint32 m_vb_count;
 	uint32 m_pb_start;
 	uint32 m_pb_count;
 	bool m_synced;
 	void Enqueue();
 	void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
 	void UpdateTextureCache(TFXJob* job);
 	void InvalidateTextureCache(TFXJob* job);
--- a/plugins/GSdx/GSUtil.cpp
+++ b/plugins/GSdx/GSUtil.cpp
@ -94,6 +94,7 @@ static class GSUtilMaps
 public:
 	uint8 PrimClassField[8];
 	uint8 VertexCountField[8];
 	uint8 ClassVertexCountField[4];
 	uint32 CompatibleBitsField[64][2];
 	uint32 SharedBitsField[64][2];
@ -117,6 +118,11 @@ public:
 		VertexCountField[GS_SPRITE] = 2;
 		VertexCountField[GS_INVALID] = 1;
 		ClassVertexCountField[GS_POINT_CLASS] = 1;
 		ClassVertexCountField[GS_LINE_CLASS] = 2;
 		ClassVertexCountField[GS_TRIANGLE_CLASS] = 3;
 		ClassVertexCountField[GS_SPRITE_CLASS] = 2;
 		memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField));
 		for(int i = 0; i < 64; i++)
@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim)
 	return s_maps.VertexCountField[prim];
 }
 int GSUtil::GetClassVertexCount(uint32 primclass)
 {
 	return s_maps.ClassVertexCountField[primclass];
 }
 const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm)
 {
 	return s_maps.SharedBitsField[dpsm];
--- a/plugins/GSdx/GSUtil.h
+++ b/plugins/GSdx/GSUtil.h
@ -30,6 +30,7 @@ public:
 	static GS_PRIM_CLASS GetPrimClass(uint32 prim);
 	static int GetVertexCount(uint32 prim);
 	static int GetClassVertexCount(uint32 primclass);
 	static const uint32* HasSharedBitsPtr(uint32 dpsm);
 	static bool HasSharedBits(uint32 spsm, const uint32* ptr);
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@ -36,7 +36,10 @@ typedef struct
 typedef struct
 {
-	gs_vertex v[4];
+	gs_vertex v[3];
 	uint zmin;
 	uint pb_index;
 	uint _pad[2];
 } gs_prim;
 typedef struct
@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM(
 	ib += prim_index * VERTEX_PER_PRIM;
 	prim->pb_index = ib[0] >> 24;
 	__global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff];
 	__global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff];
 	__global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff];
 	int2 pmin, pmax;
 	if(PRIM == GS_POINT_CLASS)
 	{
 		__global gs_vertex* v0 = &vb[ib[0]];
 		pmin = pmax = convert_int2_rte(v0->p.xy);
 		prim->v[0].p = v0->p;
@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM(
 	}
 	else if(PRIM == GS_LINE_CLASS)
 	{
-		int2 p0 = convert_int2_rte(vb[ib[0]].p.xy);
+		int2 p0 = convert_int2_rte(v0->p.xy);
-		int2 p1 = convert_int2_rte(vb[ib[1]].p.xy);
+		int2 p1 = convert_int2_rte(v1->p.xy);
 		pmin = min(p0, p1);
 		pmax = max(p0, p1);
 	}
 	else if(PRIM == GS_TRIANGLE_CLASS)
 	{
 		__global gs_vertex* v0 = &vb[ib[0]];
 		__global gs_vertex* v1 = &vb[ib[1]];
 		__global gs_vertex* v2 = &vb[ib[2]];
 		int2 p0 = convert_int2_rtp(v0->p.xy);
 		int2 p1 = convert_int2_rtp(v1->p.xy);
 		int2 p2 = convert_int2_rtp(v2->p.xy);
@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM(
 		pmax = max(max(p0, p1), p2);
 		// z needs special care, since it's a 32 bit unit, float cannot encode it exactly
-		// pass the minimum through the unused 4th padding vector 
+		// only interpolate the relative to zmin and hopefully small values
 		// only interpolate the relative and hopefully small values
 		uint zmin = min(min(v0->z, v1->z), v2->z);
@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM(
 		prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
 		prim->v[2].tc = v2->tc;
-		prim->v[3].z = zmin;
+		prim->zmin = zmin;
 		float4 dp0 = v1->p - v0->p;
 		float4 dp1 = v0->p - v2->p;
@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM(
 	}
 	else if(PRIM == GS_SPRITE_CLASS)
 	{
 		__global gs_vertex* v0 = &vb[ib[0]];
 		__global gs_vertex* v1 = &vb[ib[1]];
 		int2 p0 = convert_int2_rtp(v0->p.xy);
 		int2 p1 = convert_int2_rtp(v1->p.xy);
@ -785,7 +784,6 @@ __kernel void KERNEL_TILE(
 __kernel void KERNEL_TILE(
 	__global gs_env* env,
 	uint prim_count,
 	uint batch_count,
 	uint bin_count, // == bin_dim.z * bin_dim.w
 	uchar4 bin_dim)
 {
@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 	uint pb_start,
 	uint prim_start, 
 	uint prim_count,
 	uint batch_count,
 	uint bin_count, // == bin_dim.z * bin_dim.w
-	uchar4 bin_dim)
+	uchar4 bin_dim,
 	uint fbp, 
 	uint zbp, 
 	uint bw)
 {
 	uint x = get_global_id(0);
 	uint y = get_global_id(1);
@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 	//
 	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);
 	int2 pi = (int2)(x, y);
 	float2 pf = convert_float2(pi);
-	if(!NOSCISSOR)
+	int faddr = PixelAddress(x, y, fbp, bw, FPSM);
-	{
+	int zaddr = PixelAddress(x, y, zbp, bw, ZPSM);
 		if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
 		{
 			return;
 		}
 	}
 	int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
 	int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
 	uint fd, zd; // TODO: fd as int4 and only pack before writing out?
@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 	__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
 	__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];
 	pb_base += pb_start;
 	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
 	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 				break;
 			}
 			__global gs_prim* prim = &prim_base[prim_index + i];
 			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
 			__global gs_prim* prim = &prim_base[prim_index + i];
 			__global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE);
 			if(!NOSCISSOR)
 			{
 				if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
 				{
 					continue;
 				}
 			}
 			uint2 zf;
 			float3 t;
 			int4 c;
@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 				float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
 				float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));
-				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
+				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin;
 				zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
 				t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
@ -1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 				}
 			}
-			// read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway)
+			// read mask
 			uint fm = pb->fm;
 			uint zm = pb->zm;