From 3d2b0e3766b10b3d745775b99ef7599a46d7bb0d Mon Sep 17 00:00:00 2001
From: gabest11 <gabest11@gmail.com>
Date: Sat, 20 Sep 2014 23:59:45 +0200
Subject: [PATCH] minor opencl kernel optimizations

---
 plugins/GSdx/GSRendererCL.cpp | 155 +++++++++++++++++-----------------
 plugins/GSdx/GSRendererCL.h   |   1 -
 plugins/GSdx/res/tfx.cl       | 143 +++++++++++++++----------------
 3 files changed, 144 insertions(+), 155 deletions(-)

diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp
index 9954b51479..77693579d6 100644
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
@@ -61,8 +61,6 @@ typedef struct
 
 typedef struct
 {
-	cl_uint batch_counter;
-	cl_uint _pad[7];
 	struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
 	BIN_TYPE bin[MAX_BIN_COUNT];
 	cl_uchar4 bbox[MAX_PRIM_COUNT];
@@ -392,10 +390,9 @@ void GSRendererCL::Draw()
 
 		if(bbox.eq(bbox.rintersect(scissor)))
 		{
-			pb->sel.noscissor = 1;
+			job->sel.noscissor = 1;
 		}
 
-		job->sel = pb->sel;
 		job->rect.x = rect.x;
 		job->rect.y = rect.y;
 		job->rect.z = rect.z;
@@ -418,11 +415,11 @@ void GSRendererCL::Draw()
 
 		// mark pages used in rendering as source or target
 
-		if(pb->sel.fwrite || pb->sel.rfb)
+		if(job->sel.fwrite || job->sel.rfb)
 		{
 			m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
 
-			if(pb->sel.rfb)
+			if(job->sel.rfb)
 			{
 				for(int i = 0; i < 4; i++)
 				{
@@ -430,7 +427,7 @@ void GSRendererCL::Draw()
 				}
 			}
 
-			if(pb->sel.fwrite)
+			if(job->sel.fwrite)
 			{
 				GSVector4i* dst_pages = job->GetDstPages();
 
@@ -443,11 +440,11 @@ void GSRendererCL::Draw()
 			}
 		}
 
-		if(pb->sel.zwrite || pb->sel.rzb)
+		if(job->sel.zwrite || job->sel.rzb)
 		{
 			m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
 
-			if(pb->sel.rzb)
+			if(job->sel.rzb)
 			{
 				for(int i = 0; i < 4; i++)
 				{
@@ -455,7 +452,7 @@ void GSRendererCL::Draw()
 				}
 			}
 
-			if(pb->sel.zwrite)
+			if(job->sel.zwrite)
 			{
 				GSVector4i* dst_pages = job->GetDstPages();
 
@@ -998,12 +995,12 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	const GSDrawingContext* context = m_context;
 	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
 
-	pb->sel.key = 0;
+	job->sel.key = 0;
 
-	pb->sel.atst = ATST_ALWAYS;
-	pb->sel.tfx = TFX_NONE;
-	pb->sel.ababcd = 0xff;
-	pb->sel.prim = primclass;
+	job->sel.atst = ATST_ALWAYS;
+	job->sel.tfx = TFX_NONE;
+	job->sel.ababcd = 0xff;
+	job->sel.prim = primclass;
 
 	uint32 fm = context->FRAME.FBMSK;
 	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
@@ -1026,8 +1023,8 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	{
 		if(!TryAlphaTest(fm, zm))
 		{
-			pb->sel.atst = context->TEST.ATST;
-			pb->sel.afail = context->TEST.AFAIL;
+			job->sel.atst = context->TEST.ATST;
+			job->sel.afail = context->TEST.AFAIL;
 			pb->aref = context->TEST.AREF;
 		}
 	}
@@ -1056,31 +1053,31 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 	if(!fwrite && !zwrite) return false;
 
-	bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+	bool ftest = job->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
 	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
 
-	pb->sel.fwrite = fwrite;
-	pb->sel.ftest = ftest;
-	pb->sel.zwrite = zwrite;
-	pb->sel.ztest = ztest;
+	job->sel.fwrite = fwrite;
+	job->sel.ftest = ftest;
+	job->sel.zwrite = zwrite;
+	job->sel.ztest = ztest;
 
 	if(fwrite || ftest)
 	{
-		pb->sel.fpsm = RemapPSM(context->FRAME.PSM);
+		job->sel.fpsm = RemapPSM(context->FRAME.PSM);
 
 		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
 		{
-			pb->sel.iip = PRIM->IIP;
+			job->sel.iip = PRIM->IIP;
 		}
 
 		if(PRIM->TME)
 		{
-			pb->sel.tfx = context->TEX0.TFX;
-			pb->sel.tcc = context->TEX0.TCC;
-			pb->sel.fst = PRIM->FST;
-			pb->sel.ltf = m_vt.IsLinear();
-			pb->sel.tpsm = RemapPSM(context->TEX0.PSM);
-			pb->sel.aem = m_env.TEXA.AEM;
+			job->sel.tfx = context->TEX0.TFX;
+			job->sel.tcc = context->TEX0.TCC;
+			job->sel.fst = PRIM->FST;
+			job->sel.ltf = m_vt.IsLinear();
+			job->sel.tpsm = RemapPSM(context->TEX0.PSM);
+			job->sel.aem = m_env.TEXA.AEM;
 
 			pb->tbp[0] = context->TEX0.TBP0;
 			pb->tbw[0] = context->TEX0.TBW;
@@ -1089,24 +1086,24 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
 			{
-				pb->sel.tlu = 1;
+				job->sel.tlu = 1;
 
 				memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
 			}
 
-			pb->sel.wms = context->CLAMP.WMS;
-			pb->sel.wmt = context->CLAMP.WMT;
+			job->sel.wms = context->CLAMP.WMS;
+			job->sel.wmt = context->CLAMP.WMT;
 
-			if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			if(job->sel.tfx == TFX_MODULATE && job->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
 			{
 				// modulate does not do anything when vertex color is 0x80
 
-				pb->sel.tfx = TFX_DECAL;
+				job->sel.tfx = TFX_DECAL;
 			}
 
 			GSVector4i r;
 
-			GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
+			GetTextureMinMax(r, context->TEX0, context->CLAMP, job->sel.ltf);
 
 			GSVector4i* src_pages = job->GetSrcPages();
 
@@ -1131,15 +1128,15 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 				if(m_vt.m_lod.x > 0)
 				{
-					pb->sel.ltf = context->TEX1.MMIN >> 2;
+					job->sel.ltf = context->TEX1.MMIN >> 2;
 				}
 				else
 				{
 					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
 				}
 
-				pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
-				pb->sel.lcm = context->TEX1.LCM;
+				job->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				job->sel.lcm = context->TEX1.LCM;
 
 				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
 				int k = context->TEX1.K << 12;
@@ -1148,28 +1145,28 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 				{
 					k = (int)m_vt.m_lod.x << 16; // set lod to max level
 
-					pb->sel.lcm = 1; // lod is constant
-					pb->sel.mmin = 1; // tri-linear is meaningless
+					job->sel.lcm = 1; // lod is constant
+					job->sel.mmin = 1; // tri-linear is meaningless
 				}
 
-				if(pb->sel.mmin == 2)
+				if(job->sel.mmin == 2)
 				{
 					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
 				}
 
-				if(pb->sel.fst)
+				if(job->sel.fst)
 				{
-					ASSERT(pb->sel.lcm == 1);
+					ASSERT(job->sel.lcm == 1);
 					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
 
-					pb->sel.lcm = 1;
+					job->sel.lcm = 1;
 				}
 
-				if(pb->sel.lcm)
+				if(job->sel.lcm)
 				{
 					int lod = std::max<int>(std::min<int>(k, mxl), 0);
 
-					if(pb->sel.mmin == 1)
+					if(job->sel.mmin == 1)
 					{
 						lod = (lod + 0x8000) & 0xffff0000; // rounding
 					}
@@ -1241,7 +1238,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 					GSVector4i r;
 
-					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
+					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, job->sel.ltf);
 
 					GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
 					
@@ -1260,7 +1257,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 			}
 			else
 			{
-				if(pb->sel.fst == 0)
+				if(job->sel.fst == 0)
 				{
 					// skip per pixel division if q is constant
 
@@ -1268,7 +1265,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 					if(m_vt.m_eq.q)
 					{
-						pb->sel.fst = 1;
+						job->sel.fst = 1;
 
 						const GSVector4& t = v[index[0]].t;
 
@@ -1286,7 +1283,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 					}
 					else if(primclass == GS_SPRITE_CLASS)
 					{
-						pb->sel.fst = 1;
+						job->sel.fst = 1;
 
 						for(int i = 0, j = vertex_count; i < j; i += 2)
 						{
@@ -1301,7 +1298,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 					}
 				}
 
-				if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel?
+				if(job->sel.ltf && job->sel.fst) // TODO: quite slow, do this in the prim kernel?
 				{
 					// if q is constant we can do the half pel shift for bilinear sampling on the vertices
 
@@ -1378,58 +1375,58 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 		if(PRIM->FGE)
 		{
-			pb->sel.fge = 1;
+			job->sel.fge = 1;
 			pb->fog = env.FOGCOL.u32[0];
 		}
 
 		if(context->FRAME.PSM != PSM_PSMCT24)
 		{
-			pb->sel.date = context->TEST.DATE;
-			pb->sel.datm = context->TEST.DATM;
+			job->sel.date = context->TEST.DATE;
+			job->sel.datm = context->TEST.DATM;
 		}
 
 		if(!IsOpaque())
 		{
-			pb->sel.abe = PRIM->ABE;
-			pb->sel.ababcd = context->ALPHA.u32[0];
+			job->sel.abe = PRIM->ABE;
+			job->sel.ababcd = context->ALPHA.u32[0];
 
 			if(env.PABE.PABE)
 			{
-				pb->sel.pabe = 1;
+				job->sel.pabe = 1;
 			}
 
 			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
 			{
-				pb->sel.aa1 = 1;
+				job->sel.aa1 = 1;
 			}
 
 			pb->afix = context->ALPHA.FIX;
 		}
 
-		if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1)
+		if(job->sel.date || job->sel.aba == 1 || job->sel.abb == 1 || job->sel.abc == 1 || job->sel.abd == 1)
 		{
-			pb->sel.rfb = 1;
+			job->sel.rfb = 1;
 		}
 		else
 		{
 			if(fwrite)
 			{
-				if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
-				|| (pb->sel.fpsm & 3) == 0 && fm != 0
-				|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
-				|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
+				if(job->sel.atst != ATST_ALWAYS && job->sel.afail == AFAIL_RGB_ONLY
+				|| (job->sel.fpsm & 3) == 0 && fm != 0
+				|| (job->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
+				|| (job->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
 				{
-					pb->sel.rfb = 1;
+					job->sel.rfb = 1;
 				}
 			}
 		}
 
-		pb->sel.colclamp = env.COLCLAMP.CLAMP;
-		pb->sel.fba = context->FBA.FBA;
+		job->sel.colclamp = env.COLCLAMP.CLAMP;
+		job->sel.fba = context->FBA.FBA;
 
 		if(env.DTHE.DTHE)
 		{
-			pb->sel.dthe = 1;
+			job->sel.dthe = 1;
 
 			GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
 			GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
@@ -1442,21 +1439,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 
 	if(zwrite || ztest)
 	{
-		pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
-		pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+		job->sel.zpsm = RemapPSM(context->ZBUF.PSM);
+		job->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
 
 		if(ztest)
 		{
-			pb->sel.rzb = 1;
+			job->sel.rzb = 1;
 		}
 		else
 		{
 			if(zwrite)
 			{
-				if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY)
-				|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
+				if(job->sel.atst != ATST_ALWAYS && (job->sel.afail == AFAIL_FB_ONLY || job->sel.afail == AFAIL_RGB_ONLY)
+				|| (job->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
 				{
-					pb->sel.rzb = 1;
+					job->sel.rzb = 1;
 				}
 			}
 		}
@@ -1465,11 +1462,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 	pb->fm = fm;
 	pb->zm = zm;
 
-	if((pb->sel.fpsm & 3) == 1)
+	if((job->sel.fpsm & 3) == 1)
 	{
 		pb->fm |= 0xff000000;
 	}
-	else if((pb->sel.fpsm & 3) >= 2)
+	else if((job->sel.fpsm & 3) >= 2)
 	{
 		uint32 rb = pb->fm & 0x00f800f8;
 		uint32 ga = pb->fm & 0x8000f800;
@@ -1477,11 +1474,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
 		pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
 	}
 
-	if((pb->sel.zpsm & 3) == 1)
+	if((job->sel.zpsm & 3) == 1)
 	{
 		pb->zm |= 0xff000000;
 	}
-	else if((pb->sel.zpsm & 3) >= 2)
+	else if((job->sel.zpsm & 3) >= 2)
 	{
 		pb->zm |= 0xffff0000;
 	}
diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h
index 3efe33f29b..59fd524943 100644
--- a/plugins/GSdx/GSRendererCL.h
+++ b/plugins/GSdx/GSRendererCL.h
@@ -146,7 +146,6 @@ class GSRendererCL : public GSRenderer
 	{
 		GSVector4i scissor;
 		GSVector4i dimx; // 4x4 signed char
-		TFXSelector sel;
 		uint32 fbp, zbp, bw;
 		uint32 fm, zm;
 		uint32 fog; // rgb
diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl
index bdf75b11a1..f9bb6dac2b 100644
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@@ -1,5 +1,21 @@
 #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
 
+#ifdef cl_amd_printf
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#else
+#define printf(x)
+#endif
+
+#ifdef cl_amd_media_ops
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#else
+#endif
+
+#ifdef cl_amd_media_ops2
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+#else
+#endif
+
 #ifndef CL_FLT_EPSILON
 #define CL_FLT_EPSILON 1.1920928955078125e-7f
 #endif
@@ -32,8 +48,6 @@ typedef struct
 
 typedef struct
 {
-	uint batch_counter;
-	uint _pad[7];
 	struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
 	BIN_TYPE bin[MAX_BIN_COUNT];
 	uchar4 bbox[MAX_PRIM_COUNT];
@@ -45,7 +59,6 @@ typedef struct
 {
 	int4 scissor;
 	char dimx[4][4];
-	ulong sel;
 	int fbp, zbp, bw;
 	uint fm, zm;
 	uchar4 fog; // rgb
@@ -679,7 +692,6 @@ int tile_in_triangle(float2 p, gs_barycentric b)
 
 __kernel void KERNEL_TILE(__global gs_env* env)
 {
-	env->batch_counter = 0;
 	env->bounds[get_global_id(0)].first = -1;
 	env->bounds[get_global_id(0)].last = 0;
 }
@@ -777,77 +789,60 @@ __kernel void KERNEL_TILE(
 	uint bin_count, // == bin_dim.z * bin_dim.w
 	uchar4 bin_dim)
 {
-	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
-	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
-	__local uint batch_index;
-
+	size_t batch_index = get_group_id(0);
 	size_t local_id = get_local_id(0);
 	size_t local_size = get_local_size(0);
 
-	while(1)
+	uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
+		
+	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+	__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
+	__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
+
+	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
+	
+	event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
+
+	wait_group_events(1, &e);
+
+	if(PRIM == GS_TRIANGLE_CLASS)
 	{
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		if(local_id == 0)
-		{
-			batch_index = atomic_inc(&env->batch_counter);
-		}
-
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		if(batch_index >= batch_count) 
-		{
-			break;
-		}
-
-		uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
+		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
 		
-		__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
-		__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
-		__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
-
-		event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
-
 		wait_group_events(1, &e);
+	}
 
-		if(PRIM == GS_TRIANGLE_CLASS)
+	for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
+	{
+		int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
+		int x = bin_index - y * bin_dim.z;
+
+		x += bin_dim.x;
+		y += bin_dim.y;
+
+		BIN_TYPE visible = 0;
+
+		for(uint i = 0; i < batch_prim_count; i++)
 		{
-			e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
-		
-			wait_group_events(1, &e);
+			uchar4 r = bbox_cache[i];
+
+			BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
+
+			if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+			{
+				test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
+			}
+
+			visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
 		}
 
-		for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
+		bin[bin_index] = visible;
+
+		if(visible != 0)
 		{
-			int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
-			int x = bin_index - y * bin_dim.z;
-
-			x += bin_dim.x;
-			y += bin_dim.y;
-
-			BIN_TYPE visible = 0;
-
-			for(uint i = 0; i < batch_prim_count; i++)
-			{
-				uchar4 r = bbox_cache[i];
-
-				BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
-
-				if(PRIM == GS_TRIANGLE_CLASS && test != 0)
-				{
-					test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
-				}
-
-				visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
-			}
-
-			bin[bin_index] = visible;
-
-			if(visible != 0)
-			{
-				atomic_min(&env->bounds[bin_index].first, batch_index);
-				atomic_max(&env->bounds[bin_index].last, batch_index);
-			}
+			atomic_min(&env->bounds[bin_index].first, batch_index);
+			atomic_max(&env->bounds[bin_index].last, batch_index);
 		}
 	}
 }
@@ -998,10 +993,10 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
 			}
 			else if(is16bit(FPSM))
 			{
-				cd.x = (fd & 0x001f) << 3;
-				cd.y = (fd & 0x03e0) >> 2;
-				cd.z = (fd & 0x7c00) >> 7;
-				cd.w = (fd & 0x8000) >> 8;
+				cd.x = (fd << 3) & 0xf8;
+				cd.y = (fd >> 2) & 0xf8;
+				cd.z = (fd >> 7) & 0xf8;
+				cd.w = (fd >> 8) & 0x80;
 			}
 		}
 
@@ -1077,9 +1072,9 @@ uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
 {
 	uchar4 c;
 
-	c.x = (rgba & 0x001f) << 3;
-	c.y = (rgba & 0x03e0) >> 2;
-	c.z = (rgba & 0x7c00) >> 7;
+	c.x = (rgba << 3) & 0xf8;
+	c.y = (rgba >> 2) & 0xf8;
+	c.z = (rgba >> 7) & 0xf8;
 	c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;
 
 	return c;
@@ -1202,7 +1197,7 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
 // multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
 // pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average
 
-__kernel void KERNEL_TFX(
+__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
 	__global gs_env* env,
 	__global uchar* vm,
 	__global uchar* tex,
@@ -1214,8 +1209,6 @@ __kernel void KERNEL_TFX(
 	uint bin_count, // == bin_dim.z * bin_dim.w
 	uchar4 bin_dim)
 {
-	// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way
-
 	uint x = get_global_id(0);
 	uint y = get_global_id(1);
 
@@ -1451,7 +1444,7 @@ __kernel void KERNEL_TFX(
 				{
 					if(!ABE || c.w == 0x80)
 					{
-						c.w = /*edge ? coverage :*/ 0x80; // TODO
+						c.w = 0x80; // TODO: edge ? coverage : 0x80
 					}
 				}
 			}