solution for 32-bit z values in opencl and other minor optimizations

2014-09-18 09:32:37 +02:00 · 2014-09-18 09:32:37 +02:00 · 263c097d13
parent c64f9ad9b1
commit 263c097d13
2 changed files with 110 additions and 180 deletions
--- a/plugins/GSdx/GSRendererCL.cpp
+++ b/plugins/GSdx/GSRendererCL.cpp
@ -193,23 +193,9 @@ void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex*
 	{
 		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q

-		#if _M_SSE >= 0x401
-
 		GSVector4i xyzuvf(src->m[1]);

-		GSVector4i xy = xyzuvf.upl16() - o;
-		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
-
-		#else
-
-		uint32 z = src->XYZ.Z;
-
-		GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o;
-		GSVector4i zf = GSVector4i((int)std::min<uint32>(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later
-
-		#endif
-
-		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
+		dst->p = (GSVector4(xyzuvf.upl16() - o) * g_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints

 		GSVector4 t = GSVector4::zero();

@ -233,7 +219,7 @@ void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex*
 			}
 		}

-		dst->t = t.insert32<2, 3>(stcq);
+		dst->t = t.insert32<2, 3>(stcq); // color as uchar4 in t.w
 	}
 }

@ -871,6 +857,7 @@ void GSRendererCL::Enqueue()
 						GSVector4i r = GSVector4i::load<false>(&(*i)->rect);

 						r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
+
 						/*
 						if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask
 							;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
--- a/plugins/GSdx/res/tfx.cl
+++ b/plugins/GSdx/res/tfx.cl
@ -14,7 +14,7 @@

 typedef struct
 {
-	union {float4 p; struct {float x, y, z, f;};};
+	union {float4 p; struct {float x, y; uint z, f;};};
 	union {float4 tc; struct {float s, t, q; uchar4 c;};};
 } gs_vertex;

@ -46,12 +46,12 @@ typedef struct
 	int4 scissor;
 	char dimx[4][4];
 	ulong sel;
-	uint fbp, zbp, bw;
+	int fbp, zbp, bw;
 	uint fm, zm;
 	uchar4 fog; // rgb
 	uchar aref, afix;
 	uchar ta0, ta1;
-	uint tbp[7], tbw[7];
+	int tbp[7], tbw[7];
 	int minu, maxu, minv, maxv;
 	int lod; // lcm == 1
 	int mxl;
@ -68,7 +68,7 @@ enum GS_PRIM_CLASS
 	GS_SPRITE_CLASS
 };

-enum GS_PSM_TARGET
+enum GS_PSM
 {
 	PSM_PSMCT32,
 	PSM_PSMCT24,
@ -350,87 +350,87 @@ __constant ushort columnTable4[16][32] =
 	  407, 415, 439, 447, 471, 479, 503, 511 },
 };

-uint BlockNumber32(int x, int y, uint bp, uint bw)
+int BlockNumber32(int x, int y, int bp, int bw)
 {
-	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
+	return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
 }

-uint BlockNumber16(int x, int y, uint bp, uint bw)
+int BlockNumber16(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
+	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
 }

-uint BlockNumber16S(int x, int y, uint bp, uint bw)
+int BlockNumber16S(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
+	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
 }

-uint BlockNumber32Z(int x, int y, uint bp, uint bw)
+int BlockNumber32Z(int x, int y, int bp, int bw)
 {
-	return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
+	return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
 }

-uint BlockNumber16Z(int x, int y, uint bp, uint bw)
+int BlockNumber16Z(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
+	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
 }

-uint BlockNumber16SZ(int x, int y, uint bp, uint bw)
+int BlockNumber16SZ(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
+	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
 }

-uint BlockNumber8(int x, int y, uint bp, uint bw)
+int BlockNumber8(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
+	return bp + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
 }

-uint BlockNumber4(int x, int y, uint bp, uint bw)
+int BlockNumber4(int x, int y, int bp, int bw)
 {
-	return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
+	return bp + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
 }

-uint PixelAddress32(int x, int y, uint bp, uint bw)
+int PixelAddress32(int x, int y, int bp, int bw)
 {
 	return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
 }

-uint PixelAddress16(int x, int y, uint bp, uint bw)
+int PixelAddress16(int x, int y, int bp, int bw)
 {
 	return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
 }

-uint PixelAddress16S(int x, int y, uint bp, uint bw)
+int PixelAddress16S(int x, int y, int bp, int bw)
 {
 	return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
 }

-uint PixelAddress32Z(int x, int y, uint bp, uint bw)
+int PixelAddress32Z(int x, int y, int bp, int bw)
 {
 	return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
 }

-uint PixelAddress16Z(int x, int y, uint bp, uint bw)
+int PixelAddress16Z(int x, int y, int bp, int bw)
 {
 	return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
 }

-uint PixelAddress16SZ(int x, int y, uint bp, uint bw)
+int PixelAddress16SZ(int x, int y, int bp, int bw)
 {
 	return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
 }

-uint PixelAddress8(int x, int y, uint bp, uint bw)
+int PixelAddress8(int x, int y, int bp, int bw)
 {
 	return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15];
 }

-uint PixelAddress4(int x, int y, uint bp, uint bw)
+int PixelAddress4(int x, int y, int bp, int bw)
 {
 	return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31];
 }

-uint PixelAddress(int x, int y, uint bp, uint bw, uint psm)
+int PixelAddress(int x, int y, int bp, int bw, int psm)
 {
 	switch(psm)
 	{
@ -459,49 +459,7 @@ uint PixelAddress(int x, int y, uint bp, uint bw, uint psm)
 	}
 }

-uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm)
-{
-	// TODO: replace blockTable with a subset tileTable
-
-	switch(psm)
-	{
-	default:
-	case PSM_PSMCT32: 
-	case PSM_PSMCT24: 
-		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6];
-	case PSM_PSMCT16: 
-		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3];
-	case PSM_PSMCT16S: 
-		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3];
-	case PSM_PSMZ32: 
-	case PSM_PSMZ24: 
-		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6];
-	case PSM_PSMZ16: 
-		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3];
-	case PSM_PSMZ16S: 
-		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3];
-	}
-}
-
-uint TilePixelAddress(int x, int y, uint ba, uint psm)
-{
-	switch(psm)
-	{
-	default:
-	case PSM_PSMCT32: 
-	case PSM_PSMCT24: 
-	case PSM_PSMZ32: 
-	case PSM_PSMZ24: 
-		return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7];
-	case PSM_PSMCT16: 
-	case PSM_PSMCT16S: 
-	case PSM_PSMZ16: 
-	case PSM_PSMZ16S: 
-		return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15];
-	}
-}
-
-uint ReadFrame(__global uchar* vm, uint addr, uint psm)
+uint ReadFrame(__global uchar* vm, int addr, int psm)
 {
 	switch(psm)
 	{
@ -519,7 +477,7 @@ uint ReadFrame(__global uchar* vm, uint addr, uint psm)
 	}
 }

-void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value)
+void WriteFrame(__global uchar* vm, int addr, int psm, uint value)
 {
 	switch(psm)
 	{
@ -593,7 +551,12 @@ __kernel void KERNEL_PRIM(

 	if(PRIM == GS_POINT_CLASS)
 	{
-		pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy);
+		__global gs_vertex* v0 = &vb[ib[0]];
+
+		pmin = pmax = convert_int2_rte(v0->p.xy);
+
+		prim->v[0].p = v0->p;
+		prim->v[0].tc = v0->tc;
 	}
 	else if(PRIM == GS_LINE_CLASS)
 	{
@ -616,13 +579,21 @@ __kernel void KERNEL_PRIM(
 		pmin = min(min(p0, p1), p2);
 		pmax = max(max(p0, p1), p2);

-		prim->v[0].p = v0->p;
+		// z needs special care, since it's a 32 bit uint, float cannot encode it exactly
+		// pass the minimum through the unused 4th padding vector 
+		// only interpolate the relative and hopefully small values
+
+		uint zmin = min(min(v0->z, v1->z), v2->z);
+		
+		prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
 		prim->v[0].tc = v0->tc;
-		prim->v[1].p = v1->p;
+		prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w);
 		prim->v[1].tc = v1->tc;
-		prim->v[2].p = v2->p;
+		prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
 		prim->v[2].tc = v2->tc;

+		prim->v[3].z = zmin;
+
 		float4 dp0 = v1->p - v0->p;
 		float4 dp1 = v0->p - v2->p;
 		float4 dp2 = v2->p - v1->p;
@ -631,10 +602,10 @@ __kernel void KERNEL_PRIM(

 		if(cp != 0.0f)
 		{
-			float cp_rcp = 1.0f / cp;// native_recip(cp);
+			cp = native_recip(cp);

-			float2 u = dp0.xy * cp_rcp;
-			float2 v = -dp1.xy * cp_rcp;
+			float2 u = dp0.xy * cp;
+			float2 v = -dp1.xy * cp;

 			// v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0)

@ -653,9 +624,9 @@ __kernel void KERNEL_PRIM(

 			// any barycentric(reject_corner) < 0, tile outside the triangle

-			b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE;
-			b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE;
-			b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE;
+			b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE;
+			b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE;
+			b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE;

 			// TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed

@ -686,9 +657,9 @@ __kernel void KERNEL_PRIM(
 		prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy);
 	}

-	int4 pminmax = (int4)(pmin, pmax);
+	int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS;

-	env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS);
+	env->bbox[prim_index] = convert_uchar4_sat(r);
 }

 #endif
@ -767,11 +738,11 @@ __kernel void KERNEL_TILE(

 		uchar4 r = bbox_cache[group_prim_index];

-		uint test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y);
+		uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);

 		if(PRIM == GS_TRIANGLE_CLASS && test != 0)
 		{
-			test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
+			test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
 		}

 		atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2)));
@ -848,7 +819,7 @@ __kernel void KERNEL_TILE(

 		for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
 		{
-			int y = bin_index / bin_dim.z;
+			int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
 			int x = bin_index - y * bin_dim.z;

 			x += bin_dim.x;
@ -860,11 +831,11 @@ __kernel void KERNEL_TILE(
 			{
 				uchar4 r = bbox_cache[i];

-				BIN_TYPE test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y);
+				BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);

 				if(PRIM == GS_TRIANGLE_CLASS && test != 0)
 				{
-					test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
+					test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
 				}

 				visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
@ -1185,9 +1156,12 @@ __kernel void KERNEL_TFX(
 {
 	// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way

-	uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x;
-	uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y;
-	uint bin_index = bin_y * bin_dim.z + bin_x;
+	uint x = get_global_id(0);
+	uint y = get_global_id(1);
+
+	uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x;
+	uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y;
+	uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x);

 	uint batch_first = env->bounds[bin_index].first;
 	uint batch_last = env->bounds[bin_index].last;
@ -1230,26 +1204,21 @@ __kernel void KERNEL_TFX(

 	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);

-	uint x = get_global_id(0);
-	uint y = get_global_id(1);
-
 	int2 pi = (int2)(x, y);
 	float2 pf = convert_float2(pi);

 	if(!NOSCISSOR)
 	{
-		int4 scissor = pb->scissor;
-
-		if(!all((pi >= scissor.xy) & (pi < scissor.zw)))
+		if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
 		{
 			return;
 		}
 	}

-	uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
-	uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
+	int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
+	int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);

-	uint fd, zd;
+	uint fd, zd; // TODO: fd as int4 and only pack before writing out?

 	if(RFB) 
 	{
@ -1260,47 +1229,6 @@ __kernel void KERNEL_TFX(
 	{
 		zd = ReadFrame(vm, zaddr, ZPSM);
 	}
-/*
-	// TODO: lookup top left address of this tile + local offset
-	//
-	// 32bpp: 8x8 block size, 4 blocks, 1024 bytes
-	// 0 1
-	// 2 3
-	// 16bpp: 16x8 block size, 2 blocks, 512 bytes
-	// 0
-	// 1
-	// linear access in memory, this layout is the same for all formats
-
-	__local uint fbn, zbn;
-	__local uchar fb[1024], zb[1024];
-
-	if(get_local_id(0) == 0 && get_local_id(1) == 0)
-	{
-		fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
-		zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM);
-	}
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	uint faddr = TilePixelAddress(x, y, fbn, FPSM);
-	uint zaddr = TilePixelAddress(x, y, zbn, ZPSM);
-
-	if(RFB)
-	{
-		event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0);
-		
-		wait_group_events(1, &e);
-	}
-
-	if(RZB)
-	{
-		event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
-		
-		wait_group_events(1, &e);
-	}
-
-	// not sure if faster
-*/

 	// early destination alpha test

@ -1346,30 +1274,44 @@ __kernel void KERNEL_TFX(

 			if(PRIM == GS_POINT_CLASS)
 			{
-				// TODO: distance.x < 0.5f || distance.y < 0.5f
+				float2 dpf = pf - prim->v[0].p.xy;

-				continue;
+				if(!all((dpf <= 0.5f) & (dpf > -0.5f)))
+				{
+					continue;
+				}
+
+				zf = as_uint2(prim->v[0].p.zw);
+				t.xyz = prim->v[0].tc.xyz;
+				c = convert_int4(prim->v[0].c);
 			}
 			else if(PRIM == GS_LINE_CLASS)
 			{
 				// TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f
+				// TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled
+				// TODO: do not draw last pixel of the line

 				continue;
 			}
 			else if(PRIM == GS_TRIANGLE_CLASS)
 			{
+				// TODO: aa1: draw edge as a line
+
 				__global gs_barycentric* b = &barycentric[prim_index + i];

 				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);

-				f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));
-
-				if(!all(f >= b->zero.xyz))
+				if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz))
 				{
 					continue;
 				}

-				zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y);
+				float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw));
+				float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
+				float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));
+
+				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
+				zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);

 				t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;

@ -1379,7 +1321,7 @@ __kernel void KERNEL_TFX(
 					float4 c1 = convert_float4(prim->v[1].c);
 					float4 c2 = convert_float4(prim->v[2].c);

-					c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y);
+					c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y);
 				}
 				else
 				{
@ -1396,7 +1338,7 @@ __kernel void KERNEL_TFX(
 					continue;
 				}

-				zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint
+				zf = as_uint2(prim->v[1].p.zw);
 				
 				t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy);
 				t.z = prim->v[0].tc.z;
@ -1431,7 +1373,7 @@ __kernel void KERNEL_TFX(

 					if(!FST)
 					{
-						uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z));
+						uv = convert_int2_rte(t.xy * native_recip(t.z));

 						if(LTF) uv -= 0x0008;
 					}
@ -1444,7 +1386,9 @@ __kernel void KERNEL_TFX(
 						// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
 						// TODO: sw renderer samples at 112 exactly, check which one is correct

-						uv = convert_int2_rte(t.xy); 
+						// last line error in persona 3 movie clips if rounding is enabled
+
+						uv = convert_int2(t.xy); 
 					}

 					int2 uvf = uv & 0x000f;
@ -1466,9 +1410,9 @@ __kernel void KERNEL_TFX(

 					if(LTF)
 					{
-						c00 = ((c01 - c00) * uvf.x >> 4) + c00;
-						c10 = ((c11 - c10) * uvf.x >> 4) + c10;
-						c00 = ((c10 - c00) * uvf.y >> 4) + c00;
+						c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
+						c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
+						c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
 					}

 					ct = c00;
@ -1486,7 +1430,7 @@ __kernel void KERNEL_TFX(
 					switch(TFX)
 					{
 					case TFX_MODULATE:
-						c.w = clamp(ct.w * c.w >> 7, 0, 0xff);
+						c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff);
 						break;
 					case TFX_DECAL:
 						c.w = ct.w;
@ -1539,14 +1483,14 @@ __kernel void KERNEL_TFX(
 				switch(TFX)
 				{
 				case TFX_MODULATE:
-					c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff);
+					c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff);
 					break;
 				case TFX_DECAL:
 					c.xyz = ct.xyz;
 					break;
 				case TFX_HIGHLIGHT:
 				case TFX_HIGHLIGHT2:					
-					c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff);
+					c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff);
 					break;
 				}
 			}
@ -1557,7 +1501,10 @@ __kernel void KERNEL_TFX(
 			{
 				int fog = (int)zf.y;

-				c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8);				
+				int3 fv = mul24(c.xyz, fog) >> 8;
+				int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8;
+
+				c.xyz = fv + fc;
 			}

 			// alpha blend
@ -1614,10 +1561,6 @@ __kernel void KERNEL_TFX(

 	if(fragments > 0)
 	{
-		// TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place)
-
-		// TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format)
-
 		if(ZWRITE)
 		{
 			WriteFrame(vm, zaddr, ZPSM, zd);