#if defined(CL_VERSION_2_0)

#error hello

#endif

#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency

#ifdef cl_amd_printf
#pragma OPENCL EXTENSION cl_amd_printf : enable
#endif

#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
#endif

#ifdef cl_amd_media_ops2
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#else
#endif

#ifndef CL_FLT_EPSILON
#define CL_FLT_EPSILON 1.1920928955078125e-7f
#endif

#if MAX_PRIM_PER_BATCH == 64u
	#define BIN_TYPE ulong
#elif MAX_PRIM_PER_BATCH == 32u
	#define BIN_TYPE uint
#else
	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
#endif

#define TFX_ABA(sel) ((sel.x >> 24) & 3)
#define TFX_ABB(sel) ((sel.x >> 26) & 3)
#define TFX_ABC(sel) ((sel.x >> 28) & 3)
#define TFX_ABD(sel) ((sel.x >> 30) & 3)
#define TFX_WMS(sel) ((sel.y >>  8) & 3)
#define TFX_WMT(sel) ((sel.y >> 10) & 3)

typedef struct
{
	union {float4 p; struct {float x, y; uint z, f;};};
	union {float4 tc; struct {float s, t, q; uchar4 c;};};
} gs_vertex;

typedef struct
{
	gs_vertex v[3];
	uint zmin, zmax;
	uint pb_index;
	uint _pad;
} gs_prim;

typedef struct
{
	float4 dx, dy;
	float4 zero;
	float4 reject_corner;
} gs_barycentric;

typedef struct
{
	struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
	BIN_TYPE bin[MAX_BIN_COUNT];
	uchar4 bbox[MAX_PRIM_COUNT];
	gs_prim prim[MAX_PRIM_COUNT];
	gs_barycentric barycentric[MAX_PRIM_COUNT];
} gs_env;

typedef struct
{
	int4 scissor;
	char dimx[4][4];
	uint2 sel;
	int fbp, zbp, bw;
	uint fm, zm;
	uchar4 fog; // rgb
	uchar aref, afix;
	uchar ta0, ta1;
	int tbp[7], tbw[7];
	int minu, maxu, minv, maxv;
	int lod; // lcm == 1
	int mxl;
	float l; // TEX1.L * -0x10000
	float k; // TEX1.K * 0x10000
	uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother
} gs_param;

enum GS_PRIM_CLASS
{
	GS_POINT_CLASS,
	GS_LINE_CLASS,
	GS_TRIANGLE_CLASS,
	GS_SPRITE_CLASS
};

enum GS_PSM
{
	PSM_PSMCT32,
	PSM_PSMCT24,
	PSM_PSMCT16,
	PSM_PSMCT16S,
	PSM_PSMZ32,
	PSM_PSMZ24,
	PSM_PSMZ16,
	PSM_PSMZ16S,
	PSM_PSMT8,
	PSM_PSMT4,
	PSM_PSMT8H,
	PSM_PSMT4HL,
	PSM_PSMT4HH,
};

enum GS_TFX
{
	TFX_MODULATE	= 0,
	TFX_DECAL		= 1,
	TFX_HIGHLIGHT	= 2,
	TFX_HIGHLIGHT2	= 3,
	TFX_NONE		= 4,
};

enum GS_CLAMP
{
	CLAMP_REGION_REPEAT	= 0,
	CLAMP_REPEAT		= 1,
	CLAMP_CLAMP			= 2,
	CLAMP_REGION_CLAMP	= 3,
};

enum GS_ZTST
{
	ZTST_NEVER		= 0,
	ZTST_ALWAYS		= 1,
	ZTST_GEQUAL		= 2,
	ZTST_GREATER	= 3,
};

enum GS_ATST
{
	ATST_NEVER		= 0,
	ATST_ALWAYS		= 1,
	ATST_LESS		= 2,
	ATST_LEQUAL		= 3,
	ATST_EQUAL		= 4,
	ATST_GEQUAL		= 5,
	ATST_GREATER	= 6,
	ATST_NOTEQUAL	= 7,
};

enum GS_AFAIL
{
	AFAIL_KEEP		= 0,
	AFAIL_FB_ONLY	= 1,
	AFAIL_ZB_ONLY	= 2,
	AFAIL_RGB_ONLY	= 3,
};

__constant uchar blockTable32[4][8] =
{
	{  0,  1,  4,  5, 16, 17, 20, 21},
	{  2,  3,  6,  7, 18, 19, 22, 23},
	{  8,  9, 12, 13, 24, 25, 28, 29},
	{ 10, 11, 14, 15, 26, 27, 30, 31}
};

__constant uchar blockTable32Z[4][8] =
{
	{ 24, 25, 28, 29,  8,  9, 12, 13},
	{ 26, 27, 30, 31, 10, 11, 14, 15},
	{ 16, 17, 20, 21,  0,  1,  4,  5},
	{ 18, 19, 22, 23,  2,  3,  6,  7}
};

__constant uchar blockTable16[8][4] =
{
	{  0,  2,  8, 10 },
	{  1,  3,  9, 11 },
	{  4,  6, 12, 14 },
	{  5,  7, 13, 15 },
	{ 16, 18, 24, 26 },
	{ 17, 19, 25, 27 },
	{ 20, 22, 28, 30 },
	{ 21, 23, 29, 31 }
};

__constant uchar blockTable16S[8][4] =
{
	{  0,  2, 16, 18 },
	{  1,  3, 17, 19 },
	{  8, 10, 24, 26 },
	{  9, 11, 25, 27 },
	{  4,  6, 20, 22 },
	{  5,  7, 21, 23 },
	{ 12, 14, 28, 30 },
	{ 13, 15, 29, 31 }
};

__constant uchar blockTable16Z[8][4] =
{
	{ 24, 26, 16, 18 },
	{ 25, 27, 17, 19 },
	{ 28, 30, 20, 22 },
	{ 29, 31, 21, 23 },
	{  8, 10,  0,  2 },
	{  9, 11,  1,  3 },
	{ 12, 14,  4,  6 },
	{ 13, 15,  5,  7 }
};

__constant uchar blockTable16SZ[8][4] =
{
	{ 24, 26,  8, 10 },
	{ 25, 27,  9, 11 },
	{ 16, 18,  0,  2 },
	{ 17, 19,  1,  3 },
	{ 28, 30, 12, 14 },
	{ 29, 31, 13, 15 },
	{ 20, 22,  4,  6 },
	{ 21, 23,  5,  7 }
};

__constant uchar blockTable8[4][8] =
{
	{  0,  1,  4,  5, 16, 17, 20, 21},
	{  2,  3,  6,  7, 18, 19, 22, 23},
	{  8,  9, 12, 13, 24, 25, 28, 29},
	{ 10, 11, 14, 15, 26, 27, 30, 31}
};

__constant uchar blockTable4[8][4] =
{
	{  0,  2,  8, 10 },
	{  1,  3,  9, 11 },
	{  4,  6, 12, 14 },
	{  5,  7, 13, 15 },
	{ 16, 18, 24, 26 },
	{ 17, 19, 25, 27 },
	{ 20, 22, 28, 30 },
	{ 21, 23, 29, 31 }
};

__constant uchar columnTable32[8][8] =
{
	{  0,  1,  4,  5,  8,  9, 12, 13 },
	{  2,  3,  6,  7, 10, 11, 14, 15 },
	{ 16, 17, 20, 21, 24, 25, 28, 29 },
	{ 18, 19, 22, 23, 26, 27, 30, 31 },
	{ 32, 33, 36, 37, 40, 41, 44, 45 },
	{ 34, 35, 38, 39, 42, 43, 46, 47 },
	{ 48, 49, 52, 53, 56, 57, 60, 61 },
	{ 50, 51, 54, 55, 58, 59, 62, 63 },
};

__constant uchar columnTable16[8][16] =
{
	{   0,   2,   8,  10,  16,  18,  24,  26,
	    1,   3,   9,  11,  17,  19,  25,  27 },
	{   4,   6,  12,  14,  20,  22,  28,  30,
	    5,   7,  13,  15,  21,  23,  29,  31 },
	{  32,  34,  40,  42,  48,  50,  56,  58,
	   33,  35,  41,  43,  49,  51,  57,  59 },
	{  36,  38,  44,  46,  52,  54,  60,  62,
	   37,  39,  45,  47,  53,  55,  61,  63 },
	{  64,  66,  72,  74,  80,  82,  88,  90,
	   65,  67,  73,  75,  81,  83,  89,  91 },
	{  68,  70,  76,  78,  84,  86,  92,  94,
	   69,  71,  77,  79,  85,  87,  93,  95 },
	{  96,  98, 104, 106, 112, 114, 120, 122,
	   97,  99, 105, 107, 113, 115, 121, 123 },
	{ 100, 102, 108, 110, 116, 118, 124, 126,
	  101, 103, 109, 111, 117, 119, 125, 127 },
};

__constant uchar columnTable8[16][16] =
{
	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
	    2,   6,  18,  22,  34,  38,  50,  54 },
	{   8,  12,  24,  28,  40,  44,  56,  60,
	   10,  14,  26,  30,  42,  46,  58,  62 },
	{  33,  37,  49,  53,   1,   5,  17,  21,
	   35,  39,  51,  55,   3,   7,  19,  23 },
	{  41,  45,  57,  61,   9,  13,  25,  29,
	   43,  47,  59,  63,  11,  15,  27,  31 },
	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
	   98, 102, 114, 118,  66,  70,  82,  86 },
	{ 104, 108, 120, 124,  72,  76,  88,  92,
	  106, 110, 122, 126,  74,  78,  90,  94 },
	{  65,  69,  81,  85,  97, 101, 113, 117,
	   67,  71,  83,  87,  99, 103, 115, 119 },
	{  73,  77,  89,  93, 105, 109, 121, 125,
	   75,  79,  91,  95, 107, 111, 123, 127 },
	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
	  130, 134, 146, 150, 162, 166, 178, 182 },
	{ 136, 140, 152, 156, 168, 172, 184, 188,
	  138, 142, 154, 158, 170, 174, 186, 190 },
	{ 161, 165, 177, 181, 129, 133, 145, 149,
	  163, 167, 179, 183, 131, 135, 147, 151 },
	{ 169, 173, 185, 189, 137, 141, 153, 157,
	  171, 175, 187, 191, 139, 143, 155, 159 },
	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
	  226, 230, 242, 246, 194, 198, 210, 214 },
	{ 232, 236, 248, 252, 200, 204, 216, 220,
	  234, 238, 250, 254, 202, 206, 218, 222 },
	{ 193, 197, 209, 213, 225, 229, 241, 245,
	  195, 199, 211, 215, 227, 231, 243, 247 },
	{ 201, 205, 217, 221, 233, 237, 249, 253,
	  203, 207, 219, 223, 235, 239, 251, 255 },
};

__constant ushort columnTable4[16][32] =
{
	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
	    2,  10,  34,  42,  66,  74,  98, 106,
	    4,  12,  36,  44,  68,  76, 100, 108,
	    6,  14,  38,  46,  70,  78, 102, 110 },
	{  16,  24,  48,  56,  80,  88, 112, 120,
	   18,  26,  50,  58,  82,  90, 114, 122,
	   20,  28,  52,  60,  84,  92, 116, 124,
	   22,  30,  54,  62,  86,  94, 118, 126 },
	{  65,  73,  97, 105,   1,   9,  33,  41,
	   67,  75,  99, 107,   3,  11,  35,  43,
	   69,  77, 101, 109,   5,  13,  37,  45,
	   71,  79, 103, 111,   7,  15,  39,  47 },
	{  81,  89, 113, 121,  17,  25,  49,  57,
	   83,  91, 115, 123,  19,  27,  51,  59,
	   85,  93, 117, 125,  21,  29,  53,  61,
	   87,  95, 119, 127,  23,  31,  55,  63 },
	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
	  194, 202, 226, 234, 130, 138, 162, 170,
	  196, 204, 228, 236, 132, 140, 164, 172,
	  198, 206, 230, 238, 134, 142, 166, 174 },
	{ 208, 216, 240, 248, 144, 152, 176, 184,
	  210, 218, 242, 250, 146, 154, 178, 186,
	  212, 220, 244, 252, 148, 156, 180, 188,
	  214, 222, 246, 254, 150, 158, 182, 190 },
	{ 129, 137, 161, 169, 193, 201, 225, 233,
	  131, 139, 163, 171, 195, 203, 227, 235,
	  133, 141, 165, 173, 197, 205, 229, 237,
	  135, 143, 167, 175, 199, 207, 231, 239 },
	{ 145, 153, 177, 185, 209, 217, 241, 249,
	  147, 155, 179, 187, 211, 219, 243, 251,
	  149, 157, 181, 189, 213, 221, 245, 253,
	  151, 159, 183, 191, 215, 223, 247, 255 },
	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
	  258, 266, 290, 298, 322, 330, 354, 362,
	  260, 268, 292, 300, 324, 332, 356, 364,
	  262, 270, 294, 302, 326, 334, 358, 366 },
	{ 272, 280, 304, 312, 336, 344, 368, 376,
	  274, 282, 306, 314, 338, 346, 370, 378,
	  276, 284, 308, 316, 340, 348, 372, 380,
	  278, 286, 310, 318, 342, 350, 374, 382 },
	{ 321, 329, 353, 361, 257, 265, 289, 297,
	  323, 331, 355, 363, 259, 267, 291, 299,
	  325, 333, 357, 365, 261, 269, 293, 301,
	  327, 335, 359, 367, 263, 271, 295, 303 },
	{ 337, 345, 369, 377, 273, 281, 305, 313,
	  339, 347, 371, 379, 275, 283, 307, 315,
	  341, 349, 373, 381, 277, 285, 309, 317,
	  343, 351, 375, 383, 279, 287, 311, 319 },
	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
	  450, 458, 482, 490, 386, 394, 418, 426,
	  452, 460, 484, 492, 388, 396, 420, 428,
	  454, 462, 486, 494, 390, 398, 422, 430 },
	{ 464, 472, 496, 504, 400, 408, 432, 440,
	  466, 474, 498, 506, 402, 410, 434, 442,
	  468, 476, 500, 508, 404, 412, 436, 444,
	  470, 478, 502, 510, 406, 414, 438, 446 },
	{ 385, 393, 417, 425, 449, 457, 481, 489,
	  387, 395, 419, 427, 451, 459, 483, 491,
	  389, 397, 421, 429, 453, 461, 485, 493,
	  391, 399, 423, 431, 455, 463, 487, 495 },
	{ 401, 409, 433, 441, 465, 473, 497, 505,
	  403, 411, 435, 443, 467, 475, 499, 507,
	  405, 413, 437, 445, 469, 477, 501, 509,
	  407, 415, 439, 447, 471, 479, 503, 511 },
};

int BlockNumber32(int x, int y, int bp, int bw)
{
	return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
}

int BlockNumber16(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
}

int BlockNumber16S(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
}

int BlockNumber32Z(int x, int y, int bp, int bw)
{
	return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
}

int BlockNumber16Z(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
}

int BlockNumber16SZ(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
}

int BlockNumber8(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
}

int BlockNumber4(int x, int y, int bp, int bw)
{
	return bp + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
}

int PixelAddress32(int x, int y, int bp, int bw)
{
	return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
}

int PixelAddress16(int x, int y, int bp, int bw)
{
	return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
}

int PixelAddress16S(int x, int y, int bp, int bw)
{
	return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
}

int PixelAddress32Z(int x, int y, int bp, int bw)
{
	return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
}

int PixelAddress16Z(int x, int y, int bp, int bw)
{
	return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
}

int PixelAddress16SZ(int x, int y, int bp, int bw)
{
	return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
}

int PixelAddress8(int x, int y, int bp, int bw)
{
	return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15];
}

int PixelAddress4(int x, int y, int bp, int bw)
{
	return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31];
}

int PixelAddress(int x, int y, int bp, int bw, int psm)
{
	switch(psm)
	{
	default:
	case PSM_PSMCT32: 
	case PSM_PSMCT24: 
	case PSM_PSMT8H:
	case PSM_PSMT4HL:
	case PSM_PSMT4HH:
		return PixelAddress32(x, y, bp, bw);
	case PSM_PSMCT16: 
		return PixelAddress16(x, y, bp, bw);
	case PSM_PSMCT16S: 
		return PixelAddress16S(x, y, bp, bw);
	case PSM_PSMZ32: 
	case PSM_PSMZ24: 
		return PixelAddress32Z(x, y, bp, bw);
	case PSM_PSMZ16: 
		return PixelAddress16Z(x, y, bp, bw);
	case PSM_PSMZ16S: 
		return PixelAddress16SZ(x, y, bp, bw);
	case PSM_PSMT8:
		return PixelAddress8(x, y, bp, bw);
	case PSM_PSMT4:
		return PixelAddress4(x, y, bp, bw);
	}
}

uint ReadFrame(__global uchar* vm, int addr, int psm)
{
	switch(psm)
	{
	default:
	case PSM_PSMCT32: 
	case PSM_PSMCT24: 
	case PSM_PSMZ32: 
	case PSM_PSMZ24: 
		return ((__global uint*)vm)[addr];
	case PSM_PSMCT16: 
	case PSM_PSMCT16S: 
	case PSM_PSMZ16: 
	case PSM_PSMZ16S: 
		return ((__global ushort*)vm)[addr];
	}
}

void WriteFrame(__global uchar* vm, int addr, int psm, uint value)
{
	switch(psm)
	{
	default:
	case PSM_PSMCT32: 
	case PSM_PSMZ32:
	case PSM_PSMCT24: 
	case PSM_PSMZ24: 
		((__global uint*)vm)[addr] = value; 
		break;
	case PSM_PSMCT16: 
	case PSM_PSMCT16S: 
	case PSM_PSMZ16: 
	case PSM_PSMZ16S: 
		((__global ushort*)vm)[addr] = (ushort)value;
		break;
	}
}

bool is16bit(int psm)
{
	return psm < 8 && (psm & 3) >= 2;
}

bool is24bit(int psm)
{
	return psm < 8 && (psm & 3) == 1;
}

bool is32bit(int psm)
{
	return psm < 8 && (psm & 3) == 0;
}

#ifdef PRIM

int GetVertexPerPrim(int prim_class)
{
	switch(prim_class)
	{
	default:
	case GS_POINT_CLASS: return 1;
	case GS_LINE_CLASS: return 2;
	case GS_TRIANGLE_CLASS: return 3;
	case GS_SPRITE_CLASS: return 2;
	}
}

#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM)

#endif

#ifdef KERNEL_PRIM

__kernel void KERNEL_PRIM(
	__global gs_env* env,
	__global uchar* vb_base, 
	__global uchar* ib_base,
	__global uchar* pb_base, 
	uint vb_start,
	uint ib_start,
	uint pb_start)
{
	size_t prim_index = get_global_id(0);

	__global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start);
	__global uint* ib = (__global uint*)(ib_base + ib_start);
	__global gs_prim* prim = &env->prim[prim_index];
	
	ib += prim_index * VERTEX_PER_PRIM;

	uint pb_index = ib[0] >> 24;

	prim->pb_index = pb_index;

	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start + pb_index * TFX_PARAM_SIZE);

	__global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff];
	__global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff];
	__global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff];

	int2 pmin, pmax;

	if(PRIM == GS_POINT_CLASS)
	{
		pmin = pmax = convert_int2_rte(v0->p.xy);

		prim->v[0].p = v0->p;
		prim->v[0].tc = v0->tc;
	}
	else if(PRIM == GS_LINE_CLASS)
	{
		int2 p0 = convert_int2_rte(v0->p.xy);
		int2 p1 = convert_int2_rte(v1->p.xy);

		pmin = min(p0, p1);
		pmax = max(p0, p1);
	}
	else if(PRIM == GS_TRIANGLE_CLASS)
	{
		int2 p0 = convert_int2_rtp(v0->p.xy);
		int2 p1 = convert_int2_rtp(v1->p.xy);
		int2 p2 = convert_int2_rtp(v2->p.xy);

		pmin = min(min(p0, p1), p2);
		pmax = max(max(p0, p1), p2);

		// z needs special care, since it's a 32 bit unit, float cannot encode it exactly
		// only interpolate the relative to zmin and hopefully small values

		uint zmin = min(min(v0->z, v1->z), v2->z);
		uint zmax = max(max(v0->z, v1->z), v2->z);
		
		prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
		prim->v[0].tc = v0->tc;
		prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w);
		prim->v[1].tc = v1->tc;
		prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
		prim->v[2].tc = v2->tc;

		prim->zmin = zmin;
		prim->zmax = zmax;

		float4 dp0 = v1->p - v0->p;
		float4 dp1 = v0->p - v2->p;
		float4 dp2 = v2->p - v1->p;

		float cp = dp0.x * dp1.y - dp0.y * dp1.x;

		if(cp != 0.0f)
		{
			cp = native_recip(cp);

			float2 u = dp0.xy * cp;
			float2 v = -dp1.xy * cp;

			// v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0)

			gs_barycentric b;

			b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x);
			b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y);

			dp0.xy = dp0.xy * sign(cp);
			dp1.xy = dp1.xy * sign(cp);
			dp2.xy = dp2.xy * sign(cp);

			b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0)));
			b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0)));
			b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0)));
			
			// any barycentric(reject_corner) < 0, tile outside the triangle

			b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE;
			b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE;
			b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE;

			// TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed

			env->barycentric[prim_index] = b;
		}
		else // triangle has zero area
		{
			pmax = -1; // won't get included in any tile
		}
	}
	else if(PRIM == GS_SPRITE_CLASS)
	{
		int2 p0 = convert_int2_rtp(v0->p.xy);
		int2 p1 = convert_int2_rtp(v1->p.xy);

		pmin = min(p0, p1);
		pmax = max(p0, p1);

		int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0);

		prim->v[0].p = select(v0->p, v1->p, mask); // pmin
		prim->v[0].tc = select(v0->tc, v1->tc, mask);
		prim->v[1].p = select(v1->p, v0->p, mask); // pmax
		prim->v[1].tc = select(v1->tc, v0->tc, mask);
		prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy);
	}

	int4 scissor = pb->scissor;

	pmin = select(pmin, scissor.xy, pmin < scissor.xy);
	pmax = select(pmax, scissor.zw, pmax > scissor.zw);

	int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS;

	env->bbox[prim_index] = convert_uchar4_sat(r);
}

#endif

#ifdef KERNEL_TILE

int tile_in_triangle(float2 p, gs_barycentric b)
{
	float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz;

	f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10));

	return all(f >= b.zero.xyz);
}

#if CLEAR == 1

__kernel void KERNEL_TILE(__global gs_env* env)
{
	env->bounds[get_global_id(0)].first = -1;
	env->bounds[get_global_id(0)].last = 0;
}

#elif MODE < 3

#if MAX_PRIM_PER_BATCH != 32
	#error "MAX_PRIM_PER_BATCH != 32"
#endif

#define MAX_PRIM_PER_GROUP (32u >> MODE)

__kernel void KERNEL_TILE(
	__global gs_env* env,
	uint prim_count,
	uint bin_count, // == bin_dim.z * bin_dim.w
	uchar4 bin_dim)
{
	uint batch_index = get_group_id(2) >> MODE;
	uint prim_start = get_group_id(2) << (5 - MODE);
	uint group_prim_index = get_local_id(2);
	uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0);

	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
	__global uchar4* bbox = &env->bbox[prim_start];
	__global gs_barycentric* barycentric = &env->barycentric[prim_start];

	__local uchar4 bbox_cache[MAX_PRIM_PER_GROUP];
	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP];
	__local uint visible[8 << MODE];

	if(get_local_id(2) == 0)
	{
		visible[bin_index] = 0;
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP);

	event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0);

	wait_group_events(1, &e);

	if(PRIM == GS_TRIANGLE_CLASS)
	{
		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
		
		wait_group_events(1, &e);
	}

	if(group_prim_index < group_prim_count)
	{
		int x = bin_dim.x + get_local_id(0);
		int y = bin_dim.y + get_local_id(1);

		uchar4 r = bbox_cache[group_prim_index];

		uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);

		if(PRIM == GS_TRIANGLE_CLASS && test != 0)
		{
			test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
		}

		atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2)));
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	if(get_local_id(2) == 0)
	{
		#if MODE == 0
		((__global uint*)&bin[bin_index])[0] = visible[bin_index];
		#elif MODE == 1
		((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index];
		#elif MODE == 2
		((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index];
		#endif

		if(visible[bin_index] != 0)
		{
			atomic_min(&env->bounds[bin_index].first, batch_index);
			atomic_max(&env->bounds[bin_index].last, batch_index);
		}
	}
}

#elif MODE == 3

__kernel void KERNEL_TILE(
	__global gs_env* env,
	uint prim_count,
	uint bin_count, // == bin_dim.z * bin_dim.w
	uchar4 bin_dim)
{
	size_t batch_index = get_group_id(0);
	size_t local_id = get_local_id(0);
	size_t local_size = get_local_size(0);

	uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
		
	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
	__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
	__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];

	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
	
	event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);

	wait_group_events(1, &e);

	if(PRIM == GS_TRIANGLE_CLASS)
	{
		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
		
		wait_group_events(1, &e);
	}

	for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
	{
		int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
		int x = bin_index - y * bin_dim.z;

		x += bin_dim.x;
		y += bin_dim.y;

		BIN_TYPE visible = 0;

		for(uint i = 0; i < batch_prim_count; i++)
		{
			uchar4 r = bbox_cache[i];

			BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);

			if(PRIM == GS_TRIANGLE_CLASS && test != 0)
			{
				test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
			}

			visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
		}

		bin[bin_index] = visible;

		if(visible != 0)
		{
			atomic_min(&env->bounds[bin_index].first, batch_index);
			atomic_max(&env->bounds[bin_index].last, batch_index);
		}
	}
}

#endif

#endif

#ifdef KERNEL_TFX

bool ZTest(uint zs, uint zd)
{ 
	if(ZTEST)
	{
		if(is24bit(ZPSM)) zd &= 0x00ffffff;

		switch(ZTST)
		{
		case ZTST_NEVER:
			return false;
		case ZTST_ALWAYS:
			return true;
		case ZTST_GEQUAL:
			return zs >= zd;
		case ZTST_GREATER:
			return zs > zd;
		}
	}

	return true;
}

bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
{
	switch(AFAIL)
	{
	case AFAIL_KEEP:
		break;
	case AFAIL_FB_ONLY:
		if(!ZWRITE) return true;
		break;
	case AFAIL_ZB_ONLY:
		if(!FWRITE) return true;
		break;
	case AFAIL_RGB_ONLY:
		if(!ZWRITE && is24bit(FPSM)) return true;
		break;
	}

	uint pass;
	
	switch(ATST)
	{
	case ATST_NEVER:
		pass = false;
		break;
	case ATST_ALWAYS:
		return true;
	case ATST_LESS:
		pass = alpha < aref;
		break;
	case ATST_LEQUAL:
		pass = alpha <= aref;
		break;
	case ATST_EQUAL:
		pass = alpha == aref;
		break;
	case ATST_GEQUAL:
		pass = alpha >= aref;
		break;
	case ATST_GREATER:
		pass = alpha > aref;
		break;
	case ATST_NOTEQUAL:
		pass = alpha != aref;
		break;
	}

	switch(AFAIL)
	{
	case AFAIL_KEEP:
		return pass;
	case AFAIL_FB_ONLY:
		*zm |= pass ? 0 : 0xffffffff;
		break;
	case AFAIL_ZB_ONLY:
		*fm |= pass ? 0 : 0xffffffff;
		break;
	case AFAIL_RGB_ONLY:
		if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000;
		if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000;
		*zm |= pass ? 0 : 0xffffffff;
		break;
	}

	return true;
}

bool DestAlphaTest(uint fd)
{
	if(DATE)
	{
		if(DATM)
		{
			if(is32bit(FPSM)) return (fd & 0x80000000) != 0;
			if(is16bit(FPSM)) return (fd & 0x00008000) != 0;
		}
		else
		{
			if(is32bit(FPSM)) return (fd & 0x80000000) == 0;
			if(is16bit(FPSM)) return (fd & 0x00008000) == 0;
		}
	}

	return true;
}

int Wrap(int a, int b, int c, int mode)
{
	if(MERGED)
	{
		return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0);
	}
	else
	{
		switch(mode)
		{
		case CLAMP_REGION_REPEAT:
			return (a & b) | c;
		case CLAMP_REPEAT:
			return a & b;
		case CLAMP_CLAMP:
			return clamp(a, 0, c);
		case CLAMP_REGION_CLAMP:
			return clamp(a, b, c);
		}
	}
}

int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel)
{
	if(FWRITE && (ABE || AA1))
	{
		int4 cs = c;
		int4 cd;

		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED)
		{
			if(is32bit(FPSM) || is24bit(FPSM))
			{
				cd.x = fd & 0xff;
				cd.y = (fd >> 8) & 0xff;
				cd.z = (fd >> 16) & 0xff;
				cd.w = fd >> 24;
			}
			else if(is16bit(FPSM))
			{
				cd.x = (fd << 3) & 0xf8;
				cd.y = (fd >> 2) & 0xf8;
				cd.z = (fd >> 7) & 0xf8;
				cd.w = (fd >> 8) & 0x80;
			}
		}

		if(MERGED)
		{
			int aba = TFX_ABA(sel);
			int abb = TFX_ABB(sel);
			int abc = TFX_ABC(sel);
			int abd = TFX_ABD(sel);

			int ad = !is24bit(FPSM) ? cd.w : 0x80;

			int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0;
			int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0;
			int C = abc == 0 ? cs.w : abc == 1 ? ad : afix;
			int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0;

			c.xyz = (mul24(A - B, C) >> 7) + D;
		}
		else
		{
			if(ABA != ABB)
			{
				switch(ABA)
				{
				case 0: break; // c.xyz = cs.xyz;
				case 1: c.xyz = cd.xyz; break;
				case 2: c.xyz = 0; break;
				}

				switch(ABB)
				{
				case 0: c.xyz -= cs.xyz; break;
				case 1: c.xyz -= cd.xyz; break;
				case 2: break;
				}

				if(!(is24bit(FPSM) && ABC == 1))
				{
					int a = 0;

					switch(ABC)
					{
					case 0: a = cs.w; break;
					case 1: a = cd.w; break;
					case 2: a = afix; break;
					}

					c.xyz = c.xyz * a >> 7;
				}

				switch(ABD)
				{
				case 0: c.xyz += cs.xyz; break;
				case 1: c.xyz += cd.xyz; break;
				case 2: break;
				}
			}
			else
			{
				switch(ABD)
				{
				case 0: break;
				case 1: c.xyz = cd.xyz; break;
				case 2: c.xyz = 0; break;
				}
			}
		}

		if(PABE)
		{
			c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24));
		}
	}

	return c;
}

uchar4 Expand24To32(uint rgba, uchar ta0)
{
	uchar4 c;

	c.x = rgba & 0xff;
	c.y = (rgba >> 8) & 0xff;
	c.z = (rgba >> 16) & 0xff;
	c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0;

	return c;
}

uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
{
	uchar4 c;

	c.x = (rgba << 3) & 0xf8;
	c.y = (rgba >> 2) & 0xf8;
	c.z = (rgba >> 7) & 0xf8;
	c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;

	return c;
}

int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb)
{
	uchar4 c;

	uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM);

	__global ushort* vm16 = (__global ushort*)vm;
	__global uint* vm32 = (__global uint*)vm;

	switch(TPSM)
	{
	default:
	case PSM_PSMCT32: 
	case PSM_PSMZ32:
		c = ((__global uchar4*)vm)[addr];
		break;
	case PSM_PSMCT24: 
	case PSM_PSMZ24: 
		c = Expand24To32(vm32[addr], pb->ta0);
		break;
	case PSM_PSMCT16: 
	case PSM_PSMCT16S: 
	case PSM_PSMZ16: 
	case PSM_PSMZ16S: 
		c = Expand16To32(vm16[addr], pb->ta0, pb->ta1);
		break;
	case PSM_PSMT8:
		c = pb->clut[vm[addr]];
		break;
	case PSM_PSMT4:
		c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f];
		break;
	case PSM_PSMT8H:
		c = pb->clut[vm32[addr] >> 24];
		break;
	case PSM_PSMT4HL:
		c = pb->clut[(vm32[addr] >> 24) & 0x0f];
		break;
	case PSM_PSMT4HH:
		c = pb->clut[(vm32[addr] >> 28) & 0x0f];
		break;
	}

	//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]);

	return convert_int4(c);
}

int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
{
	int4 c;

	if(0)//if(MMIN)
	{
		// TODO
	}
	else
	{
		int2 uv;

		if(!FST)
		{
			uv = convert_int2_rte(t.xy * native_recip(t.z));
		}
		else
		{
			// sfex capcom logo third drawing call at (0,223) calculated as:
			// t0 + (p - p0) * (t - t0) / (p1 - p0)  
			// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
			// due to rounding errors (multiply-add instruction maybe):
			// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
			// TODO: sw renderer samples at 112 exactly, check which one is correct

			// last line error in persona 3 movie clips if rounding is enabled

			uv = convert_int2(t.xy); 
		}

		if(LTF) uv -= 0x0008;

		int2 uvf = uv & 0x000f;

		int2 uv0 = uv >> 4;
		int2 uv1 = uv0 + 1;

		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);

		int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
		int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
		int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
		int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);

		if(LTF)
		{
			c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
			c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
			c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
		}

		c = c00;
	}

	return c;
}

// TODO: 2x2 MSAA idea
// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
// hittest and ztest 2x2 (create write mask, only skip if all -1) 
// calculate color 1x1, alpha tests 1x1
// use mask to filter failed sub-pixels when writing to __local
// needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered
// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average

__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
	__global gs_env* env,
	__global uchar* vm,
	__global uchar* tex,
	__global uchar* pb_base, 
	uint pb_start,
	uint prim_start, 
	uint prim_count,
	uint bin_count, // == bin_dim.z * bin_dim.w
	uchar4 bin_dim,
	uint fbp, 
	uint zbp, 
	uint bw)
{
	uint x = get_global_id(0);
	uint y = get_global_id(1);

	uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x;
	uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y;
	uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x);

	uint batch_first = env->bounds[bin_index].first;
	uint batch_last = env->bounds[bin_index].last;
	uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS;

	if(batch_last < batch_first)
	{
		return;
	}

	uint skip;
	
	if(batch_start < batch_first)
	{
		uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1));

		if(n > prim_count) 
		{
			return;
		}

		skip = 0;
		prim_count -= n;
		batch_start = batch_first;
	}
	else
	{
		skip = prim_start & (MAX_PRIM_PER_BATCH - 1);
		prim_count += skip;
	}

	if(batch_start > batch_last) 
	{
		return;
	}
	
	prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS);

	//

	int2 pi = (int2)(x, y);
	float2 pf = convert_float2(pi);

	int faddr = PixelAddress(x, y, fbp, bw, FPSM);
	int zaddr = PixelAddress(x, y, zbp, bw, ZPSM);

	uint fd, zd; // TODO: fd as int4 and only pack before writing out?

	if(RFB) 
	{
		fd = ReadFrame(vm, faddr, FPSM);
	}

	if(RZB)
	{
		zd = ReadFrame(vm, zaddr, ZPSM);
	}

	// early destination alpha test

	if(!DestAlphaTest(fd))
	{
		return;
	}

	//

	uint fragments = 0;

	__global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case"
	__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
	__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];

	pb_base += pb_start;

	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);

	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
	{
		while(bin_value != 0)
		{
			uint i = clz(bin_value);

			if(prim_index + i >= prim_count)
			{
				break;
			}

			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);

			__global gs_prim* prim = &prim_base[prim_index + i];
			__global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE);

			if(!NOSCISSOR)
			{
				if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
				{
					continue;
				}
			}
			
			uint2 zf;
			float3 t;
			int4 c;

			 // TODO: do not hittest if we know the tile is fully inside the prim

			if(PRIM == GS_POINT_CLASS)
			{
				float2 dpf = pf - prim->v[0].p.xy;

				if(!all((dpf <= 0.5f) & (dpf > -0.5f)))
				{
					continue;
				}

				zf = as_uint2(prim->v[0].p.zw);
				t = prim->v[0].tc.xyz;
				c = convert_int4(prim->v[0].c);
			}
			else if(PRIM == GS_LINE_CLASS)
			{
				// TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f
				// TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled
				// TODO: do not draw last pixel of the line

				continue;
			}
			else if(PRIM == GS_TRIANGLE_CLASS)
			{
				// TODO: aa1: draw edge as a line

				if(!ZTest(prim->zmax, zd))
				{
					continue;
				}

				__global gs_barycentric* b = &barycentric[prim_index + i];

				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);

				if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz))
				{
					continue;
				}

				float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw));
				float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
				float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));

				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin;
				zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);

				t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;

				if(IIP)
				{
					float4 c0 = convert_float4(prim->v[0].c);
					float4 c1 = convert_float4(prim->v[1].c);
					float4 c2 = convert_float4(prim->v[2].c);

					c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y);
				}
				else
				{
					c = convert_int4(prim->v[2].c);
				}
			}
			else if(PRIM == GS_SPRITE_CLASS)
			{
				int2 tl = convert_int2_rtp(prim->v[0].p.xy);
				int2 br = convert_int2_rtp(prim->v[1].p.xy);

				if(!all((pi >= tl) & (pi < br)))
				{
					continue;
				}

				zf = as_uint2(prim->v[1].p.zw);
				
				t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy);
				t.z = prim->v[0].tc.z;

				c = convert_int4(prim->v[1].c);
			}

			// z test

			uint zs = zf.x;

			if(!ZTest(zs, zd))
			{
				continue;
			}

			// sample texture

			int4 ct;

			if(TFX != TFX_NONE)
			{
				ct = SampleTexture(tex, pb, t);
			}

			// alpha tfx

			int alpha = c.w;

			if(FB)
			{
				if(TCC)
				{
					switch(TFX)
					{
					case TFX_MODULATE:
						c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff);
						break;
					case TFX_DECAL:
						c.w = ct.w;
						break;
					case TFX_HIGHLIGHT:
						c.w = clamp(ct.w + c.w, 0, 0xff);
						break;
					case TFX_HIGHLIGHT2:
						c.w = ct.w;
						break;
					}
				}

				if(AA1)
				{
					if(!ABE || c.w == 0x80)
					{
						c.w = 0x80; // TODO: edge ? coverage : 0x80
					}
				}
			}

			// read mask

			uint fm = pb->fm;
			uint zm = pb->zm;

			// alpha test

			if(!AlphaTest(c.w, pb->aref, &fm, &zm))
			{
				continue;
			}

			// all tests done, we have a new output

			fragments++;

			// write z

			if(ZWRITE)
			{
				zd = RZB ? bitselect(zs, zd, zm) : zs;
			}

			// rgb tfx

			if(FWRITE)
			{
				switch(TFX)
				{
				case TFX_MODULATE:
					c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff);
					break;
				case TFX_DECAL:
					c.xyz = ct.xyz;
					break;
				case TFX_HIGHLIGHT:
				case TFX_HIGHLIGHT2:					
					c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff);
					break;
				}
			}

			// fog

			if(FWRITE && FGE)
			{
				int fog = (int)zf.y;

				int3 fv = mul24(c.xyz, fog) >> 8;
				int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8;

				c.xyz = fv + fc;
			}

			// alpha blend

			c = AlphaBlend(c, fd, pb->afix, pb->sel);

			// write frame

			if(FWRITE)
			{
				if(DTHE && is16bit(FPSM))
				{
					c.xyz += pb->dimx[y & 3][x & 3];
				}

				c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff;
				
				if(FBA && !is24bit(FPSM))
				{
					c.w |= 0x80;
				}

				uint fs;

				if(is32bit(FPSM))
				{
					fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x;
				}
				else if(is24bit(FPSM))
				{
					fs = (c.z << 16) | (c.y << 8) | c.x;
				}
				else if(is16bit(FPSM))
				{
					fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3);
				}

				fd = RFB ? bitselect(fs, fd, fm) : fs;

				// dest alpha test for the next loop

				if(!DestAlphaTest(fd))
				{
					prim_index = prim_count; // game over

					break;
				}
			}
		}

		bin += bin_count;
		bin_value = *bin;
	}

	if(fragments > 0)
	{
		if(ZWRITE)
		{
			WriteFrame(vm, zaddr, ZPSM, zd);
		}

		if(FWRITE)
		{
			WriteFrame(vm, faddr, FPSM, fd);
		}
	}
}

#endif

#endif