#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency #ifdef cl_amd_printf #pragma OPENCL EXTENSION cl_amd_printf : enable #endif #ifdef cl_amd_media_ops #pragma OPENCL EXTENSION cl_amd_media_ops : enable #else #endif #ifdef cl_amd_media_ops2 #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable #else #endif #ifndef CL_FLT_EPSILON #define CL_FLT_EPSILON 1.1920928955078125e-7f #endif #if MAX_PRIM_PER_BATCH == 64u #define BIN_TYPE ulong #elif MAX_PRIM_PER_BATCH == 32u #define BIN_TYPE uint #else #error "MAX_PRIM_PER_BATCH != 32u OR 64u" #endif #define TFX_ABA(sel) ((sel.x >> 24) & 3) #define TFX_ABB(sel) ((sel.x >> 26) & 3) #define TFX_ABC(sel) ((sel.x >> 28) & 3) #define TFX_ABD(sel) ((sel.x >> 30) & 3) #define TFX_WMS(sel) ((sel.y >> 8) & 3) #define TFX_WMT(sel) ((sel.y >> 10) & 3) typedef struct { union {float4 p; struct {float x, y; uint z, f;};}; union {float4 tc; struct {float s, t, q; uchar4 c;};}; } gs_vertex; typedef struct { gs_vertex v[3]; uint zmin, zmax; uint pb_index; uint _pad; } gs_prim; typedef struct { float4 dx, dy; float4 zero; float4 reject_corner; } gs_barycentric; typedef struct { struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; BIN_TYPE bin[MAX_BIN_COUNT]; uchar4 bbox[MAX_PRIM_COUNT]; gs_prim prim[MAX_PRIM_COUNT]; gs_barycentric barycentric[MAX_PRIM_COUNT]; } gs_env; typedef struct { int4 scissor; char dimx[4][4]; uint2 sel; int fbp, zbp, bw; uint fm, zm; uchar4 fog; // rgb uchar aref, afix; uchar ta0, ta1; int tbp[7], tbw[7]; int minu, maxu, minv, maxv; int lod; // lcm == 1 int mxl; float l; // TEX1.L * -0x10000 float k; // TEX1.K * 0x10000 uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother } gs_param; enum GS_PRIM_CLASS { GS_POINT_CLASS, GS_LINE_CLASS, GS_TRIANGLE_CLASS, GS_SPRITE_CLASS }; enum GS_PSM { PSM_PSMCT32, PSM_PSMCT24, PSM_PSMCT16, PSM_PSMCT16S, PSM_PSMZ32, PSM_PSMZ24, PSM_PSMZ16, PSM_PSMZ16S, PSM_PSMT8, PSM_PSMT4, PSM_PSMT8H, PSM_PSMT4HL, PSM_PSMT4HH, }; enum GS_TFX { TFX_MODULATE = 0, TFX_DECAL = 1, TFX_HIGHLIGHT = 2, TFX_HIGHLIGHT2 = 3, TFX_NONE = 4, }; enum GS_CLAMP { CLAMP_REGION_REPEAT = 0, CLAMP_REPEAT = 1, CLAMP_CLAMP = 2, CLAMP_REGION_CLAMP = 3, }; enum GS_ZTST { ZTST_NEVER = 0, ZTST_ALWAYS = 1, ZTST_GEQUAL = 2, ZTST_GREATER = 3, }; enum GS_ATST { ATST_NEVER = 0, ATST_ALWAYS = 1, ATST_LESS = 2, ATST_LEQUAL = 3, ATST_EQUAL = 4, ATST_GEQUAL = 5, ATST_GREATER = 6, ATST_NOTEQUAL = 7, }; enum GS_AFAIL { AFAIL_KEEP = 0, AFAIL_FB_ONLY = 1, AFAIL_ZB_ONLY = 2, AFAIL_RGB_ONLY = 3, }; __constant uchar blockTable32[4][8] = { { 0, 1, 4, 5, 16, 17, 20, 21}, { 2, 3, 6, 7, 18, 19, 22, 23}, { 8, 9, 12, 13, 24, 25, 28, 29}, { 10, 11, 14, 15, 26, 27, 30, 31} }; __constant uchar blockTable32Z[4][8] = { { 24, 25, 28, 29, 8, 9, 12, 13}, { 26, 27, 30, 31, 10, 11, 14, 15}, { 16, 17, 20, 21, 0, 1, 4, 5}, { 18, 19, 22, 23, 2, 3, 6, 7} }; __constant uchar blockTable16[8][4] = { { 0, 2, 8, 10 }, { 1, 3, 9, 11 }, { 4, 6, 12, 14 }, { 5, 7, 13, 15 }, { 16, 18, 24, 26 }, { 17, 19, 25, 27 }, { 20, 22, 28, 30 }, { 21, 23, 29, 31 } }; __constant uchar blockTable16S[8][4] = { { 0, 2, 16, 18 }, { 1, 3, 17, 19 }, { 8, 10, 24, 26 }, { 9, 11, 25, 27 }, { 4, 6, 20, 22 }, { 5, 7, 21, 23 }, { 12, 14, 28, 30 }, { 13, 15, 29, 31 } }; __constant uchar blockTable16Z[8][4] = { { 24, 26, 16, 18 }, { 25, 27, 17, 19 }, { 28, 30, 20, 22 }, { 29, 31, 21, 23 }, { 8, 10, 0, 2 }, { 9, 11, 1, 3 }, { 12, 14, 4, 6 }, { 13, 15, 5, 7 } }; __constant uchar blockTable16SZ[8][4] = { { 24, 26, 8, 10 }, { 25, 27, 9, 11 }, { 16, 18, 0, 2 }, { 17, 19, 1, 3 }, { 28, 30, 12, 14 }, { 29, 31, 13, 15 }, { 20, 22, 4, 6 }, { 21, 23, 5, 7 } }; __constant uchar blockTable8[4][8] = { { 0, 1, 4, 5, 16, 17, 20, 21}, { 2, 3, 6, 7, 18, 19, 22, 23}, { 8, 9, 12, 13, 24, 25, 28, 29}, { 10, 11, 14, 15, 26, 27, 30, 31} }; __constant uchar blockTable4[8][4] = { { 0, 2, 8, 10 }, { 1, 3, 9, 11 }, { 4, 6, 12, 14 }, { 5, 7, 13, 15 }, { 16, 18, 24, 26 }, { 17, 19, 25, 27 }, { 20, 22, 28, 30 }, { 21, 23, 29, 31 } }; __constant uchar columnTable32[8][8] = { { 0, 1, 4, 5, 8, 9, 12, 13 }, { 2, 3, 6, 7, 10, 11, 14, 15 }, { 16, 17, 20, 21, 24, 25, 28, 29 }, { 18, 19, 22, 23, 26, 27, 30, 31 }, { 32, 33, 36, 37, 40, 41, 44, 45 }, { 34, 35, 38, 39, 42, 43, 46, 47 }, { 48, 49, 52, 53, 56, 57, 60, 61 }, { 50, 51, 54, 55, 58, 59, 62, 63 }, }; __constant uchar columnTable16[8][16] = { { 0, 2, 8, 10, 16, 18, 24, 26, 1, 3, 9, 11, 17, 19, 25, 27 }, { 4, 6, 12, 14, 20, 22, 28, 30, 5, 7, 13, 15, 21, 23, 29, 31 }, { 32, 34, 40, 42, 48, 50, 56, 58, 33, 35, 41, 43, 49, 51, 57, 59 }, { 36, 38, 44, 46, 52, 54, 60, 62, 37, 39, 45, 47, 53, 55, 61, 63 }, { 64, 66, 72, 74, 80, 82, 88, 90, 65, 67, 73, 75, 81, 83, 89, 91 }, { 68, 70, 76, 78, 84, 86, 92, 94, 69, 71, 77, 79, 85, 87, 93, 95 }, { 96, 98, 104, 106, 112, 114, 120, 122, 97, 99, 105, 107, 113, 115, 121, 123 }, { 100, 102, 108, 110, 116, 118, 124, 126, 101, 103, 109, 111, 117, 119, 125, 127 }, }; __constant uchar columnTable8[16][16] = { { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 2, 6, 18, 22, 34, 38, 50, 54 }, { 8, 12, 24, 28, 40, 44, 56, 60, 10, 14, 26, 30, 42, 46, 58, 62 }, { 33, 37, 49, 53, 1, 5, 17, 21, 35, 39, 51, 55, 3, 7, 19, 23 }, { 41, 45, 57, 61, 9, 13, 25, 29, 43, 47, 59, 63, 11, 15, 27, 31 }, { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 98, 102, 114, 118, 66, 70, 82, 86 }, { 104, 108, 120, 124, 72, 76, 88, 92, 106, 110, 122, 126, 74, 78, 90, 94 }, { 65, 69, 81, 85, 97, 101, 113, 117, 67, 71, 83, 87, 99, 103, 115, 119 }, { 73, 77, 89, 93, 105, 109, 121, 125, 75, 79, 91, 95, 107, 111, 123, 127 }, { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 130, 134, 146, 150, 162, 166, 178, 182 }, { 136, 140, 152, 156, 168, 172, 184, 188, 138, 142, 154, 158, 170, 174, 186, 190 }, { 161, 165, 177, 181, 129, 133, 145, 149, 163, 167, 179, 183, 131, 135, 147, 151 }, { 169, 173, 185, 189, 137, 141, 153, 157, 171, 175, 187, 191, 139, 143, 155, 159 }, { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 226, 230, 242, 246, 194, 198, 210, 214 }, { 232, 236, 248, 252, 200, 204, 216, 220, 234, 238, 250, 254, 202, 206, 218, 222 }, { 193, 197, 209, 213, 225, 229, 241, 245, 195, 199, 211, 215, 227, 231, 243, 247 }, { 201, 205, 217, 221, 233, 237, 249, 253, 203, 207, 219, 223, 235, 239, 251, 255 }, }; __constant ushort columnTable4[16][32] = { { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 2, 10, 34, 42, 66, 74, 98, 106, 4, 12, 36, 44, 68, 76, 100, 108, 6, 14, 38, 46, 70, 78, 102, 110 }, { 16, 24, 48, 56, 80, 88, 112, 120, 18, 26, 50, 58, 82, 90, 114, 122, 20, 28, 52, 60, 84, 92, 116, 124, 22, 30, 54, 62, 86, 94, 118, 126 }, { 65, 73, 97, 105, 1, 9, 33, 41, 67, 75, 99, 107, 3, 11, 35, 43, 69, 77, 101, 109, 5, 13, 37, 45, 71, 79, 103, 111, 7, 15, 39, 47 }, { 81, 89, 113, 121, 17, 25, 49, 57, 83, 91, 115, 123, 19, 27, 51, 59, 85, 93, 117, 125, 21, 29, 53, 61, 87, 95, 119, 127, 23, 31, 55, 63 }, { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 194, 202, 226, 234, 130, 138, 162, 170, 196, 204, 228, 236, 132, 140, 164, 172, 198, 206, 230, 238, 134, 142, 166, 174 }, { 208, 216, 240, 248, 144, 152, 176, 184, 210, 218, 242, 250, 146, 154, 178, 186, 212, 220, 244, 252, 148, 156, 180, 188, 214, 222, 246, 254, 150, 158, 182, 190 }, { 129, 137, 161, 169, 193, 201, 225, 233, 131, 139, 163, 171, 195, 203, 227, 235, 133, 141, 165, 173, 197, 205, 229, 237, 135, 143, 167, 175, 199, 207, 231, 239 }, { 145, 153, 177, 185, 209, 217, 241, 249, 147, 155, 179, 187, 211, 219, 243, 251, 149, 157, 181, 189, 213, 221, 245, 253, 151, 159, 183, 191, 215, 223, 247, 255 }, { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 258, 266, 290, 298, 322, 330, 354, 362, 260, 268, 292, 300, 324, 332, 356, 364, 262, 270, 294, 302, 326, 334, 358, 366 }, { 272, 280, 304, 312, 336, 344, 368, 376, 274, 282, 306, 314, 338, 346, 370, 378, 276, 284, 308, 316, 340, 348, 372, 380, 278, 286, 310, 318, 342, 350, 374, 382 }, { 321, 329, 353, 361, 257, 265, 289, 297, 323, 331, 355, 363, 259, 267, 291, 299, 325, 333, 357, 365, 261, 269, 293, 301, 327, 335, 359, 367, 263, 271, 295, 303 }, { 337, 345, 369, 377, 273, 281, 305, 313, 339, 347, 371, 379, 275, 283, 307, 315, 341, 349, 373, 381, 277, 285, 309, 317, 343, 351, 375, 383, 279, 287, 311, 319 }, { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 450, 458, 482, 490, 386, 394, 418, 426, 452, 460, 484, 492, 388, 396, 420, 428, 454, 462, 486, 494, 390, 398, 422, 430 }, { 464, 472, 496, 504, 400, 408, 432, 440, 466, 474, 498, 506, 402, 410, 434, 442, 468, 476, 500, 508, 404, 412, 436, 444, 470, 478, 502, 510, 406, 414, 438, 446 }, { 385, 393, 417, 425, 449, 457, 481, 489, 387, 395, 419, 427, 451, 459, 483, 491, 389, 397, 421, 429, 453, 461, 485, 493, 391, 399, 423, 431, 455, 463, 487, 495 }, { 401, 409, 433, 441, 465, 473, 497, 505, 403, 411, 435, 443, 467, 475, 499, 507, 405, 413, 437, 445, 469, 477, 501, 509, 407, 415, 439, 447, 471, 479, 503, 511 }, }; int BlockNumber32(int x, int y, int bp, int bw) { return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; } int BlockNumber16(int x, int y, int bp, int bw) { return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; } int BlockNumber16S(int x, int y, int bp, int bw) { return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; } int BlockNumber32Z(int x, int y, int bp, int bw) { return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; } int BlockNumber16Z(int x, int y, int bp, int bw) { return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; } int BlockNumber16SZ(int x, int y, int bp, int bw) { return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; } int BlockNumber8(int x, int y, int bp, int bw) { return bp + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; } int BlockNumber4(int x, int y, int bp, int bw) { return bp + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; } int PixelAddress32(int x, int y, int bp, int bw) { return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; } int PixelAddress16(int x, int y, int bp, int bw) { return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } int PixelAddress16S(int x, int y, int bp, int bw) { return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } int PixelAddress32Z(int x, int y, int bp, int bw) { return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; } int PixelAddress16Z(int x, int y, int bp, int bw) { return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } int PixelAddress16SZ(int x, int y, int bp, int bw) { return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } int PixelAddress8(int x, int y, int bp, int bw) { return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15]; } int PixelAddress4(int x, int y, int bp, int bw) { return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31]; } int PixelAddress(int x, int y, int bp, int bw, int psm) { switch(psm) { default: case PSM_PSMCT32: case PSM_PSMCT24: case PSM_PSMT8H: case PSM_PSMT4HL: case PSM_PSMT4HH: return PixelAddress32(x, y, bp, bw); case PSM_PSMCT16: return PixelAddress16(x, y, bp, bw); case PSM_PSMCT16S: return PixelAddress16S(x, y, bp, bw); case PSM_PSMZ32: case PSM_PSMZ24: return PixelAddress32Z(x, y, bp, bw); case PSM_PSMZ16: return PixelAddress16Z(x, y, bp, bw); case PSM_PSMZ16S: return PixelAddress16SZ(x, y, bp, bw); case PSM_PSMT8: return PixelAddress8(x, y, bp, bw); case PSM_PSMT4: return PixelAddress4(x, y, bp, bw); } } uint ReadFrame(__global uchar* vm, int addr, int psm) { switch(psm) { default: case PSM_PSMCT32: case PSM_PSMCT24: case PSM_PSMZ32: case PSM_PSMZ24: return ((__global uint*)vm)[addr]; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: return ((__global ushort*)vm)[addr]; } } void WriteFrame(__global uchar* vm, int addr, int psm, uint value) { switch(psm) { default: case PSM_PSMCT32: case PSM_PSMZ32: case PSM_PSMCT24: case PSM_PSMZ24: ((__global uint*)vm)[addr] = value; break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: ((__global ushort*)vm)[addr] = (ushort)value; break; } } bool is16bit(int psm) { return psm < 8 && (psm & 3) >= 2; } bool is24bit(int psm) { return psm < 8 && (psm & 3) == 1; } bool is32bit(int psm) { return psm < 8 && (psm & 3) == 0; } #ifdef PRIM int GetVertexPerPrim(int prim_class) { switch(prim_class) { default: case GS_POINT_CLASS: return 1; case GS_LINE_CLASS: return 2; case GS_TRIANGLE_CLASS: return 3; case GS_SPRITE_CLASS: return 2; } } #define VERTEX_PER_PRIM GetVertexPerPrim(PRIM) #endif #ifdef KERNEL_PRIM __kernel void KERNEL_PRIM( __global gs_env* env, __global uchar* vb_base, __global uchar* ib_base, __global uchar* pb_base, uint vb_start, uint ib_start, uint pb_start) { size_t prim_index = get_global_id(0); __global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start); __global uint* ib = (__global uint*)(ib_base + ib_start); __global gs_prim* prim = &env->prim[prim_index]; ib += prim_index * VERTEX_PER_PRIM; uint pb_index = ib[0] >> 24; prim->pb_index = pb_index; __global gs_param* pb = (__global gs_param*)(pb_base + pb_start + pb_index * TFX_PARAM_SIZE); __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; __global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff]; int2 pmin, pmax; if(PRIM == GS_POINT_CLASS) { pmin = pmax = convert_int2_rte(v0->p.xy); prim->v[0].p = v0->p; prim->v[0].tc = v0->tc; } else if(PRIM == GS_LINE_CLASS) { int2 p0 = convert_int2_rte(v0->p.xy); int2 p1 = convert_int2_rte(v1->p.xy); pmin = min(p0, p1); pmax = max(p0, p1); } else if(PRIM == GS_TRIANGLE_CLASS) { int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); int2 p2 = convert_int2_rtp(v2->p.xy); pmin = min(min(p0, p1), p2); pmax = max(max(p0, p1), p2); // z needs special care, since it's a 32 bit unit, float cannot encode it exactly // only interpolate the relative to zmin and hopefully small values uint zmin = min(min(v0->z, v1->z), v2->z); uint zmax = max(max(v0->z, v1->z), v2->z); prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w); prim->v[0].tc = v0->tc; prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w); prim->v[1].tc = v1->tc; prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); prim->v[2].tc = v2->tc; prim->zmin = zmin; prim->zmax = zmax; float4 dp0 = v1->p - v0->p; float4 dp1 = v0->p - v2->p; float4 dp2 = v2->p - v1->p; float cp = dp0.x * dp1.y - dp0.y * dp1.x; if(cp != 0.0f) { cp = native_recip(cp); float2 u = dp0.xy * cp; float2 v = -dp1.xy * cp; // v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0) gs_barycentric b; b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x); b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y); dp0.xy = dp0.xy * sign(cp); dp1.xy = dp1.xy * sign(cp); dp2.xy = dp2.xy * sign(cp); b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0))); b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0))); b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0))); // any barycentric(reject_corner) < 0, tile outside the triangle b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE; b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE; b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE; // TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed env->barycentric[prim_index] = b; } else // triangle has zero area { pmax = -1; // won't get included in any tile } } else if(PRIM == GS_SPRITE_CLASS) { int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); pmin = min(p0, p1); pmax = max(p0, p1); int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0); prim->v[0].p = select(v0->p, v1->p, mask); // pmin prim->v[0].tc = select(v0->tc, v1->tc, mask); prim->v[1].p = select(v1->p, v0->p, mask); // pmax prim->v[1].tc = select(v1->tc, v0->tc, mask); prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); } int4 scissor = pb->scissor; pmin = select(pmin, scissor.xy, pmin < scissor.xy); pmax = select(pmax, scissor.zw, pmax > scissor.zw); int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS; env->bbox[prim_index] = convert_uchar4_sat(r); } #endif #ifdef KERNEL_TILE int tile_in_triangle(float2 p, gs_barycentric b) { float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz; f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); return all(f >= b.zero.xyz); } #if CLEAR == 1 __kernel void KERNEL_TILE(__global gs_env* env) { env->bounds[get_global_id(0)].first = -1; env->bounds[get_global_id(0)].last = 0; } #elif MODE < 3 #if MAX_PRIM_PER_BATCH != 32 #error "MAX_PRIM_PER_BATCH != 32" #endif #define MAX_PRIM_PER_GROUP (32u >> MODE) __kernel void KERNEL_TILE( __global gs_env* env, uint prim_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { uint batch_index = get_group_id(2) >> MODE; uint prim_start = get_group_id(2) << (5 - MODE); uint group_prim_index = get_local_id(2); uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0); __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; __global uchar4* bbox = &env->bbox[prim_start]; __global gs_barycentric* barycentric = &env->barycentric[prim_start]; __local uchar4 bbox_cache[MAX_PRIM_PER_GROUP]; __local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP]; __local uint visible[8 << MODE]; if(get_local_id(2) == 0) { visible[bin_index] = 0; } barrier(CLK_LOCAL_MEM_FENCE); uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP); event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0); wait_group_events(1, &e); if(PRIM == GS_TRIANGLE_CLASS) { e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); wait_group_events(1, &e); } if(group_prim_index < group_prim_count) { int x = bin_dim.x + get_local_id(0); int y = bin_dim.y + get_local_id(1); uchar4 r = bbox_cache[group_prim_index]; uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]); } atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2))); } barrier(CLK_LOCAL_MEM_FENCE); if(get_local_id(2) == 0) { #if MODE == 0 ((__global uint*)&bin[bin_index])[0] = visible[bin_index]; #elif MODE == 1 ((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index]; #elif MODE == 2 ((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index]; #endif if(visible[bin_index] != 0) { atomic_min(&env->bounds[bin_index].first, batch_index); atomic_max(&env->bounds[bin_index].last, batch_index); } } } #elif MODE == 3 __kernel void KERNEL_TILE( __global gs_env* env, uint prim_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { size_t batch_index = get_group_id(0); size_t local_id = get_local_id(0); size_t local_size = get_local_size(0); uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; __local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); wait_group_events(1, &e); if(PRIM == GS_TRIANGLE_CLASS) { e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); wait_group_events(1, &e); } for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) { int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware int x = bin_index - y * bin_dim.z; x += bin_dim.x; y += bin_dim.y; BIN_TYPE visible = 0; for(uint i = 0; i < batch_prim_count; i++) { uchar4 r = bbox_cache[i]; BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); } visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); } bin[bin_index] = visible; if(visible != 0) { atomic_min(&env->bounds[bin_index].first, batch_index); atomic_max(&env->bounds[bin_index].last, batch_index); } } } #endif #endif #ifdef KERNEL_TFX bool ZTest(uint zs, uint zd) { if(ZTEST) { if(is24bit(ZPSM)) zd &= 0x00ffffff; switch(ZTST) { case ZTST_NEVER: return false; case ZTST_ALWAYS: return true; case ZTST_GEQUAL: return zs >= zd; case ZTST_GREATER: return zs > zd; } } return true; } bool AlphaTest(int alpha, int aref, uint* fm, uint* zm) { switch(AFAIL) { case AFAIL_KEEP: break; case AFAIL_FB_ONLY: if(!ZWRITE) return true; break; case AFAIL_ZB_ONLY: if(!FWRITE) return true; break; case AFAIL_RGB_ONLY: if(!ZWRITE && is24bit(FPSM)) return true; break; } uint pass; switch(ATST) { case ATST_NEVER: pass = false; break; case ATST_ALWAYS: return true; case ATST_LESS: pass = alpha < aref; break; case ATST_LEQUAL: pass = alpha <= aref; break; case ATST_EQUAL: pass = alpha == aref; break; case ATST_GEQUAL: pass = alpha >= aref; break; case ATST_GREATER: pass = alpha > aref; break; case ATST_NOTEQUAL: pass = alpha != aref; break; } switch(AFAIL) { case AFAIL_KEEP: return pass; case AFAIL_FB_ONLY: *zm |= pass ? 0 : 0xffffffff; break; case AFAIL_ZB_ONLY: *fm |= pass ? 0 : 0xffffffff; break; case AFAIL_RGB_ONLY: if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000; if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000; *zm |= pass ? 0 : 0xffffffff; break; } return true; } bool DestAlphaTest(uint fd) { if(DATE) { if(DATM) { if(is32bit(FPSM)) return (fd & 0x80000000) != 0; if(is16bit(FPSM)) return (fd & 0x00008000) != 0; } else { if(is32bit(FPSM)) return (fd & 0x80000000) == 0; if(is16bit(FPSM)) return (fd & 0x00008000) == 0; } } return true; } int Wrap(int a, int b, int c, int mode) { if(MERGED) { return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0); } else { switch(mode) { case CLAMP_REGION_REPEAT: return (a & b) | c; case CLAMP_REPEAT: return a & b; case CLAMP_CLAMP: return clamp(a, 0, c); case CLAMP_REGION_CLAMP: return clamp(a, b, c); } } } int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel) { if(FWRITE && (ABE || AA1)) { int4 cs = c; int4 cd; if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED) { if(is32bit(FPSM) || is24bit(FPSM)) { cd.x = fd & 0xff; cd.y = (fd >> 8) & 0xff; cd.z = (fd >> 16) & 0xff; cd.w = fd >> 24; } else if(is16bit(FPSM)) { cd.x = (fd << 3) & 0xf8; cd.y = (fd >> 2) & 0xf8; cd.z = (fd >> 7) & 0xf8; cd.w = (fd >> 8) & 0x80; } } if(MERGED) { int aba = TFX_ABA(sel); int abb = TFX_ABB(sel); int abc = TFX_ABC(sel); int abd = TFX_ABD(sel); int ad = !is24bit(FPSM) ? cd.w : 0x80; int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0; int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0; int C = abc == 0 ? cs.w : abc == 1 ? ad : afix; int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0; c.xyz = (mul24(A - B, C) >> 7) + D; } else { if(ABA != ABB) { switch(ABA) { case 0: break; // c.xyz = cs.xyz; case 1: c.xyz = cd.xyz; break; case 2: c.xyz = 0; break; } switch(ABB) { case 0: c.xyz -= cs.xyz; break; case 1: c.xyz -= cd.xyz; break; case 2: break; } if(!(is24bit(FPSM) && ABC == 1)) { int a = 0; switch(ABC) { case 0: a = cs.w; break; case 1: a = cd.w; break; case 2: a = afix; break; } c.xyz = c.xyz * a >> 7; } switch(ABD) { case 0: c.xyz += cs.xyz; break; case 1: c.xyz += cd.xyz; break; case 2: break; } } else { switch(ABD) { case 0: break; case 1: c.xyz = cd.xyz; break; case 2: c.xyz = 0; break; } } } if(PABE) { c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24)); } } return c; } uchar4 Expand24To32(uint rgba, uchar ta0) { uchar4 c; c.x = rgba & 0xff; c.y = (rgba >> 8) & 0xff; c.z = (rgba >> 16) & 0xff; c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0; return c; } uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1) { uchar4 c; c.x = (rgba << 3) & 0xf8; c.y = (rgba >> 2) & 0xf8; c.z = (rgba >> 7) & 0xf8; c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; return c; } int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb) { uchar4 c; uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM); __global ushort* vm16 = (__global ushort*)vm; __global uint* vm32 = (__global uint*)vm; switch(TPSM) { default: case PSM_PSMCT32: case PSM_PSMZ32: c = ((__global uchar4*)vm)[addr]; break; case PSM_PSMCT24: case PSM_PSMZ24: c = Expand24To32(vm32[addr], pb->ta0); break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: c = Expand16To32(vm16[addr], pb->ta0, pb->ta1); break; case PSM_PSMT8: c = pb->clut[vm[addr]]; break; case PSM_PSMT4: c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f]; break; case PSM_PSMT8H: c = pb->clut[vm32[addr] >> 24]; break; case PSM_PSMT4HL: c = pb->clut[(vm32[addr] >> 24) & 0x0f]; break; case PSM_PSMT4HH: c = pb->clut[(vm32[addr] >> 28) & 0x0f]; break; } //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]); return convert_int4(c); } int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) { int4 c; if(0)//if(MMIN) { // TODO } else { int2 uv; if(!FST) { uv = convert_int2_rte(t.xy * native_recip(t.z)); } else { // sfex capcom logo third drawing call at (0,223) calculated as: // t0 + (p - p0) * (t - t0) / (p1 - p0) // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 // due to rounding errors (multiply-add instruction maybe): // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation // TODO: sw renderer samples at 112 exactly, check which one is correct // last line error in persona 3 movie clips if rounding is enabled uv = convert_int2(t.xy); } if(LTF) uv -= 0x0008; int2 uvf = uv & 0x000f; int2 uv0 = uv >> 4; int2 uv1 = uv0 + 1; uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS); uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT); uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS); uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT); int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); if(LTF) { c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; } c = c00; } return c; } // TODO: 2x2 MSAA idea // downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local // hittest and ztest 2x2 (create write mask, only skip if all -1) // calculate color 1x1, alpha tests 1x1 // use mask to filter failed sub-pixels when writing to __local // needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered // multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end // pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( __global gs_env* env, __global uchar* vm, __global uchar* tex, __global uchar* pb_base, uint pb_start, uint prim_start, uint prim_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim, uint fbp, uint zbp, uint bw) { uint x = get_global_id(0); uint y = get_global_id(1); uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x; uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y; uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x); uint batch_first = env->bounds[bin_index].first; uint batch_last = env->bounds[bin_index].last; uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS; if(batch_last < batch_first) { return; } uint skip; if(batch_start < batch_first) { uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1)); if(n > prim_count) { return; } skip = 0; prim_count -= n; batch_start = batch_first; } else { skip = prim_start & (MAX_PRIM_PER_BATCH - 1); prim_count += skip; } if(batch_start > batch_last) { return; } prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS); // int2 pi = (int2)(x, y); float2 pf = convert_float2(pi); int faddr = PixelAddress(x, y, fbp, bw, FPSM); int zaddr = PixelAddress(x, y, zbp, bw, ZPSM); uint fd, zd; // TODO: fd as int4 and only pack before writing out? if(RFB) { fd = ReadFrame(vm, faddr, FPSM); } if(RZB) { zd = ReadFrame(vm, zaddr, ZPSM); } // early destination alpha test if(!DestAlphaTest(fd)) { return; } // uint fragments = 0; __global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case" __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; pb_base += pb_start; BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) { while(bin_value != 0) { uint i = clz(bin_value); if(prim_index + i >= prim_count) { break; } bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); __global gs_prim* prim = &prim_base[prim_index + i]; __global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE); if(!NOSCISSOR) { if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) { continue; } } uint2 zf; float3 t; int4 c; // TODO: do not hittest if we know the tile is fully inside the prim if(PRIM == GS_POINT_CLASS) { float2 dpf = pf - prim->v[0].p.xy; if(!all((dpf <= 0.5f) & (dpf > -0.5f))) { continue; } zf = as_uint2(prim->v[0].p.zw); t = prim->v[0].tc.xyz; c = convert_int4(prim->v[0].c); } else if(PRIM == GS_LINE_CLASS) { // TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f // TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled // TODO: do not draw last pixel of the line continue; } else if(PRIM == GS_TRIANGLE_CLASS) { // TODO: aa1: draw edge as a line if(!ZTest(prim->zmax, zd)) { continue; } __global gs_barycentric* b = &barycentric[prim_index + i]; float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1); if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz)) { continue; } float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw)); float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; if(IIP) { float4 c0 = convert_float4(prim->v[0].c); float4 c1 = convert_float4(prim->v[1].c); float4 c2 = convert_float4(prim->v[2].c); c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y); } else { c = convert_int4(prim->v[2].c); } } else if(PRIM == GS_SPRITE_CLASS) { int2 tl = convert_int2_rtp(prim->v[0].p.xy); int2 br = convert_int2_rtp(prim->v[1].p.xy); if(!all((pi >= tl) & (pi < br))) { continue; } zf = as_uint2(prim->v[1].p.zw); t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy); t.z = prim->v[0].tc.z; c = convert_int4(prim->v[1].c); } // z test uint zs = zf.x; if(!ZTest(zs, zd)) { continue; } // sample texture int4 ct; if(TFX != TFX_NONE) { ct = SampleTexture(tex, pb, t); } // alpha tfx int alpha = c.w; if(FB) { if(TCC) { switch(TFX) { case TFX_MODULATE: c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff); break; case TFX_DECAL: c.w = ct.w; break; case TFX_HIGHLIGHT: c.w = clamp(ct.w + c.w, 0, 0xff); break; case TFX_HIGHLIGHT2: c.w = ct.w; break; } } if(AA1) { if(!ABE || c.w == 0x80) { c.w = 0x80; // TODO: edge ? coverage : 0x80 } } } // read mask uint fm = pb->fm; uint zm = pb->zm; // alpha test if(!AlphaTest(c.w, pb->aref, &fm, &zm)) { continue; } // all tests done, we have a new output fragments++; // write z if(ZWRITE) { zd = RZB ? bitselect(zs, zd, zm) : zs; } // rgb tfx if(FWRITE) { switch(TFX) { case TFX_MODULATE: c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff); break; case TFX_DECAL: c.xyz = ct.xyz; break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff); break; } } // fog if(FWRITE && FGE) { int fog = (int)zf.y; int3 fv = mul24(c.xyz, fog) >> 8; int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8; c.xyz = fv + fc; } // alpha blend c = AlphaBlend(c, fd, pb->afix, pb->sel); // write frame if(FWRITE) { if(DTHE && is16bit(FPSM)) { c.xyz += pb->dimx[y & 3][x & 3]; } c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff; if(FBA && !is24bit(FPSM)) { c.w |= 0x80; } uint fs; if(is32bit(FPSM)) { fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x; } else if(is24bit(FPSM)) { fs = (c.z << 16) | (c.y << 8) | c.x; } else if(is16bit(FPSM)) { fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3); } fd = RFB ? bitselect(fs, fd, fm) : fs; // dest alpha test for the next loop if(!DestAlphaTest(fd)) { prim_index = prim_count; // game over break; } } } bin += bin_count; bin_value = *bin; } if(fragments > 0) { if(ZWRITE) { WriteFrame(vm, zaddr, ZPSM, zd); } if(FWRITE) { WriteFrame(vm, faddr, FPSM, fd); } } } #endif #endif