minor opencl kernel optimizations

This commit is contained in:
gabest11 2014-09-20 23:59:45 +02:00 committed by Gregory Hainaut
parent 72cfc6a6ef
commit 3d2b0e3766
3 changed files with 144 additions and 155 deletions

View File

@ -61,8 +61,6 @@ typedef struct
typedef struct typedef struct
{ {
cl_uint batch_counter;
cl_uint _pad[7];
struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH]; struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
BIN_TYPE bin[MAX_BIN_COUNT]; BIN_TYPE bin[MAX_BIN_COUNT];
cl_uchar4 bbox[MAX_PRIM_COUNT]; cl_uchar4 bbox[MAX_PRIM_COUNT];
@ -392,10 +390,9 @@ void GSRendererCL::Draw()
if(bbox.eq(bbox.rintersect(scissor))) if(bbox.eq(bbox.rintersect(scissor)))
{ {
pb->sel.noscissor = 1; job->sel.noscissor = 1;
} }
job->sel = pb->sel;
job->rect.x = rect.x; job->rect.x = rect.x;
job->rect.y = rect.y; job->rect.y = rect.y;
job->rect.z = rect.z; job->rect.z = rect.z;
@ -418,11 +415,11 @@ void GSRendererCL::Draw()
// mark pages used in rendering as source or target // mark pages used in rendering as source or target
if(pb->sel.fwrite || pb->sel.rfb) if(job->sel.fwrite || job->sel.rfb)
{ {
m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages); m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
if(pb->sel.rfb) if(job->sel.rfb)
{ {
for(int i = 0; i < 4; i++) for(int i = 0; i < 4; i++)
{ {
@ -430,7 +427,7 @@ void GSRendererCL::Draw()
} }
} }
if(pb->sel.fwrite) if(job->sel.fwrite)
{ {
GSVector4i* dst_pages = job->GetDstPages(); GSVector4i* dst_pages = job->GetDstPages();
@ -443,11 +440,11 @@ void GSRendererCL::Draw()
} }
} }
if(pb->sel.zwrite || pb->sel.rzb) if(job->sel.zwrite || job->sel.rzb)
{ {
m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages); m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
if(pb->sel.rzb) if(job->sel.rzb)
{ {
for(int i = 0; i < 4; i++) for(int i = 0; i < 4; i++)
{ {
@ -455,7 +452,7 @@ void GSRendererCL::Draw()
} }
} }
if(pb->sel.zwrite) if(job->sel.zwrite)
{ {
GSVector4i* dst_pages = job->GetDstPages(); GSVector4i* dst_pages = job->GetDstPages();
@ -998,12 +995,12 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
const GSDrawingContext* context = m_context; const GSDrawingContext* context = m_context;
const GS_PRIM_CLASS primclass = m_vt.m_primclass; const GS_PRIM_CLASS primclass = m_vt.m_primclass;
pb->sel.key = 0; job->sel.key = 0;
pb->sel.atst = ATST_ALWAYS; job->sel.atst = ATST_ALWAYS;
pb->sel.tfx = TFX_NONE; job->sel.tfx = TFX_NONE;
pb->sel.ababcd = 0xff; job->sel.ababcd = 0xff;
pb->sel.prim = primclass; job->sel.prim = primclass;
uint32 fm = context->FRAME.FBMSK; uint32 fm = context->FRAME.FBMSK;
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
@ -1026,8 +1023,8 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
{ {
if(!TryAlphaTest(fm, zm)) if(!TryAlphaTest(fm, zm))
{ {
pb->sel.atst = context->TEST.ATST; job->sel.atst = context->TEST.ATST;
pb->sel.afail = context->TEST.AFAIL; job->sel.afail = context->TEST.AFAIL;
pb->aref = context->TEST.AREF; pb->aref = context->TEST.AREF;
} }
} }
@ -1056,31 +1053,31 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(!fwrite && !zwrite) return false; if(!fwrite && !zwrite) return false;
bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; bool ftest = job->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
pb->sel.fwrite = fwrite; job->sel.fwrite = fwrite;
pb->sel.ftest = ftest; job->sel.ftest = ftest;
pb->sel.zwrite = zwrite; job->sel.zwrite = zwrite;
pb->sel.ztest = ztest; job->sel.ztest = ztest;
if(fwrite || ftest) if(fwrite || ftest)
{ {
pb->sel.fpsm = RemapPSM(context->FRAME.PSM); job->sel.fpsm = RemapPSM(context->FRAME.PSM);
if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
{ {
pb->sel.iip = PRIM->IIP; job->sel.iip = PRIM->IIP;
} }
if(PRIM->TME) if(PRIM->TME)
{ {
pb->sel.tfx = context->TEX0.TFX; job->sel.tfx = context->TEX0.TFX;
pb->sel.tcc = context->TEX0.TCC; job->sel.tcc = context->TEX0.TCC;
pb->sel.fst = PRIM->FST; job->sel.fst = PRIM->FST;
pb->sel.ltf = m_vt.IsLinear(); job->sel.ltf = m_vt.IsLinear();
pb->sel.tpsm = RemapPSM(context->TEX0.PSM); job->sel.tpsm = RemapPSM(context->TEX0.PSM);
pb->sel.aem = m_env.TEXA.AEM; job->sel.aem = m_env.TEXA.AEM;
pb->tbp[0] = context->TEX0.TBP0; pb->tbp[0] = context->TEX0.TBP0;
pb->tbw[0] = context->TEX0.TBW; pb->tbw[0] = context->TEX0.TBW;
@ -1089,24 +1086,24 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
{ {
pb->sel.tlu = 1; job->sel.tlu = 1;
memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
} }
pb->sel.wms = context->CLAMP.WMS; job->sel.wms = context->CLAMP.WMS;
pb->sel.wmt = context->CLAMP.WMT; job->sel.wmt = context->CLAMP.WMT;
if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) if(job->sel.tfx == TFX_MODULATE && job->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
{ {
// modulate does not do anything when vertex color is 0x80 // modulate does not do anything when vertex color is 0x80
pb->sel.tfx = TFX_DECAL; job->sel.tfx = TFX_DECAL;
} }
GSVector4i r; GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); GetTextureMinMax(r, context->TEX0, context->CLAMP, job->sel.ltf);
GSVector4i* src_pages = job->GetSrcPages(); GSVector4i* src_pages = job->GetSrcPages();
@ -1131,15 +1128,15 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(m_vt.m_lod.x > 0) if(m_vt.m_lod.x > 0)
{ {
pb->sel.ltf = context->TEX1.MMIN >> 2; job->sel.ltf = context->TEX1.MMIN >> 2;
} }
else else
{ {
// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
} }
pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri job->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
pb->sel.lcm = context->TEX1.LCM; job->sel.lcm = context->TEX1.LCM;
int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16; int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
int k = context->TEX1.K << 12; int k = context->TEX1.K << 12;
@ -1148,28 +1145,28 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
{ {
k = (int)m_vt.m_lod.x << 16; // set lod to max level k = (int)m_vt.m_lod.x << 16; // set lod to max level
pb->sel.lcm = 1; // lod is constant job->sel.lcm = 1; // lod is constant
pb->sel.mmin = 1; // tri-linear is meaningless job->sel.mmin = 1; // tri-linear is meaningless
} }
if(pb->sel.mmin == 2) if(job->sel.mmin == 2)
{ {
mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
} }
if(pb->sel.fst) if(job->sel.fst)
{ {
ASSERT(pb->sel.lcm == 1); ASSERT(job->sel.lcm == 1);
ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
pb->sel.lcm = 1; job->sel.lcm = 1;
} }
if(pb->sel.lcm) if(job->sel.lcm)
{ {
int lod = std::max<int>(std::min<int>(k, mxl), 0); int lod = std::max<int>(std::min<int>(k, mxl), 0);
if(pb->sel.mmin == 1) if(job->sel.mmin == 1)
{ {
lod = (lod + 0x8000) & 0xffff0000; // rounding lod = (lod + 0x8000) & 0xffff0000; // rounding
} }
@ -1241,7 +1238,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
GSVector4i r; GSVector4i r;
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, job->sel.ltf);
GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM); GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
@ -1260,7 +1257,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
} }
else else
{ {
if(pb->sel.fst == 0) if(job->sel.fst == 0)
{ {
// skip per pixel division if q is constant // skip per pixel division if q is constant
@ -1268,7 +1265,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(m_vt.m_eq.q) if(m_vt.m_eq.q)
{ {
pb->sel.fst = 1; job->sel.fst = 1;
const GSVector4& t = v[index[0]].t; const GSVector4& t = v[index[0]].t;
@ -1286,7 +1283,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
} }
else if(primclass == GS_SPRITE_CLASS) else if(primclass == GS_SPRITE_CLASS)
{ {
pb->sel.fst = 1; job->sel.fst = 1;
for(int i = 0, j = vertex_count; i < j; i += 2) for(int i = 0, j = vertex_count; i < j; i += 2)
{ {
@ -1301,7 +1298,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
} }
} }
if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel? if(job->sel.ltf && job->sel.fst) // TODO: quite slow, do this in the prim kernel?
{ {
// if q is constant we can do the half pel shift for bilinear sampling on the vertices // if q is constant we can do the half pel shift for bilinear sampling on the vertices
@ -1378,58 +1375,58 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(PRIM->FGE) if(PRIM->FGE)
{ {
pb->sel.fge = 1; job->sel.fge = 1;
pb->fog = env.FOGCOL.u32[0]; pb->fog = env.FOGCOL.u32[0];
} }
if(context->FRAME.PSM != PSM_PSMCT24) if(context->FRAME.PSM != PSM_PSMCT24)
{ {
pb->sel.date = context->TEST.DATE; job->sel.date = context->TEST.DATE;
pb->sel.datm = context->TEST.DATM; job->sel.datm = context->TEST.DATM;
} }
if(!IsOpaque()) if(!IsOpaque())
{ {
pb->sel.abe = PRIM->ABE; job->sel.abe = PRIM->ABE;
pb->sel.ababcd = context->ALPHA.u32[0]; job->sel.ababcd = context->ALPHA.u32[0];
if(env.PABE.PABE) if(env.PABE.PABE)
{ {
pb->sel.pabe = 1; job->sel.pabe = 1;
} }
if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
{ {
pb->sel.aa1 = 1; job->sel.aa1 = 1;
} }
pb->afix = context->ALPHA.FIX; pb->afix = context->ALPHA.FIX;
} }
if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1) if(job->sel.date || job->sel.aba == 1 || job->sel.abb == 1 || job->sel.abc == 1 || job->sel.abd == 1)
{ {
pb->sel.rfb = 1; job->sel.rfb = 1;
} }
else else
{ {
if(fwrite) if(fwrite)
{ {
if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY if(job->sel.atst != ATST_ALWAYS && job->sel.afail == AFAIL_RGB_ONLY
|| (pb->sel.fpsm & 3) == 0 && fm != 0 || (job->sel.fpsm & 3) == 0 && fm != 0
|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask || (job->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0) || (job->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
{ {
pb->sel.rfb = 1; job->sel.rfb = 1;
} }
} }
} }
pb->sel.colclamp = env.COLCLAMP.CLAMP; job->sel.colclamp = env.COLCLAMP.CLAMP;
pb->sel.fba = context->FBA.FBA; job->sel.fba = context->FBA.FBA;
if(env.DTHE.DTHE) if(env.DTHE.DTHE)
{ {
pb->sel.dthe = 1; job->sel.dthe = 1;
GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16); GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16); GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
@ -1442,21 +1439,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
if(zwrite || ztest) if(zwrite || ztest)
{ {
pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); job->sel.zpsm = RemapPSM(context->ZBUF.PSM);
pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; job->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
if(ztest) if(ztest)
{ {
pb->sel.rzb = 1; job->sel.rzb = 1;
} }
else else
{ {
if(zwrite) if(zwrite)
{ {
if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY) if(job->sel.atst != ATST_ALWAYS && (job->sel.afail == AFAIL_FB_ONLY || job->sel.afail == AFAIL_RGB_ONLY)
|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask || (job->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
{ {
pb->sel.rzb = 1; job->sel.rzb = 1;
} }
} }
} }
@ -1465,11 +1462,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
pb->fm = fm; pb->fm = fm;
pb->zm = zm; pb->zm = zm;
if((pb->sel.fpsm & 3) == 1) if((job->sel.fpsm & 3) == 1)
{ {
pb->fm |= 0xff000000; pb->fm |= 0xff000000;
} }
else if((pb->sel.fpsm & 3) >= 2) else if((job->sel.fpsm & 3) >= 2)
{ {
uint32 rb = pb->fm & 0x00f800f8; uint32 rb = pb->fm & 0x00f800f8;
uint32 ga = pb->fm & 0x8000f800; uint32 ga = pb->fm & 0x8000f800;
@ -1477,11 +1474,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
} }
if((pb->sel.zpsm & 3) == 1) if((job->sel.zpsm & 3) == 1)
{ {
pb->zm |= 0xff000000; pb->zm |= 0xff000000;
} }
else if((pb->sel.zpsm & 3) >= 2) else if((job->sel.zpsm & 3) >= 2)
{ {
pb->zm |= 0xffff0000; pb->zm |= 0xffff0000;
} }

View File

@ -146,7 +146,6 @@ class GSRendererCL : public GSRenderer
{ {
GSVector4i scissor; GSVector4i scissor;
GSVector4i dimx; // 4x4 signed char GSVector4i dimx; // 4x4 signed char
TFXSelector sel;
uint32 fbp, zbp, bw; uint32 fbp, zbp, bw;
uint32 fm, zm; uint32 fm, zm;
uint32 fog; // rgb uint32 fog; // rgb

View File

@ -1,5 +1,21 @@
#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
#ifdef cl_amd_printf
#pragma OPENCL EXTENSION cl_amd_printf : enable
#else
#define printf(x)
#endif
#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
#endif
#ifdef cl_amd_media_ops2
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#else
#endif
#ifndef CL_FLT_EPSILON #ifndef CL_FLT_EPSILON
#define CL_FLT_EPSILON 1.1920928955078125e-7f #define CL_FLT_EPSILON 1.1920928955078125e-7f
#endif #endif
@ -32,8 +48,6 @@ typedef struct
typedef struct typedef struct
{ {
uint batch_counter;
uint _pad[7];
struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
BIN_TYPE bin[MAX_BIN_COUNT]; BIN_TYPE bin[MAX_BIN_COUNT];
uchar4 bbox[MAX_PRIM_COUNT]; uchar4 bbox[MAX_PRIM_COUNT];
@ -45,7 +59,6 @@ typedef struct
{ {
int4 scissor; int4 scissor;
char dimx[4][4]; char dimx[4][4];
ulong sel;
int fbp, zbp, bw; int fbp, zbp, bw;
uint fm, zm; uint fm, zm;
uchar4 fog; // rgb uchar4 fog; // rgb
@ -679,7 +692,6 @@ int tile_in_triangle(float2 p, gs_barycentric b)
__kernel void KERNEL_TILE(__global gs_env* env) __kernel void KERNEL_TILE(__global gs_env* env)
{ {
env->batch_counter = 0;
env->bounds[get_global_id(0)].first = -1; env->bounds[get_global_id(0)].first = -1;
env->bounds[get_global_id(0)].last = 0; env->bounds[get_global_id(0)].last = 0;
} }
@ -777,77 +789,60 @@ __kernel void KERNEL_TILE(
uint bin_count, // == bin_dim.z * bin_dim.w uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim) uchar4 bin_dim)
{ {
__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; size_t batch_index = get_group_id(0);
__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
__local uint batch_index;
size_t local_id = get_local_id(0); size_t local_id = get_local_id(0);
size_t local_size = get_local_size(0); size_t local_size = get_local_size(0);
while(1) uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
wait_group_events(1, &e);
if(PRIM == GS_TRIANGLE_CLASS)
{ {
barrier(CLK_LOCAL_MEM_FENCE); e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
if(local_id == 0)
{
batch_index = atomic_inc(&env->batch_counter);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(batch_index >= batch_count)
{
break;
}
uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
wait_group_events(1, &e); wait_group_events(1, &e);
}
if(PRIM == GS_TRIANGLE_CLASS) for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
{
int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
int x = bin_index - y * bin_dim.z;
x += bin_dim.x;
y += bin_dim.y;
BIN_TYPE visible = 0;
for(uint i = 0; i < batch_prim_count; i++)
{ {
e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); uchar4 r = bbox_cache[i];
wait_group_events(1, &e); BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
{
test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
}
visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
} }
for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) bin[bin_index] = visible;
if(visible != 0)
{ {
int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware atomic_min(&env->bounds[bin_index].first, batch_index);
int x = bin_index - y * bin_dim.z; atomic_max(&env->bounds[bin_index].last, batch_index);
x += bin_dim.x;
y += bin_dim.y;
BIN_TYPE visible = 0;
for(uint i = 0; i < batch_prim_count; i++)
{
uchar4 r = bbox_cache[i];
BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
{
test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
}
visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
}
bin[bin_index] = visible;
if(visible != 0)
{
atomic_min(&env->bounds[bin_index].first, batch_index);
atomic_max(&env->bounds[bin_index].last, batch_index);
}
} }
} }
} }
@ -998,10 +993,10 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
} }
else if(is16bit(FPSM)) else if(is16bit(FPSM))
{ {
cd.x = (fd & 0x001f) << 3; cd.x = (fd << 3) & 0xf8;
cd.y = (fd & 0x03e0) >> 2; cd.y = (fd >> 2) & 0xf8;
cd.z = (fd & 0x7c00) >> 7; cd.z = (fd >> 7) & 0xf8;
cd.w = (fd & 0x8000) >> 8; cd.w = (fd >> 8) & 0x80;
} }
} }
@ -1077,9 +1072,9 @@ uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
{ {
uchar4 c; uchar4 c;
c.x = (rgba & 0x001f) << 3; c.x = (rgba << 3) & 0xf8;
c.y = (rgba & 0x03e0) >> 2; c.y = (rgba >> 2) & 0xf8;
c.z = (rgba & 0x7c00) >> 7; c.z = (rgba >> 7) & 0xf8;
c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;
return c; return c;
@ -1202,7 +1197,7 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end // multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average // pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average
__kernel void KERNEL_TFX( __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
__global gs_env* env, __global gs_env* env,
__global uchar* vm, __global uchar* vm,
__global uchar* tex, __global uchar* tex,
@ -1214,8 +1209,6 @@ __kernel void KERNEL_TFX(
uint bin_count, // == bin_dim.z * bin_dim.w uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim) uchar4 bin_dim)
{ {
// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way
uint x = get_global_id(0); uint x = get_global_id(0);
uint y = get_global_id(1); uint y = get_global_id(1);
@ -1451,7 +1444,7 @@ __kernel void KERNEL_TFX(
{ {
if(!ABE || c.w == 0x80) if(!ABE || c.w == 0x80)
{ {
c.w = /*edge ? coverage :*/ 0x80; // TODO c.w = 0x80; // TODO: edge ? coverage : 0x80
} }
} }
} }