mirror of https://github.com/PCSX2/pcsx2.git
minor opencl kernel optimizations
This commit is contained in:
parent
72cfc6a6ef
commit
3d2b0e3766
|
@ -61,8 +61,6 @@ typedef struct
|
|||
|
||||
typedef struct
|
||||
{
|
||||
cl_uint batch_counter;
|
||||
cl_uint _pad[7];
|
||||
struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
|
||||
BIN_TYPE bin[MAX_BIN_COUNT];
|
||||
cl_uchar4 bbox[MAX_PRIM_COUNT];
|
||||
|
@ -392,10 +390,9 @@ void GSRendererCL::Draw()
|
|||
|
||||
if(bbox.eq(bbox.rintersect(scissor)))
|
||||
{
|
||||
pb->sel.noscissor = 1;
|
||||
job->sel.noscissor = 1;
|
||||
}
|
||||
|
||||
job->sel = pb->sel;
|
||||
job->rect.x = rect.x;
|
||||
job->rect.y = rect.y;
|
||||
job->rect.z = rect.z;
|
||||
|
@ -418,11 +415,11 @@ void GSRendererCL::Draw()
|
|||
|
||||
// mark pages used in rendering as source or target
|
||||
|
||||
if(pb->sel.fwrite || pb->sel.rfb)
|
||||
if(job->sel.fwrite || job->sel.rfb)
|
||||
{
|
||||
m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
|
||||
|
||||
if(pb->sel.rfb)
|
||||
if(job->sel.rfb)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
|
@ -430,7 +427,7 @@ void GSRendererCL::Draw()
|
|||
}
|
||||
}
|
||||
|
||||
if(pb->sel.fwrite)
|
||||
if(job->sel.fwrite)
|
||||
{
|
||||
GSVector4i* dst_pages = job->GetDstPages();
|
||||
|
||||
|
@ -443,11 +440,11 @@ void GSRendererCL::Draw()
|
|||
}
|
||||
}
|
||||
|
||||
if(pb->sel.zwrite || pb->sel.rzb)
|
||||
if(job->sel.zwrite || job->sel.rzb)
|
||||
{
|
||||
m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
|
||||
|
||||
if(pb->sel.rzb)
|
||||
if(job->sel.rzb)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
|
@ -455,7 +452,7 @@ void GSRendererCL::Draw()
|
|||
}
|
||||
}
|
||||
|
||||
if(pb->sel.zwrite)
|
||||
if(job->sel.zwrite)
|
||||
{
|
||||
GSVector4i* dst_pages = job->GetDstPages();
|
||||
|
||||
|
@ -998,12 +995,12 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
const GSDrawingContext* context = m_context;
|
||||
const GS_PRIM_CLASS primclass = m_vt.m_primclass;
|
||||
|
||||
pb->sel.key = 0;
|
||||
job->sel.key = 0;
|
||||
|
||||
pb->sel.atst = ATST_ALWAYS;
|
||||
pb->sel.tfx = TFX_NONE;
|
||||
pb->sel.ababcd = 0xff;
|
||||
pb->sel.prim = primclass;
|
||||
job->sel.atst = ATST_ALWAYS;
|
||||
job->sel.tfx = TFX_NONE;
|
||||
job->sel.ababcd = 0xff;
|
||||
job->sel.prim = primclass;
|
||||
|
||||
uint32 fm = context->FRAME.FBMSK;
|
||||
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
|
||||
|
@ -1026,8 +1023,8 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
{
|
||||
if(!TryAlphaTest(fm, zm))
|
||||
{
|
||||
pb->sel.atst = context->TEST.ATST;
|
||||
pb->sel.afail = context->TEST.AFAIL;
|
||||
job->sel.atst = context->TEST.ATST;
|
||||
job->sel.afail = context->TEST.AFAIL;
|
||||
pb->aref = context->TEST.AREF;
|
||||
}
|
||||
}
|
||||
|
@ -1056,31 +1053,31 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(!fwrite && !zwrite) return false;
|
||||
|
||||
bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
|
||||
bool ftest = job->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
|
||||
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
|
||||
|
||||
pb->sel.fwrite = fwrite;
|
||||
pb->sel.ftest = ftest;
|
||||
pb->sel.zwrite = zwrite;
|
||||
pb->sel.ztest = ztest;
|
||||
job->sel.fwrite = fwrite;
|
||||
job->sel.ftest = ftest;
|
||||
job->sel.zwrite = zwrite;
|
||||
job->sel.ztest = ztest;
|
||||
|
||||
if(fwrite || ftest)
|
||||
{
|
||||
pb->sel.fpsm = RemapPSM(context->FRAME.PSM);
|
||||
job->sel.fpsm = RemapPSM(context->FRAME.PSM);
|
||||
|
||||
if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
|
||||
{
|
||||
pb->sel.iip = PRIM->IIP;
|
||||
job->sel.iip = PRIM->IIP;
|
||||
}
|
||||
|
||||
if(PRIM->TME)
|
||||
{
|
||||
pb->sel.tfx = context->TEX0.TFX;
|
||||
pb->sel.tcc = context->TEX0.TCC;
|
||||
pb->sel.fst = PRIM->FST;
|
||||
pb->sel.ltf = m_vt.IsLinear();
|
||||
pb->sel.tpsm = RemapPSM(context->TEX0.PSM);
|
||||
pb->sel.aem = m_env.TEXA.AEM;
|
||||
job->sel.tfx = context->TEX0.TFX;
|
||||
job->sel.tcc = context->TEX0.TCC;
|
||||
job->sel.fst = PRIM->FST;
|
||||
job->sel.ltf = m_vt.IsLinear();
|
||||
job->sel.tpsm = RemapPSM(context->TEX0.PSM);
|
||||
job->sel.aem = m_env.TEXA.AEM;
|
||||
|
||||
pb->tbp[0] = context->TEX0.TBP0;
|
||||
pb->tbw[0] = context->TEX0.TBW;
|
||||
|
@ -1089,24 +1086,24 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
|
||||
{
|
||||
pb->sel.tlu = 1;
|
||||
job->sel.tlu = 1;
|
||||
|
||||
memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
|
||||
}
|
||||
|
||||
pb->sel.wms = context->CLAMP.WMS;
|
||||
pb->sel.wmt = context->CLAMP.WMT;
|
||||
job->sel.wms = context->CLAMP.WMS;
|
||||
job->sel.wmt = context->CLAMP.WMT;
|
||||
|
||||
if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
|
||||
if(job->sel.tfx == TFX_MODULATE && job->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
|
||||
{
|
||||
// modulate does not do anything when vertex color is 0x80
|
||||
|
||||
pb->sel.tfx = TFX_DECAL;
|
||||
job->sel.tfx = TFX_DECAL;
|
||||
}
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
|
||||
GetTextureMinMax(r, context->TEX0, context->CLAMP, job->sel.ltf);
|
||||
|
||||
GSVector4i* src_pages = job->GetSrcPages();
|
||||
|
||||
|
@ -1131,15 +1128,15 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(m_vt.m_lod.x > 0)
|
||||
{
|
||||
pb->sel.ltf = context->TEX1.MMIN >> 2;
|
||||
job->sel.ltf = context->TEX1.MMIN >> 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
|
||||
}
|
||||
|
||||
pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
|
||||
pb->sel.lcm = context->TEX1.LCM;
|
||||
job->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
|
||||
job->sel.lcm = context->TEX1.LCM;
|
||||
|
||||
int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
|
||||
int k = context->TEX1.K << 12;
|
||||
|
@ -1148,28 +1145,28 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
{
|
||||
k = (int)m_vt.m_lod.x << 16; // set lod to max level
|
||||
|
||||
pb->sel.lcm = 1; // lod is constant
|
||||
pb->sel.mmin = 1; // tri-linear is meaningless
|
||||
job->sel.lcm = 1; // lod is constant
|
||||
job->sel.mmin = 1; // tri-linear is meaningless
|
||||
}
|
||||
|
||||
if(pb->sel.mmin == 2)
|
||||
if(job->sel.mmin == 2)
|
||||
{
|
||||
mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
|
||||
}
|
||||
|
||||
if(pb->sel.fst)
|
||||
if(job->sel.fst)
|
||||
{
|
||||
ASSERT(pb->sel.lcm == 1);
|
||||
ASSERT(job->sel.lcm == 1);
|
||||
ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
|
||||
|
||||
pb->sel.lcm = 1;
|
||||
job->sel.lcm = 1;
|
||||
}
|
||||
|
||||
if(pb->sel.lcm)
|
||||
if(job->sel.lcm)
|
||||
{
|
||||
int lod = std::max<int>(std::min<int>(k, mxl), 0);
|
||||
|
||||
if(pb->sel.mmin == 1)
|
||||
if(job->sel.mmin == 1)
|
||||
{
|
||||
lod = (lod + 0x8000) & 0xffff0000; // rounding
|
||||
}
|
||||
|
@ -1241,7 +1238,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
|
||||
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, job->sel.ltf);
|
||||
|
||||
GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
|
||||
|
||||
|
@ -1260,7 +1257,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
}
|
||||
else
|
||||
{
|
||||
if(pb->sel.fst == 0)
|
||||
if(job->sel.fst == 0)
|
||||
{
|
||||
// skip per pixel division if q is constant
|
||||
|
||||
|
@ -1268,7 +1265,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(m_vt.m_eq.q)
|
||||
{
|
||||
pb->sel.fst = 1;
|
||||
job->sel.fst = 1;
|
||||
|
||||
const GSVector4& t = v[index[0]].t;
|
||||
|
||||
|
@ -1286,7 +1283,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
}
|
||||
else if(primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
pb->sel.fst = 1;
|
||||
job->sel.fst = 1;
|
||||
|
||||
for(int i = 0, j = vertex_count; i < j; i += 2)
|
||||
{
|
||||
|
@ -1301,7 +1298,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
}
|
||||
}
|
||||
|
||||
if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel?
|
||||
if(job->sel.ltf && job->sel.fst) // TODO: quite slow, do this in the prim kernel?
|
||||
{
|
||||
// if q is constant we can do the half pel shift for bilinear sampling on the vertices
|
||||
|
||||
|
@ -1378,58 +1375,58 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(PRIM->FGE)
|
||||
{
|
||||
pb->sel.fge = 1;
|
||||
job->sel.fge = 1;
|
||||
pb->fog = env.FOGCOL.u32[0];
|
||||
}
|
||||
|
||||
if(context->FRAME.PSM != PSM_PSMCT24)
|
||||
{
|
||||
pb->sel.date = context->TEST.DATE;
|
||||
pb->sel.datm = context->TEST.DATM;
|
||||
job->sel.date = context->TEST.DATE;
|
||||
job->sel.datm = context->TEST.DATM;
|
||||
}
|
||||
|
||||
if(!IsOpaque())
|
||||
{
|
||||
pb->sel.abe = PRIM->ABE;
|
||||
pb->sel.ababcd = context->ALPHA.u32[0];
|
||||
job->sel.abe = PRIM->ABE;
|
||||
job->sel.ababcd = context->ALPHA.u32[0];
|
||||
|
||||
if(env.PABE.PABE)
|
||||
{
|
||||
pb->sel.pabe = 1;
|
||||
job->sel.pabe = 1;
|
||||
}
|
||||
|
||||
if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
|
||||
{
|
||||
pb->sel.aa1 = 1;
|
||||
job->sel.aa1 = 1;
|
||||
}
|
||||
|
||||
pb->afix = context->ALPHA.FIX;
|
||||
}
|
||||
|
||||
if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1)
|
||||
if(job->sel.date || job->sel.aba == 1 || job->sel.abb == 1 || job->sel.abc == 1 || job->sel.abd == 1)
|
||||
{
|
||||
pb->sel.rfb = 1;
|
||||
job->sel.rfb = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(fwrite)
|
||||
{
|
||||
if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|
||||
|| (pb->sel.fpsm & 3) == 0 && fm != 0
|
||||
|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|
||||
|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
|
||||
if(job->sel.atst != ATST_ALWAYS && job->sel.afail == AFAIL_RGB_ONLY
|
||||
|| (job->sel.fpsm & 3) == 0 && fm != 0
|
||||
|| (job->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|
||||
|| (job->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
|
||||
{
|
||||
pb->sel.rfb = 1;
|
||||
job->sel.rfb = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pb->sel.colclamp = env.COLCLAMP.CLAMP;
|
||||
pb->sel.fba = context->FBA.FBA;
|
||||
job->sel.colclamp = env.COLCLAMP.CLAMP;
|
||||
job->sel.fba = context->FBA.FBA;
|
||||
|
||||
if(env.DTHE.DTHE)
|
||||
{
|
||||
pb->sel.dthe = 1;
|
||||
job->sel.dthe = 1;
|
||||
|
||||
GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
|
||||
GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
|
||||
|
@ -1442,21 +1439,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
|
||||
if(zwrite || ztest)
|
||||
{
|
||||
pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
|
||||
pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
|
||||
job->sel.zpsm = RemapPSM(context->ZBUF.PSM);
|
||||
job->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
|
||||
|
||||
if(ztest)
|
||||
{
|
||||
pb->sel.rzb = 1;
|
||||
job->sel.rzb = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(zwrite)
|
||||
{
|
||||
if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY)
|
||||
|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
|
||||
if(job->sel.atst != ATST_ALWAYS && (job->sel.afail == AFAIL_FB_ONLY || job->sel.afail == AFAIL_RGB_ONLY)
|
||||
|| (job->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
|
||||
{
|
||||
pb->sel.rzb = 1;
|
||||
job->sel.rzb = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1465,11 +1462,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
pb->fm = fm;
|
||||
pb->zm = zm;
|
||||
|
||||
if((pb->sel.fpsm & 3) == 1)
|
||||
if((job->sel.fpsm & 3) == 1)
|
||||
{
|
||||
pb->fm |= 0xff000000;
|
||||
}
|
||||
else if((pb->sel.fpsm & 3) >= 2)
|
||||
else if((job->sel.fpsm & 3) >= 2)
|
||||
{
|
||||
uint32 rb = pb->fm & 0x00f800f8;
|
||||
uint32 ga = pb->fm & 0x8000f800;
|
||||
|
@ -1477,11 +1474,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
|
|||
pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
|
||||
}
|
||||
|
||||
if((pb->sel.zpsm & 3) == 1)
|
||||
if((job->sel.zpsm & 3) == 1)
|
||||
{
|
||||
pb->zm |= 0xff000000;
|
||||
}
|
||||
else if((pb->sel.zpsm & 3) >= 2)
|
||||
else if((job->sel.zpsm & 3) >= 2)
|
||||
{
|
||||
pb->zm |= 0xffff0000;
|
||||
}
|
||||
|
|
|
@ -146,7 +146,6 @@ class GSRendererCL : public GSRenderer
|
|||
{
|
||||
GSVector4i scissor;
|
||||
GSVector4i dimx; // 4x4 signed char
|
||||
TFXSelector sel;
|
||||
uint32 fbp, zbp, bw;
|
||||
uint32 fm, zm;
|
||||
uint32 fog; // rgb
|
||||
|
|
|
@ -1,5 +1,21 @@
|
|||
#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
|
||||
|
||||
#ifdef cl_amd_printf
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#else
|
||||
#define printf(x)
|
||||
#endif
|
||||
|
||||
#ifdef cl_amd_media_ops
|
||||
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef cl_amd_media_ops2
|
||||
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifndef CL_FLT_EPSILON
|
||||
#define CL_FLT_EPSILON 1.1920928955078125e-7f
|
||||
#endif
|
||||
|
@ -32,8 +48,6 @@ typedef struct
|
|||
|
||||
typedef struct
|
||||
{
|
||||
uint batch_counter;
|
||||
uint _pad[7];
|
||||
struct {uint first, last;} bounds[MAX_BIN_PER_BATCH];
|
||||
BIN_TYPE bin[MAX_BIN_COUNT];
|
||||
uchar4 bbox[MAX_PRIM_COUNT];
|
||||
|
@ -45,7 +59,6 @@ typedef struct
|
|||
{
|
||||
int4 scissor;
|
||||
char dimx[4][4];
|
||||
ulong sel;
|
||||
int fbp, zbp, bw;
|
||||
uint fm, zm;
|
||||
uchar4 fog; // rgb
|
||||
|
@ -679,7 +692,6 @@ int tile_in_triangle(float2 p, gs_barycentric b)
|
|||
|
||||
__kernel void KERNEL_TILE(__global gs_env* env)
|
||||
{
|
||||
env->batch_counter = 0;
|
||||
env->bounds[get_global_id(0)].first = -1;
|
||||
env->bounds[get_global_id(0)].last = 0;
|
||||
}
|
||||
|
@ -777,77 +789,60 @@ __kernel void KERNEL_TILE(
|
|||
uint bin_count, // == bin_dim.z * bin_dim.w
|
||||
uchar4 bin_dim)
|
||||
{
|
||||
__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
|
||||
__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
|
||||
__local uint batch_index;
|
||||
|
||||
size_t batch_index = get_group_id(0);
|
||||
size_t local_id = get_local_id(0);
|
||||
size_t local_size = get_local_size(0);
|
||||
|
||||
while(1)
|
||||
uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
|
||||
|
||||
__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
|
||||
__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
|
||||
__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
|
||||
|
||||
__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
|
||||
__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
|
||||
|
||||
event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
|
||||
|
||||
wait_group_events(1, &e);
|
||||
|
||||
if(PRIM == GS_TRIANGLE_CLASS)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(local_id == 0)
|
||||
{
|
||||
batch_index = atomic_inc(&env->batch_counter);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(batch_index >= batch_count)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
|
||||
e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
|
||||
|
||||
__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
|
||||
__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
|
||||
__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
|
||||
|
||||
event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
|
||||
|
||||
wait_group_events(1, &e);
|
||||
}
|
||||
|
||||
if(PRIM == GS_TRIANGLE_CLASS)
|
||||
for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
|
||||
{
|
||||
int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
|
||||
int x = bin_index - y * bin_dim.z;
|
||||
|
||||
x += bin_dim.x;
|
||||
y += bin_dim.y;
|
||||
|
||||
BIN_TYPE visible = 0;
|
||||
|
||||
for(uint i = 0; i < batch_prim_count; i++)
|
||||
{
|
||||
e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
|
||||
|
||||
wait_group_events(1, &e);
|
||||
uchar4 r = bbox_cache[i];
|
||||
|
||||
BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
|
||||
|
||||
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
|
||||
{
|
||||
test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
|
||||
}
|
||||
|
||||
visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
|
||||
}
|
||||
|
||||
for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
|
||||
bin[bin_index] = visible;
|
||||
|
||||
if(visible != 0)
|
||||
{
|
||||
int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
|
||||
int x = bin_index - y * bin_dim.z;
|
||||
|
||||
x += bin_dim.x;
|
||||
y += bin_dim.y;
|
||||
|
||||
BIN_TYPE visible = 0;
|
||||
|
||||
for(uint i = 0; i < batch_prim_count; i++)
|
||||
{
|
||||
uchar4 r = bbox_cache[i];
|
||||
|
||||
BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
|
||||
|
||||
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
|
||||
{
|
||||
test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
|
||||
}
|
||||
|
||||
visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
|
||||
}
|
||||
|
||||
bin[bin_index] = visible;
|
||||
|
||||
if(visible != 0)
|
||||
{
|
||||
atomic_min(&env->bounds[bin_index].first, batch_index);
|
||||
atomic_max(&env->bounds[bin_index].last, batch_index);
|
||||
}
|
||||
atomic_min(&env->bounds[bin_index].first, batch_index);
|
||||
atomic_max(&env->bounds[bin_index].last, batch_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -998,10 +993,10 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
|
|||
}
|
||||
else if(is16bit(FPSM))
|
||||
{
|
||||
cd.x = (fd & 0x001f) << 3;
|
||||
cd.y = (fd & 0x03e0) >> 2;
|
||||
cd.z = (fd & 0x7c00) >> 7;
|
||||
cd.w = (fd & 0x8000) >> 8;
|
||||
cd.x = (fd << 3) & 0xf8;
|
||||
cd.y = (fd >> 2) & 0xf8;
|
||||
cd.z = (fd >> 7) & 0xf8;
|
||||
cd.w = (fd >> 8) & 0x80;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1077,9 +1072,9 @@ uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
|
|||
{
|
||||
uchar4 c;
|
||||
|
||||
c.x = (rgba & 0x001f) << 3;
|
||||
c.y = (rgba & 0x03e0) >> 2;
|
||||
c.z = (rgba & 0x7c00) >> 7;
|
||||
c.x = (rgba << 3) & 0xf8;
|
||||
c.y = (rgba >> 2) & 0xf8;
|
||||
c.z = (rgba >> 7) & 0xf8;
|
||||
c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0;
|
||||
|
||||
return c;
|
||||
|
@ -1202,7 +1197,7 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
|
|||
// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
|
||||
// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average
|
||||
|
||||
__kernel void KERNEL_TFX(
|
||||
__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
|
||||
__global gs_env* env,
|
||||
__global uchar* vm,
|
||||
__global uchar* tex,
|
||||
|
@ -1214,8 +1209,6 @@ __kernel void KERNEL_TFX(
|
|||
uint bin_count, // == bin_dim.z * bin_dim.w
|
||||
uchar4 bin_dim)
|
||||
{
|
||||
// TODO: try it the bin_index = atomic_inc(&env->bin_counter) way
|
||||
|
||||
uint x = get_global_id(0);
|
||||
uint y = get_global_id(1);
|
||||
|
||||
|
@ -1451,7 +1444,7 @@ __kernel void KERNEL_TFX(
|
|||
{
|
||||
if(!ABE || c.w == 0x80)
|
||||
{
|
||||
c.w = /*edge ? coverage :*/ 0x80; // TODO
|
||||
c.w = 0x80; // TODO: edge ? coverage : 0x80
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue