joined some tfx kernel calls; general speedup in most games

gabest11 2014-09-21 18:13:55 +02:00 committed by Gregory Hainaut
parent 3d2b0e3766
commit 6f5cd1cd4d
5 changed files with 271 additions and 165 deletions

View File

@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
#define BIN_SIZE (1u << BIN_SIZE_BITS)
#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
#define TFX_PARAM_SIZE 2048
#if MAX_PRIM_PER_BATCH == 64u
#define BIN_TYPE cl_ulong
@ -72,6 +73,7 @@ typedef struct
GSRendererCL::GSRendererCL()
: m_vb_count(0)
, m_synced(true)
{
m_nativeres = true; // ignore ini, sw is always native
@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL()
InitCVB(GS_TRIANGLE_CLASS);
InitCVB(GS_SPRITE_CLASS);
// NOTE: m_cl.vm may be cached on the device according to the specs. There are a couple of places where we access m_mem.m_vm8 without
// mapping the buffer (after the two invalidate* calls and in getoutput); it is currently not an issue, but on some devices it may be.
m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
}
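The NOTE above hints at the portable fix: with CL_MEM_USE_HOST_PTR the device may keep a cached copy of the buffer, so a blocking map/unmap round-trip is what makes host reads of m_mem.m_vm8 safe. A minimal sketch with the same C++ bindings, matching the commented-out calls later in Sync(); the helper name and parameters are illustrative:

    #include <CL/cl.hpp>

    // Make the host view of a CL_MEM_USE_HOST_PTR buffer consistent with the
    // device copy: a blocking read-map forces the runtime to flush its cached
    // copy back to the host pointer, and the immediate unmap releases it.
    void sync_host_view(cl::CommandQueue& queue, cl::Buffer& vm, size_t vmsize)
    {
        void* ptr = queue.enqueueMapBuffer(vm, CL_TRUE, CL_MAP_READ, 0, vmsize);
        queue.enqueueUnmapMemObject(vm, ptr);
    }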
@ -122,13 +127,17 @@ static int pageuploads = 0;
static int pageuploadcount = 0;
static int tfxcount = 0;
static int64 tfxpixels = 0;
static int tfxselcount = 0;
static int tfxdiffselcount = 0;
void GSRendererCL::VSync(int field)
{
GSRenderer::VSync(field);
//printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels);
//printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount);
pageuploads = pageuploadcount = tfxcount = tfxpixels = 0;
tfxselcount = tfxdiffselcount = 0;
//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
}
@ -284,7 +293,9 @@ void GSRendererCL::Draw()
{
size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
size_t ib_size = m_index.tail * sizeof(uint32);
size_t pb_size = sizeof(TFXParameter);
size_t pb_size = TFX_PARAM_SIZE;
ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE);
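Rounding the parameter block up to a fixed TFX_PARAM_SIZE stride (2048 bytes) is what later lets the kernel address any job's parameters as pb_base + pb_index * TFX_PARAM_SIZE instead of assuming one block per dispatch. A sketch of the slot arithmetic, with hypothetical names:

    #include <cstdint>

    static const size_t TFX_PARAM_SIZE = 2048; // must satisfy sizeof(TFXParameter) <= TFX_PARAM_SIZE

    // With a fixed stride, slot lookup is a single multiply; variable-size
    // blocks would require a per-job offset table instead.
    inline unsigned char* param_slot(unsigned char* pb_base, uint32_t pb_index)
    {
        return pb_base + (size_t)pb_index * TFX_PARAM_SIZE;
    }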
if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
{
@ -366,12 +377,16 @@ void GSRendererCL::Draw()
m_vb_start = m_cl.vb.tail;
m_vb_count = 0;
m_pb_start = m_cl.pb.tail;
m_pb_count = 0;
}
else
{
// TODO: SIMD
uint32 vb_count = m_vb_count;
ASSERT(m_pb_count < 256);
uint32 vb_count = m_vb_count | (m_pb_count << 24);
for(size_t i = 0; i < m_index.tail; i++)
{
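The packing above is the other half of the joining scheme: every 32-bit index entry now carries the vertex index in its low 24 bits and the job's parameter-block index in the top byte, which is why the ASSERT caps m_pb_count below 256 and why the kernel later splits entries with ib[0] & 0x00ffffff and ib[0] >> 24. A standalone sketch with hypothetical helper names:

    #include <cstdint>
    #include <cassert>

    inline uint32_t pack_index(uint32_t vertex_index, uint32_t pb_index)
    {
        // 24 bits of vertex index, 8 bits of parameter-block index
        assert(vertex_index < (1u << 24) && pb_index < 256u);
        return vertex_index | (pb_index << 24);
    }

    inline uint32_t vertex_of(uint32_t entry) { return entry & 0x00ffffff; }
    inline uint32_t param_of(uint32_t entry) { return entry >> 24; }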
@ -398,21 +413,25 @@ void GSRendererCL::Draw()
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass);
job->fbp = pb->fbp;
job->zbp = pb->zbp;
job->bw = pb->bw;
#ifdef DEBUG
job->param = pb;
job->pb = pb;
#endif
m_jobs.push_back(job);
m_vb_count += m_vertex.next;
m_pb_count++;
m_cl.vb.tail += vb_size;
m_cl.ib.tail += ib_size;
m_cl.pb.tail += pb_size;
m_synced = false;
// mark pages used in rendering as source or target
if(job->sel.fwrite || job->sel.rfb)
@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason)
m_rw_pages[1][i] = GSVector4i::zero();
}
// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap,
// though it does not seem to be necessary even with GPU devices where it might be cached,
// needs more testing...
//void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
//m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr);
m_synced = true;
}
void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
o->GetPagesAsBits(r, m_tmp_pages);
//if(!synced)
if(!m_synced)
{
for(int i = 0; i < 4; i++)
{
@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
{
if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
//if(!synced)
if(!m_synced)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
@ -620,16 +634,7 @@ void GSRendererCL::Enqueue()
int primclass = m_jobs.front()->sel.prim;
uint32 n;
switch(primclass)
{
case GS_POINT_CLASS: n = 1; break;
case GS_LINE_CLASS: n = 2; break;
case GS_TRIANGLE_CLASS: n = 3; break;
case GS_SPRITE_CLASS: n = 2; break;
default: __assume(0);
}
uint32 n = GSUtil::GetClassVertexCount(primclass);
PrimSelector psel;
@ -678,8 +683,6 @@ void GSRendererCL::Enqueue()
//
cl_kernel tfx_prev = NULL;
auto head = m_jobs.begin();
while(head != m_jobs.end())
@ -692,8 +695,8 @@ void GSRendererCL::Enqueue()
{
auto job = next++;
uint32 cur_prim_count = (*job)->ib_count / n;
uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
uint32 cur_prim_count = (*job)->prim_count;
uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0;
total_prim_count += cur_prim_count;
@ -775,9 +778,8 @@ void GSRendererCL::Enqueue()
uint32 group_count = batch_count * item_count;
tk.setArg(1, (cl_uint)prim_count);
tk.setArg(2, (cl_uint)batch_count);
tk.setArg(3, (cl_uint)bin_count);
tk.setArg(4, bin_dim);
tk.setArg(2, (cl_uint)bin_count);
tk.setArg(3, bin_dim);
m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
}
@ -789,68 +791,20 @@ void GSRendererCL::Enqueue()
}
}
//
std::list<shared_ptr<TFXJob>> jobs(head, next);
uint32 prim_start = 0;
for(auto i = head; i != next; i++)
{
ASSERT(prim_start < MAX_PRIM_COUNT);
// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same
// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim
tfxcount++;
UpdateTextureCache((*i).get());
uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
// TODO: tile level z test
cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);
if(tfx_prev != tfx())
{
tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
tfx_prev = tfx();
}
tfx.setArg(4, (cl_uint)(*i)->pb_start);
tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count_inner);
tfx.setArg(7, (cl_uint)batch_count);
tfx.setArg(8, (cl_uint)bin_count);
tfx.setArg(9, bin_dim);
GSVector4i r = GSVector4i::load<false>(&(*i)->rect);
r = r.ralign<Align_Outside>(GSVector2i(8, 8));
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));
tfxpixels += r.width() * r.height();
InvalidateTextureCache((*i).get());
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
prim_start += prim_count_inner;
}
//
EnqueueTFX(jobs, bin_count, bin_dim);
if(total_prim_count > MAX_PRIM_COUNT)
{
prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
(*job)->ib_start += prim_count * n * sizeof(uint32);
(*job)->ib_count -= prim_count * n;
(*job)->prim_count -= prim_count;
next = job; // try again for the remainder
//printf("split %d\n", (*job)->ib_count / n);
//printf("split %d\n", (*job)->prim_count);
}
break;
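When the running total overflows MAX_PRIM_COUNT, only the part of the current job that still fits is enqueued, and the job itself is retried with its start advanced. A worked instance of the arithmetic, with illustrative numbers:

    #include <cstdint>
    #include <cassert>

    // Mirrors the branch above: how many of the current job's primitives
    // fit into this enqueue.
    uint32_t split_fit(uint32_t total_prim_count, uint32_t cur_prim_count, uint32_t max_prim_count)
    {
        assert(total_prim_count > max_prim_count);
        // e.g. total 1200, cur 300, max 1024 -> 124 fit now, 176 are retried
        return cur_prim_count - (total_prim_count - max_prim_count);
    }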
@ -876,6 +830,131 @@ void GSRendererCL::Enqueue()
m_cl.Map();
}
void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim)
{
// join tfx kernel calls where the selector and fbp/zbp/bw are the same and the next job's src_pages do not overlap the previous job's dst_pages
//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
auto next = jobs.begin();
while(next != jobs.end())
{
auto prev = next++;
if(next == jobs.end())
{
break;
}
if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw)
{
if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL)
{
bool overlap = false;
for(int i = 0; i < 4; i++)
{
if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero()))
{
overlap = true;
break;
}
}
if(overlap)
{
continue;
}
}
if((*prev)->src_pages != NULL)
{
GSVector4i* src_pages = (*next)->GetSrcPages();
for(int i = 0; i < 4; i++)
{
src_pages[i] |= (*prev)->src_pages[i];
}
}
if((*prev)->dst_pages != NULL)
{
GSVector4i* dst_pages = (*next)->GetDstPages();
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= (*prev)->dst_pages[i];
}
}
GSVector4i prev_rect = GSVector4i::load<false>(&(*prev)->rect);
GSVector4i next_rect = GSVector4i::load<false>(&(*next)->rect);
GSVector4i::store<false>(&(*next)->rect, prev_rect.runion(next_rect));
(*next)->prim_count += (*prev)->prim_count;
(*next)->ib_start = (*prev)->ib_start;
jobs.erase(prev);
}
}
//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
//
cl_kernel tfx_prev = NULL;
uint32 prim_start = 0;
for(auto i : jobs)
{
ASSERT(prim_start < MAX_PRIM_COUNT);
tfxcount++;
UpdateTextureCache(i.get());
uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start);
// TODO: tile level z test
cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
if(tfx_prev != tfx())
{
tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
tfx.setArg(4, (cl_uint)m_pb_start);
tfx_prev = tfx();
}
tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count);
tfx.setArg(7, (cl_uint)bin_count);
tfx.setArg(8, bin_dim);
tfx.setArg(9, i->fbp);
tfx.setArg(10, i->zbp);
tfx.setArg(11, i->bw);
GSVector4i r = GSVector4i::load<false>(&i->rect);
r = r.ralign<Align_Outside>(GSVector2i(8, 8));
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));
tfxpixels += r.width() * r.height();
InvalidateTextureCache(i.get());
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
prim_start += prim_count;
}
}
void GSRendererCL::UpdateTextureCache(TFXJob* job)
{
if(job->src_pages == NULL) return;
@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
return true;
}
//////////
//
GSRendererCL::TFXJob::TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}
GSRendererCL::TFXJob::~TFXJob()
{
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}
GSVector4i* GSRendererCL::TFXJob::GetSrcPages()
{
if(src_pages == NULL)
{
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
src_pages[0] = GSVector4i::zero();
src_pages[1] = GSVector4i::zero();
src_pages[2] = GSVector4i::zero();
src_pages[3] = GSVector4i::zero();
}
return src_pages;
}
GSVector4i* GSRendererCL::TFXJob::GetDstPages()
{
if(dst_pages == NULL)
{
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
dst_pages[0] = GSVector4i::zero();
dst_pages[1] = GSVector4i::zero();
dst_pages[2] = GSVector4i::zero();
dst_pages[3] = GSVector4i::zero();
}
return dst_pages;
}
//
//#define IOCL_DEBUG
@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL()
ib.mapped_ptr = ib.ptr = NULL;
pb.mapped_ptr = pb.ptr = NULL;
pb.size = sizeof(TFXParameter) * 256;
pb.size = TFX_PARAM_SIZE * 256;
pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map()
{
Unmap();
// TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+
if(vb.head < vb.size)
{
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
ASSERT(((size_t)vb.ptr & 15) == 0);
ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
}
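The TODO above refers to OpenCL 1.2's CL_MAP_WRITE_INVALIDATE_REGION, which tells the driver that the mapped range's previous contents are disposable, so it can skip a device-to-host transfer on map. A hedged sketch of the flag selection (helper name is illustrative; a real implementation would also check the device's reported version at run time, since CL_VERSION_1_2 is only a compile-time header check):

    #include <CL/cl.hpp>

    cl_map_flags write_map_flags()
    {
    #ifdef CL_VERSION_1_2
        // 1.2+: old contents need not be preserved across the map
        return CL_MAP_WRITE_INVALIDATE_REGION;
    #else
        return CL_MAP_WRITE;
    #endif
    }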
if(ib.head < ib.size)
@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map()
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
ASSERT(((size_t)pb.ptr & 15) == 0);
ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
}
}
@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt)
opt << "-D BIN_SIZE=" << BIN_SIZE << "u ";
opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u ";
opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u ";
#ifdef IOCL_DEBUG
opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
#endif

View File

@ -164,55 +164,20 @@ class GSRendererCL : public GSRenderer
{
public:
struct { int x, y, z, w; } rect;
TFXSelector sel; // uses primclass, solidrect only
uint32 ib_start, ib_count;
uint32 pb_start;
TFXSelector sel;
uint32 ib_start;
uint32 prim_count;
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
uint32 fbp, zbp, bw;
#ifdef DEBUG
TFXParameter* param;
TFXParameter* pb;
#endif
TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}
TFXJob();
virtual ~TFXJob();
virtual ~TFXJob()
{
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}
GSVector4i* GetSrcPages()
{
if(src_pages == NULL)
{
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
src_pages[0] = GSVector4i::zero();
src_pages[1] = GSVector4i::zero();
src_pages[2] = GSVector4i::zero();
src_pages[3] = GSVector4i::zero();
}
return src_pages;
}
GSVector4i* GetDstPages()
{
if(dst_pages == NULL)
{
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
dst_pages[0] = GSVector4i::zero();
dst_pages[1] = GSVector4i::zero();
dst_pages[2] = GSVector4i::zero();
dst_pages[3] = GSVector4i::zero();
}
return dst_pages;
}
GSVector4i* GetSrcPages();
GSVector4i* GetDstPages();
};
class CL
@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer
std::list<shared_ptr<TFXJob>> m_jobs;
uint32 m_vb_start;
uint32 m_vb_count;
uint32 m_pb_start;
uint32 m_pb_count;
bool m_synced;
void Enqueue();
void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
void UpdateTextureCache(TFXJob* job);
void InvalidateTextureCache(TFXJob* job);

View File

@ -94,6 +94,7 @@ static class GSUtilMaps
public:
uint8 PrimClassField[8];
uint8 VertexCountField[8];
uint8 ClassVertexCountField[4];
uint32 CompatibleBitsField[64][2];
uint32 SharedBitsField[64][2];
@ -117,6 +118,11 @@ public:
VertexCountField[GS_SPRITE] = 2;
VertexCountField[GS_INVALID] = 1;
ClassVertexCountField[GS_POINT_CLASS] = 1;
ClassVertexCountField[GS_LINE_CLASS] = 2;
ClassVertexCountField[GS_TRIANGLE_CLASS] = 3;
ClassVertexCountField[GS_SPRITE_CLASS] = 2;
memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField));
for(int i = 0; i < 64; i++)
@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim)
return s_maps.VertexCountField[prim];
}
int GSUtil::GetClassVertexCount(uint32 primclass)
{
return s_maps.ClassVertexCountField[primclass];
}
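The table replaces the switch Enqueue used to carry: vertices per primitive class is pure data, so one lookup now serves every caller. A standalone mirror of the mapping, with values taken from the initializer above:

    #include <cstdint>

    // point, line, triangle, sprite -> vertices per primitive
    static const uint8_t class_vertex_count[4] = { 1, 2, 3, 2 };

    inline uint32_t prims_in(uint32_t index_count, int primclass)
    {
        return index_count / class_vertex_count[primclass];
    }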
const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm)
{
return s_maps.SharedBitsField[dpsm];

View File

@ -30,6 +30,7 @@ public:
static GS_PRIM_CLASS GetPrimClass(uint32 prim);
static int GetVertexCount(uint32 prim);
static int GetClassVertexCount(uint32 primclass);
static const uint32* HasSharedBitsPtr(uint32 dpsm);
static bool HasSharedBits(uint32 spsm, const uint32* ptr);

View File

@ -36,7 +36,10 @@ typedef struct
typedef struct
{
gs_vertex v[4];
gs_vertex v[3];
uint zmin;
uint pb_index;
uint _pad[2];
} gs_prim;
typedef struct
@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM(
ib += prim_index * VERTEX_PER_PRIM;
prim->pb_index = ib[0] >> 24;
__global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff];
__global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff];
__global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff];
int2 pmin, pmax;
if(PRIM == GS_POINT_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];
pmin = pmax = convert_int2_rte(v0->p.xy);
prim->v[0].p = v0->p;
@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM(
}
else if(PRIM == GS_LINE_CLASS)
{
int2 p0 = convert_int2_rte(vb[ib[0]].p.xy);
int2 p1 = convert_int2_rte(vb[ib[1]].p.xy);
int2 p0 = convert_int2_rte(v0->p.xy);
int2 p1 = convert_int2_rte(v1->p.xy);
pmin = min(p0, p1);
pmax = max(p0, p1);
}
else if(PRIM == GS_TRIANGLE_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];
__global gs_vertex* v1 = &vb[ib[1]];
__global gs_vertex* v2 = &vb[ib[2]];
int2 p0 = convert_int2_rtp(v0->p.xy);
int2 p1 = convert_int2_rtp(v1->p.xy);
int2 p2 = convert_int2_rtp(v2->p.xy);
@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM(
pmax = max(max(p0, p1), p2);
// z needs special care, since it's a 32 bit unit, float cannot encode it exactly
// pass the minimum through the unused 4th padding vector
// only interpolate the relative and hopefully small values
// only interpolate the offsets relative to zmin, which are hopefully small
uint zmin = min(min(v0->z, v1->z), v2->z);
@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM(
prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
prim->v[2].tc = v2->tc;
prim->v[3].z = zmin;
prim->zmin = zmin;
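The special care is a precision argument: a float carries a 24-bit significand while z is a full 32-bit value, so converting large z values to float silently rounds away low bits. Interpolating z - zmin instead keeps the magnitudes small, and adding zmin back after interpolation (as KERNEL_TFX does below) restores the exact value. A small host-side demonstration:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint32_t z = 0xFFFFFF01u;           // needs 32 significant bits
        float f = (float)z;                 // only 24 bits survive the conversion
        printf("%u\n", (uint32_t)f);        // 4294967040: the low byte rounded away
        uint32_t zmin = 0xFFFFFF00u;
        float d = (float)(z - zmin);        // the delta of 1 is exact in float
        printf("%u\n", zmin + (uint32_t)d); // 4294967041: reconstructed exactly
        return 0;
    }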
float4 dp0 = v1->p - v0->p;
float4 dp1 = v0->p - v2->p;
@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM(
}
else if(PRIM == GS_SPRITE_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];
__global gs_vertex* v1 = &vb[ib[1]];
int2 p0 = convert_int2_rtp(v0->p.xy);
int2 p1 = convert_int2_rtp(v1->p.xy);
@ -785,7 +784,6 @@ __kernel void KERNEL_TILE(
__kernel void KERNEL_TILE(
__global gs_env* env,
uint prim_count,
uint batch_count,
uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim)
{
@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
uint pb_start,
uint prim_start,
uint prim_count,
uint batch_count,
uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim)
uchar4 bin_dim,
uint fbp,
uint zbp,
uint bw)
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
//
__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);
int2 pi = (int2)(x, y);
float2 pf = convert_float2(pi);
if(!NOSCISSOR)
{
if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
{
return;
}
}
int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
int faddr = PixelAddress(x, y, fbp, bw, FPSM);
int zaddr = PixelAddress(x, y, zbp, bw, ZPSM);
uint fd, zd; // TODO: fd as int4 and only pack before writing out?
@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];
pb_base += pb_start;
BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
break;
}
__global gs_prim* prim = &prim_base[prim_index + i];
bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
__global gs_prim* prim = &prim_base[prim_index + i];
__global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE);
if(!NOSCISSOR)
{
if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
{
continue;
}
}
uint2 zf;
float3 t;
int4 c;
@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin;
zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
@ -1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
}
}
// read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway)
// read mask
uint fm = pb->fm;
uint zm = pb->zm;