mirror of https://github.com/PCSX2/pcsx2.git
joined some tfx kernel calls, general speed up in most games
parent 3d2b0e3766
commit 6f5cd1cd4d
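Summary of the change, with an illustrative sketch (not part of the patch): adjacent TFX jobs are now merged into a single kernel launch when they share the same selector and frame/depth buffer setup, and the pages written by the earlier job do not feed the later job's textures. The helper below is hypothetical — the real check is done inline in GSRendererCL::EnqueueTFX while walking the job list — but it only uses the TFXJob members added in this diff:

static bool CanJoinJobs(const TFXJob& prev, const TFXJob& next)
{
    // same drawing state: selector plus frame/z base pointers and buffer width
    if(!(prev.sel == next.sel && prev.fbp == next.fbp && prev.zbp == next.zbp && prev.bw == next.bw))
        return false;

    // if the earlier job wrote pages that the later job samples as a texture,
    // the two draws cannot share one kernel call
    if(prev.dst_pages != NULL && next.src_pages != NULL)
    {
        for(int i = 0; i < 4; i++)
        {
            if(!(prev.dst_pages[i] & next.src_pages[i]).eq(GSVector4i::zero()))
                return false;
        }
    }

    return true;
}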
@@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
#define BIN_SIZE (1u << BIN_SIZE_BITS)
#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
#define TFX_PARAM_SIZE 2048

#if MAX_PRIM_PER_BATCH == 64u
#define BIN_TYPE cl_ulong
@@ -72,6 +73,7 @@ typedef struct

GSRendererCL::GSRendererCL()
: m_vb_count(0)
, m_synced(true)
{
m_nativeres = true; // ignore ini, sw is always native

@@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL()
InitCVB(GS_TRIANGLE_CLASS);
InitCVB(GS_SPRITE_CLASS);

// NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without
// mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be.

m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL);
m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize);
}
@@ -122,13 +127,17 @@ static int pageuploads = 0;
static int pageuploadcount = 0;
static int tfxcount = 0;
static int64 tfxpixels = 0;
static int tfxselcount = 0;
static int tfxdiffselcount = 0;

void GSRendererCL::VSync(int field)
{
GSRenderer::VSync(field);

//printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels);
//printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount);
pageuploads = pageuploadcount = tfxcount = tfxpixels = 0;
tfxselcount = tfxdiffselcount = 0;

//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
}
@@ -284,7 +293,9 @@ void GSRendererCL::Draw()
{
size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
size_t ib_size = m_index.tail * sizeof(uint32);
size_t pb_size = sizeof(TFXParameter);
size_t pb_size = TFX_PARAM_SIZE;

ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE);

if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
{
@@ -366,12 +377,16 @@ void GSRendererCL::Draw()

m_vb_start = m_cl.vb.tail;
m_vb_count = 0;
m_pb_start = m_cl.pb.tail;
m_pb_count = 0;
}
else
{
// TODO: SIMD

uint32 vb_count = m_vb_count;
ASSERT(m_pb_count < 256);

uint32 vb_count = m_vb_count | (m_pb_count << 24);

for(size_t i = 0; i < m_index.tail; i++)
{
@@ -398,21 +413,25 @@ void GSRendererCL::Draw()
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass);
job->fbp = pb->fbp;
job->zbp = pb->zbp;
job->bw = pb->bw;

#ifdef DEBUG
job->param = pb;
job->pb = pb;
#endif

m_jobs.push_back(job);

m_vb_count += m_vertex.next;
m_pb_count++;

m_cl.vb.tail += vb_size;
m_cl.ib.tail += ib_size;
m_cl.pb.tail += pb_size;

m_synced = false;

// mark pages used in rendering as source or target

if(job->sel.fwrite || job->sel.rfb)
@@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason)
m_rw_pages[1][i] = GSVector4i::zero();
}

// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap,
// though it does not seem to be necessary even with GPU devices where it might be cached,
// needs more testing...

//void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
//m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr);
m_synced = true;
}

void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
@@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS

o->GetPagesAsBits(r, m_tmp_pages);

//if(!synced)
if(!m_synced)
{
for(int i = 0; i < 4; i++)
{
@@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
{
if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}

//if(!synced)
if(!m_synced)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);

@@ -620,16 +634,7 @@ void GSRendererCL::Enqueue()

int primclass = m_jobs.front()->sel.prim;

uint32 n;

switch(primclass)
{
case GS_POINT_CLASS: n = 1; break;
case GS_LINE_CLASS: n = 2; break;
case GS_TRIANGLE_CLASS: n = 3; break;
case GS_SPRITE_CLASS: n = 2; break;
default: __assume(0);
}
uint32 n = GSUtil::GetClassVertexCount(primclass);

PrimSelector psel;

@@ -678,8 +683,6 @@ void GSRendererCL::Enqueue()

//

cl_kernel tfx_prev = NULL;

auto head = m_jobs.begin();

while(head != m_jobs.end())
@@ -692,8 +695,8 @@ void GSRendererCL::Enqueue()
{
auto job = next++;

uint32 cur_prim_count = (*job)->ib_count / n;
uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
uint32 cur_prim_count = (*job)->prim_count;
uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0;

total_prim_count += cur_prim_count;

@@ -775,9 +778,8 @@ void GSRendererCL::Enqueue()
uint32 group_count = batch_count * item_count;

tk.setArg(1, (cl_uint)prim_count);
tk.setArg(2, (cl_uint)batch_count);
tk.setArg(3, (cl_uint)bin_count);
tk.setArg(4, bin_dim);
tk.setArg(2, (cl_uint)bin_count);
tk.setArg(3, bin_dim);

m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
}
@@ -789,68 +791,20 @@ void GSRendererCL::Enqueue()
}
}

//
std::list<shared_ptr<TFXJob>> jobs(head, next);

uint32 prim_start = 0;

for(auto i = head; i != next; i++)
{
ASSERT(prim_start < MAX_PRIM_COUNT);

// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same
// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim

tfxcount++;

UpdateTextureCache((*i).get());

uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);

// TODO: tile level z test

cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);

if(tfx_prev != tfx())
{
tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);

tfx_prev = tfx();
}

tfx.setArg(4, (cl_uint)(*i)->pb_start);
tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count_inner);
tfx.setArg(7, (cl_uint)batch_count);
tfx.setArg(8, (cl_uint)bin_count);
tfx.setArg(9, bin_dim);

GSVector4i r = GSVector4i::load<false>(&(*i)->rect);

r = r.ralign<Align_Outside>(GSVector2i(8, 8));

m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));

tfxpixels += r.width() * r.height();

InvalidateTextureCache((*i).get());

// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily

prim_start += prim_count_inner;
}

//
EnqueueTFX(jobs, bin_count, bin_dim);

if(total_prim_count > MAX_PRIM_COUNT)
{
prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);

(*job)->ib_start += prim_count * n * sizeof(uint32);
(*job)->ib_count -= prim_count * n;
(*job)->prim_count -= prim_count;

next = job; // try again for the remainder

//printf("split %d\n", (*job)->ib_count / n);
//printf("split %d\n", (*job)->prim_count);
}

break;
@@ -876,6 +830,131 @@ void GSRendererCL::Enqueue()
m_cl.Map();
}

void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim)
{
// join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages

//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);

auto next = jobs.begin();

while(next != jobs.end())
{
auto prev = next++;

if(next == jobs.end())
{
break;
}

if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw)
{
if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL)
{
bool overlap = false;

for(int i = 0; i < 4; i++)
{
if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero()))
{
overlap = true;

break;
}
}

if(overlap)
{
continue;
}
}

if((*prev)->src_pages != NULL)
{
GSVector4i* src_pages = (*next)->GetSrcPages();

for(int i = 0; i < 4; i++)
{
src_pages[i] |= (*prev)->src_pages[i];
}
}

if((*prev)->dst_pages != NULL)
{
GSVector4i* dst_pages = (*next)->GetDstPages();

for(int i = 0; i < 4; i++)
{
dst_pages[i] |= (*prev)->dst_pages[i];
}
}

GSVector4i prev_rect = GSVector4i::load<false>(&(*prev)->rect);
GSVector4i next_rect = GSVector4i::load<false>(&(*next)->rect);

GSVector4i::store<false>(&(*next)->rect, prev_rect.runion(next_rect));

(*next)->prim_count += (*prev)->prim_count;
(*next)->ib_start = (*prev)->ib_start;

jobs.erase(prev);
}
}

//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);

//

cl_kernel tfx_prev = NULL;

uint32 prim_start = 0;

for(auto i : jobs)
{
ASSERT(prim_start < MAX_PRIM_COUNT);

tfxcount++;

UpdateTextureCache(i.get());

uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start);

// TODO: tile level z test

cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);

if(tfx_prev != tfx())
{
tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
tfx.setArg(4, (cl_uint)m_pb_start);

tfx_prev = tfx();
}

tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count);
tfx.setArg(7, (cl_uint)bin_count);
tfx.setArg(8, bin_dim);
tfx.setArg(9, i->fbp);
tfx.setArg(10, i->zbp);
tfx.setArg(11, i->bw);

GSVector4i r = GSVector4i::load<false>(&i->rect);

r = r.ralign<Align_Outside>(GSVector2i(8, 8));

m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8));

tfxpixels += r.width() * r.height();

InvalidateTextureCache(i.get());

// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily

prim_start += prim_count;
}
}

void GSRendererCL::UpdateTextureCache(TFXJob* job)
{
if(job->src_pages == NULL) return;
@@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
return true;
}

//////////
//

GSRendererCL::TFXJob::TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}

GSRendererCL::TFXJob::~TFXJob()
{
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}

GSVector4i* GSRendererCL::TFXJob::GetSrcPages()
{
if(src_pages == NULL)
{
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

src_pages[0] = GSVector4i::zero();
src_pages[1] = GSVector4i::zero();
src_pages[2] = GSVector4i::zero();
src_pages[3] = GSVector4i::zero();
}

return src_pages;
}

GSVector4i* GSRendererCL::TFXJob::GetDstPages()
{
if(dst_pages == NULL)
{
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

dst_pages[0] = GSVector4i::zero();
dst_pages[1] = GSVector4i::zero();
dst_pages[2] = GSVector4i::zero();
dst_pages[3] = GSVector4i::zero();
}

return dst_pages;
}

//

//#define IOCL_DEBUG

@@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL()
ib.mapped_ptr = ib.ptr = NULL;
pb.mapped_ptr = pb.ptr = NULL;

pb.size = sizeof(TFXParameter) * 256;
pb.size = TFX_PARAM_SIZE * 256;
pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);

@@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map()
{
Unmap();

// TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+

if(vb.head < vb.size)
{
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
ASSERT(((size_t)vb.ptr & 15) == 0);
ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
}

if(ib.head < ib.size)
@@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map()
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
ASSERT(((size_t)pb.ptr & 15) == 0);
ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
}
}

@@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt)
opt << "-D BIN_SIZE=" << BIN_SIZE << "u ";
opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u ";
opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u ";
#ifdef IOCL_DEBUG
opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
#endif

@@ -164,55 +164,20 @@ class GSRendererCL : public GSRenderer
{
public:
struct { int x, y, z, w; } rect;
TFXSelector sel; // uses primclass, solidrect only
uint32 ib_start, ib_count;
uint32 pb_start;
TFXSelector sel;
uint32 ib_start;
uint32 prim_count;
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
uint32 fbp, zbp, bw;
#ifdef DEBUG
TFXParameter* param;
TFXParameter* pb;
#endif
TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}
TFXJob();
virtual ~TFXJob();

virtual ~TFXJob()
{
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}

GSVector4i* GetSrcPages()
{
if(src_pages == NULL)
{
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

src_pages[0] = GSVector4i::zero();
src_pages[1] = GSVector4i::zero();
src_pages[2] = GSVector4i::zero();
src_pages[3] = GSVector4i::zero();
}

return src_pages;
}

GSVector4i* GetDstPages()
{
if(dst_pages == NULL)
{
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

dst_pages[0] = GSVector4i::zero();
dst_pages[1] = GSVector4i::zero();
dst_pages[2] = GSVector4i::zero();
dst_pages[3] = GSVector4i::zero();
}

return dst_pages;
}
GSVector4i* GetSrcPages();
GSVector4i* GetDstPages();
};

class CL
@@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer
std::list<shared_ptr<TFXJob>> m_jobs;
uint32 m_vb_start;
uint32 m_vb_count;
uint32 m_pb_start;
uint32 m_pb_count;
bool m_synced;

void Enqueue();
void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
void UpdateTextureCache(TFXJob* job);
void InvalidateTextureCache(TFXJob* job);

@@ -94,6 +94,7 @@ static class GSUtilMaps
public:
uint8 PrimClassField[8];
uint8 VertexCountField[8];
uint8 ClassVertexCountField[4];
uint32 CompatibleBitsField[64][2];
uint32 SharedBitsField[64][2];

@@ -117,6 +118,11 @@ public:
VertexCountField[GS_SPRITE] = 2;
VertexCountField[GS_INVALID] = 1;

ClassVertexCountField[GS_POINT_CLASS] = 1;
ClassVertexCountField[GS_LINE_CLASS] = 2;
ClassVertexCountField[GS_TRIANGLE_CLASS] = 3;
ClassVertexCountField[GS_SPRITE_CLASS] = 2;

memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField));

for(int i = 0; i < 64; i++)
@@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim)
return s_maps.VertexCountField[prim];
}

int GSUtil::GetClassVertexCount(uint32 primclass)
{
return s_maps.ClassVertexCountField[primclass];
}

const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm)
{
return s_maps.SharedBitsField[dpsm];

@@ -30,6 +30,7 @@ public:

static GS_PRIM_CLASS GetPrimClass(uint32 prim);
static int GetVertexCount(uint32 prim);
static int GetClassVertexCount(uint32 primclass);

static const uint32* HasSharedBitsPtr(uint32 dpsm);
static bool HasSharedBits(uint32 spsm, const uint32* ptr);

@@ -36,7 +36,10 @@ typedef struct

typedef struct
{
gs_vertex v[4];
gs_vertex v[3];
uint zmin;
uint pb_index;
uint _pad[2];
} gs_prim;

typedef struct
@@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM(

ib += prim_index * VERTEX_PER_PRIM;

prim->pb_index = ib[0] >> 24;

__global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff];
__global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff];
__global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff];

int2 pmin, pmax;

if(PRIM == GS_POINT_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];

pmin = pmax = convert_int2_rte(v0->p.xy);

prim->v[0].p = v0->p;
@@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM(
}
else if(PRIM == GS_LINE_CLASS)
{
int2 p0 = convert_int2_rte(vb[ib[0]].p.xy);
int2 p1 = convert_int2_rte(vb[ib[1]].p.xy);
int2 p0 = convert_int2_rte(v0->p.xy);
int2 p1 = convert_int2_rte(v1->p.xy);

pmin = min(p0, p1);
pmax = max(p0, p1);
}
else if(PRIM == GS_TRIANGLE_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];
__global gs_vertex* v1 = &vb[ib[1]];
__global gs_vertex* v2 = &vb[ib[2]];

int2 p0 = convert_int2_rtp(v0->p.xy);
int2 p1 = convert_int2_rtp(v1->p.xy);
int2 p2 = convert_int2_rtp(v2->p.xy);
@@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM(
pmax = max(max(p0, p1), p2);

// z needs special care, since it's a 32 bit unit, float cannot encode it exactly
// pass the minimum through the unused 4th padding vector
// only interpolate the relative and hopefully small values
// only interpolate the relative to zmin and hopefully small values

uint zmin = min(min(v0->z, v1->z), v2->z);

@@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM(
prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
prim->v[2].tc = v2->tc;

prim->v[3].z = zmin;
prim->zmin = zmin;

float4 dp0 = v1->p - v0->p;
float4 dp1 = v0->p - v2->p;
@@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM(
}
else if(PRIM == GS_SPRITE_CLASS)
{
__global gs_vertex* v0 = &vb[ib[0]];
__global gs_vertex* v1 = &vb[ib[1]];

int2 p0 = convert_int2_rtp(v0->p.xy);
int2 p1 = convert_int2_rtp(v1->p.xy);

@@ -785,7 +784,6 @@ __kernel void KERNEL_TILE(
__kernel void KERNEL_TILE(
__global gs_env* env,
uint prim_count,
uint batch_count,
uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim)
{
@@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
uint pb_start,
uint prim_start,
uint prim_count,
uint batch_count,
uint bin_count, // == bin_dim.z * bin_dim.w
uchar4 bin_dim)
uchar4 bin_dim,
uint fbp,
uint zbp,
uint bw)
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(

//

__global gs_param* pb = (__global gs_param*)(pb_base + pb_start);

int2 pi = (int2)(x, y);
float2 pf = convert_float2(pi);

if(!NOSCISSOR)
{
if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
{
return;
}
}

int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM);
int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM);
int faddr = PixelAddress(x, y, fbp, bw, FPSM);
int zaddr = PixelAddress(x, y, zbp, bw, ZPSM);

uint fd, zd; // TODO: fd as int4 and only pack before writing out?

@@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];

pb_base += pb_start;

BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);

for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
@@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
break;
}

__global gs_prim* prim = &prim_base[prim_index + i];

bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);

__global gs_prim* prim = &prim_base[prim_index + i];
__global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE);

if(!NOSCISSOR)
{
if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
{
continue;
}
}

uint2 zf;
float3 t;
int4 c;
@@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));

zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin;
zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);

t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
@@ -1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
}
}

// read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway)
// read mask

uint fm = pb->fm;
uint zm = pb->zm;
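Note (an illustrative recap of the diff above, not additional patch content): a single KERNEL_TFX launch can now cover draws with different per-draw registers because the parameter-block index travels with the index buffer. Draw() packs it into the top 8 bits of each vertex index, KERNEL_PRIM unpacks it into gs_prim::pb_index, and KERNEL_TFX resolves the per-primitive parameters from it. In rough C++-style pseudocode, with packed/vertex_index/param_block_index as hypothetical names:

uint32 packed = vertex_index | (param_block_index << 24); // written by Draw(), hence ASSERT(m_pb_count < 256)
uint32 pb_index = packed >> 24;                           // recovered by KERNEL_PRIM into prim->pb_index
uint32 vertex = packed & 0x00ffffff;                      // masked index used for the vb[] lookups
// KERNEL_TFX then reads the parameter block at pb_base + pb_index * TFX_PARAM_SIZE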