diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 77693579d6..ff7aa4d43f 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; #define BIN_SIZE (1u << BIN_SIZE_BITS) #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) +#define TFX_PARAM_SIZE 2048 #if MAX_PRIM_PER_BATCH == 64u #define BIN_TYPE cl_ulong @@ -72,6 +73,7 @@ typedef struct GSRendererCL::GSRendererCL() : m_vb_count(0) + , m_synced(true) { m_nativeres = true; // ignore ini, sw is always native @@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL() InitCVB(GS_TRIANGLE_CLASS); InitCVB(GS_SPRITE_CLASS); + // NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without + // mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be. 
+ m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize); } @@ -122,13 +127,17 @@ static int pageuploads = 0; static int pageuploadcount = 0; static int tfxcount = 0; static int64 tfxpixels = 0; +static int tfxselcount = 0; +static int tfxdiffselcount = 0; void GSRendererCL::VSync(int field) { GSRenderer::VSync(field); //printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels); + //printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount); pageuploads = pageuploadcount = tfxcount = tfxpixels = 0; + tfxselcount = tfxdiffselcount = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); } @@ -284,7 +293,9 @@ void GSRendererCL::Draw() { size_t vb_size = m_vertex.next * sizeof(GSVertexCL); size_t ib_size = m_index.tail * sizeof(uint32); - size_t pb_size = sizeof(TFXParameter); + size_t pb_size = TFX_PARAM_SIZE; + + ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE); if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size) { @@ -366,12 +377,16 @@ void GSRendererCL::Draw() m_vb_start = m_cl.vb.tail; m_vb_count = 0; + m_pb_start = m_cl.pb.tail; + m_pb_count = 0; } else { // TODO: SIMD - uint32 vb_count = m_vb_count; + ASSERT(m_pb_count < 256); + + uint32 vb_count = m_vb_count | (m_pb_count << 24); for(size_t i = 0; i < m_index.tail; i++) { @@ -398,21 +413,25 @@ void GSRendererCL::Draw() job->rect.z = rect.z; job->rect.w = rect.w; job->ib_start = m_cl.ib.tail; - job->ib_count = m_index.tail; - job->pb_start = m_cl.pb.tail; + job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass); + job->fbp = pb->fbp; + job->zbp = pb->zbp; + job->bw = pb->bw; #ifdef DEBUG - job->param = pb; + job->pb = pb; #endif - m_jobs.push_back(job); m_vb_count += m_vertex.next; + m_pb_count++; m_cl.vb.tail += vb_size; m_cl.ib.tail += ib_size; 
m_cl.pb.tail += pb_size; + m_synced = false; + // mark pages used in rendering as source or target if(job->sel.fwrite || job->sel.rfb) @@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason) m_rw_pages[1][i] = GSVector4i::zero(); } - // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, - // though it does not seem to be necessary even with GPU devices where it might be cached, - // needs more testing... - - //void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); - //m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr); + m_synced = true; } void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) @@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS o->GetPagesAsBits(r, m_tmp_pages); - //if(!synced) + if(!m_synced) { for(int i = 0; i < 4; i++) { @@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS { if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - //if(!synced) + if(!m_synced) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); @@ -620,16 +634,7 @@ void GSRendererCL::Enqueue() int primclass = m_jobs.front()->sel.prim; - uint32 n; - - switch(primclass) - { - case GS_POINT_CLASS: n = 1; break; - case GS_LINE_CLASS: n = 2; break; - case GS_TRIANGLE_CLASS: n = 3; break; - case GS_SPRITE_CLASS: n = 2; break; - default: __assume(0); - } + uint32 n = GSUtil::GetClassVertexCount(primclass); PrimSelector psel; @@ -678,8 +683,6 @@ void GSRendererCL::Enqueue() // - cl_kernel tfx_prev = NULL; - auto head = m_jobs.begin(); while(head != m_jobs.end()) @@ -692,8 +695,8 @@ void GSRendererCL::Enqueue() { auto job = next++; - uint32 cur_prim_count = (*job)->ib_count / n; - uint32 next_prim_count = next != m_jobs.end() ? 
(*next)->ib_count / n : 0; + uint32 cur_prim_count = (*job)->prim_count; + uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0; total_prim_count += cur_prim_count; @@ -775,9 +778,8 @@ void GSRendererCL::Enqueue() uint32 group_count = batch_count * item_count; tk.setArg(1, (cl_uint)prim_count); - tk.setArg(2, (cl_uint)batch_count); - tk.setArg(3, (cl_uint)bin_count); - tk.setArg(4, bin_dim); + tk.setArg(2, (cl_uint)bin_count); + tk.setArg(3, bin_dim); m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); } @@ -789,68 +791,20 @@ } } - // + std::list<shared_ptr<TFXJob>> jobs(head, next); - uint32 prim_start = 0; - - for(auto i = head; i != next; i++) - { - ASSERT(prim_start < MAX_PRIM_COUNT); - - // TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same - // move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim - - tfxcount++; - - UpdateTextureCache((*i).get()); - - uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - - // TODO: tile level z test - - cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel); - - if(tfx_prev != tfx()) - { - tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); - - tfx_prev = tfx(); - } - - tfx.setArg(4, (cl_uint)(*i)->pb_start); - tfx.setArg(5, (cl_uint)prim_start); - tfx.setArg(6, (cl_uint)prim_count_inner); - tfx.setArg(7, (cl_uint)batch_count); - tfx.setArg(8, (cl_uint)bin_count); - tfx.setArg(9, bin_dim); - - GSVector4i r = GSVector4i::load(&(*i)->rect); - - r = r.ralign(GSVector2i(8, 8)); - - m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); - - tfxpixels += r.width() * r.height(); - - InvalidateTextureCache((*i).get()); - - // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily - - prim_start += prim_count_inner; - } - 
- // + EnqueueTFX(jobs, bin_count, bin_dim); if(total_prim_count > MAX_PRIM_COUNT) { prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); (*job)->ib_start += prim_count * n * sizeof(uint32); - (*job)->ib_count -= prim_count * n; + (*job)->prim_count -= prim_count; next = job; // try again for the remainder - //printf("split %d\n", (*job)->ib_count / n); + //printf("split %d\n", (*job)->prim_count); } break; @@ -876,6 +830,131 @@ m_cl.Map(); } +void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim) +{ + // join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages + + //printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + auto next = jobs.begin(); + + while(next != jobs.end()) + { + auto prev = next++; + + if(next == jobs.end()) + { + break; + } + + if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw) + { + if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL) + { + bool overlap = false; + + for(int i = 0; i < 4; i++) + { + if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero())) + { + overlap = true; + + break; + } + } + + if(overlap) + { + continue; + } + } + + if((*prev)->src_pages != NULL) + { + GSVector4i* src_pages = (*next)->GetSrcPages(); + + for(int i = 0; i < 4; i++) + { + src_pages[i] |= (*prev)->src_pages[i]; + } + } + + if((*prev)->dst_pages != NULL) + { + GSVector4i* dst_pages = (*next)->GetDstPages(); + + for(int i = 0; i < 4; i++) + { + dst_pages[i] |= (*prev)->dst_pages[i]; + } + } + + GSVector4i prev_rect = GSVector4i::load(&(*prev)->rect); + GSVector4i next_rect = GSVector4i::load(&(*next)->rect); + + GSVector4i::store(&(*next)->rect, prev_rect.runion(next_rect)); + + (*next)->prim_count += (*prev)->prim_count; + (*next)->ib_start = 
(*prev)->ib_start; + + jobs.erase(prev); + } + } + + //printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + // + + cl_kernel tfx_prev = NULL; + + uint32 prim_start = 0; + + for(auto i : jobs) + { + ASSERT(prim_start < MAX_PRIM_COUNT); + + tfxcount++; + + UpdateTextureCache(i.get()); + + uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start); + + // TODO: tile level z test + + cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + + if(tfx_prev != tfx()) + { + tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); + tfx.setArg(4, (cl_uint)m_pb_start); + + tfx_prev = tfx(); + } + + tfx.setArg(5, (cl_uint)prim_start); + tfx.setArg(6, (cl_uint)prim_count); + tfx.setArg(7, (cl_uint)bin_count); + tfx.setArg(8, bin_dim); + tfx.setArg(9, i->fbp); + tfx.setArg(10, i->zbp); + tfx.setArg(11, i->bw); + + GSVector4i r = GSVector4i::load(&i->rect); + + r = r.ralign(GSVector2i(8, 8)); + + m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); + + tfxpixels += r.width() * r.height(); + + InvalidateTextureCache(i.get()); + + // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily + + prim_start += prim_count; + } +} + void GSRendererCL::UpdateTextureCache(TFXJob* job) { if(job->src_pages == NULL) return; @@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver return true; } -////////// +// + +GSRendererCL::TFXJob::TFXJob() + : src_pages(NULL) + , dst_pages(NULL) +{ +} + +GSRendererCL::TFXJob::~TFXJob() +{ + if(src_pages != NULL) _aligned_free(src_pages); + if(dst_pages != NULL) _aligned_free(dst_pages); +} + +GSVector4i* GSRendererCL::TFXJob::GetSrcPages() +{ + if(src_pages == NULL) + { + src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + src_pages[0] = GSVector4i::zero(); + src_pages[1] = 
GSVector4i::zero(); + src_pages[2] = GSVector4i::zero(); + src_pages[3] = GSVector4i::zero(); + } + + return src_pages; +} + +GSVector4i* GSRendererCL::TFXJob::GetDstPages() +{ + if(dst_pages == NULL) + { + dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + dst_pages[0] = GSVector4i::zero(); + dst_pages[1] = GSVector4i::zero(); + dst_pages[2] = GSVector4i::zero(); + dst_pages[3] = GSVector4i::zero(); + } + + return dst_pages; +} + +// //#define IOCL_DEBUG @@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL() ib.mapped_ptr = ib.ptr = NULL; pb.mapped_ptr = pb.ptr = NULL; - pb.size = sizeof(TFXParameter) * 256; + pb.size = TFX_PARAM_SIZE * 256; pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); @@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map() { Unmap(); + // TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+ + if(vb.head < vb.size) { vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); - ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); } if(ib.head < ib.size) @@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map() pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); - ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); } } @@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt) opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; + opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; #ifdef IOCL_DEBUG opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; #endif diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h 
index 59fd524943..fdfea3f6d3 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -164,55 +164,20 @@ class GSRendererCL : public GSRenderer { public: struct { int x, y, z, w; } rect; - TFXSelector sel; // uses primclass, solidrect only - uint32 ib_start, ib_count; - uint32 pb_start; + TFXSelector sel; + uint32 ib_start; + uint32 prim_count; GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it + uint32 fbp, zbp, bw; #ifdef DEBUG - TFXParameter* param; + TFXParameter* pb; #endif - TFXJob() - : src_pages(NULL) - , dst_pages(NULL) - { - } + TFXJob(); + virtual ~TFXJob(); - virtual ~TFXJob() - { - if(src_pages != NULL) _aligned_free(src_pages); - if(dst_pages != NULL) _aligned_free(dst_pages); - } - - GSVector4i* GetSrcPages() - { - if(src_pages == NULL) - { - src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - src_pages[0] = GSVector4i::zero(); - src_pages[1] = GSVector4i::zero(); - src_pages[2] = GSVector4i::zero(); - src_pages[3] = GSVector4i::zero(); - } - - return src_pages; - } - - GSVector4i* GetDstPages() - { - if(dst_pages == NULL) - { - dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - dst_pages[0] = GSVector4i::zero(); - dst_pages[1] = GSVector4i::zero(); - dst_pages[2] = GSVector4i::zero(); - dst_pages[3] = GSVector4i::zero(); - } - - return dst_pages; - } + GSVector4i* GetSrcPages(); + GSVector4i* GetDstPages(); }; class CL @@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer std::list<shared_ptr<TFXJob>> m_jobs; uint32 m_vb_start; uint32 m_vb_count; + uint32 m_pb_start; + uint32 m_pb_count; + bool m_synced; void Enqueue(); + void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim); void UpdateTextureCache(TFXJob* job); void InvalidateTextureCache(TFXJob* job); diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 533e51642a..7c0e239edc 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -94,6 +94,7 @@ 
static class GSUtilMaps public: uint8 PrimClassField[8]; uint8 VertexCountField[8]; + uint8 ClassVertexCountField[4]; uint32 CompatibleBitsField[64][2]; uint32 SharedBitsField[64][2]; @@ -117,6 +118,11 @@ public: VertexCountField[GS_SPRITE] = 2; VertexCountField[GS_INVALID] = 1; + ClassVertexCountField[GS_POINT_CLASS] = 1; + ClassVertexCountField[GS_LINE_CLASS] = 2; + ClassVertexCountField[GS_TRIANGLE_CLASS] = 3; + ClassVertexCountField[GS_SPRITE_CLASS] = 2; + memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField)); for(int i = 0; i < 64; i++) @@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim) return s_maps.VertexCountField[prim]; } +int GSUtil::GetClassVertexCount(uint32 primclass) +{ + return s_maps.ClassVertexCountField[primclass]; +} + const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm) { return s_maps.SharedBitsField[dpsm]; diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index b3697c0812..a4cad91014 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -30,6 +30,7 @@ public: static GS_PRIM_CLASS GetPrimClass(uint32 prim); static int GetVertexCount(uint32 prim); + static int GetClassVertexCount(uint32 primclass); static const uint32* HasSharedBitsPtr(uint32 dpsm); static bool HasSharedBits(uint32 spsm, const uint32* ptr); diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index f9bb6dac2b..65f7cfa05e 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -36,7 +36,10 @@ typedef struct typedef struct { - gs_vertex v[4]; + gs_vertex v[3]; + uint zmin; + uint pb_index; + uint _pad[2]; } gs_prim; typedef struct @@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM( ib += prim_index * VERTEX_PER_PRIM; + prim->pb_index = ib[0] >> 24; + + __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; + __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; + __global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff]; + int2 pmin, pmax; if(PRIM == GS_POINT_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - pmin = pmax = 
convert_int2_rte(v0->p.xy); prim->v[0].p = v0->p; @@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_LINE_CLASS) { - int2 p0 = convert_int2_rte(vb[ib[0]].p.xy); - int2 p1 = convert_int2_rte(vb[ib[1]].p.xy); + int2 p0 = convert_int2_rte(v0->p.xy); + int2 p1 = convert_int2_rte(v1->p.xy); pmin = min(p0, p1); pmax = max(p0, p1); } else if(PRIM == GS_TRIANGLE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - __global gs_vertex* v2 = &vb[ib[2]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); int2 p2 = convert_int2_rtp(v2->p.xy); @@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM( pmax = max(max(p0, p1), p2); // z needs special care, since it's a 32 bit unit, float cannot encode it exactly - // pass the minimum through the unused 4th padding vector - // only interpolate the relative and hopefully small values + // only interpolate the relative to zmin and hopefully small values uint zmin = min(min(v0->z, v1->z), v2->z); @@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM( prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); prim->v[2].tc = v2->tc; - prim->v[3].z = zmin; + prim->zmin = zmin; float4 dp0 = v1->p - v0->p; float4 dp1 = v0->p - v2->p; @@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_SPRITE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); @@ -785,7 +784,6 @@ __kernel void KERNEL_TILE( __kernel void KERNEL_TILE( __global gs_env* env, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { @@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( uint pb_start, uint prim_start, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim) + uchar4 bin_dim, + uint fbp, + uint zbp, + uint bw) { uint x = 
get_global_id(0); uint y = get_global_id(1); @@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( // - __global gs_param* pb = (__global gs_param*)(pb_base + pb_start); - int2 pi = (int2)(x, y); float2 pf = convert_float2(pi); - if(!NOSCISSOR) - { - if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) - { - return; - } - } - - int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); - int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + int faddr = PixelAddress(x, y, fbp, bw, FPSM); + int zaddr = PixelAddress(x, y, zbp, bw, ZPSM); uint fd, zd; // TODO: fd as int4 and only pack before writing out? @@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; + pb_base += pb_start; + BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) @@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( break; } - __global gs_prim* prim = &prim_base[prim_index + i]; - bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); + __global gs_prim* prim = &prim_base[prim_index + i]; + __global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE); + + if(!NOSCISSOR) + { + if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) + { + continue; + } + } + uint2 zf; float3 t; int4 c; @@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); - zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; + zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + 
prim->zmin; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; @@ -1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( } } - // read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway) + // read mask uint fm = pb->fm; uint zm = pb->zm;