mirror of https://github.com/PCSX2/pcsx2.git
Less opencl bugs, some games are actually playable now, there are still texture errors.
This commit is contained in:
parent
881735b562
commit
ba1e522bbb
|
@ -2711,7 +2711,7 @@ bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga)
|
|||
|
||||
case AFAIL_RGB_ONLY:
|
||||
zm |= t;
|
||||
fm |= t & T::xff000000();
|
||||
fm |= t & T::xff000000(); // fpsm 16 bit => & 0xffff8000?
|
||||
break;
|
||||
|
||||
default:
|
||||
|
|
|
@ -2082,3 +2082,45 @@ uint32* GSOffset::GetPages(const GSVector4i& rect, uint32* pages, GSVector4i* bb
|
|||
|
||||
return pages;
|
||||
}
|
||||
|
||||
GSVector4i* GSOffset::GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages, GSVector4i* bbox)
|
||||
{
|
||||
if(pages == NULL)
|
||||
{
|
||||
pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
|
||||
}
|
||||
|
||||
pages[0] = GSVector4i::zero();
|
||||
pages[1] = GSVector4i::zero();
|
||||
pages[2] = GSVector4i::zero();
|
||||
pages[3] = GSVector4i::zero();
|
||||
|
||||
GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
|
||||
|
||||
GSVector4i r = rect.ralign<Align_Outside>(bs);
|
||||
|
||||
if(bbox != NULL) *bbox = r;
|
||||
|
||||
r = r.sra32(3);
|
||||
|
||||
bs.x >>= 3;
|
||||
bs.y >>= 3;
|
||||
|
||||
for(int y = r.top; y < r.bottom; y += bs.y)
|
||||
{
|
||||
uint32 base = block.row[y];
|
||||
|
||||
for(int x = r.left; x < r.right; x += bs.x)
|
||||
{
|
||||
uint32 n = (base + block.col[x]) >> 5;
|
||||
|
||||
if(n < MAX_PAGES)
|
||||
{
|
||||
((uint32*)pages)[n >> 5] |= 1 << (n & 31);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return pages;
|
||||
|
||||
}
|
|
@ -54,6 +54,7 @@ public:
|
|||
enum {EOP = 0xffffffff};
|
||||
|
||||
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
|
||||
GSVector4i* GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages = NULL, GSVector4i* bbox = NULL); // free returned value with _aligned_free
|
||||
};
|
||||
|
||||
struct GSPixelOffset
|
||||
|
|
|
@ -77,18 +77,16 @@ GSRendererCL::GSRendererCL()
|
|||
{
|
||||
m_nativeres = true; // ignore ini, sw is always native
|
||||
|
||||
//s_dump = 1;
|
||||
//s_save = 1;
|
||||
//s_savez = 1;
|
||||
|
||||
// TODO: m_tc = new GSTextureCacheCL(this);
|
||||
|
||||
memset(m_texture, 0, sizeof(m_texture));
|
||||
|
||||
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
|
||||
|
||||
memset(m_rw_pages, 0, sizeof(m_rw_pages));
|
||||
memset(m_tex_pages, 0, sizeof(m_tex_pages));
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[0][i] = GSVector4i::zero();
|
||||
m_rw_pages[1][i] = GSVector4i::zero();
|
||||
m_tc_pages[i] = GSVector4i::xffffffff();
|
||||
}
|
||||
|
||||
#define InitCVB(P) \
|
||||
m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
|
||||
|
@ -107,8 +105,6 @@ GSRendererCL::GSRendererCL()
|
|||
|
||||
GSRendererCL::~GSRendererCL()
|
||||
{
|
||||
// TODO: delete m_tc;
|
||||
|
||||
for(size_t i = 0; i < countof(m_texture); i++)
|
||||
{
|
||||
delete m_texture[i];
|
||||
|
@ -121,18 +117,21 @@ void GSRendererCL::Reset()
|
|||
{
|
||||
Sync(-1);
|
||||
|
||||
// TODO: m_tc->RemoveAll();
|
||||
|
||||
GSRenderer::Reset();
|
||||
}
|
||||
|
||||
static int pageuploads = 0;
|
||||
static int pageuploadcount = 0;
|
||||
static int tfxcount = 0;
|
||||
|
||||
void GSRendererCL::VSync(int field)
|
||||
{
|
||||
Sync(0); // IncAge might delete a cached texture in use
|
||||
Sync(0);
|
||||
|
||||
GSRenderer::VSync(field);
|
||||
|
||||
// TODO: m_tc->IncAge();
|
||||
printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount);
|
||||
pageuploads = pageuploadcount = tfxcount = 0;
|
||||
|
||||
//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
|
||||
}
|
||||
|
@ -364,7 +363,7 @@ void GSRendererCL::Draw()
|
|||
{
|
||||
// only allow batches of the same primclass in Enqueue
|
||||
|
||||
if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass)
|
||||
if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass)
|
||||
{
|
||||
Enqueue();
|
||||
}
|
||||
|
@ -372,6 +371,16 @@ void GSRendererCL::Draw()
|
|||
|
||||
//
|
||||
|
||||
shared_ptr<TFXJob> job(new TFXJob());
|
||||
|
||||
job->rect.x = rect.x;
|
||||
job->rect.y = rect.y;
|
||||
job->rect.z = rect.z;
|
||||
job->rect.w = rect.w;
|
||||
job->ib_start = m_cl.ib.tail;
|
||||
job->ib_count = m_index.tail;
|
||||
job->pb_start = m_cl.pb.tail;
|
||||
|
||||
GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
|
||||
uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
|
||||
TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
|
||||
|
@ -402,21 +411,12 @@ void GSRendererCL::Draw()
|
|||
|
||||
m_vb_count += m_vertex.next;
|
||||
|
||||
if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail))
|
||||
if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
TFXJob job;
|
||||
|
||||
job.rect.x = rect.x;
|
||||
job.rect.y = rect.y;
|
||||
job.rect.z = rect.z;
|
||||
job.rect.w = rect.w;
|
||||
job.sel = pb->sel;
|
||||
job.ib_start = m_cl.ib.tail;
|
||||
job.ib_count = m_index.tail;
|
||||
job.pb_start = m_cl.pb.tail;
|
||||
job->sel = pb->sel;
|
||||
|
||||
m_jobs.push_back(job);
|
||||
|
||||
|
@ -424,29 +424,67 @@ void GSRendererCL::Draw()
|
|||
m_cl.ib.tail += ib_size;
|
||||
m_cl.pb.tail += pb_size;
|
||||
|
||||
// mark pages for writing
|
||||
// mark pages used in rendering as source or target
|
||||
|
||||
if(pb->sel.fb)
|
||||
if(pb->sel.fwrite || pb->sel.rfb)
|
||||
{
|
||||
uint8 flag = pb->sel.fb;
|
||||
m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
|
||||
|
||||
const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages);
|
||||
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
if(pb->sel.rfb)
|
||||
{
|
||||
m_rw_pages[*p] |= flag;
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[0][i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
if(pb->sel.zb)
|
||||
if(pb->sel.fwrite)
|
||||
{
|
||||
uint8 flag = pb->sel.zb;
|
||||
|
||||
const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages);
|
||||
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[*p] |= flag;
|
||||
m_rw_pages[1][i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
GSVector4i* dst_pages = job->GetDstPages();
|
||||
|
||||
if(pb->sel.fwrite)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
dst_pages[i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(pb->sel.zwrite || pb->sel.rzb)
|
||||
{
|
||||
m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
|
||||
|
||||
if(pb->sel.rzb)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[0][i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
if(pb->sel.zwrite)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[1][i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
GSVector4i* dst_pages = job->GetDstPages();
|
||||
|
||||
if(pb->sel.zwrite)
|
||||
{
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
dst_pages[i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -456,52 +494,6 @@ void GSRendererCL::Draw()
|
|||
{
|
||||
Enqueue();
|
||||
}
|
||||
|
||||
/*
|
||||
// check if the texture is not part of a target currently in use
|
||||
|
||||
if(CheckSourcePages(data))
|
||||
{
|
||||
Sync(4);
|
||||
}
|
||||
|
||||
// addref source and target pages
|
||||
|
||||
data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
|
||||
*/
|
||||
|
||||
// update previously invalidated parts
|
||||
|
||||
//data->UpdateSource();
|
||||
/*
|
||||
if(LOG)
|
||||
{
|
||||
fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
|
||||
sd->counter,
|
||||
m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
|
||||
m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
|
||||
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
|
||||
PRIM->PRIM, sd->vertex_count, sd->index_count);
|
||||
|
||||
fflush(s_fp);
|
||||
}
|
||||
*/
|
||||
|
||||
//printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w);
|
||||
|
||||
/*
|
||||
// invalidate new parts rendered onto
|
||||
|
||||
if(sd->global.sel.fwrite)
|
||||
{
|
||||
m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm);
|
||||
}
|
||||
|
||||
if(sd->global.sel.zwrite)
|
||||
{
|
||||
m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm);
|
||||
}
|
||||
*/
|
||||
}
|
||||
catch(cl::Error err)
|
||||
{
|
||||
|
@ -552,8 +544,11 @@ void GSRendererCL::Sync(int reason)
|
|||
|
||||
m_cl.queue[2].finish();
|
||||
|
||||
memset(m_rw_pages, 0, sizeof(m_rw_pages));
|
||||
memset(m_tex_pages, 0, sizeof(m_tex_pages));
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_rw_pages[0][i] = GSVector4i::zero();
|
||||
m_rw_pages[1][i] = GSVector4i::zero();
|
||||
}
|
||||
|
||||
// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap,
|
||||
// though it does not seem to be necessary even with GPU devices where it might be cached,
|
||||
|
@ -569,13 +564,15 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
|
||||
|
||||
o->GetPages(r, m_tmp_pages);
|
||||
o->GetPagesAsBits(r, m_tmp_pages);
|
||||
|
||||
//if(!synced)
|
||||
{
|
||||
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
if(m_rw_pages[*p] & 3) // rw
|
||||
GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
|
||||
|
||||
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
|
||||
{
|
||||
Sync(3);
|
||||
|
||||
|
@ -584,9 +581,9 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
}
|
||||
}
|
||||
|
||||
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
m_tex_pages[*p] = 1;
|
||||
m_tc_pages[i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -598,11 +595,13 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
{
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
|
||||
|
||||
o->GetPages(r, m_tmp_pages);
|
||||
o->GetPagesAsBits(r, m_tmp_pages);
|
||||
|
||||
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
if(m_rw_pages[*p] & 1) // w
|
||||
GSVector4i pages = m_rw_pages[1][i];
|
||||
|
||||
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
|
||||
{
|
||||
Sync(4);
|
||||
|
||||
|
@ -611,34 +610,6 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
bool GSRendererCL::CheckSourcePages(RasterizerData* data)
|
||||
{
|
||||
// TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync
|
||||
{
|
||||
for(size_t i = 0; data->m_tex[i].t != NULL; i++)
|
||||
{
|
||||
data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages);
|
||||
|
||||
uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n;
|
||||
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
|
||||
|
||||
if(m_fzb_pages[*p]) // currently being drawn to? => sync
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
//#include "GSTextureCL.h"
|
||||
|
||||
void GSRendererCL::Enqueue()
|
||||
{
|
||||
|
@ -650,7 +621,7 @@ void GSRendererCL::Enqueue()
|
|||
ASSERT(m_cl.ib.tail > m_cl.ib.head);
|
||||
ASSERT(m_cl.pb.tail > m_cl.pb.head);
|
||||
|
||||
int primclass = m_jobs.front().sel.prim;
|
||||
int primclass = m_jobs.front()->sel.prim;
|
||||
|
||||
uint32 n;
|
||||
|
||||
|
@ -724,8 +695,8 @@ void GSRendererCL::Enqueue()
|
|||
{
|
||||
auto job = next++;
|
||||
|
||||
uint32 cur_prim_count = job->ib_count / n;
|
||||
uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0;
|
||||
uint32 cur_prim_count = (*job)->ib_count / n;
|
||||
uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
|
||||
|
||||
total_prim_count += cur_prim_count;
|
||||
|
||||
|
@ -734,7 +705,7 @@ void GSRendererCL::Enqueue()
|
|||
uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);
|
||||
|
||||
pk.setArg(3, (cl_uint)m_vb_start);
|
||||
pk.setArg(4, (cl_uint)head->ib_start);
|
||||
pk.setArg(4, (cl_uint)(*head)->ib_start);
|
||||
|
||||
m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
|
||||
|
||||
|
@ -748,7 +719,7 @@ void GSRendererCL::Enqueue()
|
|||
|
||||
for(auto i = head; i != next; i++)
|
||||
{
|
||||
rect = rect.runion(GSVector4i::load<false>(&i->rect));
|
||||
rect = rect.runion(GSVector4i::load<false>(&(*i)->rect));
|
||||
}
|
||||
|
||||
rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
|
||||
|
@ -829,14 +800,40 @@ void GSRendererCL::Enqueue()
|
|||
{
|
||||
ASSERT(prim_start < MAX_PRIM_COUNT);
|
||||
|
||||
uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start);
|
||||
uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
|
||||
|
||||
// TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text),
|
||||
// changed by tfx in the previous loop or marked by InvalidateVideoMem
|
||||
tfxcount++;
|
||||
if((*i)->src_pages != NULL)
|
||||
{
|
||||
int count = 0;
|
||||
|
||||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j];
|
||||
|
||||
if(!pages.eq(GSVector4i::zero()))
|
||||
{
|
||||
// TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex)
|
||||
// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
|
||||
|
||||
for(int ii = 0; ii < 4; ii++)
|
||||
for(int jj = 0; jj < 32; jj++)
|
||||
if(pages.u32[ii] & (1 << jj)) count++;
|
||||
|
||||
m_tc_pages[j] &= ~(*i)->src_pages[j];
|
||||
}
|
||||
}
|
||||
|
||||
if(count > 0)
|
||||
{
|
||||
pageuploads += count;
|
||||
pageuploadcount++;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: tile level z test
|
||||
|
||||
cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
|
||||
cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);
|
||||
|
||||
if(tfx_prev != tfx())
|
||||
{
|
||||
|
@ -845,28 +842,32 @@ void GSRendererCL::Enqueue()
|
|||
tfx_prev = tfx();
|
||||
}
|
||||
|
||||
tfx.setArg(4, (cl_uint)i->pb_start);
|
||||
tfx.setArg(4, (cl_uint)(*i)->pb_start);
|
||||
tfx.setArg(5, (cl_uint)prim_start);
|
||||
tfx.setArg(6, (cl_uint)prim_count_inner);
|
||||
tfx.setArg(7, (cl_uint)batch_count);
|
||||
tfx.setArg(8, (cl_uint)bin_count);
|
||||
tfx.setArg(9, bin_dim);
|
||||
|
||||
//m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256));
|
||||
|
||||
//printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y);
|
||||
|
||||
GSVector4i r = GSVector4i::load<false>(&i->rect);
|
||||
GSVector4i r = GSVector4i::load<false>(&(*i)->rect);
|
||||
|
||||
r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
|
||||
/*
|
||||
if(i->sel.IsSolidRect()) // TODO: simple mem fill
|
||||
if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask
|
||||
;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
|
||||
else
|
||||
*/
|
||||
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
|
||||
|
||||
// TODO: invalidate texture cache pages
|
||||
if((*i)->dst_pages != NULL)
|
||||
{
|
||||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
m_tc_pages[j] |= (*i)->dst_pages[j];
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
|
||||
|
||||
prim_start += prim_count_inner;
|
||||
}
|
||||
|
@ -877,10 +878,12 @@ void GSRendererCL::Enqueue()
|
|||
{
|
||||
prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
|
||||
|
||||
job->ib_start += prim_count * n * sizeof(uint32);
|
||||
job->ib_count -= prim_count * n;
|
||||
(*job)->ib_start += prim_count * n * sizeof(uint32);
|
||||
(*job)->ib_count -= prim_count * n;
|
||||
|
||||
next = job; // try again for the reminder
|
||||
|
||||
printf("split %d\n", (*job)->ib_count / n);
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -929,7 +932,7 @@ static int RemapPSM(int psm)
|
|||
return psm;
|
||||
}
|
||||
|
||||
bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
|
||||
bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
|
||||
{
|
||||
const GSDrawingEnvironment& env = m_env;
|
||||
const GSDrawingContext* context = m_context;
|
||||
|
@ -970,7 +973,7 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
}
|
||||
|
||||
bool fwrite;
|
||||
bool zwrite;
|
||||
bool zwrite = zm != 0xffffffff;
|
||||
|
||||
switch(context->FRAME.PSM)
|
||||
{
|
||||
|
@ -991,26 +994,6 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
break;
|
||||
}
|
||||
|
||||
switch(context->ZBUF.PSM)
|
||||
{
|
||||
default:
|
||||
case PSM_PSMCT32:
|
||||
case PSM_PSMZ32:
|
||||
zwrite = zm != 0xffffffff;
|
||||
break;
|
||||
case PSM_PSMCT24:
|
||||
case PSM_PSMZ24:
|
||||
zwrite = (zm & 0x00ffffff) != 0x00ffffff;
|
||||
break;
|
||||
case PSM_PSMCT16:
|
||||
case PSM_PSMCT16S:
|
||||
case PSM_PSMZ16:
|
||||
case PSM_PSMZ16S:
|
||||
zm &= 0x80f8f8f8;
|
||||
zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8;
|
||||
break;
|
||||
}
|
||||
|
||||
if(!fwrite && !zwrite) return false;
|
||||
|
||||
bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
|
||||
|
@ -1061,19 +1044,21 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
pb->sel.tfx = TFX_DECAL;
|
||||
}
|
||||
|
||||
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
|
||||
|
||||
// TODO: if(t == NULL) {ASSERT(0); return false;}
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
|
||||
|
||||
// TODO: data->SetSource(t, r, 0);
|
||||
GSVector4i* src_pages = job->GetSrcPages();
|
||||
|
||||
// TODO: pb->sel.tw = t->m_tw - 3;
|
||||
GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
|
||||
|
||||
// TODO: store r to current job
|
||||
o->GetPagesAsBits(r, m_tmp_pages);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
src_pages[i] |= m_tmp_pages[i];
|
||||
m_rw_pages[0][i] |= m_tmp_pages[i];
|
||||
}
|
||||
|
||||
if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
|
||||
{
|
||||
|
@ -1195,17 +1180,19 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
m_vt.m_min.t *= 0.5f;
|
||||
m_vt.m_max.t *= 0.5f;
|
||||
|
||||
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3);
|
||||
|
||||
// TODO: if(t == NULL) {ASSERT(0); return false;}
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
|
||||
|
||||
// TODO: data->SetSource(t, r, i);
|
||||
GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
|
||||
|
||||
// TODO: store r to current job
|
||||
o->GetPagesAsBits(r, m_tmp_pages);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
src_pages[i] |= m_tmp_pages[i];
|
||||
m_rw_pages[0][i] |= m_tmp_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
s_counter++;
|
||||
|
@ -1361,15 +1348,23 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
pb->afix = context->ALPHA.FIX;
|
||||
}
|
||||
|
||||
if(pb->sel.date
|
||||
|| pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1
|
||||
|| pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|
||||
|| (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0
|
||||
|| (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask
|
||||
|| (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0)
|
||||
if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1)
|
||||
{
|
||||
pb->sel.rfb = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(fwrite)
|
||||
{
|
||||
if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|
||||
|| (pb->sel.fpsm & 3) == 0 && fm != 0
|
||||
|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|
||||
|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
|
||||
{
|
||||
pb->sel.rfb = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pb->sel.colclamp = env.COLCLAMP.CLAMP;
|
||||
pb->sel.fba = context->FBA.FBA;
|
||||
|
@ -1391,7 +1386,22 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
|
|||
{
|
||||
pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
|
||||
pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
|
||||
pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
|
||||
|
||||
if(ztest)
|
||||
{
|
||||
pb->sel.rzb = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(zwrite)
|
||||
{
|
||||
if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY)
|
||||
|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
|
||||
{
|
||||
pb->sel.rzb = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pb->fm = fm;
|
||||
|
@ -1732,7 +1742,7 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
|
|||
opt << "-D RFB=" << sel.rfb << " ";
|
||||
opt << "-D ZWRITE=" << sel.zwrite << " ";
|
||||
opt << "-D ZTEST=" << sel.ztest << " ";
|
||||
opt << "-D ZOVERFLOW=" << sel.zoverflow << " ";
|
||||
opt << "-D RZB=" << sel.rzb << " ";
|
||||
opt << "-D WMS=" << sel.wms << " ";
|
||||
opt << "-D WMT=" << sel.wmt << " ";
|
||||
opt << "-D DATM=" << sel.datm << " ";
|
||||
|
@ -1740,7 +1750,6 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
|
|||
opt << "-D FBA=" << sel.fba << " ";
|
||||
opt << "-D DTHE=" << sel.dthe << " ";
|
||||
opt << "-D PRIM=" << sel.prim << " ";
|
||||
opt << "-D TW=" << sel.tw << " ";
|
||||
opt << "-D LCM=" << sel.lcm << " ";
|
||||
opt << "-D MMIN=" << sel.mmin << " ";
|
||||
opt << "-D NOSCISSOR=" << sel.noscissor << " ";
|
||||
|
@ -1751,6 +1760,8 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
|
|||
|
||||
AddDefs(opt);
|
||||
|
||||
printf("building kernel (%s)\n", entry);
|
||||
|
||||
program.build(opt.str().c_str());
|
||||
}
|
||||
catch(cl::Error err)
|
||||
|
|
|
@ -64,18 +64,6 @@ class GSRendererCL : public GSRenderer
|
|||
operator uint32() const { return key; }
|
||||
};
|
||||
|
||||
union JobSelector
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32 dummy:1; // 0
|
||||
};
|
||||
|
||||
uint32 key;
|
||||
|
||||
operator uint32() const { return key; }
|
||||
};
|
||||
|
||||
union TFXSelector
|
||||
{
|
||||
struct
|
||||
|
@ -106,7 +94,7 @@ class GSRendererCL : public GSRenderer
|
|||
uint32 rfb:1; // 36
|
||||
uint32 zwrite:1; // 37
|
||||
uint32 ztest:1; // 38
|
||||
uint32 zoverflow:1; // 39 (z max >= 0x80000000)
|
||||
uint32 rzb:1; // 39
|
||||
uint32 wms:2; // 40
|
||||
uint32 wmt:2; // 42
|
||||
uint32 datm:1; // 44
|
||||
|
@ -114,12 +102,11 @@ class GSRendererCL : public GSRenderer
|
|||
uint32 fba:1; // 46
|
||||
uint32 dthe:1; // 47
|
||||
uint32 prim:2; // 48
|
||||
uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
|
||||
uint32 lcm:1; // 53
|
||||
uint32 mmin:2; // 54
|
||||
uint32 noscissor:1; // 55
|
||||
uint32 tpsm:4; // 56
|
||||
uint32 aem:1; // 60
|
||||
uint32 lcm:1; // 50
|
||||
uint32 mmin:2; // 51
|
||||
uint32 noscissor:1; // 53
|
||||
uint32 tpsm:4; // 54
|
||||
uint32 aem:1; // 58
|
||||
// TODO
|
||||
};
|
||||
|
||||
|
@ -177,12 +164,57 @@ class GSRendererCL : public GSRenderer
|
|||
uint32 clut[256];
|
||||
};
|
||||
|
||||
struct TFXJob
|
||||
class TFXJob
|
||||
{
|
||||
struct {int x, y, z, w;} rect;
|
||||
TFXSelector sel;
|
||||
public:
|
||||
struct { int x, y, z, w; } rect;
|
||||
TFXSelector sel; // uses primclass, solidrect only
|
||||
uint32 ib_start, ib_count;
|
||||
uint32 pb_start;
|
||||
GSVector4i* src_pages; // read by any texture level
|
||||
GSVector4i* dst_pages; // f/z writes to it
|
||||
|
||||
TFXJob()
|
||||
: src_pages(NULL)
|
||||
, dst_pages(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~TFXJob()
|
||||
{
|
||||
if(src_pages != NULL) _aligned_free(src_pages);
|
||||
if(dst_pages != NULL) _aligned_free(dst_pages);
|
||||
}
|
||||
|
||||
GSVector4i* GetSrcPages()
|
||||
{
|
||||
if(src_pages == NULL)
|
||||
{
|
||||
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
|
||||
|
||||
src_pages[0] = GSVector4i::zero();
|
||||
src_pages[1] = GSVector4i::zero();
|
||||
src_pages[2] = GSVector4i::zero();
|
||||
src_pages[3] = GSVector4i::zero();
|
||||
}
|
||||
|
||||
return src_pages;
|
||||
}
|
||||
|
||||
GSVector4i* GetDstPages()
|
||||
{
|
||||
if(dst_pages == NULL)
|
||||
{
|
||||
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
|
||||
|
||||
dst_pages[0] = GSVector4i::zero();
|
||||
dst_pages[1] = GSVector4i::zero();
|
||||
dst_pages[2] = GSVector4i::zero();
|
||||
dst_pages[3] = GSVector4i::zero();
|
||||
}
|
||||
|
||||
return dst_pages;
|
||||
}
|
||||
};
|
||||
|
||||
class CL
|
||||
|
@ -217,7 +249,7 @@ class GSRendererCL : public GSRenderer
|
|||
};
|
||||
|
||||
CL m_cl;
|
||||
std::list<TFXJob> m_jobs;
|
||||
std::list<shared_ptr<TFXJob>> m_jobs;
|
||||
uint32 m_vb_start;
|
||||
uint32 m_vb_count;
|
||||
|
||||
|
@ -283,9 +315,9 @@ protected:
|
|||
GSTexture* m_texture[2];
|
||||
uint8* m_output;
|
||||
|
||||
uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read)
|
||||
uint8 m_tex_pages[512];
|
||||
uint32 m_tmp_pages[512 + 1];
|
||||
GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
|
||||
GSVector4i m_tc_pages[4]; // invalidated texture cache pages
|
||||
GSVector4i m_tmp_pages[4];
|
||||
|
||||
void Reset();
|
||||
void VSync(int field);
|
||||
|
@ -297,12 +329,7 @@ protected:
|
|||
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
|
||||
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
|
||||
|
||||
void UsePages(const uint32* pages, int type);
|
||||
void ReleasePages(const uint32* pages, int type);
|
||||
|
||||
//bool CheckSourcePages(RasterizerData* data);
|
||||
|
||||
bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
|
||||
bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
|
||||
|
||||
public:
|
||||
GSRendererCL();
|
||||
|
|
|
@ -47,6 +47,10 @@ GSState::GSState()
|
|||
s_savez = !!theApp.GetConfig("savez", 0);
|
||||
s_saven = theApp.GetConfig("saven", 0);
|
||||
|
||||
//s_dump = 1;
|
||||
//s_save = 1;
|
||||
//s_savez = 1;
|
||||
|
||||
UserHacks_AggressiveCRC = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_AggressiveCRC", 0) : 0;
|
||||
UserHacks_DisableCrcHacks = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig( "UserHacks_DisableCrcHacks", 0 ) : 0;
|
||||
UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;
|
||||
|
|
|
@ -968,7 +968,8 @@ bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
|
|||
*fm |= pass ? 0 : 0xffffffff;
|
||||
break;
|
||||
case AFAIL_RGB_ONLY:
|
||||
*fm |= pass ? 0 : 0xff000000;
|
||||
if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000;
|
||||
if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000;
|
||||
*zm |= pass ? 0 : 0xffffffff;
|
||||
break;
|
||||
}
|
||||
|
@ -1248,7 +1249,7 @@ __kernel void KERNEL_TFX(
|
|||
fd = ReadFrame(vm, faddr, FPSM);
|
||||
}
|
||||
|
||||
if(ZTEST)
|
||||
if(RZB)
|
||||
{
|
||||
zd = ReadFrame(vm, zaddr, ZPSM);
|
||||
}
|
||||
|
@ -1284,7 +1285,7 @@ __kernel void KERNEL_TFX(
|
|||
wait_group_events(1, &e);
|
||||
}
|
||||
|
||||
if(ZTEST)
|
||||
if(RZB)
|
||||
{
|
||||
event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
|
||||
|
||||
|
@ -1409,7 +1410,7 @@ __kernel void KERNEL_TFX(
|
|||
|
||||
int4 ct;
|
||||
|
||||
if(FB && TFX != TFX_NONE)
|
||||
if(TFX != TFX_NONE)
|
||||
{
|
||||
// TODO
|
||||
|
||||
|
@ -1423,12 +1424,19 @@ __kernel void KERNEL_TFX(
|
|||
|
||||
if(!FST)
|
||||
{
|
||||
uv = convert_int2(t.xy * (1.0f / t.z));
|
||||
uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z));
|
||||
|
||||
if(LTF) uv -= 0x0008;
|
||||
}
|
||||
else
|
||||
{
|
||||
// sfex capcom logo third drawing call at (0,223) calculated as:
|
||||
// t0 + (p - p0) * (t - t0) / (p1 - p0)
|
||||
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
|
||||
// due to rounding errors (multiply-add instruction maybe):
|
||||
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
|
||||
// TODO: sw renderer samples at 112 exactly, check which one is correct
|
||||
|
||||
uv = convert_int2(t.xy);
|
||||
}
|
||||
|
||||
|
@ -1462,6 +1470,8 @@ __kernel void KERNEL_TFX(
|
|||
|
||||
// alpha tfx
|
||||
|
||||
int alpha = c.w;
|
||||
|
||||
if(FB)
|
||||
{
|
||||
if(TCC)
|
||||
|
@ -1512,7 +1522,7 @@ __kernel void KERNEL_TFX(
|
|||
|
||||
if(ZWRITE)
|
||||
{
|
||||
zd = bitselect(zs, zd, zm);
|
||||
zd = RZB ? bitselect(zs, zd, zm) : zs;
|
||||
}
|
||||
|
||||
// rgb tfx
|
||||
|
@ -1529,7 +1539,7 @@ __kernel void KERNEL_TFX(
|
|||
break;
|
||||
case TFX_HIGHLIGHT:
|
||||
case TFX_HIGHLIGHT2:
|
||||
c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff);
|
||||
c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1553,10 +1563,10 @@ __kernel void KERNEL_TFX(
|
|||
{
|
||||
if(DTHE && is16bit(FPSM))
|
||||
{
|
||||
// TODO: c += pb->dimx[y & 3]
|
||||
c.xyz += pb->dimx[y & 3][x & 3];
|
||||
}
|
||||
|
||||
c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff);
|
||||
c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff;
|
||||
|
||||
if(FBA && !is24bit(FPSM))
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue