Fewer OpenCL bugs; some games are actually playable now, but there are still texture errors.

This commit is contained in:
gabest11 2014-09-16 05:37:06 +02:00 committed by Gregory Hainaut
parent 881735b562
commit ba1e522bbb
7 changed files with 325 additions and 230 deletions

View File

@ -2711,7 +2711,7 @@ bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga)
case AFAIL_RGB_ONLY:
zm |= t;
fm |= t & T::xff000000();
fm |= t & T::xff000000(); // fpsm 16 bit => & 0xffff8000?
break;
default:

View File

@ -2082,3 +2082,45 @@ uint32* GSOffset::GetPages(const GSVector4i& rect, uint32* pages, GSVector4i* bb
return pages;
}
// Computes which pages of GS local memory are touched by `rect` and returns
// them as a 512-bit set packed into four GSVector4i (bit n set => page n used).
// If `pages` is NULL a 16-byte-aligned 4-vector buffer is allocated; per the
// header declaration, the caller must release it with _aligned_free.
// If `bbox` is non-NULL it receives the block/page-aligned source rectangle.
GSVector4i* GSOffset::GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages, GSVector4i* bbox)
{
if(pages == NULL)
{
pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
}
// clear all 512 page bits before accumulating
pages[0] = GSVector4i::zero();
pages[1] = GSVector4i::zero();
pages[2] = GSVector4i::zero();
pages[3] = GSVector4i::zero();
// When the base pointer is page-aligned ((bp & 31) == 0) we can step a whole
// page at a time, otherwise walk at block granularity.
GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
GSVector4i r = rect.ralign<Align_Outside>(bs);
if(bbox != NULL) *bbox = r;
// Scale coordinates and step sizes down by 8 — NOTE(review): assumes the
// block.row/block.col lookup tables are indexed in these 1/8 units; confirm
// against the GSOffset construction code.
r = r.sra32(3);
bs.x >>= 3;
bs.y >>= 3;
for(int y = r.top; y < r.bottom; y += bs.y)
{
uint32 base = block.row[y];
for(int x = r.left; x < r.right; x += bs.x)
{
// block number >> 5 => page number (presumably 32 blocks per page — verify)
uint32 n = (base + block.col[x]) >> 5;
if(n < MAX_PAGES)
{
// set bit n of the 512-bit map (treating the 4 vectors as 16 uint32 words)
((uint32*)pages)[n >> 5] |= 1 << (n & 31);
}
}
}
return pages;
}

View File

@ -54,6 +54,7 @@ public:
enum {EOP = 0xffffffff};
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
GSVector4i* GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages = NULL, GSVector4i* bbox = NULL); // free returned value with _aligned_free
};
struct GSPixelOffset

View File

@ -77,18 +77,16 @@ GSRendererCL::GSRendererCL()
{
m_nativeres = true; // ignore ini, sw is always native
//s_dump = 1;
//s_save = 1;
//s_savez = 1;
// TODO: m_tc = new GSTextureCacheCL(this);
memset(m_texture, 0, sizeof(m_texture));
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
memset(m_rw_pages, 0, sizeof(m_rw_pages));
memset(m_tex_pages, 0, sizeof(m_tex_pages));
for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] = GSVector4i::zero();
m_rw_pages[1][i] = GSVector4i::zero();
m_tc_pages[i] = GSVector4i::xffffffff();
}
#define InitCVB(P) \
m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
@ -107,8 +105,6 @@ GSRendererCL::GSRendererCL()
GSRendererCL::~GSRendererCL()
{
// TODO: delete m_tc;
for(size_t i = 0; i < countof(m_texture); i++)
{
delete m_texture[i];
@ -121,18 +117,21 @@ void GSRendererCL::Reset()
{
Sync(-1);
// TODO: m_tc->RemoveAll();
GSRenderer::Reset();
}
static int pageuploads = 0;
static int pageuploadcount = 0;
static int tfxcount = 0;
void GSRendererCL::VSync(int field)
{
Sync(0); // IncAge might delete a cached texture in use
Sync(0);
GSRenderer::VSync(field);
// TODO: m_tc->IncAge();
printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount);
pageuploads = pageuploadcount = tfxcount = 0;
//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
}
@ -364,7 +363,7 @@ void GSRendererCL::Draw()
{
// only allow batches of the same primclass in Enqueue
if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass)
if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass)
{
Enqueue();
}
@ -372,6 +371,16 @@ void GSRendererCL::Draw()
//
shared_ptr<TFXJob> job(new TFXJob());
job->rect.x = rect.x;
job->rect.y = rect.y;
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
@ -402,21 +411,12 @@ void GSRendererCL::Draw()
m_vb_count += m_vertex.next;
if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail))
if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail))
{
return;
}
TFXJob job;
job.rect.x = rect.x;
job.rect.y = rect.y;
job.rect.z = rect.z;
job.rect.w = rect.w;
job.sel = pb->sel;
job.ib_start = m_cl.ib.tail;
job.ib_count = m_index.tail;
job.pb_start = m_cl.pb.tail;
job->sel = pb->sel;
m_jobs.push_back(job);
@ -424,29 +424,67 @@ void GSRendererCL::Draw()
m_cl.ib.tail += ib_size;
m_cl.pb.tail += pb_size;
// mark pages for writing
// mark pages used in rendering as source or target
if(pb->sel.fb)
if(pb->sel.fwrite || pb->sel.rfb)
{
uint8 flag = pb->sel.fb;
m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages);
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
if(pb->sel.rfb)
{
m_rw_pages[*p] |= flag;
for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
if(pb->sel.fwrite)
{
for(int i = 0; i < 4; i++)
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.fwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
}
}
}
if(pb->sel.zb)
if(pb->sel.zwrite || pb->sel.rzb)
{
uint8 flag = pb->sel.zb;
m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages);
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
if(pb->sel.rzb)
{
m_rw_pages[*p] |= flag;
for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
if(pb->sel.zwrite)
{
for(int i = 0; i < 4; i++)
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.zwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
}
}
}
@ -456,52 +494,6 @@ void GSRendererCL::Draw()
{
Enqueue();
}
/*
// check if the texture is not part of a target currently in use
if(CheckSourcePages(data))
{
Sync(4);
}
// addref source and target pages
data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
*/
// update previously invalidated parts
//data->UpdateSource();
/*
if(LOG)
{
fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
sd->counter,
m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
PRIM->PRIM, sd->vertex_count, sd->index_count);
fflush(s_fp);
}
*/
//printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w);
/*
// invalidate new parts rendered onto
if(sd->global.sel.fwrite)
{
m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm);
}
if(sd->global.sel.zwrite)
{
m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm);
}
*/
}
catch(cl::Error err)
{
@ -552,8 +544,11 @@ void GSRendererCL::Sync(int reason)
m_cl.queue[2].finish();
memset(m_rw_pages, 0, sizeof(m_rw_pages));
memset(m_tex_pages, 0, sizeof(m_tex_pages));
for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] = GSVector4i::zero();
m_rw_pages[1][i] = GSVector4i::zero();
}
// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap,
// though it does not seem to be necessary even with GPU devices where it might be cached,
@ -569,13 +564,15 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
o->GetPages(r, m_tmp_pages);
o->GetPagesAsBits(r, m_tmp_pages);
//if(!synced)
{
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
for(int i = 0; i < 4; i++)
{
if(m_rw_pages[*p] & 3) // rw
GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
{
Sync(3);
@ -584,9 +581,9 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
}
}
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
for(int i = 0; i < 4; i++)
{
m_tex_pages[*p] = 1;
m_tc_pages[i] |= m_tmp_pages[i];
}
}
@ -598,11 +595,13 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
o->GetPages(r, m_tmp_pages);
o->GetPagesAsBits(r, m_tmp_pages);
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
for(int i = 0; i < 4; i++)
{
if(m_rw_pages[*p] & 1) // w
GSVector4i pages = m_rw_pages[1][i];
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
{
Sync(4);
@ -611,34 +610,6 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
}
}
}
/*
bool GSRendererCL::CheckSourcePages(RasterizerData* data)
{
// TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync
{
for(size_t i = 0; data->m_tex[i].t != NULL; i++)
{
data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages);
uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n;
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{
// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
if(m_fzb_pages[*p]) // currently being drawn to? => sync
{
return true;
}
}
}
}
return false;
}
*/
//#include "GSTextureCL.h"
void GSRendererCL::Enqueue()
{
@ -650,7 +621,7 @@ void GSRendererCL::Enqueue()
ASSERT(m_cl.ib.tail > m_cl.ib.head);
ASSERT(m_cl.pb.tail > m_cl.pb.head);
int primclass = m_jobs.front().sel.prim;
int primclass = m_jobs.front()->sel.prim;
uint32 n;
@ -724,8 +695,8 @@ void GSRendererCL::Enqueue()
{
auto job = next++;
uint32 cur_prim_count = job->ib_count / n;
uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0;
uint32 cur_prim_count = (*job)->ib_count / n;
uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
total_prim_count += cur_prim_count;
@ -734,7 +705,7 @@ void GSRendererCL::Enqueue()
uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);
pk.setArg(3, (cl_uint)m_vb_start);
pk.setArg(4, (cl_uint)head->ib_start);
pk.setArg(4, (cl_uint)(*head)->ib_start);
m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
@ -748,7 +719,7 @@ void GSRendererCL::Enqueue()
for(auto i = head; i != next; i++)
{
rect = rect.runion(GSVector4i::load<false>(&i->rect));
rect = rect.runion(GSVector4i::load<false>(&(*i)->rect));
}
rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
@ -829,14 +800,40 @@ void GSRendererCL::Enqueue()
{
ASSERT(prim_start < MAX_PRIM_COUNT);
uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start);
uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
// TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text),
// changed by tfx in the previous loop or marked by InvalidateVideoMem
tfxcount++;
if((*i)->src_pages != NULL)
{
int count = 0;
for(int j = 0; j < 4; j++)
{
GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j];
if(!pages.eq(GSVector4i::zero()))
{
// TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex)
// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
for(int ii = 0; ii < 4; ii++)
for(int jj = 0; jj < 32; jj++)
if(pages.u32[ii] & (1 << jj)) count++;
m_tc_pages[j] &= ~(*i)->src_pages[j];
}
}
if(count > 0)
{
pageuploads += count;
pageuploadcount++;
}
}
// TODO: tile level z test
cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel);
cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);
if(tfx_prev != tfx())
{
@ -845,28 +842,32 @@ void GSRendererCL::Enqueue()
tfx_prev = tfx();
}
tfx.setArg(4, (cl_uint)i->pb_start);
tfx.setArg(4, (cl_uint)(*i)->pb_start);
tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count_inner);
tfx.setArg(7, (cl_uint)batch_count);
tfx.setArg(8, (cl_uint)bin_count);
tfx.setArg(9, bin_dim);
//m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256));
//printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y);
GSVector4i r = GSVector4i::load<false>(&i->rect);
GSVector4i r = GSVector4i::load<false>(&(*i)->rect);
r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
/*
if(i->sel.IsSolidRect()) // TODO: simple mem fill
if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask
;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
else
*/
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
// TODO: invalidate texture cache pages
if((*i)->dst_pages != NULL)
{
for(int j = 0; j < 4; j++)
{
m_tc_pages[j] |= (*i)->dst_pages[j];
}
}
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
prim_start += prim_count_inner;
}
@ -877,10 +878,12 @@ void GSRendererCL::Enqueue()
{
prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
job->ib_start += prim_count * n * sizeof(uint32);
job->ib_count -= prim_count * n;
(*job)->ib_start += prim_count * n * sizeof(uint32);
(*job)->ib_count -= prim_count * n;
next = job; // try again for the remainder
printf("split %d\n", (*job)->ib_count / n);
}
break;
@ -929,7 +932,7 @@ static int RemapPSM(int psm)
return psm;
}
bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
{
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context;
@ -970,7 +973,7 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
}
bool fwrite;
bool zwrite;
bool zwrite = zm != 0xffffffff;
switch(context->FRAME.PSM)
{
@ -991,26 +994,6 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
break;
}
switch(context->ZBUF.PSM)
{
default:
case PSM_PSMCT32:
case PSM_PSMZ32:
zwrite = zm != 0xffffffff;
break;
case PSM_PSMCT24:
case PSM_PSMZ24:
zwrite = (zm & 0x00ffffff) != 0x00ffffff;
break;
case PSM_PSMCT16:
case PSM_PSMCT16S:
case PSM_PSMZ16:
case PSM_PSMZ16S:
zm &= 0x80f8f8f8;
zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8;
break;
}
if(!fwrite && !zwrite) return false;
bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
@ -1061,19 +1044,21 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
pb->sel.tfx = TFX_DECAL;
}
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
// TODO: if(t == NULL) {ASSERT(0); return false;}
GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
// TODO: data->SetSource(t, r, 0);
GSVector4i* src_pages = job->GetSrcPages();
// TODO: pb->sel.tw = t->m_tw - 3;
GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
o->GetPagesAsBits(r, m_tmp_pages);
// TODO: store r to current job
for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
{
@ -1195,17 +1180,19 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
m_vt.m_min.t *= 0.5f;
m_vt.m_max.t *= 0.5f;
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3);
// TODO: if(t == NULL) {ASSERT(0); return false;}
GSVector4i r;
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
// TODO: data->SetSource(t, r, i);
GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
o->GetPagesAsBits(r, m_tmp_pages);
// TODO: store r to current job
for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
s_counter++;
@ -1361,15 +1348,23 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
pb->afix = context->ALPHA.FIX;
}
if(pb->sel.date
|| pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1
|| pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|| (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0
|| (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask
|| (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0)
if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1)
{
pb->sel.rfb = 1;
}
else
{
if(fwrite)
{
if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|| (pb->sel.fpsm & 3) == 0 && fm != 0
|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
{
pb->sel.rfb = 1;
}
}
}
pb->sel.colclamp = env.COLCLAMP.CLAMP;
pb->sel.fba = context->FBA.FBA;
@ -1391,7 +1386,22 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
{
pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
if(ztest)
{
pb->sel.rzb = 1;
}
else
{
if(zwrite)
{
if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY)
|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
{
pb->sel.rzb = 1;
}
}
}
}
pb->fm = fm;
@ -1732,7 +1742,7 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
opt << "-D RFB=" << sel.rfb << " ";
opt << "-D ZWRITE=" << sel.zwrite << " ";
opt << "-D ZTEST=" << sel.ztest << " ";
opt << "-D ZOVERFLOW=" << sel.zoverflow << " ";
opt << "-D RZB=" << sel.rzb << " ";
opt << "-D WMS=" << sel.wms << " ";
opt << "-D WMT=" << sel.wmt << " ";
opt << "-D DATM=" << sel.datm << " ";
@ -1740,7 +1750,6 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
opt << "-D FBA=" << sel.fba << " ";
opt << "-D DTHE=" << sel.dthe << " ";
opt << "-D PRIM=" << sel.prim << " ";
opt << "-D TW=" << sel.tw << " ";
opt << "-D LCM=" << sel.lcm << " ";
opt << "-D MMIN=" << sel.mmin << " ";
opt << "-D NOSCISSOR=" << sel.noscissor << " ";
@ -1751,6 +1760,8 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
AddDefs(opt);
printf("building kernel (%s)\n", entry);
program.build(opt.str().c_str());
}
catch(cl::Error err)

View File

@ -64,18 +64,6 @@ class GSRendererCL : public GSRenderer
operator uint32() const { return key; }
};
// Placeholder per-job selector: only a single dummy bit is defined, with the
// union providing raw uint32 access and implicit conversion for key compares.
union JobSelector
{
struct
{
uint32 dummy:1; // 0
};
uint32 key; // raw access to all selector bits
operator uint32() const { return key; }
};
union TFXSelector
{
struct
@ -106,7 +94,7 @@ class GSRendererCL : public GSRenderer
uint32 rfb:1; // 36
uint32 zwrite:1; // 37
uint32 ztest:1; // 38
uint32 zoverflow:1; // 39 (z max >= 0x80000000)
uint32 rzb:1; // 39
uint32 wms:2; // 40
uint32 wmt:2; // 42
uint32 datm:1; // 44
@ -114,12 +102,11 @@ class GSRendererCL : public GSRenderer
uint32 fba:1; // 46
uint32 dthe:1; // 47
uint32 prim:2; // 48
uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 lcm:1; // 53
uint32 mmin:2; // 54
uint32 noscissor:1; // 55
uint32 tpsm:4; // 56
uint32 aem:1; // 60
uint32 lcm:1; // 50
uint32 mmin:2; // 51
uint32 noscissor:1; // 53
uint32 tpsm:4; // 54
uint32 aem:1; // 58
// TODO
};
@ -177,12 +164,57 @@ class GSRendererCL : public GSRenderer
uint32 clut[256];
};
struct TFXJob
// One queued drawing batch for the OpenCL renderer: its draw rectangle,
// selector, index/parameter-buffer ranges, and the lazily allocated page
// bitmaps it reads from (src) and writes to (dst).
class TFXJob
{
// NOTE(review): these two private members duplicate the public ones below —
// this looks like residue of the struct->class conversion in this diff view;
// confirm against the committed file.
struct {int x, y, z, w;} rect;
TFXSelector sel;
public:
struct { int x, y, z, w; } rect; // draw rectangle in pixels
TFXSelector sel; // uses primclass, solidrect only
uint32 ib_start, ib_count; // index buffer start offset / index count
uint32 pb_start; // parameter buffer start offset
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
// Page sets start unallocated; GetSrcPages()/GetDstPages() allocate on demand.
TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}
virtual ~TFXJob()
{
// page sets were obtained with _aligned_malloc, so release accordingly
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}
// Returns the 512-bit source page set, allocating and zeroing it on first use.
GSVector4i* GetSrcPages()
{
if(src_pages == NULL)
{
src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
src_pages[0] = GSVector4i::zero();
src_pages[1] = GSVector4i::zero();
src_pages[2] = GSVector4i::zero();
src_pages[3] = GSVector4i::zero();
}
return src_pages;
}
// Returns the 512-bit destination page set, allocating and zeroing it on first use.
GSVector4i* GetDstPages()
{
if(dst_pages == NULL)
{
dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
dst_pages[0] = GSVector4i::zero();
dst_pages[1] = GSVector4i::zero();
dst_pages[2] = GSVector4i::zero();
dst_pages[3] = GSVector4i::zero();
}
return dst_pages;
}
};
class CL
@ -217,7 +249,7 @@ class GSRendererCL : public GSRenderer
};
CL m_cl;
std::list<TFXJob> m_jobs;
std::list<shared_ptr<TFXJob>> m_jobs;
uint32 m_vb_start;
uint32 m_vb_count;
@ -282,10 +314,10 @@ protected:
// GSTextureCacheCL* m_tc;
GSTexture* m_texture[2];
uint8* m_output;
uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read)
uint8 m_tex_pages[512];
uint32 m_tmp_pages[512 + 1];
GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
GSVector4i m_tc_pages[4]; // invalidated texture cache pages
GSVector4i m_tmp_pages[4];
void Reset();
void VSync(int field);
@ -297,12 +329,7 @@ protected:
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
void UsePages(const uint32* pages, int type);
void ReleasePages(const uint32* pages, int type);
//bool CheckSourcePages(RasterizerData* data);
bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
public:
GSRendererCL();

View File

@ -47,6 +47,10 @@ GSState::GSState()
s_savez = !!theApp.GetConfig("savez", 0);
s_saven = theApp.GetConfig("saven", 0);
//s_dump = 1;
//s_save = 1;
//s_savez = 1;
UserHacks_AggressiveCRC = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_AggressiveCRC", 0) : 0;
UserHacks_DisableCrcHacks = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig( "UserHacks_DisableCrcHacks", 0 ) : 0;
UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;

View File

@ -968,7 +968,8 @@ bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
*fm |= pass ? 0 : 0xffffffff;
break;
case AFAIL_RGB_ONLY:
*fm |= pass ? 0 : 0xff000000;
if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000;
if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000;
*zm |= pass ? 0 : 0xffffffff;
break;
}
@ -1248,7 +1249,7 @@ __kernel void KERNEL_TFX(
fd = ReadFrame(vm, faddr, FPSM);
}
if(ZTEST)
if(RZB)
{
zd = ReadFrame(vm, zaddr, ZPSM);
}
@ -1284,7 +1285,7 @@ __kernel void KERNEL_TFX(
wait_group_events(1, &e);
}
if(ZTEST)
if(RZB)
{
event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
@ -1409,7 +1410,7 @@ __kernel void KERNEL_TFX(
int4 ct;
if(FB && TFX != TFX_NONE)
if(TFX != TFX_NONE)
{
// TODO
@ -1423,13 +1424,20 @@ __kernel void KERNEL_TFX(
if(!FST)
{
uv = convert_int2(t.xy * (1.0f / t.z));
uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z));
if(LTF) uv -= 0x0008;
}
else
{
uv = convert_int2(t.xy);
// sfex capcom logo third drawing call at (0,223) calculated as:
// t0 + (p - p0) * (t - t0) / (p1 - p0)
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
// due to rounding errors (multiply-add instruction maybe):
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
// TODO: sw renderer samples at 112 exactly, check which one is correct
uv = convert_int2(t.xy);
}
int2 uvf = uv & 0x000f;
@ -1462,6 +1470,8 @@ __kernel void KERNEL_TFX(
// alpha tfx
int alpha = c.w;
if(FB)
{
if(TCC)
@ -1512,7 +1522,7 @@ __kernel void KERNEL_TFX(
if(ZWRITE)
{
zd = bitselect(zs, zd, zm);
zd = RZB ? bitselect(zs, zd, zm) : zs;
}
// rgb tfx
@ -1529,7 +1539,7 @@ __kernel void KERNEL_TFX(
break;
case TFX_HIGHLIGHT:
case TFX_HIGHLIGHT2:
c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff);
c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff);
break;
}
}
@ -1553,10 +1563,10 @@ __kernel void KERNEL_TFX(
{
if(DTHE && is16bit(FPSM))
{
// TODO: c += pb->dimx[y & 3]
c.xyz += pb->dimx[y & 3][x & 3];
}
c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff);
c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff;
if(FBA && !is24bit(FPSM))
{