Fewer OpenCL bugs; some games are actually playable now, but there are still texture errors.

This commit is contained in:
gabest11 2014-09-16 05:37:06 +02:00 committed by Gregory Hainaut
parent 881735b562
commit ba1e522bbb
7 changed files with 325 additions and 230 deletions

View File

@ -2711,7 +2711,7 @@ bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga)
case AFAIL_RGB_ONLY: case AFAIL_RGB_ONLY:
zm |= t; zm |= t;
fm |= t & T::xff000000(); fm |= t & T::xff000000(); // fpsm 16 bit => & 0xffff8000?
break; break;
default: default:

View File

@ -2082,3 +2082,45 @@ uint32* GSOffset::GetPages(const GSVector4i& rect, uint32* pages, GSVector4i* bb
return pages; return pages;
} }
// Returns a 512-bit page bitmap (4 x GSVector4i, one bit per page) marking every
// memory page touched by `rect` under this offset's block layout.
// If `pages` is NULL, a 16-byte-aligned buffer is allocated; the caller must
// release it with _aligned_free (see the declaration's comment).
// If `bbox` is non-NULL it receives the block/page-aligned rectangle actually scanned.
GSVector4i* GSOffset::GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages, GSVector4i* bbox)
{
	if(pages == NULL)
	{
		pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
	}

	for(int i = 0; i < 4; i++)
	{
		pages[i] = GSVector4i::zero();
	}

	// When the base pointer is page-aligned we can step a whole page at a time,
	// otherwise we have to walk it block by block.
	GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;

	GSVector4i r = rect.ralign<Align_Outside>(bs);

	if(bbox != NULL) *bbox = r;

	// The row/col block tables are indexed in units of 8 pixels.
	r = r.sra32(3);

	bs.x >>= 3;
	bs.y >>= 3;

	for(int y = r.top; y < r.bottom; y += bs.y)
	{
		uint32 base = block.row[y];

		for(int x = r.left; x < r.right; x += bs.x)
		{
			uint32 n = (base + block.col[x]) >> 5; // block number -> page number (32 blocks per page)

			if(n < MAX_PAGES)
			{
				// 1u, not 1: shifting a signed int into bit 31 is undefined behavior.
				((uint32*)pages)[n >> 5] |= 1u << (n & 31);
			}
		}
	}

	return pages;
}

View File

@ -54,6 +54,7 @@ public:
enum {EOP = 0xffffffff}; enum {EOP = 0xffffffff};
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL); uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
GSVector4i* GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages = NULL, GSVector4i* bbox = NULL); // free returned value with _aligned_free
}; };
struct GSPixelOffset struct GSPixelOffset

View File

@ -77,18 +77,16 @@ GSRendererCL::GSRendererCL()
{ {
m_nativeres = true; // ignore ini, sw is always native m_nativeres = true; // ignore ini, sw is always native
//s_dump = 1;
//s_save = 1;
//s_savez = 1;
// TODO: m_tc = new GSTextureCacheCL(this);
memset(m_texture, 0, sizeof(m_texture)); memset(m_texture, 0, sizeof(m_texture));
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
memset(m_rw_pages, 0, sizeof(m_rw_pages)); for(int i = 0; i < 4; i++)
memset(m_tex_pages, 0, sizeof(m_tex_pages)); {
m_rw_pages[0][i] = GSVector4i::zero();
m_rw_pages[1][i] = GSVector4i::zero();
m_tc_pages[i] = GSVector4i::xffffffff();
}
#define InitCVB(P) \ #define InitCVB(P) \
m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \ m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
@ -107,8 +105,6 @@ GSRendererCL::GSRendererCL()
GSRendererCL::~GSRendererCL() GSRendererCL::~GSRendererCL()
{ {
// TODO: delete m_tc;
for(size_t i = 0; i < countof(m_texture); i++) for(size_t i = 0; i < countof(m_texture); i++)
{ {
delete m_texture[i]; delete m_texture[i];
@ -121,18 +117,21 @@ void GSRendererCL::Reset()
{ {
Sync(-1); Sync(-1);
// TODO: m_tc->RemoveAll();
GSRenderer::Reset(); GSRenderer::Reset();
} }
static int pageuploads = 0;
static int pageuploadcount = 0;
static int tfxcount = 0;
void GSRendererCL::VSync(int field) void GSRendererCL::VSync(int field)
{ {
Sync(0); // IncAge might delete a cached texture in use Sync(0);
GSRenderer::VSync(field); GSRenderer::VSync(field);
// TODO: m_tc->IncAge(); printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount);
pageuploads = pageuploadcount = tfxcount = 0;
//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
} }
@ -364,7 +363,7 @@ void GSRendererCL::Draw()
{ {
// only allow batches of the same primclass in Enqueue // only allow batches of the same primclass in Enqueue
if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass) if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass)
{ {
Enqueue(); Enqueue();
} }
@ -372,6 +371,16 @@ void GSRendererCL::Draw()
// //
shared_ptr<TFXJob> job(new TFXJob());
job->rect.x = rect.x;
job->rect.y = rect.y;
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
@ -402,21 +411,12 @@ void GSRendererCL::Draw()
m_vb_count += m_vertex.next; m_vb_count += m_vertex.next;
if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail)) if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail))
{ {
return; return;
} }
TFXJob job; job->sel = pb->sel;
job.rect.x = rect.x;
job.rect.y = rect.y;
job.rect.z = rect.z;
job.rect.w = rect.w;
job.sel = pb->sel;
job.ib_start = m_cl.ib.tail;
job.ib_count = m_index.tail;
job.pb_start = m_cl.pb.tail;
m_jobs.push_back(job); m_jobs.push_back(job);
@ -424,29 +424,67 @@ void GSRendererCL::Draw()
m_cl.ib.tail += ib_size; m_cl.ib.tail += ib_size;
m_cl.pb.tail += pb_size; m_cl.pb.tail += pb_size;
// mark pages for writing // mark pages used in rendering as source or target
if(pb->sel.fb) if(pb->sel.fwrite || pb->sel.rfb)
{ {
uint8 flag = pb->sel.fb; m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages); if(pb->sel.rfb)
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{ {
m_rw_pages[*p] |= flag; for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
if(pb->sel.fwrite)
{
for(int i = 0; i < 4; i++)
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.fwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
}
} }
} }
if(pb->sel.zb) if(pb->sel.zwrite || pb->sel.rzb)
{ {
uint8 flag = pb->sel.zb; m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages); if(pb->sel.rzb)
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{ {
m_rw_pages[*p] |= flag; for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
if(pb->sel.zwrite)
{
for(int i = 0; i < 4; i++)
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.zwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
}
} }
} }
@ -456,52 +494,6 @@ void GSRendererCL::Draw()
{ {
Enqueue(); Enqueue();
} }
/*
// check if the texture is not part of a target currently in use
if(CheckSourcePages(data))
{
Sync(4);
}
// addref source and target pages
data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
*/
// update previously invalidated parts
//data->UpdateSource();
/*
if(LOG)
{
fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
sd->counter,
m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
PRIM->PRIM, sd->vertex_count, sd->index_count);
fflush(s_fp);
}
*/
//printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w);
/*
// invalidate new parts rendered onto
if(sd->global.sel.fwrite)
{
m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm);
}
if(sd->global.sel.zwrite)
{
m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm);
}
*/
} }
catch(cl::Error err) catch(cl::Error err)
{ {
@ -552,8 +544,11 @@ void GSRendererCL::Sync(int reason)
m_cl.queue[2].finish(); m_cl.queue[2].finish();
memset(m_rw_pages, 0, sizeof(m_rw_pages)); for(int i = 0; i < 4; i++)
memset(m_tex_pages, 0, sizeof(m_tex_pages)); {
m_rw_pages[0][i] = GSVector4i::zero();
m_rw_pages[1][i] = GSVector4i::zero();
}
// TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap,
// though it does not seem to be necessary even with GPU devices where it might be cached, // though it does not seem to be necessary even with GPU devices where it might be cached,
@ -569,13 +564,15 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
o->GetPages(r, m_tmp_pages); o->GetPagesAsBits(r, m_tmp_pages);
//if(!synced) //if(!synced)
{ {
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) for(int i = 0; i < 4; i++)
{ {
if(m_rw_pages[*p] & 3) // rw GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
{ {
Sync(3); Sync(3);
@ -584,9 +581,9 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
} }
} }
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) for(int i = 0; i < 4; i++)
{ {
m_tex_pages[*p] = 1; m_tc_pages[i] |= m_tmp_pages[i];
} }
} }
@ -598,11 +595,13 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
{ {
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
o->GetPages(r, m_tmp_pages); o->GetPagesAsBits(r, m_tmp_pages);
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) for(int i = 0; i < 4; i++)
{ {
if(m_rw_pages[*p] & 1) // w GSVector4i pages = m_rw_pages[1][i];
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
{ {
Sync(4); Sync(4);
@ -611,34 +610,6 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
} }
} }
} }
/*
bool GSRendererCL::CheckSourcePages(RasterizerData* data)
{
// TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync
{
for(size_t i = 0; data->m_tex[i].t != NULL; i++)
{
data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages);
uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n;
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{
// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
if(m_fzb_pages[*p]) // currently being drawn to? => sync
{
return true;
}
}
}
}
return false;
}
*/
//#include "GSTextureCL.h"
void GSRendererCL::Enqueue() void GSRendererCL::Enqueue()
{ {
@ -650,7 +621,7 @@ void GSRendererCL::Enqueue()
ASSERT(m_cl.ib.tail > m_cl.ib.head); ASSERT(m_cl.ib.tail > m_cl.ib.head);
ASSERT(m_cl.pb.tail > m_cl.pb.head); ASSERT(m_cl.pb.tail > m_cl.pb.head);
int primclass = m_jobs.front().sel.prim; int primclass = m_jobs.front()->sel.prim;
uint32 n; uint32 n;
@ -724,8 +695,8 @@ void GSRendererCL::Enqueue()
{ {
auto job = next++; auto job = next++;
uint32 cur_prim_count = job->ib_count / n; uint32 cur_prim_count = (*job)->ib_count / n;
uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0; uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0;
total_prim_count += cur_prim_count; total_prim_count += cur_prim_count;
@ -734,7 +705,7 @@ void GSRendererCL::Enqueue()
uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);
pk.setArg(3, (cl_uint)m_vb_start); pk.setArg(3, (cl_uint)m_vb_start);
pk.setArg(4, (cl_uint)head->ib_start); pk.setArg(4, (cl_uint)(*head)->ib_start);
m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
@ -748,7 +719,7 @@ void GSRendererCL::Enqueue()
for(auto i = head; i != next; i++) for(auto i = head; i != next; i++)
{ {
rect = rect.runion(GSVector4i::load<false>(&i->rect)); rect = rect.runion(GSVector4i::load<false>(&(*i)->rect));
} }
rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS; rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
@ -829,14 +800,40 @@ void GSRendererCL::Enqueue()
{ {
ASSERT(prim_start < MAX_PRIM_COUNT); ASSERT(prim_start < MAX_PRIM_COUNT);
uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start); uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
// TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text), tfxcount++;
// changed by tfx in the previous loop or marked by InvalidateVideoMem if((*i)->src_pages != NULL)
{
int count = 0;
for(int j = 0; j < 4; j++)
{
GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j];
if(!pages.eq(GSVector4i::zero()))
{
// TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex)
// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
for(int ii = 0; ii < 4; ii++)
for(int jj = 0; jj < 32; jj++)
if(pages.u32[ii] & (1 << jj)) count++;
m_tc_pages[j] &= ~(*i)->src_pages[j];
}
}
if(count > 0)
{
pageuploads += count;
pageuploadcount++;
}
}
// TODO: tile level z test // TODO: tile level z test
cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel);
if(tfx_prev != tfx()) if(tfx_prev != tfx())
{ {
@ -845,28 +842,32 @@ void GSRendererCL::Enqueue()
tfx_prev = tfx(); tfx_prev = tfx();
} }
tfx.setArg(4, (cl_uint)i->pb_start); tfx.setArg(4, (cl_uint)(*i)->pb_start);
tfx.setArg(5, (cl_uint)prim_start); tfx.setArg(5, (cl_uint)prim_start);
tfx.setArg(6, (cl_uint)prim_count_inner); tfx.setArg(6, (cl_uint)prim_count_inner);
tfx.setArg(7, (cl_uint)batch_count); tfx.setArg(7, (cl_uint)batch_count);
tfx.setArg(8, (cl_uint)bin_count); tfx.setArg(8, (cl_uint)bin_count);
tfx.setArg(9, bin_dim); tfx.setArg(9, bin_dim);
//m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256)); GSVector4i r = GSVector4i::load<false>(&(*i)->rect);
//printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y);
GSVector4i r = GSVector4i::load<false>(&i->rect);
r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)); r = r.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE));
/* /*
if(i->sel.IsSolidRect()) // TODO: simple mem fill if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask
;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height()); ;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height());
else else
*/ */
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
// TODO: invalidate texture cache pages if((*i)->dst_pages != NULL)
{
for(int j = 0; j < 4; j++)
{
m_tc_pages[j] |= (*i)->dst_pages[j];
}
}
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
prim_start += prim_count_inner; prim_start += prim_count_inner;
} }
@ -877,10 +878,12 @@ void GSRendererCL::Enqueue()
{ {
prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
job->ib_start += prim_count * n * sizeof(uint32); (*job)->ib_start += prim_count * n * sizeof(uint32);
job->ib_count -= prim_count * n; (*job)->ib_count -= prim_count * n;
next = job; // try again for the reminder next = job; // try again for the reminder
printf("split %d\n", (*job)->ib_count / n);
} }
break; break;
@ -929,7 +932,7 @@ static int RemapPSM(int psm)
return psm; return psm;
} }
bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
{ {
const GSDrawingEnvironment& env = m_env; const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context; const GSDrawingContext* context = m_context;
@ -970,7 +973,7 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
} }
bool fwrite; bool fwrite;
bool zwrite; bool zwrite = zm != 0xffffffff;
switch(context->FRAME.PSM) switch(context->FRAME.PSM)
{ {
@ -991,26 +994,6 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
break; break;
} }
switch(context->ZBUF.PSM)
{
default:
case PSM_PSMCT32:
case PSM_PSMZ32:
zwrite = zm != 0xffffffff;
break;
case PSM_PSMCT24:
case PSM_PSMZ24:
zwrite = (zm & 0x00ffffff) != 0x00ffffff;
break;
case PSM_PSMCT16:
case PSM_PSMCT16S:
case PSM_PSMZ16:
case PSM_PSMZ16S:
zm &= 0x80f8f8f8;
zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8;
break;
}
if(!fwrite && !zwrite) return false; if(!fwrite && !zwrite) return false;
bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
@ -1061,19 +1044,21 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
pb->sel.tfx = TFX_DECAL; pb->sel.tfx = TFX_DECAL;
} }
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
// TODO: if(t == NULL) {ASSERT(0); return false;}
GSVector4i r; GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf);
// TODO: data->SetSource(t, r, 0); GSVector4i* src_pages = job->GetSrcPages();
// TODO: pb->sel.tw = t->m_tw - 3; GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
o->GetPagesAsBits(r, m_tmp_pages);
// TODO: store r to current job for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
{ {
@ -1195,17 +1180,19 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
m_vt.m_min.t *= 0.5f; m_vt.m_min.t *= 0.5f;
m_vt.m_max.t *= 0.5f; m_vt.m_max.t *= 0.5f;
// TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3);
// TODO: if(t == NULL) {ASSERT(0); return false;}
GSVector4i r; GSVector4i r;
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf);
// TODO: data->SetSource(t, r, i); GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
o->GetPagesAsBits(r, m_tmp_pages);
// TODO: store r to current job for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
} }
s_counter++; s_counter++;
@ -1361,15 +1348,23 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
pb->afix = context->ALPHA.FIX; pb->afix = context->ALPHA.FIX;
} }
if(pb->sel.date if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1)
|| pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1
|| pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|| (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0
|| (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask
|| (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0)
{ {
pb->sel.rfb = 1; pb->sel.rfb = 1;
} }
else
{
if(fwrite)
{
if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY
|| (pb->sel.fpsm & 3) == 0 && fm != 0
|| (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
|| (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
{
pb->sel.rfb = 1;
}
}
}
pb->sel.colclamp = env.COLCLAMP.CLAMP; pb->sel.colclamp = env.COLCLAMP.CLAMP;
pb->sel.fba = context->FBA.FBA; pb->sel.fba = context->FBA.FBA;
@ -1391,7 +1386,22 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v
{ {
pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); pb->sel.zpsm = RemapPSM(context->ZBUF.PSM);
pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
if(ztest)
{
pb->sel.rzb = 1;
}
else
{
if(zwrite)
{
if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY)
|| (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
{
pb->sel.rzb = 1;
}
}
}
} }
pb->fm = fm; pb->fm = fm;
@ -1732,7 +1742,7 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
opt << "-D RFB=" << sel.rfb << " "; opt << "-D RFB=" << sel.rfb << " ";
opt << "-D ZWRITE=" << sel.zwrite << " "; opt << "-D ZWRITE=" << sel.zwrite << " ";
opt << "-D ZTEST=" << sel.ztest << " "; opt << "-D ZTEST=" << sel.ztest << " ";
opt << "-D ZOVERFLOW=" << sel.zoverflow << " "; opt << "-D RZB=" << sel.rzb << " ";
opt << "-D WMS=" << sel.wms << " "; opt << "-D WMS=" << sel.wms << " ";
opt << "-D WMT=" << sel.wmt << " "; opt << "-D WMT=" << sel.wmt << " ";
opt << "-D DATM=" << sel.datm << " "; opt << "-D DATM=" << sel.datm << " ";
@ -1740,7 +1750,6 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
opt << "-D FBA=" << sel.fba << " "; opt << "-D FBA=" << sel.fba << " ";
opt << "-D DTHE=" << sel.dthe << " "; opt << "-D DTHE=" << sel.dthe << " ";
opt << "-D PRIM=" << sel.prim << " "; opt << "-D PRIM=" << sel.prim << " ";
opt << "-D TW=" << sel.tw << " ";
opt << "-D LCM=" << sel.lcm << " "; opt << "-D LCM=" << sel.lcm << " ";
opt << "-D MMIN=" << sel.mmin << " "; opt << "-D MMIN=" << sel.mmin << " ";
opt << "-D NOSCISSOR=" << sel.noscissor << " "; opt << "-D NOSCISSOR=" << sel.noscissor << " ";
@ -1751,6 +1760,8 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
AddDefs(opt); AddDefs(opt);
printf("building kernel (%s)\n", entry);
program.build(opt.str().c_str()); program.build(opt.str().c_str());
} }
catch(cl::Error err) catch(cl::Error err)

View File

@ -64,18 +64,6 @@ class GSRendererCL : public GSRenderer
operator uint32() const { return key; } operator uint32() const { return key; }
}; };
union JobSelector
{
struct
{
uint32 dummy:1; // 0
};
uint32 key;
operator uint32() const { return key; }
};
union TFXSelector union TFXSelector
{ {
struct struct
@ -106,7 +94,7 @@ class GSRendererCL : public GSRenderer
uint32 rfb:1; // 36 uint32 rfb:1; // 36
uint32 zwrite:1; // 37 uint32 zwrite:1; // 37
uint32 ztest:1; // 38 uint32 ztest:1; // 38
uint32 zoverflow:1; // 39 (z max >= 0x80000000) uint32 rzb:1; // 39
uint32 wms:2; // 40 uint32 wms:2; // 40
uint32 wmt:2; // 42 uint32 wmt:2; // 42
uint32 datm:1; // 44 uint32 datm:1; // 44
@ -114,12 +102,11 @@ class GSRendererCL : public GSRenderer
uint32 fba:1; // 46 uint32 fba:1; // 46
uint32 dthe:1; // 47 uint32 dthe:1; // 47
uint32 prim:2; // 48 uint32 prim:2; // 48
uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) uint32 lcm:1; // 50
uint32 lcm:1; // 53 uint32 mmin:2; // 51
uint32 mmin:2; // 54 uint32 noscissor:1; // 53
uint32 noscissor:1; // 55 uint32 tpsm:4; // 54
uint32 tpsm:4; // 56 uint32 aem:1; // 58
uint32 aem:1; // 60
// TODO // TODO
}; };
@ -177,12 +164,57 @@ class GSRendererCL : public GSRenderer
uint32 clut[256]; uint32 clut[256];
}; };
struct TFXJob class TFXJob
{ {
struct {int x, y, z, w;} rect; public:
TFXSelector sel; struct { int x, y, z, w; } rect;
TFXSelector sel; // uses primclass, solidrect only
uint32 ib_start, ib_count; uint32 ib_start, ib_count;
uint32 pb_start; uint32 pb_start;
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
// Page bitmaps start out unallocated; GetSrcPages/GetDstPages allocate them on demand.
TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
{
}
// Releases the lazily-allocated page bitmaps (allocated with _aligned_malloc
// in GetSrcPages/GetDstPages, so they must be freed with _aligned_free).
virtual ~TFXJob()
{
if(src_pages != NULL) _aligned_free(src_pages);
if(dst_pages != NULL) _aligned_free(dst_pages);
}
// Lazily allocates the 512-bit source-page bitmap (4 x GSVector4i),
// zero-initialized on first use; owned by this job and freed in the destructor.
GSVector4i* GetSrcPages()
{
	if(src_pages == NULL)
	{
		src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

		for(int i = 0; i < 4; i++)
		{
			src_pages[i] = GSVector4i::zero();
		}
	}

	return src_pages;
}
// Lazily allocates the 512-bit destination-page bitmap (4 x GSVector4i),
// zero-initialized on first use; owned by this job and freed in the destructor.
GSVector4i* GetDstPages()
{
	if(dst_pages == NULL)
	{
		dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);

		for(int i = 0; i < 4; i++)
		{
			dst_pages[i] = GSVector4i::zero();
		}
	}

	return dst_pages;
}
}; };
class CL class CL
@ -217,7 +249,7 @@ class GSRendererCL : public GSRenderer
}; };
CL m_cl; CL m_cl;
std::list<TFXJob> m_jobs; std::list<shared_ptr<TFXJob>> m_jobs;
uint32 m_vb_start; uint32 m_vb_start;
uint32 m_vb_count; uint32 m_vb_count;
@ -282,10 +314,10 @@ protected:
// GSTextureCacheCL* m_tc; // GSTextureCacheCL* m_tc;
GSTexture* m_texture[2]; GSTexture* m_texture[2];
uint8* m_output; uint8* m_output;
uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read) GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
uint8 m_tex_pages[512]; GSVector4i m_tc_pages[4]; // invalidated texture cache pages
uint32 m_tmp_pages[512 + 1]; GSVector4i m_tmp_pages[4];
void Reset(); void Reset();
void VSync(int field); void VSync(int field);
@ -297,12 +329,7 @@ protected:
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
void UsePages(const uint32* pages, int type); bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
void ReleasePages(const uint32* pages, int type);
//bool CheckSourcePages(RasterizerData* data);
bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
public: public:
GSRendererCL(); GSRendererCL();

View File

@ -47,6 +47,10 @@ GSState::GSState()
s_savez = !!theApp.GetConfig("savez", 0); s_savez = !!theApp.GetConfig("savez", 0);
s_saven = theApp.GetConfig("saven", 0); s_saven = theApp.GetConfig("saven", 0);
//s_dump = 1;
//s_save = 1;
//s_savez = 1;
UserHacks_AggressiveCRC = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_AggressiveCRC", 0) : 0; UserHacks_AggressiveCRC = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_AggressiveCRC", 0) : 0;
UserHacks_DisableCrcHacks = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig( "UserHacks_DisableCrcHacks", 0 ) : 0; UserHacks_DisableCrcHacks = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig( "UserHacks_DisableCrcHacks", 0 ) : 0;
UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0; UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;

View File

@ -968,7 +968,8 @@ bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
*fm |= pass ? 0 : 0xffffffff; *fm |= pass ? 0 : 0xffffffff;
break; break;
case AFAIL_RGB_ONLY: case AFAIL_RGB_ONLY:
*fm |= pass ? 0 : 0xff000000; if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000;
if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000;
*zm |= pass ? 0 : 0xffffffff; *zm |= pass ? 0 : 0xffffffff;
break; break;
} }
@ -1248,7 +1249,7 @@ __kernel void KERNEL_TFX(
fd = ReadFrame(vm, faddr, FPSM); fd = ReadFrame(vm, faddr, FPSM);
} }
if(ZTEST) if(RZB)
{ {
zd = ReadFrame(vm, zaddr, ZPSM); zd = ReadFrame(vm, zaddr, ZPSM);
} }
@ -1284,7 +1285,7 @@ __kernel void KERNEL_TFX(
wait_group_events(1, &e); wait_group_events(1, &e);
} }
if(ZTEST) if(RZB)
{ {
event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0); event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0);
@ -1409,7 +1410,7 @@ __kernel void KERNEL_TFX(
int4 ct; int4 ct;
if(FB && TFX != TFX_NONE) if(TFX != TFX_NONE)
{ {
// TODO // TODO
@ -1423,13 +1424,20 @@ __kernel void KERNEL_TFX(
if(!FST) if(!FST)
{ {
uv = convert_int2(t.xy * (1.0f / t.z)); uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z));
if(LTF) uv -= 0x0008; if(LTF) uv -= 0x0008;
} }
else else
{ {
uv = convert_int2(t.xy); // sfex capcom logo third drawing call at (0,223) calculated as:
// t0 + (p - p0) * (t - t0) / (p1 - p0)
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
// due to rounding errors (multiply-add instruction maybe):
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
// TODO: sw renderer samples at 112 exactly, check which one is correct
uv = convert_int2(t.xy);
} }
int2 uvf = uv & 0x000f; int2 uvf = uv & 0x000f;
@ -1462,6 +1470,8 @@ __kernel void KERNEL_TFX(
// alpha tfx // alpha tfx
int alpha = c.w;
if(FB) if(FB)
{ {
if(TCC) if(TCC)
@ -1512,7 +1522,7 @@ __kernel void KERNEL_TFX(
if(ZWRITE) if(ZWRITE)
{ {
zd = bitselect(zs, zd, zm); zd = RZB ? bitselect(zs, zd, zm) : zs;
} }
// rgb tfx // rgb tfx
@ -1529,7 +1539,7 @@ __kernel void KERNEL_TFX(
break; break;
case TFX_HIGHLIGHT: case TFX_HIGHLIGHT:
case TFX_HIGHLIGHT2: case TFX_HIGHLIGHT2:
c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff); c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff);
break; break;
} }
} }
@ -1553,10 +1563,10 @@ __kernel void KERNEL_TFX(
{ {
if(DTHE && is16bit(FPSM)) if(DTHE && is16bit(FPSM))
{ {
// TODO: c += pb->dimx[y & 3] c.xyz += pb->dimx[y & 3][x & 3];
} }
c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff); c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff;
if(FBA && !is24bit(FPSM)) if(FBA && !is24bit(FPSM))
{ {