squishing opencl bugs, there aren't many left hopefully

This commit is contained in:
gabest11 2014-09-17 08:52:25 +02:00 committed by Gregory Hainaut
parent ba1e522bbb
commit c64f9ad9b1
3 changed files with 278 additions and 195 deletions

View File

@ -126,11 +126,9 @@ static int tfxcount = 0;
void GSRendererCL::VSync(int field)
{
Sync(0);
GSRenderer::VSync(field);
printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount);
//printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount);
pageuploads = pageuploadcount = tfxcount = 0;
//if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize);
@ -371,24 +369,10 @@ void GSRendererCL::Draw()
//
shared_ptr<TFXJob> job(new TFXJob());
job->rect.x = rect.x;
job->rect.y = rect.y;
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
pb->scissor = scissor;
pb->bbox = bbox;
pb->rect = rect;
(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel?
if(m_jobs.empty())
@ -396,6 +380,7 @@ void GSRendererCL::Draw()
memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32));
m_vb_start = m_cl.vb.tail;
m_vb_count = 0;
}
else
{
@ -409,17 +394,45 @@ void GSRendererCL::Draw()
}
}
m_vb_count += m_vertex.next;
shared_ptr<TFXJob> job(new TFXJob());
if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail))
{
return;
}
pb->scissor = scissor;
if(bbox.eq(bbox.rintersect(scissor)))
{
pb->sel.noscissor = 1;
}
job->sel = pb->sel;
job->rect.x = rect.x;
job->rect.y = rect.y;
job->rect.z = rect.z;
job->rect.w = rect.w;
job->ib_start = m_cl.ib.tail;
job->ib_count = m_index.tail;
job->pb_start = m_cl.pb.tail;
#ifdef DEBUG
job->fbp = context->FRAME.Block();
job->fbw = context->FRAME.FBW;
job->fpsm = context->FRAME.PSM;
job->zbp = context->ZBUF.Block();
job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff;
job->tbw = PRIM->TME ? context->TEX0.TBW : 1;
job->tpsm = PRIM->TME ? context->TEX0.PSM : 0;
job->tw = PRIM->TME ? context->TEX0.TW : 0;
job->th = PRIM->TME ? context->TEX0.TH : 0;
#endif
m_jobs.push_back(job);
m_vb_count += m_vertex.next;
m_cl.vb.tail += vb_size;
m_cl.ib.tail += ib_size;
m_cl.pb.tail += pb_size;
@ -444,12 +457,9 @@ void GSRendererCL::Draw()
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.fwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
@ -475,12 +485,9 @@ void GSRendererCL::Draw()
{
m_rw_pages[1][i] |= m_tmp_pages[i];
}
}
GSVector4i* dst_pages = job->GetDstPages();
GSVector4i* dst_pages = job->GetDstPages();
if(pb->sel.zwrite)
{
for(int i = 0; i < 4; i++)
{
dst_pages[i] |= m_tmp_pages[i];
@ -488,6 +495,19 @@ void GSRendererCL::Draw()
}
}
if(job->src_pages != NULL)
{
for(int i = 0; i < 4; i++)
{
m_rw_pages[0][i] |= job->src_pages[i];
if(job->dst_pages != NULL && !(job->dst_pages[i] & job->src_pages[i]).eq(GSVector4i::zero()))
{
//printf("src and dst overlap!\n");
}
}
}
// don't buffer too much data, feed them to the device if there is enough
if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64)
@ -536,6 +556,8 @@ void GSRendererCL::Draw()
void GSRendererCL::Sync(int reason)
{
if(LOG) { fprintf(s_fp, "Sync (%d)\n", reason); fflush(s_fp); }
//printf("sync %d\n", reason);
GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
@ -574,7 +596,12 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
{
Sync(3);
// TODO: an awesome idea to avoid this Sync
// - call Enqueue() to flush m_jobs
// - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it
// - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written
Sync(3);
break;
}
@ -800,36 +827,28 @@ void GSRendererCL::Enqueue()
{
ASSERT(prim_start < MAX_PRIM_COUNT);
tfxcount++;
//if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); }
UpdateTextureCache((*i).get());
uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start);
tfxcount++;
if((*i)->src_pages != NULL)
/*
if(m_perfmon.GetFrame() >= 5036) if((*i)->src_pages != NULL)
{
int count = 0;
m_cl.queue[2].finish();
for(int j = 0; j < 4; j++)
{
GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j];
uint64 frame = m_perfmon.GetFrame();
if(!pages.eq(GSVector4i::zero()))
{
// TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex)
// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
std::string s;
for(int ii = 0; ii < 4; ii++)
for(int jj = 0; jj < 32; jj++)
if(pages.u32[ii] & (1 << jj)) count++;
s = format("c:\\temp1\\_%05d_f%lld_tex2_%05x_%d.bmp", s_n++, frame, (*i)->tbp, (*i)->tpsm);
m_tc_pages[j] &= ~(*i)->src_pages[j];
}
}
if(count > 0)
{
pageuploads += count;
pageuploadcount++;
}
m_mem.SaveBMP(s, (*i)->tbp, (*i)->tbw, (*i)->tpsm, 1 << (*i)->tw, 1 << (*i)->th);
}
*/
// TODO: tile level z test
@ -859,13 +878,22 @@ void GSRendererCL::Enqueue()
*/
m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16));
if((*i)->dst_pages != NULL)
/*
if(m_perfmon.GetFrame() >= 5036)
{
for(int j = 0; j < 4; j++)
{
m_tc_pages[j] |= (*i)->dst_pages[j];
}
m_cl.queue[2].finish();
uint64 frame = m_perfmon.GetFrame();
std::string s;
s = format("c:\\temp1\\_%05d_f%lld_rt2_%05x_%d.bmp", s_n++, frame, (*i)->fbp, (*i)->fpsm);
m_mem.SaveBMP(s, (*i)->fbp, (*i)->fbw, (*i)->fpsm, GetFrameRect().width(), 512);
}
*/
InvalidateTextureCache((*i).get());
// TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily
@ -881,9 +909,9 @@ void GSRendererCL::Enqueue()
(*job)->ib_start += prim_count * n * sizeof(uint32);
(*job)->ib_count -= prim_count * n;
next = job; // try again for the reminder
next = job; // try again for the remainder
printf("split %d\n", (*job)->ib_count / n);
//printf("split %d\n", (*job)->ib_count / n);
}
break;
@ -909,6 +937,96 @@ void GSRendererCL::Enqueue()
m_cl.Map();
}
void GSRendererCL::UpdateTextureCache(TFXJob* job)
{
if(job->src_pages == NULL) return;
int count = 0;
for(int i = 0; i < 4; i++)
{
GSVector4i pages = m_tc_pages[i] & job->src_pages[i];
if(pages.eq(GSVector4i::zero())) continue;
size_t page_size = 8192;
// TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded)
if(0) for(int j = 0; j < 4; j++)
{
if(pages.u32[j] == 0) continue;
if(pages.u32[j] == 0xffffffff)
{
size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * page_size;
m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 32);
if(LOG) { fprintf(s_fp, "tc (%d x32)\n", offset >> 13); fflush(s_fp); }
pageuploadcount++;
count += 32;
continue;
}
for(int k = 0; k < 4; k++)
{
uint8 b = pages.u8[j * 4 + k];
if(b == 0) continue;
if(b == 0xff)
{
size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * page_size;
m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 8);
if(LOG) { fprintf(s_fp, "tc (%d x8)\n", offset >> 13); fflush(s_fp); }
pageuploadcount++;
count += 8;
continue;
}
for(int l = 0; l < 8; l++)
{
if(b & (1 << l))
{
size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * page_size;
m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size);
if(LOG) { fprintf(s_fp, "tc (%d x1)\n", offset >> 13); fflush(s_fp); }
pageuploadcount++;
count++;
}
}
}
}
m_tc_pages[i] &= ~job->src_pages[i];
}
if(count > 0)
{
pageuploads += count;
}
}
void GSRendererCL::InvalidateTextureCache(TFXJob* job)
{
if(job->dst_pages == NULL) return;
for(int j = 0; j < 4; j++)
{
m_tc_pages[j] |= job->dst_pages[j];
}
}
static int RemapPSM(int psm)
{
switch(psm)
@ -1057,7 +1175,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0)
@ -1191,7 +1308,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
for(int i = 0; i < 4; i++)
{
src_pages[i] |= m_tmp_pages[i];
m_rw_pages[0][i] |= m_tmp_pages[i];
}
}
@ -1428,11 +1544,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver
pb->zm |= 0xffff0000;
}
if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor)))
{
pb->sel.noscissor = 1;
}
pb->fbp = context->FRAME.Block();
pb->zbp = context->ZBUF.Block();
pb->bw = context->FRAME.FBW;
@ -1481,8 +1592,10 @@ GSRendererCL::CL::CL()
#ifdef IOCL_DEBUG
if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
#else
//if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
//if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL)
//if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL)
if(type == CL_DEVICE_TYPE_GPU)
#endif
{
devices.push_back(device);
@ -1595,28 +1708,16 @@ static void AddDefs(ostringstream& opt)
#endif
}
cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
{
auto i = prim_map.find(sel);
// TODO: cache binary on disk
if(i != prim_map.end())
{
return i->second;
}
char entry[256];
sprintf(entry, "prim_%02x", sel);
printf("building kernel (%s)\n", entry);
cl::Program program = cl::Program(context, kernel_str);
try
{
ostringstream opt;
opt << "-D KERNEL_PRIM=" << entry << " ";
opt << "-D PRIM=" << sel.prim << " ";
AddDefs(opt);
program.build(opt.str().c_str());
@ -1636,7 +1737,28 @@ cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
throw err;
}
cl::Kernel k(program, entry);
return cl::Kernel(program, entry);
}
cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
{
auto i = prim_map.find(sel);
if(i != prim_map.end())
{
return i->second;
}
char entry[256];
sprintf(entry, "prim_%02x", sel);
ostringstream opt;
opt << "-D KERNEL_PRIM=" << entry << " ";
opt << "-D PRIM=" << sel.prim << " ";
cl::Kernel k = Build(entry, opt);
prim_map[sel] = k;
@ -1658,37 +1780,14 @@ cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel)
sprintf(entry, "tile_%02x", sel);
cl::Program program = cl::Program(context, kernel_str);
ostringstream opt;
try
{
ostringstream opt;
opt << "-D KERNEL_TILE=" << entry << " ";
opt << "-D PRIM=" << sel.prim << " ";
opt << "-D MODE=" << sel.mode << " ";
opt << "-D CLEAR=" << sel.clear << " ";
opt << "-D KERNEL_TILE=" << entry << " ";
opt << "-D PRIM=" << sel.prim << " ";
opt << "-D MODE=" << sel.mode << " ";
opt << "-D CLEAR=" << sel.clear << " ";
AddDefs(opt);
program.build(opt.str().c_str());
}
catch(cl::Error err)
{
if(err.err() == CL_BUILD_PROGRAM_FAILURE)
{
for(auto device : devices)
{
auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
printf("kernel (%s) build error: %s\n", entry, s.c_str());
}
}
throw err;
}
cl::Kernel k(program, entry);
cl::Kernel k = Build(entry, opt);
tile_map[sel] = k;
@ -1708,78 +1807,53 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
char entry[256];
sprintf(entry, "tfx_%016x", sel);
sprintf(entry, "tfx_%016llx", sel);
cl::Program program = cl::Program(context, kernel_str);
ostringstream opt;
try
{
ostringstream opt;
opt << "-D KERNEL_TFX=" << entry << " ";
opt << "-D FPSM=" << sel.fpsm << " ";
opt << "-D ZPSM=" << sel.zpsm << " ";
opt << "-D ZTST=" << sel.ztst << " ";
opt << "-D ATST=" << sel.atst << " ";
opt << "-D AFAIL=" << sel.afail << " ";
opt << "-D IIP=" << sel.iip << " ";
opt << "-D TFX=" << sel.tfx << " ";
opt << "-D TCC=" << sel.tcc << " ";
opt << "-D FST=" << sel.fst << " ";
opt << "-D LTF=" << sel.ltf << " ";
opt << "-D TLU=" << sel.tlu << " ";
opt << "-D FGE=" << sel.fge << " ";
opt << "-D DATE=" << sel.date << " ";
opt << "-D ABE=" << sel.abe << " ";
opt << "-D ABA=" << sel.aba << " ";
opt << "-D ABB=" << sel.abb << " ";
opt << "-D ABC=" << sel.abc << " ";
opt << "-D ABD=" << sel.abd << " ";
opt << "-D PABE=" << sel.pabe << " ";
opt << "-D AA1=" << sel.aa1 << " ";
opt << "-D FWRITE=" << sel.fwrite << " ";
opt << "-D FTEST=" << sel.ftest << " ";
opt << "-D RFB=" << sel.rfb << " ";
opt << "-D ZWRITE=" << sel.zwrite << " ";
opt << "-D ZTEST=" << sel.ztest << " ";
opt << "-D RZB=" << sel.rzb << " ";
opt << "-D WMS=" << sel.wms << " ";
opt << "-D WMT=" << sel.wmt << " ";
opt << "-D DATM=" << sel.datm << " ";
opt << "-D COLCLAMP=" << sel.colclamp << " ";
opt << "-D FBA=" << sel.fba << " ";
opt << "-D DTHE=" << sel.dthe << " ";
opt << "-D PRIM=" << sel.prim << " ";
opt << "-D LCM=" << sel.lcm << " ";
opt << "-D MMIN=" << sel.mmin << " ";
opt << "-D NOSCISSOR=" << sel.noscissor << " ";
opt << "-D TPSM=" << sel.tpsm << " ";
opt << "-D AEM=" << sel.aem << " ";
opt << "-D FB=" << sel.fb << " ";
opt << "-D ZB=" << sel.zb << " ";
opt << "-D KERNEL_TFX=" << entry << " ";
opt << "-D FPSM=" << sel.fpsm << " ";
opt << "-D ZPSM=" << sel.zpsm << " ";
opt << "-D ZTST=" << sel.ztst << " ";
opt << "-D ATST=" << sel.atst << " ";
opt << "-D AFAIL=" << sel.afail << " ";
opt << "-D IIP=" << sel.iip << " ";
opt << "-D TFX=" << sel.tfx << " ";
opt << "-D TCC=" << sel.tcc << " ";
opt << "-D FST=" << sel.fst << " ";
opt << "-D LTF=" << sel.ltf << " ";
opt << "-D TLU=" << sel.tlu << " ";
opt << "-D FGE=" << sel.fge << " ";
opt << "-D DATE=" << sel.date << " ";
opt << "-D ABE=" << sel.abe << " ";
opt << "-D ABA=" << sel.aba << " ";
opt << "-D ABB=" << sel.abb << " ";
opt << "-D ABC=" << sel.abc << " ";
opt << "-D ABD=" << sel.abd << " ";
opt << "-D PABE=" << sel.pabe << " ";
opt << "-D AA1=" << sel.aa1 << " ";
opt << "-D FWRITE=" << sel.fwrite << " ";
opt << "-D FTEST=" << sel.ftest << " ";
opt << "-D RFB=" << sel.rfb << " ";
opt << "-D ZWRITE=" << sel.zwrite << " ";
opt << "-D ZTEST=" << sel.ztest << " ";
opt << "-D RZB=" << sel.rzb << " ";
opt << "-D WMS=" << sel.wms << " ";
opt << "-D WMT=" << sel.wmt << " ";
opt << "-D DATM=" << sel.datm << " ";
opt << "-D COLCLAMP=" << sel.colclamp << " ";
opt << "-D FBA=" << sel.fba << " ";
opt << "-D DTHE=" << sel.dthe << " ";
opt << "-D PRIM=" << sel.prim << " ";
opt << "-D LCM=" << sel.lcm << " ";
opt << "-D MMIN=" << sel.mmin << " ";
opt << "-D NOSCISSOR=" << sel.noscissor << " ";
opt << "-D TPSM=" << sel.tpsm << " ";
opt << "-D AEM=" << sel.aem << " ";
opt << "-D FB=" << sel.fb << " ";
opt << "-D ZB=" << sel.zb << " ";
AddDefs(opt);
printf("building kernel (%s)\n", entry);
program.build(opt.str().c_str());
}
catch(cl::Error err)
{
if(err.err() == CL_BUILD_PROGRAM_FAILURE)
{
for(auto device : devices)
{
auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
printf("kernel (%s) build error: %s\n", entry, s.c_str());
}
}
throw err;
}
cl::Kernel k(program, entry);
cl::Kernel k = Build(entry, opt);
tfx_map[sel] = k;

View File

@ -146,8 +146,6 @@ class GSRendererCL : public GSRenderer
__aligned(struct, 32) TFXParameter
{
GSVector4i scissor;
GSVector4i bbox;
GSVector4i rect;
GSVector4i dimx; // 4x4 signed char
TFXSelector sel;
uint32 fbp, zbp, bw;
@ -173,7 +171,9 @@ class GSRendererCL : public GSRenderer
uint32 pb_start;
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
#ifdef DEBUG
uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th;
#endif
TFXJob()
: src_pages(NULL)
, dst_pages(NULL)
@ -224,6 +224,8 @@ class GSRendererCL : public GSRenderer
std::map<uint32, cl::Kernel> tile_map;
std::map<uint64, cl::Kernel> tfx_map;
cl::Kernel Build(const char* entry, ostringstream& opt);
public:
std::vector<cl::Device> devices;
cl::Context context;
@ -254,6 +256,8 @@ class GSRendererCL : public GSRenderer
uint32 m_vb_count;
void Enqueue();
void UpdateTextureCache(TFXJob* job);
void InvalidateTextureCache(TFXJob* job);
/*
class RasterizerData : public GSAlignedClass<32>
@ -311,13 +315,12 @@ class GSRendererCL : public GSRenderer
};
*/
protected:
// GSTextureCacheCL* m_tc;
GSTexture* m_texture[2];
uint8* m_output;
GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
GSVector4i m_tc_pages[4]; // invalidated texture cache pages
GSVector4i m_tmp_pages[4];
GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?)
GSVector4i m_tmp_pages[4]; // TODO: this should be block level, too many overlaps inside pages with render targets
void Reset();
void VSync(int field);

View File

@ -44,8 +44,6 @@ typedef struct
typedef struct
{
int4 scissor;
int4 bbox;
int4 rect;
char dimx[4][4];
ulong sel;
uint fbp, zbp, bw;
@ -663,9 +661,9 @@ __kernel void KERNEL_PRIM(
env->barycentric[prim_index] = b;
}
else
else // triangle has zero area
{
// TODO: set b.zero to something that always fails the tests
pmax = -1; // won't get included in any tile
}
}
else if(PRIM == GS_SPRITE_CLASS)
@ -769,7 +767,7 @@ __kernel void KERNEL_TILE(
uchar4 r = bbox_cache[group_prim_index];
uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
uint test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y);
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
{
@ -862,7 +860,7 @@ __kernel void KERNEL_TILE(
{
uchar4 r = bbox_cache[i];
BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y);
BIN_TYPE test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y);
if(PRIM == GS_TRIANGLE_CLASS && test != 0)
{
@ -1146,7 +1144,7 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p
c = pb->clut[vm[addr]];
break;
case PSM_PSMT4:
c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f];
c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f];
break;
case PSM_PSMT8H:
c = pb->clut[vm32[addr] >> 24];
@ -1159,11 +1157,20 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p
break;
}
//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]);
//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]);
return convert_int4(c);
}
// TODO: 2x2 MSAA idea
// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
// hittest and ztest 2x2 (create write mask, only skip if all -1)
// calculate color 1x1, alpha tests 1x1
// use mask to filter failed sub-pixels when writing to __local
// needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered
// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average
__kernel void KERNEL_TFX(
__global gs_env* env,
__global uchar* vm,
@ -1437,7 +1444,7 @@ __kernel void KERNEL_TFX(
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
// TODO: sw renderer samples at 112 exactly, check which one is correct
uv = convert_int2(t.xy);
uv = convert_int2_rte(t.xy);
}
int2 uvf = uv & 0x000f;
@ -1497,7 +1504,7 @@ __kernel void KERNEL_TFX(
{
if(!ABE || c.w == 0x80)
{
// TODO: c.w = coverage; // coverage 0x80 at 100%
c.w = /*edge ? coverage :*/ 0x80; // TODO
}
}
}
@ -1619,7 +1626,6 @@ __kernel void KERNEL_TFX(
if(FWRITE)
{
WriteFrame(vm, faddr, FPSM, fd);
//WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments);
}
}
}