GSdx: Better multi-threading for the sw renderer. Threads need to be synchronized a lot less often (about 1/10th as often on average), can run in parallel for longer, and use more CPU (with a bit more empty spinning, too). There could be some new bugs, as usual.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4992 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-12-18 08:13:20 +00:00
parent 4b77052d21
commit f318e84aca
20 changed files with 980 additions and 818 deletions

View File

@ -76,14 +76,9 @@ void GPUDrawScanline::BeginDraw(const void* param)
m_sp = m_sp_map[sel];
}
void GPUDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
void GPUDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels)
{
m_ds_map.UpdateStats(stats, frame);
}
void GPUDrawScanline::PrintStats()
{
m_ds_map.PrintStats();
m_ds_map.UpdateStats(frame, ticks, pixels);
}
#ifndef ENABLE_JIT_RASTERIZER

View File

@ -42,8 +42,7 @@ public:
// IDrawScanline
void BeginDraw(const void* param);
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
void PrintStats();
void EndDraw(uint64 frame, uint64 ticks, int pixels);
#ifndef ENABLE_JIT_RASTERIZER

View File

@ -301,6 +301,11 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
return;
}
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
// xmm2 = s
// xmm3 = t
// xmm7 = test
@ -953,7 +958,7 @@ void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr)
if(m_sel.tlu) movzx(eax, byte[esi + eax]);
const Address& src = m_sel.tlu ? ptr[eax * 2 + (size_t)m_local.gd->clut] : ptr[esi + eax * 2];
const Address& src = m_sel.tlu ? ptr[edx + eax * 2] : ptr[esi + eax * 2];
if(i == 0) movd(dst, src);
else pinsrw(dst, src, (uint8)i);

View File

@ -29,13 +29,15 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
{
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
m_rl.Create<GPUDrawScanline>(threads);
m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads);
}
// Tears down the renderer: releases the output texture, the rasterizer list
// and the aligned output pixel buffer.
GPURendererSW::~GPURendererSW()
{
delete m_texture;
// m_rl holds the pointer returned by GSRasterizerList::Create in the constructor
delete m_rl;
// m_output was obtained with _aligned_malloc in the constructor, so it is released with _aligned_free
_aligned_free(m_output);
}
@ -67,12 +69,34 @@ GSTexture* GPURendererSW::GetOutput()
void GPURendererSW::Draw()
{
class GPURasterizerData : public GSRasterizerData
{
public:
GPURasterizerData()
{
GPUScanlineGlobalData* gd = (GPUScanlineGlobalData*)_aligned_malloc(sizeof(GPUScanlineGlobalData), 32);
gd->clut = NULL;
param = gd;
}
virtual ~GPURasterizerData()
{
GPUScanlineGlobalData* gd = (GPUScanlineGlobalData*)param;
if(gd->clut) _aligned_free(gd->clut);
_aligned_free(gd);
}
};
shared_ptr<GSRasterizerData> data(new GPURasterizerData());
GPUScanlineGlobalData& gd = *(GPUScanlineGlobalData*)data->param;
const GPUDrawingEnvironment& env = m_env;
//
GPUScanlineGlobalData gd;
gd.sel.key = 0;
gd.sel.iip = env.PRIM.IIP;
gd.sel.me = env.STATUS.ME;
@ -97,7 +121,11 @@ void GPURendererSW::Draw()
if(!t) {ASSERT(0); return;}
gd.tex = t;
gd.clut = m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y);
gd.clut = (uint16*)_aligned_malloc(sizeof(uint16) * 256, 32);
memcpy(gd.clut, m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y), sizeof(uint16) * (env.STATUS.TP == 0 ? 16 : 256));
gd.twin = GSVector4i(env.TWIN.TWW, env.TWIN.TWH, env.TWIN.TWX, env.TWIN.TWY);
}
@ -108,25 +136,22 @@ void GPURendererSW::Draw()
gd.vm = m_mem.GetPixelAddress(0, 0);
//
data->vertices = (GSVertexSW*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16);
memcpy(data->vertices, m_vertices, sizeof(GSVertexSW) * m_count);
data->count = m_count;
GSRasterizerData data;
data->frame = m_perfmon.GetFrame();
data.vertices = m_vertices;
data.count = m_count;
data.frame = m_perfmon.GetFrame();
data.param = &gd;
data.scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
data.scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
data.scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
data.scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
data->scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
data->scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
switch(env.PRIM.TYPE)
{
case GPU_POLYGON: data.primclass = GS_TRIANGLE_CLASS; break;
case GPU_LINE: data.primclass = GS_LINE_CLASS; break;
case GPU_SPRITE: data.primclass = GS_SPRITE_CLASS; break;
case GPU_POLYGON: data->primclass = GS_TRIANGLE_CLASS; break;
case GPU_LINE: data->primclass = GS_LINE_CLASS; break;
case GPU_SPRITE: data->primclass = GS_SPRITE_CLASS; break;
default: __assume(0);
}
@ -135,34 +160,34 @@ void GPURendererSW::Draw()
GSVector4 tl(+1e10f);
GSVector4 br(-1e10f);
GSVertexSW* v = data->vertices;
for(int i = 0, j = m_count; i < j; i++)
{
GSVector4 p = m_vertices[i].p;
GSVector4 p = v[i].p;
tl = tl.min(p);
br = br.max(p);
}
GSVector4i r = GSVector4i(tl.xyxy(br)).rintersect(data.scissor);
data->bbox = GSVector4i(tl.xyxy(br));
GSVector4i r = data->bbox.rintersect(data->scissor);
r.left >>= m_scale.x;
r.top >>= m_scale.y;
r.right >>= m_scale.x;
r.bottom >>= m_scale.y;
m_rl.Draw(&data, r.width(), r.height());
Invalidate(r);
m_rl.Sync();
m_rl->Queue(data);
GSRasterizerStats stats;
m_rl->Sync();
m_rl.GetStats(stats);
m_perfmon.Put(GSPerfMon::Draw, 1);
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
// TODO: m_perfmon.Put(GSPerfMon::Draw, 1);
// TODO: m_perfmon.Put(GSPerfMon::Prim, stats.prims);
// TODO: m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
}
void GPURendererSW::VertexKick()

View File

@ -27,7 +27,7 @@
class GPURendererSW : public GPURendererT<GSVertexSW>
{
protected:
GSRasterizerList m_rl;
GSRasterizerList* m_rl;
GSTexture* m_texture;
uint32* m_output;

View File

@ -62,7 +62,7 @@ __aligned(struct, 32) GPUScanlineGlobalData
void* vm;
const void* tex;
const uint16* clut;
uint16* clut;
GSVector4i twin; // TWW, TWH, TWX, TWY
};

View File

@ -95,14 +95,9 @@ void GSDrawScanline::BeginDraw(const void* param)
m_sp = m_sp_map[sel];
}
void GSDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
void GSDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels)
{
m_ds_map.UpdateStats(stats, frame);
}
void GSDrawScanline::PrintStats()
{
m_ds_map.PrintStats();
m_ds_map.UpdateStats(frame, ticks, pixels);
}
#ifndef ENABLE_JIT_RASTERIZER

View File

@ -51,8 +51,7 @@ public:
// IDrawScanline
void BeginDraw(const void* param);
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
void PrintStats();
void EndDraw(uint64 frame, uint64 ticks, int pixels);
void DrawRect(const GSVector4i& r, const GSVertexSW& v);

View File

@ -63,6 +63,7 @@ L("loop");
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// - xmm0
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
@ -688,7 +689,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
mov(ebx, ptr[&m_local.gd->tex[0]]);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
// ebx = tex
// edx = clut
if(!m_sel.fst)
{
@ -1095,7 +1102,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
return;
}
mov(edx, (size_t)m_local.gd->tex);
push(ebp);
mov(ebp, (size_t)m_local.gd->tex);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
if(!m_sel.fst)
{
@ -1477,255 +1491,258 @@ return;
vpsrlw(xmm6, 8);
}
if(m_sel.mmin == 1) return; // round-off mode
vmovdqa(ptr[&m_local.temp.trb], xmm5);
vmovdqa(ptr[&m_local.temp.tga], xmm6);
vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]);
vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]);
vpsrad(xmm2, 1);
vpsrad(xmm3, 1);
vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
vpsrlw(xmm5, 1);
vpsrlw(xmm6, 1);
if(m_sel.ltf)
if(m_sel.mmin != 1) // !round-off mode
{
// u -= 0x8000;
// v -= 0x8000;
vmovdqa(ptr[&m_local.temp.trb], xmm5);
vmovdqa(ptr[&m_local.temp.tga], xmm6);
mov(eax, 0x8000);
vmovd(xmm4, eax);
vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]);
vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]);
vpsubd(xmm2, xmm4);
vpsubd(xmm3, xmm4);
vpsrad(xmm2, 1);
vpsrad(xmm3, 1);
// GSVector4i uf = u.xxzzlh().srl16(1);
vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
vpsrlw(xmm5, 1);
vpsrlw(xmm6, 1);
if(m_sel.ltf)
{
// u -= 0x8000;
// v -= 0x8000;
mov(eax, 0x8000);
vmovd(xmm4, eax);
vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
vpsubd(xmm2, xmm4);
vpsubd(xmm3, xmm4);
// GSVector4i uf = u.xxzzlh().srl16(1);
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vmovdqa(ptr[&m_local.temp.uf], xmm0);
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vmovdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
// GSVector4i vf = v.xxzzlh().srl16(1);
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vmovdqa(ptr[&m_local.temp.vf], xmm0);
}
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vmovdqa(ptr[&m_local.temp.vf], xmm0);
}
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
vpsrad(xmm2, 16);
vpsrad(xmm3, 16);
vpackssdw(xmm2, xmm3);
vpsrad(xmm2, 16);
vpsrad(xmm3, 16);
vpackssdw(xmm2, xmm3);
if(m_sel.ltf)
{
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
if(m_sel.ltf)
{
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
vpcmpeqd(xmm1, xmm1);
vpsrlw(xmm1, 15);
vpaddw(xmm3, xmm2, xmm1);
vpcmpeqd(xmm1, xmm1);
vpsrlw(xmm1, 15);
vpaddw(xmm3, xmm2, xmm1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
}
WrapLOD(xmm2);
}
// xmm2 = uv0
// xmm3 = uv1 (ltf)
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
// xmm7 = used
// xmm2 = uv0
// xmm3 = uv1 (ltf)
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
// xmm7 = used
// GSVector4i x0 = uv0.upl16();
// GSVector4i y0 = uv0.uph16() << tw;
// GSVector4i x0 = uv0.upl16();
// GSVector4i y0 = uv0.uph16() << tw;
vpxor(xmm0, xmm0);
vpxor(xmm0, xmm0);
vpunpcklwd(xmm4, xmm2, xmm0);
vpunpckhwd(xmm2, xmm2, xmm0);
vpslld(xmm2, m_sel.tw + 3);
// xmm0 = 0
// xmm2 = y0
// xmm3 = uv1 (ltf)
// xmm4 = x0
// xmm1, xmm5, xmm6 = free
// xmm7 = used
if(m_sel.ltf)
{
// GSVector4i x1 = uv1.upl16();
// GSVector4i y1 = uv1.uph16() << tw;
vpunpcklwd(xmm6, xmm3, xmm0);
vpunpckhwd(xmm3, xmm3, xmm0);
vpslld(xmm3, m_sel.tw + 3);
vpunpcklwd(xmm4, xmm2, xmm0);
vpunpckhwd(xmm2, xmm2, xmm0);
vpslld(xmm2, m_sel.tw + 3);
// xmm0 = 0
// xmm2 = y0
// xmm3 = y1
// xmm3 = uv1 (ltf)
// xmm4 = x0
// xmm6 = x1
// xmm0, xmm5, xmm6 = free
// xmm1, xmm5, xmm6 = free
// xmm7 = used
// GSVector4i addr00 = y0 + x0;
// GSVector4i addr01 = y0 + x1;
// GSVector4i addr10 = y1 + x0;
// GSVector4i addr11 = y1 + x1;
if(m_sel.ltf)
{
// GSVector4i x1 = uv1.upl16();
// GSVector4i y1 = uv1.uph16() << tw;
vpaddd(xmm5, xmm2, xmm4);
vpaddd(xmm2, xmm2, xmm6);
vpaddd(xmm0, xmm3, xmm4);
vpaddd(xmm3, xmm3, xmm6);
vpunpcklwd(xmm6, xmm3, xmm0);
vpunpckhwd(xmm3, xmm3, xmm0);
vpslld(xmm3, m_sel.tw + 3);
// xmm5 = addr00
// xmm2 = addr01
// xmm0 = addr10
// xmm3 = addr11
// xmm1, xmm4, xmm6 = free
// xmm7 = used
// xmm2 = y0
// xmm3 = y1
// xmm4 = x0
// xmm6 = x1
// xmm0, xmm5, xmm6 = free
// xmm7 = used
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
// GSVector4i addr00 = y0 + x0;
// GSVector4i addr01 = y0 + x1;
// GSVector4i addr10 = y1 + x0;
// GSVector4i addr11 = y1 + x1;
ReadTexel(4, 1);
vpaddd(xmm5, xmm2, xmm4);
vpaddd(xmm2, xmm2, xmm6);
vpaddd(xmm0, xmm3, xmm4);
vpaddd(xmm3, xmm3, xmm6);
// xmm6 = c00
// xmm4 = c01
// xmm1 = c10
// xmm5 = c11
// xmm0, xmm2, xmm3 = free
// xmm7 = used
// xmm5 = addr00
// xmm2 = addr01
// xmm0 = addr10
// xmm3 = addr11
// xmm1, xmm4, xmm6 = free
// xmm7 = used
vmovdqa(xmm0, ptr[&m_local.temp.uf]);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
// GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask;
ReadTexel(4, 1);
vpsllw(xmm2, xmm6, 8);
vpsrlw(xmm2, 8);
vpsrlw(xmm6, 8);
// xmm6 = c00
// xmm4 = c01
// xmm1 = c10
// xmm5 = c11
// xmm0, xmm2, xmm3 = free
// xmm7 = used
// GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask;
vmovdqa(xmm0, ptr[&m_local.temp.uf]);
vpsllw(xmm3, xmm4, 8);
vpsrlw(xmm3, 8);
vpsrlw(xmm4, 8);
// GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask;
// xmm0 = uf
// xmm2 = rb00
// xmm3 = rb01
// xmm6 = ga00
// xmm4 = ga01
// xmm1 = c10
// xmm5 = c11
// xmm7 = used
vpsllw(xmm2, xmm6, 8);
vpsrlw(xmm2, 8);
vpsrlw(xmm6, 8);
// rb00 = rb00.lerp16<0>(rb01, uf);
// ga00 = ga00.lerp16<0>(ga01, uf);
// GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask;
lerp16(xmm3, xmm2, xmm0, 0);
lerp16(xmm4, xmm6, xmm0, 0);
vpsllw(xmm3, xmm4, 8);
vpsrlw(xmm3, 8);
vpsrlw(xmm4, 8);
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = c10
// xmm5 = c11
// xmm2, xmm6 = free
// xmm7 = used
// xmm0 = uf
// xmm2 = rb00
// xmm3 = rb01
// xmm6 = ga00
// xmm4 = ga01
// xmm1 = c10
// xmm5 = c11
// xmm7 = used
// GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask;
// rb00 = rb00.lerp16<0>(rb01, uf);
// ga00 = ga00.lerp16<0>(ga01, uf);
vpsrlw(xmm2, xmm1, 8);
vpsllw(xmm1, 8);
vpsrlw(xmm1, 8);
lerp16(xmm3, xmm2, xmm0, 0);
lerp16(xmm4, xmm6, xmm0, 0);
// GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask;
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = c10
// xmm5 = c11
// xmm2, xmm6 = free
// xmm7 = used
vpsrlw(xmm6, xmm5, 8);
vpsllw(xmm5, 8);
vpsrlw(xmm5, 8);
// GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask;
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = rb10
// xmm5 = rb11
// xmm2 = ga10
// xmm6 = ga11
// xmm7 = used
vpsrlw(xmm2, xmm1, 8);
vpsllw(xmm1, 8);
vpsrlw(xmm1, 8);
// rb10 = rb10.lerp16<0>(rb11, uf);
// ga10 = ga10.lerp16<0>(ga11, uf);
// GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask;
lerp16(xmm5, xmm1, xmm0, 0);
lerp16(xmm6, xmm2, xmm0, 0);
vpsrlw(xmm6, xmm5, 8);
vpsllw(xmm5, 8);
vpsrlw(xmm5, 8);
// xmm3 = rb00
// xmm4 = ga00
// xmm5 = rb10
// xmm6 = ga10
// xmm0, xmm1, xmm2 = free
// xmm7 = used
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = rb10
// xmm5 = rb11
// xmm2 = ga10
// xmm6 = ga11
// xmm7 = used
// rb00 = rb00.lerp16<0>(rb10, vf);
// ga00 = ga00.lerp16<0>(ga10, vf);
// rb10 = rb10.lerp16<0>(rb11, uf);
// ga10 = ga10.lerp16<0>(ga11, uf);
vmovdqa(xmm0, ptr[&m_local.temp.vf]);
lerp16(xmm5, xmm1, xmm0, 0);
lerp16(xmm6, xmm2, xmm0, 0);
lerp16(xmm5, xmm3, xmm0, 0);
lerp16(xmm6, xmm4, xmm0, 0);
}
else
{
// GSVector4i addr00 = y0 + x0;
// xmm3 = rb00
// xmm4 = ga00
// xmm5 = rb10
// xmm6 = ga10
// xmm0, xmm1, xmm2 = free
// xmm7 = used
vpaddd(xmm5, xmm2, xmm4);
// rb00 = rb00.lerp16<0>(rb10, vf);
// ga00 = ga00.lerp16<0>(ga10, vf);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
vmovdqa(xmm0, ptr[&m_local.temp.vf]);
ReadTexel(1, 1);
lerp16(xmm5, xmm3, xmm0, 0);
lerp16(xmm6, xmm4, xmm0, 0);
}
else
{
// GSVector4i addr00 = y0 + x0;
// GSVector4i mask = GSVector4i::x00ff();
vpaddd(xmm5, xmm2, xmm4);
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
vpsllw(xmm5, xmm6, 8);
vpsrlw(xmm5, 8);
vpsrlw(xmm6, 8);
ReadTexel(1, 1);
// GSVector4i mask = GSVector4i::x00ff();
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
vpsllw(xmm5, xmm6, 8);
vpsrlw(xmm5, 8);
vpsrlw(xmm6, 8);
}
vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
vpsrlw(xmm0, xmm0, 1);
vmovdqa(xmm2, ptr[&m_local.temp.trb]);
vmovdqa(xmm3, ptr[&m_local.temp.tga]);
lerp16(xmm5, xmm2, xmm0, 0);
lerp16(xmm6, xmm3, xmm0, 0);
}
vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
vpsrlw(xmm0, xmm0, 1);
vmovdqa(xmm2, ptr[&m_local.temp.trb]);
vmovdqa(xmm3, ptr[&m_local.temp.tga]);
lerp16(xmm5, xmm2, xmm0, 0);
lerp16(xmm6, xmm3, xmm0, 0);
pop(ebp);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
@ -2592,8 +2609,9 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
mov(eax, ptr[esp + _top]);
and(eax, 3);
shl(eax, 5);
vpaddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]);
vpaddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]);
mov(ebp, ptr[&m_local.gd->dimx]);
vpaddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
vpaddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
}
// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
@ -2739,7 +2757,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
// xmm0 = addr10
// xmm3 = addr11
// ebx = m_local.tex[0] (!m_sel.mmin)
// edx = m_local.tex (m_sel.mmin)
// ebp = m_local.tex (m_sel.mmin)
// edx = m_local.clut (m_sel.tlu)
// out
// xmm6 = c00
@ -2765,7 +2784,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
for(int i = 0; i < pixels; i++)
{
@ -2784,7 +2803,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
if(m_sel.mmin && m_sel.lcm)
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
}
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
@ -2801,7 +2820,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
{
const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4];
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
if(i == 0) vmovd(eax, addr);
else vpextrd(eax, addr, i);

View File

@ -63,6 +63,7 @@ L("loop");
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// - xmm0
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
@ -693,7 +694,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
mov(ebx, ptr[&m_local.gd->tex[0]]);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
// ebx = tex
// edx = clut
if(!m_sel.fst)
{
@ -1144,7 +1151,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
return;
}
mov(edx, (size_t)m_local.gd->tex);
push(ebp);
mov(ebp, (size_t)m_local.gd->tex);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
if(!m_sel.fst)
{
@ -1544,267 +1558,270 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
psrlw(xmm6, 8);
}
if(m_sel.mmin == 1) return; // round-off mode
movdqa(ptr[&m_local.temp.trb], xmm5);
movdqa(ptr[&m_local.temp.tga], xmm6);
movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
psrad(xmm2, 1);
psrad(xmm3, 1);
movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
psrlw(xmm5, 1);
psrlw(xmm6, 1);
if(m_sel.ltf)
if(m_sel.mmin != 1) // !round-off mode
{
// u -= 0x8000;
// v -= 0x8000;
movdqa(ptr[&m_local.temp.trb], xmm5);
movdqa(ptr[&m_local.temp.tga], xmm6);
mov(eax, 0x8000);
movd(xmm4, eax);
pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
psubd(xmm2, xmm4);
psubd(xmm3, xmm4);
psrad(xmm2, 1);
psrad(xmm3, 1);
// GSVector4i uf = u.xxzzlh().srl16(1);
movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
psrlw(xmm5, 1);
psrlw(xmm6, 1);
if(m_sel.ltf)
{
// u -= 0x8000;
// v -= 0x8000;
mov(eax, 0x8000);
movd(xmm4, eax);
pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
psubd(xmm2, xmm4);
psubd(xmm3, xmm4);
// GSVector4i uf = u.xxzzlh().srl16(1);
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
movdqa(ptr[&m_local.temp.uf], xmm0);
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
movdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
// GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
psrad(xmm2, 16);
psrad(xmm3, 16);
packssdw(xmm2, xmm3);
psrad(xmm2, 16);
psrad(xmm3, 16);
packssdw(xmm2, xmm3);
if(m_sel.ltf)
{
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
if(m_sel.ltf)
{
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
movdqa(xmm3, xmm2);
pcmpeqd(xmm1, xmm1);
psrlw(xmm1, 15);
paddw(xmm3, xmm1);
movdqa(xmm3, xmm2);
pcmpeqd(xmm1, xmm1);
psrlw(xmm1, 15);
paddw(xmm3, xmm1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
}
WrapLOD(xmm2);
}
// xmm2 = uv0
// xmm3 = uv1 (ltf)
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
// xmm7 = used
// xmm2 = uv0
// xmm3 = uv1 (ltf)
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
// xmm7 = used
// GSVector4i x0 = uv0.upl16();
// GSVector4i y0 = uv0.uph16() << tw;
// GSVector4i x0 = uv0.upl16();
// GSVector4i y0 = uv0.uph16() << tw;
pxor(xmm0, xmm0);
pxor(xmm0, xmm0);
movdqa(xmm4, xmm2);
punpckhwd(xmm2, xmm0);
punpcklwd(xmm4, xmm0);
pslld(xmm2, m_sel.tw + 3);
// xmm0 = 0
// xmm2 = y0
// xmm3 = uv1 (ltf)
// xmm4 = x0
// xmm1, xmm5, xmm6 = free
// xmm7 = used
if(m_sel.ltf)
{
// GSVector4i x1 = uv1.upl16();
// GSVector4i y1 = uv1.uph16() << tw;
movdqa(xmm6, xmm3);
punpckhwd(xmm3, xmm0);
punpcklwd(xmm6, xmm0);
pslld(xmm3, m_sel.tw + 3);
movdqa(xmm4, xmm2);
punpckhwd(xmm2, xmm0);
punpcklwd(xmm4, xmm0);
pslld(xmm2, m_sel.tw + 3);
// xmm0 = 0
// xmm2 = y0
// xmm3 = y1
// xmm3 = uv1 (ltf)
// xmm4 = x0
// xmm6 = x1
// xmm0, xmm5, xmm6 = free
// xmm1, xmm5, xmm6 = free
// xmm7 = used
// GSVector4i addr00 = y0 + x0;
// GSVector4i addr01 = y0 + x1;
// GSVector4i addr10 = y1 + x0;
// GSVector4i addr11 = y1 + x1;
if(m_sel.ltf)
{
// GSVector4i x1 = uv1.upl16();
// GSVector4i y1 = uv1.uph16() << tw;
movdqa(xmm5, xmm2);
paddd(xmm5, xmm4);
paddd(xmm2, xmm6);
movdqa(xmm6, xmm3);
punpckhwd(xmm3, xmm0);
punpcklwd(xmm6, xmm0);
pslld(xmm3, m_sel.tw + 3);
movdqa(xmm0, xmm3);
paddd(xmm0, xmm4);
paddd(xmm3, xmm6);
// xmm2 = y0
// xmm3 = y1
// xmm4 = x0
// xmm6 = x1
// xmm0, xmm5, xmm6 = free
// xmm7 = used
// xmm5 = addr00
// xmm2 = addr01
// xmm0 = addr10
// xmm3 = addr11
// xmm1, xmm4, xmm6 = free
// xmm7 = used
// GSVector4i addr00 = y0 + x0;
// GSVector4i addr01 = y0 + x1;
// GSVector4i addr10 = y1 + x0;
// GSVector4i addr11 = y1 + x1;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
movdqa(xmm5, xmm2);
paddd(xmm5, xmm4);
paddd(xmm2, xmm6);
ReadTexel(4, 1);
movdqa(xmm0, xmm3);
paddd(xmm0, xmm4);
paddd(xmm3, xmm6);
// xmm6 = c00
// xmm4 = c01
// xmm1 = c10
// xmm5 = c11
// xmm0, xmm2, xmm3 = free
// xmm7 = used
// xmm5 = addr00
// xmm2 = addr01
// xmm0 = addr10
// xmm3 = addr11
// xmm1, xmm4, xmm6 = free
// xmm7 = used
movdqa(xmm0, ptr[&m_local.temp.uf]);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
// GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask;
ReadTexel(4, 1);
movdqa(xmm2, xmm6);
psllw(xmm2, 8);
psrlw(xmm2, 8);
psrlw(xmm6, 8);
// xmm6 = c00
// xmm4 = c01
// xmm1 = c10
// xmm5 = c11
// xmm0, xmm2, xmm3 = free
// xmm7 = used
// GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask;
movdqa(xmm0, ptr[&m_local.temp.uf]);
movdqa(xmm3, xmm4);
psllw(xmm3, 8);
psrlw(xmm3, 8);
psrlw(xmm4, 8);
// GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask;
// xmm0 = uf
// xmm2 = rb00
// xmm3 = rb01
// xmm6 = ga00
// xmm4 = ga01
// xmm1 = c10
// xmm5 = c11
// xmm7 = used
movdqa(xmm2, xmm6);
psllw(xmm2, 8);
psrlw(xmm2, 8);
psrlw(xmm6, 8);
// rb00 = rb00.lerp16<0>(rb01, uf);
// ga00 = ga00.lerp16<0>(ga01, uf);
// GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask;
lerp16(xmm3, xmm2, xmm0, 0);
lerp16(xmm4, xmm6, xmm0, 0);
movdqa(xmm3, xmm4);
psllw(xmm3, 8);
psrlw(xmm3, 8);
psrlw(xmm4, 8);
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = c10
// xmm5 = c11
// xmm2, xmm6 = free
// xmm7 = used
// xmm0 = uf
// xmm2 = rb00
// xmm3 = rb01
// xmm6 = ga00
// xmm4 = ga01
// xmm1 = c10
// xmm5 = c11
// xmm7 = used
// GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask;
// rb00 = rb00.lerp16<0>(rb01, uf);
// ga00 = ga00.lerp16<0>(ga01, uf);
movdqa(xmm2, xmm1);
psllw(xmm1, 8);
psrlw(xmm1, 8);
psrlw(xmm2, 8);
lerp16(xmm3, xmm2, xmm0, 0);
lerp16(xmm4, xmm6, xmm0, 0);
// GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask;
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = c10
// xmm5 = c11
// xmm2, xmm6 = free
// xmm7 = used
movdqa(xmm6, xmm5);
psllw(xmm5, 8);
psrlw(xmm5, 8);
psrlw(xmm6, 8);
// GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask;
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = rb10
// xmm5 = rb11
// xmm2 = ga10
// xmm6 = ga11
// xmm7 = used
movdqa(xmm2, xmm1);
psllw(xmm1, 8);
psrlw(xmm1, 8);
psrlw(xmm2, 8);
// rb10 = rb10.lerp16<0>(rb11, uf);
// ga10 = ga10.lerp16<0>(ga11, uf);
// GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask;
lerp16(xmm5, xmm1, xmm0, 0);
lerp16(xmm6, xmm2, xmm0, 0);
movdqa(xmm6, xmm5);
psllw(xmm5, 8);
psrlw(xmm5, 8);
psrlw(xmm6, 8);
// xmm3 = rb00
// xmm4 = ga00
// xmm5 = rb10
// xmm6 = ga10
// xmm0, xmm1, xmm2 = free
// xmm7 = used
// xmm0 = uf
// xmm3 = rb00
// xmm4 = ga00
// xmm1 = rb10
// xmm5 = rb11
// xmm2 = ga10
// xmm6 = ga11
// xmm7 = used
// rb00 = rb00.lerp16<0>(rb10, vf);
// ga00 = ga00.lerp16<0>(ga10, vf);
// rb10 = rb10.lerp16<0>(rb11, uf);
// ga10 = ga10.lerp16<0>(ga11, uf);
movdqa(xmm0, ptr[&m_local.temp.vf]);
lerp16(xmm5, xmm1, xmm0, 0);
lerp16(xmm6, xmm2, xmm0, 0);
lerp16(xmm5, xmm3, xmm0, 0);
lerp16(xmm6, xmm4, xmm0, 0);
}
else
{
// GSVector4i addr00 = y0 + x0;
// xmm3 = rb00
// xmm4 = ga00
// xmm5 = rb10
// xmm6 = ga10
// xmm0, xmm1, xmm2 = free
// xmm7 = used
paddd(xmm2, xmm4);
movdqa(xmm5, xmm2);
// rb00 = rb00.lerp16<0>(rb10, vf);
// ga00 = ga00.lerp16<0>(ga10, vf);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
movdqa(xmm0, ptr[&m_local.temp.vf]);
ReadTexel(1, 1);
lerp16(xmm5, xmm3, xmm0, 0);
lerp16(xmm6, xmm4, xmm0, 0);
}
else
{
// GSVector4i addr00 = y0 + x0;
// GSVector4i mask = GSVector4i::x00ff();
paddd(xmm2, xmm4);
movdqa(xmm5, xmm2);
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
movdqa(xmm5, xmm6);
psllw(xmm5, 8);
psrlw(xmm5, 8);
psrlw(xmm6, 8);
ReadTexel(1, 1);
// GSVector4i mask = GSVector4i::x00ff();
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
movdqa(xmm5, xmm6);
psllw(xmm5, 8);
psrlw(xmm5, 8);
psrlw(xmm6, 8);
}
movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
psrlw(xmm0, 1);
movdqa(xmm2, ptr[&m_local.temp.trb]);
movdqa(xmm3, ptr[&m_local.temp.tga]);
lerp16(xmm5, xmm2, xmm0, 0);
lerp16(xmm6, xmm3, xmm0, 0);
}
movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
psrlw(xmm0, 1);
movdqa(xmm2, ptr[&m_local.temp.trb]);
movdqa(xmm3, ptr[&m_local.temp.tga]);
lerp16(xmm5, xmm2, xmm0, 0);
lerp16(xmm6, xmm3, xmm0, 0);
pop(ebp);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
@ -2727,8 +2744,9 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
mov(eax, ptr[esp + _top]);
and(eax, 3);
shl(eax, 5);
paddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]);
paddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]);
mov(ebp, ptr[&m_local.gd->dimx]);
paddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
paddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
}
// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
@ -2902,7 +2920,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
// xmm0 = addr10
// xmm3 = addr11
// ebx = m_local.tex[0] (!m_sel.mmin)
// edx = m_local.tex (m_sel.mmin)
// ebp = m_local.tex (m_sel.mmin)
// edx = m_local.clut (m_sel.tlu)
// out
// xmm6 = c00
@ -2930,7 +2949,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
for(int i = 0; i < pixels; i++)
{
@ -2951,7 +2970,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
movdqa(ptr[&m_local.temp.test], xmm7);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4);
@ -2959,7 +2978,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
@ -2970,7 +2989,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
punpckldq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
@ -2978,7 +2997,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm5, xmm5, 0);
ReadTexel(xmm2, xmm2, 0);
@ -2990,7 +3009,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
punpcklqdq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm0, 0);
psrldq(xmm0, 4);
@ -2998,7 +3017,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
@ -3009,7 +3028,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
punpckldq(xmm5, xmm7);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
@ -3017,7 +3036,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm0, xmm0, 0);
ReadTexel(xmm3, xmm3, 0);
@ -3033,13 +3052,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
else
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
@ -3047,13 +3066,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
punpckldq(xmm6, xmm1);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm4, xmm5, 0);
// psrldq(xmm5, 4);
@ -3070,7 +3089,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
if(m_sel.mmin && m_sel.lcm)
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
}
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
@ -3117,7 +3136,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
{
const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4];
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
#if _M_SSE < 0x401

View File

@ -26,30 +26,13 @@
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
struct GSRasterizerStats
{
int64 ticks;
int prims, pixels;
GSRasterizerStats()
{
Reset();
}
void Reset()
{
ticks = 0;
pixels = prims = 0;
}
};
template<class KEY, class VALUE> class GSFunctionMap
{
protected:
struct ActivePtr
{
uint64 frame, frames;
int64 ticks, pixels;
uint64 ticks, pixels;
VALUE f;
};
@ -101,7 +84,7 @@ public:
return m_active->f;
}
void UpdateStats(const GSRasterizerStats& stats, uint64 frame)
void UpdateStats(uint64 frame, uint64 ticks, int pixels)
{
if(m_active)
{
@ -111,14 +94,14 @@ public:
m_active->frames++;
}
m_active->pixels += stats.pixels;
m_active->ticks += stats.ticks;
m_active->ticks += ticks;
m_active->pixels += pixels;
}
}
virtual void PrintStats()
{
int64 ttpf = 0;
uint64 ttpf = 0;
typename hash_map<KEY, ActivePtr*>::iterator i;
@ -141,9 +124,9 @@ public:
if(p->frames > 0)
{
int64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0;
int64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
int64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
uint64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0;
uint64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
uint64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
printf("[%014llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n",
(uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ',

View File

@ -25,18 +25,27 @@
#include "GSRasterizer.h"
#define THREAD_HEIGHT 5
//#define THREAD_HEIGHT 1
GSRasterizer::GSRasterizer(IDrawScanline* ds)
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
: m_ds(ds)
, m_id(-1)
, m_threads(-1)
, m_id(id)
, m_threads(threads)
{
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
m_edge.count = 0;
m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
Init(0, 1);
int row = 0;
while(row < (2048 >> THREAD_HEIGHT))
{
for(int i = 0; i < threads; i++, row++)
{
m_myscanline[row] = i == id ? 1 : 0;
}
}
}
GSRasterizer::~GSRasterizer()
@ -67,82 +76,52 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
return false;
}
void GSRasterizer::Init(int id, int threads)
void GSRasterizer::Queue(shared_ptr<GSRasterizerData> data)
{
if(m_id != id || m_threads != threads)
{
m_id = id;
m_threads = threads;
if(threads > 1)
{
int row = 0;
while(row < (2048 >> THREAD_HEIGHT))
{
for(int i = 0; i < threads; i++, row++)
{
m_myscanline[row] = i == id ? 1 : 0;
}
}
}
else
{
memset(m_myscanline, 1, 2048 >> THREAD_HEIGHT);
}
}
Draw(data);
}
void GSRasterizer::Draw(const GSRasterizerData* data)
void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data)
{
m_ds->BeginDraw(data->param);
const GSVertexSW* vertices = data->vertices;
const int count = data->count;
bool scissor_test = !data->bbox.eq(data->bbox.rintersect(data->scissor));
m_scissor = data->scissor;
m_fscissor = GSVector4(data->scissor);
m_stats.Reset();
m_pixels = 0;
uint64 start = __rdtsc();
// NOTE: data->scissor_test with templated Draw* speeds up large point lists (ffxii videos), but do not seem to make any difference for others
switch(data->primclass)
{
case GS_POINT_CLASS:
m_stats.prims = count;
if(data->scissor_test) DrawPoint<true>(vertices, count);
if(scissor_test) DrawPoint<true>(vertices, count);
else DrawPoint<false>(vertices, count);
break;
case GS_LINE_CLASS:
ASSERT(!(count & 1));
m_stats.prims = count / 2;
for(int i = 0; i < count; i += 2) DrawLine(&vertices[i]);
break;
case GS_TRIANGLE_CLASS:
ASSERT(!(count % 3));
m_stats.prims = count / 3;
for(int i = 0; i < count; i += 3) DrawTriangle(&vertices[i]);
break;
case GS_SPRITE_CLASS:
ASSERT(!(count & 1));
m_stats.prims = count / 2;
for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i]);
for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i], data->solidrect);
break;
default:
__assume(0);
}
m_stats.ticks = __rdtsc() - start;
uint64 ticks = __rdtsc() - start;
m_ds->EndDraw(m_stats, data->frame);
}
void GSRasterizer::GetStats(GSRasterizerStats& stats)
{
stats = m_stats;
m_ds->EndDraw(data->frame, ticks, m_pixels);
}
template<bool scissor_test>
@ -156,7 +135,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v, int count)
{
if(IsOneOfMyScanlines(p.y))
{
m_stats.pixels++;
m_pixels++;
m_ds->SetupPrim(v, *v);
@ -174,7 +153,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
int i = (dp < dp.yxwz()).mask() & 1; // |dx| <= |dy|
if(m_ds->IsEdge())
if(m_ds->HasEdge())
{
DrawEdge(v[0], v[1], dv, i, 0);
DrawEdge(v[0], v[1], dv, i, 1);
@ -218,7 +197,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
if(pixels > 0)
{
m_stats.pixels += pixels;
m_pixels += pixels;
GSVertexSW dscan = dv / dv.p.xxxx();
@ -406,7 +385,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
Flush(v, dscan);
if(m_ds->IsEdge())
if(m_ds->HasEdge())
{
GSVector4 a = dx.abs() < dy.abs(); // |dx| <= |dy|
GSVector4 b = dx < GSVector4::zero(); // dx < 0
@ -466,7 +445,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
m_edge.count += e - &m_edge.buff[m_edge.count];
}
void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
void GSRasterizer::DrawSprite(const GSVertexSW* vertices, bool solidrect)
{
GSVertexSW v[2];
@ -487,13 +466,13 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
GSVertexSW scan = v[0];
if(m_ds->IsRect())
if(solidrect)
{
if(m_id == 0)
{
m_ds->DrawRect(r, scan);
m_stats.pixels += r.width() * r.height();
m_pixels += r.width() * r.height();
}
return;
@ -522,7 +501,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
{
if(IsOneOfMyScanlines(r.top))
{
m_stats.pixels += r.width();
m_pixels += r.width();
m_ds->DrawScanline(r.width(), r.left, r.top, scan);
}
@ -754,7 +733,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
int left = e->p.i16[1];
int top = e->p.i16[2];
m_stats.pixels += pixels;
m_pixels += pixels;
m_ds->DrawScanline(pixels, left, top, *e++);
}
@ -768,7 +747,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
int left = e->p.i16[1];
int top = e->p.i16[2];
m_stats.pixels += pixels;
m_pixels += pixels;
m_ds->DrawEdge(pixels, left, top, *e++);
}
@ -781,117 +760,134 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
//
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, volatile long& sync)
: GSRasterizer(ds)
, m_sync(sync)
, m_data(NULL)
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads)
: GSRasterizer(ds, id, threads)
, m_exit(false)
, m_break(true)
, m_ready(true)
{
CreateThread();
}
GSRasterizerMT::~GSRasterizerMT()
{
Init(0, 1);
m_break = true;
Draw(NULL);
m_exit = true;
m_draw.Set();
CloseThread();
}
void GSRasterizerMT::Draw(const GSRasterizerData* data)
void GSRasterizerMT::Queue(shared_ptr<GSRasterizerData> data)
{
m_data = data;
GSAutoLock l(&m_lock);
m_draw.Set();
m_queue.push(data);
if(m_break)
{
m_break = false;
m_ready = false;
m_draw.Set();
}
}
void GSRasterizerMT::Sync()
{
while(!m_queue.empty()) _mm_pause();
m_break = true;
while(!m_ready) _mm_pause();
}
void GSRasterizerMT::ThreadProc()
{
while(m_draw.Wait() && m_data != NULL)
while(m_draw.Wait() && !m_exit)
{
GSRasterizer::Draw(m_data);
// once we are running it is better to spin, jobs can be smaller than the cost of waking up every time
_interlockedbittestandreset(&m_sync, m_id);
while(!m_break)
{
if(!m_queue.empty())
{
queue<shared_ptr<GSRasterizerData> > queue;
{
GSAutoLock l(&m_lock);
queue.swap(m_queue);
}
while(!queue.empty())
{
Draw(queue.front());
queue.pop();
}
}
else
{
_mm_pause();
}
}
m_ready = true;
}
}
//
GSRasterizerList::GSRasterizerList()
: m_sync(0)
: m_sync_count(0)
, m_count(0)
{
}
GSRasterizerList::~GSRasterizerList()
{
for(size_t i = 0; i < size(); i++)
for(vector<GSRasterizer*>::iterator i = begin(); i != end(); i++)
{
delete (*this)[i];
delete *i;
}
}
void GSRasterizerList::Draw(const GSRasterizerData* data, int width, int height)
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
{
m_stats.Reset();
// TODO: do not send data to every thread, try to bin them (based on bbox & scissor)
m_start = __rdtsc();
m_threads = std::min<int>(1 + (height >> THREAD_HEIGHT), size());
m_sync = 0;
for(int i = 1; i < m_threads; i++)
if(data->solidrect)
{
m_sync |= 1 << i;
Sync();
front()->Draw(data);
return;
}
for(int i = 1; i < m_threads; i++)
{
(*this)[i]->Init(i, m_threads);
(*this)[i]->Draw(data);
for(int i = 0; i < size(); i++)
{
(*this)[i]->Queue(data);
}
(*this)[0]->Init(0, m_threads);
(*this)[0]->Draw(data);
m_count++;
}
void GSRasterizerList::Sync()
{
while(m_sync) _mm_pause();
m_stats.ticks = __rdtsc() - m_start;
for(int i = 0; i < m_threads; i++)
if(m_count > 0)
{
GSRasterizerStats s;
(*this)[i]->GetStats(s);
m_stats.pixels += s.pixels;
m_stats.prims = std::max<int>(m_stats.prims, s.prims);
}
}
void GSRasterizerList::GetStats(GSRasterizerStats& stats)
{
stats = m_stats;
}
void GSRasterizerList::PrintStats()
{
if(!empty())
{
front()->PrintStats();
/*
int index = 0;
for(std::vector<IRasterizer*>::iterator i = begin(); i != end(); i++)
for(int i = 0; i < size(); i++)
{
printf("[Thread %d]\n", index++);
(*i)->PrintStats();
(*this)[i]->Sync();
}
*/
m_sync_count++;
m_count = 0;
}
}

View File

@ -27,18 +27,32 @@
#include "GSThread.h"
#include "GSAlignedClass.h"
__aligned(class, 32) GSRasterizerData
__aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
{
public:
GSVector4i scissor;
bool scissor_test;
GSVector4i bbox;
GS_PRIM_CLASS primclass;
const GSVertexSW* vertices;
GSVertexSW* vertices;
int count;
bool solidrect;
uint64 frame;
const void* param;
void* param;
GSRasterizerData() : scissor_test(true) {}
GSRasterizerData()
: vertices(NULL)
, count(0)
, solidrect(false)
, param(NULL)
{
}
virtual ~GSRasterizerData()
{
if(vertices != NULL) _aligned_free(vertices);
// derived class should free param and its members
}
};
class IDrawScanline : public GSAlignedClass<32>
@ -59,8 +73,7 @@ public:
virtual ~IDrawScanline() {}
virtual void BeginDraw(const void* param) = 0;
virtual void EndDraw(const GSRasterizerStats& stats, uint64 frame) = 0;
virtual void PrintStats() = 0;
virtual void EndDraw(uint64 frame, uint64 ticks, int pixels) = 0;
#ifdef ENABLE_JIT_RASTERIZER
@ -78,32 +91,29 @@ public:
#endif
__forceinline bool IsEdge() const {return m_de != NULL;}
__forceinline bool IsRect() const {return m_dr != NULL;}
__forceinline bool HasEdge() const {return m_de != NULL;}
};
class IRasterizer
class IRasterizer : public GSAlignedClass<32>
{
public:
virtual ~IRasterizer() {}
virtual void Init(int id, int threads) = 0;
virtual void Draw(const GSRasterizerData* data) = 0;
virtual void GetStats(GSRasterizerStats& stats) = 0;
virtual void PrintStats() = 0;
virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
virtual void Sync() = 0;
};
__aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer
__aligned(class, 32) GSRasterizer : public IRasterizer
{
protected:
IDrawScanline* m_ds;
int m_id;
int m_threads;
uint8* m_myscanline;
GSRasterizerStats m_stats;
GSVector4i m_scissor;
GSVector4 m_fscissor;
struct {GSVertexSW* buff; int count;} m_edge;
int m_pixels;
typedef void (GSRasterizer::*DrawPrimPtr)(const GSVertexSW* v, int count);
@ -111,7 +121,7 @@ protected:
void DrawPoint(const GSVertexSW* v, int count);
void DrawLine(const GSVertexSW* v);
void DrawTriangle(const GSVertexSW* v);
void DrawSprite(const GSVertexSW* v);
void DrawSprite(const GSVertexSW* v, bool solidrect);
__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, const GSVertexSW& dscan, const GSVector4& p0);
@ -123,61 +133,64 @@ protected:
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
public:
GSRasterizer(IDrawScanline* ds);
GSRasterizer(IDrawScanline* ds, int id, int threads);
virtual ~GSRasterizer();
void Draw(shared_ptr<GSRasterizerData> data);
// IRasterizer
void Init(int id, int threads);
void Draw(const GSRasterizerData* data);
void GetStats(GSRasterizerStats& stats);
void PrintStats() {m_ds->PrintStats();}
void Queue(shared_ptr<GSRasterizerData> data);
};
class GSRasterizerMT : public GSRasterizer, private GSThread
{
protected:
volatile long& m_sync;
volatile bool m_exit;
volatile bool m_break;
volatile bool m_ready;
GSAutoResetEvent m_draw;
const GSRasterizerData* m_data;
queue<shared_ptr<GSRasterizerData> > m_queue;
GSCritSec m_lock;
void ThreadProc();
public:
GSRasterizerMT(IDrawScanline* ds, volatile long& sync);
GSRasterizerMT(IDrawScanline* ds, int id, int threads);
virtual ~GSRasterizerMT();
// IRasterizer
void Draw(const GSRasterizerData* data);
void Queue(shared_ptr<GSRasterizerData> data);
void Sync();
};
class GSRasterizerList : protected vector<IRasterizer*>
class GSRasterizerList : public IRasterizer, protected vector<GSRasterizer*>
{
protected:
volatile long m_sync;
GSRasterizerStats m_stats;
int64 m_start;
int m_threads;
int m_count;
GSRasterizerList();
public:
GSRasterizerList();
virtual ~GSRasterizerList();
template<class DS> void Create(int threads)
template<class DS> static GSRasterizerList* Create(int threads)
{
GSRasterizerList* rl = new GSRasterizerList();
threads = std::max<int>(threads, 1); // TODO: min(threads, number of cpu cores)
push_back(new GSRasterizer(new DS()));
for(int i = 1; i < threads; i++)
for(int i = 0; i < threads; i++)
{
push_back(new GSRasterizerMT(new DS(), m_sync));
rl->push_back(new GSRasterizerMT(new DS(), i, threads));
}
return rl;
}
void Draw(const GSRasterizerData* data, int width, int height);
void Queue(shared_ptr<GSRasterizerData> data);
void Sync();
void GetStats(GSRasterizerStats& stats);
void PrintStats();
int m_sync_count;
};

View File

@ -67,6 +67,7 @@ public:
bool s_save;
bool s_savez;
int s_saven;
GSCritSec s_lock;
public:
GSRenderer();

View File

@ -25,6 +25,7 @@
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
GSRendererSW::GSRendererSW(int threads)
: m_fzb(NULL)
{
InitVertexKick(GSRendererSW);
@ -32,7 +33,7 @@ GSRendererSW::GSRendererSW(int threads)
memset(m_texture, 0, sizeof(m_texture));
m_rl.Create<GSDrawScanline>(threads);
m_rl = GSRasterizerList::Create<GSDrawScanline>(threads);
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
}
@ -46,6 +47,8 @@ GSRendererSW::~GSRendererSW()
delete m_texture[i];
}
delete m_rl;
_aligned_free(m_output);
}
@ -63,6 +66,10 @@ void GSRendererSW::VSync(int field)
{
GSRendererT<GSVertexSW>::VSync(field);
Sync(); // IncAge might delete a cached texture in use
// printf("m_sync_count = %d\n", m_rl->m_sync_count); m_rl->m_sync_count = 0;
m_tc->IncAge();
if(m_reset)
@ -87,6 +94,8 @@ void GSRendererSW::ResetDevice()
GSTexture* GSRendererSW::GetOutput(int i)
{
Sync();
const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
int w = DISPFB.FBW * 64;
@ -122,130 +131,95 @@ GSTexture* GSRendererSW::GetOutput(int i)
void GSRendererSW::Draw()
{
if(m_dump)
if(m_dump) m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
// TODO: palette may be rendered (point-list in a few visual novels) and not ready by the time it needs to be loaded => vm to clut transfer (TEX0.CLD) should wait for the rasterizers to finish, if needed
if(m_fzb != m_context->offset.fzb)
{
m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
// rasterizers must write the same outputs at the same time, this makes sure each thread has its own private surface area
// TODO: detect if frame/zbuf overlap each other (?)
m_fzb = m_context->offset.fzb;
Sync();
}
GSScanlineGlobalData gd;
shared_ptr<GSRasterizerData> data(new GSRasterizerData2(this));
if(!GetScanlineGlobalData(gd))
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)data->param;
if(!GetScanlineGlobalData(*gd))
{
return;
}
if(!gd.sel.fwrite && !gd.sel.zwrite)
data->scissor = GSVector4i(m_context->scissor.in);
data->scissor.z = std::min<int>(data->scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
data->bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p));
data->primclass = m_vt.m_primclass;
data->vertices = (GSVertexSW*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16); // TODO: detach m_vertices and reallocate later?
memcpy(data->vertices, m_vertices, sizeof(GSVertexSW) * m_count); // TODO: m_vt.Update fetches all the vertices already, could also store them here
data->count = m_count;
data->solidrect = gd->sel.IsSolidRect();
data->frame = m_perfmon.GetFrame();
if(s_dump)
{
return;
if(data->solidrect) Sync();
((GSRasterizerData2*)data.get())->DumpInput();
}
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
{
uint64 frame = m_perfmon.GetFrame();
m_rl->Queue(data);
string s;
GSVector4i r = data->bbox.rintersect(data->scissor);
if(s_save && s_n >= s_saven && PRIM->TME)
{
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
}
s_n++;
if(s_save && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);//GetFrameSize(1).cy);
}
if(s_savez && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
}
s_n++;
}
GSVector4i scissor(m_context->scissor.in);
GSVector4i bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p));
GSVector4i r = bbox.rintersect(scissor);
GSRasterizerData data;
data.scissor = scissor;
data.scissor.z = std::min<int>(data.scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
data.scissor_test = !bbox.eq(r);
data.primclass = m_vt.m_primclass;
data.vertices = m_vertices;
data.count = m_count;
data.frame = m_perfmon.GetFrame();
data.param = &gd;
m_rl.Draw(&data, r.width(), r.height());
if(gd.sel.fwrite)
if(gd->sel.fwrite)
{
m_tc->InvalidateVideoMem(m_context->offset.fb, r);
}
if(gd.sel.zwrite)
if(gd->sel.zwrite)
{
m_tc->InvalidateVideoMem(m_context->offset.zb, r);
}
// By only syncing here we can do the two InvalidateVideoMem calls free if the other threads finish
// their drawings later than this one (they usually do because they start on an event).
// Sync();
m_rl.Sync();
GSRasterizerStats stats;
m_rl.GetStats(stats);
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
{
uint64 frame = m_perfmon.GetFrame();
string s;
if(s_save && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);//GetFrameSize(1).cy);
}
if(s_savez && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
}
s_n++;
}
// TODO: m_perfmon.Put(GSPerfMon::Prim, stats.prims);
// TODO: m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
/*
if(0)//stats.ticks > 5000000)
{
printf("* [%lld | %012llx] ticks %lld prims %d (%d) pixels %d (%d)\n",
m_perfmon.GetFrame(), gd.sel.key,
m_perfmon.GetFrame(), gd->sel.key,
stats.ticks,
stats.prims, stats.prims > 0 ? (int)(stats.ticks / stats.prims) : -1,
stats.pixels, stats.pixels > 0 ? (int)(stats.ticks / stats.pixels) : -1);
}
*/
}
// Synchronizes with the rasterizer list and then clears the texture cache's record of invalidated pages.
void GSRendererSW::Sync()
{
// wait on the rasterizer list first
m_rl->Sync();
// NOTE(review): presumably safe to forget the invalid pages only because no queued draw can still be using them at this point - confirm
m_tc->ResetInvalidPages();
}
// Forwards the written rectangle r of the buffer described by BITBLTBUF (DBP/DBW/DPSM) to the
// texture cache's InvalidateVideoMem, then calls Sync().
void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
Sync(); // TODO: not needed if nothing uses the affected pages (this is the most frequently called Sync! get rid of it)
}
// Calls Sync() unconditionally; neither BITBLTBUF nor r is looked at here.
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
Sync(); // TODO: not needed if nothing uses the affected pages
}
#include "GSTextureSW.h"
@ -257,7 +231,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
const GS_PRIM_CLASS primclass = m_vt.m_primclass;
gd.vm = m_mem.m_vm8;
gd.dimx = env.dimx;
gd.fbr = context->offset.fb->pixel.row;
gd.zbr = context->offset.zb->pixel.row;
@ -315,6 +288,11 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
bool fwrite = fm != 0xffffffff;
bool ftest = gd.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
bool zwrite = zm != 0xffffffff;
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
if(!fwrite && !zwrite) return false;
gd.sel.fwrite = fwrite;
gd.sel.ftest = ftest;
@ -329,13 +307,20 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
if(PRIM->TME)
{
gd.clut = m_mem.m_clut;
gd.sel.tfx = context->TEX0.TFX;
gd.sel.tcc = context->TEX0.TCC;
gd.sel.fst = PRIM->FST;
gd.sel.ltf = m_vt.IsLinear();
gd.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0;
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
{
gd.sel.tlu = 1;
gd.clut = (uint32*)_aligned_malloc(sizeof(uint32) * 256, 32); // FIXME: might address uninitialized data of the texture (0xCD) that is not in 0-15 range for 4-bpp formats
memcpy(gd.clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
}
gd.sel.wms = context->CLAMP.WMS;
gd.sel.wmt = context->CLAMP.WMT;
@ -346,13 +331,17 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
gd.sel.tfx = TFX_DECAL;
}
GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
if(t == NULL) {ASSERT(0); return false;}
if(!m_tc->CanUpdate(t)) Sync();
GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, gd.sel.ltf);
const GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA, r);
if(t == NULL) {ASSERT(0); return false;}
if(!t->Update(r)) {ASSERT(0); return false;}
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
{
@ -495,13 +484,17 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
m_vt.m_min.t *= 0.5f;
m_vt.m_max.t *= 0.5f;
GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, gd.sel.tw + 3);
if(t == NULL) {ASSERT(0); return false;}
if(!m_tc->CanUpdate(t)) Sync();
GSVector4i r;
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf);
const GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, r, gd.sel.tw + 3);
if(t == NULL) {ASSERT(0); return false;}
if(!t->Update(r)) {ASSERT(0); return false;}
gd.tex[i] = t->m_buff;
@ -699,11 +692,16 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
gd.sel.colclamp = env.COLCLAMP.CLAMP;
gd.sel.fba = context->FBA.FBA;
gd.sel.dthe = env.DTHE.DTHE;
}
bool zwrite = zm != 0xffffffff;
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
if(env.DTHE.DTHE)
{
gd.sel.dthe = 1;
gd.dimx = (GSVector4i*)_aligned_malloc(sizeof(env.dimx), 32);
memcpy(gd.dimx, env.dimx, sizeof(env.dimx));
}
}
gd.sel.zwrite = zwrite;
gd.sel.ztest = ztest;

View File

@ -27,12 +27,118 @@
class GSRendererSW : public GSRendererT<GSVertexSW>
{
// Rasterizer job data that owns its GSScanlineGlobalData (stored in the inherited param) and
// keeps a copy of the renderer registers needed by the debug bitmap dumps (FRAME, ZBUF, TEX0, PRIM->TME).
class GSRasterizerData2 : public GSRasterizerData
{
// renderer that created this job; the dump functions read its s_* flags, m_perfmon and m_mem through it
GSRenderer* renderer;
// register values copied in the constructor
GIFRegFRAME FRAME;
GIFRegZBUF ZBUF;
GIFRegTEX0 TEX0;
uint32 TME;
// size used when saving the frame/z bitmaps (the height is fixed at 512 in the constructor)
GSVector2i framesize;
public:
// Allocates the 32-byte aligned GSScanlineGlobalData, publishes it through param and copies the registers from r.
GSRasterizerData2(GSRenderer* r)
{
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)_aligned_malloc(sizeof(GSScanlineGlobalData), 32);
// start as NULL so the destructor can tell whether these buffers were ever allocated
gd->clut = NULL;
gd->dimx = NULL;
param = gd;
renderer = r;
FRAME = r->m_context->FRAME;
ZBUF = r->m_context->ZBUF;
TEX0 = r->m_context->TEX0;
TME = r->PRIM->TME;
framesize = GSVector2i(r->GetFrameRect().width(), 512);
}
// Frees the clut and dimx buffers when they are set, frees the global data itself, then calls DumpOutput().
virtual ~GSRasterizerData2()
{
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)param;
if(gd->clut) _aligned_free(gd->clut);
if(gd->dimx) _aligned_free(gd->dimx);
_aligned_free(gd);
// DumpOutput does not touch gd, it only uses the register copies and the renderer
DumpOutput();
}
// FIXME: not really possible to save whole input/output anymore, strips of the picture may lag in multi-threaded mode
// Saves the "before" bitmaps (texture, rt0, rz0) when dumping is enabled; holds renderer->s_lock while doing so.
void DumpInput()
{
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0))
GSAutoLock l(&renderer->s_lock);
// the frame number is read from the renderer at dump time, it is not part of the copied state
uint64 frame = renderer->m_perfmon.GetFrame();
string s;
// the texture is saved only when texturing was enabled (TME) for this draw
if(renderer->s_save && renderer->s_n >= renderer->s_saven && TME)
{
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", renderer->s_n, frame, (int)TEX0.TBP0, (int)TEX0.PSM);
renderer->m_mem.SaveBMP(s, TEX0.TBP0, TEX0.TBW, TEX0.PSM, 1 << TEX0.TW, 1 << TEX0.TH);
}
// the counter advances between the texture dump and the rt0/rz0 dumps, and once more after them
renderer->s_n++;
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
}
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
{
// the z buffer is saved with the frame buffer's width (FRAME.FBW)
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
}
renderer->s_n++;
}
// Saves the "after" bitmaps (rt1, rz1) when dumping is enabled; holds renderer->s_lock while doing so.
// Called from the destructor.
void DumpOutput()
{
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
GSAutoLock l(&renderer->s_lock);
uint64 frame = renderer->m_perfmon.GetFrame();
string s;
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
}
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
}
// a single increment here, unlike the two in DumpInput
renderer->s_n++;
}
};
protected:
GSRasterizerList m_rl;
GSRasterizerList* m_rl;
GSTextureCacheSW* m_tc;
GSTexture* m_texture[2];
uint8* m_output;
bool m_reset;
GSPixelOffset4* m_fzb;
void Reset();
void VSync(int field);
@ -40,7 +146,9 @@ protected:
GSTexture* GetOutput(int i);
void Draw();
void Sync();
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
bool GetScanlineGlobalData(GSScanlineGlobalData& gd);

View File

@ -107,15 +107,14 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
{
GSScanlineSelector sel;
// - the data of vm, tex, clut, dimx may change, multi-threaded drawing must be finished before that happens (an idea: remember which pages are used, sync when something needs to read or write them)
// - the data of vm, tex may change, multi-threaded drawing must be finished before that happens, clut and dimx are copies
// - tex is a cached texture, it may be recycled to free up memory, its absolute address cannot be compiled into code
// - row and column pointers are allocated once and are never changed or freed, so their addresses can be used directly
// - if in the future drawing does not have to be synchronized per batch, the rest of GSRasterizerData should be copied here, too (scissor, prim type, vertices)
void* vm;
const void* tex[7];
const uint32* clut;
const GSVector4i* dimx;
uint32* clut;
GSVector4i* dimx;
const int* fbr;
const int* zbr;

View File

@ -25,6 +25,7 @@
// Stores the owning state pointer and starts with no page flagged in m_invalid.
GSTextureCacheSW::GSTextureCacheSW(GSState* state)
: m_state(state)
{
// zero the whole invalid-page bitmap
memset(m_invalid, 0, sizeof(m_invalid));
}
GSTextureCacheSW::~GSTextureCacheSW()
@ -32,7 +33,7 @@ GSTextureCacheSW::~GSTextureCacheSW()
RemoveAll();
}
const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, uint32 tw0)
GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0)
{
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
@ -76,36 +77,9 @@ const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0
m_textures.insert(t);
__aligned(uint32, 16) pages[16];
((GSVector4i*)pages)[0] = GSVector4i::zero();
((GSVector4i*)pages)[1] = GSVector4i::zero();
((GSVector4i*)pages)[2] = GSVector4i::zero();
((GSVector4i*)pages)[3] = GSVector4i::zero();
GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs;
int tw = 1 << TEX0.TW;
int th = 1 << TEX0.TH;
for(int y = 0; y < th; y += bs.y)
for(int i = 0; i < countof(t->m_pages); i++)
{
uint32 base = o->block.row[y >> 3];
for(int x = 0; x < tw; x += bs.x)
{
uint32 page = (base + o->block.col[x >> 3]) >> 5;
if(page < MAX_PAGES)
{
pages[page >> 5] |= 1 << (page & 31);
}
}
}
for(int i = 0; i < countof(pages); i++)
{
uint32 p = pages[i];
uint32 p = t->m_pages[i];
if(p != 0)
{
@ -123,22 +97,11 @@ const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0
}
}
if(!t->Update(r))
{
printf("!@#$\n"); // memory allocation may fail if the game is too hungry (tales of legendia fight transition/scene)
RemoveAt(t);
t = NULL;
}
return t;
}
bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rect)
void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rect)
{
bool changed = false;
uint32 bp = o->bp;
uint32 bw = o->bw;
uint32 psm = o->psm;
@ -153,10 +116,12 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
for(int x = r.left; x < r.right; x += bs.x)
{
uint32 page = (base + o->block.col[x >> 3]) >> 5;
uint32 page = (base + o->block.col[x >> 3]) >> 5;
if(page < MAX_PAGES)
{
m_invalid[page >> 5] |= 1 << (page & 31); // remember which pages might be invalid for future texture updates
const list<Texture*>& map = m_map[page];
for(list<Texture*>::const_iterator i = map.begin(); i != map.end(); i++)
@ -165,8 +130,6 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM))
{
changed = true;
if(t->m_repeating)
{
list<GSVector2i>& l = t->m_p2t[page];
@ -187,8 +150,6 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
}
}
}
return changed;
}
void GSTextureCacheSW::RemoveAll()
@ -237,6 +198,24 @@ void GSTextureCacheSW::IncAge()
}
}
// Returns true when no bit set in the texture's page bitmap (t->m_pages) is also set in m_invalid;
// the scan stops with false at the first word where the two bitmaps overlap.
bool GSTextureCacheSW::CanUpdate(Texture* t)
{
	const uint32* flagged = m_invalid;
	const uint32* used = t->m_pages;

	for(size_t left = countof(m_invalid); left > 0; left--, flagged++, used++)
	{
		if((*flagged & *used) != 0)
		{
			return false;
		}
	}

	return true;
}
{
memset(m_invalid, 0, sizeof(m_invalid));
}
//
GSTextureCacheSW::Texture::Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
@ -252,7 +231,30 @@ GSTextureCacheSW::Texture::Texture(GSState* state, const GSOffset* offset, uint3
m_TEXA = TEXA;
memset(m_valid, 0, sizeof(m_valid));
memset(m_pages, 0, sizeof(m_pages));
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs;
int tw = 1 << TEX0.TW;
int th = 1 << TEX0.TH;
for(int y = 0; y < th; y += bs.y)
{
uint32 base = offset->block.row[y >> 3];
for(int x = 0; x < tw; x += bs.x)
{
uint32 page = (base + offset->block.col[x >> 3]) >> 5;
if(page < MAX_PAGES)
{
m_pages[page >> 5] |= 1 << (page & 31);
}
}
}
m_repeating = m_TEX0.IsRepeating(); // repeating mode always works, it is just slightly slower
if(m_repeating)

View File

@ -39,13 +39,14 @@ public:
bool m_complete;
bool m_repeating;
list<GSVector2i>* m_p2t;
uint32 m_valid[MAX_PAGES];
uint32 m_valid[MAX_PAGES];
uint32 m_pages[16];
// m_valid
// fast mode: each uint32 bits map to the 32 blocks of that page
// repeating mode: 1 bpp image of the texture tiles (8x8), also having 512 elements is just a coincidence (worst case: (1024*1024)/(8*8)/(sizeof(uint32)*8))
explicit Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
virtual ~Texture();
bool Update(const GSVector4i& r);
@ -56,16 +57,20 @@ protected:
GSState* m_state;
hash_set<Texture*> m_textures;
list<Texture*> m_map[MAX_PAGES];
uint32 m_invalid[16];
public:
GSTextureCacheSW(GSState* state);
virtual ~GSTextureCacheSW();
const Texture* Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, uint32 tw0 = 0);
Texture* Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0 = 0);
bool InvalidateVideoMem(const GSOffset* o, const GSVector4i& r);
void InvalidateVideoMem(const GSOffset* o, const GSVector4i& r);
void RemoveAll();
void RemoveAt(Texture* t);
void IncAge();
bool CanUpdate(Texture* t);
void ResetInvalidPages();
};

View File

@ -87,6 +87,7 @@
#include <list>
#include <map>
#include <set>
#include <queue>
#include <algorithm>
using namespace std;