mirror of https://github.com/PCSX2/pcsx2.git
GSdx: Better multi-threading for the sw renderer. Threads need to be synchronized far less often (about 1/10th as often on average), can run in parallel for longer, and use more CPU (with a bit more empty spinning, too). There could be some new bugs, as usual.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4992 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
4b77052d21
commit
f318e84aca
|
@ -76,14 +76,9 @@ void GPUDrawScanline::BeginDraw(const void* param)
|
|||
m_sp = m_sp_map[sel];
|
||||
}
|
||||
|
||||
void GPUDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
|
||||
void GPUDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels)
|
||||
{
|
||||
m_ds_map.UpdateStats(stats, frame);
|
||||
}
|
||||
|
||||
void GPUDrawScanline::PrintStats()
|
||||
{
|
||||
m_ds_map.PrintStats();
|
||||
m_ds_map.UpdateStats(frame, ticks, pixels);
|
||||
}
|
||||
|
||||
#ifndef ENABLE_JIT_RASTERIZER
|
||||
|
|
|
@ -42,8 +42,7 @@ public:
|
|||
// IDrawScanline
|
||||
|
||||
void BeginDraw(const void* param);
|
||||
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
|
||||
void PrintStats();
|
||||
void EndDraw(uint64 frame, uint64 ticks, int pixels);
|
||||
|
||||
#ifndef ENABLE_JIT_RASTERIZER
|
||||
|
||||
|
|
|
@ -301,6 +301,11 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
|
|||
return;
|
||||
}
|
||||
|
||||
if(m_sel.tlu)
|
||||
{
|
||||
mov(edx, ptr[&m_local.gd->clut]);
|
||||
}
|
||||
|
||||
// xmm2 = s
|
||||
// xmm3 = t
|
||||
// xmm7 = test
|
||||
|
@ -953,7 +958,7 @@ void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr)
|
|||
|
||||
if(m_sel.tlu) movzx(eax, byte[esi + eax]);
|
||||
|
||||
const Address& src = m_sel.tlu ? ptr[eax * 2 + (size_t)m_local.gd->clut] : ptr[esi + eax * 2];
|
||||
const Address& src = m_sel.tlu ? ptr[edx + eax * 2] : ptr[esi + eax * 2];
|
||||
|
||||
if(i == 0) movd(dst, src);
|
||||
else pinsrw(dst, src, (uint8)i);
|
||||
|
|
|
@ -29,13 +29,15 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
|
|||
{
|
||||
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
|
||||
|
||||
m_rl.Create<GPUDrawScanline>(threads);
|
||||
m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads);
|
||||
}
|
||||
|
||||
// Destructor: releases the resources held by the software renderer.
// m_rl is the rasterizer list pointer assigned in the constructor via
// GSRasterizerList::Create<GPUDrawScanline>(threads), so it must be deleted here.
GPURendererSW::~GPURendererSW()
|
||||
{
|
||||
delete m_texture;
|
||||
|
||||
// Destroy the rasterizer list obtained from GSRasterizerList::Create.
delete m_rl;
|
||||
|
||||
// m_output was obtained with _aligned_malloc in the constructor, so it is
// released with the matching _aligned_free.
_aligned_free(m_output);
|
||||
}
|
||||
|
||||
|
@ -67,12 +69,34 @@ GSTexture* GPURendererSW::GetOutput()
|
|||
|
||||
void GPURendererSW::Draw()
|
||||
{
|
||||
// Rasterizer work item for the GPU software renderer that owns its own
// GPUScanlineGlobalData block (stored in the inherited `param` member).
// The constructor allocates that block and the destructor frees it together
// with the CLUT buffer it may point to.
class GPURasterizerData : public GSRasterizerData
|
||||
{
|
||||
public:
|
||||
// Allocates the global scanline data with 32-byte alignment and publishes it
// through `param`. Only `clut` is initialized here; the remaining fields are
// left for the caller to fill in.
// NOTE(review): the _aligned_malloc result is not checked for NULL before use.
GPURasterizerData()
|
||||
{
|
||||
GPUScanlineGlobalData* gd = (GPUScanlineGlobalData*)_aligned_malloc(sizeof(GPUScanlineGlobalData), 32);
|
||||
|
||||
// NULL marks "no CLUT buffer owned yet", so the destructor can skip the free.
gd->clut = NULL;
|
||||
|
||||
param = gd;
|
||||
}
|
||||
|
||||
// Frees the CLUT buffer (if one was attached) and then the global data block.
virtual ~GPURasterizerData()
|
||||
{
|
||||
GPUScanlineGlobalData* gd = (GPUScanlineGlobalData*)param;
|
||||
|
||||
// clut is treated as owned by this object: whatever is stored there must
// have come from _aligned_malloc.
if(gd->clut) _aligned_free(gd->clut);
|
||||
|
||||
_aligned_free(gd);
|
||||
}
|
||||
};
|
||||
|
||||
shared_ptr<GSRasterizerData> data(new GPURasterizerData());
|
||||
|
||||
GPUScanlineGlobalData& gd = *(GPUScanlineGlobalData*)data->param;
|
||||
|
||||
const GPUDrawingEnvironment& env = m_env;
|
||||
|
||||
//
|
||||
|
||||
GPUScanlineGlobalData gd;
|
||||
|
||||
gd.sel.key = 0;
|
||||
gd.sel.iip = env.PRIM.IIP;
|
||||
gd.sel.me = env.STATUS.ME;
|
||||
|
@ -97,7 +121,11 @@ void GPURendererSW::Draw()
|
|||
if(!t) {ASSERT(0); return;}
|
||||
|
||||
gd.tex = t;
|
||||
gd.clut = m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y);
|
||||
|
||||
gd.clut = (uint16*)_aligned_malloc(sizeof(uint16) * 256, 32);
|
||||
|
||||
memcpy(gd.clut, m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y), sizeof(uint16) * (env.STATUS.TP == 0 ? 16 : 256));
|
||||
|
||||
gd.twin = GSVector4i(env.TWIN.TWW, env.TWIN.TWH, env.TWIN.TWX, env.TWIN.TWY);
|
||||
}
|
||||
|
||||
|
@ -108,25 +136,22 @@ void GPURendererSW::Draw()
|
|||
|
||||
gd.vm = m_mem.GetPixelAddress(0, 0);
|
||||
|
||||
//
|
||||
data->vertices = (GSVertexSW*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16);
|
||||
memcpy(data->vertices, m_vertices, sizeof(GSVertexSW) * m_count);
|
||||
data->count = m_count;
|
||||
|
||||
GSRasterizerData data;
|
||||
data->frame = m_perfmon.GetFrame();
|
||||
|
||||
data.vertices = m_vertices;
|
||||
data.count = m_count;
|
||||
data.frame = m_perfmon.GetFrame();
|
||||
data.param = &gd;
|
||||
|
||||
data.scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
|
||||
data.scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
|
||||
data.scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
|
||||
data.scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
|
||||
data->scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
|
||||
data->scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
|
||||
data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
|
||||
data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
|
||||
|
||||
switch(env.PRIM.TYPE)
|
||||
{
|
||||
case GPU_POLYGON: data.primclass = GS_TRIANGLE_CLASS; break;
|
||||
case GPU_LINE: data.primclass = GS_LINE_CLASS; break;
|
||||
case GPU_SPRITE: data.primclass = GS_SPRITE_CLASS; break;
|
||||
case GPU_POLYGON: data->primclass = GS_TRIANGLE_CLASS; break;
|
||||
case GPU_LINE: data->primclass = GS_LINE_CLASS; break;
|
||||
case GPU_SPRITE: data->primclass = GS_SPRITE_CLASS; break;
|
||||
default: __assume(0);
|
||||
}
|
||||
|
||||
|
@ -135,34 +160,34 @@ void GPURendererSW::Draw()
|
|||
GSVector4 tl(+1e10f);
|
||||
GSVector4 br(-1e10f);
|
||||
|
||||
GSVertexSW* v = data->vertices;
|
||||
|
||||
for(int i = 0, j = m_count; i < j; i++)
|
||||
{
|
||||
GSVector4 p = m_vertices[i].p;
|
||||
GSVector4 p = v[i].p;
|
||||
|
||||
tl = tl.min(p);
|
||||
br = br.max(p);
|
||||
}
|
||||
|
||||
GSVector4i r = GSVector4i(tl.xyxy(br)).rintersect(data.scissor);
|
||||
data->bbox = GSVector4i(tl.xyxy(br));
|
||||
|
||||
GSVector4i r = data->bbox.rintersect(data->scissor);
|
||||
|
||||
r.left >>= m_scale.x;
|
||||
r.top >>= m_scale.y;
|
||||
r.right >>= m_scale.x;
|
||||
r.bottom >>= m_scale.y;
|
||||
|
||||
m_rl.Draw(&data, r.width(), r.height());
|
||||
|
||||
Invalidate(r);
|
||||
|
||||
m_rl.Sync();
|
||||
m_rl->Queue(data);
|
||||
|
||||
GSRasterizerStats stats;
|
||||
m_rl->Sync();
|
||||
|
||||
m_rl.GetStats(stats);
|
||||
|
||||
m_perfmon.Put(GSPerfMon::Draw, 1);
|
||||
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
|
||||
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
|
||||
// TODO: m_perfmon.Put(GSPerfMon::Draw, 1);
|
||||
// TODO: m_perfmon.Put(GSPerfMon::Prim, stats.prims);
|
||||
// TODO: m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
|
||||
}
|
||||
|
||||
void GPURendererSW::VertexKick()
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
class GPURendererSW : public GPURendererT<GSVertexSW>
|
||||
{
|
||||
protected:
|
||||
GSRasterizerList m_rl;
|
||||
GSRasterizerList* m_rl;
|
||||
GSTexture* m_texture;
|
||||
uint32* m_output;
|
||||
|
||||
|
|
|
@ -62,7 +62,7 @@ __aligned(struct, 32) GPUScanlineGlobalData
|
|||
|
||||
void* vm;
|
||||
const void* tex;
|
||||
const uint16* clut;
|
||||
uint16* clut;
|
||||
GSVector4i twin; // TWW, TWH, TWX, TWY
|
||||
};
|
||||
|
||||
|
|
|
@ -95,14 +95,9 @@ void GSDrawScanline::BeginDraw(const void* param)
|
|||
m_sp = m_sp_map[sel];
|
||||
}
|
||||
|
||||
void GSDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
|
||||
void GSDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels)
|
||||
{
|
||||
m_ds_map.UpdateStats(stats, frame);
|
||||
}
|
||||
|
||||
void GSDrawScanline::PrintStats()
|
||||
{
|
||||
m_ds_map.PrintStats();
|
||||
m_ds_map.UpdateStats(frame, ticks, pixels);
|
||||
}
|
||||
|
||||
#ifndef ENABLE_JIT_RASTERIZER
|
||||
|
|
|
@ -51,8 +51,7 @@ public:
|
|||
// IDrawScanline
|
||||
|
||||
void BeginDraw(const void* param);
|
||||
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
|
||||
void PrintStats();
|
||||
void EndDraw(uint64 frame, uint64 ticks, int pixels);
|
||||
|
||||
void DrawRect(const GSVector4i& r, const GSVertexSW& v);
|
||||
|
||||
|
|
|
@ -63,6 +63,7 @@ L("loop");
|
|||
// ecx = steps
|
||||
// esi = fzbr
|
||||
// edi = fzbc
|
||||
// ebp = za
|
||||
// - xmm0
|
||||
// xmm2 = s/u (tme)
|
||||
// xmm3 = t/v (tme)
|
||||
|
@ -688,7 +689,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
|
|||
|
||||
mov(ebx, ptr[&m_local.gd->tex[0]]);
|
||||
|
||||
if(m_sel.tlu)
|
||||
{
|
||||
mov(edx, ptr[&m_local.gd->clut]);
|
||||
}
|
||||
|
||||
// ebx = tex
|
||||
// edx = clut
|
||||
|
||||
if(!m_sel.fst)
|
||||
{
|
||||
|
@ -1095,7 +1102,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
|
|||
return;
|
||||
}
|
||||
|
||||
mov(edx, (size_t)m_local.gd->tex);
|
||||
push(ebp);
|
||||
|
||||
mov(ebp, (size_t)m_local.gd->tex);
|
||||
|
||||
if(m_sel.tlu)
|
||||
{
|
||||
mov(edx, ptr[&m_local.gd->clut]);
|
||||
}
|
||||
|
||||
if(!m_sel.fst)
|
||||
{
|
||||
|
@ -1477,255 +1491,258 @@ return;
|
|||
vpsrlw(xmm6, 8);
|
||||
}
|
||||
|
||||
if(m_sel.mmin == 1) return; // round-off mode
|
||||
|
||||
vmovdqa(ptr[&m_local.temp.trb], xmm5);
|
||||
vmovdqa(ptr[&m_local.temp.tga], xmm6);
|
||||
|
||||
vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]);
|
||||
vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]);
|
||||
|
||||
vpsrad(xmm2, 1);
|
||||
vpsrad(xmm3, 1);
|
||||
|
||||
vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
|
||||
vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
|
||||
|
||||
vpsrlw(xmm5, 1);
|
||||
vpsrlw(xmm6, 1);
|
||||
|
||||
if(m_sel.ltf)
|
||||
if(m_sel.mmin != 1) // !round-off mode
|
||||
{
|
||||
// u -= 0x8000;
|
||||
// v -= 0x8000;
|
||||
vmovdqa(ptr[&m_local.temp.trb], xmm5);
|
||||
vmovdqa(ptr[&m_local.temp.tga], xmm6);
|
||||
|
||||
mov(eax, 0x8000);
|
||||
vmovd(xmm4, eax);
|
||||
vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]);
|
||||
vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]);
|
||||
|
||||
vpsubd(xmm2, xmm4);
|
||||
vpsubd(xmm3, xmm4);
|
||||
vpsrad(xmm2, 1);
|
||||
vpsrad(xmm3, 1);
|
||||
|
||||
// GSVector4i uf = u.xxzzlh().srl16(1);
|
||||
vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
|
||||
vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
|
||||
|
||||
vpsrlw(xmm5, 1);
|
||||
vpsrlw(xmm6, 1);
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// u -= 0x8000;
|
||||
// v -= 0x8000;
|
||||
|
||||
mov(eax, 0x8000);
|
||||
vmovd(xmm4, eax);
|
||||
vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
vpsubd(xmm2, xmm4);
|
||||
vpsubd(xmm3, xmm4);
|
||||
|
||||
// GSVector4i uf = u.xxzzlh().srl16(1);
|
||||
|
||||
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpsrlw(xmm0, 1);
|
||||
vmovdqa(ptr[&m_local.temp.uf], xmm0);
|
||||
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpsrlw(xmm0, 1);
|
||||
vmovdqa(ptr[&m_local.temp.uf], xmm0);
|
||||
|
||||
// GSVector4i vf = v.xxzzlh().srl16(1);
|
||||
// GSVector4i vf = v.xxzzlh().srl16(1);
|
||||
|
||||
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpsrlw(xmm0, 1);
|
||||
vmovdqa(ptr[&m_local.temp.vf], xmm0);
|
||||
}
|
||||
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpsrlw(xmm0, 1);
|
||||
vmovdqa(ptr[&m_local.temp.vf], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
|
||||
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
|
||||
|
||||
vpsrad(xmm2, 16);
|
||||
vpsrad(xmm3, 16);
|
||||
vpackssdw(xmm2, xmm3);
|
||||
vpsrad(xmm2, 16);
|
||||
vpsrad(xmm3, 16);
|
||||
vpackssdw(xmm2, xmm3);
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
|
||||
|
||||
vpcmpeqd(xmm1, xmm1);
|
||||
vpsrlw(xmm1, 15);
|
||||
vpaddw(xmm3, xmm2, xmm1);
|
||||
vpcmpeqd(xmm1, xmm1);
|
||||
vpsrlw(xmm1, 15);
|
||||
vpaddw(xmm3, xmm2, xmm1);
|
||||
|
||||
// uv0 = Wrap(uv0);
|
||||
// uv1 = Wrap(uv1);
|
||||
// uv0 = Wrap(uv0);
|
||||
// uv1 = Wrap(uv1);
|
||||
|
||||
WrapLOD(xmm2, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
// uv0 = Wrap(uv0);
|
||||
WrapLOD(xmm2, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
// uv0 = Wrap(uv0);
|
||||
|
||||
WrapLOD(xmm2);
|
||||
}
|
||||
WrapLOD(xmm2);
|
||||
}
|
||||
|
||||
// xmm2 = uv0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
// xmm2 = uv0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i x0 = uv0.upl16();
|
||||
// GSVector4i y0 = uv0.uph16() << tw;
|
||||
// GSVector4i x0 = uv0.upl16();
|
||||
// GSVector4i y0 = uv0.uph16() << tw;
|
||||
|
||||
vpxor(xmm0, xmm0);
|
||||
vpxor(xmm0, xmm0);
|
||||
|
||||
vpunpcklwd(xmm4, xmm2, xmm0);
|
||||
vpunpckhwd(xmm2, xmm2, xmm0);
|
||||
vpslld(xmm2, m_sel.tw + 3);
|
||||
|
||||
// xmm0 = 0
|
||||
// xmm2 = y0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm4 = x0
|
||||
// xmm1, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i x1 = uv1.upl16();
|
||||
// GSVector4i y1 = uv1.uph16() << tw;
|
||||
|
||||
vpunpcklwd(xmm6, xmm3, xmm0);
|
||||
vpunpckhwd(xmm3, xmm3, xmm0);
|
||||
vpslld(xmm3, m_sel.tw + 3);
|
||||
vpunpcklwd(xmm4, xmm2, xmm0);
|
||||
vpunpckhwd(xmm2, xmm2, xmm0);
|
||||
vpslld(xmm2, m_sel.tw + 3);
|
||||
|
||||
// xmm0 = 0
|
||||
// xmm2 = y0
|
||||
// xmm3 = y1
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm4 = x0
|
||||
// xmm6 = x1
|
||||
// xmm0, xmm5, xmm6 = free
|
||||
// xmm1, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// GSVector4i addr01 = y0 + x1;
|
||||
// GSVector4i addr10 = y1 + x0;
|
||||
// GSVector4i addr11 = y1 + x1;
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i x1 = uv1.upl16();
|
||||
// GSVector4i y1 = uv1.uph16() << tw;
|
||||
|
||||
vpaddd(xmm5, xmm2, xmm4);
|
||||
vpaddd(xmm2, xmm2, xmm6);
|
||||
vpaddd(xmm0, xmm3, xmm4);
|
||||
vpaddd(xmm3, xmm3, xmm6);
|
||||
vpunpcklwd(xmm6, xmm3, xmm0);
|
||||
vpunpckhwd(xmm3, xmm3, xmm0);
|
||||
vpslld(xmm3, m_sel.tw + 3);
|
||||
|
||||
// xmm5 = addr00
|
||||
// xmm2 = addr01
|
||||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// xmm1, xmm4, xmm6 = free
|
||||
// xmm7 = used
|
||||
// xmm2 = y0
|
||||
// xmm3 = y1
|
||||
// xmm4 = x0
|
||||
// xmm6 = x1
|
||||
// xmm0, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// GSVector4i addr01 = y0 + x1;
|
||||
// GSVector4i addr10 = y1 + x0;
|
||||
// GSVector4i addr11 = y1 + x1;
|
||||
|
||||
ReadTexel(4, 1);
|
||||
vpaddd(xmm5, xmm2, xmm4);
|
||||
vpaddd(xmm2, xmm2, xmm6);
|
||||
vpaddd(xmm0, xmm3, xmm4);
|
||||
vpaddd(xmm3, xmm3, xmm6);
|
||||
|
||||
// xmm6 = c00
|
||||
// xmm4 = c01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm0, xmm2, xmm3 = free
|
||||
// xmm7 = used
|
||||
// xmm5 = addr00
|
||||
// xmm2 = addr01
|
||||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// xmm1, xmm4, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
vmovdqa(xmm0, ptr[&m_local.temp.uf]);
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
|
||||
// GSVector4i rb00 = c00 & mask;
|
||||
// GSVector4i ga00 = (c00 >> 8) & mask;
|
||||
ReadTexel(4, 1);
|
||||
|
||||
vpsllw(xmm2, xmm6, 8);
|
||||
vpsrlw(xmm2, 8);
|
||||
vpsrlw(xmm6, 8);
|
||||
// xmm6 = c00
|
||||
// xmm4 = c01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm0, xmm2, xmm3 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i rb01 = c01 & mask;
|
||||
// GSVector4i ga01 = (c01 >> 8) & mask;
|
||||
vmovdqa(xmm0, ptr[&m_local.temp.uf]);
|
||||
|
||||
vpsllw(xmm3, xmm4, 8);
|
||||
vpsrlw(xmm3, 8);
|
||||
vpsrlw(xmm4, 8);
|
||||
// GSVector4i rb00 = c00 & mask;
|
||||
// GSVector4i ga00 = (c00 >> 8) & mask;
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm2 = rb00
|
||||
// xmm3 = rb01
|
||||
// xmm6 = ga00
|
||||
// xmm4 = ga01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm7 = used
|
||||
vpsllw(xmm2, xmm6, 8);
|
||||
vpsrlw(xmm2, 8);
|
||||
vpsrlw(xmm6, 8);
|
||||
|
||||
// rb00 = rb00.lerp16<0>(rb01, uf);
|
||||
// ga00 = ga00.lerp16<0>(ga01, uf);
|
||||
// GSVector4i rb01 = c01 & mask;
|
||||
// GSVector4i ga01 = (c01 >> 8) & mask;
|
||||
|
||||
lerp16(xmm3, xmm2, xmm0, 0);
|
||||
lerp16(xmm4, xmm6, xmm0, 0);
|
||||
vpsllw(xmm3, xmm4, 8);
|
||||
vpsrlw(xmm3, 8);
|
||||
vpsrlw(xmm4, 8);
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm2, xmm6 = free
|
||||
// xmm7 = used
|
||||
// xmm0 = uf
|
||||
// xmm2 = rb00
|
||||
// xmm3 = rb01
|
||||
// xmm6 = ga00
|
||||
// xmm4 = ga01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i rb10 = c10 & mask;
|
||||
// GSVector4i ga10 = (c10 >> 8) & mask;
|
||||
// rb00 = rb00.lerp16<0>(rb01, uf);
|
||||
// ga00 = ga00.lerp16<0>(ga01, uf);
|
||||
|
||||
vpsrlw(xmm2, xmm1, 8);
|
||||
vpsllw(xmm1, 8);
|
||||
vpsrlw(xmm1, 8);
|
||||
lerp16(xmm3, xmm2, xmm0, 0);
|
||||
lerp16(xmm4, xmm6, xmm0, 0);
|
||||
|
||||
// GSVector4i rb11 = c11 & mask;
|
||||
// GSVector4i ga11 = (c11 >> 8) & mask;
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm2, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
vpsrlw(xmm6, xmm5, 8);
|
||||
vpsllw(xmm5, 8);
|
||||
vpsrlw(xmm5, 8);
|
||||
// GSVector4i rb10 = c10 & mask;
|
||||
// GSVector4i ga10 = (c10 >> 8) & mask;
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = rb10
|
||||
// xmm5 = rb11
|
||||
// xmm2 = ga10
|
||||
// xmm6 = ga11
|
||||
// xmm7 = used
|
||||
vpsrlw(xmm2, xmm1, 8);
|
||||
vpsllw(xmm1, 8);
|
||||
vpsrlw(xmm1, 8);
|
||||
|
||||
// rb10 = rb10.lerp16<0>(rb11, uf);
|
||||
// ga10 = ga10.lerp16<0>(ga11, uf);
|
||||
// GSVector4i rb11 = c11 & mask;
|
||||
// GSVector4i ga11 = (c11 >> 8) & mask;
|
||||
|
||||
lerp16(xmm5, xmm1, xmm0, 0);
|
||||
lerp16(xmm6, xmm2, xmm0, 0);
|
||||
vpsrlw(xmm6, xmm5, 8);
|
||||
vpsllw(xmm5, 8);
|
||||
vpsrlw(xmm5, 8);
|
||||
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm5 = rb10
|
||||
// xmm6 = ga10
|
||||
// xmm0, xmm1, xmm2 = free
|
||||
// xmm7 = used
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = rb10
|
||||
// xmm5 = rb11
|
||||
// xmm2 = ga10
|
||||
// xmm6 = ga11
|
||||
// xmm7 = used
|
||||
|
||||
// rb00 = rb00.lerp16<0>(rb10, vf);
|
||||
// ga00 = ga00.lerp16<0>(ga10, vf);
|
||||
// rb10 = rb10.lerp16<0>(rb11, uf);
|
||||
// ga10 = ga10.lerp16<0>(ga11, uf);
|
||||
|
||||
vmovdqa(xmm0, ptr[&m_local.temp.vf]);
|
||||
lerp16(xmm5, xmm1, xmm0, 0);
|
||||
lerp16(xmm6, xmm2, xmm0, 0);
|
||||
|
||||
lerp16(xmm5, xmm3, xmm0, 0);
|
||||
lerp16(xmm6, xmm4, xmm0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm5 = rb10
|
||||
// xmm6 = ga10
|
||||
// xmm0, xmm1, xmm2 = free
|
||||
// xmm7 = used
|
||||
|
||||
vpaddd(xmm5, xmm2, xmm4);
|
||||
// rb00 = rb00.lerp16<0>(rb10, vf);
|
||||
// ga00 = ga00.lerp16<0>(ga10, vf);
|
||||
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
vmovdqa(xmm0, ptr[&m_local.temp.vf]);
|
||||
|
||||
ReadTexel(1, 1);
|
||||
lerp16(xmm5, xmm3, xmm0, 0);
|
||||
lerp16(xmm6, xmm4, xmm0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
|
||||
// GSVector4i mask = GSVector4i::x00ff();
|
||||
vpaddd(xmm5, xmm2, xmm4);
|
||||
|
||||
// c[0] = c00 & mask;
|
||||
// c[1] = (c00 >> 8) & mask;
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
|
||||
vpsllw(xmm5, xmm6, 8);
|
||||
vpsrlw(xmm5, 8);
|
||||
vpsrlw(xmm6, 8);
|
||||
ReadTexel(1, 1);
|
||||
|
||||
// GSVector4i mask = GSVector4i::x00ff();
|
||||
|
||||
// c[0] = c00 & mask;
|
||||
// c[1] = (c00 >> 8) & mask;
|
||||
|
||||
vpsllw(xmm5, xmm6, 8);
|
||||
vpsrlw(xmm5, 8);
|
||||
vpsrlw(xmm6, 8);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
|
||||
vpsrlw(xmm0, xmm0, 1);
|
||||
|
||||
vmovdqa(xmm2, ptr[&m_local.temp.trb]);
|
||||
vmovdqa(xmm3, ptr[&m_local.temp.tga]);
|
||||
|
||||
lerp16(xmm5, xmm2, xmm0, 0);
|
||||
lerp16(xmm6, xmm3, xmm0, 0);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
|
||||
vpsrlw(xmm0, xmm0, 1);
|
||||
|
||||
vmovdqa(xmm2, ptr[&m_local.temp.trb]);
|
||||
vmovdqa(xmm3, ptr[&m_local.temp.tga]);
|
||||
|
||||
lerp16(xmm5, xmm2, xmm0, 0);
|
||||
lerp16(xmm6, xmm3, xmm0, 0);
|
||||
pop(ebp);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
|
||||
|
@ -2592,8 +2609,9 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
|
|||
mov(eax, ptr[esp + _top]);
|
||||
and(eax, 3);
|
||||
shl(eax, 5);
|
||||
vpaddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]);
|
||||
vpaddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]);
|
||||
mov(ebp, ptr[&m_local.gd->dimx]);
|
||||
vpaddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
|
||||
vpaddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
|
||||
}
|
||||
|
||||
// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
|
||||
|
@ -2739,7 +2757,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// ebx = m_local.tex[0] (!m_sel.mmin)
|
||||
// edx = m_local.tex (m_sel.mmin)
|
||||
// ebp = m_local.tex (m_sel.mmin)
|
||||
// edx = m_local.clut (m_sel.tlu)
|
||||
|
||||
// out
|
||||
// xmm6 = c00
|
||||
|
@ -2765,7 +2784,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[j]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
for(int i = 0; i < pixels; i++)
|
||||
{
|
||||
|
@ -2784,7 +2803,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
if(m_sel.mmin && m_sel.lcm)
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[0]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
}
|
||||
|
||||
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
|
||||
|
@ -2801,7 +2820,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4];
|
||||
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
|
||||
|
||||
if(i == 0) vmovd(eax, addr);
|
||||
else vpextrd(eax, addr, i);
|
||||
|
|
|
@ -63,6 +63,7 @@ L("loop");
|
|||
// ecx = steps
|
||||
// esi = fzbr
|
||||
// edi = fzbc
|
||||
// ebp = za
|
||||
// - xmm0
|
||||
// xmm2 = s/u (tme)
|
||||
// xmm3 = t/v (tme)
|
||||
|
@ -693,7 +694,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
|
|||
|
||||
mov(ebx, ptr[&m_local.gd->tex[0]]);
|
||||
|
||||
if(m_sel.tlu)
|
||||
{
|
||||
mov(edx, ptr[&m_local.gd->clut]);
|
||||
}
|
||||
|
||||
// ebx = tex
|
||||
// edx = clut
|
||||
|
||||
if(!m_sel.fst)
|
||||
{
|
||||
|
@ -1144,7 +1151,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
|
|||
return;
|
||||
}
|
||||
|
||||
mov(edx, (size_t)m_local.gd->tex);
|
||||
push(ebp);
|
||||
|
||||
mov(ebp, (size_t)m_local.gd->tex);
|
||||
|
||||
if(m_sel.tlu)
|
||||
{
|
||||
mov(edx, ptr[&m_local.gd->clut]);
|
||||
}
|
||||
|
||||
if(!m_sel.fst)
|
||||
{
|
||||
|
@ -1544,267 +1558,270 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
|
|||
psrlw(xmm6, 8);
|
||||
}
|
||||
|
||||
if(m_sel.mmin == 1) return; // round-off mode
|
||||
|
||||
movdqa(ptr[&m_local.temp.trb], xmm5);
|
||||
movdqa(ptr[&m_local.temp.tga], xmm6);
|
||||
|
||||
movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
|
||||
movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
|
||||
|
||||
psrad(xmm2, 1);
|
||||
psrad(xmm3, 1);
|
||||
|
||||
movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
|
||||
movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
|
||||
|
||||
psrlw(xmm5, 1);
|
||||
psrlw(xmm6, 1);
|
||||
|
||||
if(m_sel.ltf)
|
||||
if(m_sel.mmin != 1) // !round-off mode
|
||||
{
|
||||
// u -= 0x8000;
|
||||
// v -= 0x8000;
|
||||
movdqa(ptr[&m_local.temp.trb], xmm5);
|
||||
movdqa(ptr[&m_local.temp.tga], xmm6);
|
||||
|
||||
mov(eax, 0x8000);
|
||||
movd(xmm4, eax);
|
||||
pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
|
||||
movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
|
||||
|
||||
psubd(xmm2, xmm4);
|
||||
psubd(xmm3, xmm4);
|
||||
psrad(xmm2, 1);
|
||||
psrad(xmm3, 1);
|
||||
|
||||
// GSVector4i uf = u.xxzzlh().srl16(1);
|
||||
movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
|
||||
movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
|
||||
|
||||
psrlw(xmm5, 1);
|
||||
psrlw(xmm6, 1);
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// u -= 0x8000;
|
||||
// v -= 0x8000;
|
||||
|
||||
mov(eax, 0x8000);
|
||||
movd(xmm4, eax);
|
||||
pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
psubd(xmm2, xmm4);
|
||||
psubd(xmm3, xmm4);
|
||||
|
||||
// GSVector4i uf = u.xxzzlh().srl16(1);
|
||||
|
||||
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
psrlw(xmm0, 1);
|
||||
movdqa(ptr[&m_local.temp.uf], xmm0);
|
||||
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
psrlw(xmm0, 1);
|
||||
movdqa(ptr[&m_local.temp.uf], xmm0);
|
||||
|
||||
// GSVector4i vf = v.xxzzlh().srl16(1);
|
||||
// GSVector4i vf = v.xxzzlh().srl16(1);
|
||||
|
||||
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
psrlw(xmm0, 1);
|
||||
movdqa(ptr[&m_local.temp.vf], xmm0);
|
||||
}
|
||||
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
psrlw(xmm0, 1);
|
||||
movdqa(ptr[&m_local.temp.vf], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
|
||||
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
|
||||
|
||||
psrad(xmm2, 16);
|
||||
psrad(xmm3, 16);
|
||||
packssdw(xmm2, xmm3);
|
||||
psrad(xmm2, 16);
|
||||
psrad(xmm3, 16);
|
||||
packssdw(xmm2, xmm3);
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
|
||||
|
||||
movdqa(xmm3, xmm2);
|
||||
pcmpeqd(xmm1, xmm1);
|
||||
psrlw(xmm1, 15);
|
||||
paddw(xmm3, xmm1);
|
||||
movdqa(xmm3, xmm2);
|
||||
pcmpeqd(xmm1, xmm1);
|
||||
psrlw(xmm1, 15);
|
||||
paddw(xmm3, xmm1);
|
||||
|
||||
// uv0 = Wrap(uv0);
|
||||
// uv1 = Wrap(uv1);
|
||||
// uv0 = Wrap(uv0);
|
||||
// uv1 = Wrap(uv1);
|
||||
|
||||
WrapLOD(xmm2, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
// uv0 = Wrap(uv0);
|
||||
WrapLOD(xmm2, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
// uv0 = Wrap(uv0);
|
||||
|
||||
WrapLOD(xmm2);
|
||||
}
|
||||
WrapLOD(xmm2);
|
||||
}
|
||||
|
||||
// xmm2 = uv0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
// xmm2 = uv0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i x0 = uv0.upl16();
|
||||
// GSVector4i y0 = uv0.uph16() << tw;
|
||||
// GSVector4i x0 = uv0.upl16();
|
||||
// GSVector4i y0 = uv0.uph16() << tw;
|
||||
|
||||
pxor(xmm0, xmm0);
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
movdqa(xmm4, xmm2);
|
||||
punpckhwd(xmm2, xmm0);
|
||||
punpcklwd(xmm4, xmm0);
|
||||
pslld(xmm2, m_sel.tw + 3);
|
||||
|
||||
// xmm0 = 0
|
||||
// xmm2 = y0
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm4 = x0
|
||||
// xmm1, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i x1 = uv1.upl16();
|
||||
// GSVector4i y1 = uv1.uph16() << tw;
|
||||
|
||||
movdqa(xmm6, xmm3);
|
||||
punpckhwd(xmm3, xmm0);
|
||||
punpcklwd(xmm6, xmm0);
|
||||
pslld(xmm3, m_sel.tw + 3);
|
||||
movdqa(xmm4, xmm2);
|
||||
punpckhwd(xmm2, xmm0);
|
||||
punpcklwd(xmm4, xmm0);
|
||||
pslld(xmm2, m_sel.tw + 3);
|
||||
|
||||
// xmm0 = 0
|
||||
// xmm2 = y0
|
||||
// xmm3 = y1
|
||||
// xmm3 = uv1 (ltf)
|
||||
// xmm4 = x0
|
||||
// xmm6 = x1
|
||||
// xmm0, xmm5, xmm6 = free
|
||||
// xmm1, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// GSVector4i addr01 = y0 + x1;
|
||||
// GSVector4i addr10 = y1 + x0;
|
||||
// GSVector4i addr11 = y1 + x1;
|
||||
if(m_sel.ltf)
|
||||
{
|
||||
// GSVector4i x1 = uv1.upl16();
|
||||
// GSVector4i y1 = uv1.uph16() << tw;
|
||||
|
||||
movdqa(xmm5, xmm2);
|
||||
paddd(xmm5, xmm4);
|
||||
paddd(xmm2, xmm6);
|
||||
movdqa(xmm6, xmm3);
|
||||
punpckhwd(xmm3, xmm0);
|
||||
punpcklwd(xmm6, xmm0);
|
||||
pslld(xmm3, m_sel.tw + 3);
|
||||
|
||||
movdqa(xmm0, xmm3);
|
||||
paddd(xmm0, xmm4);
|
||||
paddd(xmm3, xmm6);
|
||||
// xmm2 = y0
|
||||
// xmm3 = y1
|
||||
// xmm4 = x0
|
||||
// xmm6 = x1
|
||||
// xmm0, xmm5, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
// xmm5 = addr00
|
||||
// xmm2 = addr01
|
||||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// xmm1, xmm4, xmm6 = free
|
||||
// xmm7 = used
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// GSVector4i addr01 = y0 + x1;
|
||||
// GSVector4i addr10 = y1 + x0;
|
||||
// GSVector4i addr11 = y1 + x1;
|
||||
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
movdqa(xmm5, xmm2);
|
||||
paddd(xmm5, xmm4);
|
||||
paddd(xmm2, xmm6);
|
||||
|
||||
ReadTexel(4, 1);
|
||||
movdqa(xmm0, xmm3);
|
||||
paddd(xmm0, xmm4);
|
||||
paddd(xmm3, xmm6);
|
||||
|
||||
// xmm6 = c00
|
||||
// xmm4 = c01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm0, xmm2, xmm3 = free
|
||||
// xmm7 = used
|
||||
// xmm5 = addr00
|
||||
// xmm2 = addr01
|
||||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// xmm1, xmm4, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
movdqa(xmm0, ptr[&m_local.temp.uf]);
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
|
||||
// GSVector4i rb00 = c00 & mask;
|
||||
// GSVector4i ga00 = (c00 >> 8) & mask;
|
||||
ReadTexel(4, 1);
|
||||
|
||||
movdqa(xmm2, xmm6);
|
||||
psllw(xmm2, 8);
|
||||
psrlw(xmm2, 8);
|
||||
psrlw(xmm6, 8);
|
||||
// xmm6 = c00
|
||||
// xmm4 = c01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm0, xmm2, xmm3 = free
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i rb01 = c01 & mask;
|
||||
// GSVector4i ga01 = (c01 >> 8) & mask;
|
||||
movdqa(xmm0, ptr[&m_local.temp.uf]);
|
||||
|
||||
movdqa(xmm3, xmm4);
|
||||
psllw(xmm3, 8);
|
||||
psrlw(xmm3, 8);
|
||||
psrlw(xmm4, 8);
|
||||
// GSVector4i rb00 = c00 & mask;
|
||||
// GSVector4i ga00 = (c00 >> 8) & mask;
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm2 = rb00
|
||||
// xmm3 = rb01
|
||||
// xmm6 = ga00
|
||||
// xmm4 = ga01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm7 = used
|
||||
movdqa(xmm2, xmm6);
|
||||
psllw(xmm2, 8);
|
||||
psrlw(xmm2, 8);
|
||||
psrlw(xmm6, 8);
|
||||
|
||||
// rb00 = rb00.lerp16<0>(rb01, uf);
|
||||
// ga00 = ga00.lerp16<0>(ga01, uf);
|
||||
// GSVector4i rb01 = c01 & mask;
|
||||
// GSVector4i ga01 = (c01 >> 8) & mask;
|
||||
|
||||
lerp16(xmm3, xmm2, xmm0, 0);
|
||||
lerp16(xmm4, xmm6, xmm0, 0);
|
||||
movdqa(xmm3, xmm4);
|
||||
psllw(xmm3, 8);
|
||||
psrlw(xmm3, 8);
|
||||
psrlw(xmm4, 8);
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm2, xmm6 = free
|
||||
// xmm7 = used
|
||||
// xmm0 = uf
|
||||
// xmm2 = rb00
|
||||
// xmm3 = rb01
|
||||
// xmm6 = ga00
|
||||
// xmm4 = ga01
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm7 = used
|
||||
|
||||
// GSVector4i rb10 = c10 & mask;
|
||||
// GSVector4i ga10 = (c10 >> 8) & mask;
|
||||
// rb00 = rb00.lerp16<0>(rb01, uf);
|
||||
// ga00 = ga00.lerp16<0>(ga01, uf);
|
||||
|
||||
movdqa(xmm2, xmm1);
|
||||
psllw(xmm1, 8);
|
||||
psrlw(xmm1, 8);
|
||||
psrlw(xmm2, 8);
|
||||
lerp16(xmm3, xmm2, xmm0, 0);
|
||||
lerp16(xmm4, xmm6, xmm0, 0);
|
||||
|
||||
// GSVector4i rb11 = c11 & mask;
|
||||
// GSVector4i ga11 = (c11 >> 8) & mask;
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = c10
|
||||
// xmm5 = c11
|
||||
// xmm2, xmm6 = free
|
||||
// xmm7 = used
|
||||
|
||||
movdqa(xmm6, xmm5);
|
||||
psllw(xmm5, 8);
|
||||
psrlw(xmm5, 8);
|
||||
psrlw(xmm6, 8);
|
||||
// GSVector4i rb10 = c10 & mask;
|
||||
// GSVector4i ga10 = (c10 >> 8) & mask;
|
||||
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = rb10
|
||||
// xmm5 = rb11
|
||||
// xmm2 = ga10
|
||||
// xmm6 = ga11
|
||||
// xmm7 = used
|
||||
movdqa(xmm2, xmm1);
|
||||
psllw(xmm1, 8);
|
||||
psrlw(xmm1, 8);
|
||||
psrlw(xmm2, 8);
|
||||
|
||||
// rb10 = rb10.lerp16<0>(rb11, uf);
|
||||
// ga10 = ga10.lerp16<0>(ga11, uf);
|
||||
// GSVector4i rb11 = c11 & mask;
|
||||
// GSVector4i ga11 = (c11 >> 8) & mask;
|
||||
|
||||
lerp16(xmm5, xmm1, xmm0, 0);
|
||||
lerp16(xmm6, xmm2, xmm0, 0);
|
||||
movdqa(xmm6, xmm5);
|
||||
psllw(xmm5, 8);
|
||||
psrlw(xmm5, 8);
|
||||
psrlw(xmm6, 8);
|
||||
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm5 = rb10
|
||||
// xmm6 = ga10
|
||||
// xmm0, xmm1, xmm2 = free
|
||||
// xmm7 = used
|
||||
// xmm0 = uf
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm1 = rb10
|
||||
// xmm5 = rb11
|
||||
// xmm2 = ga10
|
||||
// xmm6 = ga11
|
||||
// xmm7 = used
|
||||
|
||||
// rb00 = rb00.lerp16<0>(rb10, vf);
|
||||
// ga00 = ga00.lerp16<0>(ga10, vf);
|
||||
// rb10 = rb10.lerp16<0>(rb11, uf);
|
||||
// ga10 = ga10.lerp16<0>(ga11, uf);
|
||||
|
||||
movdqa(xmm0, ptr[&m_local.temp.vf]);
|
||||
lerp16(xmm5, xmm1, xmm0, 0);
|
||||
lerp16(xmm6, xmm2, xmm0, 0);
|
||||
|
||||
lerp16(xmm5, xmm3, xmm0, 0);
|
||||
lerp16(xmm6, xmm4, xmm0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
// xmm3 = rb00
|
||||
// xmm4 = ga00
|
||||
// xmm5 = rb10
|
||||
// xmm6 = ga10
|
||||
// xmm0, xmm1, xmm2 = free
|
||||
// xmm7 = used
|
||||
|
||||
paddd(xmm2, xmm4);
|
||||
movdqa(xmm5, xmm2);
|
||||
// rb00 = rb00.lerp16<0>(rb10, vf);
|
||||
// ga00 = ga00.lerp16<0>(ga10, vf);
|
||||
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
movdqa(xmm0, ptr[&m_local.temp.vf]);
|
||||
|
||||
ReadTexel(1, 1);
|
||||
lerp16(xmm5, xmm3, xmm0, 0);
|
||||
lerp16(xmm6, xmm4, xmm0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i addr00 = y0 + x0;
|
||||
|
||||
// GSVector4i mask = GSVector4i::x00ff();
|
||||
paddd(xmm2, xmm4);
|
||||
movdqa(xmm5, xmm2);
|
||||
|
||||
// c[0] = c00 & mask;
|
||||
// c[1] = (c00 >> 8) & mask;
|
||||
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
||||
|
||||
movdqa(xmm5, xmm6);
|
||||
psllw(xmm5, 8);
|
||||
psrlw(xmm5, 8);
|
||||
psrlw(xmm6, 8);
|
||||
ReadTexel(1, 1);
|
||||
|
||||
// GSVector4i mask = GSVector4i::x00ff();
|
||||
|
||||
// c[0] = c00 & mask;
|
||||
// c[1] = (c00 >> 8) & mask;
|
||||
|
||||
movdqa(xmm5, xmm6);
|
||||
psllw(xmm5, 8);
|
||||
psrlw(xmm5, 8);
|
||||
psrlw(xmm6, 8);
|
||||
}
|
||||
|
||||
movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
|
||||
psrlw(xmm0, 1);
|
||||
|
||||
movdqa(xmm2, ptr[&m_local.temp.trb]);
|
||||
movdqa(xmm3, ptr[&m_local.temp.tga]);
|
||||
|
||||
lerp16(xmm5, xmm2, xmm0, 0);
|
||||
lerp16(xmm6, xmm3, xmm0, 0);
|
||||
}
|
||||
|
||||
movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
|
||||
psrlw(xmm0, 1);
|
||||
|
||||
movdqa(xmm2, ptr[&m_local.temp.trb]);
|
||||
movdqa(xmm3, ptr[&m_local.temp.tga]);
|
||||
|
||||
lerp16(xmm5, xmm2, xmm0, 0);
|
||||
lerp16(xmm6, xmm3, xmm0, 0);
|
||||
pop(ebp);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
|
||||
|
@ -2727,8 +2744,9 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
|
|||
mov(eax, ptr[esp + _top]);
|
||||
and(eax, 3);
|
||||
shl(eax, 5);
|
||||
paddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]);
|
||||
paddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]);
|
||||
mov(ebp, ptr[&m_local.gd->dimx]);
|
||||
paddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
|
||||
paddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
|
||||
}
|
||||
|
||||
// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
|
||||
|
@ -2902,7 +2920,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
// xmm0 = addr10
|
||||
// xmm3 = addr11
|
||||
// ebx = m_local.tex[0] (!m_sel.mmin)
|
||||
// edx = m_local.tex (m_sel.mmin)
|
||||
// ebp = m_local.tex (m_sel.mmin)
|
||||
// edx = m_local.clut (m_sel.tlu)
|
||||
|
||||
// out
|
||||
// xmm6 = c00
|
||||
|
@ -2930,7 +2949,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[j]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
for(int i = 0; i < pixels; i++)
|
||||
{
|
||||
|
@ -2951,7 +2970,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
movdqa(ptr[&m_local.temp.test], xmm7);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[0]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm6, xmm5, 0);
|
||||
psrldq(xmm5, 4);
|
||||
|
@ -2959,7 +2978,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
psrldq(xmm2, 4);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[1]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm1, xmm5, 0);
|
||||
psrldq(xmm5, 4);
|
||||
|
@ -2970,7 +2989,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpckldq(xmm4, xmm7);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[2]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm1, xmm5, 0);
|
||||
psrldq(xmm5, 4);
|
||||
|
@ -2978,7 +2997,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
psrldq(xmm2, 4);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[3]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm5, xmm5, 0);
|
||||
ReadTexel(xmm2, xmm2, 0);
|
||||
|
@ -2990,7 +3009,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpcklqdq(xmm4, xmm7);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[0]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm1, xmm0, 0);
|
||||
psrldq(xmm0, 4);
|
||||
|
@ -2998,7 +3017,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
psrldq(xmm3, 4);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[1]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm2, xmm0, 0);
|
||||
psrldq(xmm0, 4);
|
||||
|
@ -3009,7 +3028,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpckldq(xmm5, xmm7);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[2]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm2, xmm0, 0);
|
||||
psrldq(xmm0, 4);
|
||||
|
@ -3017,7 +3036,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
psrldq(xmm3, 4);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[3]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm0, xmm0, 0);
|
||||
ReadTexel(xmm3, xmm3, 0);
|
||||
|
@ -3033,13 +3052,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
else
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[0]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm6, xmm5, 0);
|
||||
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[1]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm1, xmm5, 0);
|
||||
psrldq(xmm5, 4);
|
||||
|
@ -3047,13 +3066,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpckldq(xmm6, xmm1);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[2]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm1, xmm5, 0);
|
||||
psrldq(xmm5, 4);
|
||||
|
||||
mov(ebx, ptr[&lod_i->u32[3]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
|
||||
ReadTexel(xmm4, xmm5, 0);
|
||||
// psrldq(xmm5, 4);
|
||||
|
@ -3070,7 +3089,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
if(m_sel.mmin && m_sel.lcm)
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[0]]);
|
||||
mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
}
|
||||
|
||||
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
|
||||
|
@ -3117,7 +3136,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4];
|
||||
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
|
||||
|
||||
#if _M_SSE < 0x401
|
||||
|
||||
|
|
|
@ -26,30 +26,13 @@
|
|||
#include "xbyak/xbyak.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
|
||||
struct GSRasterizerStats
|
||||
{
|
||||
int64 ticks;
|
||||
int prims, pixels;
|
||||
|
||||
GSRasterizerStats()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
void Reset()
|
||||
{
|
||||
ticks = 0;
|
||||
pixels = prims = 0;
|
||||
}
|
||||
};
|
||||
|
||||
template<class KEY, class VALUE> class GSFunctionMap
|
||||
{
|
||||
protected:
|
||||
struct ActivePtr
|
||||
{
|
||||
uint64 frame, frames;
|
||||
int64 ticks, pixels;
|
||||
uint64 ticks, pixels;
|
||||
VALUE f;
|
||||
};
|
||||
|
||||
|
@ -101,7 +84,7 @@ public:
|
|||
return m_active->f;
|
||||
}
|
||||
|
||||
void UpdateStats(const GSRasterizerStats& stats, uint64 frame)
|
||||
void UpdateStats(uint64 frame, uint64 ticks, int pixels)
|
||||
{
|
||||
if(m_active)
|
||||
{
|
||||
|
@ -111,14 +94,14 @@ public:
|
|||
m_active->frames++;
|
||||
}
|
||||
|
||||
m_active->pixels += stats.pixels;
|
||||
m_active->ticks += stats.ticks;
|
||||
m_active->ticks += ticks;
|
||||
m_active->pixels += pixels;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void PrintStats()
|
||||
{
|
||||
int64 ttpf = 0;
|
||||
uint64 ttpf = 0;
|
||||
|
||||
typename hash_map<KEY, ActivePtr*>::iterator i;
|
||||
|
||||
|
@ -141,9 +124,9 @@ public:
|
|||
|
||||
if(p->frames > 0)
|
||||
{
|
||||
int64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0;
|
||||
int64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
|
||||
int64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
|
||||
uint64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0;
|
||||
uint64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
|
||||
uint64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
|
||||
|
||||
printf("[%014llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n",
|
||||
(uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ',
|
||||
|
|
|
@ -25,18 +25,27 @@
|
|||
#include "GSRasterizer.h"
|
||||
|
||||
#define THREAD_HEIGHT 5
|
||||
//#define THREAD_HEIGHT 1
|
||||
|
||||
GSRasterizer::GSRasterizer(IDrawScanline* ds)
|
||||
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
|
||||
: m_ds(ds)
|
||||
, m_id(-1)
|
||||
, m_threads(-1)
|
||||
, m_id(id)
|
||||
, m_threads(threads)
|
||||
{
|
||||
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
|
||||
m_edge.count = 0;
|
||||
|
||||
m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
|
||||
|
||||
Init(0, 1);
|
||||
int row = 0;
|
||||
|
||||
while(row < (2048 >> THREAD_HEIGHT))
|
||||
{
|
||||
for(int i = 0; i < threads; i++, row++)
|
||||
{
|
||||
m_myscanline[row] = i == id ? 1 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GSRasterizer::~GSRasterizer()
|
||||
|
@ -67,82 +76,52 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
|
|||
return false;
|
||||
}
|
||||
|
||||
void GSRasterizer::Init(int id, int threads)
|
||||
void GSRasterizer::Queue(shared_ptr<GSRasterizerData> data)
|
||||
{
|
||||
if(m_id != id || m_threads != threads)
|
||||
{
|
||||
m_id = id;
|
||||
m_threads = threads;
|
||||
|
||||
if(threads > 1)
|
||||
{
|
||||
int row = 0;
|
||||
|
||||
while(row < (2048 >> THREAD_HEIGHT))
|
||||
{
|
||||
for(int i = 0; i < threads; i++, row++)
|
||||
{
|
||||
m_myscanline[row] = i == id ? 1 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memset(m_myscanline, 1, 2048 >> THREAD_HEIGHT);
|
||||
}
|
||||
}
|
||||
Draw(data);
|
||||
}
|
||||
|
||||
void GSRasterizer::Draw(const GSRasterizerData* data)
|
||||
void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data)
|
||||
{
|
||||
m_ds->BeginDraw(data->param);
|
||||
|
||||
const GSVertexSW* vertices = data->vertices;
|
||||
const int count = data->count;
|
||||
|
||||
bool scissor_test = !data->bbox.eq(data->bbox.rintersect(data->scissor));
|
||||
|
||||
m_scissor = data->scissor;
|
||||
m_fscissor = GSVector4(data->scissor);
|
||||
|
||||
m_stats.Reset();
|
||||
m_pixels = 0;
|
||||
|
||||
uint64 start = __rdtsc();
|
||||
|
||||
// NOTE: data->scissor_test with templated Draw* speeds up large point lists (ffxii videos), but does not seem to make any difference for others
|
||||
|
||||
switch(data->primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
m_stats.prims = count;
|
||||
if(data->scissor_test) DrawPoint<true>(vertices, count);
|
||||
if(scissor_test) DrawPoint<true>(vertices, count);
|
||||
else DrawPoint<false>(vertices, count);
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
ASSERT(!(count & 1));
|
||||
m_stats.prims = count / 2;
|
||||
for(int i = 0; i < count; i += 2) DrawLine(&vertices[i]);
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
ASSERT(!(count % 3));
|
||||
m_stats.prims = count / 3;
|
||||
for(int i = 0; i < count; i += 3) DrawTriangle(&vertices[i]);
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
ASSERT(!(count & 1));
|
||||
m_stats.prims = count / 2;
|
||||
for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i]);
|
||||
for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i], data->solidrect);
|
||||
break;
|
||||
default:
|
||||
__assume(0);
|
||||
}
|
||||
|
||||
m_stats.ticks = __rdtsc() - start;
|
||||
uint64 ticks = __rdtsc() - start;
|
||||
|
||||
m_ds->EndDraw(m_stats, data->frame);
|
||||
}
|
||||
|
||||
void GSRasterizer::GetStats(GSRasterizerStats& stats)
|
||||
{
|
||||
stats = m_stats;
|
||||
m_ds->EndDraw(data->frame, ticks, m_pixels);
|
||||
}
|
||||
|
||||
template<bool scissor_test>
|
||||
|
@ -156,7 +135,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v, int count)
|
|||
{
|
||||
if(IsOneOfMyScanlines(p.y))
|
||||
{
|
||||
m_stats.pixels++;
|
||||
m_pixels++;
|
||||
|
||||
m_ds->SetupPrim(v, *v);
|
||||
|
||||
|
@ -174,7 +153,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
|
|||
|
||||
int i = (dp < dp.yxwz()).mask() & 1; // |dx| <= |dy|
|
||||
|
||||
if(m_ds->IsEdge())
|
||||
if(m_ds->HasEdge())
|
||||
{
|
||||
DrawEdge(v[0], v[1], dv, i, 0);
|
||||
DrawEdge(v[0], v[1], dv, i, 1);
|
||||
|
@ -218,7 +197,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
|
|||
|
||||
if(pixels > 0)
|
||||
{
|
||||
m_stats.pixels += pixels;
|
||||
m_pixels += pixels;
|
||||
|
||||
GSVertexSW dscan = dv / dv.p.xxxx();
|
||||
|
||||
|
@ -406,7 +385,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
|
|||
|
||||
Flush(v, dscan);
|
||||
|
||||
if(m_ds->IsEdge())
|
||||
if(m_ds->HasEdge())
|
||||
{
|
||||
GSVector4 a = dx.abs() < dy.abs(); // |dx| <= |dy|
|
||||
GSVector4 b = dx < GSVector4::zero(); // dx < 0
|
||||
|
@ -466,7 +445,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
|
|||
m_edge.count += e - &m_edge.buff[m_edge.count];
|
||||
}
|
||||
|
||||
void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
|
||||
void GSRasterizer::DrawSprite(const GSVertexSW* vertices, bool solidrect)
|
||||
{
|
||||
GSVertexSW v[2];
|
||||
|
||||
|
@ -487,13 +466,13 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
|
|||
|
||||
GSVertexSW scan = v[0];
|
||||
|
||||
if(m_ds->IsRect())
|
||||
if(solidrect)
|
||||
{
|
||||
if(m_id == 0)
|
||||
{
|
||||
m_ds->DrawRect(r, scan);
|
||||
|
||||
m_stats.pixels += r.width() * r.height();
|
||||
m_pixels += r.width() * r.height();
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -522,7 +501,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
|
|||
{
|
||||
if(IsOneOfMyScanlines(r.top))
|
||||
{
|
||||
m_stats.pixels += r.width();
|
||||
m_pixels += r.width();
|
||||
|
||||
m_ds->DrawScanline(r.width(), r.left, r.top, scan);
|
||||
}
|
||||
|
@ -754,7 +733,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
|
|||
int left = e->p.i16[1];
|
||||
int top = e->p.i16[2];
|
||||
|
||||
m_stats.pixels += pixels;
|
||||
m_pixels += pixels;
|
||||
|
||||
m_ds->DrawScanline(pixels, left, top, *e++);
|
||||
}
|
||||
|
@ -768,7 +747,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
|
|||
int left = e->p.i16[1];
|
||||
int top = e->p.i16[2];
|
||||
|
||||
m_stats.pixels += pixels;
|
||||
m_pixels += pixels;
|
||||
|
||||
m_ds->DrawEdge(pixels, left, top, *e++);
|
||||
}
|
||||
|
@ -781,117 +760,134 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
|
|||
|
||||
//
|
||||
|
||||
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, volatile long& sync)
|
||||
: GSRasterizer(ds)
|
||||
, m_sync(sync)
|
||||
, m_data(NULL)
|
||||
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads)
|
||||
: GSRasterizer(ds, id, threads)
|
||||
, m_exit(false)
|
||||
, m_break(true)
|
||||
, m_ready(true)
|
||||
{
|
||||
CreateThread();
|
||||
}
|
||||
|
||||
GSRasterizerMT::~GSRasterizerMT()
|
||||
{
|
||||
Init(0, 1);
|
||||
m_break = true;
|
||||
|
||||
Draw(NULL);
|
||||
m_exit = true;
|
||||
|
||||
m_draw.Set();
|
||||
|
||||
CloseThread();
|
||||
}
|
||||
|
||||
void GSRasterizerMT::Draw(const GSRasterizerData* data)
|
||||
void GSRasterizerMT::Queue(shared_ptr<GSRasterizerData> data)
|
||||
{
|
||||
m_data = data;
|
||||
GSAutoLock l(&m_lock);
|
||||
|
||||
m_draw.Set();
|
||||
m_queue.push(data);
|
||||
|
||||
if(m_break)
|
||||
{
|
||||
m_break = false;
|
||||
|
||||
m_ready = false;
|
||||
|
||||
m_draw.Set();
|
||||
}
|
||||
}
|
||||
|
||||
// Blocks the caller (busy-waiting with _mm_pause) until this rasterizer's
// pending queue is empty and the worker has reported itself ready again.
// NOTE(review): m_queue.empty() is read here without holding m_lock, while
// Queue() pushes and ThreadProc() swaps the queue under that lock - confirm
// this unsynchronized read is acceptable on the targeted platforms.
void GSRasterizerMT::Sync()
|
||||
{
|
||||
// spin until the pending queue has been emptied
while(!m_queue.empty()) _mm_pause();
|
||||
|
||||
// ask the worker to leave its spinning loop (ThreadProc loops while !m_break)
m_break = true;
|
||||
|
||||
// ThreadProc sets m_ready once it has left that loop
while(!m_ready) _mm_pause();
|
||||
}
|
||||
|
||||
void GSRasterizerMT::ThreadProc()
|
||||
{
|
||||
while(m_draw.Wait() && m_data != NULL)
|
||||
while(m_draw.Wait() && !m_exit)
|
||||
{
|
||||
GSRasterizer::Draw(m_data);
|
||||
// once we are running it is better to spin, jobs can be smaller than the cost of waking up every time
|
||||
|
||||
_interlockedbittestandreset(&m_sync, m_id);
|
||||
while(!m_break)
|
||||
{
|
||||
if(!m_queue.empty())
|
||||
{
|
||||
queue<shared_ptr<GSRasterizerData> > queue;
|
||||
|
||||
{
|
||||
GSAutoLock l(&m_lock);
|
||||
|
||||
queue.swap(m_queue);
|
||||
}
|
||||
|
||||
while(!queue.empty())
|
||||
{
|
||||
Draw(queue.front());
|
||||
|
||||
queue.pop();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_pause();
|
||||
}
|
||||
}
|
||||
|
||||
m_ready = true;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
GSRasterizerList::GSRasterizerList()
|
||||
: m_sync(0)
|
||||
: m_sync_count(0)
|
||||
, m_count(0)
|
||||
{
|
||||
}
|
||||
|
||||
GSRasterizerList::~GSRasterizerList()
|
||||
{
|
||||
for(size_t i = 0; i < size(); i++)
|
||||
for(vector<GSRasterizer*>::iterator i = begin(); i != end(); i++)
|
||||
{
|
||||
delete (*this)[i];
|
||||
delete *i;
|
||||
}
|
||||
}
|
||||
|
||||
void GSRasterizerList::Draw(const GSRasterizerData* data, int width, int height)
|
||||
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
|
||||
{
|
||||
m_stats.Reset();
|
||||
// TODO: do not send data to every thread, try to bin them (based on bbox & scissor)
|
||||
|
||||
m_start = __rdtsc();
|
||||
|
||||
m_threads = std::min<int>(1 + (height >> THREAD_HEIGHT), size());
|
||||
|
||||
m_sync = 0;
|
||||
|
||||
for(int i = 1; i < m_threads; i++)
|
||||
if(data->solidrect)
|
||||
{
|
||||
m_sync |= 1 << i;
|
||||
Sync();
|
||||
|
||||
front()->Draw(data);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
for(int i = 1; i < m_threads; i++)
|
||||
{
|
||||
(*this)[i]->Init(i, m_threads);
|
||||
(*this)[i]->Draw(data);
|
||||
for(int i = 0; i < size(); i++)
|
||||
{
|
||||
(*this)[i]->Queue(data);
|
||||
}
|
||||
|
||||
(*this)[0]->Init(0, m_threads);
|
||||
(*this)[0]->Draw(data);
|
||||
m_count++;
|
||||
}
|
||||
|
||||
void GSRasterizerList::Sync()
|
||||
{
|
||||
while(m_sync) _mm_pause();
|
||||
|
||||
m_stats.ticks = __rdtsc() - m_start;
|
||||
|
||||
for(int i = 0; i < m_threads; i++)
|
||||
if(m_count > 0)
|
||||
{
|
||||
GSRasterizerStats s;
|
||||
|
||||
(*this)[i]->GetStats(s);
|
||||
|
||||
m_stats.pixels += s.pixels;
|
||||
m_stats.prims = std::max<int>(m_stats.prims, s.prims);
|
||||
}
|
||||
}
|
||||
|
||||
void GSRasterizerList::GetStats(GSRasterizerStats& stats)
|
||||
{
|
||||
stats = m_stats;
|
||||
}
|
||||
|
||||
void GSRasterizerList::PrintStats()
|
||||
{
|
||||
if(!empty())
|
||||
{
|
||||
front()->PrintStats();
|
||||
|
||||
/*
|
||||
int index = 0;
|
||||
|
||||
for(std::vector<IRasterizer*>::iterator i = begin(); i != end(); i++)
|
||||
for(int i = 0; i < size(); i++)
|
||||
{
|
||||
printf("[Thread %d]\n", index++);
|
||||
|
||||
(*i)->PrintStats();
|
||||
(*this)[i]->Sync();
|
||||
}
|
||||
*/
|
||||
|
||||
m_sync_count++;
|
||||
|
||||
m_count = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,18 +27,32 @@
|
|||
#include "GSThread.h"
|
||||
#include "GSAlignedClass.h"
|
||||
|
||||
__aligned(class, 32) GSRasterizerData
|
||||
__aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
|
||||
{
|
||||
public:
|
||||
GSVector4i scissor;
|
||||
bool scissor_test;
|
||||
GSVector4i bbox;
|
||||
GS_PRIM_CLASS primclass;
|
||||
const GSVertexSW* vertices;
|
||||
GSVertexSW* vertices;
|
||||
int count;
|
||||
bool solidrect;
|
||||
uint64 frame;
|
||||
const void* param;
|
||||
void* param;
|
||||
|
||||
GSRasterizerData() : scissor_test(true) {}
|
||||
GSRasterizerData()
|
||||
: vertices(NULL)
|
||||
, count(0)
|
||||
, solidrect(false)
|
||||
, param(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~GSRasterizerData()
|
||||
{
|
||||
if(vertices != NULL) _aligned_free(vertices);
|
||||
|
||||
// derived class should free param and its members
|
||||
}
|
||||
};
|
||||
|
||||
class IDrawScanline : public GSAlignedClass<32>
|
||||
|
@ -59,8 +73,7 @@ public:
|
|||
virtual ~IDrawScanline() {}
|
||||
|
||||
virtual void BeginDraw(const void* param) = 0;
|
||||
virtual void EndDraw(const GSRasterizerStats& stats, uint64 frame) = 0;
|
||||
virtual void PrintStats() = 0;
|
||||
virtual void EndDraw(uint64 frame, uint64 ticks, int pixels) = 0;
|
||||
|
||||
#ifdef ENABLE_JIT_RASTERIZER
|
||||
|
||||
|
@ -78,32 +91,29 @@ public:
|
|||
|
||||
#endif
|
||||
|
||||
__forceinline bool IsEdge() const {return m_de != NULL;}
|
||||
__forceinline bool IsRect() const {return m_dr != NULL;}
|
||||
__forceinline bool HasEdge() const {return m_de != NULL;}
|
||||
};
|
||||
|
||||
class IRasterizer
|
||||
class IRasterizer : public GSAlignedClass<32>
|
||||
{
|
||||
public:
|
||||
virtual ~IRasterizer() {}
|
||||
|
||||
virtual void Init(int id, int threads) = 0;
|
||||
virtual void Draw(const GSRasterizerData* data) = 0;
|
||||
virtual void GetStats(GSRasterizerStats& stats) = 0;
|
||||
virtual void PrintStats() = 0;
|
||||
virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
|
||||
virtual void Sync() = 0;
|
||||
};
|
||||
|
||||
__aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer
|
||||
__aligned(class, 32) GSRasterizer : public IRasterizer
|
||||
{
|
||||
protected:
|
||||
IDrawScanline* m_ds;
|
||||
int m_id;
|
||||
int m_threads;
|
||||
uint8* m_myscanline;
|
||||
GSRasterizerStats m_stats;
|
||||
GSVector4i m_scissor;
|
||||
GSVector4 m_fscissor;
|
||||
struct {GSVertexSW* buff; int count;} m_edge;
|
||||
int m_pixels;
|
||||
|
||||
typedef void (GSRasterizer::*DrawPrimPtr)(const GSVertexSW* v, int count);
|
||||
|
||||
|
@ -111,7 +121,7 @@ protected:
|
|||
void DrawPoint(const GSVertexSW* v, int count);
|
||||
void DrawLine(const GSVertexSW* v);
|
||||
void DrawTriangle(const GSVertexSW* v);
|
||||
void DrawSprite(const GSVertexSW* v);
|
||||
void DrawSprite(const GSVertexSW* v, bool solidrect);
|
||||
|
||||
__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, const GSVertexSW& dscan, const GSVector4& p0);
|
||||
|
||||
|
@ -123,61 +133,64 @@ protected:
|
|||
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
|
||||
|
||||
public:
|
||||
GSRasterizer(IDrawScanline* ds);
|
||||
GSRasterizer(IDrawScanline* ds, int id, int threads);
|
||||
virtual ~GSRasterizer();
|
||||
|
||||
void Draw(shared_ptr<GSRasterizerData> data);
|
||||
|
||||
// IRasterizer
|
||||
|
||||
void Init(int id, int threads);
|
||||
void Draw(const GSRasterizerData* data);
|
||||
void GetStats(GSRasterizerStats& stats);
|
||||
void PrintStats() {m_ds->PrintStats();}
|
||||
void Queue(shared_ptr<GSRasterizerData> data);
|
||||
};
|
||||
|
||||
class GSRasterizerMT : public GSRasterizer, private GSThread
|
||||
{
|
||||
protected:
|
||||
volatile long& m_sync;
|
||||
volatile bool m_exit;
|
||||
volatile bool m_break;
|
||||
volatile bool m_ready;
|
||||
GSAutoResetEvent m_draw;
|
||||
const GSRasterizerData* m_data;
|
||||
queue<shared_ptr<GSRasterizerData> > m_queue;
|
||||
GSCritSec m_lock;
|
||||
|
||||
void ThreadProc();
|
||||
|
||||
public:
|
||||
GSRasterizerMT(IDrawScanline* ds, volatile long& sync);
|
||||
GSRasterizerMT(IDrawScanline* ds, int id, int threads);
|
||||
virtual ~GSRasterizerMT();
|
||||
|
||||
// IRasterizer
|
||||
|
||||
void Draw(const GSRasterizerData* data);
|
||||
void Queue(shared_ptr<GSRasterizerData> data);
|
||||
void Sync();
|
||||
};
|
||||
|
||||
class GSRasterizerList : protected vector<IRasterizer*>
|
||||
class GSRasterizerList : public IRasterizer, protected vector<GSRasterizer*>
|
||||
{
|
||||
protected:
|
||||
volatile long m_sync;
|
||||
GSRasterizerStats m_stats;
|
||||
int64 m_start;
|
||||
int m_threads;
|
||||
int m_count;
|
||||
|
||||
GSRasterizerList();
|
||||
|
||||
public:
|
||||
GSRasterizerList();
|
||||
virtual ~GSRasterizerList();
|
||||
|
||||
template<class DS> void Create(int threads)
|
||||
template<class DS> static GSRasterizerList* Create(int threads)
|
||||
{
|
||||
GSRasterizerList* rl = new GSRasterizerList();
|
||||
|
||||
threads = std::max<int>(threads, 1); // TODO: min(threads, number of cpu cores)
|
||||
|
||||
push_back(new GSRasterizer(new DS()));
|
||||
|
||||
for(int i = 1; i < threads; i++)
|
||||
for(int i = 0; i < threads; i++)
|
||||
{
|
||||
push_back(new GSRasterizerMT(new DS(), m_sync));
|
||||
rl->push_back(new GSRasterizerMT(new DS(), i, threads));
|
||||
}
|
||||
|
||||
return rl;
|
||||
}
|
||||
|
||||
void Draw(const GSRasterizerData* data, int width, int height);
|
||||
void Queue(shared_ptr<GSRasterizerData> data);
|
||||
void Sync();
|
||||
void GetStats(GSRasterizerStats& stats);
|
||||
void PrintStats();
|
||||
|
||||
int m_sync_count;
|
||||
};
|
||||
|
|
|
@ -67,6 +67,7 @@ public:
|
|||
bool s_save;
|
||||
bool s_savez;
|
||||
int s_saven;
|
||||
GSCritSec s_lock;
|
||||
|
||||
public:
|
||||
GSRenderer();
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
|
||||
|
||||
GSRendererSW::GSRendererSW(int threads)
|
||||
: m_fzb(NULL)
|
||||
{
|
||||
InitVertexKick(GSRendererSW);
|
||||
|
||||
|
@ -32,7 +33,7 @@ GSRendererSW::GSRendererSW(int threads)
|
|||
|
||||
memset(m_texture, 0, sizeof(m_texture));
|
||||
|
||||
m_rl.Create<GSDrawScanline>(threads);
|
||||
m_rl = GSRasterizerList::Create<GSDrawScanline>(threads);
|
||||
|
||||
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
|
||||
}
|
||||
|
@ -46,6 +47,8 @@ GSRendererSW::~GSRendererSW()
|
|||
delete m_texture[i];
|
||||
}
|
||||
|
||||
delete m_rl;
|
||||
|
||||
_aligned_free(m_output);
|
||||
}
|
||||
|
||||
|
@ -63,6 +66,10 @@ void GSRendererSW::VSync(int field)
|
|||
{
|
||||
GSRendererT<GSVertexSW>::VSync(field);
|
||||
|
||||
Sync(); // IncAge might delete a cached texture in use
|
||||
|
||||
// printf("m_sync_count = %d\n", m_rl->m_sync_count); m_rl->m_sync_count = 0;
|
||||
|
||||
m_tc->IncAge();
|
||||
|
||||
if(m_reset)
|
||||
|
@ -87,6 +94,8 @@ void GSRendererSW::ResetDevice()
|
|||
|
||||
GSTexture* GSRendererSW::GetOutput(int i)
|
||||
{
|
||||
Sync();
|
||||
|
||||
const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
|
||||
|
||||
int w = DISPFB.FBW * 64;
|
||||
|
@ -122,130 +131,95 @@ GSTexture* GSRendererSW::GetOutput(int i)
|
|||
|
||||
void GSRendererSW::Draw()
|
||||
{
|
||||
if(m_dump)
|
||||
if(m_dump) m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
|
||||
|
||||
// TODO: palette may be rendered (point-list in a few visual novels) and not ready by the time it needs to be loaded => vm to clut transfer (TEX0.CLD) should wait for the rasterizers to finish, if needed
|
||||
|
||||
if(m_fzb != m_context->offset.fzb)
|
||||
{
|
||||
m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
|
||||
// rasterizers must write the same outputs at the same time, this makes sure each thread has its own private surface area
|
||||
|
||||
// TODO: detect if frame/zbuf overlap each other (?)
|
||||
|
||||
m_fzb = m_context->offset.fzb;
|
||||
|
||||
Sync();
|
||||
}
|
||||
|
||||
GSScanlineGlobalData gd;
|
||||
shared_ptr<GSRasterizerData> data(new GSRasterizerData2(this));
|
||||
|
||||
if(!GetScanlineGlobalData(gd))
|
||||
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)data->param;
|
||||
|
||||
if(!GetScanlineGlobalData(*gd))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!gd.sel.fwrite && !gd.sel.zwrite)
|
||||
data->scissor = GSVector4i(m_context->scissor.in);
|
||||
data->scissor.z = std::min<int>(data->scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
|
||||
data->bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p));
|
||||
data->primclass = m_vt.m_primclass;
|
||||
data->vertices = (GSVertexSW*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16); // TODO: detach m_vertices and reallocate later?
|
||||
memcpy(data->vertices, m_vertices, sizeof(GSVertexSW) * m_count); // TODO: m_vt.Update fetches all the vertices already, could also store them here
|
||||
data->count = m_count;
|
||||
data->solidrect = gd->sel.IsSolidRect();
|
||||
data->frame = m_perfmon.GetFrame();
|
||||
|
||||
if(s_dump)
|
||||
{
|
||||
return;
|
||||
if(data->solidrect) Sync();
|
||||
|
||||
((GSRasterizerData2*)data.get())->DumpInput();
|
||||
}
|
||||
|
||||
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
|
||||
{
|
||||
uint64 frame = m_perfmon.GetFrame();
|
||||
m_rl->Queue(data);
|
||||
|
||||
string s;
|
||||
GSVector4i r = data->bbox.rintersect(data->scissor);
|
||||
|
||||
if(s_save && s_n >= s_saven && PRIM->TME)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
|
||||
}
|
||||
|
||||
s_n++;
|
||||
|
||||
if(s_save && s_n >= s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);//GetFrameSize(1).cy);
|
||||
}
|
||||
|
||||
if(s_savez && s_n >= s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
|
||||
}
|
||||
|
||||
s_n++;
|
||||
}
|
||||
|
||||
GSVector4i scissor(m_context->scissor.in);
|
||||
GSVector4i bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p));
|
||||
GSVector4i r = bbox.rintersect(scissor);
|
||||
|
||||
GSRasterizerData data;
|
||||
|
||||
data.scissor = scissor;
|
||||
data.scissor.z = std::min<int>(data.scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
|
||||
data.scissor_test = !bbox.eq(r);
|
||||
data.primclass = m_vt.m_primclass;
|
||||
data.vertices = m_vertices;
|
||||
data.count = m_count;
|
||||
data.frame = m_perfmon.GetFrame();
|
||||
data.param = &gd;
|
||||
|
||||
m_rl.Draw(&data, r.width(), r.height());
|
||||
|
||||
if(gd.sel.fwrite)
|
||||
if(gd->sel.fwrite)
|
||||
{
|
||||
m_tc->InvalidateVideoMem(m_context->offset.fb, r);
|
||||
}
|
||||
|
||||
if(gd.sel.zwrite)
|
||||
if(gd->sel.zwrite)
|
||||
{
|
||||
m_tc->InvalidateVideoMem(m_context->offset.zb, r);
|
||||
}
|
||||
|
||||
// By only syncing here we can do the two InvalidateVideoMem calls free if the other threads finish
|
||||
// their drawings later than this one (they usually do because they start on an event).
|
||||
// Sync();
|
||||
|
||||
m_rl.Sync();
|
||||
|
||||
GSRasterizerStats stats;
|
||||
|
||||
m_rl.GetStats(stats);
|
||||
|
||||
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
|
||||
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
|
||||
|
||||
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
|
||||
{
|
||||
uint64 frame = m_perfmon.GetFrame();
|
||||
|
||||
string s;
|
||||
|
||||
if(s_save && s_n >= s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);//GetFrameSize(1).cy);
|
||||
}
|
||||
|
||||
if(s_savez && s_n >= s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
|
||||
}
|
||||
|
||||
s_n++;
|
||||
}
|
||||
// TODO: m_perfmon.Put(GSPerfMon::Prim, stats.prims);
|
||||
// TODO: m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
|
||||
|
||||
/*
|
||||
if(0)//stats.ticks > 5000000)
|
||||
{
|
||||
printf("* [%lld | %012llx] ticks %lld prims %d (%d) pixels %d (%d)\n",
|
||||
m_perfmon.GetFrame(), gd.sel.key,
|
||||
m_perfmon.GetFrame(), gd->sel.key,
|
||||
stats.ticks,
|
||||
stats.prims, stats.prims > 0 ? (int)(stats.ticks / stats.prims) : -1,
|
||||
stats.pixels, stats.pixels > 0 ? (int)(stats.ticks / stats.pixels) : -1);
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// Synchronization point of the software renderer: calls Sync() on the rasterizer list and
// then clears the texture cache's record of invalidated pages.
// NOTE(review): presumably m_rl->Sync() blocks until every queued draw has finished - confirm in GSRasterizerList.
void GSRendererSW::Sync()
|
||||
{
|
||||
m_rl->Sync();
|
||||
|
||||
// the invalid-page bits are what GSTextureCacheSW::CanUpdate() tests; they are cleared only after the rasterizer sync above
m_tc->ResetInvalidPages();
|
||||
}
|
||||
|
||||
// Forwards a video memory invalidation to the texture cache and then synchronizes.
// BITBLTBUF: its destination fields (DBP, DBW, DPSM) select the offset table passed to the texture cache.
// r: the affected rectangle, passed through unchanged.
void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
|
||||
{
|
||||
m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
|
||||
|
||||
// note: Sync() also calls m_tc->ResetInvalidPages(), so the cache's invalid-page bits are cleared again right here
Sync(); // TODO: not needed if nothing uses the affected pages (this is the most frequently called Sync! get rid of it)
|
||||
}
|
||||
|
||||
// Unconditionally synchronizes; neither BITBLTBUF nor r is inspected by this implementation.
// NOTE(review): presumably called before local memory is read back, so that pending draws have landed - confirm against the caller.
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
|
||||
{
|
||||
Sync(); // TODO: not needed if nothing uses the affected pages
|
||||
}
|
||||
|
||||
#include "GSTextureSW.h"
|
||||
|
@ -257,7 +231,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
const GS_PRIM_CLASS primclass = m_vt.m_primclass;
|
||||
|
||||
gd.vm = m_mem.m_vm8;
|
||||
gd.dimx = env.dimx;
|
||||
|
||||
gd.fbr = context->offset.fb->pixel.row;
|
||||
gd.zbr = context->offset.zb->pixel.row;
|
||||
|
@ -315,6 +288,11 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
bool fwrite = fm != 0xffffffff;
|
||||
bool ftest = gd.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
|
||||
|
||||
bool zwrite = zm != 0xffffffff;
|
||||
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
|
||||
|
||||
if(!fwrite && !zwrite) return false;
|
||||
|
||||
gd.sel.fwrite = fwrite;
|
||||
gd.sel.ftest = ftest;
|
||||
|
||||
|
@ -329,13 +307,20 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
|
||||
if(PRIM->TME)
|
||||
{
|
||||
gd.clut = m_mem.m_clut;
|
||||
|
||||
gd.sel.tfx = context->TEX0.TFX;
|
||||
gd.sel.tcc = context->TEX0.TCC;
|
||||
gd.sel.fst = PRIM->FST;
|
||||
gd.sel.ltf = m_vt.IsLinear();
|
||||
gd.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0;
|
||||
|
||||
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
|
||||
{
|
||||
gd.sel.tlu = 1;
|
||||
|
||||
gd.clut = (uint32*)_aligned_malloc(sizeof(uint32) * 256, 32); // FIXME: might address uninitialized data of the texture (0xCD) that is not in 0-15 range for 4-bpp formats
|
||||
|
||||
memcpy(gd.clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
|
||||
}
|
||||
|
||||
gd.sel.wms = context->CLAMP.WMS;
|
||||
gd.sel.wmt = context->CLAMP.WMT;
|
||||
|
||||
|
@ -346,13 +331,17 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
gd.sel.tfx = TFX_DECAL;
|
||||
}
|
||||
|
||||
GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA);
|
||||
|
||||
if(t == NULL) {ASSERT(0); return false;}
|
||||
|
||||
if(!m_tc->CanUpdate(t)) Sync();
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, context->TEX0, context->CLAMP, gd.sel.ltf);
|
||||
|
||||
const GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA, r);
|
||||
|
||||
if(t == NULL) {ASSERT(0); return false;}
|
||||
if(!t->Update(r)) {ASSERT(0); return false;}
|
||||
|
||||
if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
|
||||
{
|
||||
|
@ -495,13 +484,17 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
m_vt.m_min.t *= 0.5f;
|
||||
m_vt.m_max.t *= 0.5f;
|
||||
|
||||
GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, gd.sel.tw + 3);
|
||||
|
||||
if(t == NULL) {ASSERT(0); return false;}
|
||||
|
||||
if(!m_tc->CanUpdate(t)) Sync();
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf);
|
||||
|
||||
const GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, r, gd.sel.tw + 3);
|
||||
|
||||
if(t == NULL) {ASSERT(0); return false;}
|
||||
if(!t->Update(r)) {ASSERT(0); return false;}
|
||||
|
||||
gd.tex[i] = t->m_buff;
|
||||
|
||||
|
@ -699,11 +692,16 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
|
|||
|
||||
gd.sel.colclamp = env.COLCLAMP.CLAMP;
|
||||
gd.sel.fba = context->FBA.FBA;
|
||||
gd.sel.dthe = env.DTHE.DTHE;
|
||||
}
|
||||
|
||||
bool zwrite = zm != 0xffffffff;
|
||||
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
|
||||
if(env.DTHE.DTHE)
|
||||
{
|
||||
gd.sel.dthe = 1;
|
||||
|
||||
gd.dimx = (GSVector4i*)_aligned_malloc(sizeof(env.dimx), 32);
|
||||
|
||||
memcpy(gd.dimx, env.dimx, sizeof(env.dimx));
|
||||
}
|
||||
}
|
||||
|
||||
gd.sel.zwrite = zwrite;
|
||||
gd.sel.ztest = ztest;
|
||||
|
|
|
@ -27,12 +27,118 @@
|
|||
|
||||
class GSRendererSW : public GSRendererT<GSVertexSW>
|
||||
{
|
||||
// Rasterizer work item that owns its own GSScanlineGlobalData (stored in param) and keeps a
// snapshot of the registers needed to write debug dumps independently of the renderer's
// current context.
class GSRasterizerData2 : public GSRasterizerData
|
||||
{
|
||||
// renderer that created this item; used for the dump settings, the dump lock, the perf monitor and local memory
GSRenderer* renderer;
|
||||
// copies of the context registers taken in the constructor
GIFRegFRAME FRAME;
|
||||
GIFRegZBUF ZBUF;
|
||||
GIFRegTEX0 TEX0;
|
||||
// copy of PRIM->TME taken in the constructor
uint32 TME;
|
||||
// size used when saving frame/z buffer dumps: frame rect width x 512
GSVector2i framesize;
|
||||
|
||||
public:
|
||||
// Allocates the 32-byte aligned GSScanlineGlobalData, clears its clut/dimx pointers so the
// destructor can tell whether they were allocated, and snapshots the registers of r.
// NOTE(review): the _aligned_malloc result is dereferenced without a NULL check.
GSRasterizerData2(GSRenderer* r)
|
||||
{
|
||||
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)_aligned_malloc(sizeof(GSScanlineGlobalData), 32);
|
||||
|
||||
gd->clut = NULL;
|
||||
gd->dimx = NULL;
|
||||
|
||||
// param is not declared in this class; presumably inherited from GSRasterizerData - confirm
param = gd;
|
||||
|
||||
renderer = r;
|
||||
FRAME = r->m_context->FRAME;
|
||||
ZBUF = r->m_context->ZBUF;
|
||||
TEX0 = r->m_context->TEX0;
|
||||
TME = r->PRIM->TME;
|
||||
framesize = GSVector2i(r->GetFrameRect().width(), 512);
|
||||
}
|
||||
|
||||
// Frees the clut and dimx copies (when set), then the global data itself, and finally
// writes the output dump.
virtual ~GSRasterizerData2()
|
||||
{
|
||||
GSScanlineGlobalData* gd = (GSScanlineGlobalData*)param;
|
||||
|
||||
if(gd->clut) _aligned_free(gd->clut);
|
||||
if(gd->dimx) _aligned_free(gd->dimx);
|
||||
|
||||
_aligned_free(gd);
|
||||
|
||||
// DumpOutput only reads the register snapshots and the renderer, not the freed gd
DumpOutput();
|
||||
}
|
||||
|
||||
// FIXME: not really possible to save whole input/output anymore, strips of the picture may lag in multi-threaded mode
|
||||
|
||||
// Saves the "before" state as BMP files when dumping is enabled: the source texture
// (only if TME is set), then the frame buffer (rt0) and the z buffer (rz0).
// The shared counter s_n is incremented twice, under renderer->s_lock.
void DumpInput()
|
||||
{
|
||||
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0))
|
||||
|
||||
GSAutoLock l(&renderer->s_lock);
|
||||
|
||||
// the frame number is read at dump time, not captured in the constructor
uint64 frame = renderer->m_perfmon.GetFrame();
|
||||
|
||||
string s;
|
||||
|
||||
if(renderer->s_save && renderer->s_n >= renderer->s_saven && TME)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", renderer->s_n, frame, (int)TEX0.TBP0, (int)TEX0.PSM);
|
||||
|
||||
renderer->m_mem.SaveBMP(s, TEX0.TBP0, TEX0.TBW, TEX0.PSM, 1 << TEX0.TW, 1 << TEX0.TH);
|
||||
}
|
||||
|
||||
renderer->s_n++;
|
||||
|
||||
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
|
||||
|
||||
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
|
||||
}
|
||||
|
||||
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
|
||||
|
||||
// the z buffer is saved with the frame buffer's width (FRAME.FBW)
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
|
||||
}
|
||||
|
||||
renderer->s_n++;
|
||||
}
|
||||
|
||||
// Saves the "after" state as BMP files when dumping is enabled: the frame buffer (rt1)
// and the z buffer (rz1). The shared counter s_n is incremented once, under renderer->s_lock.
// Called from the destructor.
void DumpOutput()
|
||||
{
|
||||
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
|
||||
|
||||
GSAutoLock l(&renderer->s_lock);
|
||||
|
||||
uint64 frame = renderer->m_perfmon.GetFrame();
|
||||
|
||||
string s;
|
||||
|
||||
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
|
||||
|
||||
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
|
||||
}
|
||||
|
||||
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
|
||||
|
||||
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
|
||||
}
|
||||
|
||||
renderer->s_n++;
|
||||
}
|
||||
};
|
||||
|
||||
protected:
|
||||
GSRasterizerList m_rl;
|
||||
GSRasterizerList* m_rl;
|
||||
GSTextureCacheSW* m_tc;
|
||||
GSTexture* m_texture[2];
|
||||
uint8* m_output;
|
||||
bool m_reset;
|
||||
GSPixelOffset4* m_fzb;
|
||||
|
||||
void Reset();
|
||||
void VSync(int field);
|
||||
|
@ -40,7 +146,9 @@ protected:
|
|||
GSTexture* GetOutput(int i);
|
||||
|
||||
void Draw();
|
||||
void Sync();
|
||||
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
|
||||
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
|
||||
|
||||
bool GetScanlineGlobalData(GSScanlineGlobalData& gd);
|
||||
|
||||
|
|
|
@ -107,15 +107,14 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
|
|||
{
|
||||
GSScanlineSelector sel;
|
||||
|
||||
// - the data of vm, tex, clut, dimx may change, multi-threaded drawing must be finished before that happens (an idea: remember which pages are used, sync when something needs to read or write them)
|
||||
// - the data of vm, tex may change, multi-threaded drawing must be finished before that happens, clut and dimx are copies
|
||||
// - tex is a cached texture, it may be recycled to free up memory, its absolute address cannot be compiled into code
|
||||
// - row and column pointers are allocated once and never changed or freed, their addresses can be used directly
|
||||
// - if in the future drawing does not have to be synchronized per batch, the rest of GSRasterizerData should be copied here, too (scissor, prim type, vertices)
|
||||
|
||||
void* vm;
|
||||
const void* tex[7];
|
||||
const uint32* clut;
|
||||
const GSVector4i* dimx;
|
||||
uint32* clut;
|
||||
GSVector4i* dimx;
|
||||
|
||||
const int* fbr;
|
||||
const int* zbr;
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
GSTextureCacheSW::GSTextureCacheSW(GSState* state)
|
||||
: m_state(state)
|
||||
{
|
||||
memset(m_invalid, 0, sizeof(m_invalid));
|
||||
}
|
||||
|
||||
GSTextureCacheSW::~GSTextureCacheSW()
|
||||
|
@ -32,7 +33,7 @@ GSTextureCacheSW::~GSTextureCacheSW()
|
|||
RemoveAll();
|
||||
}
|
||||
|
||||
const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, uint32 tw0)
|
||||
GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0)
|
||||
{
|
||||
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
|
||||
|
||||
|
@ -76,36 +77,9 @@ const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0
|
|||
|
||||
m_textures.insert(t);
|
||||
|
||||
__aligned(uint32, 16) pages[16];
|
||||
|
||||
((GSVector4i*)pages)[0] = GSVector4i::zero();
|
||||
((GSVector4i*)pages)[1] = GSVector4i::zero();
|
||||
((GSVector4i*)pages)[2] = GSVector4i::zero();
|
||||
((GSVector4i*)pages)[3] = GSVector4i::zero();
|
||||
|
||||
GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs;
|
||||
|
||||
int tw = 1 << TEX0.TW;
|
||||
int th = 1 << TEX0.TH;
|
||||
|
||||
for(int y = 0; y < th; y += bs.y)
|
||||
for(int i = 0; i < countof(t->m_pages); i++)
|
||||
{
|
||||
uint32 base = o->block.row[y >> 3];
|
||||
|
||||
for(int x = 0; x < tw; x += bs.x)
|
||||
{
|
||||
uint32 page = (base + o->block.col[x >> 3]) >> 5;
|
||||
|
||||
if(page < MAX_PAGES)
|
||||
{
|
||||
pages[page >> 5] |= 1 << (page & 31);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int i = 0; i < countof(pages); i++)
|
||||
{
|
||||
uint32 p = pages[i];
|
||||
uint32 p = t->m_pages[i];
|
||||
|
||||
if(p != 0)
|
||||
{
|
||||
|
@ -123,22 +97,11 @@ const GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0
|
|||
}
|
||||
}
|
||||
|
||||
if(!t->Update(r))
|
||||
{
|
||||
printf("!@#$\n"); // memory allocation may fail if the game is too hungry (tales of legendia fight transition/scene)
|
||||
|
||||
RemoveAt(t);
|
||||
|
||||
t = NULL;
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rect)
|
||||
void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rect)
|
||||
{
|
||||
bool changed = false;
|
||||
|
||||
uint32 bp = o->bp;
|
||||
uint32 bw = o->bw;
|
||||
uint32 psm = o->psm;
|
||||
|
@ -153,10 +116,12 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
|
|||
|
||||
for(int x = r.left; x < r.right; x += bs.x)
|
||||
{
|
||||
uint32 page = (base + o->block.col[x >> 3]) >> 5;
|
||||
uint32 page = (base + o->block.col[x >> 3]) >> 5;
|
||||
|
||||
if(page < MAX_PAGES)
|
||||
{
|
||||
m_invalid[page >> 5] |= 1 << (page & 31); // remember which pages might be invalid for future texture updates
|
||||
|
||||
const list<Texture*>& map = m_map[page];
|
||||
|
||||
for(list<Texture*>::const_iterator i = map.begin(); i != map.end(); i++)
|
||||
|
@ -165,8 +130,6 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
|
|||
|
||||
if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM))
|
||||
{
|
||||
changed = true;
|
||||
|
||||
if(t->m_repeating)
|
||||
{
|
||||
list<GSVector2i>& l = t->m_p2t[page];
|
||||
|
@ -187,8 +150,6 @@ bool GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
void GSTextureCacheSW::RemoveAll()
|
||||
|
@ -237,6 +198,24 @@ void GSTextureCacheSW::IncAge()
|
|||
}
|
||||
}
|
||||
|
||||
// Returns false when any page covered by the texture (t->m_pages) is also marked in
// m_invalid, i.e. it was invalidated since the last ResetInvalidPages(); returns true otherwise.
// Both bitmaps are compared word by word over countof(m_invalid) entries.
bool GSTextureCacheSW::CanUpdate(Texture* t)
|
||||
{
|
||||
for(size_t i = 0; i < countof(m_invalid); i++)
|
||||
{
|
||||
// a shared bit means the texture touches a page that was marked invalid
if(m_invalid[i] & t->m_pages[i])
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Clears every bit of the invalid-page bitmap (m_invalid), so that CanUpdate() returns
// true for all textures until InvalidateVideoMem() marks pages again.
void GSTextureCacheSW::ResetInvalidPages()
|
||||
{
|
||||
memset(m_invalid, 0, sizeof(m_invalid));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
GSTextureCacheSW::Texture::Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
|
||||
|
@ -252,7 +231,30 @@ GSTextureCacheSW::Texture::Texture(GSState* state, const GSOffset* offset, uint3
|
|||
m_TEXA = TEXA;
|
||||
|
||||
memset(m_valid, 0, sizeof(m_valid));
|
||||
memset(m_pages, 0, sizeof(m_pages));
|
||||
|
||||
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
|
||||
|
||||
GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs;
|
||||
|
||||
int tw = 1 << TEX0.TW;
|
||||
int th = 1 << TEX0.TH;
|
||||
|
||||
for(int y = 0; y < th; y += bs.y)
|
||||
{
|
||||
uint32 base = offset->block.row[y >> 3];
|
||||
|
||||
for(int x = 0; x < tw; x += bs.x)
|
||||
{
|
||||
uint32 page = (base + offset->block.col[x >> 3]) >> 5;
|
||||
|
||||
if(page < MAX_PAGES)
|
||||
{
|
||||
m_pages[page >> 5] |= 1 << (page & 31);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_repeating = m_TEX0.IsRepeating(); // repeating mode always works, it is just slightly slower
|
||||
|
||||
if(m_repeating)
|
||||
|
|
|
@ -39,13 +39,14 @@ public:
|
|||
bool m_complete;
|
||||
bool m_repeating;
|
||||
list<GSVector2i>* m_p2t;
|
||||
uint32 m_valid[MAX_PAGES];
|
||||
uint32 m_valid[MAX_PAGES];
|
||||
uint32 m_pages[16];
|
||||
|
||||
// m_valid
|
||||
// fast mode: each uint32 bits map to the 32 blocks of that page
|
||||
// repeating mode: 1 bpp image of the texture tiles (8x8), also having 512 elements is just a coincidence (worst case: (1024*1024)/(8*8)/(sizeof(uint32)*8))
|
||||
|
||||
explicit Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
|
||||
Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
|
||||
virtual ~Texture();
|
||||
|
||||
bool Update(const GSVector4i& r);
|
||||
|
@ -56,16 +57,20 @@ protected:
|
|||
GSState* m_state;
|
||||
hash_set<Texture*> m_textures;
|
||||
list<Texture*> m_map[MAX_PAGES];
|
||||
uint32 m_invalid[16];
|
||||
|
||||
public:
|
||||
GSTextureCacheSW(GSState* state);
|
||||
virtual ~GSTextureCacheSW();
|
||||
|
||||
const Texture* Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, uint32 tw0 = 0);
|
||||
Texture* Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0 = 0);
|
||||
|
||||
bool InvalidateVideoMem(const GSOffset* o, const GSVector4i& r);
|
||||
void InvalidateVideoMem(const GSOffset* o, const GSVector4i& r);
|
||||
|
||||
void RemoveAll();
|
||||
void RemoveAt(Texture* t);
|
||||
void IncAge();
|
||||
|
||||
bool CanUpdate(Texture* t);
|
||||
void ResetInvalidPages();
|
||||
};
|
||||
|
|
|
@ -87,6 +87,7 @@
|
|||
#include <list>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <queue>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
|
Loading…
Reference in New Issue