Trying to isolate the rasterizer step-by-step, for better multi-threading in the future.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4305 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-02-17 03:24:37 +00:00
parent 65fc196688
commit 6f18c0dabe
26 changed files with 942 additions and 942 deletions

View File

@ -93,12 +93,13 @@ EXPORT_C_(int32) GPUopen(HWND hWnd)
}
int renderer = theApp.GetConfig("Renderer", 1);
int threads = theApp.GetConfig("swthreads", 1);
switch(renderer)
{
default:
case 0: s_gpu = new GPURendererSW(new GSDevice9()); break;
case 1: s_gpu = new GPURendererSW(new GSDevice11()); break;
case 0: s_gpu = new GPURendererSW(new GSDevice9(), threads); break;
case 1: s_gpu = new GPURendererSW(new GSDevice11(), threads); break;
// TODO: case 3: s_gpu = new GPURendererNull(new GSDeviceNull()); break;
}

View File

@ -22,12 +22,13 @@
#include "StdAfx.h"
#include "GPUDrawScanline.h"
GPUDrawScanline::GPUDrawScanline(GPUState* state, int id)
: m_state(state)
, m_id(id)
, m_sp_map("GPUSetupPrim", &m_env)
, m_ds_map("GPUDrawScanline", &m_env)
GPUDrawScanline::GPUDrawScanline(const GPUScanlineGlobalData* gd)
: m_sp_map("GPUSetupPrim", &m_local)
, m_ds_map("GPUDrawScanline", &m_local)
{
memset(&m_local, 0, sizeof(m_local));
m_local.gd = gd;
}
GPUDrawScanline::~GPUDrawScanline()
@ -36,40 +37,24 @@ GPUDrawScanline::~GPUDrawScanline()
void GPUDrawScanline::BeginDraw(const GSRasterizerData* data)
{
GPUDrawingEnvironment& env = m_state->m_env;
const GPUScanlineParam* p = (const GPUScanlineParam*)data->param;
m_env.sel = p->sel;
m_env.vm = m_state->m_mem.GetPixelAddress(0, 0);
if(m_env.sel.tme)
if(m_local.gd->sel.twin)
{
m_env.tex = p->tex;
m_env.clut = p->clut;
uint32 u, v;
if(m_env.sel.twin)
{
uint32 u, v;
u = ~(m_local.gd->twin.x << 3) & 0xff; // TWW
v = ~(m_local.gd->twin.y << 3) & 0xff; // TWH
u = ~(env.TWIN.TWW << 3) & 0xff;
v = ~(env.TWIN.TWH << 3) & 0xff;
m_local.twin[0].u = GSVector4i((u << 16) | u);
m_local.twin[0].v = GSVector4i((v << 16) | v);
m_env.twin[0].u = GSVector4i((u << 16) | u);
m_env.twin[0].v = GSVector4i((v << 16) | v);
u = env.TWIN.TWX << 3;
v = env.TWIN.TWY << 3;
// gd->twin packs the texture window as (x, y, z, w) = (TWW, TWH, TWX, TWY),
// so the window offset comes from the z / w lanes (y is TWH, the height mask).
u = m_local.gd->twin.z << 3; // TWX
v = m_local.gd->twin.w << 3; // TWY
m_env.twin[1].u = GSVector4i((u << 16) | u) & ~m_env.twin[0].u;
m_env.twin[1].v = GSVector4i((v << 16) | v) & ~m_env.twin[0].v;
}
m_local.twin[1].u = GSVector4i((u << 16) | u) & ~m_local.twin[0].u;
m_local.twin[1].v = GSVector4i((v << 16) | v) & ~m_local.twin[0].v;
}
//
m_ds = m_ds_map[m_env.sel];
m_ds = m_ds_map[m_local.gd->sel];
m_de = NULL;
@ -81,15 +66,15 @@ void GPUDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.key = 0;
sel.iip = m_env.sel.iip;
sel.tfx = m_env.sel.tfx;
sel.twin = m_env.sel.twin;
sel.sprite = m_env.sel.sprite;
sel.iip = m_local.gd->sel.iip;
sel.tfx = m_local.gd->sel.tfx;
sel.twin = m_local.gd->sel.twin;
sel.sprite = m_local.gd->sel.sprite;
m_sp = m_sp_map[sel];
}
void GPUDrawScanline::EndDraw(const GSRasterizerStats& stats)
void GPUDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
{
m_ds_map.UpdateStats(stats, m_state->m_perfmon.GetFrame());
m_ds_map.UpdateStats(stats, frame);
}

View File

@ -29,24 +29,18 @@
class GPUDrawScanline : public IDrawScanline
{
GPUScanlineEnvironment m_env;
//
GPUScanlineLocalData m_local;
GSCodeGeneratorFunctionMap<GPUSetupPrimCodeGenerator, uint32, SetupPrimPtr> m_sp_map;
GSCodeGeneratorFunctionMap<GPUDrawScanlineCodeGenerator, uint32, DrawScanlinePtr> m_ds_map;
protected:
GPUState* m_state;
int m_id;
public:
GPUDrawScanline(GPUState* state, int id);
GPUDrawScanline(const GPUScanlineGlobalData* gd);
virtual ~GPUDrawScanline();
// IDrawScanline
void BeginDraw(const GSRasterizerData* data);
void EndDraw(const GSRasterizerStats& stats);
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
void PrintStats() {m_ds_map.PrintStats();}
};

View File

@ -24,14 +24,20 @@
#include "StdAfx.h"
#include "GPUDrawScanlineCodeGenerator.h"
static const int _args = 8;
static const int _top = _args + 4;
static const int _v = _args + 8;
GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_env(*(GPUScanlineEnvironment*)param)
, m_local(*(GPUScanlineLocalData*)param)
{
#if _M_AMD64
#error TODO
#endif
m_sel.key = key;
Generate();
}
@ -40,9 +46,7 @@ void GPUDrawScanlineCodeGenerator::Generate()
push(esi);
push(edi);
const int params = 8;
Init(params);
Init();
align(16);
@ -112,26 +116,23 @@ L("exit");
ret(8);
}
void GPUDrawScanlineCodeGenerator::Init(int params)
void GPUDrawScanlineCodeGenerator::Init()
{
const int _top = params + 4;
const int _v = params + 8;
mov(eax, dword[esp + _top]);
// uint16* fb = &m_env.vm[(top << (10 + m_env.sel.scalex)) + left];
// uint16* fb = &m_local.vm[(top << (10 + m_sel.scalex)) + left];
mov(edi, eax);
shl(edi, 10 + m_env.sel.scalex);
shl(edi, 10 + m_sel.scalex);
add(edi, edx);
lea(edi, ptr[edi * 2 + (size_t)m_env.vm]);
lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);
// int steps = right - left - 8;
sub(ecx, edx);
sub(ecx, 8);
if(m_env.sel.dtd)
if(m_sel.dtd)
{
// dither = GSVector4i::load<false>(&s_dither[top & 3][left & 3]);
@ -140,48 +141,48 @@ void GPUDrawScanlineCodeGenerator::Init(int params)
and(edx, 3);
shl(edx, 1);
movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]);
movdqa(ptr[&m_env.temp.dither], xmm0);
movdqa(ptr[&m_local.temp.dither], xmm0);
}
mov(edx, dword[esp + _v]);
if(m_env.sel.tme)
if(m_sel.tme)
{
mov(esi, dword[&m_env.tex]);
mov(esi, dword[&m_local.gd->tex]);
// GSVector4i vt = GSVector4i(v.t).xxzzl();
cvttps2dq(xmm4, ptr[edx + 32]);
pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
// s = vt.xxxx().add16(m_env.d.s);
// t = vt.yyyy().add16(m_env.d.t);
// s = vt.xxxx().add16(m_local.d.s);
// t = vt.yyyy().add16(m_local.d.t);
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
paddw(xmm2, ptr[&m_env.d.s]);
paddw(xmm2, ptr[&m_local.d.s]);
if(!m_env.sel.sprite)
if(!m_sel.sprite)
{
paddw(xmm3, ptr[&m_env.d.t]);
paddw(xmm3, ptr[&m_local.d.t]);
}
else
{
if(m_env.sel.ltf)
if(m_sel.ltf)
{
movdqa(xmm0, xmm3);
psllw(xmm0, 8);
psrlw(xmm0, 1);
movdqa(ptr[&m_env.temp.vf], xmm0);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
}
movdqa(ptr[&m_env.temp.s], xmm2);
movdqa(ptr[&m_env.temp.t], xmm3);
movdqa(ptr[&m_local.temp.s], xmm2);
movdqa(ptr[&m_local.temp.t], xmm3);
}
if(m_env.sel.tfx != 3) // != decal
if(m_sel.tfx != 3) // != decal
{
// GSVector4i vc = GSVector4i(v.c).xxzzlh();
@ -197,20 +198,20 @@ void GPUDrawScanlineCodeGenerator::Init(int params)
pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
if(m_env.sel.iip)
if(m_sel.iip)
{
// r = r.add16(m_env.d.r);
// g = g.add16(m_env.d.g);
// b = b.add16(m_env.d.b);
// r = r.add16(m_local.d.r);
// g = g.add16(m_local.d.g);
// b = b.add16(m_local.d.b);
paddw(xmm4, ptr[&m_env.d.r]);
paddw(xmm5, ptr[&m_env.d.g]);
paddw(xmm6, ptr[&m_env.d.b]);
paddw(xmm4, ptr[&m_local.d.r]);
paddw(xmm5, ptr[&m_local.d.g]);
paddw(xmm6, ptr[&m_local.d.b]);
}
movdqa(ptr[&m_env.temp.r], xmm4);
movdqa(ptr[&m_env.temp.g], xmm5);
movdqa(ptr[&m_env.temp.b], xmm6);
movdqa(ptr[&m_local.temp.r], xmm4);
movdqa(ptr[&m_local.temp.g], xmm5);
movdqa(ptr[&m_local.temp.b], xmm6);
}
}
@ -224,62 +225,62 @@ void GPUDrawScanlineCodeGenerator::Step()
add(edi, 8 * sizeof(uint16));
if(m_env.sel.tme)
if(m_sel.tme)
{
// GSVector4i st = m_env.d8.st;
// GSVector4i st = m_local.d8.st;
movdqa(xmm4, ptr[&m_env.d8.st]);
movdqa(xmm4, ptr[&m_local.d8.st]);
// s = s.add16(st.xxxx());
// t = t.add16(st.yyyy());
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
paddw(xmm2, ptr[&m_env.temp.s]);
movdqa(ptr[&m_env.temp.s], xmm2);
paddw(xmm2, ptr[&m_local.temp.s]);
movdqa(ptr[&m_local.temp.s], xmm2);
// TODO: if(!sprite) ... else reload t
pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
paddw(xmm3, ptr[&m_env.temp.t]);
movdqa(ptr[&m_env.temp.t], xmm3);
paddw(xmm3, ptr[&m_local.temp.t]);
movdqa(ptr[&m_local.temp.t], xmm3);
}
if(m_env.sel.tfx != 3) // != decal
if(m_sel.tfx != 3) // != decal
{
if(m_env.sel.iip)
if(m_sel.iip)
{
// GSVector4i c = m_env.d8.c;
// GSVector4i c = m_local.d8.c;
// r = r.add16(c.xxxx());
// g = g.add16(c.yyyy());
// b = b.add16(c.zzzz());
movdqa(xmm6, ptr[&m_env.d8.c]);
movdqa(xmm6, ptr[&m_local.d8.c]);
pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
paddw(xmm4, ptr[&m_env.temp.r]);
paddw(xmm5, ptr[&m_env.temp.g]);
paddw(xmm6, ptr[&m_env.temp.b]);
paddw(xmm4, ptr[&m_local.temp.r]);
paddw(xmm5, ptr[&m_local.temp.g]);
paddw(xmm6, ptr[&m_local.temp.b]);
movdqa(ptr[&m_env.temp.r], xmm4);
movdqa(ptr[&m_env.temp.g], xmm5);
movdqa(ptr[&m_env.temp.b], xmm6);
movdqa(ptr[&m_local.temp.r], xmm4);
movdqa(ptr[&m_local.temp.g], xmm5);
movdqa(ptr[&m_local.temp.b], xmm6);
}
else
{
movdqa(xmm4, ptr[&m_env.temp.r]);
movdqa(xmm5, ptr[&m_env.temp.g]);
movdqa(xmm6, ptr[&m_env.temp.b]);
movdqa(xmm4, ptr[&m_local.temp.r]);
movdqa(xmm5, ptr[&m_local.temp.g]);
movdqa(xmm6, ptr[&m_local.temp.b]);
}
}
}
void GPUDrawScanlineCodeGenerator::TestMask()
{
if(!m_env.sel.me)
if(!m_sel.me)
{
return;
}
@ -295,7 +296,7 @@ void GPUDrawScanlineCodeGenerator::TestMask()
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
if(!m_env.sel.tme)
if(!m_sel.tme)
{
return;
}
@ -306,7 +307,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
// xmm0, xmm4, xmm5, xmm6 = free
// xmm1 = used
if(m_env.sel.ltf)
if(m_sel.ltf)
{
// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f
@ -324,14 +325,14 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
movdqa(xmm0, xmm2);
psllw(xmm0, 8);
psrlw(xmm0, 1);
movdqa(ptr[&m_env.temp.uf], xmm0);
movdqa(ptr[&m_local.temp.uf], xmm0);
if(!m_env.sel.sprite)
if(!m_sel.sprite)
{
movdqa(xmm0, xmm3);
psllw(xmm0, 8);
psrlw(xmm0, 1);
movdqa(ptr[&m_env.temp.vf], xmm0);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
}
@ -347,7 +348,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
// xmm0, xmm4, xmm5, xmm6 = free
// xmm1 = used
if(m_env.sel.ltf)
if(m_sel.ltf)
{
// GSVector4i u1 = u0.add16(GSVector4i::x0001());
// GSVector4i v1 = v0.add16(GSVector4i::x0001());
@ -360,23 +361,23 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
paddw(xmm4, xmm0);
paddw(xmm5, xmm0);
if(m_env.sel.twin)
if(m_sel.twin)
{
// u0 = (u0 & m_env.twin[0].u).add16(m_env.twin[1].u);
// v0 = (v0 & m_env.twin[0].v).add16(m_env.twin[1].v);
// u1 = (u1 & m_env.twin[0].u).add16(m_env.twin[1].u);
// v1 = (v1 & m_env.twin[0].v).add16(m_env.twin[1].v);
// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);
movdqa(xmm0, ptr[&m_env.twin[0].u]);
movdqa(xmm6, ptr[&m_env.twin[1].u]);
movdqa(xmm0, ptr[&m_local.twin[0].u]);
movdqa(xmm6, ptr[&m_local.twin[1].u]);
pand(xmm2, xmm0);
paddw(xmm2, xmm6);
pand(xmm4, xmm0);
paddw(xmm4, xmm6);
movdqa(xmm0, ptr[&m_env.twin[0].v]);
movdqa(xmm6, ptr[&m_env.twin[1].v]);
movdqa(xmm0, ptr[&m_local.twin[0].v]);
movdqa(xmm6, ptr[&m_local.twin[1].v]);
pand(xmm3, xmm0);
paddw(xmm3, xmm6);
@ -385,15 +386,15 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
}
else
{
// u0 = u0.min_i16(m_env.twin[2].u);
// v0 = v0.min_i16(m_env.twin[2].v);
// u1 = u1.min_i16(m_env.twin[2].u);
// v1 = v1.min_i16(m_env.twin[2].v);
// u0 = u0.min_i16(m_local.twin[2].u);
// v0 = v0.min_i16(m_local.twin[2].v);
// u1 = u1.min_i16(m_local.twin[2].u);
// v1 = v1.min_i16(m_local.twin[2].v);
// TODO: if(!sprite) clamp16 else:
movdqa(xmm0, ptr[&m_env.twin[2].u]);
movdqa(xmm6, ptr[&m_env.twin[2].v]);
movdqa(xmm0, ptr[&m_local.twin[2].u]);
movdqa(xmm6, ptr[&m_local.twin[2].v]);
pminsw(xmm2, xmm0);
pminsw(xmm3, xmm6);
@ -447,8 +448,8 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
// spill (TODO)
movdqa(ptr[&m_env.temp.fd], xmm1);
movdqa(ptr[&m_env.temp.test], xmm7);
movdqa(ptr[&m_local.temp.fd], xmm1);
movdqa(ptr[&m_local.temp.test], xmm7);
// xmm2 = c00
// xmm4 = c01
@ -464,7 +465,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psllw(xmm0, 11);
psrlw(xmm0, 8);
lerp16<0>(xmm0, xmm1, ptr[&m_env.temp.uf]);
lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);
movdqa(xmm6, xmm2);
psllw(xmm6, 6);
@ -476,7 +477,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psrlw(xmm1, 11);
psllw(xmm1, 3);
lerp16<0>(xmm1, xmm6, ptr[&m_env.temp.uf]);
lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);
movdqa(xmm7, xmm2);
psllw(xmm7, 1);
@ -488,14 +489,14 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psrlw(xmm6, 11);
psllw(xmm6, 3);
lerp16<0>(xmm6, xmm7, ptr[&m_env.temp.uf]);
lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);
psraw(xmm2, 15);
psrlw(xmm2, 8);
psraw(xmm4, 15);
psrlw(xmm4, 8);
lerp16<0>(xmm4, xmm2, ptr[&m_env.temp.uf]);
lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);
// xmm0 = r00
// xmm1 = g00
@ -513,8 +514,8 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psllw(xmm2, 11);
psrlw(xmm2, 8);
lerp16<0>(xmm2, xmm7, ptr[&m_env.temp.uf]);
lerp16<0>(xmm2, xmm0, ptr[&m_env.temp.vf]);
lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);
// xmm2 = r
// xmm1 = g00
@ -534,8 +535,8 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psrlw(xmm0, 11);
psllw(xmm0, 3);
lerp16<0>(xmm0, xmm7, ptr[&m_env.temp.uf]);
lerp16<0>(xmm0, xmm1, ptr[&m_env.temp.vf]);
lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);
// xmm2 = r
// xmm0 = g
@ -555,8 +556,8 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psrlw(xmm1, 11);
psllw(xmm1, 3);
lerp16<0>(xmm1, xmm7, ptr[&m_env.temp.uf]);
lerp16<0>(xmm1, xmm6, ptr[&m_env.temp.vf]);
lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);
// xmm2 = r
// xmm0 = g
@ -571,8 +572,8 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
psraw(xmm5, 15);
psrlw(xmm5, 8);
lerp16<0>(xmm5, xmm3, ptr[&m_env.temp.uf]);
lerp16<0>(xmm5, xmm4, ptr[&m_env.temp.vf]);
lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);
// xmm2 = r
// xmm0 = g
@ -588,7 +589,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
// reload test
movdqa(xmm7, ptr[&m_env.temp.test]);
movdqa(xmm7, ptr[&m_local.temp.test]);
// xmm4 = r
// xmm5 = g
@ -615,29 +616,29 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
// reload fd
movdqa(xmm1, ptr[&m_env.temp.fd]);
movdqa(xmm1, ptr[&m_local.temp.fd]);
}
else
{
if(m_env.sel.twin)
if(m_sel.twin)
{
// u = (u & m_env.twin[0].u).add16(m_env.twin[1].u);
// v = (v & m_env.twin[0].v).add16(m_env.twin[1].v);
// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);
pand(xmm2, ptr[&m_env.twin[0].u]);
paddw(xmm2, ptr[&m_env.twin[1].u]);
pand(xmm3, ptr[&m_env.twin[0].v]);
paddw(xmm3, ptr[&m_env.twin[1].v]);
pand(xmm2, ptr[&m_local.twin[0].u]);
paddw(xmm2, ptr[&m_local.twin[1].u]);
pand(xmm3, ptr[&m_local.twin[0].v]);
paddw(xmm3, ptr[&m_local.twin[1].v]);
}
else
{
// u = u.min_i16(m_env.twin[2].u);
// v = v.min_i16(m_env.twin[2].v);
// u = u.min_i16(m_local.twin[2].u);
// v = v.min_i16(m_local.twin[2].v);
// TODO: if(!sprite) clamp16 else:
pminsw(xmm2, ptr[&m_env.twin[2].u]);
pminsw(xmm3, ptr[&m_env.twin[2].v]);
pminsw(xmm2, ptr[&m_local.twin[2].u]);
pminsw(xmm3, ptr[&m_local.twin[2].v]);
}
// xmm2 = u
@ -696,7 +697,7 @@ void GPUDrawScanlineCodeGenerator::SampleTexture()
void GPUDrawScanlineCodeGenerator::ColorTFX()
{
switch(m_env.sel.tfx)
switch(m_sel.tfx)
{
case 0: // none (tfx = 0)
case 1: // none (tfx = tge)
@ -713,11 +714,11 @@ void GPUDrawScanlineCodeGenerator::ColorTFX()
// c[2] = c[2].modulate16<1>(b).clamp8();
pcmpeqd(xmm0, xmm0);
psrlw(xmm0, 8);
modulate16<1>(xmm4, ptr[&m_env.temp.r]);
modulate16<1>(xmm4, ptr[&m_local.temp.r]);
pminsw(xmm4, xmm0);
modulate16<1>(xmm5, ptr[&m_env.temp.g]);
modulate16<1>(xmm5, ptr[&m_local.temp.g]);
pminsw(xmm5, xmm0);
modulate16<1>(xmm6, ptr[&m_env.temp.b]);
modulate16<1>(xmm6, ptr[&m_local.temp.b]);
pminsw(xmm6, xmm0);
break;
case 3: // decal (tfx = tme)
@ -727,7 +728,7 @@ void GPUDrawScanlineCodeGenerator::ColorTFX()
void GPUDrawScanlineCodeGenerator::AlphaBlend()
{
if(!m_env.sel.abe)
if(!m_sel.abe)
{
return;
}
@ -748,7 +749,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
pand(xmm2, xmm0);
psllw(xmm2, 3);
switch(m_env.sel.abr)
switch(m_sel.abr)
{
case 0:
// r = r.avg8(c[0]);
@ -770,7 +771,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
break;
}
if(m_env.sel.tme)
if(m_sel.tme)
{
movdqa(xmm0, xmm3);
blend8(xmm4, xmm2);
@ -789,7 +790,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
pand(xmm2, xmm0);
psrlw(xmm2, 2);
switch(m_env.sel.abr)
switch(m_sel.abr)
{
case 0:
// g = g.avg8(c[2]);
@ -811,7 +812,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
break;
}
if(m_env.sel.tme)
if(m_sel.tme)
{
movdqa(xmm0, xmm3);
blend8(xmm5, xmm2);
@ -830,7 +831,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
pand(xmm2, xmm0);
psrlw(xmm2, 7);
switch(m_env.sel.abr)
switch(m_sel.abr)
{
case 0:
// b = b.avg8(c[2]);
@ -852,7 +853,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
break;
}
if(m_env.sel.tme)
if(m_sel.tme)
{
movdqa(xmm0, xmm3);
blend8(xmm6, xmm2);
@ -865,7 +866,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend()
void GPUDrawScanlineCodeGenerator::Dither()
{
if(!m_env.sel.dtd)
if(!m_sel.dtd)
{
return;
}
@ -874,7 +875,7 @@ void GPUDrawScanlineCodeGenerator::Dither()
// c[1] = c[1].addus8(dither);
// c[2] = c[2].addus8(dither);
movdqa(xmm0, ptr[&m_env.temp.dither]);
movdqa(xmm0, ptr[&m_local.temp.dither]);
paddusb(xmm4, xmm0);
paddusb(xmm5, xmm0);
@ -883,11 +884,11 @@ void GPUDrawScanlineCodeGenerator::Dither()
void GPUDrawScanlineCodeGenerator::WriteFrame()
{
// GSVector4i fs = r | g | b | (m_env.sel.md ? GSVector4i(0x80008000) : m_env.sel.tme ? a : 0);
// GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0);
pcmpeqd(xmm0, xmm0);
if(m_env.sel.md || m_env.sel.tme)
if(m_sel.md || m_sel.tme)
{
movdqa(xmm2, xmm0);
psllw(xmm2, 15);
@ -916,13 +917,13 @@ void GPUDrawScanlineCodeGenerator::WriteFrame()
psllw(xmm6, 7);
por(xmm4, xmm6);
if(m_env.sel.md)
if(m_sel.md)
{
// GSVector4i a = GSVector4i(0x80008000);
por(xmm4, xmm2);
}
else if(m_env.sel.tme)
else if(m_sel.tme)
{
// GSVector4i a = (c[3] << 8) & 0x80008000;
@ -950,9 +951,9 @@ void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr)
{
pextrw(eax, addr, (uint8)i);
if(m_env.sel.tlu) movzx(eax, byte[esi + eax]);
if(m_sel.tlu) movzx(eax, byte[esi + eax]);
const Address& src = m_env.sel.tlu ? ptr[eax * 2 + (size_t)m_env.clut] : ptr[esi + eax * 2];
const Address& src = m_sel.tlu ? ptr[eax * 2 + (size_t)m_local.gd->clut] : ptr[esi + eax * 2];
if(i == 0) movd(dst, src);
else pinsrw(dst, src, (uint8)i);

View File

@ -33,11 +33,12 @@ class GPUDrawScanlineCodeGenerator : public GSCodeGenerator
static const GSVector4i m_test[8];
static const uint16 m_dither[4][16];
GPUScanlineEnvironment& m_env;
GPUScanlineSelector m_sel;
GPUScanlineLocalData& m_local;
void Generate();
void Init(int params);
void Init();
void Step();
void TestMask();
void SampleTexture();

View File

@ -25,7 +25,7 @@
#pragma pack(push, 1)
__declspec(align(16)) class GPUDrawingEnvironment
__aligned32 class GPUDrawingEnvironment
{
public:
GPURegSTATUS STATUS;

View File

@ -23,11 +23,11 @@
#include "GPURendererSW.h"
#include "GSdx.h"
GPURendererSW::GPURendererSW(GSDevice* dev)
GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
: GPURendererT(dev)
, m_texture(NULL)
{
m_rl.Create<GPUDrawScanline>(this, theApp.GetConfig("swthreads", 1));
m_rl.Create<GPUDrawScanline, GPUScanlineGlobalData>(threads);
}
GPURendererSW::~GPURendererSW()
@ -70,39 +70,42 @@ void GPURendererSW::Draw()
//
GPUScanlineParam p;
GPUScanlineGlobalData gd;
p.sel.key = 0;
p.sel.iip = env.PRIM.IIP;
p.sel.me = env.STATUS.ME;
gd.sel.key = 0;
gd.sel.iip = env.PRIM.IIP;
gd.sel.me = env.STATUS.ME;
if(env.PRIM.ABE)
{
p.sel.abe = env.PRIM.ABE;
p.sel.abr = env.STATUS.ABR;
gd.sel.abe = env.PRIM.ABE;
gd.sel.abr = env.STATUS.ABR;
}
p.sel.tge = env.PRIM.TGE;
gd.sel.tge = env.PRIM.TGE;
if(env.PRIM.TME)
{
p.sel.tme = env.PRIM.TME;
p.sel.tlu = env.STATUS.TP < 2;
p.sel.twin = (env.TWIN.u32 & 0xfffff) != 0;
p.sel.ltf = m_filter == 1 && env.PRIM.TYPE == GPU_POLYGON || m_filter == 2 ? 1 : 0;
gd.sel.tme = env.PRIM.TME;
gd.sel.tlu = env.STATUS.TP < 2;
gd.sel.twin = (env.TWIN.u32 & 0xfffff) != 0;
gd.sel.ltf = m_filter == 1 && env.PRIM.TYPE == GPU_POLYGON || m_filter == 2 ? 1 : 0;
const void* t = m_mem.GetTexture(env.STATUS.TP, env.STATUS.TX, env.STATUS.TY);
if(!t) {ASSERT(0); return;}
p.tex = t;
p.clut = m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y);
gd.tex = t;
gd.clut = m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y);
gd.twin = GSVector4i(env.TWIN.TWW, env.TWIN.TWH, env.TWIN.TWX, env.TWIN.TWY);
}
p.sel.dtd = m_dither ? env.STATUS.DTD : 0;
p.sel.md = env.STATUS.MD;
p.sel.sprite = env.PRIM.TYPE == GPU_SPRITE;
p.sel.scalex = m_mem.GetScale().x;
gd.sel.dtd = m_dither ? env.STATUS.DTD : 0;
gd.sel.md = env.STATUS.MD;
gd.sel.sprite = env.PRIM.TYPE == GPU_SPRITE;
gd.sel.scalex = m_mem.GetScale().x;
gd.vm = m_mem.GetPixelAddress(0, 0);
//
@ -110,7 +113,8 @@ void GPURendererSW::Draw()
data.vertices = m_vertices;
data.count = m_count;
data.param = &p;
data.frame = m_perfmon.GetFrame();
data.param = &gd;
data.scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
data.scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
@ -127,14 +131,6 @@ void GPURendererSW::Draw()
m_rl.Draw(&data);
GSRasterizerStats stats;
m_rl.GetStats(stats);
m_perfmon.Put(GSPerfMon::Draw, 1);
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
// TODO
{
@ -158,6 +154,16 @@ void GPURendererSW::Draw()
Invalidate(r);
}
m_rl.Sync();
GSRasterizerStats stats;
m_rl.GetStats(stats);
m_perfmon.Put(GSPerfMon::Draw, 1);
m_perfmon.Put(GSPerfMon::Prim, stats.prims);
m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
}
void GPURendererSW::VertexKick()

View File

@ -36,6 +36,6 @@ protected:
void Draw();
public:
GPURendererSW(GSDevice* dev);
GPURendererSW(GSDevice* dev, int threads);
virtual ~GPURendererSW();
};

View File

@ -53,29 +53,26 @@ union GPUScanlineSelector
uint32 key;
operator uint32() {return key;}
operator uint32() const {return key;}
};
__declspec(align(16)) struct GPUScanlineParam
{
GPUScanlineSelector sel;
const void* tex;
const uint16* clut;
};
__declspec(align(16)) struct GPUScanlineEnvironment
__aligned32 struct GPUScanlineGlobalData
{
GPUScanlineSelector sel;
void* vm;
const void* tex;
const uint16* clut;
GSVector4i twin; // TWW, TWH, TWX, TWY
};
// GSVector4i md; // similar to gs fba
__aligned32 struct GPUScanlineLocalData
{
const GPUScanlineGlobalData* gd;
struct {GSVector4i u, v;} twin[3];
struct {GSVector4i s, t, r, g, b, _pad[3];} d;
struct {GSVector4i st, c;} d8;
struct {GSVector4i s, t, r, b, g, uf, vf, dither, fd, test;} temp;
};

View File

@ -29,22 +29,24 @@ using namespace Xbyak;
GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_env(*(GPUScanlineEnvironment*)param)
, m_local(*(GPUScanlineLocalData*)param)
{
#if _M_AMD64
#error TODO
#endif
m_sel.key = key;
Generate();
}
void GPUSetupPrimCodeGenerator::Generate()
{
if(m_env.sel.tme && !m_env.sel.twin)
if(m_sel.tme && !m_sel.twin)
{
pcmpeqd(xmm0, xmm0);
if(m_env.sel.sprite)
if(m_sel.sprite)
{
// t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001();
@ -59,30 +61,30 @@ void GPUSetupPrimCodeGenerator::Generate()
packssdw(xmm1, xmm1);
punpcklwd(xmm1, xmm1);
// m_env.twin[2].u = t.xxxx();
// m_env.twin[2].v = t.yyyy();
// m_local.twin[2].u = t.xxxx();
// m_local.twin[2].v = t.yyyy();
pshufd(xmm2, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm3, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
movdqa(ptr[&m_env.twin[2].u], xmm2);
movdqa(ptr[&m_env.twin[2].v], xmm3);
movdqa(ptr[&m_local.twin[2].u], xmm2);
movdqa(ptr[&m_local.twin[2].v], xmm3);
}
else
{
// TODO: not really needed
// m_env.twin[2].u = GSVector4i::x00ff();
// m_env.twin[2].v = GSVector4i::x00ff();
// m_local.twin[2].u = GSVector4i::x00ff();
// m_local.twin[2].v = GSVector4i::x00ff();
psrlw(xmm0, 8);
movdqa(ptr[&m_env.twin[2].u], xmm0);
movdqa(ptr[&m_env.twin[2].v], xmm0);
movdqa(ptr[&m_local.twin[2].u], xmm0);
movdqa(ptr[&m_local.twin[2].v], xmm0);
}
}
if(m_env.sel.tme || m_env.sel.iip && m_env.sel.tfx != 3)
if(m_sel.tme || m_sel.iip && m_sel.tfx != 3)
{
for(int i = 0; i < 3; i++)
{
@ -105,21 +107,21 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm2, xmm2);
packssdw(xmm1, xmm2);
if(m_env.sel.tme)
if(m_sel.tme)
{
// m_env.d8.st = dtc8.upl16(dtc8);
// m_local.d8.st = dtc8.upl16(dtc8);
movdqa(xmm0, xmm1);
punpcklwd(xmm0, xmm0);
movdqa(ptr[&m_env.d8.st], xmm0);
movdqa(ptr[&m_local.d8.st], xmm0);
}
if(m_env.sel.iip && m_env.sel.tfx != 3)
if(m_sel.iip && m_sel.tfx != 3)
{
// m_env.d8.c = dtc8.uph16(dtc8);
// m_local.d8.c = dtc8.uph16(dtc8);
punpckhwd(xmm1, xmm1);
movdqa(ptr[&m_env.d8.c], xmm1);
movdqa(ptr[&m_local.d8.c], xmm1);
}
// xmm3 = dt
@ -128,7 +130,7 @@ void GPUSetupPrimCodeGenerator::Generate()
// xmm7 = ps4567
// xmm0, xmm1, xmm2, xmm5 = free
if(m_env.sel.tme)
if(m_sel.tme)
{
// GSVector4 dtx = dt.xxxx();
// GSVector4 dty = dt.yyyy();
@ -137,7 +139,7 @@ void GPUSetupPrimCodeGenerator::Generate()
shufps(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
// m_env.d.s = GSVector4i(dtx * ps0123).ps32(GSVector4i(dtx * ps4567));
// m_local.d.s = GSVector4i(dtx * ps0123).ps32(GSVector4i(dtx * ps4567));
movaps(xmm1, xmm3);
mulps(xmm3, xmm6);
@ -145,9 +147,9 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm3, xmm3);
cvttps2dq(xmm1, xmm1);
packssdw(xmm3, xmm1);
movdqa(ptr[&m_env.d.s], xmm3);
movdqa(ptr[&m_local.d.s], xmm3);
// m_env.d.t = GSVector4i(dty * ps0123).ps32(GSVector4i(dty * ps4567));
// m_local.d.t = GSVector4i(dty * ps0123).ps32(GSVector4i(dty * ps4567));
movaps(xmm1, xmm0);
mulps(xmm0, xmm6);
@ -155,7 +157,7 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm0, xmm0);
cvttps2dq(xmm1, xmm1);
packssdw(xmm0, xmm1);
movdqa(ptr[&m_env.d.t], xmm0);
movdqa(ptr[&m_local.d.t], xmm0);
}
// xmm4 = dc
@ -163,7 +165,7 @@ void GPUSetupPrimCodeGenerator::Generate()
// xmm7 = ps4567
// xmm0, xmm1, zmm2, xmm3, xmm5 = free
if(m_env.sel.iip && m_env.sel.tfx != 3)
if(m_sel.iip && m_sel.tfx != 3)
{
// GSVector4 dcx = dc.xxxx();
// GSVector4 dcy = dc.yyyy();
@ -175,7 +177,7 @@ void GPUSetupPrimCodeGenerator::Generate()
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d.r = GSVector4i(dcx * ps0123).ps32(GSVector4i(dcx * ps4567));
// m_local.d.r = GSVector4i(dcx * ps0123).ps32(GSVector4i(dcx * ps4567));
movaps(xmm2, xmm4);
mulps(xmm4, xmm6);
@ -183,9 +185,9 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm4, xmm4);
cvttps2dq(xmm2, xmm2);
packssdw(xmm4, xmm2);
movdqa(ptr[&m_env.d.r], xmm4);
movdqa(ptr[&m_local.d.r], xmm4);
// m_env.d.g = GSVector4i(dcy * ps0123).ps32(GSVector4i(dcy * ps4567));
// m_local.d.g = GSVector4i(dcy * ps0123).ps32(GSVector4i(dcy * ps4567));
movaps(xmm2, xmm0);
mulps(xmm0, xmm6);
@ -193,9 +195,9 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm0, xmm0);
cvttps2dq(xmm2, xmm2);
packssdw(xmm0, xmm2);
movdqa(ptr[&m_env.d.g], xmm0);
movdqa(ptr[&m_local.d.g], xmm0);
// m_env.d.b = GSVector4i(dcz * ps0123).ps32(GSVector4i(dcz * ps4567));
// m_local.d.b = GSVector4i(dcz * ps0123).ps32(GSVector4i(dcz * ps4567));
movaps(xmm2, xmm1);
mulps(xmm1, xmm6);
@ -203,7 +205,7 @@ void GPUSetupPrimCodeGenerator::Generate()
cvttps2dq(xmm1, xmm1);
cvttps2dq(xmm2, xmm2);
packssdw(xmm1, xmm2);
movdqa(ptr[&m_env.d.b], xmm1);
movdqa(ptr[&m_local.d.b], xmm1);
}
}

View File

@ -30,7 +30,8 @@ class GPUSetupPrimCodeGenerator : public GSCodeGenerator
static const GSVector4 m_shift[3];
GPUScanlineEnvironment& m_env;
GPUScanlineSelector m_sel;
GPUScanlineLocalData& m_local;
void Generate();

View File

@ -29,7 +29,7 @@
#include "GSUtil.h"
#include "GSPerfMon.h"
class GPUState : public GSAlignedClass<16>
class GPUState : public GSAlignedClass<32>
{
typedef void (GPUState::*GPUStatusCommandHandler)(GPUReg* r);

View File

@ -26,7 +26,7 @@
#pragma pack(push, 1)
__declspec(align(16)) struct GPUVertex
__aligned32 struct GPUVertex
{
union
{

View File

@ -149,7 +149,7 @@ EXPORT_C GSclose()
s_gs->m_wnd.Detach();
}
static INT32 _GSopen(void* dsp, char* title, int renderer)
static INT32 _GSopen(void* dsp, char* title, int renderer, int threads = -1)
{
GSDevice* dev = NULL;
@ -158,6 +158,11 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
renderer = theApp.GetConfig("renderer", 0);
}
if(threads == -1)
{
threads = theApp.GetConfig("swthreads", 1);
}
try
{
if (s_renderer != renderer)
@ -196,20 +201,21 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
s_gs = new GSRendererNull();
break;
case 1: case 4: case 7: case 10: case 12:
s_gs = new GSRendererSW();
s_gs = new GSRendererSW(threads);
break;
}
s_renderer = renderer;
}
}
catch( std::exception& ex )
catch(std::exception& ex)
{
// Allowing std exceptions to escape the scope of the plugin callstack could
// be problematic, because of differing typeids between DLL and EXE compilations.
// ('new' could throw std::bad_alloc)
printf( "GSdx error: Exception caught in GSopen: %s", ex.what() );
printf("GSdx error: Exception caught in GSopen: %s", ex.what());
return -1;
}
@ -218,7 +224,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
s_gs->SetVsync(s_vsync);
s_gs->SetFrameLimit(s_framelimit);
if( *(HWND*)dsp == NULL )
if(*(HWND*)dsp == NULL)
{
// old-style API expects us to create and manage our own window:
@ -232,21 +238,23 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
}
s_gs->m_wnd.Show();
*(HWND*)dsp = (HWND)s_gs->m_wnd.GetHandle();
}
else
{
s_gs->SetMultithreaded( true );
s_gs->m_wnd.Attach( *(HWND*)dsp, false );
s_gs->SetMultithreaded(true);
s_gs->m_wnd.Attach(*(HWND*)dsp, false);
}
if( !s_gs->CreateDevice(dev) )
if(!s_gs->CreateDevice(dev))
{
// This probably means the user has DX11 configured with a video card that is only DX9
// compliant. Could mean driver issues of some sort also, but to be sure, that's the most
// common cause of device creation errors. :) --air
GSclose();
return -1;
}
@ -255,17 +263,18 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
return 0;
}
// Opens the plugin through _GSopen() using the configured renderer.
//
// dsp   - passed straight through to _GSopen()
// flags - when bit 2 (value 4) is set the software renderer is used instead of the
//         configured one: DX11 sw if isdx11avail, otherwise DX9 sw
//
// Returns whatever _GSopen() returns (0 on success, -1 on failure).
EXPORT_C_(INT32) GSopen2(void* dsp, INT32 flags)
{
	int renderer = theApp.GetConfig("renderer", 0);

	if(flags & 4)
	{
		renderer = isdx11avail ? 4 : 1; // dx11 / dx9 sw
	}

	INT32 retval = _GSopen(dsp, NULL, renderer);

	// _GSopen() returns -1 when creating the renderer throws or when CreateDevice() fails
	// (after calling GSclose()), so s_gs must not be touched blindly here. Use the same
	// guard GSopen() applies before it calls into the renderer.
	if(retval == 0 && s_gs)
	{
		s_gs->SetAspectRatio(0); // PCSX2 manages the aspect ratios
	}

	return retval;
}
@ -275,18 +284,21 @@ EXPORT_C_(INT32) GSopen(void* dsp, char* title, int mt)
int renderer;
// Legacy GUI expects to acquire vsync from the configuration files.
s_vsync = !!theApp.GetConfig("vsync", 0);
if(mt == 2)
{
// pcsx2 sent a switch renderer request
if (isdx11avail) renderer = 4; //dx11 sw
else renderer = 1; //dx9 sw
renderer = isdx11avail ? 4 : 1; // dx11 / dx9 sw
mt = 1;
}
else
{
// normal init
renderer = theApp.GetConfig("renderer", 0);
}
@ -294,9 +306,9 @@ EXPORT_C_(INT32) GSopen(void* dsp, char* title, int mt)
int retval = _GSopen(dsp, title, renderer);
if( retval == 0 && s_gs )
if(retval == 0 && s_gs)
{
s_gs->SetMultithreaded( !!mt );
s_gs->SetMultithreaded(!!mt);
}
return retval;
@ -370,10 +382,14 @@ EXPORT_C GSvsync(int field)
// Asks the current renderer to write a snapshot whose file name starts with "gsdx"
// inside the directory given by path.
//
// path - directory to save into; a trailing backslash is appended when it is missing
//
// Returns the result of GSRenderer::MakeSnapshot().
EXPORT_C_(uint32) GSmakeSnapshot(char* path)
{
	string s(path);

	// An empty string has no last character (back() on it is undefined), so only
	// inspect the tail when there is one. An empty path is left without a separator.
	if(!s.empty() && s.back() != '\\')
	{
		s = s + "\\";
	}

	return s_gs->MakeSnapshot(s + "gsdx");
}
EXPORT_C GSkeyEvent(GSKeyEventData* e)
@ -401,13 +417,14 @@ EXPORT_C_(int) GSfreeze(int mode, GSFreezeData* data)
EXPORT_C GSconfigure()
{
if( !GSUtil::CheckSSE() ) return;
if(!GSUtil::CheckSSE()) return;
if( GSSettingsDlg( s_IsGsOpen2 ).DoModal() == IDOK )
if(GSSettingsDlg(s_IsGsOpen2).DoModal() == IDOK)
{
if( s_gs != NULL && s_gs->m_wnd.IsManaged() )
if(s_gs != NULL && s_gs->m_wnd.IsManaged())
{
// Legacy apps like gsdxgui expect this...
GSshutdown();
}
}
@ -427,7 +444,9 @@ EXPORT_C_(INT32) GStest()
if(!GSUtil::CheckDirectX())
{
if(SUCCEEDED(s_hr))
{
::CoUninitialize();
}
s_hr = E_FAIL;
@ -435,7 +454,9 @@ EXPORT_C_(INT32) GStest()
}
if(SUCCEEDED(s_hr))
{
::CoUninitialize();
}
s_hr = E_FAIL;
@ -451,7 +472,8 @@ EXPORT_C GSabout()
EXPORT_C GSirqCallback(void (*irq)())
{
s_irq = irq;
if( s_gs )
if(s_gs)
{
s_gs->SetIrqCallback(s_irq);
}
@ -462,9 +484,13 @@ EXPORT_C_(int) GSsetupRecording(int start, void* data)
if(!s_gs) return 0;
if(start & 1)
{
s_gs->BeginCapture();
}
else
{
s_gs->EndCapture();
}
return 1;
}
@ -486,13 +512,15 @@ EXPORT_C GSgetLastTag(uint32* tag)
EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
{
if (!s_gs->m_GStitleInfoBuffer[0])
if(!s_gs->m_GStitleInfoBuffer[0])
{
strcpy(dest, "GSdx");
}
else
{
EnterCriticalSection(&s_gs->m_pGSsetTitle_Crit);
snprintf(dest, length-1, "GSdx | %s", s_gs->m_GStitleInfoBuffer);
dest[length-1] = 0; // just in case!
snprintf(dest, length - 1, "GSdx | %s", s_gs->m_GStitleInfoBuffer);
dest[length - 1] = 0; // just in case!
LeaveCriticalSection(&s_gs->m_pGSsetTitle_Crit);
}
}
@ -505,22 +533,31 @@ EXPORT_C GSsetFrameSkip(int frameskip)
EXPORT_C GSsetVsync(int enabled)
{
s_vsync = !!enabled;
if( s_gs )
if(s_gs)
{
s_gs->SetVsync(s_vsync);
}
}
EXPORT_C GSsetExclusive(int enabled)
{
s_exclusive = !!enabled;
if( s_gs )
if(s_gs)
{
s_gs->SetVsync(s_vsync);
}
}
EXPORT_C GSsetFrameLimit(int limit)
{
s_framelimit = !!limit;
if( s_gs )
if(s_gs)
{
s_gs->SetFrameLimit(s_framelimit);
}
}
#ifdef _WINDOWS
@ -595,6 +632,7 @@ public:
// lpszCmdLine:
// First parameter is the renderer.
// Second parameter is the gs file to load and run.
EXPORT_C GSReplay(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
{
int renderer = -1;

View File

@ -23,13 +23,13 @@
#include "GSDrawScanline.h"
#include "GSTextureCacheSW.h"
GSDrawScanline::GSDrawScanline(GSState* state, int id)
: m_state(state)
, m_id(id)
, m_sp_map("GSSetupPrim", &m_env)
, m_ds_map("GSDrawScanline", &m_env)
GSDrawScanline::GSDrawScanline(GSScanlineGlobalData* gd)
: m_sp_map("GSSetupPrim", &m_local)
, m_ds_map("GSDrawScanline", &m_local)
{
memset(&m_env, 0, sizeof(m_env));
memset(&m_local, 0, sizeof(m_local));
m_local.gd = gd;
}
GSDrawScanline::~GSDrawScanline()
@ -38,137 +38,13 @@ GSDrawScanline::~GSDrawScanline()
void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
{
GSDrawingEnvironment& env = m_state->m_env;
GSDrawingContext* context = m_state->m_context;
m_ds = m_ds_map[m_local.gd->sel];
const GSScanlineParam* p = (const GSScanlineParam*)data->param;
m_sel = p->sel;
m_env.vm = p->vm;
m_env.fbr = p->fbo->pixel.row;
m_env.zbr = p->zbo->pixel.row;
m_env.fbc = p->fbo->pixel.col[0];
m_env.zbc = p->zbo->pixel.col[0];
m_env.fzbr = p->fzbo->row;
m_env.fzbc = p->fzbo->col;
m_env.fm = GSVector4i(p->fm);
m_env.zm = GSVector4i(p->zm);
m_env.aref = GSVector4i((int)context->TEST.AREF);
m_env.afix = GSVector4i((int)context->ALPHA.FIX << 7).xxzzlh();
m_env.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff);
m_env.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff);
m_env.dimx = env.dimx;
if(m_sel.fpsm == 1)
{
m_env.fm |= GSVector4i::xff000000();
}
else if(m_sel.fpsm == 2)
{
GSVector4i rb = m_env.fm & 0x00f800f8;
GSVector4i ga = m_env.fm & 0x8000f800;
m_env.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | GSVector4i::xffff0000();
}
if(m_sel.zpsm == 1)
{
m_env.zm |= GSVector4i::xff000000();
}
else if(m_sel.zpsm == 2)
{
m_env.zm |= GSVector4i::xffff0000();
}
if(m_sel.atst == ATST_LESS)
{
m_sel.atst = ATST_LEQUAL;
m_env.aref -= GSVector4i::x00000001();
}
else if(m_sel.atst == ATST_GREATER)
{
m_sel.atst = ATST_GEQUAL;
m_env.aref += GSVector4i::x00000001();
}
if(m_sel.tfx != TFX_NONE)
{
m_env.tex = p->tex;
m_env.clut = p->clut;
unsigned short tw = (unsigned short)(1 << context->TEX0.TW);
unsigned short th = (unsigned short)(1 << context->TEX0.TH);
switch(context->CLAMP.WMS)
{
case CLAMP_REPEAT:
m_env.t.min.u16[0] = tw - 1;
m_env.t.max.u16[0] = 0;
m_env.t.mask.u32[0] = 0xffffffff;
break;
case CLAMP_CLAMP:
m_env.t.min.u16[0] = 0;
m_env.t.max.u16[0] = tw - 1;
m_env.t.mask.u32[0] = 0;
break;
case CLAMP_REGION_CLAMP:
m_env.t.min.u16[0] = std::min<int>(context->CLAMP.MINU, tw - 1);
m_env.t.max.u16[0] = std::min<int>(context->CLAMP.MAXU, tw - 1);
m_env.t.mask.u32[0] = 0;
break;
case CLAMP_REGION_REPEAT:
m_env.t.min.u16[0] = context->CLAMP.MINU;
m_env.t.max.u16[0] = context->CLAMP.MAXU;
m_env.t.mask.u32[0] = 0xffffffff;
break;
default:
__assume(0);
}
switch(context->CLAMP.WMT)
{
case CLAMP_REPEAT:
m_env.t.min.u16[4] = th - 1;
m_env.t.max.u16[4] = 0;
m_env.t.mask.u32[2] = 0xffffffff;
break;
case CLAMP_CLAMP:
m_env.t.min.u16[4] = 0;
m_env.t.max.u16[4] = th - 1;
m_env.t.mask.u32[2] = 0;
break;
case CLAMP_REGION_CLAMP:
m_env.t.min.u16[4] = std::min<int>(context->CLAMP.MINV, th - 1);
m_env.t.max.u16[4] = std::min<int>(context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
m_env.t.mask.u32[2] = 0;
break;
case CLAMP_REGION_REPEAT:
m_env.t.min.u16[4] = context->CLAMP.MINV;
m_env.t.max.u16[4] = context->CLAMP.MAXV;
m_env.t.mask.u32[2] = 0xffffffff;
break;
default:
__assume(0);
}
m_env.t.min = m_env.t.min.xxxxlh();
m_env.t.max = m_env.t.max.xxxxlh();
m_env.t.mask = m_env.t.mask.xxzz();
m_env.t.invmask = ~m_env.t.mask;
}
//
m_ds = m_ds_map[m_sel];
if(m_sel.aa1)// && (m_state->m_perfmon.GetFrame() & 0x40))
if(m_local.gd->sel.aa1)// && (m_state->m_perfmon.GetFrame() & 0x40))
{
GSScanlineSelector sel;
sel.key = m_sel.key;
sel.key = m_local.gd->sel.key;
sel.zwrite = 0;
sel.edge = 1;
@ -179,7 +55,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
m_de = NULL;
}
if(m_sel.IsSolidRect())
if(m_local.gd->sel.IsSolidRect())
{
m_dr = (DrawRectPtr)&GSDrawScanline::DrawRect;
}
@ -194,22 +70,22 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.key = 0;
sel.iip = m_sel.iip;
sel.tfx = m_sel.tfx;
sel.tcc = m_sel.tcc;
sel.fst = m_sel.fst;
sel.fge = m_sel.fge;
sel.sprite = m_sel.sprite;
sel.fb = m_sel.fb;
sel.zb = m_sel.zb;
sel.zoverflow = m_sel.zoverflow;
sel.iip = m_local.gd->sel.iip;
sel.tfx = m_local.gd->sel.tfx;
sel.tcc = m_local.gd->sel.tcc;
sel.fst = m_local.gd->sel.fst;
sel.fge = m_local.gd->sel.fge;
sel.sprite = m_local.gd->sel.sprite;
sel.fb = m_local.gd->sel.fb;
sel.zb = m_local.gd->sel.zb;
sel.zoverflow = m_local.gd->sel.zoverflow;
m_sp = m_sp_map[sel];
}
void GSDrawScanline::EndDraw(const GSRasterizerStats& stats)
void GSDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame)
{
m_ds_map.UpdateStats(stats, m_state->m_perfmon.GetFrame());
m_ds_map.UpdateStats(stats, frame);
}
void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
@ -221,56 +97,62 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
uint32 m;
m = m_env.zm.u32[0];
m = m_local.gd->zm.u32[0];
if(m != 0xffffffff)
{
const int* zbr = m_local.gd->zbr;
const int* zbc = m_local.gd->zbc;
uint32 z = (uint32)v.p.z;
if(m_sel.zpsm != 2)
if(m_local.gd->sel.zpsm != 2)
{
if(m == 0)
{
DrawRectT<uint32, false>(m_env.zbr, m_env.zbc, r, z, m);
DrawRectT<uint32, false>(zbr, zbc, r, z, m);
}
else
{
DrawRectT<uint32, true>(m_env.zbr, m_env.zbc, r, z, m);
DrawRectT<uint32, true>(zbr, zbc, r, z, m);
}
}
else
{
if(m == 0)
{
DrawRectT<uint16, false>(m_env.zbr, m_env.zbc, r, z, m);
DrawRectT<uint16, false>(zbr, zbc, r, z, m);
}
else
{
DrawRectT<uint16, true>(m_env.zbr, m_env.zbc, r, z, m);
DrawRectT<uint16, true>(zbr, zbc, r, z, m);
}
}
}
m = m_env.fm.u32[0];
m = m_local.gd->fm.u32[0];
if(m != 0xffffffff)
{
const int* fbr = m_local.gd->fbr;
const int* fbc = m_local.gd->fbc;
uint32 c = (GSVector4i(v.c) >> 7).rgba32();
if(m_state->m_context->FBA.FBA)
if(m_local.gd->sel.fba)
{
c |= 0x80000000;
}
if(m_sel.fpsm != 2)
if(m_local.gd->sel.fpsm != 2)
{
if(m == 0)
{
DrawRectT<uint32, false>(m_env.fbr, m_env.fbc, r, c, m);
DrawRectT<uint32, false>(fbr, fbc, r, c, m);
}
else
{
DrawRectT<uint32, true>(m_env.fbr, m_env.fbc, r, c, m);
DrawRectT<uint32, true>(fbr, fbc, r, c, m);
}
}
else
@ -279,11 +161,11 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
if(m == 0)
{
DrawRectT<uint16, false>(m_env.fbr, m_env.fbc, r, c, m);
DrawRectT<uint16, false>(fbr, fbc, r, c, m);
}
else
{
DrawRectT<uint16, true>(m_env.fbr, m_env.fbc, r, c, m);
DrawRectT<uint16, true>(fbr, fbc, r, c, m);
}
}
}
@ -331,9 +213,11 @@ void GSDrawScanline::FillRect(const int* RESTRICT row, const int* RESTRICT col,
{
if(r.x >= r.z) return;
T* vm = (T*)m_local.gd->vm;
for(int y = r.y; y < r.w; y++)
{
T* RESTRICT d = &((T*)m_env.vm)[row[y]];
T* RESTRICT d = &vm[row[y]];
for(int x = r.x; x < r.z; x++)
{
@ -347,9 +231,11 @@ void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col,
{
if(r.x >= r.z) return;
T* vm = (T*)m_local.gd->vm;
for(int y = r.y; y < r.w; y += 8)
{
T* RESTRICT d = &((T*)m_env.vm)[row[y]];
T* RESTRICT d = &vm[row[y]];
for(int x = r.x; x < r.z; x += 8 * 4 / sizeof(T))
{

View File

@ -29,8 +29,7 @@
class GSDrawScanline : public IDrawScanline
{
GSScanlineEnvironment m_env;
GSScanlineSelector m_sel;
GSScanlineLocalData m_local;
GSCodeGeneratorFunctionMap<GSSetupPrimCodeGenerator, uint64, SetupPrimPtr> m_sp_map;
GSCodeGeneratorFunctionMap<GSDrawScanlineCodeGenerator, uint64, DrawScanlinePtr> m_ds_map;
@ -46,17 +45,13 @@ class GSDrawScanline : public IDrawScanline
template<class T, bool masked>
__forceinline void FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m);
protected:
GSState* m_state;
int m_id;
public:
GSDrawScanline(GSState* state, int id);
GSDrawScanline(GSScanlineGlobalData* gd);
virtual ~GSDrawScanline();
// IDrawScanline
void BeginDraw(const GSRasterizerData* data);
void EndDraw(const GSRasterizerStats& stats);
void EndDraw(const GSRasterizerStats& stats, uint64 frame);
void PrintStats() {m_ds_map.PrintStats();}
};

File diff suppressed because it is too large Load Diff

View File

@ -32,12 +32,12 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
static const GSVector4i m_test[8];
GSScanlineEnvironment& m_env;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
void Generate();
void Init(int params);
void Init();
void Step();
void TestZ(const Xmm& temp1, const Xmm& temp2);
void SampleTexture();
@ -53,7 +53,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame(int params);
void WriteFrame();
void ReadPixel(const Xmm& dst, const Reg32& addr);
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz);

View File

@ -110,7 +110,7 @@ void GSRasterizer::Draw(const GSRasterizerData* data)
m_stats.ticks = __rdtsc() - start;
m_ds->EndDraw(m_stats);
m_ds->EndDraw(m_stats, data->frame);
}
void GSRasterizer::GetStats(GSRasterizerStats& stats)
@ -908,23 +908,19 @@ void GSRasterizerMT::ThreadProc()
//
GSRasterizerList::GSRasterizerList()
: m_sync(0)
, m_syncstart(0)
, m_param(NULL)
{
}
GSRasterizerList::~GSRasterizerList()
{
FreeRasterizers();
}
void GSRasterizerList::FreeRasterizers()
{
for(size_t i = 0; i < size(); i++) delete (*this)[i];
clear();
for(size_t i = 0; i < m_ready.size(); i++) CloseHandle(m_ready[i]);
m_ready.clear();
if(m_param) _aligned_free(m_param);
}
void GSRasterizerList::Sync()
@ -956,6 +952,8 @@ void GSRasterizerList::Draw(const GSRasterizerData* data)
{
m_stats.Reset();
memcpy(m_param, data->param, m_param_size);
m_start = __rdtsc();
m_sync = m_syncstart;

View File

@ -34,6 +34,7 @@ public:
GS_PRIM_CLASS primclass;
const GSVertexSW* vertices;
int count;
uint64 frame;
const void* param;
};
@ -55,7 +56,7 @@ public:
virtual ~IDrawScanline() {}
virtual void BeginDraw(const GSRasterizerData* data) = 0;
virtual void EndDraw(const GSRasterizerStats& stats) = 0;
virtual void EndDraw(const GSRasterizerStats& stats, uint64 frame) = 0;
virtual void PrintStats() = 0;
__forceinline void SetupPrim(const GSVertexSW* v, const GSVertexSW& dscan) {m_sp(v, dscan);}
@ -141,28 +142,29 @@ protected:
long m_syncstart;
GSRasterizerStats m_stats;
int64 m_start;
void FreeRasterizers();
void* m_param;
size_t m_param_size;
public:
GSRasterizerList();
virtual ~GSRasterizerList();
template<class DS, class T> void Create(T* parent, int threads)
template<class DS, class PARAM> void Create(int threads)
{
FreeRasterizers();
threads = std::max<int>(threads, 1); // TODO: min(threads, number of cpu cores)
m_param = _aligned_malloc(sizeof(PARAM), 32);
m_param_size = sizeof(PARAM);
m_syncstart = 0;
push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
push_back(new GSRasterizer(new DS((PARAM*)m_param), 0, threads));
for(int i = 1; i < threads; i++)
{
HANDLE ready = CreateEvent(NULL, FALSE, FALSE, NULL);
HANDLE ready = CreateEvent(NULL, FALSE, TRUE, NULL);
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, ready, m_sync));
push_back(new GSRasterizerMT(new DS((PARAM*)m_param), i, threads, ready, m_sync));
m_ready.push_back(ready);

View File

@ -24,14 +24,14 @@
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
GSRendererSW::GSRendererSW()
GSRendererSW::GSRendererSW(int threads)
: GSRendererT()
{
m_tc = new GSTextureCacheSW(this);
memset(m_texture, 0, sizeof(m_texture));
m_rl.Create<GSDrawScanline>(this, theApp.GetConfig("swthreads", 1));
m_rl.Create<GSDrawScanline, GSScanlineGlobalData>(threads);
InitVertexKick<GSRendererSW>();
}
@ -128,11 +128,11 @@ void GSRendererSW::Draw()
m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
}
GSScanlineParam p;
GSScanlineGlobalData gd;
GetScanlineParam(p, m_vt.m_primclass);
GetScanlineGlobalData(gd);
if((p.fm & p.zm) == 0xffffffff)
if(!gd.sel.fwrite && !gd.sel.zwrite)
{
return;
}
@ -176,18 +176,19 @@ void GSRendererSW::Draw()
data.primclass = m_vt.m_primclass;
data.vertices = m_vertices;
data.count = m_count;
data.param = &p;
data.frame = m_perfmon.GetFrame();
data.param = &gd;
m_rl.Draw(&data);
GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(data.scissor);
if(p.fm != 0xffffffff)
if(gd.sel.fwrite)
{
m_tc->InvalidateVideoMem(m_context->offset.fb, r);
}
if(p.zm != 0xffffffff)
if(gd.sel.zwrite)
{
m_tc->InvalidateVideoMem(m_context->offset.zb, r);
}
@ -230,7 +231,7 @@ void GSRendererSW::Draw()
if(0)//stats.ticks > 5000000)
{
printf("* [%I64d | %012I64x] ticks %I64d prims %d (%d) pixels %d (%d)\n",
m_perfmon.GetFrame(), p.sel.key,
m_perfmon.GetFrame(), gd.sel.key,
stats.ticks,
stats.prims, stats.prims > 0 ? (int)(stats.ticks / stats.prims) : -1,
stats.pixels, stats.pixels > 0 ? (int)(stats.ticks / stats.pixels) : -1);
@ -242,33 +243,38 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
}
void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
{
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context;
const GS_PRIM_CLASS primclass = m_vt.m_primclass;
p.vm = m_mem.m_vm8;
gd.vm = m_mem.m_vm8;
gd.dimx = env.dimx;
p.fbo = context->offset.fb;
p.zbo = context->offset.zb;
p.fzbo = context->offset.fzb;
gd.fbr = context->offset.fb->pixel.row;
gd.zbr = context->offset.zb->pixel.row;
gd.fbc = context->offset.fb->pixel.col[0];
gd.zbc = context->offset.zb->pixel.col[0];
gd.fzbr = context->offset.fzb->row;
gd.fzbc = context->offset.fzb->col;
p.sel.key = 0;
gd.sel.key = 0;
p.sel.fpsm = 3;
p.sel.zpsm = 3;
p.sel.atst = ATST_ALWAYS;
p.sel.tfx = TFX_NONE;
p.sel.ababcd = 255;
p.sel.sprite = primclass == GS_SPRITE_CLASS ? 1 : 0;
gd.sel.fpsm = 3;
gd.sel.zpsm = 3;
gd.sel.atst = ATST_ALWAYS;
gd.sel.tfx = TFX_NONE;
gd.sel.ababcd = 255;
gd.sel.sprite = primclass == GS_SPRITE_CLASS ? 1 : 0;
p.fm = context->FRAME.FBMSK;
p.zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
uint32 fm = context->FRAME.FBMSK;
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
{
p.fm = 0xffffffff;
p.zm = 0xffffffff;
fm = 0xffffffff;
zm = 0xffffffff;
}
if(PRIM->TME)
@ -278,46 +284,60 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
if(context->TEST.ATE)
{
if(!TryAlphaTest(p.fm, p.zm))
if(!TryAlphaTest(fm, zm))
{
p.sel.atst = context->TEST.ATST;
p.sel.afail = context->TEST.AFAIL;
gd.sel.atst = context->TEST.ATST;
gd.sel.afail = context->TEST.AFAIL;
gd.aref = GSVector4i((int)context->TEST.AREF);
switch(gd.sel.atst)
{
case ATST_LESS:
gd.sel.atst = ATST_LEQUAL;
gd.aref -= GSVector4i::x00000001();
break;
case ATST_GREATER:
gd.sel.atst = ATST_GEQUAL;
gd.aref += GSVector4i::x00000001();
break;
}
}
}
bool fwrite = p.fm != 0xffffffff;
bool ftest = p.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
bool fwrite = fm != 0xffffffff;
bool ftest = gd.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
p.sel.fwrite = fwrite;
p.sel.ftest = ftest;
gd.sel.fwrite = fwrite;
gd.sel.ftest = ftest;
if(fwrite || ftest)
{
p.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
gd.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
{
p.sel.iip = PRIM->IIP;
gd.sel.iip = PRIM->IIP;
}
if(PRIM->TME)
{
p.sel.tfx = context->TEX0.TFX;
p.sel.tcc = context->TEX0.TCC;
p.sel.fst = PRIM->FST;
p.sel.ltf = IsLinear();
p.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0;
p.sel.wms = context->CLAMP.WMS;
p.sel.wmt = context->CLAMP.WMT;
gd.sel.tfx = context->TEX0.TFX;
gd.sel.tcc = context->TEX0.TCC;
gd.sel.fst = PRIM->FST;
gd.sel.ltf = IsLinear();
gd.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0;
gd.sel.wms = context->CLAMP.WMS;
gd.sel.wmt = context->CLAMP.WMT;
if(p.sel.tfx == TFX_MODULATE && p.sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
if(gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
{
// modulate does not do anything when vertex color is 0x80
p.sel.tfx = TFX_DECAL;
gd.sel.tfx = TFX_DECAL;
}
if(p.sel.fst == 0)
if(gd.sel.fst == 0)
{
// skip per pixel division if q is constant
@ -325,7 +345,7 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
if(m_vt.m_eq.q)
{
p.sel.fst = 1;
gd.sel.fst = 1;
if(v[0].t.z != 1.0f)
{
@ -339,7 +359,7 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
}
else if(primclass == GS_SPRITE_CLASS)
{
p.sel.fst = 1;
gd.sel.fst = 1;
for(int i = 0, j = m_count; i < j; i += 2)
{
@ -351,11 +371,11 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
}
}
if(p.sel.ltf)
if(gd.sel.ltf)
{
GSVector4 half(0x8000, 0x8000);
if(p.sel.fst)
if(gd.sel.fst)
{
// if q is constant we can do the half pel shift for bilinear sampling on the vertices
@ -370,68 +390,160 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
GSVector4i r;
GetTextureMinMax(r, p.sel.ltf);
GetTextureMinMax(r, gd.sel.ltf);
const GSTextureCacheSW::GSTexture* t = m_tc->Lookup(context->TEX0, env.TEXA, r);
if(!t) {ASSERT(0); return;}
p.tex = t->m_buff;
p.clut = m_mem.m_clut;
gd.tex = t->m_buff;
gd.clut = m_mem.m_clut;
p.sel.tw = t->m_tw - 3;
gd.sel.tw = t->m_tw - 3;
uint16 tw = (uint16)(1 << context->TEX0.TW);
uint16 th = (uint16)(1 << context->TEX0.TH);
switch(context->CLAMP.WMS)
{
case CLAMP_REPEAT:
gd.t.min.u16[0] = tw - 1;
gd.t.max.u16[0] = 0;
gd.t.mask.u32[0] = 0xffffffff;
break;
case CLAMP_CLAMP:
gd.t.min.u16[0] = 0;
gd.t.max.u16[0] = tw - 1;
gd.t.mask.u32[0] = 0;
break;
case CLAMP_REGION_CLAMP:
gd.t.min.u16[0] = std::min<int>(context->CLAMP.MINU, tw - 1);
gd.t.max.u16[0] = std::min<int>(context->CLAMP.MAXU, tw - 1);
gd.t.mask.u32[0] = 0;
break;
case CLAMP_REGION_REPEAT:
gd.t.min.u16[0] = context->CLAMP.MINU;
gd.t.max.u16[0] = context->CLAMP.MAXU;
gd.t.mask.u32[0] = 0xffffffff;
break;
default:
__assume(0);
}
switch(context->CLAMP.WMT)
{
case CLAMP_REPEAT:
gd.t.min.u16[4] = th - 1;
gd.t.max.u16[4] = 0;
gd.t.mask.u32[2] = 0xffffffff;
break;
case CLAMP_CLAMP:
gd.t.min.u16[4] = 0;
gd.t.max.u16[4] = th - 1;
gd.t.mask.u32[2] = 0;
break;
case CLAMP_REGION_CLAMP:
gd.t.min.u16[4] = std::min<int>(context->CLAMP.MINV, th - 1);
gd.t.max.u16[4] = std::min<int>(context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
gd.t.mask.u32[2] = 0;
break;
case CLAMP_REGION_REPEAT:
gd.t.min.u16[4] = context->CLAMP.MINV;
gd.t.max.u16[4] = context->CLAMP.MAXV;
gd.t.mask.u32[2] = 0xffffffff;
break;
default:
__assume(0);
}
gd.t.min = gd.t.min.xxxxlh();
gd.t.max = gd.t.max.xxxxlh();
gd.t.mask = gd.t.mask.xxzz();
gd.t.invmask = ~gd.t.mask;
}
p.sel.fge = PRIM->FGE;
if(PRIM->FGE)
{
gd.sel.fge = 1;
gd.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff);
gd.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff);
}
if(context->FRAME.PSM != PSM_PSMCT24)
{
p.sel.date = context->TEST.DATE;
p.sel.datm = context->TEST.DATM;
gd.sel.date = context->TEST.DATE;
gd.sel.datm = context->TEST.DATM;
}
if(!IsOpaque())
{
p.sel.abe = PRIM->ABE;
p.sel.ababcd = context->ALPHA.u32[0];
gd.sel.abe = PRIM->ABE;
gd.sel.ababcd = context->ALPHA.u32[0];
if(env.PABE.PABE)
{
p.sel.pabe = 1;
gd.sel.pabe = 1;
}
if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
{
p.sel.aa1 = 1;
gd.sel.aa1 = 1;
}
gd.afix = GSVector4i((int)context->ALPHA.FIX << 7).xxzzlh();
}
if(p.sel.date
|| p.sel.aba == 1 || p.sel.abb == 1 || p.sel.abc == 1 || p.sel.abd == 1
|| p.sel.atst != ATST_ALWAYS && p.sel.afail == AFAIL_RGB_ONLY
|| p.sel.fpsm == 0 && p.fm != 0 && p.fm != 0xffffffff
|| p.sel.fpsm == 1 && (p.fm & 0x00ffffff) != 0 && (p.fm & 0x00ffffff) != 0x00ffffff
|| p.sel.fpsm == 2 && (p.fm & 0x80f8f8f8) != 0 && (p.fm & 0x80f8f8f8) != 0x80f8f8f8)
if(gd.sel.date
|| gd.sel.aba == 1 || gd.sel.abb == 1 || gd.sel.abc == 1 || gd.sel.abd == 1
|| gd.sel.atst != ATST_ALWAYS && gd.sel.afail == AFAIL_RGB_ONLY
|| gd.sel.fpsm == 0 && fm != 0 && fm != 0xffffffff
|| gd.sel.fpsm == 1 && (fm & 0x00ffffff) != 0 && (fm & 0x00ffffff) != 0x00ffffff
|| gd.sel.fpsm == 2 && (fm & 0x80f8f8f8) != 0 && (fm & 0x80f8f8f8) != 0x80f8f8f8)
{
p.sel.rfb = 1;
gd.sel.rfb = 1;
}
p.sel.colclamp = env.COLCLAMP.CLAMP;
p.sel.fba = context->FBA.FBA;
p.sel.dthe = env.DTHE.DTHE;
gd.sel.colclamp = env.COLCLAMP.CLAMP;
gd.sel.fba = context->FBA.FBA;
gd.sel.dthe = env.DTHE.DTHE;
}
bool zwrite = p.zm != 0xffffffff;
bool zwrite = zm != 0xffffffff;
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
p.sel.zwrite = zwrite;
p.sel.ztest = ztest;
gd.sel.zwrite = zwrite;
gd.sel.ztest = ztest;
if(zwrite || ztest)
{
p.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
p.sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
p.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
gd.sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
}
gd.fm = GSVector4i(fm);
gd.zm = GSVector4i(zm);
if(gd.sel.fpsm == 1)
{
gd.fm |= GSVector4i::xff000000();
}
else if(gd.sel.fpsm == 2)
{
GSVector4i rb = gd.fm & 0x00f800f8;
GSVector4i ga = gd.fm & 0x8000f800;
gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | GSVector4i::xffff0000();
}
if(gd.sel.zpsm == 1)
{
gd.zm |= GSVector4i::xff000000();
}
else if(gd.sel.zpsm == 2)
{
gd.zm |= GSVector4i::xffff0000();
}
}

View File

@ -41,10 +41,10 @@ protected:
void Draw();
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass);
void GetScanlineGlobalData(GSScanlineGlobalData& gd);
public:
GSRendererSW();
GSRendererSW(int threads);
virtual ~GSRendererSW();
template<uint32 prim, uint32 tme, uint32 fst>

View File

@ -85,10 +85,10 @@ union GSScanlineSelector
uint64 key;
operator uint32() {return lo;}
operator uint64() {return key;}
operator uint32() const {return lo;}
operator uint64() const {return key;}
bool IsSolidRect()
bool IsSolidRect() const
{
return sprite
&& iip == 0
@ -101,45 +101,44 @@ union GSScanlineSelector
}
};
__aligned32 struct GSScanlineParam
__aligned32 struct GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer
{
GSScanlineSelector sel;
// - the data of vm, tex, clut, dimx may change, multi-threaded drawing must be finished before that happens (an idea: remember which pages are used, sync when something needs to read or write them)
// - tex is a cached texture, it may be recycled to free up memory, its absolute address cannot be compiled into code
// - row and column pointers are allocated once and are never changed or freed, so their addresses can be used directly
// - if in the future drawing does not have to be synchronized per batch, the rest of GSRasterizerData should be copied here, too (scissor, prim type, vertices)
void* vm;
const void* tex;
const uint32* clut;
const GSVector4i* dimx;
GSOffset* fbo;
GSOffset* zbo;
GSPixelOffset4* fzbo;
uint32 fm, zm;
};
__aligned32 struct GSScanlineEnvironment
{
void* vm;
const void* tex;
const uint32* clut;
int* fbr;
int* zbr;
int* fbc;
int* zbc;
GSVector2i* fzbr;
GSVector2i* fzbc;
GSVector4i* dimx;
const int* fbr;
const int* zbr;
const int* fbc;
const int* zbc;
const GSVector2i* fzbr;
const GSVector2i* fzbc;
GSVector4i fm, zm;
struct {GSVector4i min, max, mask, invmask;} t; // [u] x 4 [v] x 4
GSVector4i aref;
GSVector4i afix;
GSVector4i frb, fga;
};
__aligned32 struct GSScanlineLocalData // per prim variables, each thread has its own
{
const GSScanlineGlobalData* gd;
struct {GSVector4 z, s, t, q; GSVector4i rb, ga, f, si, ti, _pad[7];} d[4];
struct {GSVector4 z, stq; GSVector4i c, f, st;} d4;
struct {GSVector4i rb, ga;} c;
struct {GSVector4i z, f;} p;
struct {GSVector4i z, f, s, t, q, rb, ga, zs, zd, uf, vf, cov;} temp;
// these should be stored on the stack as normal local variables (no free regs to use, esp cannot be saved anywhere, and we need an aligned stack)
struct {GSVector4i z, f, s, t, q, rb, ga, zs, zd, uf, vf, cov;} temp;
};

View File

@ -28,8 +28,12 @@ using namespace Xbyak;
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_env(*(GSScanlineEnvironment*)param)
, m_local(*(GSScanlineLocalData*)param)
{
#if _M_AMD64
#error TODO
#endif
m_sel.key = key;
m_en.z = m_sel.zb ? 1 : 0;
@ -37,10 +41,6 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
#if _M_AMD64
#error TODO
#endif
Generate();
}
@ -91,23 +91,23 @@ void GSSetupPrimCodeGenerator::Depth()
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d4.f], xmm2);
vmovdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d[i].f], xmm2);
vmovdqa(ptr[&m_local.d[i].f], xmm2);
}
}
@ -117,17 +117,17 @@ void GSSetupPrimCodeGenerator::Depth()
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d4.z = dz * 4.0f;
// m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_env.d4.z], xmm1);
vmovdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].z = dz * m_shift[i];
// m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[&m_env.d[i].z], xmm1);
vmovdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
@ -139,12 +139,12 @@ void GSSetupPrimCodeGenerator::Depth()
if(m_en.f)
{
// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.p.f], xmm1);
vmovdqa(ptr[&m_local.p.f], xmm1);
}
if(m_en.z)
@ -155,7 +155,7 @@ void GSSetupPrimCodeGenerator::Depth()
if(m_sel.zoverflow)
{
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
@ -173,12 +173,12 @@ void GSSetupPrimCodeGenerator::Depth()
}
else
{
// m_env.p.z = GSVector4i(z);
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[&m_env.p.z], xmm0);
vmovdqa(ptr[&m_local.p.z], xmm0);
}
}
}
@ -197,25 +197,25 @@ void GSSetupPrimCodeGenerator::Depth()
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_env.d4.f], xmm2);
movdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_env.d[i].f], xmm2);
movdqa(ptr[&m_local.d[i].f], xmm2);
}
}
@ -225,19 +225,19 @@ void GSSetupPrimCodeGenerator::Depth()
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d4.z = dz * 4.0f;
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[&m_env.d4.z], xmm1);
movdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].z = dz * m_shift[i];
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[&m_env.d[i].z], xmm1);
movdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
@ -249,12 +249,12 @@ void GSSetupPrimCodeGenerator::Depth()
if(m_en.f)
{
// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_env.p.f], xmm1);
movdqa(ptr[&m_local.p.f], xmm1);
}
if(m_en.z)
@ -265,7 +265,7 @@ void GSSetupPrimCodeGenerator::Depth()
if(m_sel.zoverflow)
{
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
@ -284,12 +284,12 @@ void GSSetupPrimCodeGenerator::Depth()
}
else
{
// m_env.p.z = GSVector4i(z);
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[&m_env.p.z], xmm0);
movdqa(ptr[&m_local.p.z], xmm0);
}
}
}
@ -312,16 +312,16 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst)
{
// m_env.d4.st = GSVector4i(t * 4.0f);
// m_local.d4.st = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.st], xmm1);
vmovdqa(ptr[&m_local.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
// m_local.d4.stq = t * 4.0f;
vmovaps(ptr[&m_env.d4.stq], xmm1);
vmovaps(ptr[&m_local.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -340,25 +340,25 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst)
{
// m_env.d[i].si/ti = GSVector4i(v);
// m_local.d[i].si/ti = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_env.d[i].ti], xmm2); break;
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
}
}
else
{
// m_env.d[i].s/t/q = v;
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_env.d[i].q], xmm2); break;
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
}
}
}
@ -375,16 +375,16 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst)
{
// m_env.d4.st = GSVector4i(t * 4.0f);
// m_local.d4.st = GSVector4i(t * 4.0f);
cvttps2dq(xmm1, xmm1);
movdqa(ptr[&m_env.d4.st], xmm1);
movdqa(ptr[&m_local.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
// m_local.d4.stq = t * 4.0f;
movaps(ptr[&m_env.d4.stq], xmm1);
movaps(ptr[&m_local.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -405,25 +405,25 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst)
{
// m_env.d[i].si/ti = GSVector4i(v);
// m_local.d[i].si/ti = GSVector4i(v);
cvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: movdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_env.d[i].ti], xmm2); break;
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
}
}
else
{
// m_env.d[i].s/t/q = v;
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: movaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: movaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: movaps(ptr[&m_env.d[i].q], xmm2); break;
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
}
}
}
@ -446,13 +446,13 @@ void GSSetupPrimCodeGenerator::Color()
vmovaps(xmm0, ptr[edx]);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.c], xmm1);
vmovdqa(ptr[&m_local.d4.c], xmm1);
// xmm3 is not needed anymore
@ -476,10 +476,10 @@ void GSSetupPrimCodeGenerator::Color()
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].rb = r.upl16(b);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].rb], xmm0);
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
}
// GSVector4 c = dscan.c;
@ -506,10 +506,10 @@ void GSSetupPrimCodeGenerator::Color()
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].ga = g.upl16(a);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].ga], xmm0);
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
}
}
else
@ -530,14 +530,14 @@ void GSSetupPrimCodeGenerator::Color()
vpsrlw(xmm0, 7);
}
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.c.rb], xmm1);
vmovdqa(ptr[&m_env.c.ga], xmm2);
vmovdqa(ptr[&m_local.c.rb], xmm1);
vmovdqa(ptr[&m_local.c.ga], xmm2);
}
}
else
@ -549,14 +549,14 @@ void GSSetupPrimCodeGenerator::Color()
movaps(xmm0, ptr[edx]);
movaps(xmm1, xmm0);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[&m_env.d4.c], xmm2);
movdqa(ptr[&m_local.d4.c], xmm2);
// xmm3 is not needed anymore
@ -582,10 +582,10 @@ void GSSetupPrimCodeGenerator::Color()
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_env.d[i].rb = r.upl16(b);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_env.d[i].rb], xmm2);
movdqa(ptr[&m_local.d[i].rb], xmm2);
}
// GSVector4 c = dscan.c;
@ -615,10 +615,10 @@ void GSSetupPrimCodeGenerator::Color()
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_env.d[i].ga = g.upl16(a);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_env.d[i].ga], xmm2);
movdqa(ptr[&m_local.d[i].ga], xmm2);
}
}
else
@ -640,14 +640,14 @@ void GSSetupPrimCodeGenerator::Color()
psrlw(xmm0, 7);
}
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_env.c.rb], xmm1);
movdqa(ptr[&m_env.c.ga], xmm2);
movdqa(ptr[&m_local.c.rb], xmm1);
movdqa(ptr[&m_local.c.ga], xmm2);
}
}
}

View File

@ -30,8 +30,8 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
static const GSVector4 m_shift[5];
GSScanlineEnvironment& m_env;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
struct {uint32 z:1, f:1, t:1, c:1;} m_en;

View File

@ -126,6 +126,12 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
using namespace Xbyak;
// Shared argument offsets for the code generators in this file: a base of 0
// followed by four slots spaced 4 apart.
// NOTE(review): presumably byte offsets from esp to the 32-bit stack arguments
// (v, count, min, max) of the generated function — confirm against the
// ptr[esp + ...] uses in the generators.
static const int _args = 0;
static const int _v = _args + 1 * 4;
static const int _count = _args + 2 * 4;
static const int _min = _args + 3 * 4;
static const int _max = _args + 4 * 4;
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
@ -133,8 +139,6 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
#error TODO
#endif
const int params = 0;
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
@ -157,11 +161,6 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
break;
}
const int _v = params + 4;
const int _count = params + 8;
const int _min = params + 12;
const int _max = params + 16;
//
if(m_cpu.has(util::Cpu::tAVX))
@ -410,8 +409,6 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
#error TODO
#endif
const int params = 0;
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
@ -436,11 +433,6 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
break;
}
const int _v = params + 4;
const int _count = params + 8;
const int _min = params + 12;
const int _max = params + 16;
//
if(m_cpu.has(util::Cpu::tAVX))
@ -748,8 +740,6 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
#error TODO
#endif
const int params = 0;
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
@ -772,11 +762,6 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
break;
}
const int _v = params + 4;
const int _count = params + 8;
const int _min = params + 12;
const int _max = params + 16;
//
if(m_cpu.has(util::Cpu::tAVX))