GS:SW: Merge SSE and AVX implementations of CSetupPrim

This commit is contained in:
TellowKrinkle 2022-03-19 02:02:48 -05:00 committed by tellowkrinkle
parent af75ac00e2
commit 586486b81c
1 changed files with 63 additions and 179 deletions

View File

@ -109,6 +109,16 @@ void GSDrawScanline::EndDraw(u64 frame, u64 ticks, int actual, int total, int pr
m_ds_map.UpdateStats(frame, ticks, actual, total, prims);
}
#if _M_SSE >= 0x501
typedef GSVector8i VectorI;
typedef GSVector8 VectorF;
#define LOCAL_STEP local.d8
#else
typedef GSVector4i VectorI;
typedef GSVector4 VectorF;
#define LOCAL_STEP local.d4
#endif
void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local, const GSScanlineGlobalData& global)
{
GSScanlineSelector sel = global.sel;
@ -118,25 +128,38 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
bool has_t = sel.fb && sel.tfx != TFX_NONE;
bool has_c = sel.fb && !(sel.tfx == TFX_DECAL && sel.tcc);
#if _M_SSE >= 0x501
constexpr int vlen = sizeof(VectorF) / sizeof(float);
#if _M_SSE >= 0x501
const GSVector8* shift = (GSVector8*)g_const->m_shift_256b;
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
#else
const GSVector4* shift = (GSVector4*)g_const->m_shift_128b;
const GSVector4 step_shift = shift[0];
#endif
if (has_z || has_f)
{
if (sel.prim != GS_SPRITE_CLASS)
{
GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
#if _M_SSE >= 0x501
GSVector4 dp8 = dscan.p * step_shift;
#endif
if (has_f)
{
#if _M_SSE >= 0x501
local.d8.p.f = GSVector4i(dp8).extract32<3>();
GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
#else
GSVector4 df = dscan.p.wwww();
for (int i = 0; i < 8; i++)
local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
#endif
for (int i = 0; i < vlen; i++)
{
local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh();
local.d[i].f = VectorI(df * shift[1 + i]).xxzzlh();
}
}
@ -148,11 +171,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
}
{
#if _M_SSE >= 0x501
local.d8.p.z = dp8.extract32<2>();
const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
#else
const GSVector4 dz = dscan.p.zzzz();
for (int i = 0; i < 8; i++)
local.d4.z = dz * shift[0];
#endif
for (int i = 0; i < vlen; i++)
{
local.d[i].z = dz * shift[1 + i];
}
@ -163,7 +191,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
{
if (has_f)
{
#if _M_SSE >= 0x501
local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
#else
local.p.f = GSVector4i(vertex[index[1]].p).zzzzh().zzzz();
#endif
}
if (has_z)
@ -175,22 +207,22 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
if (has_t)
{
GSVector4 dt8 = dscan.t * GSVector4::broadcast32(&shift[0]);
GSVector4 tstep = dscan.t * step_shift;
if (sel.fst)
{
local.d8.stq = GSVector4::cast(GSVector4i(dt8));
LOCAL_STEP.stq = GSVector4::cast(GSVector4i(tstep));
}
else
{
local.d8.stq = dt8;
LOCAL_STEP.stq = tstep;
}
GSVector8 dt(dscan.t);
VectorF dt(dscan.t);
for (int j = 0, k = sel.fst ? 2 : 3; j < k; j++)
{
GSVector8 dstq;
VectorF dstq;
switch (j)
{
@ -199,16 +231,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
case 2: dstq = dt.zzzz(); break;
}
for (int i = 0; i < 8; i++)
for (int i = 0; i < vlen; i++)
{
GSVector8 v = dstq * shift[1 + i];
VectorF v = dstq * shift[1 + i];
if (sel.fst)
{
switch (j)
{
case 0: local.d[i].s = GSVector8::cast(GSVector8i(v)); break;
case 1: local.d[i].t = GSVector8::cast(GSVector8i(v)); break;
case 0: local.d[i].s = VectorF::cast(VectorI(v)); break;
case 1: local.d[i].t = VectorF::cast(VectorI(v)); break;
}
}
else
@ -228,177 +260,31 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
{
if (sel.iip)
{
GSVector4 dc8 = dscan.c * GSVector4::broadcast32(&shift[0]);
GSVector4i::storel(&local.d8.c, GSVector4i(dc8).xzyw().ps32());
GSVector8 dc(dscan.c);
GSVector8 dr = dc.xxxx();
GSVector8 db = dc.zzzz();
for (int i = 0; i < 8; i++)
{
GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
GSVector8i b = GSVector8i(db * shift[1 + i]).ps32();
local.d[i].rb = r.upl16(b);
}
GSVector8 dg = dc.yyyy();
GSVector8 da = dc.wwww();
for (int i = 0; i < 8; i++)
{
GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
local.d[i].ga = g.upl16(a);
}
}
else
{
int last = 0;
switch (sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
c = c.upl16(c.zwxy());
if (sel.tfx == TFX_NONE)
c = c.srl16(7);
local.c.rb = c.xxxx();
local.c.ga = c.zzzz();
}
}
#if _M_SSE >= 0x501
GSVector4i::storel(&local.d8.c, GSVector4i(dscan.c * step_shift).xzyw().ps32());
#else
local.d4.c = GSVector4i(dscan.c * step_shift).xzyw().ps32();
#endif
VectorF dc(dscan.c);
const GSVector4* shift = (GSVector4*)g_const->m_shift_128b;
VectorF dr = dc.xxxx();
VectorF db = dc.zzzz();
if (has_z || has_f)
{
if (sel.prim != GS_SPRITE_CLASS)
{
if (has_f)
for (int i = 0; i < vlen; i++)
{
GSVector4 df = dscan.p.wwww();
local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
for (int i = 0; i < 4; i++)
{
local.d[i].f = GSVector4i(df * shift[1 + i]).xxzzlh();
}
}
if (has_z)
{
GSVector4 dz = dscan.p.zzzz();
local.d4.z = dz * shift[0];
for (int i = 0; i < 4; i++)
{
local.d[i].z = dz * shift[1 + i];
}
}
}
else
{
if (has_f)
{
local.p.f = GSVector4i(vertex[index[1]].p).zzzzh().zzzz();
}
if (has_z)
{
local.p.z = vertex[index[1]].t.U32[3]; // u32 z is bypassed in t.w
}
}
}
if (has_t)
{
GSVector4 t = dscan.t;
if (sel.fst)
{
local.d4.stq = GSVector4::cast(GSVector4i(t * shift[0]));
}
else
{
local.d4.stq = t * shift[0];
}
for (int j = 0, k = sel.fst ? 2 : 3; j < k; j++)
{
GSVector4 dstq;
switch (j)
{
case 0: dstq = t.xxxx(); break;
case 1: dstq = t.yyyy(); break;
case 2: dstq = t.zzzz(); break;
}
for (int i = 0; i < 4; i++)
{
GSVector4 v = dstq * shift[1 + i];
if (sel.fst)
{
switch (j)
{
case 0: local.d[i].s = GSVector4::cast(GSVector4i(v)); break;
case 1: local.d[i].t = GSVector4::cast(GSVector4i(v)); break;
}
}
else
{
switch (j)
{
case 0: local.d[i].s = v; break;
case 1: local.d[i].t = v; break;
case 2: local.d[i].q = v; break;
}
}
}
}
}
if (has_c)
{
if (sel.iip)
{
local.d4.c = GSVector4i(dscan.c * shift[0]).xzyw().ps32();
GSVector4 dr = dscan.c.xxxx();
GSVector4 db = dscan.c.zzzz();
for (int i = 0; i < 4; i++)
{
GSVector4i r = GSVector4i(dr * shift[1 + i]).ps32();
GSVector4i b = GSVector4i(db * shift[1 + i]).ps32();
VectorI r = VectorI(dr * shift[1 + i]).ps32();
VectorI b = VectorI(db * shift[1 + i]).ps32();
local.d[i].rb = r.upl16(b);
}
GSVector4 dg = dscan.c.yyyy();
GSVector4 da = dscan.c.wwww();
VectorF dg = dc.yyyy();
VectorF da = dc.wwww();
for (int i = 0; i < 4; i++)
for (int i = 0; i < vlen; i++)
{
GSVector4i g = GSVector4i(dg * shift[1 + i]).ps32();
GSVector4i a = GSVector4i(da * shift[1 + i]).ps32();
VectorI g = VectorI(dg * shift[1 + i]).ps32();
VectorI a = VectorI(da * shift[1 + i]).ps32();
local.d[i].ga = g.upl16(a);
}
@ -415,7 +301,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
case GS_SPRITE_CLASS: last = 1; break;
}
GSVector4i c = GSVector4i(vertex[index[last]].c);
VectorI c = VectorI(VectorF(vertex[index[last]].c));
c = c.upl16(c.zwxy());
@ -426,8 +312,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
local.c.ga = c.zzzz();
}
}
#endif
}
void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global)