GS:SW: Rearrange GSVertexSW members to better match planned DoubleZ arrangement

Note: This removes zequal. DoubleZ will fix it, but until then things will break.
This commit is contained in:
TellowKrinkle 2022-05-11 13:12:35 -05:00 committed by tellowkrinkle
parent 56bba522ac
commit 9be7eb67d8
7 changed files with 124 additions and 241 deletions

View File

@ -98,7 +98,6 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.fb = m_global.sel.fb;
sel.zb = m_global.sel.zb;
sel.zoverflow = m_global.sel.zoverflow;
sel.zequal = m_global.sel.zequal;
sel.notest = m_global.sel.notest;
m_sp = m_sp_map[sel];
@ -138,23 +137,22 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
const GSVector4 step_shift = shift[0];
#endif
GSVector4 tstep = dscan.t * step_shift;
if (has_z || has_f)
{
if (sel.prim != GS_SPRITE_CLASS)
{
#if _M_SSE >= 0x501
GSVector4 dp8 = dscan.p * step_shift;
#endif
if (has_f)
{
#if _M_SSE >= 0x501
local.d8.p.f = GSVector4i(dp8).extract32<3>();
local.d8.p.f = GSVector4i(tstep).extract32<3>();
GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
#else
GSVector4 df = dscan.p.wwww();
GSVector4 df = dscan.t.wwww();
local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
local.d4.f = GSVector4i(tstep).zzzzh().wwww();
#endif
for (int i = 0; i < vlen; i++)
@ -165,25 +163,18 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
if (has_z)
{
if (sel.zequal)
{
local.p.z = vertex[index[1]].t.U32[3];
}
{
#if _M_SSE >= 0x501
local.d8.p.z = dp8.extract32<2>();
const VectorF dz = VectorF::broadcast32(&dscan.p.z);
const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
#else
const GSVector4 dz = dscan.p.zzzz();
const GSVector4 dz = dscan.p.zzzz();
local.d4.z = dz * shift[0];
local.d4.z = dz * step_shift;
#endif
for (int i = 0; i < vlen; i++)
{
local.d[i].z = dz * shift[1 + i];
}
for (int i = 0; i < vlen; i++)
{
local.d[i].z = dz * shift[1 + i];
}
}
}
@ -207,8 +198,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
if (has_t)
{
GSVector4 tstep = dscan.t * step_shift;
if (sel.fst)
{
LOCAL_STEP.stq = GSVector4::cast(GSVector4i(tstep));
@ -361,9 +350,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.fwrite && sel.fge)
{
#if _M_SSE >= 0x501
f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(local.d[skip].f);
f = GSVector8i::broadcast16(GSVector4i(scan.t).srl<12>()).add16(local.d[skip].f);
#else
f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f);
f = GSVector4i(scan.t).zzzzh().zzzz().add16(local.d[skip].f);
#endif
}
@ -378,9 +367,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.edge)
{
#if _M_SSE >= 0x501
cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9);
cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
#else
cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9);
cov = GSVector4i::cast(scan.p).xxxxl().xxxx().srl16(9);
#endif
}
@ -467,11 +456,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
GSVector4 z = scan.p.zzzz() + zo;
#endif
if (sel.zequal)
{
zs = local.p.z;
}
else if (sel.zoverflow)
if (sel.zoverflow)
{
zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
}

View File

@ -697,28 +697,33 @@ void GSDrawScanlineCodeGenerator2::Init()
{
if (m_sel.fwrite && m_sel.fge || m_sel.zb)
{
broadcastf128(z, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p
if (m_sel.fwrite && m_sel.fge)
{
// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
// f = GSVector4i(v.t).zzzzh().zzzz().add16(m_local.d[skip].f);
if (isYmm)
vbroadcastss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]);
else
movss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]); // v.t.w
cvttps2dq(f, z);
pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
cvttps2dq(f, f);
punpcklwd(f, f);
pshufd(f, f, _MM_SHUFFLE(0, 0, 0, 0));
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
}
if (m_sel.zb)
{
if (!m_sel.zequal)
// z = vp.zzzz() + m_local.d[skip].z;
if (hasAVX)
{
// z = vp.zzzz() + m_local.d[skip].z;
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
}
else
pbroadcastdLocal(z, _rip_local(p.z));
{
movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
}
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
}
}
}
@ -733,21 +738,22 @@ void GSDrawScanlineCodeGenerator2::Init()
pbroadcastwLocal(_f, _rip_local(p.f));
}
const XYm& vt = xym4;
if (m_sel.fb)
{
if (m_sel.edge || m_sel.tfx != TFX_NONE)
{
broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
}
if (m_sel.edge)
{
// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
// m_local.temp.cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
pshufhw(xym3, vt, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xym3, xym3, _MM_SHUFFLE(3, 3, 3, 3));
if (hasAVX2)
{
vpbroadcastw(xym3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
}
else
{
movd(xmm3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
punpcklwd(xmm3, xmm3);
pshufd(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
}
psrlw(xym3, 9);
movdqa(_rip_local(temp.cov), xym3);
@ -755,6 +761,10 @@ void GSDrawScanlineCodeGenerator2::Init()
if (m_sel.tfx != TFX_NONE)
{
const XYm& vt = xym4;
broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
// a1 = &m_local.d[skip]
const XYm& s = _s;
@ -901,14 +911,7 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.zb)
{
if (m_sel.zequal)
{
pbroadcastdLocal(z, _rip_local(p.z));
}
else
{
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
}
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
}
// f = f.add16(m_local.d4.f);
@ -1051,11 +1054,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
if (m_sel.prim != GS_SPRITE_CLASS)
{
if (m_sel.zequal)
{
movdqa(xym0, _z);
}
else if (m_sel.zoverflow)
if (m_sel.zoverflow)
{
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;

View File

@ -963,7 +963,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
AddScanline(e, 1, xi, top, edge);
e->t.U32[3] = (0x10000 - xf) & 0xffff;
e->p.U32[0] = (0x10000 - xf) & 0xffff;
e++;
}
@ -986,7 +986,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
AddScanline(e, 1, xi, top, edge);
e->t.U32[3] = xf;
e->p.U32[0] = xf;
e++;
}
@ -1053,7 +1053,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
AddScanline(e, 1, left, yi, edge);
e->t.U32[3] = (0x10000 - yf) & 0xffff;
e->p.U32[0] = (0x10000 - yf) & 0xffff;
e++;
}
@ -1076,7 +1076,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
AddScanline(e, 1, left, yi, edge);
e->t.U32[3] = yf;
e->p.U32[0] = yf;
e++;
}

View File

@ -198,74 +198,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
{
// FIXME q_div wasn't added to AVX2 code path.
#if 0 //_M_SSE >= 0x501
// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
GSVector8i o2((GSVector4i)m_context->XYOFFSET);
GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
{
GSVector8i v0 = GSVector8i::load<true>(src[0].m);
GSVector8i v1 = GSVector8i::load<true>(src[1].m);
GSVector8 stcq = GSVector8::cast(v0.ac(v1));
GSVector8i xyzuvf = v0.bd(v1);
//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
GSVector8i xy = xyzuvf.upl16() - o2;
GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * m_pos_scale2;
GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
GSVector8 t = GSVector8::zero();
if(tme)
{
if(fst)
{
t = GSVector8(xyzuvf.uph16() << (16 - 4));
}
else
{
t = stcq.xyww() * tsize2;
}
}
if(primclass == GS_SPRITE_CLASS)
{
t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
}
GSVector8::storel(&dst[0].p, p);
if(tme || primclass == GS_SPRITE_CLASS)
{
GSVector8::store<true>(&dst[0].t, t.ac(c));
}
else
{
GSVector8::storel(&dst[0].c, c);
}
GSVector8::storeh(&dst[1].p, p);
if(tme || primclass == GS_SPRITE_CLASS)
{
GSVector8::store<true>(&dst[1].t, t.bd(c));
}
else
{
GSVector8::storeh(&dst[1].c, c);
}
}
#else
GSVector4i off = (GSVector4i)m_context->XYOFFSET;
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
@ -277,9 +209,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
GSVector4i xyzuvf(src->m[1]);
GSVector4i xy = xyzuvf.upl16() - off;
GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale;
dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
GSVector4 t = GSVector4::zero();
@ -311,11 +241,19 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
}
}
if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z)
if (primclass == GS_SPRITE_CLASS)
{
dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * m_pos_scale;
xyzuvf = xyzuvf.min_u32(z_max);
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
}
else
{
float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
t = t.blend32<8>(GSVector4(xyzuvf << 7));
}
dst->t = t;
@ -325,8 +263,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
#endif
}
#endif
}
void GSRendererSW::Draw()
@ -1352,7 +1288,6 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
gd.sel.zequal = !!m_vt.m_eq.z;
gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
}

View File

@ -64,7 +64,6 @@ union GSScanlineSelector
u32 mmin : 2; // 54
u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
u32 zequal : 1; // 56
u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
};

View File

@ -82,6 +82,19 @@ void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem
#endif
}
/// Emits code that loads one 32-bit float from `mem` and replicates it across `reg`.
/// With AVX a single vbroadcastss is emitted; otherwise the scalar is loaded with
/// movss and lane 0 is copied to every lane with shufps.
void GSSetupPrimCodeGenerator2::broadcastss(const XYm& reg, const Address& mem)
{
	if (!hasAVX)
	{
		// SSE fallback: scalar load into lane 0, then splat lane 0 to all four lanes
		movss(reg, mem);
		shufps(reg, reg, _MM_SHUFFLE(0, 0, 0, 0));
		return;
	}

	vbroadcastss(reg, mem);
}
void GSSetupPrimCodeGenerator2::Generate()
{
// Technically we just need the delta < 2GB
@ -152,16 +165,10 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// GSVector4 df = t.wwww();
broadcastss(xym1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
@ -185,38 +192,21 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_en.z)
{
if (m_sel.zequal)
// GSVector4 dz = p.zzzz();
broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
u32 offset = 0;
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
// m_local.d[i].z = dz * m_shift[i];
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, _64_vertex);
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
movdqa(_rip_local(p.z), xmm0);
}
else
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
}
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
}
}
}
@ -259,68 +249,19 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
vmulps(ymm1, ymm0, ymm3);
if (m_en.z)
{
if (m_sel.zequal)
{
u32 offset = 0;
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, _64_vertex);
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
mov(_rip_local(p.z), t1.cvt32());
}
else
{
// m_local.d8.p.z = dp8.extract32<2>();
extractps(_rip_local_d_p(z), xmm1, 2);
// GSVector8 dz = GSVector8(dscan.p).zzzz();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
}
}
if (m_en.f)
{
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
// GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
vbroadcastss(ymm1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
cvtps2dq(ymm1, ymm1);
pextrd(_rip_local_d_p(f), xmm1, 3);
// local.d8.p.f = GSVector4i(tstep).extract32<3>();
vmulps(xmm0, xmm1, xmm3);
cvtps2dq(xmm0, xmm0);
movd(_rip_local_d_p(f), xmm0);
// GSVector8 df = GSVector8(dscan.p).wwww();
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
}
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
if (m_en.z)
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// m_local.d[i].z = dz * shift[1 + i];
// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is commutative)
if (i < 4 || many_regs)
vmulps(ymm0, Ymm(4 + i), ymm2);
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
movaps(_rip_local(d[i].z), ymm0);
}
if (m_en.f)
{
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
// m_local.d[i].f = GSVectorI(df * m_shift[i]).xxzzlh();
if (i < 4 || many_regs)
vmulps(ymm0, Ymm(4 + i), ymm1);
@ -332,6 +273,28 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
movdqa(_rip_local(d[i].f), ymm0);
}
}
if (m_en.z)
{
// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
vmulss(xmm1, xmm0, xmm3);
movss(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// m_local.d[i].z = dz * shift[1 + i];
// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
if (i < 4 || many_regs)
vmulps(ymm1, Ymm(4 + i), ymm0);
else
vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
movaps(_rip_local(d[i].z), ymm1);
}
}
}
else
{

View File

@ -69,6 +69,8 @@ public:
private:
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a 32-bit float to the whole register, whatever size that register might be
void broadcastss(const XYm& reg, const Xbyak::Address& mem);
void Depth_XMM();
void Depth_YMM();