GS:SW: Hold double z in registers more

This commit is contained in:
TellowKrinkle 2022-05-16 19:01:51 -05:00 committed by tellowkrinkle
parent 4ddf897719
commit 8ba745030a
3 changed files with 27 additions and 53 deletions

View File

@ -312,7 +312,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b;
#endif
VectorI test;
VectorF zo0, zo1;
VectorF z0, z1;
VectorI f;
VectorF s, t, q;
VectorI uf, vf;
@ -358,8 +358,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.zb)
{
zo0 = local.d[skip].z0;
zo1 = local.d[skip].z1;
VectorF zbase = VectorF::broadcast64(&scan.p.z);
z0 = zbase.add64(local.d[skip].z0);
z1 = zbase.add64(local.d[skip].z1);
}
}
@ -450,11 +451,6 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.prim != GS_SPRITE_CLASS)
{
// Need to handle when the float converts incorrectly
VectorF zbase = VectorF::broadcast64(&scan.p.z);
VectorF z0 = zbase.add64(zo0);
VectorF z1 = zbase.add64(zo1);
if (sel.zoverflow)
{
// SSE only has double to int32 conversion, no double to uint32
@ -1505,8 +1501,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
#else
GSVector4 add = local.d4.z;
#endif
zo0 = zo0.add64(add);
zo1 = zo1.add64(add);
z0 = z0.add64(add);
z1 = z1.add64(add);
}
if (sel.fwrite && sel.fge)

View File

@ -100,7 +100,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
, _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
{
m_sel.key = key;
use_lod = m_sel.mmin;
@ -394,7 +394,7 @@ L("loop");
// xym4 = q (tme) | free
// xym5 = rb (!tme)
// xym6 = ga (!tme)
// xym7 = test | free
// xym7 = test | z0
// xym15 = | test
bool tme = m_sel.tfx != TFX_NONE;
@ -723,20 +723,19 @@ void GSDrawScanlineCodeGenerator2::Init()
if (m_sel.zb)
{
// z = vp.zzzz() + m_local.d[skip].z;
broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
broadcastsd(_z, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
if (hasAVX)
{
vaddpd(xym0, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
vaddpd(xym1, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
vaddpd(xym7, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
vaddpd(_z, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
}
else
{
movaps(xym0, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
addpd(xym0, xym1);
addpd(xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
movaps(xym7, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
addpd(xym7, _z);
addpd(_z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
}
movaps(_rip_local(temp.z0), xym0);
movaps(_rip_local(temp.z1), xym1);
movaps(_rip_local(temp.z0), xym7);
}
}
}
@ -798,7 +797,7 @@ void GSDrawScanlineCodeGenerator2::Init()
}
else if (m_sel.ltf)
{
XYm vf = xym7;
XYm vf = xym5;
pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12);
@ -918,20 +917,10 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.zb)
{
broadcastsd(xym1, _rip_local_d_p(z));
if (hasAVX)
{
vaddpd(xym0, xym1, _rip_local(temp.z0));
vaddpd(xym1, xym1, _rip_local(temp.z1));
}
else
{
movaps(xym0, _rip_local(temp.z0));
addpd(xym0, xym1);
addpd(xym1, _rip_local(temp.z1));
}
movaps(_rip_local(temp.z0), xym0);
movaps(_rip_local(temp.z1), xym1);
broadcastsd(xym7, _rip_local_d_p(z));
addpd(_z, xym7);
addpd(xym7, _rip_local(temp.z0));
movaps(_rip_local(temp.z0), xym7);
}
// f = f.add16(m_local.d4.f);
@ -1052,7 +1041,7 @@ void GSDrawScanlineCodeGenerator2::Step()
}
}
/// Inputs: xym0[x86]=z, t1=fza_base, t0=fza_offset, _test
/// Inputs: xym0[x86]=z, xym7[x64]=z0, t1=fza_base, t0=fza_offset, _test
/// Outputs: t2=za
/// Destroys: rax, xym0, temp1, temp2 (and xym7 on the path that adds m_imin to it in place before conversion)
void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
@ -1082,18 +1071,9 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
auto m_imin = loadAddress(rax, &GSVector4::m_xc1e00000000fffff);
broadcastsd(temp1, ptr[m_imin]);
if (hasAVX)
{
vaddpd(xym0, temp1, _rip_local(temp.z0));
vaddpd(temp1, temp1, _rip_local(temp.z1));
}
else
{
movaps(xym0, _rip_local(temp.z0));
addpd(xym0, temp1);
addpd(temp1, _rip_local(temp.z1));
}
cvtpd2dq(xmm0, xym0);
addpd(xym7, temp1);
addpd(temp1, _z);
cvtpd2dq(xmm0, xym7);
cvtpd2dq(Xmm(temp1.getIdx()), temp1);
#if USING_YMM
@ -1110,13 +1090,11 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
{
// zs = GSVector8i(z0.f64toi32(), z1.f64toi32());
cvttpd2dq(xmm0, xym7);
cvttpd2dq(Xmm(temp1.getIdx()), _z);
#if USING_YMM
cvttpd2dq(xmm0, _rip_local_(yword, temp.z0));
cvttpd2dq(Xmm(temp1.getIdx()), _rip_local_(yword, temp.z1));
vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1);
#else
cvttpd2dq(xmm0, _rip_local_(xword, temp.z0));
cvttpd2dq(temp1, _rip_local_(xword, temp.z1));
punpcklqdq(xym0, temp1);
#endif
}

View File

@ -82,7 +82,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
/// Available on both x86 and x64, not always valid
const XYm _rb, _ga, _fm, _zm, _fd, _test;
/// Always valid if needed, x64 only
const XYm _f, _s, _t, _q, _f_rb, _f_ga;
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
/// Returns the first arg on 32-bit, second on 64-bit
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)