From 8ba745030a8513bc8a915f88e29da29e36afc53a Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Mon, 16 May 2022 19:01:51 -0500 Subject: [PATCH] GS:SW: Hold double z in registers more --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 16 ++--- .../SW/GSDrawScanlineCodeGenerator.all.cpp | 62 ++++++------------- .../SW/GSDrawScanlineCodeGenerator.all.h | 2 +- 3 files changed, 27 insertions(+), 53 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index cb8bc911fd..232ef6e0b3 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -312,7 +312,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b; #endif VectorI test; - VectorF zo0, zo1; + VectorF z0, z1; VectorI f; VectorF s, t, q; VectorI uf, vf; @@ -358,8 +358,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.zb) { - zo0 = local.d[skip].z0; - zo1 = local.d[skip].z1; + VectorF zbase = VectorF::broadcast64(&scan.p.z); + z0 = zbase.add64(local.d[skip].z0); + z1 = zbase.add64(local.d[skip].z1); } } @@ -450,11 +451,6 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.prim != GS_SPRITE_CLASS) { - // Need to handle when the float converts incorrectly - VectorF zbase = VectorF::broadcast64(&scan.p.z); - VectorF z0 = zbase.add64(zo0); - VectorF z1 = zbase.add64(zo1); - if (sel.zoverflow) { // SSE only has double to int32 conversion, no double to uint32 @@ -1505,8 +1501,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex #else GSVector4 add = local.d4.z; #endif - zo0 = zo0.add64(add); - zo1 = zo1.add64(add); + z0 = z0.add64(add); + z1 = z1.add64(add); } if (sel.fwrite && sel.fge) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index 8809573883..1a836c1dc8 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -100,7 +100,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15) - , _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) + , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) { m_sel.key = key; use_lod = m_sel.mmin; @@ -394,7 +394,7 @@ L("loop"); // xym4 = q (tme) | free // xym5 = rb (!tme) // xym6 = ga (!tme) - // xym7 = test | free + // xym7 = test | z0 // xym15 = | test bool tme = m_sel.tfx != TFX_NONE; @@ -723,20 +723,19 @@ void GSDrawScanlineCodeGenerator2::Init() if (m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; - broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z + broadcastsd(_z, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z if (hasAVX) { - vaddpd(xym0, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); - vaddpd(xym1, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); + vaddpd(xym7, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); + vaddpd(_z, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); } else { - movaps(xym0, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); - addpd(xym0, xym1); - addpd(xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); + movaps(xym7, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); + addpd(xym7, _z); + addpd(_z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); } - movaps(_rip_local(temp.z0), xym0); - movaps(_rip_local(temp.z1), xym1); + movaps(_rip_local(temp.z0), xym7); } } } @@ -798,7 +797,7 @@ void GSDrawScanlineCodeGenerator2::Init() } else if (m_sel.ltf) { - XYm vf = xym7; + XYm vf = xym5; pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(vf, 12); @@ -918,20 +917,10 @@ void GSDrawScanlineCodeGenerator2::Step() if (m_sel.zb) { - broadcastsd(xym1, _rip_local_d_p(z)); - if (hasAVX) - { - vaddpd(xym0, xym1, _rip_local(temp.z0)); - vaddpd(xym1, xym1, _rip_local(temp.z1)); - } - else - { - movaps(xym0, _rip_local(temp.z0)); - addpd(xym0, xym1); - addpd(xym1, _rip_local(temp.z1)); - } - movaps(_rip_local(temp.z0), xym0); - movaps(_rip_local(temp.z1), xym1); + broadcastsd(xym7, _rip_local_d_p(z)); + addpd(_z, xym7); + addpd(xym7, _rip_local(temp.z0)); + movaps(_rip_local(temp.z0), xym7); } // f = f.add16(m_local.d4.f); @@ -1052,7 +1041,7 @@ void GSDrawScanlineCodeGenerator2::Step() } } -/// Inputs: xym0[x86]=z, t1=fza_base, t0=fza_offset, _test +/// Inputs: xym0[x86]=z, xym7[x64]=z0, t1=fza_base, t0=fza_offset, _test /// Outputs: t2=za /// Destroys: rax, xym0, temp1, temp2 void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) @@ -1082,18 +1071,9 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) auto m_imin = loadAddress(rax, &GSVector4::m_xc1e00000000fffff); broadcastsd(temp1, ptr[m_imin]); - if (hasAVX) - { - vaddpd(xym0, temp1, _rip_local(temp.z0)); - vaddpd(temp1, temp1, _rip_local(temp.z1)); - } - else - { - movaps(xym0, _rip_local(temp.z0)); - addpd(xym0, temp1); - addpd(temp1, _rip_local(temp.z1)); - } - cvtpd2dq(xmm0, xym0); + addpd(xym7, temp1); + addpd(temp1, _z); + cvtpd2dq(xmm0, xym7); cvtpd2dq(Xmm(temp1.getIdx()), temp1); #if USING_YMM @@ -1110,13 +1090,11 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) { // zs = GSVector8i(z0.f64toi32(), z1.f64toi32()); + cvttpd2dq(xmm0, xym7); + cvttpd2dq(Xmm(temp1.getIdx()), _z); #if USING_YMM - cvttpd2dq(xmm0, _rip_local_(yword, temp.z0)); - cvttpd2dq(Xmm(temp1.getIdx()), _rip_local_(yword, temp.z1)); vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1); #else - cvttpd2dq(xmm0, _rip_local_(xword, temp.z0)); - cvttpd2dq(temp1, _rip_local_(xword, temp.z1)); punpcklqdq(xym0, temp1); #endif } diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h index 3614d726ae..8054a31b70 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h @@ -82,7 +82,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator /// Available on both x86 and x64, not always valid const XYm _rb, _ga, _fm, _zm, _fd, _test; /// Always valid if needed, x64 only - const XYm _f, _s, _t, _q, _f_rb, _f_ga; + const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga; /// Returns the first arg on 32-bit, second on 64-bit static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)