From 4ddf897719f940cd1e1a8dcbb8cfa19e1c75513a Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Wed, 11 May 2022 13:31:30 -0500 Subject: [PATCH] GS:SW: Use doubles for Z calculation --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 52 ++++++--- .../SW/GSDrawScanlineCodeGenerator.all.cpp | 109 ++++++++++++------ .../SW/GSDrawScanlineCodeGenerator.all.h | 3 +- pcsx2/GS/Renderers/SW/GSRasterizer.cpp | 65 ++++------- pcsx2/GS/Renderers/SW/GSRendererSW.cpp | 4 +- pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 12 +- .../SW/GSSetupPrimCodeGenerator.all.cpp | 56 +++++---- pcsx2/GS/Renderers/SW/GSVertexSW.h | 39 ++++++- 8 files changed, 206 insertions(+), 134 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 09465c6983..cb8bc911fd 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -131,9 +131,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons #if _M_SSE >= 0x501 const GSVector8* shift = (GSVector8*)g_const->m_shift_256b; + const GSVector4* half_shift = reinterpret_cast(shift); const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]); #else const GSVector4* shift = (GSVector4*)g_const->m_shift_128b; + const u64* half_shift = reinterpret_cast(shift); const GSVector4 step_shift = shift[0]; #endif @@ -163,18 +165,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons if (has_z) { + const VectorF dz = VectorF::broadcast64(&dscan.p.z); #if _M_SSE >= 0x501 - const VectorF dz = VectorF::broadcast32(&dscan.p.z); - - local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>(); + GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift))); #else - const GSVector4 dz = dscan.p.zzzz(); - - local.d4.z = dz * step_shift; + local.d4.z = dz.mul64(GSVector4::f32to64(shift)); #endif for (int i = 0; i < vlen; i++) { - local.d[i].z = dz * shift[1 + i]; + local.d[i].z0 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 2])); + local.d[i].z1 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 3])); } } } @@ -312,7 +312,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b; #endif VectorI test; - VectorF zo; + VectorF zo0, zo1; VectorI f; VectorF s, t, q; VectorI uf, vf; @@ -358,7 +358,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.zb) { - zo = local.d[skip].z; + zo0 = local.d[skip].z0; + zo1 = local.d[skip].z1; } } @@ -450,19 +451,32 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.prim != GS_SPRITE_CLASS) { // Need to handle when the float converts incorrectly -#if _M_SSE >= 0x501 - GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; -#else - GSVector4 z = scan.p.zzzz() + zo; -#endif + VectorF zbase = VectorF::broadcast64(&scan.p.z); + VectorF z0 = zbase.add64(zo0); + VectorF z1 = zbase.add64(zo1); if (sel.zoverflow) { - zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001()); + // SSE only has double to int32 conversion, no double to uint32 + // Work around this by subtracting 0x80000000 before converting, then adding it back after + // Since we've subtracted 0x80000000, truncating now rounds up for numbers less than 0x80000000 + // So approximate the truncation by subtracting an extra (0.5 - ulp) and rounding instead + GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(false); + GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(false); +#if _M_SSE >= 0x501 + zs = GSVector8i(zl, zh); +#else + zs = zl.upl64(zh); +#endif + zs += VectorI::x80000000(); } else { - zs = VectorI(z); +#if _M_SSE >= 0x501 + zs = GSVector8i(z0.f64toi32(), z1.f64toi32()); +#else + zs = z0.f64toi32().upl64(z1.f64toi32()); +#endif } if (sel.zclamp) @@ -1487,10 +1501,12 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.zb) { #if _M_SSE >= 0x501 - zo += GSVector8::broadcast32(&local.d8.p.z); + GSVector8 add = GSVector8::broadcast64(&local.d8.p.z); #else - zo += local.d4.z; + GSVector4 add = local.d4.z; #endif + zo0 = zo0.add64(add); + zo1 = zo1.add64(add); } if (sel.fwrite && sel.fge) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index c50bdfd3c9..8809573883 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -30,7 +30,8 @@ using namespace Xbyak; // If use_lod, m_local.gd->tex, else m_local.gd->tex[0] #define _64_m_local__gd__tex r14 -#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) +#define _rip_local_(ptrtype, field) ((m_rip) ? ptrtype[rip + (char*)&m_local.field] : ptrtype[_m_local + OFFSETOF(GSScanlineLocalData, field)]) +#define _rip_local(field) _rip_local_(ptr, field) #define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) /// On AVX, does a v-prefixed separate destination operation @@ -99,7 +100,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15) - , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) + , _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) { m_sel.key = key; use_lod = m_sel.mmin; @@ -169,6 +170,15 @@ void GSDrawScanlineCodeGenerator2::pbroadcastwLocal(const XYm& reg, const Addres #endif } +void GSDrawScanlineCodeGenerator2::broadcastsd(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vbroadcastsd(reg, mem); +#else + movddup(reg, mem); +#endif +} + void GSDrawScanlineCodeGenerator2::broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr) { movd(Xmm(vec.getIdx()), gpr); @@ -691,7 +701,6 @@ void GSDrawScanlineCodeGenerator2::Init() // Free: rax const XYm& f = _f; - const XYm& z = _z; if (m_sel.prim != GS_SPRITE_CLASS) { @@ -714,26 +723,25 @@ void GSDrawScanlineCodeGenerator2::Init() if (m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; + broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z if (hasAVX) { - vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]); + vaddpd(xym0, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); + vaddpd(xym1, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); } else { - movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]); - shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0)); + movaps(xym0, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); + addpd(xym0, xym1); + addpd(xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); } - addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + movaps(_rip_local(temp.z0), xym0); + movaps(_rip_local(temp.z1), xym1); } } } else { - if (m_sel.ztest) - { - pbroadcastdLocal(z, _rip_local(p.z)); - } - if (m_sel.fwrite && m_sel.fge) pbroadcastwLocal(_f, _rip_local(p.f)); } @@ -902,8 +910,7 @@ void GSDrawScanlineCodeGenerator2::Step() add(t0, vecsize / 2); - const XYm& z =_z; - const XYm& f =_f; + const XYm& f = _f; if (m_sel.prim != GS_SPRITE_CLASS) { @@ -911,7 +918,20 @@ void GSDrawScanlineCodeGenerator2::Step() if (m_sel.zb) { - BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z)); + broadcastsd(xym1, _rip_local_d_p(z)); + if (hasAVX) + { + vaddpd(xym0, xym1, _rip_local(temp.z0)); + vaddpd(xym1, xym1, _rip_local(temp.z1)); + } + else + { + movaps(xym0, _rip_local(temp.z0)); + addpd(xym0, xym1); + addpd(xym1, _rip_local(temp.z1)); + } + movaps(_rip_local(temp.z0), xym0); + movaps(_rip_local(temp.z1), xym1); } // f = f.add16(m_local.d4.f); @@ -1042,8 +1062,6 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) return; } - const XYm& z = _z; - // int za = fza_base.y + fza_offset->y; mov(t2.cvt32(), dword[t1 + 4]); @@ -1056,36 +1074,51 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) { if (m_sel.zoverflow) { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - /*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; - z /= 2; - zs = GSVector8i(z, true); - zs = zs.min_u32(GSVector8i::x7fffffff()); - zs = zs.sll32(1) | 1;*/ + // GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(); + // GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(); + // zs = GSVector8i(zl, zh); + // zs += VectorI::x80000000(); - auto m_half = loadAddress(rax, &GSVector4::m_half); + auto m_imin = loadAddress(rax, &GSVector4::m_xc1e00000000fffff); + broadcastsd(temp1, ptr[m_imin]); if (hasAVX) - vbroadcastss(temp1, ptr[m_half]); + { + vaddpd(xym0, temp1, _rip_local(temp.z0)); + vaddpd(temp1, temp1, _rip_local(temp.z1)); + } else - movaps(temp1, ptr[m_half]); + { + movaps(xym0, _rip_local(temp.z0)); + addpd(xym0, temp1); + addpd(temp1, _rip_local(temp.z1)); + } + cvtpd2dq(xmm0, xym0); + cvtpd2dq(Xmm(temp1.getIdx()), temp1); - mulps(temp1, z); - cvttps2dq(temp1, temp1); - pslld(temp1, 1); +#if USING_YMM + vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1); +#else + punpcklqdq(xym0, temp1); +#endif - cvttps2dq(xym0, z); - pcmpeqd(temp2, temp2); - psrld(temp2, 31); - pand(xym0, temp2); - - por(xym0, temp1); + pcmpeqd(temp1, temp1); + pslld(temp1, 31); + paddd(xym0, temp1); } else { - // zs = GSVector4i(z); + // zs = GSVector8i(z0.f64toi32(), z1.f64toi32()); - cvttps2dq(xym0, z); +#if USING_YMM + cvttpd2dq(xmm0, _rip_local_(yword, temp.z0)); + cvttpd2dq(Xmm(temp1.getIdx()), _rip_local_(yword, temp.z1)); + vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1); +#else + cvttpd2dq(xmm0, _rip_local_(xword, temp.z0)); + cvttpd2dq(temp1, _rip_local_(xword, temp.z1)); + punpcklqdq(xym0, temp1); +#endif } // Clamp Z to ZPSM_FMT_MAX @@ -1104,7 +1137,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) } else { - movdqa(xym0, _z); + pbroadcastdLocal(xym0, _rip_local(p.z)); } if (m_sel.ztest) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h index f6bf1fce57..3614d726ae 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h @@ -82,7 +82,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator /// Available on both x86 and x64, not always valid const XYm _rb, _ga, _fm, _zm, _fd, _test; /// Always valid if needed, x64 only - const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga; + const XYm _f, _s, _t, _q, _f_rb, _f_ga; /// Returns the first arg on 32-bit, second on 64-bit static LocalAddr chooseLocal(const void* addr32, AddressReg reg64) @@ -117,6 +117,7 @@ private: /// On YMM registers this will be a broadcast from a 16-bit value /// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem); + void broadcastsd(const XYm& reg, const Xbyak::Address& mem); /// Broadcast a 32-bit GPR to a vector register void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr); void modulate16(const XYm& a, const Xbyak::Operand& f, u8 shift); diff --git a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp index 0197bbfd2a..d9a4471763 100644 --- a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp +++ b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp @@ -463,7 +463,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index) GSVertexSW2 dv1 = v2 - v0; GSVertexSW2 dv2 = v2 - v1; - GSVector4 cross = dv0.p * dv1.p.yxwz(); + GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz(); cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product // the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value @@ -487,18 +487,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index) ddx[2] = ddx[0].xzyw(); // Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1 - GSVector8 _dxy01c(dxy01 / cross); + GSVector8 dxy01c(dxy01 / cross); - /* dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww(); dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx(); - */ - - dscan.p = dv1.p * _dxy01c.yyyy().extract<0>() - dv0.p * _dxy01c.wwww().extract<0>(); - dscan.tc = dv1.tc * _dxy01c.yyyy() - dv0.tc * _dxy01c.wwww(); - - dedge.p = dv0.p * _dxy01c.zzzz().extract<0>() - dv1.p * _dxy01c.xxxx().extract<0>(); - dedge.tc = dv0.tc * _dxy01c.zzzz() - dv1.tc * _dxy01c.xxxx(); if (m1 & 1) { @@ -567,13 +559,12 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c while (top < bottom) { - GSVector8 dy(GSVector4(top) - p0.yyyy()); + const float dy = static_cast(top) - p0.y; + GSVector8 dyv(dy); - GSVertexSW2 scan; + GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dyv.extract<0>(); - scan.p = edge.p + dedge.p * dy.extract<0>(); - - GSVector4 lrf = scan.p.ceil(); + GSVector4 lrf = xy.ceil(); GSVector4 l = lrf.max(scissor); GSVector4 r = lrf.min(scissor); GSVector4i lr = GSVector4i(l.xxyy(r)); @@ -585,12 +576,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c if (pixels > 0) { - scan.tc = edge.tc + dedge.tc * dy; + float prestep = l.x - p0.x; + GSVector8 prestepv(prestep); - GSVector8 prestep((l - p0).xxxx()); - - scan.p = scan.p + dscan.p * prestep.extract<0>(); - scan.tc = scan.tc + dscan.tc * prestep; + GSVertexSW2 scan; + GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestepv.extract<0>()); + scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep; + scan.tc = edge.tc + dedge.tc * dyv + dscan.tc * prestepv; AddScanline(e++, pixels, left, top, (GSVertexSW&)scan); } @@ -652,7 +644,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index) GSVertexSW dv1 = v2 - v0; GSVertexSW dv2 = v2 - v1; - GSVector4 cross = dv0.p * dv1.p.yxwz(); + GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz(); cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product // the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value @@ -678,18 +670,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index) // Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1 GSVector4 dxy01c = dxy01 / cross; - /* dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww(); dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx(); - */ - - dscan.p = dv1.p * dxy01c.yyyy() - dv0.p * dxy01c.wwww(); - dscan.t = dv1.t * dxy01c.yyyy() - dv0.t * dxy01c.wwww(); - dscan.c = dv1.c * dxy01c.yyyy() - dv0.c * dxy01c.wwww(); - - dedge.p = dv0.p * dxy01c.zzzz() - dv1.p * dxy01c.xxxx(); - dedge.t = dv0.t * dxy01c.zzzz() - dv1.t * dxy01c.xxxx(); - dedge.c = dv0.c * dxy01c.zzzz() - dv1.c * dxy01c.xxxx(); if (m1 & 1) { @@ -758,13 +740,11 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co while (top < bottom) { - GSVector4 dy = GSVector4(top) - p0.yyyy(); + const float dy = static_cast(top) - p0.y; - GSVertexSW scan; + GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dy; - scan.p = edge.p + dedge.p * dy; - - GSVector4 lrf = scan.p.ceil(); + GSVector4 lrf = xy.ceil(); GSVector4 l = lrf.max(scissor); GSVector4 r = lrf.min(scissor); GSVector4i lr = GSVector4i(l.xxyy(r)); @@ -776,14 +756,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co if (pixels > 0) { - scan.t = edge.t + dedge.t * dy; - scan.c = edge.c + dedge.c * dy; + const float prestep = l.x - p0.x; - GSVector4 prestep = (l - p0).xxxx(); - - scan.p = scan.p + dscan.p * prestep; - scan.t = scan.t + dscan.t * prestep; - scan.c = scan.c + dscan.c * prestep; + GSVertexSW scan; + GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestep); + scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep; + scan.t = edge.t + dedge.t * dy + dscan.t * prestep; + scan.c = edge.c + dedge.c * dy + dscan.c * prestep; AddScanline(e++, pixels, left, top, scan); } diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp index 84c7325510..957baaec15 100644 --- a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp @@ -250,8 +250,8 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* } else { - float z = static_cast(static_cast(xyzuvf.extract32<1>())); - dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0)); + double z = static_cast(static_cast(xyzuvf.extract32<1>())); + dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4::f64(z, 0.0)); t = t.blend32<8>(GSVector4(xyzuvf << 7)); } diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index 7ca184cb30..763157f39f 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -170,16 +170,16 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it { #if _M_SSE >= 0x501 - struct skip { GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad; } d[8]; - struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u32 z, f; } p; } d8; - struct { GSVector8i rb, ga; } c; + struct skip { GSVector8 z0, z1, s, t, q; GSVector8i rb, ga, f; } d[8]; + struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u64 z; u32 f; } p; } d8; struct { u32 z, f; } p; + struct { GSVector8i rb, ga; } c; // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) struct { - GSVector8 z, zo; + GSVector8 z0, z1; GSVector8i f; GSVector8 s, t, q; GSVector8i rb, ga; @@ -198,7 +198,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it #else - struct skip { GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad; } d[4]; + struct skip { GSVector4 z0, z1, s, t, q; GSVector4i rb, ga, f; } d[4]; struct step { GSVector4 z, stq; GSVector4i c, f; } d4; struct { GSVector4i rb, ga; } c; struct { GSVector4i z, f; } p; @@ -207,7 +207,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it struct { - GSVector4 z, zo; + GSVector4 z0, z1; GSVector4i f; GSVector4 s, t, q; GSVector4i rb, ga; diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index 11985b99d6..0006a1eef1 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -192,21 +192,26 @@ void GSSetupPrimCodeGenerator2::Depth_XMM() if (m_en.z) { - // GSVector4 dz = p.zzzz(); + // VectorF dz = VectorF::broadcast64(&dscan.p.z) + movddup(xmm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); - broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); - - // m_local.d4.z = dz * 4.0f; - - THREEARG(mulps, xmm1, xmm0, xmm3); - movdqa(_rip_local_d_p(z), xmm1); + // m_local.d4.z = dz.mul64(GSVector4::f32to64(shift)); + cvtps2pd(xmm1, xmm3); + mulpd(xmm1, xmm0); + movaps(_rip_local_d_p(z), xmm1); for (int i = 0; i < (m_sel.notest ? 1 : 4); i++) { - // m_local.d[i].z = dz * m_shift[i]; + // m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2])); + // m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3])); - THREEARG(mulps, xmm1, xmm0, XYm(4 + i)); - movdqa(_rip_local(d[i].z), xmm1); + cvtps2pd(xmm1, XYm(4 + i)); + pshufd(xmm2, XYm(4 + i), _MM_SHUFFLE(1, 0, 3, 2)); + cvtps2pd(xmm2, xmm2); + mulpd(xmm1, xmm0); + mulpd(xmm2, xmm0); + movaps(_rip_local(d[i].z0), xmm1); + movaps(_rip_local(d[i].z1), xmm2); } } } @@ -276,23 +281,34 @@ void GSSetupPrimCodeGenerator2::Depth_YMM() if (m_en.z) { - // const VectorF dz = VectorF::broadcast32(&dscan.p.z); - vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); + // const VectorF dz = VectorF::broadcast64(&dscan.p.z); + vbroadcastsd(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); - // local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>(); - vmulss(xmm1, xmm0, xmm3); - movss(_rip_local_d_p(z), xmm1); + // GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift))); + cvtss2sd(xmm1, xmm3); + vmulsd(xmm1, xmm0, xmm1); + movsd(_rip_local_d_p(z), xmm1); for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++) { - // m_local.d[i].z = dz * shift[1 + i]; + // m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2])); + // m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3])); - // Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is communative) if (i < 4 || many_regs) - vmulps(ymm1, Ymm(4 + i), ymm0); + { + cvtps2pd(ymm1, Xmm(4 + i)); + vextracti128(xmm2, Ymm(4 + i), 1); + cvtps2pd(ymm2, xmm2); + } else - vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]); - movaps(_rip_local(d[i].z), ymm1); + { + cvtps2pd(ymm1, ptr[&g_const->m_shift_256b[i + 1][0]]); + cvtps2pd(ymm2, ptr[&g_const->m_shift_256b[i + 1][4]]); + } + mulpd(ymm1, ymm0); + mulpd(ymm2, ymm0); + movaps(_rip_local(d[i].z0), ymm1); + movaps(_rip_local(d[i].z1), ymm2); } } } diff --git a/pcsx2/GS/Renderers/SW/GSVertexSW.h b/pcsx2/GS/Renderers/SW/GSVertexSW.h index 6b3d5a29dd..7358f2e41c 100644 --- a/pcsx2/GS/Renderers/SW/GSVertexSW.h +++ b/pcsx2/GS/Renderers/SW/GSVertexSW.h @@ -19,6 +19,16 @@ struct alignas(32) GSVertexSW { + // When drawing sprites: + // p: x y _ f + // t: s t q z + // c: r g b a + // Otherwise: + // p: x y zl zh + // t: s t q f + // c: r g b a + // cov is placed in x since by the time it's known, xy are no longer needed + GSVector4 p, _pad, t, c; __forceinline GSVertexSW() {} @@ -43,7 +53,8 @@ struct alignas(32) GSVertexSW __forceinline void operator+=(const GSVertexSW& v) { - p += v.p; + GSVector4::storel(&p, GSVector4::loadl(&p) + GSVector4::loadl(&v.p)); + p.F64[1] += v.p.F64[1]; t += v.t; c += v.c; } @@ -52,7 +63,8 @@ struct alignas(32) GSVertexSW { GSVertexSW v; - v.p = a.p + b.p; + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) + GSVector4::loadl(&b.p)); + v.p.F64[1] = a.p.F64[1] + b.p.F64[1]; v.t = a.t + b.t; v.c = a.c + b.c; @@ -63,7 +75,8 @@ struct alignas(32) GSVertexSW { GSVertexSW v; - v.p = a.p - b.p; + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p)); + v.p.F64[1] = a.p.F64[1] - b.p.F64[1]; v.t = a.t - b.t; v.c = a.c - b.c; @@ -74,7 +87,8 @@ struct alignas(32) GSVertexSW { GSVertexSW v; - v.p = a.p * b; + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b); + v.p.F64[1] = a.p.F64[1] * b.F32[0]; v.t = a.t * b; v.c = a.c * b; @@ -85,7 +99,8 @@ struct alignas(32) GSVertexSW { GSVertexSW v; - v.p = a.p / b; + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) / b); + v.p.F64[1] = a.p.F64[1] / b.F32[0]; v.t = a.t / b; v.c = a.c / b; @@ -249,11 +264,23 @@ struct alignas(32) GSVertexSW2 { GSVertexSW2 v; - v.p = a.p - b.p; + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p)); + v.p.F64[1] = a.p.F64[1] - b.p.F64[1]; v.tc = a.tc - b.tc; return v; } + + __forceinline friend GSVertexSW2 operator*(const GSVertexSW2& a, const GSVector8& b) + { + GSVertexSW2 v; + + GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b.extract<0>()); + v.p.F64[1] = a.p.F64[1] * b.F32[0]; + v.tc = a.tc * b; + + return v; + } }; #endif