diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 232ef6e0b3..a0df154aa7 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -131,11 +131,9 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons #if _M_SSE >= 0x501 const GSVector8* shift = (GSVector8*)g_const->m_shift_256b; - const GSVector4* half_shift = reinterpret_cast(shift); const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]); #else const GSVector4* shift = (GSVector4*)g_const->m_shift_128b; - const u64* half_shift = reinterpret_cast(shift); const GSVector4 step_shift = shift[0]; #endif @@ -165,16 +163,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons if (has_z) { - const VectorF dz = VectorF::broadcast64(&dscan.p.z); + const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z); + const VectorF dzf(static_cast(dscan.p.F64[1])); #if _M_SSE >= 0x501 - GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift))); + GSVector4::storel(&local.d8.p.z, dz.mul64(GSVector4::f32to64(shift))); #else local.d4.z = dz.mul64(GSVector4::f32to64(shift)); #endif for (int i = 0; i < vlen; i++) { - local.d[i].z0 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 2])); - local.d[i].z1 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 3])); + local.d[i].z = dzf * shift[i + 1]; } } } @@ -359,8 +357,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.zb) { VectorF zbase = VectorF::broadcast64(&scan.p.z); - z0 = zbase.add64(local.d[skip].z0); - z1 = zbase.add64(local.d[skip].z1); + z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0])); + z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2])); } } diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index 1a836c1dc8..e47d93ee80 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -723,19 +723,12 @@ void GSDrawScanlineCodeGenerator2::Init() if (m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; - broadcastsd(_z, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z - if (hasAVX) - { - vaddpd(xym7, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); - vaddpd(_z, _z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); - } - else - { - movaps(xym7, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]); - addpd(xym7, _z); - addpd(_z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]); - } + broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z + cvtps2pd(xym7, ptr[a1 + offsetof(GSScanlineLocalData::skip, z.I8[0])]); + addpd(xym7, xym1); movaps(_rip_local(temp.z0), xym7); + cvtps2pd(_z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z.I8[vecsize/2])]); + addpd(_z, xym1); } } } diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index 763157f39f..f9250e48ff 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -170,7 +170,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it { #if _M_SSE >= 0x501 - struct skip { GSVector8 z0, z1, s, t, q; GSVector8i rb, ga, f; } d[8]; + struct skip { GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad; } d[8]; struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u64 z; u32 f; } p; } d8; struct { u32 z, f; } p; struct { GSVector8i rb, ga; } c; @@ -198,7 +198,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it #else - struct skip { GSVector4 z0, z1, s, t, q; GSVector4i rb, ga, f; } d[4]; + struct skip { GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad; } d[4]; struct step { GSVector4 z, stq; GSVector4i c, f; } d4; struct { GSVector4i rb, ga; } c; struct { GSVector4i z, f; } p; diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index 0006a1eef1..a1992dc1b1 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -200,18 +200,16 @@ void GSSetupPrimCodeGenerator2::Depth_XMM() mulpd(xmm1, xmm0); movaps(_rip_local_d_p(z), xmm1); + cvtpd2ps(xmm0, xmm0); + unpcklpd(xmm0, xmm0); + for (int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2])); // m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3])); - cvtps2pd(xmm1, XYm(4 + i)); - pshufd(xmm2, XYm(4 + i), _MM_SHUFFLE(1, 0, 3, 2)); - cvtps2pd(xmm2, xmm2); - mulpd(xmm1, xmm0); - mulpd(xmm2, xmm0); - movaps(_rip_local(d[i].z0), xmm1); - movaps(_rip_local(d[i].z1), xmm2); + THREEARG(mulps, xmm1, xmm0, XYm(4 + i)); + movdqa(_rip_local(d[i].z), xmm1); } } } @@ -282,33 +280,25 @@ void GSSetupPrimCodeGenerator2::Depth_YMM() if (m_en.z) { // const VectorF dz = VectorF::broadcast64(&dscan.p.z); - vbroadcastsd(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); + movsd(xmm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]); // GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift))); - cvtss2sd(xmm1, xmm3); + vcvtss2sd(xmm1, xmm3, xmm3); vmulsd(xmm1, xmm0, xmm1); movsd(_rip_local_d_p(z), xmm1); + cvtsd2ss(xmm0, xmm0); + vbroadcastss(ymm0, xmm0); + for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++) { - // m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2])); - // m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3])); + // m_local.d[i].z = dzf * shift[i + 1]; if (i < 4 || many_regs) - { - cvtps2pd(ymm1, Xmm(4 + i)); - vextracti128(xmm2, Ymm(4 + i), 1); - cvtps2pd(ymm2, xmm2); - } + vmulps(ymm1, Ymm(4 + i), ymm0); else - { - cvtps2pd(ymm1, ptr[&g_const->m_shift_256b[i + 1][0]]); - cvtps2pd(ymm2, ptr[&g_const->m_shift_256b[i + 1][4]]); - } - mulpd(ymm1, ymm0); - mulpd(ymm2, ymm0); - movaps(_rip_local(d[i].z0), ymm1); - movaps(_rip_local(d[i].z1), ymm2); + vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]); + movaps(_rip_local(d[i].z), ymm1); } } }