GS:SW: Rearrange GSVertexSW members to better match planned DoubleZ arrangement

Note: This removes zequal. The DoubleZ change will fix this, but until then things will break.
2022-05-11 13:12:35 -05:00 · 2022-05-11 13:12:35 -05:00 · 9be7eb67d8
parent 56bba522ac
commit 9be7eb67d8
7 changed files with 124 additions and 241 deletions
--- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
@ -98,7 +98,6 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
 	sel.fb = m_global.sel.fb;
 	sel.zb = m_global.sel.zb;
 	sel.zoverflow = m_global.sel.zoverflow;
 	sel.zequal = m_global.sel.zequal;
 	sel.notest = m_global.sel.notest;
 	m_sp = m_sp_map[sel];
@ -138,23 +137,22 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
 	const GSVector4 step_shift = shift[0];
 #endif
 	GSVector4 tstep = dscan.t * step_shift;
 	if (has_z || has_f)
 	{
 		if (sel.prim != GS_SPRITE_CLASS)
 		{
 #if _M_SSE >= 0x501
 			GSVector4 dp8 = dscan.p * step_shift;
 #endif
 			if (has_f)
 			{
 #if _M_SSE >= 0x501
-				local.d8.p.f = GSVector4i(dp8).extract32<3>();
+				local.d8.p.f = GSVector4i(tstep).extract32<3>();
-				GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
+				GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
 #else
-				GSVector4 df = dscan.p.wwww();
+				GSVector4 df = dscan.t.wwww();
-				local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
+				local.d4.f = GSVector4i(tstep).zzzzh().wwww();
 #endif
 				for (int i = 0; i < vlen; i++)
@ -165,25 +163,18 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
 			if (has_z)
 			{
 				if (sel.zequal)
 				{
 					local.p.z = vertex[index[1]].t.U32[3];
 				}
 				{
 #if _M_SSE >= 0x501
-					local.d8.p.z = dp8.extract32<2>();
+				const VectorF dz = VectorF::broadcast32(&dscan.p.z);
-					const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
+				local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
 #else
-					const GSVector4 dz = dscan.p.zzzz();
+				const GSVector4 dz = dscan.p.zzzz();
-					local.d4.z = dz * shift[0];
+				local.d4.z = dz * step_shift;
 #endif
-					for (int i = 0; i < vlen; i++)
+				for (int i = 0; i < vlen; i++)
-					{
+				{
-						local.d[i].z = dz * shift[1 + i];
+					local.d[i].z = dz * shift[1 + i];
 					}
 				}
 			}
 		}
@ -207,8 +198,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
 	if (has_t)
 	{
 		GSVector4 tstep = dscan.t * step_shift;
 		if (sel.fst)
 		{
 			LOCAL_STEP.stq = GSVector4::cast(GSVector4i(tstep));
@ -361,9 +350,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 		if (sel.fwrite && sel.fge)
 		{
 #if _M_SSE >= 0x501
-			f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(local.d[skip].f);
+			f = GSVector8i::broadcast16(GSVector4i(scan.t).srl<12>()).add16(local.d[skip].f);
 #else
-			f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f);
+			f = GSVector4i(scan.t).zzzzh().zzzz().add16(local.d[skip].f);
 #endif
 		}
@ -378,9 +367,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 		if (sel.edge)
 		{
 #if _M_SSE >= 0x501
-			cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9);
+			cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
 #else
-			cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9);
+			cov = GSVector4i::cast(scan.p).xxxxl().xxxx().srl16(9);
 #endif
 		}
@ -467,11 +456,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 					GSVector4 z = scan.p.zzzz() + zo;
 #endif
-					if (sel.zequal)
+					if (sel.zoverflow)
 					{
 						zs = local.p.z;
 					}
 					else if (sel.zoverflow)
 					{
 						zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
 					}
--- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
@ -697,28 +697,33 @@ void GSDrawScanlineCodeGenerator2::Init()
 	{
 		if (m_sel.fwrite && m_sel.fge || m_sel.zb)
 		{
 			broadcastf128(z, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p
 			if (m_sel.fwrite && m_sel.fge)
 			{
-				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+				// f = GSVector4i(v.t).zzzzh().zzzz().add16(m_local.d[skip].f);
 				if (isYmm)
 					vbroadcastss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]);
 				else
 					movss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]); // v.t.w
-				cvttps2dq(f, z);
+				cvttps2dq(f, f);
-				pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+				punpcklwd(f, f);
-				pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+				pshufd(f, f, _MM_SHUFFLE(0, 0, 0, 0));
 				paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
 			}
 			if (m_sel.zb)
 			{
-				if (!m_sel.zequal)
+				// z = vp.zzzz() + m_local.d[skip].z;
 				if (hasAVX)
 				{
-					// z = vp.zzzz() + m_local.d[skip].z;
+					vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
 					shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
 					addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
 				}
 				else
-					pbroadcastdLocal(z, _rip_local(p.z));
+				{
 					movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
 					shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
 				}
 				addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
 			}
 		}
 	}
@ -733,21 +738,22 @@ void GSDrawScanlineCodeGenerator2::Init()
 			pbroadcastwLocal(_f, _rip_local(p.f));
 	}
 	const XYm& vt = xym4;
 	if (m_sel.fb)
 	{
 		if (m_sel.edge || m_sel.tfx != TFX_NONE)
 		{
 			broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
 		}
 		if (m_sel.edge)
 		{
-			// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
+			// m_local.temp.cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
-			pshufhw(xym3, vt, _MM_SHUFFLE(2, 2, 2, 2));
+			if (hasAVX2)
-			pshufd(xym3, xym3, _MM_SHUFFLE(3, 3, 3, 3));
+			{
 				vpbroadcastw(xym3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
 			}
 			else
 			{
 				movd(xmm3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
 				punpcklwd(xmm3, xmm3);
 				pshufd(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
 			}
 			psrlw(xym3, 9);
 			movdqa(_rip_local(temp.cov), xym3);
@ -755,6 +761,10 @@ void GSDrawScanlineCodeGenerator2::Init()
 		if (m_sel.tfx != TFX_NONE)
 		{
 			const XYm& vt = xym4;
 			broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
 			// a1 = &m_local.d[skip]
 			const XYm& s = _s;
@ -901,14 +911,7 @@ void GSDrawScanlineCodeGenerator2::Step()
 		if (m_sel.zb)
 		{
-			if (m_sel.zequal)
+			BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
 			{
 				pbroadcastdLocal(z, _rip_local(p.z));
 			}
 			else
 			{
 				BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
 			}
 		}
 		// f = f.add16(m_local.d4.f);
@ -1051,11 +1054,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
-		if (m_sel.zequal)
+		if (m_sel.zoverflow)
 		{
 			movdqa(xym0, _z);
 		}
 		else if (m_sel.zoverflow)
 		{
 			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
 			/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
--- a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp
+++ b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp
@ -963,7 +963,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, xi, top, edge);
-					e->t.U32[3] = (0x10000 - xf) & 0xffff;
+					e->p.U32[0] = (0x10000 - xf) & 0xffff;
 					e++;
 				}
@ -986,7 +986,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, xi, top, edge);
-					e->t.U32[3] = xf;
+					e->p.U32[0] = xf;
 					e++;
 				}
@ -1053,7 +1053,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, left, yi, edge);
-					e->t.U32[3] = (0x10000 - yf) & 0xffff;
+					e->p.U32[0] = (0x10000 - yf) & 0xffff;
 					e++;
 				}
@ -1076,7 +1076,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, left, yi, edge);
-					e->t.U32[3] = yf;
+					e->p.U32[0] = yf;
 					e++;
 				}
--- a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp
+++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp
@ -198,74 +198,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 {
 	// FIXME q_div wasn't added to AVX2 code path.
 #if 0 //_M_SSE >= 0x501
 	// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
 	GSVector8i o2((GSVector4i)m_context->XYOFFSET);
 	GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
 	for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
 	{
 		GSVector8i v0 = GSVector8i::load<true>(src[0].m);
 		GSVector8i v1 = GSVector8i::load<true>(src[1].m);
 		GSVector8 stcq = GSVector8::cast(v0.ac(v1));
 		GSVector8i xyzuvf = v0.bd(v1);
 		//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
 		//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
 		GSVector8i xy = xyzuvf.upl16() - o2;
 		GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
 		GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * m_pos_scale2;
 		GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
 		GSVector8 t = GSVector8::zero();
 		if(tme)
 		{
 			if(fst)
 			{
 				t = GSVector8(xyzuvf.uph16() << (16 - 4));
 			}
 			else
 			{
 				t = stcq.xyww() * tsize2;
 			}
 		}
 		if(primclass == GS_SPRITE_CLASS)
 		{
 			t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
 		}
 		GSVector8::storel(&dst[0].p, p);
 		if(tme || primclass == GS_SPRITE_CLASS)
 		{
 			GSVector8::store<true>(&dst[0].t, t.ac(c));
 		}
 		else
 		{
 			GSVector8::storel(&dst[0].c, c);
 		}
 		GSVector8::storeh(&dst[1].p, p);
 		if(tme || primclass == GS_SPRITE_CLASS)
 		{
 			GSVector8::store<true>(&dst[1].t, t.bd(c));
 		}
 		else
 		{
 			GSVector8::storeh(&dst[1].c, c);
 		}
 	}
 #else
 	GSVector4i off = (GSVector4i)m_context->XYOFFSET;
 	GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
 	GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
@ -277,9 +209,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 		GSVector4i xyzuvf(src->m[1]);
 		GSVector4i xy = xyzuvf.upl16() - off;
 		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
 		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale;
 		dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
 		GSVector4 t = GSVector4::zero();
@ -311,11 +241,19 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 			}
 		}
-		if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z)
+		if (primclass == GS_SPRITE_CLASS)
 		{
 			dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * m_pos_scale;
 			xyzuvf = xyzuvf.min_u32(z_max);
 			t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
 		}
 		else
 		{
 			float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
 			dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
 			t = t.blend32<8>(GSVector4(xyzuvf << 7));
 		}
 		dst->t = t;
@ -325,8 +263,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 #endif
 	}
 #endif
 }
 void GSRendererSW::Draw()
@ -1352,7 +1288,6 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 		gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
 		gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
 		gd.sel.zequal = !!m_vt.m_eq.z;
 		gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
 		gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
 	}
--- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
+++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
@ -64,7 +64,6 @@ union GSScanlineSelector
 		u32 mmin   : 2; // 54
 		u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
 		// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
 		u32 zequal : 1; // 56
 		u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
 	};
--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
@ -82,6 +82,19 @@ void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem
 #endif
 }
 // Emits code that loads the 32-bit float at `mem` and replicates it into `reg`.
 // Picks the instruction sequence based on `hasAVX`.
 void GSSetupPrimCodeGenerator2::broadcastss(const XYm& reg, const Address& mem)
 {
 	if (hasAVX)
 	{
 		// AVX path: a single instruction performs the load and the broadcast.
 		vbroadcastss(reg, mem);
 	}
 	else
 	{
 		// Non-AVX path: load the scalar into the lowest lane, then shuffle
 		// lane 0 into every 32-bit lane of the register.
 		movss(reg, mem);
 		shufps(reg, reg, _MM_SHUFFLE(0, 0, 0, 0));
 	}
 }
 void GSSetupPrimCodeGenerator2::Generate()
 {
 	// Technically we just need the delta < 2GB
@ -152,16 +165,10 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
 		// GSVector4 p = dscan.p;
 		movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
 		if (m_en.f)
 		{
-			// GSVector4 df = p.wwww();
+			// GSVector4 df = t.wwww();
-
+			broadcastss(xym1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
 			THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
 			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
@ -185,38 +192,21 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
 		if (m_en.z)
 		{
-			if (m_sel.zequal)
+			// GSVector4 dz = p.zzzz();
 			broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
 			// m_local.d4.z = dz * 4.0f;
 			THREEARG(mulps, xmm1, xmm0, xmm3);
 			movdqa(_rip_local_d_p(z), xmm1);
 			for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
 			{
-				u32 offset = 0;
+				// m_local.d[i].z = dz * m_shift[i];
 				if (m_sel.prim != GS_POINT_CLASS)
 					offset = sizeof(u32) * 1;
-				mov(eax, ptr[_index + offset]);
+				THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
-				shl(eax, 6); // * sizeof(GSVertexSW)
+				movdqa(_rip_local(d[i].z), xmm1);
 				add(rax, _64_vertex);
 				movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
 				pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
 				movdqa(_rip_local(p.z), xmm0);
 			}
 			else
 			{
 				// GSVector4 dz = p.zzzz();
 				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
 				// m_local.d4.z = dz * 4.0f;
 				THREEARG(mulps, xmm1, xmm0, xmm3);
 				movdqa(_rip_local_d_p(z), xmm1);
 				for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
 				{
 					// m_local.d[i].z = dz * m_shift[i];
 					THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
 					movdqa(_rip_local(d[i].z), xmm1);
 				}
 			}
 		}
 	}
@ -259,68 +249,19 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
 		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
 		broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
 		vmulps(ymm1, ymm0, ymm3);
 		if (m_en.z)
 		{
 			if (m_sel.zequal)
 			{
 				u32 offset = 0;
 				if (m_sel.prim != GS_POINT_CLASS)
 					offset = sizeof(u32) * 1;
 				mov(eax, ptr[_index + offset]);
 				shl(eax, 6); // * sizeof(GSVertexSW)
 				add(rax, _64_vertex);
 				mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
 				mov(_rip_local(p.z), t1.cvt32());
 			}
 			else
 			{
 				// m_local.d8.p.z = dp8.extract32<2>();
 				extractps(_rip_local_d_p(z), xmm1, 2);
 				// GSVector8 dz = GSVector8(dscan.p).zzzz();
 				vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
 			}
 		}
 		if (m_en.f)
 		{
-			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
+			// GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
 			vbroadcastss(ymm1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
-			cvtps2dq(ymm1, ymm1);
+			// local.d8.p.f = GSVector4i(tstep).extract32<3>();
-			pextrd(_rip_local_d_p(f), xmm1, 3);
+			vmulps(xmm0, xmm1, xmm3);
 			cvtps2dq(xmm0, xmm0);
 			movd(_rip_local_d_p(f), xmm0);
-			// GSVector8 df = GSVector8(dscan.p).wwww();
+			for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 			vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
 		}
 		for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 		{
 			if (m_en.z)
 			{
-				// m_local.d[i].z = dz * shift[1 + i];
+				// m_local.d[i].f = GSVectorI(df * m_shift[i]).xxzzlh();
 				// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is commutative)
 				if (i < 4 || many_regs)
 					vmulps(ymm0, Ymm(4 + i), ymm2);
 				else
 					vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
 				movaps(_rip_local(d[i].z), ymm0);
 			}
 			if (m_en.f)
 			{
 				// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
 				if (i < 4 || many_regs)
 					vmulps(ymm0, Ymm(4 + i), ymm1);
@ -332,6 +273,28 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
 				movdqa(_rip_local(d[i].f), ymm0);
 			}
 		}
 		if (m_en.z)
 		{
 			// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
 			vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
 			// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
 			vmulss(xmm1, xmm0, xmm3);
 			movss(_rip_local_d_p(z), xmm1);
 			for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 			{
 				// m_local.d[i].z = dz * shift[1 + i];
 				// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
 				if (i < 4 || many_regs)
 					vmulps(ymm1, Ymm(4 + i), ymm0);
 				else
 					vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
 				movaps(_rip_local(d[i].z), ymm1);
 			}
 		}
 	}
 	else
 	{
--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
@ -69,6 +69,8 @@ public:
 private:
 	/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
 	void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
 	/// Broadcast a 32-bit float to the whole register, whatever size that register might be
 	void broadcastss(const XYm& reg, const Xbyak::Address& mem);
 	void Depth_XMM();
 	void Depth_YMM();