GS:SW: Rearrange GSVertexSW members to better match planned DoubleZ arrangement

Note: Removes zequal. DoubleZ will fix this, but until then things will break.
2022-05-11 13:12:35 -05:00 · 2022-05-11 13:12:35 -05:00 · 9be7eb67d8
parent 56bba522ac
commit 9be7eb67d8
7 changed files with 124 additions and 241 deletions
--- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
@ -98,7 +98,6 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
 	sel.fb = m_global.sel.fb;
 	sel.zb = m_global.sel.zb;
 	sel.zoverflow = m_global.sel.zoverflow;
-	sel.zequal = m_global.sel.zequal;
 	sel.notest = m_global.sel.notest;

 	m_sp = m_sp_map[sel];
@ -138,23 +137,22 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
 	const GSVector4 step_shift = shift[0];
 #endif

+	GSVector4 tstep = dscan.t * step_shift;
+
 	if (has_z || has_f)
 	{
 		if (sel.prim != GS_SPRITE_CLASS)
 		{
-#if _M_SSE >= 0x501
-			GSVector4 dp8 = dscan.p * step_shift;
-#endif
 			if (has_f)
 			{
 #if _M_SSE >= 0x501
-				local.d8.p.f = GSVector4i(dp8).extract32<3>();
+				local.d8.p.f = GSVector4i(tstep).extract32<3>();

-				GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
+				GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
 #else
-				GSVector4 df = dscan.p.wwww();
+				GSVector4 df = dscan.t.wwww();

-				local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
+				local.d4.f = GSVector4i(tstep).zzzzh().wwww();
 #endif

 				for (int i = 0; i < vlen; i++)
@ -165,20 +163,14 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons

 			if (has_z)
 			{
-				if (sel.zequal)
-				{
-					local.p.z = vertex[index[1]].t.U32[3];
-				}
-
-				{
 #if _M_SSE >= 0x501
-					local.d8.p.z = dp8.extract32<2>();
+				const VectorF dz = VectorF::broadcast32(&dscan.p.z);

-					const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
+				local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
 #else
 				const GSVector4 dz = dscan.p.zzzz();

-					local.d4.z = dz * shift[0];
+				local.d4.z = dz * step_shift;
 #endif
 				for (int i = 0; i < vlen; i++)
 				{
@ -186,7 +178,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
 				}
 			}
 		}
-		}
 		else
 		{
 			if (has_f)
@ -207,8 +198,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons

 	if (has_t)
 	{
-		GSVector4 tstep = dscan.t * step_shift;
-
 		if (sel.fst)
 		{
 			LOCAL_STEP.stq = GSVector4::cast(GSVector4i(tstep));
@ -361,9 +350,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 		if (sel.fwrite && sel.fge)
 		{
 #if _M_SSE >= 0x501
-			f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(local.d[skip].f);
+			f = GSVector8i::broadcast16(GSVector4i(scan.t).srl<12>()).add16(local.d[skip].f);
 #else
-			f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f);
+			f = GSVector4i(scan.t).zzzzh().zzzz().add16(local.d[skip].f);
 #endif
 		}

@ -378,9 +367,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 		if (sel.edge)
 		{
 #if _M_SSE >= 0x501
-			cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9);
+			cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
 #else
-			cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9);
+			cov = GSVector4i::cast(scan.p).xxxxl().xxxx().srl16(9);
 #endif
 		}

@ -467,11 +456,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
 					GSVector4 z = scan.p.zzzz() + zo;
 #endif

-					if (sel.zequal)
-					{
-						zs = local.p.z;
-					}
-					else if (sel.zoverflow)
+					if (sel.zoverflow)
 					{
 						zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
 					}
--- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
@ -697,28 +697,33 @@ void GSDrawScanlineCodeGenerator2::Init()
 	{
 		if (m_sel.fwrite && m_sel.fge || m_sel.zb)
 		{
-			broadcastf128(z, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p
-
 			if (m_sel.fwrite && m_sel.fge)
 			{
-				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+				// f = GSVector4i(v.t).zzzzh().zzzz().add16(m_local.d[skip].f);
+				if (isYmm)
+					vbroadcastss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]);
+				else
+					movss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]); // v.t.w

-				cvttps2dq(f, z);
-				pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
-				pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+				cvttps2dq(f, f);
+				punpcklwd(f, f);
+				pshufd(f, f, _MM_SHUFFLE(0, 0, 0, 0));
 				paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
 			}

 			if (m_sel.zb)
-			{
-				if (!m_sel.zequal)
 			{
 				// z = vp.zzzz() + m_local.d[skip].z;
-					shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
-					addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
+				if (hasAVX)
+				{
+					vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
 				}
 				else
-					pbroadcastdLocal(z, _rip_local(p.z));
+				{
+					movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
+					shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
+				}
+				addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
 			}
 		}
 	}
@ -733,21 +738,22 @@ void GSDrawScanlineCodeGenerator2::Init()
 			pbroadcastwLocal(_f, _rip_local(p.f));
 	}

-	const XYm& vt = xym4;
-
 	if (m_sel.fb)
 	{
-		if (m_sel.edge || m_sel.tfx != TFX_NONE)
-		{
-			broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
-		}
-
 		if (m_sel.edge)
 		{
-			// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
+			// m_local.temp.cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);

-			pshufhw(xym3, vt, _MM_SHUFFLE(2, 2, 2, 2));
-			pshufd(xym3, xym3, _MM_SHUFFLE(3, 3, 3, 3));
+			if (hasAVX2)
+			{
+				vpbroadcastw(xym3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
+			}
+			else
+			{
+				movd(xmm3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
+				punpcklwd(xmm3, xmm3);
+				pshufd(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
+			}
 			psrlw(xym3, 9);

 			movdqa(_rip_local(temp.cov), xym3);
@ -755,6 +761,10 @@ void GSDrawScanlineCodeGenerator2::Init()

 		if (m_sel.tfx != TFX_NONE)
 		{
+			const XYm& vt = xym4;
+
+			broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
+
 			// a1 = &m_local.d[skip]

 			const XYm& s = _s;
@ -900,16 +910,9 @@ void GSDrawScanlineCodeGenerator2::Step()
 		// z += m_local.d4.z;

 		if (m_sel.zb)
-		{
-			if (m_sel.zequal)
-			{
-				pbroadcastdLocal(z, _rip_local(p.z));
-			}
-			else
 		{
 			BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
 		}
-		}

 		// f = f.add16(m_local.d4.f);

@ -1051,11 +1054,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)

 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
-		if (m_sel.zequal)
-		{
-			movdqa(xym0, _z);
-		}
-		else if (m_sel.zoverflow)
+		if (m_sel.zoverflow)
 		{
 			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
 			/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
--- a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp
+++ b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp
@ -963,7 +963,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, xi, top, edge);

-					e->t.U32[3] = (0x10000 - xf) & 0xffff;
+					e->p.U32[0] = (0x10000 - xf) & 0xffff;

 					e++;
 				}
@ -986,7 +986,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, xi, top, edge);

-					e->t.U32[3] = xf;
+					e->p.U32[0] = xf;

 					e++;
 				}
@ -1053,7 +1053,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, left, yi, edge);

-					e->t.U32[3] = (0x10000 - yf) & 0xffff;
+					e->p.U32[0] = (0x10000 - yf) & 0xffff;

 					e++;
 				}
@ -1076,7 +1076,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 				{
 					AddScanline(e, 1, left, yi, edge);

-					e->t.U32[3] = yf;
+					e->p.U32[0] = yf;

 					e++;
 				}
--- a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp
+++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp
@ -198,74 +198,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 {
 	// FIXME q_div wasn't added to AVX2 code path.

-#if 0 //_M_SSE >= 0x501
-
-	// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
-
-	GSVector8i o2((GSVector4i)m_context->XYOFFSET);
-	GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
-
-	for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
-	{
-		GSVector8i v0 = GSVector8i::load<true>(src[0].m);
-		GSVector8i v1 = GSVector8i::load<true>(src[1].m);
-
-		GSVector8 stcq = GSVector8::cast(v0.ac(v1));
-		GSVector8i xyzuvf = v0.bd(v1);
-
-		//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
-		//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
-
-		GSVector8i xy = xyzuvf.upl16() - o2;
-		GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
-
-		GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * m_pos_scale2;
-		GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
-
-		GSVector8 t = GSVector8::zero();
-
-		if(tme)
-		{
-			if(fst)
-			{
-				t = GSVector8(xyzuvf.uph16() << (16 - 4));
-			}
-			else
-			{
-				t = stcq.xyww() * tsize2;
-			}
-		}
-
-		if(primclass == GS_SPRITE_CLASS)
-		{
-			t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
-		}
-
-		GSVector8::storel(&dst[0].p, p);
-
-		if(tme || primclass == GS_SPRITE_CLASS)
-		{
-			GSVector8::store<true>(&dst[0].t, t.ac(c));
-		}
-		else
-		{
-			GSVector8::storel(&dst[0].c, c);
-		}
-
-		GSVector8::storeh(&dst[1].p, p);
-
-		if(tme || primclass == GS_SPRITE_CLASS)
-		{
-			GSVector8::store<true>(&dst[1].t, t.bd(c));
-		}
-		else
-		{
-			GSVector8::storeh(&dst[1].c, c);
-		}
-	}
-
-#else
-
 	GSVector4i off = (GSVector4i)m_context->XYOFFSET;
 	GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
 	GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
@ -277,9 +209,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 		GSVector4i xyzuvf(src->m[1]);

 		GSVector4i xy = xyzuvf.upl16() - off;
-		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());

-		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale;
 		dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);

 		GSVector4 t = GSVector4::zero();
@ -311,11 +241,19 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 			}
 		}

-		if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z)
+		if (primclass == GS_SPRITE_CLASS)
 		{
+			dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * m_pos_scale;
+
 			xyzuvf = xyzuvf.min_u32(z_max);
 			t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
 		}
+		else
+		{
+			float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
+			dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
+			t = t.blend32<8>(GSVector4(xyzuvf << 7));
+		}

 		dst->t = t;

@ -325,8 +263,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*

 #endif
 	}
-
-#endif
 }

 void GSRendererSW::Draw()
@ -1352,7 +1288,6 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)

 		gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
 		gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
-		gd.sel.zequal = !!m_vt.m_eq.z;
 		gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
 		gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
 	}
--- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
+++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
@ -64,7 +64,6 @@ union GSScanlineSelector
 		u32 mmin   : 2; // 54
 		u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
 		// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
-		u32 zequal : 1; // 56
 		u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
 	};

--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
@ -82,6 +82,19 @@ void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem
 #endif
 }

+void GSSetupPrimCodeGenerator2::broadcastss(const XYm& reg, const Address& mem) // Emit code that fills every 32-bit lane of `reg` with the float stored at `mem`
+{
+	if (hasAVX) // AVX path: a single instruction performs the load and the broadcast
+	{
+		vbroadcastss(reg, mem);
+	}
+	else // Non-AVX fallback: two-instruction sequence producing the same result
+	{
+		movss(reg, mem); // load the scalar into lane 0
+		shufps(reg, reg, _MM_SHUFFLE(0, 0, 0, 0)); // replicate lane 0 into all four lanes
+	}
+}
+
 void GSSetupPrimCodeGenerator2::Generate()
 {
 	// Technically we just need the delta < 2GB
@ -152,16 +165,10 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()

 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
-		// GSVector4 p = dscan.p;
-
-
-		movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
-
 		if (m_en.f)
 		{
-			// GSVector4 df = p.wwww();
-
-			THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+			// GSVector4 df = t.wwww();
+			broadcastss(xym1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);

 			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();

@ -184,26 +191,10 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
 		}

 		if (m_en.z)
-		{
-			if (m_sel.zequal)
-			{
-				u32 offset = 0;
-				if (m_sel.prim != GS_POINT_CLASS)
-					offset = sizeof(u32) * 1;
-
-				mov(eax, ptr[_index + offset]);
-				shl(eax, 6); // * sizeof(GSVertexSW)
-				add(rax, _64_vertex);
-
-				movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
-				pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
-				movdqa(_rip_local(p.z), xmm0);
-			}
-			else
 		{
 			// GSVector4 dz = p.zzzz();

-				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+			broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);

 			// m_local.d4.z = dz * 4.0f;

@ -219,7 +210,6 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
 			}
 		}
 	}
-	}
 	else
 	{
 		// GSVector4 p = vertex[index[1]].p;
@ -259,68 +249,19 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()

 	if (m_sel.prim != GS_SPRITE_CLASS)
 	{
-		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
-
-		broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
-
-		vmulps(ymm1, ymm0, ymm3);
-
-		if (m_en.z)
-		{
-			if (m_sel.zequal)
-			{
-				u32 offset = 0;
-				if (m_sel.prim != GS_POINT_CLASS)
-					offset = sizeof(u32) * 1;
-
-				mov(eax, ptr[_index + offset]);
-				shl(eax, 6); // * sizeof(GSVertexSW)
-				add(rax, _64_vertex);
-
-				mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
-				mov(_rip_local(p.z), t1.cvt32());
-			}
-			else
-			{
-				// m_local.d8.p.z = dp8.extract32<2>();
-
-				extractps(_rip_local_d_p(z), xmm1, 2);
-
-				// GSVector8 dz = GSVector8(dscan.p).zzzz();
-
-				vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
-			}
-		}
-
 		if (m_en.f)
 		{
-			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
+			// GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
+			vbroadcastss(ymm1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);

-			cvtps2dq(ymm1, ymm1);
-			pextrd(_rip_local_d_p(f), xmm1, 3);
-
-			// GSVector8 df = GSVector8(dscan.p).wwww();
-
-			vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
-		}
+			// local.d8.p.f = GSVector4i(tstep).extract32<3>();
+			vmulps(xmm0, xmm1, xmm3);
+			cvtps2dq(xmm0, xmm0);
+			movd(_rip_local_d_p(f), xmm0);

 			for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 			{
-			if (m_en.z)
-			{
-				// m_local.d[i].z = dz * shift[1 + i];
-
-				// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is communative)
-				if (i < 4 || many_regs)
-					vmulps(ymm0, Ymm(4 + i), ymm2);
-				else
-					vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
-				movaps(_rip_local(d[i].z), ymm0);
-			}
-
-			if (m_en.f)
-			{
-				// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
+				// m_local.d[i].f = GSVectorI(df * m_shift[i]).xxzzlh();

 				if (i < 4 || many_regs)
 					vmulps(ymm0, Ymm(4 + i), ymm1);
@ -332,6 +273,28 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
 				movdqa(_rip_local(d[i].f), ymm0);
 			}
 		}
+
+		if (m_en.z)
+		{
+			// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
+			vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
+
+			// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
+			vmulss(xmm1, xmm0, xmm3);
+			movss(_rip_local_d_p(z), xmm1);
+
+			for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
+			{
+				// m_local.d[i].z = dz * shift[1 + i];
+
+				// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
+				if (i < 4 || many_regs)
+					vmulps(ymm1, Ymm(4 + i), ymm0);
+				else
+					vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
+				movaps(_rip_local(d[i].z), ymm1);
+			}
+		}
 	}
 	else
 	{
--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
@ -69,6 +69,8 @@ public:
 private:
 	/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
 	void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
+	/// Broadcast a 32-bit float to the whole register, whatever size that register might be
+	void broadcastss(const XYm& reg, const Xbyak::Address& mem);

 	void Depth_XMM();
 	void Depth_YMM();