From eb0b341e61cd3cd06db74e9ebdf0f32346d4c197 Mon Sep 17 00:00:00 2001
From: TellowKrinkle <tellowkrinkle@gmail.com>
Date: Mon, 7 Oct 2024 00:18:33 -0500
Subject: [PATCH] GS:SW: Use unaligned loads to reduce constant size on AVX2

Allows more instructions to use 1-byte offsets
---
 pcsx2/GS/Renderers/SW/GSDrawScanline.cpp      | 30 +++----
 .../SW/GSDrawScanlineCodeGenerator.all.cpp    | 80 +++++++++++++------
 pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h    |  4 +
 pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 43 +++-------
 .../SW/GSSetupPrimCodeGenerator.all.cpp       | 21 +++--
 5 files changed, 100 insertions(+), 78 deletions(-)
diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
index 57f2fc8574..7c6d716d8c 100644
--- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
@@ -207,10 +207,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 	constexpr int vlen = sizeof(VectorF) / sizeof(float);
 
 #if _M_SSE >= 0x501
-	const GSVector8* shift = (GSVector8*)g_const_256b.m_shift;
-	const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
+	auto load_shift = [](int i) { return GSVector8::load<false>(&g_const_256b.m_shift[8 - i]); };
+	const GSVector4 step_shift = GSVector4::broadcast32(&g_const_256b.m_shift[0]);
 #else
-	const GSVector4* shift = (GSVector4*)g_const_128b.m_shift;
+	static const GSVector4* shift = reinterpret_cast<const GSVector4*>(g_const_128b.m_shift);
+	auto load_shift = [](int i) { return shift[1 + i]; };
 	const GSVector4 step_shift = shift[0];
 #endif
 
@@ -234,22 +235,23 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 
 				for (int i = 0; i < vlen; i++)
 				{
-					local.d[i].f = VectorI(df * shift[1 + i]).xxzzlh();
+					local.d[i].f = VectorI(df * load_shift(i)).xxzzlh();
 				}
 			}
 
 			if (has_z && !sel.zequal)
 			{
-				const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
 				const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
 #if _M_SSE >= 0x501
-				GSVector4::storel(&local.d8.p.z, dz.mul64(GSVector4::f32to64(shift)));
+				double dz = dscan.p.F64[1] * g_const_256b.m_shift[0];
+				memcpy(&local.d8.p.z, &dz, sizeof(dz));
 #else
+				const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
 				local.d4.z = dz.mul64(GSVector4::f32to64(shift));
 #endif
 				for (int i = 0; i < vlen; i++)
 				{
-					local.d[i].z = dzf * shift[i + 1];
+					local.d[i].z = dzf * load_shift(i);
 				}
 			}
 		}
@@ -297,7 +299,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 
 			for (int i = 0; i < vlen; i++)
 			{
-				VectorF v = dstq * shift[1 + i];
+				VectorF v = dstq * load_shift(i);
 
 				if (sel.fst)
 				{
@@ -336,8 +338,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 
 			for (int i = 0; i < vlen; i++)
 			{
-				VectorI r = VectorI(dr * shift[1 + i]).ps32();
-				VectorI b = VectorI(db * shift[1 + i]).ps32();
+				VectorI r = VectorI(dr * load_shift(i)).ps32();
+				VectorI b = VectorI(db * load_shift(i)).ps32();
 
 				local.d[i].rb = r.upl16(b);
 			}
@@ -347,8 +349,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 
 			for (int i = 0; i < vlen; i++)
 			{
-				VectorI g = VectorI(dg * shift[1 + i]).ps32();
-				VectorI a = VectorI(da * shift[1 + i]).ps32();
+				VectorI g = VectorI(dg * load_shift(i)).ps32();
+				VectorI a = VectorI(da * load_shift(i)).ps32();
 
 				local.d[i].ga = g.upl16(a);
 			}
@@ -515,7 +517,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
 		steps = pixels + skip - vlen;
 		left -= skip;
 #if _M_SSE >= 0x501
-		test = GSVector8i::i8to32(g_const_256b.m_test[skip]) | GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
+		test = GSVector8i::i8to32(&g_const_256b.m_test[16 - skip]) | GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
 #else
 		test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
 #endif
@@ -1756,7 +1758,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
 		if (!sel.notest)
 		{
 #if _M_SSE >= 0x501
-			test = GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
+			test = GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
 #else
 			test = const_test[7 + (steps & (steps >> 31))];
 #endif
diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
index b299558639..dc26ac61bc 100644
--- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
@@ -661,25 +661,29 @@ void GSDrawScanlineCodeGenerator::Init()
 
 		lea(a0.cvt32(), ptr[a0 + a1 - vecints]);
 
-		// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
-
-		mov(eax, a0.cvt32());
-		sar(eax, 31); // GH: 31 to extract the sign of the register
-		and_(eax, a0.cvt32());
-		if (isXmm)
-			shl(eax, 4); // * sizeof(m_test[0])
-		cdqe();
-
 		if (isXmm)
 		{
+			// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
+			mov(eax, a0.cvt32());
+			sar(eax, 31); // GH: 31 to extract the sign of the register
+			and_(eax, a0.cvt32());
+			shl(eax, 4); // * sizeof(m_test[0])
+			cdqe();
 			shl(a1.cvt32(), 4); // * sizeof(m_test[0])
 			movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]);
 			por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
 		}
 		else
 		{
-			pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
-			pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
+			// GSVector8i test = loadu(&m_test[16 - skip]) | loadu(&m_test[steps >= 0 ? 0 : -steps]);
+			mov(eax, a1.cvt32());
+			neg(rax); // rax = -skip
+			pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[16])]);
+			xor_(t0.cvt32(), t0.cvt32());
+			mov(eax, a0.cvt32());
+			neg(eax);               // eax = -steps
+			cmovs(eax, t0.cvt32()); // if (eax < 0) eax = 0
+			pmovsxbd(xym0, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
 			por(_test, xym0);
 			shl(a1.cvt32(), 5); // * sizeof(m_test[0])
 		}
@@ -922,7 +926,7 @@ void GSDrawScanlineCodeGenerator::Init()
 /// Inputs: a0=steps, t0=fza_offset
 /// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test
 /// Destroys[x86]: all
-/// Destroys[x64]: xym0, xym1, xym2, xym3
+/// Destroys[x64]: xym0, xym1, xym2, xym3, t2
 void GSDrawScanlineCodeGenerator::Step()
 {
 	// steps -= 4;
@@ -1048,19 +1052,22 @@ void GSDrawScanlineCodeGenerator::Step()
 
 	if (!m_sel.notest)
 	{
+#if USING_XMM
 		// test = m_test[7 + (steps & (steps >> 31))];
 
 		mov(eax, a0.cvt32());
 		sar(eax, 31); // GH: 31 to extract the sign of the register
 		and_(eax, a0.cvt32());
-		if (isXmm)
-			shl(eax, 4);
+		shl(eax, 4);
 		cdqe();
-
-#if USING_XMM
 		movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
 #else
-		pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
+		// test = loadu(&m_test[steps >= 0 ? 0 : -steps]);
+		xor_(t2.cvt32(), t2.cvt32());
+		mov(eax, a0.cvt32());
+		neg(eax);               // eax = -steps
+		cmovs(eax, t2.cvt32()); // if (eax < 0) eax = 0;
+		pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
 #endif
 	}
 }
@@ -1655,29 +1662,54 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
 		pslld(xym4, 9);
 		psrld(xym4, 9);
 
-		auto log2_coeff = [this](int i) -> Address
+#if USING_YMM
+		auto load_log2_coeff = [this](const XYm& reg, int i) // Emits a broadcast of scalar coefficient i into every lane of reg
 		{
-			ptr[_m_const + log2_coeff_offset(i)];
+			vbroadcastss(reg, ptr[_m_const + log2_coeff_offset(i)]);
 		};
+		auto log2_coeff = [this, &load_log2_coeff](int i) // Register operand form: emits a broadcast into xym6 (overwriting it) on every call
+		{
+			load_log2_coeff(xym6, i);
+			return xym6;
+		};
+#else
+		auto log2_coeff = [this](int i) -> Address // Memory operand form. Must be Address: returning Operand by value would slice off the addressing data
+		{
+			return ptr[_m_const + log2_coeff_offset(i)];
+		};
+		auto load_log2_coeff = [this, &log2_coeff](const XYm& reg, int i) // Emits an aligned load of coefficient i from memory into reg
+		{
+			movaps(reg, log2_coeff(i));
+		};
+#endif
 
-		orps(xym4, log2_coeff(3));
+		load_log2_coeff(xym1, 3);
+		orps(xym4, xym1);
 
 		// xym4 = mant(q) | 1.0f
 
 		if (hasFMA)
 		{
-			movaps(xym5, log2_coeff(0)); // c0
+			load_log2_coeff(xym5, 0); // c0
 			vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1
 			vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2
-			subps(xym4, log2_coeff(3)); // xym4 - 1.0f
+			subps(xym4, xym1); // xym4 - 1.0f
 			vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0
 		}
 		else
 		{
-			THREEARG(mulps, xym5, xym4, log2_coeff(0));
+			if (hasAVX)
+			{
+				vmulps(xym5, xym4, log2_coeff(0));
+			}
+			else
+			{
+				load_log2_coeff(xym5, 0);
+				mulps(xym5, xym4);
+			}
 			addps(xym5, log2_coeff(1));
 			mulps(xym5, xym4);
-			subps(xym4, log2_coeff(3));
+			subps(xym4, xym1);
 			addps(xym5, log2_coeff(2));
 			mulps(xym4, xym5);
 			addps(xym4, xym0);
diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
index d23c713a28..953e84cc67 100644
--- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
+++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
@@ -204,10 +204,12 @@ public:
 	FORWARD_OO_OI(or_)
 	FORWARD_OO_OI(sub)
 	FORWARD_OO_OI(xor_)
+	FORWARD(2, BASE, cmovs, const Reg&, const Operand&)
 	FORWARD(2, BASE, lea,   const Reg&, const Address&)
 	FORWARD(2, BASE, mov,   const Operand&, size_t)
 	FORWARD(2, BASE, mov,   ARGS_OO)
 	FORWARD(2, BASE, movzx, const Reg&, const Operand&)
+	FORWARD(1, BASE, neg,   const Operand&)
 	FORWARD(1, BASE, not_,  const Operand&)
 	FORWARD(1, BASE, pop,   const Operand&)
 	FORWARD(1, BASE, push,  const Operand&)
@@ -243,6 +245,8 @@ public:
 	AFORWARD(2, minps,     ARGS_XO)
 	SFORWARD(2, movaps,    ARGS_XO)
 	SFORWARD(2, movaps,    const Address&, const Xmm&)
+	SFORWARD(2, movups,    ARGS_XO)
+	SFORWARD(2, movups,    const Address&, const Xmm&)
 	SFORWARD(2, movd,      const Address&, const Xmm&)
 	SFORWARD(2, movd,      const Reg32&, const Xmm&)
 	SFORWARD(2, movd,      const Xmm&, const Address&)
diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
index 49e638faec..dbeef06a04 100644
--- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
+++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
@@ -256,46 +256,25 @@ namespace GSScanlineConstantData
 // Constant shared by all threads (to reduce cache miss)
 struct alignas(64) GSScanlineConstantData256B
 {
-	alignas(32) u8 m_test[16][8] = {
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-		{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-		{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-		{0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
-		{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
-		{0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
-		{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
-		{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
-		{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-		{0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-		{0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
-		{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+	// All AVX processors support unaligned access with little to no penalty as long as you don't cross a cache line.
+	// Take advantage of that: instead of one row per variant, store one run of elements and load a window at a per-element offset.
+	alignas(32) u8 m_test[24] = { // 8-byte window at [16 - skip] = skip 0xff bytes then zeros; window at [k], k in 0..7 = (8 - k) zeros then k 0xff bytes
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 	};
-	alignas(32) float m_shift[9][8] = {
-		{ 8.0f  , 8.0f  , 8.0f  , 8.0f  , 8.0f  , 8.0f  , 8.0f  , 8.0f},
-		{ 0.0f  , 1.0f  , 2.0f  , 3.0f  , 4.0f  , 5.0f  , 6.0f  , 7.0f},
-		{ -1.0f , 0.0f  , 1.0f  , 2.0f  , 3.0f  , 4.0f  , 5.0f  , 6.0f},
-		{ -2.0f , -1.0f , 0.0f  , 1.0f  , 2.0f  , 3.0f  , 4.0f  , 5.0f},
-		{ -3.0f , -2.0f , -1.0f , 0.0f  , 1.0f  , 2.0f  , 3.0f  , 4.0f},
-		{ -4.0f , -3.0f , -2.0f , -1.0f , 0.0f  , 1.0f  , 2.0f  , 3.0f},
-		{ -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f  , 1.0f  , 2.0f},
-		{ -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f  , 1.0f},
-		{ -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f},
+	float m_log2_coef[4] = {}; // One scalar per coefficient (filled in by the constructor below); users broadcast it when a full vector is needed
+	alignas(64) float m_shift[16] = { // [0] holds the 8.0 step, read on its own; 8-float window at [8 - i], i in 0..7 = {-i, 1 - i, ..., 7 - i}
+		8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
+		0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
 	};
-	alignas(32) float m_log2_coef[4][8] = {};
 
 	constexpr GSScanlineConstantData256B()
 	{
 		using namespace GSScanlineConstantData;
 		for (size_t n = 0; n < std::size(log2_coef); ++n)
 		{
-			for (size_t i = 0; i < 8; ++i)
-			{
-				m_log2_coef[n][i] = log2_coef[n];
-			}
+			m_log2_coef[n] = log2_coef[n];
 		}
 	}
 };
diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
index af62bed11c..e9ae51b3bf 100644
--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
@@ -110,7 +110,12 @@ void GSSetupPrimCodeGenerator::Generate()
 
 		for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
 		{
-			movaps(XYm(3 + i), ptr[rax + i * vecsize]);
+			if (isXmm)
+				movaps(XYm(3 + i), ptr[rax + i * vecsize]);
+			else if (i == 0)
+				vbroadcastss(xym3, ptr[rax]);
+			else
+				movups(XYm(3 + i), ptr[rax + (9 - i) * sizeof(float)]);
 		}
 	}
 
@@ -253,7 +258,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
 				if (i < 4 || many_regs)
 					vmulps(ymm0, Ymm(4 + i), ymm1);
 				else
-					vmulps(ymm0, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
+					vmulps(ymm0, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
 				cvttps2dq(ymm0, ymm0);
 				pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
 				pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
@@ -281,7 +286,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
 				if (i < 4 || many_regs)
 					vmulps(ymm1, Ymm(4 + i), ymm0);
 				else
-					vmulps(ymm1, ymm0, ptr[g_const_256b.m_shift[i + 1]]);
+					vmulps(ymm1, ymm0, ptr[&g_const_256b.m_shift[8 - i]]);
 				movaps(_rip_local_di(i, z), ymm1);
 			}
 		}
@@ -356,7 +361,7 @@ void GSSetupPrimCodeGenerator::Texture()
 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym2, XYm(4 + i), xym1);
 			else
-				vmulps(ymm2, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
+				vmulps(ymm2, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
 
 			if (m_sel.fst)
 			{
@@ -424,7 +429,7 @@ void GSSetupPrimCodeGenerator::Color()
 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym0, XYm(4 + i), xym2);
 			else
-				vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
+				vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
 			cvttps2dq(xym0, xym0);
 			packssdw(xym0, xym0);
 
@@ -433,7 +438,7 @@ void GSSetupPrimCodeGenerator::Color()
 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym1, XYm(4 + i), xym3);
 			else
-				vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
+				vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
 			cvttps2dq(xym1, xym1);
 			packssdw(xym1, xym1);
 
@@ -460,7 +465,7 @@ void GSSetupPrimCodeGenerator::Color()
 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym0, XYm(4 + i), xym2);
 			else
-				vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
+				vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
 			cvttps2dq(xym0, xym0);
 			packssdw(xym0, xym0);
 
@@ -469,7 +474,7 @@ void GSSetupPrimCodeGenerator::Color()
 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym1, XYm(4 + i), xym3);
 			else
-				vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
+				vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
 			cvttps2dq(xym1, xym1);
 			packssdw(xym1, xym1);