From b6c1b3fb96e1f7ef0509678f1fd14e8edda46ee7 Mon Sep 17 00:00:00 2001
From: TellowKrinkle
Date: Wed, 11 May 2022 08:58:46 -0500
Subject: [PATCH] GS: Add double operations to GSVector/GSNewCodeGenerator

---
 pcsx2/GS/GSVector.cpp                      |  2 +
 pcsx2/GS/GSVector4.h                       | 62 ++++++++++++++++++++++
 pcsx2/GS/GSVector8.h                       | 57 ++++++++++++++++++++
 pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h | 20 +++++++
 4 files changed, 141 insertions(+)

diff --git a/pcsx2/GS/GSVector.cpp b/pcsx2/GS/GSVector.cpp
index fb9476d44c..deb00ac09d 100644
--- a/pcsx2/GS/GSVector.cpp
+++ b/pcsx2/GS/GSVector.cpp
@@ -67,6 +67,7 @@ CONSTINIT const GSVector4 GSVector4::m_two = cxpr(2.0f);
 CONSTINIT const GSVector4 GSVector4::m_four = cxpr(4.0f);
 CONSTINIT const GSVector4 GSVector4::m_x4b000000 = cxpr(0x4b000000);
 CONSTINIT const GSVector4 GSVector4::m_x4f800000 = cxpr(0x4f800000);
+CONSTINIT const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
 CONSTINIT const GSVector4 GSVector4::m_max = cxpr(FLT_MAX);
 CONSTINIT const GSVector4 GSVector4::m_min = cxpr(FLT_MIN);
 
@@ -78,6 +79,7 @@ CONSTINIT const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff);
 CONSTINIT const GSVector8 GSVector8::m_x80000000 = cxpr(0x80000000);
 CONSTINIT const GSVector8 GSVector8::m_x4b000000 = cxpr(0x4b000000);
 CONSTINIT const GSVector8 GSVector8::m_x4f800000 = cxpr(0x4f800000);
+CONSTINIT const GSVector8 GSVector8::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
 CONSTINIT const GSVector8 GSVector8::m_max = cxpr(FLT_MAX);
 CONSTINIT const GSVector8 GSVector8::m_min = cxpr(FLT_MIN);
 
diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
index 0c1e70533d..bd5ed47e67 100644
--- a/pcsx2/GS/GSVector4.h
+++ b/pcsx2/GS/GSVector4.h
@@ -28,6 +28,11 @@ class alignas(16) GSVector4
 	{
 	}
 
+	constexpr GSVector4(cxpr_init_tag, u64 x, u64 y)
+		: U64{x, y}
+	{
+	}
+
 public:
 	union
 	{
@@ -36,6 +41,7 @@ public:
 		struct { float left, top, right, bottom; };
 		float v[4];
 		float F32[4];
+		double F64[2];
 		s8 I8[16];
 		s16 I16[8];
 		s32 I32[4];
@@ -55,6 +61,7 @@ public:
 	static const GSVector4 m_four;
 	static const GSVector4 m_x4b000000;
 	static const GSVector4 m_x4f800000;
+	static const GSVector4 m_xc1e00000000fffff;
 	static const GSVector4 m_max;
 	static const GSVector4 m_min;
 
@@ -82,6 +89,16 @@ public:
 		return GSVector4(cxpr_init, x, x, x, x);
 	}
 
+	constexpr static GSVector4 cxpr64(u64 x, u64 y)
+	{
+		return GSVector4(cxpr_init, x, y);
+	}
+
+	constexpr static GSVector4 cxpr64(u64 x)
+	{
+		return GSVector4(cxpr_init, x, x);
+	}
+
 	__forceinline GSVector4(float x, float y, float z, float w)
 	{
 		m = _mm_set_ps(w, z, y, x);
@@ -119,6 +136,11 @@ public:
 	{
 	}
 
+	__forceinline explicit GSVector4(__m128d m)
+		: m(_mm_castpd_ps(m))
+	{
+	}
+
 	__forceinline explicit GSVector4(float f)
 	{
 		*this = f;
@@ -162,6 +184,11 @@ public:
 
 #endif
 
+	__forceinline static GSVector4 f64(double x, double y)
+	{
+		return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x)));
+	}
+
 	__forceinline void operator=(const GSVector4& v)
 	{
 		m = v.m;
@@ -858,6 +885,36 @@
 		return GSVector4(_mm_cmple_ps(v1, v2));
 	}
 
+	__forceinline GSVector4 mul64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector4 add64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector4 sub64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline static GSVector4 f32to64(const GSVector4& v)
+	{
+		return GSVector4(_mm_cvtps_pd(v.m));
+	}
+
+	__forceinline static GSVector4 f32to64(const void* p)
+	{
+		return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
+	}
+
+	__forceinline GSVector4i f64toi32(bool truncate = true) const
+	{
+		return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
+	}
+
 	// clang-format off
 
 	#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@@ -907,4 +964,9 @@
 	}
 
 #endif
+
+	__forceinline static GSVector4 broadcast64(const void* d)
+	{
+		return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
+	}
 };
diff --git a/pcsx2/GS/GSVector8.h b/pcsx2/GS/GSVector8.h
index 6a43ab9527..fa3f5d2c77 100644
--- a/pcsx2/GS/GSVector8.h
+++ b/pcsx2/GS/GSVector8.h
@@ -32,6 +32,11 @@ class alignas(32) GSVector8
 	{
 	}
 
+	constexpr GSVector8(cxpr_init_tag, u64 x, u64 y, u64 z, u64 w)
+		: U64{x, y, z, w}
+	{
+	}
+
 public:
 	union
 	{
@@ -39,6 +44,7 @@ public:
 		struct { float r0, g0, b0, a0, r1, g1, b1, a1; };
 		float v[8];
 		float F32[8];
+		double F64[4];
 		s8 I8[32];
 		s16 I16[16];
 		s32 I32[8];
@@ -57,6 +63,7 @@ public:
 	static const GSVector8 m_x80000000;
 	static const GSVector8 m_x4b000000;
 	static const GSVector8 m_x4f800000;
+	static const GSVector8 m_xc1e00000000fffff;
 	static const GSVector8 m_max;
 	static const GSVector8 m_min;
 
@@ -87,6 +94,16 @@ public:
 		return cxpr(static_cast<int>(x));
 	}
 
+	constexpr static GSVector8 cxpr64(u64 x, u64 y, u64 z, u64 w)
+	{
+		return GSVector8(cxpr_init, x, y, z, w);
+	}
+
+	constexpr static GSVector8 cxpr64(u64 x)
+	{
+		return GSVector8(cxpr_init, x, x, x, x);
+	}
+
 	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
 	{
 		m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
@@ -142,6 +159,11 @@ public:
 	{
 	}
 
+	__forceinline explicit GSVector8(__m256d m)
+		: m(_mm256_castpd_ps(m))
+	{
+	}
+
 #if _M_SSE >= 0x501
 
 	__forceinline explicit GSVector8(const GSVector8i& v);
@@ -773,6 +795,36 @@ public:
 		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
 	}
 
+	__forceinline GSVector8 mul64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_mul_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector8 add64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_add_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector8 sub64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_sub_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline static GSVector8 f32to64(const GSVector4& v)
+	{
+		return GSVector8(_mm256_cvtps_pd(v.m));
+	}
+
+	__forceinline static GSVector8 f32to64(const void* p)
+	{
+		return GSVector8(_mm256_cvtps_pd(_mm_load_ps(static_cast<const float*>(p))));
+	}
+
+	__forceinline GSVector4i f64toi32(bool truncate = true) const
+	{
+		return GSVector4i(truncate ? _mm256_cvttpd_epi32(_mm256_castps_pd(m)) : _mm256_cvtpd_epi32(_mm256_castps_pd(m)));
+	}
+
 	// clang-format off
 
 	// x = v[31:0] / v[159:128]
@@ -888,6 +940,11 @@
 		return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f)));
 	}
 
+	__forceinline static GSVector8 broadcast64(const void* d)
+	{
+		return GSVector8(_mm256_broadcast_sd(static_cast<const double*>(d)));
+	}
+
 	// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
 
 #endif
diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
index 297b341511..86246a3f2d 100644
--- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
+++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
@@ -312,9 +312,18 @@ public:
 	FORWARD_JUMP(jmp)
 
 	AFORWARD(2, addps, ARGS_XO)
+	AFORWARD(2, addpd, ARGS_XO)
 	SFORWARD(2, cvtdq2ps, ARGS_XO)
+	SFORWARD(2, cvtpd2dq, ARGS_XO)
+	SFORWARD(2, cvtpd2ps, ARGS_XO)
+	SFORWARD(2, cvttpd2dq, ARGS_XO)
 	SFORWARD(2, cvtps2dq, ARGS_XO)
+	SFORWARD(2, cvtps2pd, ARGS_XO)
+	SFORWARD(2, cvtsd2si, const AddressReg&, const Operand&)
+	AFORWARD(2, cvtsd2ss, ARGS_XO)
+	AFORWARD(2, cvtss2sd, ARGS_XO)
 	SFORWARD(2, cvttps2dq, ARGS_XO)
+	SFORWARD(2, cvttsd2si, const AddressReg&, const Operand&)
 	SFORWARD(3, extractps, const Operand&, const Xmm&, u8)
 	AFORWARD(2, maxps, ARGS_XO)
 	AFORWARD(2, minps, ARGS_XO)
@@ -324,13 +333,21 @@
 	SFORWARD(2, movd, const Reg32&, const Xmm&)
 	SFORWARD(2, movd, const Xmm&, const Address&)
 	SFORWARD(2, movd, const Xmm&, const Reg32&)
+	SFORWARD(2, movddup, ARGS_XO)
 	SFORWARD(2, movdqa, ARGS_XO)
 	SFORWARD(2, movdqa, const Address&, const Xmm&)
 	SFORWARD(2, movhps, ARGS_XO)
 	SFORWARD(2, movhps, const Address&, const Xmm&)
 	SFORWARD(2, movq, const Address&, const Xmm&)
 	SFORWARD(2, movq, const Xmm&, const Address&)
+	SFORWARD(2, movsd, const Address&, const Xmm&)
+	SFORWARD(2, movsd, const Xmm&, const Address&)
+	SFORWARD(2, movss, const Address&, const Xmm&)
+	SFORWARD(2, movss, const Xmm&, const Address&)
+	AFORWARD(2, mulpd, ARGS_XO)
 	AFORWARD(2, mulps, ARGS_XO)
+	AFORWARD(2, mulsd, ARGS_XO)
+	AFORWARD(2, mulss, ARGS_XO)
 	AFORWARD(2, orps, ARGS_XO)
 	AFORWARD(2, packssdw, ARGS_XO)
 	AFORWARD(2, packusdw, ARGS_XO)
@@ -382,11 +399,14 @@
 	SFORWARD(2, rcpps, ARGS_XO)
 	AFORWARD(3, shufps, ARGS_XOI)
 	AFORWARD(2, subps, ARGS_XO)
+	AFORWARD(2, unpcklps, ARGS_XO)
+	AFORWARD(2, unpcklpd, ARGS_XO)
 	AFORWARD(2, xorps, ARGS_XO)
 
 	FORWARD_SSE_XMM0(pblendvb)
 
 	FORWARD(2, AVX, vbroadcastss, ARGS_XO)
+	FORWARD(2, AVX, vbroadcastsd, const Ymm&, const Address&)
 	FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
 	FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
 	FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)
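
Reviewer note (not part of the patch): a minimal sketch of how the new
double-precision helpers compose. The GSVector4 members used below
(f32to64, f64, mul64, add64, f64toi32) are the APIs added above; the
wrapper function, its constants, and the include path are hypothetical
illustrations, not code from this change.

	#include "GS/GSVector.h"

	// Widen the low two floats of v to doubles, evaluate v * scale + bias
	// at double precision, then convert back to s32 with truncation.
	static GSVector4i scale_to_int_f64(const GSVector4& v)
	{
		const GSVector4 x     = GSVector4::f32to64(v);            // _mm_cvtps_pd: 2 floats -> 2 doubles
		const GSVector4 scale = GSVector4::f64(65536.0, 65536.0); // example constants, assumed
		const GSVector4 bias  = GSVector4::f64(0.5, 0.5);

		return x.mul64(scale).add64(bias).f64toi32(true);         // true -> _mm_cvttpd_epi32 (truncate)
	}

The GSVector8 equivalents (mul64/add64/sub64/f32to64/f64toi32) follow the
same pattern on four doubles at a time when AVX is available.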