From b6c1b3fb96e1f7ef0509678f1fd14e8edda46ee7 Mon Sep 17 00:00:00 2001
From: TellowKrinkle
Date: Wed, 11 May 2022 08:58:46 -0500
Subject: [PATCH] GS: Add double operations to GSVector/GSNewCodeGenerator

---
 pcsx2/GS/GSVector.cpp                      |  2 +
 pcsx2/GS/GSVector4.h                       | 62 ++++++++++++++++++++++
 pcsx2/GS/GSVector8.h                       | 57 ++++++++++++++++++++
 pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h | 20 +++++++
 4 files changed, 141 insertions(+)

diff --git a/pcsx2/GS/GSVector.cpp b/pcsx2/GS/GSVector.cpp
index fb9476d44c..deb00ac09d 100644
--- a/pcsx2/GS/GSVector.cpp
+++ b/pcsx2/GS/GSVector.cpp
@@ -67,6 +67,7 @@ CONSTINIT const GSVector4 GSVector4::m_two = cxpr(2.0f);
 CONSTINIT const GSVector4 GSVector4::m_four = cxpr(4.0f);
 CONSTINIT const GSVector4 GSVector4::m_x4b000000 = cxpr(0x4b000000);
 CONSTINIT const GSVector4 GSVector4::m_x4f800000 = cxpr(0x4f800000);
+CONSTINIT const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
 CONSTINIT const GSVector4 GSVector4::m_max = cxpr(FLT_MAX);
 CONSTINIT const GSVector4 GSVector4::m_min = cxpr(FLT_MIN);
 
@@ -78,6 +79,7 @@ CONSTINIT const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff);
 CONSTINIT const GSVector8 GSVector8::m_x80000000 = cxpr(0x80000000);
 CONSTINIT const GSVector8 GSVector8::m_x4b000000 = cxpr(0x4b000000);
 CONSTINIT const GSVector8 GSVector8::m_x4f800000 = cxpr(0x4f800000);
+CONSTINIT const GSVector8 GSVector8::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
 CONSTINIT const GSVector8 GSVector8::m_max = cxpr(FLT_MAX);
 CONSTINIT const GSVector8 GSVector8::m_min = cxpr(FLT_MIN);
 
diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
index 0c1e70533d..bd5ed47e67 100644
--- a/pcsx2/GS/GSVector4.h
+++ b/pcsx2/GS/GSVector4.h
@@ -28,6 +28,11 @@ class alignas(16) GSVector4
 	{
 	}
 
+	constexpr GSVector4(cxpr_init_tag, u64 x, u64 y)
+		: U64{x, y}
+	{
+	}
+
 public:
 	union
 	{
@@ -36,6 +41,7 @@ public:
 		struct { float left, top, right, bottom; };
 		float v[4];
 		float F32[4];
+		double F64[2];
 		s8 I8[16];
 		s16 I16[8];
 		s32 I32[4];
@@ -55,6 +61,7 @@ public:
 	static const GSVector4 m_four;
 	static const GSVector4 m_x4b000000;
 	static const GSVector4 m_x4f800000;
+	static const GSVector4 m_xc1e00000000fffff;
 	static const GSVector4 m_max;
 	static const GSVector4 m_min;
 
@@ -82,6 +89,16 @@ public:
 		return GSVector4(cxpr_init, x, x, x, x);
 	}
 
+	constexpr static GSVector4 cxpr64(u64 x, u64 y)
+	{
+		return GSVector4(cxpr_init, x, y);
+	}
+
+	constexpr static GSVector4 cxpr64(u64 x)
+	{
+		return GSVector4(cxpr_init, x, x);
+	}
+
 	__forceinline GSVector4(float x, float y, float z, float w)
 	{
 		m = _mm_set_ps(w, z, y, x);
@@ -119,6 +136,11 @@ public:
 	{
 	}
 
+	__forceinline explicit GSVector4(__m128d m)
+		: m(_mm_castpd_ps(m))
+	{
+	}
+
 	__forceinline explicit GSVector4(float f)
 	{
 		*this = f;
@@ -162,6 +184,11 @@ public:
 
 #endif
 
+	__forceinline static GSVector4 f64(double x, double y)
+	{
+		return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x)));
+	}
+
 	__forceinline void operator=(const GSVector4& v)
 	{
 		m = v.m;
@@ -858,6 +885,36 @@
 		return GSVector4(_mm_cmple_ps(v1, v2));
 	}
 
+	__forceinline GSVector4 mul64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector4 add64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector4 sub64(const GSVector4& v) const
+	{
+		return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
+	}
+
+	__forceinline static GSVector4 f32to64(const GSVector4& v)
+	{
+		return GSVector4(_mm_cvtps_pd(v.m));
+	}
+
+	__forceinline static GSVector4 f32to64(const void* p)
+	{
+		return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
+	}
+
+	__forceinline GSVector4i f64toi32(bool truncate = true) const
+	{
+		return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
+	}
+
 	// clang-format off
 
 	#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@@ -907,4 +964,9 @@
 	}
 
 #endif
+
+	__forceinline static GSVector4 broadcast64(const void* d)
+	{
+		return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
+	}
 };
diff --git a/pcsx2/GS/GSVector8.h b/pcsx2/GS/GSVector8.h
index 6a43ab9527..fa3f5d2c77 100644
--- a/pcsx2/GS/GSVector8.h
+++ b/pcsx2/GS/GSVector8.h
@@ -32,6 +32,11 @@ class alignas(32) GSVector8
 	{
 	}
 
+	constexpr GSVector8(cxpr_init_tag, u64 x, u64 y, u64 z, u64 w)
+		: U64{x, y, z, w}
+	{
+	}
+
 public:
 	union
 	{
@@ -39,6 +44,7 @@ public:
 		struct { float r0, g0, b0, a0, r1, g1, b1, a1; };
 		float v[8];
 		float F32[8];
+		double F64[4];
 		s8 I8[32];
 		s16 I16[16];
 		s32 I32[8];
@@ -57,6 +63,7 @@ public:
 	static const GSVector8 m_x80000000;
 	static const GSVector8 m_x4b000000;
 	static const GSVector8 m_x4f800000;
+	static const GSVector8 m_xc1e00000000fffff;
 	static const GSVector8 m_max;
 	static const GSVector8 m_min;
 
@@ -87,6 +94,16 @@ public:
 		return cxpr(static_cast<int>(x));
 	}
 
+	constexpr static GSVector8 cxpr64(u64 x, u64 y, u64 z, u64 w)
+	{
+		return GSVector8(cxpr_init, x, y, z, w);
+	}
+
+	constexpr static GSVector8 cxpr64(u64 x)
+	{
+		return GSVector8(cxpr_init, x, x, x, x);
+	}
+
 	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
 	{
 		m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
@@ -142,6 +159,11 @@ public:
 	{
 	}
 
+	__forceinline explicit GSVector8(__m256d m)
+		: m(_mm256_castpd_ps(m))
+	{
+	}
+
 #if _M_SSE >= 0x501
 
 	__forceinline explicit GSVector8(const GSVector8i& v);
@@ -773,6 +795,36 @@ public:
 		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
 	}
 
+	__forceinline GSVector8 mul64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_mul_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector8 add64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_add_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline GSVector8 sub64(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_sub_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
+	}
+
+	__forceinline static GSVector8 f32to64(const GSVector4& v)
+	{
+		return GSVector8(_mm256_cvtps_pd(v.m));
+	}
+
+	__forceinline static GSVector8 f32to64(const void* p)
+	{
+		return GSVector8(_mm256_cvtps_pd(_mm_load_ps(static_cast<const float*>(p))));
+	}
+
+	__forceinline GSVector4i f64toi32(bool truncate = true) const
+	{
+		return GSVector4i(truncate ? _mm256_cvttpd_epi32(_mm256_castps_pd(m)) : _mm256_cvtpd_epi32(_mm256_castps_pd(m)));
+	}
+
 	// clang-format off
 
 	// x = v[31:0] / v[159:128]
@@ -888,6 +940,11 @@
 		return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f)));
 	}
 
+	__forceinline static GSVector8 broadcast64(const void* d)
+	{
+		return GSVector8(_mm256_broadcast_sd(static_cast<const double*>(d)));
+	}
+
 	// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
 
 #endif
diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
index 297b341511..86246a3f2d 100644
--- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
+++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
@@ -312,9 +312,18 @@ public:
 	FORWARD_JUMP(jmp)
 
 	AFORWARD(2, addps, ARGS_XO)
+	AFORWARD(2, addpd, ARGS_XO)
 	SFORWARD(2, cvtdq2ps, ARGS_XO)
+	SFORWARD(2, cvtpd2dq, ARGS_XO)
+	SFORWARD(2, cvtpd2ps, ARGS_XO)
+	SFORWARD(2, cvttpd2dq, ARGS_XO)
 	SFORWARD(2, cvtps2dq, ARGS_XO)
+	SFORWARD(2, cvtps2pd, ARGS_XO)
+	SFORWARD(2, cvtsd2si, const AddressReg&, const Operand&)
+	AFORWARD(2, cvtsd2ss, ARGS_XO)
+	AFORWARD(2, cvtss2sd, ARGS_XO)
 	SFORWARD(2, cvttps2dq, ARGS_XO)
+	SFORWARD(2, cvttsd2si, const AddressReg&, const Operand&)
 	SFORWARD(3, extractps, const Operand&, const Xmm&, u8)
 	AFORWARD(2, maxps, ARGS_XO)
 	AFORWARD(2, minps, ARGS_XO)
@@ -324,13 +333,21 @@
 	SFORWARD(2, movd, const Reg32&, const Xmm&)
 	SFORWARD(2, movd, const Xmm&, const Address&)
 	SFORWARD(2, movd, const Xmm&, const Reg32&)
+	SFORWARD(2, movddup, ARGS_XO)
 	SFORWARD(2, movdqa, ARGS_XO)
 	SFORWARD(2, movdqa, const Address&, const Xmm&)
 	SFORWARD(2, movhps, ARGS_XO)
 	SFORWARD(2, movhps, const Address&, const Xmm&)
 	SFORWARD(2, movq, const Address&, const Xmm&)
 	SFORWARD(2, movq, const Xmm&, const Address&)
+	SFORWARD(2, movsd, const Address&, const Xmm&)
+	SFORWARD(2, movsd, const Xmm&, const Address&)
+	SFORWARD(2, movss, const Address&, const Xmm&)
+	SFORWARD(2, movss, const Xmm&, const Address&)
+	AFORWARD(2, mulpd, ARGS_XO)
 	AFORWARD(2, mulps, ARGS_XO)
+	AFORWARD(2, mulsd, ARGS_XO)
+	AFORWARD(2, mulss, ARGS_XO)
 	AFORWARD(2, orps, ARGS_XO)
 	AFORWARD(2, packssdw, ARGS_XO)
 	AFORWARD(2, packusdw, ARGS_XO)
@@ -382,11 +399,14 @@
 	SFORWARD(2, rcpps, ARGS_XO)
 	AFORWARD(3, shufps, ARGS_XOI)
 	AFORWARD(2, subps, ARGS_XO)
+	AFORWARD(2, unpcklps, ARGS_XO)
+	AFORWARD(2, unpcklpd, ARGS_XO)
 	AFORWARD(2, xorps, ARGS_XO)
 
 	FORWARD_SSE_XMM0(pblendvb)
 
 	FORWARD(2, AVX, vbroadcastss, ARGS_XO)
+	FORWARD(2, AVX, vbroadcastsd, const Ymm&, const Address&)
 	FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
 	FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
 	FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)
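
Reviewer note (not part of the patch): a minimal sketch of how the new
double-precision helpers compose. The GSVector4 members used below
(f32to64, f64, mul64, add64, f64toi32) are the APIs added above; the
wrapper function, its constants, and the include path are hypothetical
illustrations, not code from this change.

	#include "GS/GSVector.h"

	// Widen the low two floats of v to doubles, evaluate v * scale + bias
	// at double precision, then convert back to s32 with truncation.
	static GSVector4i scale_to_int_f64(const GSVector4& v)
	{
		const GSVector4 x     = GSVector4::f32to64(v);            // _mm_cvtps_pd: 2 floats -> 2 doubles
		const GSVector4 scale = GSVector4::f64(65536.0, 65536.0); // example constants, assumed
		const GSVector4 bias  = GSVector4::f64(0.5, 0.5);

		return x.mul64(scale).add64(bias).f64toi32(true);         // true -> _mm_cvttpd_epi32 (truncate)
	}

The GSVector8 equivalents (mul64/add64/sub64/f32to64/f64toi32) follow the
same pattern on four doubles at a time when AVX is available.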