GS: Add double operations to GSVector/GSNewCodeGenerator

This commit is contained in:
TellowKrinkle 2022-05-11 08:58:46 -05:00 committed by tellowkrinkle
parent 82de13d95a
commit b6c1b3fb96
4 changed files with 141 additions and 0 deletions

View File

@ -67,6 +67,7 @@ CONSTINIT const GSVector4 GSVector4::m_two = cxpr(2.0f);
CONSTINIT const GSVector4 GSVector4::m_four = cxpr(4.0f); CONSTINIT const GSVector4 GSVector4::m_four = cxpr(4.0f);
CONSTINIT const GSVector4 GSVector4::m_x4b000000 = cxpr(0x4b000000); CONSTINIT const GSVector4 GSVector4::m_x4b000000 = cxpr(0x4b000000);
CONSTINIT const GSVector4 GSVector4::m_x4f800000 = cxpr(0x4f800000); CONSTINIT const GSVector4 GSVector4::m_x4f800000 = cxpr(0x4f800000);
CONSTINIT const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
CONSTINIT const GSVector4 GSVector4::m_max = cxpr(FLT_MAX); CONSTINIT const GSVector4 GSVector4::m_max = cxpr(FLT_MAX);
CONSTINIT const GSVector4 GSVector4::m_min = cxpr(FLT_MIN); CONSTINIT const GSVector4 GSVector4::m_min = cxpr(FLT_MIN);
@ -78,6 +79,7 @@ CONSTINIT const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff);
CONSTINIT const GSVector8 GSVector8::m_x80000000 = cxpr(0x80000000); CONSTINIT const GSVector8 GSVector8::m_x80000000 = cxpr(0x80000000);
CONSTINIT const GSVector8 GSVector8::m_x4b000000 = cxpr(0x4b000000); CONSTINIT const GSVector8 GSVector8::m_x4b000000 = cxpr(0x4b000000);
CONSTINIT const GSVector8 GSVector8::m_x4f800000 = cxpr(0x4f800000); CONSTINIT const GSVector8 GSVector8::m_x4f800000 = cxpr(0x4f800000);
CONSTINIT const GSVector8 GSVector8::m_xc1e00000000fffff = cxpr64(0xc1e00000000fffffull);
CONSTINIT const GSVector8 GSVector8::m_max = cxpr(FLT_MAX); CONSTINIT const GSVector8 GSVector8::m_max = cxpr(FLT_MAX);
CONSTINIT const GSVector8 GSVector8::m_min = cxpr(FLT_MAX); CONSTINIT const GSVector8 GSVector8::m_min = cxpr(FLT_MAX);

View File

@ -28,6 +28,11 @@ class alignas(16) GSVector4
{ {
} }
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y)
: U64{x, y}
{
}
public: public:
union union
{ {
@ -36,6 +41,7 @@ public:
struct { float left, top, right, bottom; }; struct { float left, top, right, bottom; };
float v[4]; float v[4];
float F32[4]; float F32[4];
double F64[2];
s8 I8[16]; s8 I8[16];
s16 I16[8]; s16 I16[8];
s32 I32[4]; s32 I32[4];
@ -55,6 +61,7 @@ public:
static const GSVector4 m_four; static const GSVector4 m_four;
static const GSVector4 m_x4b000000; static const GSVector4 m_x4b000000;
static const GSVector4 m_x4f800000; static const GSVector4 m_x4f800000;
static const GSVector4 m_xc1e00000000fffff;
static const GSVector4 m_max; static const GSVector4 m_max;
static const GSVector4 m_min; static const GSVector4 m_min;
@ -82,6 +89,16 @@ public:
return GSVector4(cxpr_init, x, x, x, x); return GSVector4(cxpr_init, x, x, x, x);
} }
constexpr static GSVector4 cxpr64(u64 x, u64 y)
{
return GSVector4(cxpr_init, x, y);
}
constexpr static GSVector4 cxpr64(u64 x)
{
return GSVector4(cxpr_init, x, x);
}
__forceinline GSVector4(float x, float y, float z, float w) __forceinline GSVector4(float x, float y, float z, float w)
{ {
m = _mm_set_ps(w, z, y, x); m = _mm_set_ps(w, z, y, x);
@ -119,6 +136,11 @@ public:
{ {
} }
__forceinline explicit GSVector4(__m128d m)
: m(_mm_castpd_ps(m))
{
}
__forceinline explicit GSVector4(float f) __forceinline explicit GSVector4(float f)
{ {
*this = f; *this = f;
@ -162,6 +184,11 @@ public:
#endif #endif
__forceinline static GSVector4 f64(double x, double y)
{
return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x)));
}
__forceinline void operator=(const GSVector4& v) __forceinline void operator=(const GSVector4& v)
{ {
m = v.m; m = v.m;
@ -858,6 +885,36 @@ GSVector.h:2973:15: error: shadows template parm 'int i'
return GSVector4(_mm_cmple_ps(v1, v2)); return GSVector4(_mm_cmple_ps(v1, v2));
} }
__forceinline GSVector4 mul64(const GSVector4& v) const
{
return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
}
__forceinline GSVector4 add64(const GSVector4& v) const
{
return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
}
__forceinline GSVector4 sub64(const GSVector4& v) const
{
return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)));
}
__forceinline static GSVector4 f32to64(const GSVector4& v)
{
return GSVector4(_mm_cvtps_pd(v.m));
}
__forceinline static GSVector4 f32to64(const void* p)
{
return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
}
__forceinline GSVector4i f64toi32(bool truncate = true) const
{
return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
}
// clang-format off // clang-format off
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@ -907,4 +964,9 @@ GSVector.h:2973:15: error: shadows template parm 'int i'
} }
#endif #endif
__forceinline static GSVector4 broadcast64(const void* d)
{
return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
}
}; };

View File

@ -32,6 +32,11 @@ class alignas(32) GSVector8
{ {
} }
constexpr GSVector8(cxpr_init_tag, u64 x, u64 y, u64 z, u64 w)
: U64{x, y, z, w}
{
}
public: public:
union union
{ {
@ -39,6 +44,7 @@ public:
struct { float r0, g0, b0, a0, r1, g1, b1, a1; }; struct { float r0, g0, b0, a0, r1, g1, b1, a1; };
float v[8]; float v[8];
float F32[8]; float F32[8];
double F64[4];
s8 I8[32]; s8 I8[32];
s16 I16[16]; s16 I16[16];
s32 I32[8]; s32 I32[8];
@ -57,6 +63,7 @@ public:
static const GSVector8 m_x80000000; static const GSVector8 m_x80000000;
static const GSVector8 m_x4b000000; static const GSVector8 m_x4b000000;
static const GSVector8 m_x4f800000; static const GSVector8 m_x4f800000;
static const GSVector8 m_xc1e00000000fffff;
static const GSVector8 m_max; static const GSVector8 m_max;
static const GSVector8 m_min; static const GSVector8 m_min;
@ -87,6 +94,16 @@ public:
return cxpr(static_cast<int>(x)); return cxpr(static_cast<int>(x));
} }
constexpr static GSVector8 cxpr64(u64 x, u64 y, u64 z, u64 w)
{
return GSVector8(cxpr_init, x, y, z, w);
}
constexpr static GSVector8 cxpr64(u64 x)
{
return GSVector8(cxpr_init, x, x, x, x);
}
__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1) __forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
{ {
m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0); m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
@ -142,6 +159,11 @@ public:
{ {
} }
__forceinline explicit GSVector8(__m256d m)
: m(_mm256_castpd_ps(m))
{
}
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
__forceinline explicit GSVector8(const GSVector8i& v); __forceinline explicit GSVector8(const GSVector8i& v);
@ -773,6 +795,36 @@ public:
return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ)); return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
} }
__forceinline GSVector8 mul64(const GSVector8& v) const
{
return GSVector8(_mm256_mul_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
}
__forceinline GSVector8 add64(const GSVector8& v) const
{
return GSVector8(_mm256_add_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
}
__forceinline GSVector8 sub64(const GSVector8& v) const
{
return GSVector8(_mm256_sub_pd(_mm256_castps_pd(m), _mm256_castps_pd(v.m)));
}
__forceinline static GSVector8 f32to64(const GSVector4& v)
{
return GSVector8(_mm256_cvtps_pd(v.m));
}
__forceinline static GSVector8 f32to64(const void* p)
{
return GSVector8(_mm256_cvtps_pd(_mm_load_ps(static_cast<const float*>(p))));
}
__forceinline GSVector4i f64toi32(bool truncate = true) const
{
return GSVector4i(truncate ? _mm256_cvttpd_epi32(_mm256_castps_pd(m)) : _mm256_cvtpd_epi32(_mm256_castps_pd(m)));
}
// clang-format off // clang-format off
// x = v[31:0] / v[159:128] // x = v[31:0] / v[159:128]
@ -888,6 +940,11 @@ public:
return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f))); return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f)));
} }
__forceinline static GSVector8 broadcast64(const void* d)
{
return GSVector8(_mm256_broadcast_sd(static_cast<const double*>(d)));
}
// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element // TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
#endif #endif

View File

@ -312,9 +312,18 @@ public:
FORWARD_JUMP(jmp) FORWARD_JUMP(jmp)
AFORWARD(2, addps, ARGS_XO) AFORWARD(2, addps, ARGS_XO)
AFORWARD(2, addpd, ARGS_XO)
SFORWARD(2, cvtdq2ps, ARGS_XO) SFORWARD(2, cvtdq2ps, ARGS_XO)
SFORWARD(2, cvtpd2dq, ARGS_XO)
SFORWARD(2, cvtpd2ps, ARGS_XO)
SFORWARD(2, cvttpd2dq, ARGS_XO)
SFORWARD(2, cvtps2dq, ARGS_XO) SFORWARD(2, cvtps2dq, ARGS_XO)
SFORWARD(2, cvtps2pd, ARGS_XO)
SFORWARD(2, cvtsd2si, const AddressReg&, const Operand&);
AFORWARD(2, cvtsd2ss, ARGS_XO)
AFORWARD(2, cvtss2sd, ARGS_XO)
SFORWARD(2, cvttps2dq, ARGS_XO) SFORWARD(2, cvttps2dq, ARGS_XO)
SFORWARD(2, cvttsd2si, const AddressReg&, const Operand&);
SFORWARD(3, extractps, const Operand&, const Xmm&, u8) SFORWARD(3, extractps, const Operand&, const Xmm&, u8)
AFORWARD(2, maxps, ARGS_XO) AFORWARD(2, maxps, ARGS_XO)
AFORWARD(2, minps, ARGS_XO) AFORWARD(2, minps, ARGS_XO)
@ -324,13 +333,21 @@ public:
SFORWARD(2, movd, const Reg32&, const Xmm&) SFORWARD(2, movd, const Reg32&, const Xmm&)
SFORWARD(2, movd, const Xmm&, const Address&) SFORWARD(2, movd, const Xmm&, const Address&)
SFORWARD(2, movd, const Xmm&, const Reg32&) SFORWARD(2, movd, const Xmm&, const Reg32&)
SFORWARD(2, movddup, ARGS_XO);
SFORWARD(2, movdqa, ARGS_XO) SFORWARD(2, movdqa, ARGS_XO)
SFORWARD(2, movdqa, const Address&, const Xmm&) SFORWARD(2, movdqa, const Address&, const Xmm&)
SFORWARD(2, movhps, ARGS_XO) SFORWARD(2, movhps, ARGS_XO)
SFORWARD(2, movhps, const Address&, const Xmm&) SFORWARD(2, movhps, const Address&, const Xmm&)
SFORWARD(2, movq, const Address&, const Xmm&) SFORWARD(2, movq, const Address&, const Xmm&)
SFORWARD(2, movq, const Xmm&, const Address&) SFORWARD(2, movq, const Xmm&, const Address&)
SFORWARD(2, movsd, const Address&, const Xmm&)
SFORWARD(2, movsd, const Xmm&, const Address&)
SFORWARD(2, movss, const Address&, const Xmm&)
SFORWARD(2, movss, const Xmm&, const Address&)
AFORWARD(2, mulpd, ARGS_XO)
AFORWARD(2, mulps, ARGS_XO) AFORWARD(2, mulps, ARGS_XO)
AFORWARD(2, mulsd, ARGS_XO)
AFORWARD(2, mulss, ARGS_XO)
AFORWARD(2, orps, ARGS_XO) AFORWARD(2, orps, ARGS_XO)
AFORWARD(2, packssdw, ARGS_XO) AFORWARD(2, packssdw, ARGS_XO)
AFORWARD(2, packusdw, ARGS_XO) AFORWARD(2, packusdw, ARGS_XO)
@ -382,11 +399,14 @@ public:
SFORWARD(2, rcpps, ARGS_XO) SFORWARD(2, rcpps, ARGS_XO)
AFORWARD(3, shufps, ARGS_XOI) AFORWARD(3, shufps, ARGS_XOI)
AFORWARD(2, subps, ARGS_XO) AFORWARD(2, subps, ARGS_XO)
AFORWARD(2, unpcklps, ARGS_XO)
AFORWARD(2, unpcklpd, ARGS_XO)
AFORWARD(2, xorps, ARGS_XO) AFORWARD(2, xorps, ARGS_XO)
FORWARD_SSE_XMM0(pblendvb) FORWARD_SSE_XMM0(pblendvb)
FORWARD(2, AVX, vbroadcastss, ARGS_XO) FORWARD(2, AVX, vbroadcastss, ARGS_XO)
FORWARD(2, AVX, vbroadcastsd, const Ymm&, const Address&)
FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&) FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&) FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
FORWARD(3, FMA, vfmadd213ps, ARGS_XXO) FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)