diff --git a/src/common/gsvector.h b/src/common/gsvector.h index fdef42676..0054699a0 100644 --- a/src/common/gsvector.h +++ b/src/common/gsvector.h @@ -1,6 +1,10 @@ // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) +// +// Lightweight wrapper over native SIMD types for cross-platform vector code. +// + #pragma once #include "common/intrin.h" diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 874371651..e4991af5e 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -284,8 +284,6 @@ public: vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s))))); } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s)))); @@ -537,16 +535,6 @@ public: return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); } - ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const - { - return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); - } - - ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const - { - return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); - } - ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); @@ -828,22 +816,11 @@ public: ALWAYS_INLINE operator float32x2_t() const { return v2s; } ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); } - ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); } - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); } - - ALWAYS_INLINE GSVector2 rcpnr() const - { - float32x2_t recip = vrecpe_f32(v2s); - recip = vmul_f32(recip, vrecps_f32(recip, v2s)); - return GSVector2(recip); - } - #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } - ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } #else @@ -1160,16 +1137,8 @@ public: // rect - ALWAYS_INLINE int width() const { return right - left; } - - ALWAYS_INLINE int height() const { return bottom - top; } - - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } + ALWAYS_INLINE s32 width() const { return right - left; } + ALWAYS_INLINE s32 height() const { return bottom - top; } ALWAYS_INLINE bool rempty() const { @@ -1456,8 +1425,6 @@ public: vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s))))); } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { #ifdef CPU_ARCH_ARM64 @@ -2004,50 +1971,6 @@ public: ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return 
a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { const int32x4_t res = veorq_s32(v4s, v.v4s); @@ -2355,36 +2278,39 @@ public: ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); } - // clang-format off +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \ + { \ + return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \ + } +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3); -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ - - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -2400,6 +2326,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -2442,6 +2370,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float 
z, float w) { const float arr[4] = {x, y, z, w}; @@ -2475,12 +2407,28 @@ public: ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); -#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE static GSVector4 f64(double x, double y) { +#ifdef CPU_ARCH_ARM64 return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); - } +#else + GSVector4 ret; + ret.F64[0] = x; + ret.F64[1] = y; + return ret; #endif + } + + ALWAYS_INLINE static GSVector4 f64(double x) + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x))); +#else + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; +#endif + } ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } @@ -2498,15 +2446,6 @@ public: ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); } - ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); } - - ALWAYS_INLINE GSVector4 rcpnr() const - { - float32x4_t recip = vrecpeq_f32(v4s); - recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); - return GSVector4(recip); - } - #ifdef _M_ARM64 ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } @@ -2529,27 +2468,6 @@ public: #endif - ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const - { - return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); - } - ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const - { - return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s)); - } - ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; } - ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; } - - ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const - { - return a.madd(b, *this); // *this + a * b - } - - ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const - { - return a.nmadd(b, *this); // *this - a * b - } - #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } @@ -2729,6 +2647,28 @@ public: return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst))); +#else + GSVector4 ret; + ret.F64[dst] = v; + return ret; +#endif + } + + template + ALWAYS_INLINE double extract64() const + { +#ifdef CPU_ARCH_ARM64 + return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src); +#else + return F64[src]; +#endif + } + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } @@ -2903,73 +2843,219 @@ public: return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); } -#ifdef CPU_ARCH_ARM64 - // Not in ARM32 - ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const { - return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]); +#endif } ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const { - return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return 
GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
+#endif
   }
 
   ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
   {
-    return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
+
+  ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
+
+  ALWAYS_INLINE GSVector4 sqrt64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 sqr64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 floor64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
+#endif
   }
 
   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
+#else
+    return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
+#endif
   }
 
   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
+#else
+    const float* fp = static_cast<const float*>(p);
+    return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
+#endif
   }
 
   ALWAYS_INLINE GSVector4i f64toi32() const
   {
+#ifdef CPU_ARCH_ARM64
     const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
     const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
+#else
+    const s32 low = static_cast<s32>(F64[0]);
+    const s32 high = static_cast<s32>(F64[1]);
+#endif
     return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
   }
 
-#endif
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
+  { \
+    return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
+  } \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
+  { \
+    return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
+  }
 
-  // clang-format off
+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
 
-#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
+#define
VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + VECTOR4_SHUFFLE_1(x, 0); + VECTOR4_SHUFFLE_1(y, 1); + VECTOR4_SHUFFLE_1(z, 2); + VECTOR4_SHUFFLE_1(w, 3); -#define VECTOR4_SHUFFLE_1(xs, xn) \ - VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ - - VECTOR4_SHUFFLE_1(x, 0) - VECTOR4_SHUFFLE_1(y, 1) - VECTOR4_SHUFFLE_1(z, 2) - VECTOR4_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4_SHUFFLE_1 +#undef VECTOR4_SHUFFLE_2 +#undef VECTOR4_SHUFFLE_3 +#undef VECTOR4_SHUFFLE_4 ALWAYS_INLINE GSVector4 broadcast32() const { diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index 8f2f26472..b460c4ca8 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -235,8 +235,6 @@ public: return ret; } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - GSVector2i shuffle8(const GSVector2i& mask) const { ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf])); @@ -601,15 +599,6 @@ public: GSVector2 neg() const { return GSVector2(-x, -y); } - GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); } - - GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); } GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); } @@ -973,16 +962,8 @@ public: // rect ALWAYS_INLINE s32 width() const { return right - left; } - ALWAYS_INLINE s32 height() const { return bottom - top; } - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } - ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } // TODO: Optimize for no-simd, this generates crap code. @@ -1185,8 +1166,6 @@ public: return ret; } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - GSVector4i shuffle8(const GSVector4i& mask) const { ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 
0 : (S8[mask.S8[i] & 0xf])); @@ -1447,64 +1426,14 @@ public: GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); } - GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); } - - GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } - GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); } - GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); } - GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); } GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); } GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if constexpr (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; } GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? 
-1 : 0); } @@ -1746,36 +1675,36 @@ public: ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); } ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); } - // clang-format off - // l/h/lh not implemented until needed +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]); } -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]);} +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3); - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -1791,6 +1720,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1832,6 +1763,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { this->x = x; @@ -1881,6 +1816,13 @@ public: return ret; } + ALWAYS_INLINE static GSVector4 f64(double x) + { + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; + } + ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } u32 rgba32() const { return GSVector4i(*this).rgba32(); } @@ -1893,37 +1835,10 @@ public: GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); } - GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); } - - GSVector4 rcpnr() const - { - GSVector4 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), 
std::floor(w)); } GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); } - GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; } - - GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; } - - GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; } - - GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; } - - GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const - { - return a_.madd(b_, *this); // *this + a * b - } - - GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const - { - return a_.nmadd(b_, *this); // *this - a * b - } - GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); } GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); } @@ -2045,6 +1960,20 @@ public: return I32[i]; } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { + GSVector4 ret; + ret.F64[dst] = v; + return ret; + } + + template + ALWAYS_INLINE double extract64() const + { + return F64[src]; + } + ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); } ALWAYS_INLINE static constexpr GSVector4 xffffffff() @@ -2300,6 +2229,71 @@ public: return ret; } + ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const + { + return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]); + } + + ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] <= v.F64[1]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const + { + return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const + { + return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast(0x7FFFFFFFFFFFFFFFULL)); } + + ALWAYS_INLINE GSVector4 neg64() const {return *this ^ GSVector4::cxpr64(static_cast(0x8000000000000000ULL(); } + + ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); } + + ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); } + + ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); } + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { GSVector4 ret; @@ -2323,36 +2317,40 @@ public: return GSVector4i(static_cast(F64[0]), static_cast(F64[1]), 0, 0); } - // clang-format off +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \ + { \ + return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); \ + } -#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); } +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4_SHUFFLE_1(xs, xn) \ - VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4_SHUFFLE_1(x, 0); + VECTOR4_SHUFFLE_1(y, 1); + VECTOR4_SHUFFLE_1(z, 2); + VECTOR4_SHUFFLE_1(w, 3); - VECTOR4_SHUFFLE_1(x, 0) - VECTOR4_SHUFFLE_1(y, 1) - VECTOR4_SHUFFLE_1(z, 2) - VECTOR4_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4_SHUFFLE_1 +#undef VECTOR4_SHUFFLE_2 +#undef VECTOR4_SHUFFLE_3 +#undef VECTOR4_SHUFFLE_4 ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); } diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 
a2b31b4cc..b2ab9d9f9 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin // SPDX-License-Identifier: LGPL-3.0+ // +// Lightweight wrapper over native SIMD types for cross-platform vector code. // Rewritten and NEON+No-SIMD variants added for DuckStation. // @@ -63,11 +64,9 @@ public: GSVector2i() = default; ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); } - ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); } ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3) { return GSVector2i(cxpr_init, s0, s1, s2, s3); @@ -79,26 +78,26 @@ public: } ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); } - ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } - ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} { } - - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); - - ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); - ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector2i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { return m; } @@ -142,10 +141,7 @@ public: ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } - ALWAYS_INLINE s32 addv_s32() const - { - return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); - } + ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); } ALWAYS_INLINE u8 minv_u8() const { @@ -180,11 +176,8 @@ public: } ALWAYS_INLINE s32 minv_s32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 minv_u32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE s32 maxv_s32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 maxv_u32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } @@ -203,7 +196,7 @@ public: template ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const { -#if defined(__AVX2__) +#if defined(CPU_ARCH_AVX2) return GSVector2i(_mm_blend_epi32(m, v.m, mask)); #else constexpr s32 bit1 = ((mask & 2) * 3) << 1; @@ -217,8 +210,6 @@ public: return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); } 
ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); } @@ -333,39 +324,25 @@ public: #endif ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); } ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); } @@ -399,7 +376,6 @@ public: ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); } ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); } - ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); } template @@ -442,24 +418,35 @@ public: } ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } - - ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64((__m128i*)p)); } - + ALWAYS_INLINE static GSVector2i load(const void* p) + { + return GSVector2i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector2i& v) { m = _mm_and_si128(m, v); } - ALWAYS_INLINE void operator|=(const GSVector2i& v) { m = _mm_or_si128(m, v); } - 
ALWAYS_INLINE void operator^=(const GSVector2i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) { @@ -485,6 +472,7 @@ public: ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); } @@ -500,7 +488,6 @@ class alignas(16) GSVector2 static constexpr cxpr_init_tag cxpr_init{}; constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} - constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} public: @@ -530,28 +517,20 @@ public: GSVector2() = default; constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } - constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); } - ALWAYS_INLINE GSVector2(int x, int y) { GSVector2i v_(x, y); - m = _mm_cvtepi32_ps(v_.m); } ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {} - ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {} - ALWAYS_INLINE explicit GSVector2(float f) { *this = f; } - ALWAYS_INLINE explicit GSVector2(int i) { #ifdef CPU_ARCH_AVX2 @@ -563,38 +542,22 @@ public: ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); - ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); - - ALWAYS_INLINE void operator=(float f) + ALWAYS_INLINE GSVector2& operator=(float f) { -#if CPU_ARCH_AVX2 - - m = _mm_broadcastss_ps(_mm_load_ss(&f)); - -#else - m = _mm_set1_ps(f); - -#endif + return *this; } - ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + ALWAYS_INLINE GSVector2& operator=(__m128 m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128() const { return m; } ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); } - ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); } - - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); } - - ALWAYS_INLINE GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - ALWAYS_INLINE GSVector2 floor() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); @@ -657,27 +620,77 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } - ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd((double*)p))); } + ALWAYS_INLINE static GSVector2 load(const void* p) + { + return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); + } ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd((double*)p, 
_mm_castps_pd(v.m)); } + ALWAYS_INLINE static void store(void* p, const GSVector2& v) + { + _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); + } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } - ALWAYS_INLINE void operator+=(const GSVector2& v_) { m = _mm_add_ps(m, v_); } - ALWAYS_INLINE void operator-=(const GSVector2& v_) { m = _mm_sub_ps(m, v_); } - ALWAYS_INLINE void operator*=(const GSVector2& v_) { m = _mm_mul_ps(m, v_); } - ALWAYS_INLINE void operator/=(const GSVector2& v_) { m = _mm_div_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_) + { + m = _mm_add_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_) + { + m = _mm_sub_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_) + { + m = _mm_mul_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_) + { + m = _mm_div_ps(m, v_); + return *this; + } - ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); } - ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); } - ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); } - ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); } + ALWAYS_INLINE GSVector2& operator+=(float f) + { + *this += GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(float f) + { + *this -= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(float f) + { + *this *= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(float f) + { + *this /= GSVector2(f); + return *this; + } - ALWAYS_INLINE void operator&=(const GSVector2& v_) { m = _mm_and_ps(m, v_); } - ALWAYS_INLINE void operator|=(const GSVector2& v_) { m = _mm_or_ps(m, v_); } - ALWAYS_INLINE void operator^=(const GSVector2& v_) { m = _mm_xor_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_) + { + m = _mm_and_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_) + { + m = _mm_or_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_) + { + m = _mm_xor_ps(m, v_); + return *this; + } ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { @@ -752,6 +765,8 @@ public: return GSVector2(_mm_cmple_ps(v1, v2)); } + ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); + ALWAYS_INLINE GSVector2 xy() const { return *this; } ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); } ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); } @@ -811,11 +826,9 @@ public: { return GSVector4i(cxpr_init, x, y, z, w); } - ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); @@ -828,9 +841,7 @@ public: } ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); @@ -844,36 +855,30 @@ public: ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = 
v.m; } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); - ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); - ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector4i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + ALWAYS_INLINE GSVector4i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { return m; } - // rect - ALWAYS_INLINE s32 width() const { return right - left; } - ALWAYS_INLINE s32 height() const { return bottom - top; } - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } - ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } @@ -882,8 +887,6 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - // - ALWAYS_INLINE u32 rgba32() const { GSVector4i v = *this; @@ -1044,7 +1047,7 @@ public: template ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const { -#if defined(__AVX2__) +#if defined(CPU_ARCH_AVX2) return GSVector4i(_mm_blend_epi32(m, v.m, mask)); #else constexpr s32 bit3 = ((mask & 8) * 3) << 3; @@ -1060,8 +1063,6 @@ public: return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); } @@ -1237,99 +1238,30 @@ public: #endif ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return 
GSVector4i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } - - ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } - - ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { - // pxor, ptest, je - - GSVector4i t = *this ^ v; - + const GSVector4i t = *this ^ v; return _mm_testz_si128(t, t) != 0; } @@ -1420,15 +1352,21 @@ public: return _mm_extract_epi64(m, i); } - ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { + return GSVector4i(_mm_stream_load_si128(static_cast(p))); + } ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } - ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + return GSVector4i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector4i loadh(const void* p) { - return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); + return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast(p)))); } ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) @@ -1439,18 +1377,19 @@ public: template ALWAYS_INLINE static GSVector4i load(const void* p) { - return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p)); + return GSVector4i(aligned ? 
_mm_load_si128(static_cast(p)) : + _mm_loadu_si128(static_cast(p))); } ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); } + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); + } ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) { @@ -1462,20 +1401,30 @@ public: ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { if constexpr (aligned) - _mm_store_si128((__m128i*)p, v.m); + _mm_store_si128(static_cast<__m128i*>(p), v.m); else - _mm_storeu_si128((__m128i*)p, v.m); + _mm_storeu_si128(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); } - ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); } - ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) { @@ -1493,14 +1442,12 @@ public: } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } @@ -1508,38 +1455,52 @@ public: ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); } - // clang-format off +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \ + { \ + return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \ + { \ + return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \ + { \ + return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, 
xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const \ + { \ + return GSVector4i( \ + _mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn))); \ + } -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3) - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -1555,6 +1516,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1586,35 +1549,29 @@ public: GSVector4() = default; constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } - constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } - constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + constexpr static GSVector4 cxpr64(double x) { return 
+
   ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
-
   ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
-
   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   {
     GSVector4i v_(x, y, z, w);
-
     m = _mm_cvtepi32_ps(v_.m);
   }
-
   ALWAYS_INLINE GSVector4(int x, int y) { m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); }

   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {}
-
   ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
     : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
   {
@@ -1637,24 +1594,20 @@ public:
   ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);

-  ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
-
   ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
+  ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }

-  ALWAYS_INLINE void operator=(float f)
+  ALWAYS_INLINE GSVector4& operator=(float f)
   {
-#if CPU_ARCH_AVX2
-
-    m = _mm_broadcastss_ps(_mm_load_ss(&f));
-
-#else
-
     m = _mm_set1_ps(f);
-
-#endif
+    return *this;
   }

-  ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; }
+  ALWAYS_INLINE GSVector4& operator=(__m128 m_)
+  {
+    this->m = m_;
+    return *this;
+  }

   ALWAYS_INLINE operator __m128() const { return m; }
@@ -1668,15 +1621,6 @@ public:
   ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }

-  ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); }
-
-  ALWAYS_INLINE GSVector4 rcpnr() const
-  {
-    GSVector4 v_ = rcp();
-
-    return (v_ + v_) - (v_ * v_) * *this;
-  }
-
   ALWAYS_INLINE GSVector4 floor() const
   {
     return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
@@ -1684,52 +1628,6 @@ public:

   ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

-  ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fmadd_ps(m, a_, b_));
-#else
-    return *this * a_ + b_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fmsub_ps(m, a_, b_));
-#else
-    return *this * a_ - b_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fnmadd_ps(m, a_, b_));
-#else
-    return b_ - *this * a_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fnmsub_ps(m, a_, b_));
-#else
-    return -b_ - *this * a_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
-  {
-    return a_.madd(b_, *this); // *this + a * b
-  }
-
-  ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
-  {
-    return a_.nmadd(b_, *this); // *this - a * b
-  }
-
   ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }

   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }
@@ -1824,52 +1722,132 @@ public:
     return _mm_extract_ps(m, i);
   }

+  template <int dst>
+  ALWAYS_INLINE GSVector4 insert64(double v) const
+  {
+    if constexpr (dst == 0)
+      return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_pd(&v)));
+    else
+      return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_pd(&v), 0));
+  }
+
+  template <int src>
+  ALWAYS_INLINE double extract64() const
+  {
+    double ret;
+    if constexpr (src == 0)
+      _mm_storel_pd(&ret, _mm_castps_pd(m));
+    else
+      _mm_storeh_pd(&ret, _mm_castps_pd(m));
+    return ret;
+  }
+
   ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
+  ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);

   ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }

-  ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); }
+  ALWAYS_INLINE static GSVector4 loadl(const void* p)
+  {
+    return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
+  }

   ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); }

   template <bool aligned>
   ALWAYS_INLINE static GSVector4 load(const void* p)
   {
-    return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));
+    return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
   }

-  ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); }
-
-  ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); }
-
-  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); }
+  ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
+  ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
+  {
+    _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
+  }
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
+  {
+    _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
+  }

   template <bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   {
     if constexpr (aligned)
-      _mm_store_ps((float*)p, v.m);
+      _mm_store_ps(static_cast<float*>(p), v.m);
     else
-      _mm_storeu_ps((float*)p, v.m);
+      _mm_storeu_ps(static_cast<float*>(p), v.m);
   }

   ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }

   ALWAYS_INLINE GSVector4 operator-() const { return neg(); }

-  ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); }
-  ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); }
-  ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); }
-  ALWAYS_INLINE void operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); }
+  ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
+  {
+    m = _mm_add_ps(m, v_);
+    return *this;
+  }

-  ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
-  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
-  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
-  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+  ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
+  {
+    m = _mm_sub_ps(m, v_);
+    return *this;
+  }

-  ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); }
-  ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); }
-  ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); }
+  ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
+  {
+    m = _mm_mul_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
+  {
+    m = _mm_div_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator+=(float f)
+  {
+    *this += GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator-=(float f)
+  {
+    *this -= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator*=(float f)
+  {
+    *this *= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator/=(float f)
+  {
+    *this /= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
+  {
+    m = _mm_and_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
+  {
+    m = _mm_or_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
+  {
+    m = _mm_xor_ps(m, v_);
+    return *this;
+  }

   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
   {
@@ -1959,6 +1937,59 @@ public:
     return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   }

+  ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
+  {
+    return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
+
+  ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
+
+  ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }
+
+  ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }
+
+  ALWAYS_INLINE GSVector4 floor64() const
+  {
+    return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+  }
+
   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }

   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
@@ -1968,36 +1999,43 @@ public:

   ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); }

-  // clang-format off
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
+  { \
+    return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); \
+  } \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \
+  { \
+    return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); \
+  }

-#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \

+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

-#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \

+#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

-#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \

+#define VECTOR4_SHUFFLE_1(xs, xn) \
+  VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
+  VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
+  VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
+  VECTOR4_SHUFFLE_2(xs, xn, w, 3);

-#define VECTOR4_SHUFFLE_1(xs, xn) \
-  VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
-  VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
-  VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
-  VECTOR4_SHUFFLE_2(xs, xn, w, 3) \

+  VECTOR4_SHUFFLE_1(x, 0);
+  VECTOR4_SHUFFLE_1(y, 1);
+  VECTOR4_SHUFFLE_1(z, 2);
+  VECTOR4_SHUFFLE_1(w, 3);

-  VECTOR4_SHUFFLE_1(x, 0)
-  VECTOR4_SHUFFLE_1(y, 1)
-  VECTOR4_SHUFFLE_1(z, 2)
-  VECTOR4_SHUFFLE_1(w, 3)
-
-  // clang-format on

+#undef VECTOR4_SHUFFLE_1
+#undef VECTOR4_SHUFFLE_2
+#undef VECTOR4_SHUFFLE_3
+#undef VECTOR4_SHUFFLE_4

 #if CPU_ARCH_AVX2
@@ -2007,7 +2045,7 @@ public:

   ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
   {
-    return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
+    return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
   }

 #endif
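
Not part of the patch above: a minimal usage sketch of the double-precision (f64) lane helpers this change adds to the SSE GSVector4 (insert64/extract64, max64, sqrt64, cxpr64 with double arguments). The helper name SqrtClamped and its parameters are illustrative only, and the sketch assumes the x86 build where GSVector4 wraps a __m128 and the *64 members reinterpret it as two 64-bit lanes via _mm_castps_pd.

    // Hypothetical example, not part of the patch: clamp two doubles to >= 0.0
    // and take their per-lane square roots using the new f64 helpers.
    #include "common/gsvector.h"

    static void SqrtClamped(double in0, double in1, double* out0, double* out1)
    {
      // insert64<N>() places a scalar double into lane N; zero() starts with 0.0 in both lanes.
      GSVector4 v = GSVector4::zero().insert64<0>(in0).insert64<1>(in1);

      // max64() against a broadcast 0.0 constant clamps negatives; sqrt64() is a per-lane sqrt.
      v = v.max64(GSVector4::cxpr64(0.0)).sqrt64();

      // extract64<N>() reads a lane back out as a scalar double.
      *out0 = v.extract64<0>();
      *out1 = v.extract64<1>();
    }

Reusing the existing 128-bit GSVector4 for two f64 lanes, rather than introducing a separate double vector class, keeps the integer, float, and double views interchangeable through plain bit casts, which is all the *64 members rely on. Likewise, once the VECTOR4_SHUFFLE_* macros above are expanded, a call such as v.yxwz() compiles down to a single _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 3, 0, 1)).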