diff --git a/src/common/gsvector.h b/src/common/gsvector.h index fdef42676..0054699a0 100644 --- a/src/common/gsvector.h +++ b/src/common/gsvector.h @@ -1,6 +1,10 @@ // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) +// +// Lightweight wrapper over native SIMD types for cross-platform vector code. +// + #pragma once #include "common/intrin.h" diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 874371651..2915e64c8 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -828,12 +828,8 @@ public: ALWAYS_INLINE operator float32x2_t() const { return v2s; } ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); } - ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); } - - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); } - - ALWAYS_INLINE GSVector2 rcpnr() const + ALWAYS_INLINE GSVector2 rcp() const { float32x2_t recip = vrecpe_f32(v2s); recip = vmul_f32(recip, vrecps_f32(recip, v2s)); @@ -843,7 +839,6 @@ public: #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } - ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } #else @@ -2004,50 +1999,6 @@ public: ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - 
} - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { const int32x4_t res = veorq_s32(v4s, v.v4s); @@ -2400,6 +2351,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -2442,6 +2395,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { const float arr[4] = {x, y, z, w}; @@ -2475,12 +2432,28 @@ public: ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); -#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE static GSVector4 f64(double x, double y) { +#ifdef CPU_ARCH_ARM64 return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); - } +#else + GSVector4 ret; + ret.F64[0] = x; + ret.F64[1] = y; + return ret; #endif + } + + ALWAYS_INLINE static GSVector4 f64(double x) + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x))); +#else + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; +#endif + } ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } @@ -2729,6 +2702,28 @@ public: return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), 
dst))); +#else + GSVector4 ret = *this; // start from *this so the lane that is not written keeps its value, as on ARM64 + ret.F64[dst] = v; + return ret; +#endif + } + + template<int src> + ALWAYS_INLINE double extract64() const + { +#ifdef CPU_ARCH_ARM64 + return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src); +#else + return F64[src]; +#endif + } + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } @@ -2903,43 +2898,182 @@ public: return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); } -#ifdef CPU_ARCH_ARM64 - // Not in ARM32 - ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const { - return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]); +#endif } ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const { - return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]); +#endif } ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const { - return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]); +#endif + } + + ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]); +#endif + } + + ALWAYS_INLINE GSVector4 
gt64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); // compare yields a uint64x2_t lane mask +#else + GSVector4 ret; + ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; +#endif + } + + ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_u64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + GSVector4 ret; + ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; +#endif + } + + ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); // less-than, mirroring the `<` fallback below +#else + GSVector4 ret; + ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; +#endif + } + + ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_u64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + GSVector4 ret; + ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; +#endif + } + + ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_u64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + GSVector4 ret; + ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] <= v.F64[1]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + return ret; +#endif + } + + ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1])); +#endif + } + + ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1])); +#endif + } + + ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); } + + ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); } + + ALWAYS_INLINE GSVector4 sqrt64() const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s)))); +#else + return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); +#endif + } + + ALWAYS_INLINE GSVector4 sqr64() const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s)))); // square (x * x), same as the fallback below +#else + return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); +#endif + } + + ALWAYS_INLINE GSVector4 floor64() const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s)))); +#else + return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); +#endif } ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v) { +#ifdef CPU_ARCH_ARM64 return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s)))); +#else + return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1))); +#endif } ALWAYS_INLINE static GSVector4 f32to64(const void* p) { +#ifdef CPU_ARCH_ARM64 return 
GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast(p))))); +#else + const float* fp = static_cast(p); + return GSVector4::f64(static_cast(fp[0]), static_cast(fp[1])); +#endif } ALWAYS_INLINE GSVector4i f64toi32() const { +#ifdef CPU_ARCH_ARM64 const s32 low = static_cast(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0)); const s32 high = static_cast(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1)); +#else + const s32 low = static_cast(F64[0]); + const s32 high = static_cast(F64[1]); +#endif return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1)); } -#endif - // clang-format off #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index 8f2f26472..c636f486f 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -603,13 +603,6 @@ public: GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); } - GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); } GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); } @@ -1461,50 +1454,6 @@ public: GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const 
GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if constexpr (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; } GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); } @@ -1791,6 +1740,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1832,6 +1783,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { this->x = x; @@ -1881,6 +1836,13 @@ public: return ret; } + ALWAYS_INLINE static GSVector4 f64(double x) + { + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; + } + ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } u32 rgba32() const { return GSVector4i(*this).rgba32(); } @@ -2045,6 +2007,20 @@ public: return I32[i]; } + template<int dst> + ALWAYS_INLINE GSVector4 insert64(double v) const + { + GSVector4 ret = *this; // start from *this so the lane that is not written keeps its value + ret.F64[dst] = v; + return ret; + } + + template<int src> + ALWAYS_INLINE double extract64() const + { + return F64[src]; + } + ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); } ALWAYS_INLINE static constexpr GSVector4 xffffffff() @@ -2300,6 +2276,71 @@ public: return ret; } + ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const + { + return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]); + } + + ALWAYS_INLINE 
GSVector4 gt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] <= v.F64[1]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const + { + return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const + { + return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); } + + ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); } + + ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); } + + ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); } + + ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); } + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { GSVector4 ret; diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index a2b31b4cc..d5dfe5ada 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin // SPDX-License-Identifier: LGPL-3.0+ // +// Lightweight wrapper over native SIMD types for cross-platform vector code. // Rewritten and NEON+No-SIMD variants added for DuckStation. 
// @@ -63,11 +64,9 @@ public: GSVector2i() = default; ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); } - ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); } ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3) { return GSVector2i(cxpr_init, s0, s1, s2, s3); @@ -79,26 +78,26 @@ public: } ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); } - ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } - ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} { } - - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); - - ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); - ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector2i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { return m; } @@ -142,10 +141,7 @@ public: ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } - ALWAYS_INLINE s32 addv_s32() const - { - return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); - } + ALWAYS_INLINE s32 addv_s32() const { return 
_mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); } ALWAYS_INLINE u8 minv_u8() const { @@ -180,11 +176,8 @@ public: } ALWAYS_INLINE s32 minv_s32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 minv_u32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE s32 maxv_s32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 maxv_u32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } @@ -333,39 +326,25 @@ public: #endif ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return 
GSVector2i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); } ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); } @@ -399,7 +378,6 @@ public: ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); } ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); } - ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); } template @@ -442,24 +420,35 @@ public: } ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } - - ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64((__m128i*)p)); } - + ALWAYS_INLINE static GSVector2i load(const void* p) + { + return GSVector2i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector2i& v) { m = _mm_and_si128(m, v); } 
- ALWAYS_INLINE void operator|=(const GSVector2i& v) { m = _mm_or_si128(m, v); } - ALWAYS_INLINE void operator^=(const GSVector2i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) { @@ -485,6 +474,7 @@ public: ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); } @@ -500,7 +490,6 @@ class alignas(16) GSVector2 static constexpr cxpr_init_tag cxpr_init{}; constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} - constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} public: @@ -530,28 +519,20 @@ public: GSVector2() = default; constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } - constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); } - ALWAYS_INLINE GSVector2(int x, int y) { GSVector2i v_(x, y); - m = _mm_cvtepi32_ps(v_.m); } ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {} - ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {} - ALWAYS_INLINE explicit GSVector2(float f) { *this = f; 
} - ALWAYS_INLINE explicit GSVector2(int i) { #ifdef CPU_ARCH_AVX2 @@ -563,38 +544,23 @@ public: ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); - ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); - - ALWAYS_INLINE void operator=(float f) + ALWAYS_INLINE GSVector2& operator=(float f) { -#if CPU_ARCH_AVX2 - - m = _mm_broadcastss_ps(_mm_load_ss(&f)); - -#else - m = _mm_set1_ps(f); - -#endif + return *this; } - ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + ALWAYS_INLINE GSVector2& operator=(__m128 m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128() const { return m; } ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); } - ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); } - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); } - - ALWAYS_INLINE GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - ALWAYS_INLINE GSVector2 floor() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); @@ -657,27 +623,77 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } - ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd((double*)p))); } + ALWAYS_INLINE static GSVector2 load(const void* p) + { + return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); + } ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); } + ALWAYS_INLINE static void store(void* p, const GSVector2& v) + { + _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); + } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } - ALWAYS_INLINE void operator+=(const GSVector2& v_) { m = _mm_add_ps(m, v_); } - ALWAYS_INLINE void operator-=(const GSVector2& v_) { m = _mm_sub_ps(m, v_); } - 
ALWAYS_INLINE void operator*=(const GSVector2& v_) { m = _mm_mul_ps(m, v_); } - ALWAYS_INLINE void operator/=(const GSVector2& v_) { m = _mm_div_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_) + { + m = _mm_add_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_) + { + m = _mm_sub_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_) + { + m = _mm_mul_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_) + { + m = _mm_div_ps(m, v_); + return *this; + } - ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); } - ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); } - ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); } - ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); } + ALWAYS_INLINE GSVector2& operator+=(float f) + { + *this += GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(float f) + { + *this -= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(float f) + { + *this *= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(float f) + { + *this /= GSVector2(f); + return *this; + } - ALWAYS_INLINE void operator&=(const GSVector2& v_) { m = _mm_and_ps(m, v_); } - ALWAYS_INLINE void operator|=(const GSVector2& v_) { m = _mm_or_ps(m, v_); } - ALWAYS_INLINE void operator^=(const GSVector2& v_) { m = _mm_xor_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_) + { + m = _mm_and_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_) + { + m = _mm_or_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_) + { + m = _mm_xor_ps(m, v_); + return *this; + } ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { @@ -752,6 +768,8 @@ public: return GSVector2(_mm_cmple_ps(v1, v2)); } + ALWAYS_INLINE static GSVector2 
cast(const GSVector2i& v); + ALWAYS_INLINE GSVector2 xy() const { return *this; } ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); } ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); } @@ -811,11 +829,9 @@ public: { return GSVector4i(cxpr_init, x, y, z, w); } - ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); @@ -828,9 +844,7 @@ public: } ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); @@ -844,25 +858,27 @@ public: ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = v.m; } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); - ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); - ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector4i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + ALWAYS_INLINE GSVector4i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { 
return m; } - // rect - ALWAYS_INLINE s32 width() const { return right - left; } ALWAYS_INLINE s32 height() const { return bottom - top; } @@ -882,8 +898,6 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - // - ALWAYS_INLINE u32 rgba32() const { GSVector4i v = *this; @@ -1237,99 +1251,34 @@ public: #endif ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) 
const { return GSVector4i(_mm_subs_epu16(m, v.m)); } ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { - // pxor, ptest, je - - GSVector4i t = *this ^ v; - + const GSVector4i t = *this ^ v; return _mm_testz_si128(t, t) != 0; } @@ -1420,15 +1369,21 
@@ public: return _mm_extract_epi64(m, i); } - ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { + return GSVector4i(_mm_stream_load_si128(static_cast(p))); + } ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } - ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + return GSVector4i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector4i loadh(const void* p) { - return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); + return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast(p)))); } ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) @@ -1439,18 +1394,19 @@ public: template ALWAYS_INLINE static GSVector4i load(const void* p) { - return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p)); + return GSVector4i(aligned ? 
_mm_load_si128(static_cast(p)) : + _mm_loadu_si128(static_cast(p))); } ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); } + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); + } ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) { @@ -1462,20 +1418,30 @@ public: ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { if constexpr (aligned) - _mm_store_si128((__m128i*)p, v.m); + _mm_store_si128(static_cast<__m128i*>(p), v.m); else - _mm_storeu_si128((__m128i*)p, v.m); + _mm_storeu_si128(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); } - ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); } - ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& 
operator|=(const GSVector4i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) { @@ -1493,14 +1459,12 @@ public: } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } @@ -1555,6 +1519,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1586,35 +1552,29 @@ public: GSVector4() = default; constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } - constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } - constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m 
= _mm_set_ps(w, z, y, x); } - ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); } - ALWAYS_INLINE GSVector4(int x, int y, int z, int w) { GSVector4i v_(x, y, z, w); - m = _mm_cvtepi32_ps(v_.m); } - ALWAYS_INLINE GSVector4(int x, int y) { m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); } ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {} - ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd()))) { @@ -1637,24 +1597,20 @@ public: ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); - ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); - ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); } + ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); } - ALWAYS_INLINE void operator=(float f) + ALWAYS_INLINE GSVector4& operator=(float f) { -#if CPU_ARCH_AVX2 - - m = _mm_broadcastss_ps(_mm_load_ss(&f)); - -#else - m = _mm_set1_ps(f); - -#endif + return *this; } - ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + ALWAYS_INLINE GSVector4& operator=(__m128 m_) + { + this->m = m_; + return *this; + } ALWAYS_INLINE operator __m128() const { return m; } @@ -1824,52 +1780,132 @@ public: return _mm_extract_ps(m, i); } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { + if constexpr (dst == 0) + return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_pd(&v))); + else + return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_pd(&v), 0)); + } + + template + ALWAYS_INLINE double extract64() const + { + double ret; + if constexpr (src == 0) + _mm_storel_pd(&ret, _mm_castps_pd(m)); + else + _mm_storeh_pd(&ret, _mm_castps_pd(m)); + return ret; + } + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); } + ALWAYS_INLINE static GSVector4 cast(const 
GSVector4i& v); ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); } - ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); } + ALWAYS_INLINE static GSVector4 loadl(const void* p) + { + return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); + } ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); } template ALWAYS_INLINE static GSVector4 load(const void* p) { - return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); + return GSVector4(aligned ? _mm_load_ps(static_cast(p)) : _mm_loadu_ps(static_cast(p))); } - ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); } - - ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); } - - ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); } + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast(p), v.m); } + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) + { + _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); + } + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) + { + _mm_storeh_pd(static_cast(p), _mm_castps_pd(v.m)); + } template ALWAYS_INLINE static void store(void* p, const GSVector4& v) { if constexpr (aligned) - _mm_store_ps((float*)p, v.m); + _mm_store_ps(static_cast(p), v.m); else - _mm_storeu_ps((float*)p, v.m); + _mm_storeu_ps(static_cast(p), v.m); } ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } ALWAYS_INLINE GSVector4 operator-() const { return neg(); } - ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); } - ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); } - ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); } - ALWAYS_INLINE void 
operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); } + ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_) + { + m = _mm_add_ps(m, v_); + return *this; + } - ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } - ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } - ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } - ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_) + { + m = _mm_sub_ps(m, v_); + return *this; + } - ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); } - ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); } - ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); } + ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_) + { + m = _mm_mul_ps(m, v_); + return *this; + } + + ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_) + { + m = _mm_div_ps(m, v_); + return *this; + } + + ALWAYS_INLINE GSVector4& operator+=(float f) + { + *this += GSVector4(f); + return *this; + } + + ALWAYS_INLINE GSVector4& operator-=(float f) + { + *this -= GSVector4(f); + return *this; + } + + ALWAYS_INLINE GSVector4& operator*=(float f) + { + *this *= GSVector4(f); + return *this; + } + + ALWAYS_INLINE GSVector4& operator/=(float f) + { + *this /= GSVector4(f); + return *this; + } + + ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_) + { + m = _mm_and_ps(m, v_); + return *this; + } + + ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_) + { + m = _mm_or_ps(m, v_); + return *this; + } + + ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_) + { + m = _mm_xor_ps(m, v_); + return *this; + } ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) { @@ -1959,6 +1995,59 @@ public: return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); } + ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const + { + return 
GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const + { + return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const + { + return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const + { + return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const + { + return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const + { + return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const + { + return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const + { + return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); + } + + ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast(0x7FFFFFFFFFFFFFFFULL)); } + + ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast(0x8000000000000000ULL)); } + + ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); } + + ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); } + + ALWAYS_INLINE GSVector4 floor64() const + { + return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + } + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); } ALWAYS_INLINE static GSVector4 f32to64(const void* p) @@ -2007,7 +2096,7 @@ public: ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { - return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f))); + return 
GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast(f)))); } #endif