From a7747c5be3d62dc0f5fc00eaad5835ff6799707a Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 21 Sep 2024 14:07:14 +1000 Subject: [PATCH] Common: Add 256-bit integer vector wrapper --- src/common/gsvector_neon.h | 93 +++--- src/common/gsvector_nosimd.h | 95 ++---- src/common/gsvector_sse.h | 551 ++++++++++++++++++++++++++++++--- src/core/gpu_hw.cpp | 22 +- src/core/gpu_sw_rasterizer.inl | 4 +- 5 files changed, 582 insertions(+), 183 deletions(-) diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 11db729b8..24b03cf84 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -81,8 +81,6 @@ public: { } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; } ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {} @@ -690,9 +688,9 @@ public: return GSVector2i(vset_lane_u32(val, vdup_n_u32(0), 0)); } - ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); } + ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); } - ALWAYS_INLINE static GSVector2i load(int i) { return GSVector2i(vset_lane_s32(i, vdup_n_s32(0), 0)); } + ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { @@ -702,8 +700,6 @@ public: ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); } - ALWAYS_INLINE static int store(const GSVector2i& v) { return vget_lane_s32(v.v2s, 0); } - ALWAYS_INLINE void operator&=(const GSVector2i& v) { v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))); @@ -907,11 +903,9 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); } - ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(vset_lane_f32(f, vmov_n_f32(0.0f), 0)); } + ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast(p))); } - ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32((const float*)p)); } - - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32((float*)p, v.v2s); } + ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast(p), v.v2s); } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } @@ -1099,15 +1093,10 @@ public: } ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) + : v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3)) { - GSVector4i xz = load(x).upl32(load(z)); - GSVector4i yw = load(y).upl32(load(w)); - - *this = xz.upl32(yw); } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) : S16{s0, s1, s2, s3, s4, s5, s6, s7} { @@ -1119,9 +1108,7 @@ public: { } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default - ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; } + ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {} ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) 
: v4s(m) {} @@ -1131,7 +1118,7 @@ public: ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); - ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); } + ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); } ALWAYS_INLINE operator int32x4_t() const { return v4s; } @@ -1155,15 +1142,7 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - ALWAYS_INLINE u32 rgba32() const - { - GSVector4i v = *this; - - v = v.ps32(v); - v = v.pu16(v); - - return (u32)store(v); - } + ALWAYS_INLINE u32 rgba32() const { return static_cast(ps32().pu16().extract32<0>()); } ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const { @@ -2153,6 +2132,8 @@ public: return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0)); } + ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); } + ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0))); @@ -2163,36 +2144,18 @@ public: return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p)))); } - ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v) - { - return GSVector4i( - vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p)))); - } - ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); } - ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph) - { - return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph)))); - } - template ALWAYS_INLINE static GSVector4i load(const void* p) { return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); } - ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); } - - ALWAYS_INLINE static GSVector4i loadq(s64 i) - { - return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0))); - } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { #if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p)); + __builtin_nontemporal_store(v.v4s, static_cast(p)); #else vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); #endif @@ -2214,21 +2177,19 @@ public: vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s))); } - ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) - { - GSVector4i::storel(pl, v); - GSVector4i::storeh(ph, v); - } - template ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); } - ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); } + ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); } + template + ALWAYS_INLINE static GSVector4i broadcast128(const void* v) + { + return load(v); + } ALWAYS_INLINE void operator&=(const GSVector4i& v) { @@ -2274,6 +2235,11 @@ public: ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw) + { + return GSVector4i(vcombine_s32(xy.v2s, zw.v2s)); + } + ALWAYS_INLINE GSVector2i xy() 
const { return GSVector2i(vget_low_s32(v4s)); } ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); } @@ -2282,6 +2248,16 @@ public: ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \ { \ return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \ + { \ + return GSVector4i(vreinterpretq_s32_s16( \ + __builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \ + { \ + return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \ + vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \ } #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ @@ -2438,7 +2414,10 @@ public: ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); } - ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) + { + return GSVector4(GSVector4i::zext32(static_cast(rgba)).u8to32()); + } ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index caa8df6d8..0ddc9d9f7 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -117,8 +117,6 @@ public: ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); @@ -468,6 +466,8 @@ public: return ret; } + ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); } + ALWAYS_INLINE static GSVector2i load(const void* p) { GSVector2i ret; @@ -475,19 +475,10 @@ public: return ret; } - ALWAYS_INLINE static GSVector2i load(s32 i) - { - GSVector2i ret; - ret.x = i; - return ret; - } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); } - ALWAYS_INLINE static s32 store(const GSVector2i& v) { return v.x; } - ALWAYS_INLINE void operator&=(const GSVector2i& v) { U64[0] &= v.U64[0]; } ALWAYS_INLINE void operator|=(const GSVector2i& v) { U64[0] |= v.U64[0]; } ALWAYS_INLINE void operator^=(const GSVector2i& v) { U64[0] ^= v.U64[0]; } @@ -668,8 +659,6 @@ public: return ret; } - ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(f, f); } - ALWAYS_INLINE static GSVector2 load(const void* p) { GSVector2 ret; @@ -919,8 +908,6 @@ public: this->w = w; } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { S16[0] = s0; @@ -942,8 +929,6 @@ public: ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : S32{v.S32[0], v.S32[1], 0, 0} {} - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector4i(const 
GSVector4& v); @@ -973,15 +958,7 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - ALWAYS_INLINE u32 rgba32() const - { - GSVector4i v = *this; - - v = v.ps32(v); - v = v.pu16(v); - - return (u32)store(v); - } + ALWAYS_INLINE u32 rgba32() const { return static_cast(ps32().pu16().extract32<0>()); } ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const { @@ -1128,8 +1105,6 @@ public: u32 maxv_u32() const { return std::max(U32[0], std::max(U32[1], std::max(U32[2], U32[3]))); } - static s32 min_i16(s32 a, s32 b) { return store(load(a).min_i16(load(b))); } - ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const @@ -1552,6 +1527,8 @@ public: return ret; } + ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); } + ALWAYS_INLINE static GSVector4i loadl(const void* p) { GSVector4i ret; @@ -1578,36 +1555,12 @@ public: return ret; } - ALWAYS_INLINE static GSVector4i load(s32 i) - { - GSVector4i ret; - ret.x = i; - ret.y = 0; - ret.z = 0; - ret.w = 0; - return ret; - } - - ALWAYS_INLINE static GSVector4i loadq(s64 i) - { - GSVector4i ret; - ret.S64[0] = i; - ret.S64[1] = 0; - return ret; - } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); } ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); } ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); } - ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) - { - GSVector4i::storel(pl, v); - GSVector4i::storeh(ph, v); - } - template ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { @@ -1616,9 +1569,13 @@ public: ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); } - ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; } + ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.S64[0]; } + template + ALWAYS_INLINE static GSVector4i broadcast128(const void* v) + { + return load(v); + } ALWAYS_INLINE void operator&=(const GSVector4i& v) { @@ -1672,11 +1629,24 @@ public: ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw) + { + return GSVector4i(xy.x, xy.y, zw.x, zw.y); + } + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); } ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); } #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]); } + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]); } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \ + { \ + return GSVector4i(S16[xn], S16[yn], S16[zn], S16[wn], S16[4], S16[5], S16[6], S16[7]); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \ + { \ + return GSVector4i(S16[0], S16[1], S16[2], S16[3], S16[4 + xn], S16[4 + yn], S16[4 + zn], S16[4 + wn]); \ + } #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ @@ -1827,7 +1797,10 
@@ public: u32 rgba32() const { return GSVector4i(*this).rgba32(); } - ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) + { + return GSVector4(GSVector4i::zext32(static_cast(rgba)).u8to32()); + } ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } @@ -1993,8 +1966,6 @@ public: return ret; } - ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(f, f, f, f); } - template ALWAYS_INLINE static GSVector4 load(const void* p) { @@ -2286,7 +2257,7 @@ public: ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast(0x7FFFFFFFFFFFFFFFULL)); } - ALWAYS_INLINE GSVector4 neg64() const {return *this ^ GSVector4::cxpr64(static_cast(0x8000000000000000ULL(); } + ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast(0x8000000000000000ULL)); } ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); } @@ -2318,11 +2289,7 @@ public: } #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \ - { \ - return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); \ - } + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index dc3234539..33eeb4992 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -15,6 +15,7 @@ #ifdef CPU_ARCH_AVX2 #define GSVECTOR_HAS_UNSIGNED 1 #define GSVECTOR_HAS_SRLV 1 +#define GSVECTOR_HAS_256 1 #endif class GSVector2; @@ -411,24 +412,18 @@ public: template ALWAYS_INLINE s32 extract32() const { - if constexpr (i == 0) - return GSVector2i::store(*this); - return _mm_extract_epi32(m, i); } ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } + ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); } ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64(static_cast(p))); } - ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) { @@ -625,8 +620,6 @@ public: return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); } - ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); @@ -841,7 +834,6 @@ public: } ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } ALWAYS_INLINE 
GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); @@ -887,15 +879,7 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - ALWAYS_INLINE u32 rgba32() const - { - GSVector4i v = *this; - - v = v.ps32(v); - v = v.pu16(v); - - return (u32)store(v); - } + ALWAYS_INLINE u32 rgba32() const { return static_cast(ps32().pu16().extract32<0>()); } ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const { @@ -1331,9 +1315,6 @@ public: template ALWAYS_INLINE s32 extract32() const { - if constexpr (i == 0) - return GSVector4i::store(*this); - return _mm_extract_epi32(m, i); } @@ -1346,9 +1327,6 @@ public: template ALWAYS_INLINE s64 extract64() const { - if (i == 0) - return GSVector4i::storeq(*this); - return _mm_extract_epi64(m, i); } @@ -1359,6 +1337,7 @@ public: } ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } + ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); } ALWAYS_INLINE static GSVector4i loadl(const void* p) { @@ -1382,9 +1361,6 @@ public: _mm_loadu_si128(static_cast(p))); } - ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) @@ -1392,12 +1368,6 @@ public: _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); } - ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) - { - GSVector4i::storel(pl, v); - GSVector4i::storeh(ph, v); - } - template ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { @@ -1408,8 +1378,6 @@ public: } ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v) { @@ -1450,8 +1418,21 @@ public: ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; } + + template + ALWAYS_INLINE static GSVector4i broadcast128(const void* v) + { + return load(v); + } + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw) + { + return GSVector4i(_mm_unpacklo_epi64(xy.m, zw.m)); + } + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); } @@ -1468,11 +1449,6 @@ public: ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \ { \ return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \ - } \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const \ - { \ - return GSVector4i( \ - _mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), 
_MM_SHUFFLE(wn, zn, yn, xn))); \ } #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ @@ -1614,7 +1590,10 @@ public: u32 rgba32() const { return GSVector4i(*this).rgba32(); } - ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) + { + return GSVector4(GSVector4i::zext32(static_cast(rgba)).u8to32()); + } ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } @@ -1753,8 +1732,6 @@ public: return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); } - ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); } - template ALWAYS_INLINE static GSVector4 load(const void* p) { @@ -1780,7 +1757,7 @@ public: _mm_storeu_ps(static_cast(p), v.m); } - ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } + ALWAYS_INLINE static void store32(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } ALWAYS_INLINE GSVector4 operator-() const { return neg(); } @@ -2004,10 +1981,6 @@ public: ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \ { \ return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); \ - } \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \ - { \ - return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); \ } #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ @@ -2096,3 +2069,483 @@ ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) { return GSVector4(_mm_castsi128_ps(v.m)); } + +#ifdef GSVECTOR_HAS_256 + +class alignas(32) GSVector8i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector8i(cxpr_init_tag, s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1) + : S32{x0, y0, z0, w0, x1, y1, z1, w1} + { + } + + constexpr GSVector8i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7, s16 s8, s16 s9, + s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15) + : S16{s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15} + { + } + +public: + union + { + struct + { + s32 x0, y0, z0, w0, x1, y1, z1, w1; + }; + struct + { + s32 r0, g0, b0, a0, r1, g1, b1, a1; + }; + + float F32[8]; + s8 S8[32]; + s16 S16[16]; + s32 S32[8]; + s64 S64[4]; + u8 U8[32]; + u16 U16[16]; + u32 U32[8]; + u64 U64[4]; + __m256i m; + }; + + GSVector8i() = default; + + ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x0, s32 y0, s32 z0, s32 w0, s32 x1, s32 y1, s32 z1, s32 w1) + { + return GSVector8i(cxpr_init, x0, y0, z0, w0, x1, y1, z1, w1); + } + ALWAYS_INLINE constexpr static GSVector8i cxpr(s32 x) { return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 x) + { + return GSVector8i(cxpr_init, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); + } + ALWAYS_INLINE constexpr static GSVector8i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7, + s16 s8, s16 s9, s16 s10, s16 s11, s16 s12, s16 s13, s16 s14, s16 s15) + { + return GSVector8i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15); + } + + ALWAYS_INLINE explicit GSVector8i(s32 i) { *this = i; } + + ALWAYS_INLINE constexpr explicit GSVector8i(__m256i m) : m(m) {} + + ALWAYS_INLINE GSVector8i& operator=(s32 i) + { + m = _mm256_set1_epi32(i); + return *this; + } + ALWAYS_INLINE GSVector8i& operator=(__m256i m_) + { + m = m_; + return *this; + } + + ALWAYS_INLINE operator __m256i() const { return m; } + + 
ALWAYS_INLINE GSVector8i min_i8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi8(m, v)); } + ALWAYS_INLINE GSVector8i max_i8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi8(m, v)); } + ALWAYS_INLINE GSVector8i min_i16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi16(m, v)); } + ALWAYS_INLINE GSVector8i max_i16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi16(m, v)); } + ALWAYS_INLINE GSVector8i min_i32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epi32(m, v)); } + ALWAYS_INLINE GSVector8i max_i32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epi32(m, v)); } + + ALWAYS_INLINE GSVector8i min_u8(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu8(m, v)); } + ALWAYS_INLINE GSVector8i max_u8(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu8(m, v)); } + ALWAYS_INLINE GSVector8i min_u16(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu16(m, v)); } + ALWAYS_INLINE GSVector8i max_u16(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu16(m, v)); } + ALWAYS_INLINE GSVector8i min_u32(const GSVector8i& v) const { return GSVector8i(_mm256_min_epu32(m, v)); } + ALWAYS_INLINE GSVector8i max_u32(const GSVector8i& v) const { return GSVector8i(_mm256_max_epu32(m, v)); } + + ALWAYS_INLINE GSVector8i madd_s16(const GSVector8i& v) const { return GSVector8i(_mm256_madd_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector8i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector8i blend8(const GSVector8i& v, const GSVector8i& mask) const + { + return GSVector8i(_mm256_blendv_epi8(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector8i blend16(const GSVector8i& v) const + { + return GSVector8i(_mm256_blend_epi16(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector8i blend32(const GSVector8i& v) const + { + return GSVector8i(_mm256_blend_epi32(m, v.m, mask)); + } + + ALWAYS_INLINE GSVector8i blend(const GSVector8i& v, const GSVector8i& mask) const + { + return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, v))); + } + + ALWAYS_INLINE GSVector8i shuffle8(const GSVector8i& mask) const { return GSVector8i(_mm256_shuffle_epi8(m, mask)); } + + ALWAYS_INLINE GSVector8i ps16(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi16(m, v)); } + ALWAYS_INLINE GSVector8i ps16() const { return GSVector8i(_mm256_packs_epi16(m, m)); } + ALWAYS_INLINE GSVector8i pu16(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi16(m, v)); } + ALWAYS_INLINE GSVector8i pu16() const { return GSVector8i(_mm256_packus_epi16(m, m)); } + ALWAYS_INLINE GSVector8i ps32(const GSVector8i& v) const { return GSVector8i(_mm256_packs_epi32(m, v)); } + ALWAYS_INLINE GSVector8i ps32() const { return GSVector8i(_mm256_packs_epi32(m, m)); } + ALWAYS_INLINE GSVector8i pu32(const GSVector8i& v) const { return GSVector8i(_mm256_packus_epi32(m, v)); } + ALWAYS_INLINE GSVector8i pu32() const { return GSVector8i(_mm256_packus_epi32(m, m)); } + + ALWAYS_INLINE GSVector8i upl8(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi8(m, v)); } + ALWAYS_INLINE GSVector8i uph8(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi8(m, v)); } + ALWAYS_INLINE GSVector8i upl16(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi16(m, v)); } + ALWAYS_INLINE GSVector8i uph16(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi16(m, v)); } + ALWAYS_INLINE GSVector8i upl32(const GSVector8i& v) const { return 
GSVector8i(_mm256_unpacklo_epi32(m, v)); } + ALWAYS_INLINE GSVector8i uph32(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi32(m, v)); } + ALWAYS_INLINE GSVector8i upl64(const GSVector8i& v) const { return GSVector8i(_mm256_unpacklo_epi64(m, v)); } + ALWAYS_INLINE GSVector8i uph64(const GSVector8i& v) const { return GSVector8i(_mm256_unpackhi_epi64(m, v)); } + + ALWAYS_INLINE GSVector8i upl8() const { return GSVector8i(_mm256_unpacklo_epi8(m, _mm256_setzero_si256())); } + ALWAYS_INLINE GSVector8i uph8() const { return GSVector8i(_mm256_unpackhi_epi8(m, _mm256_setzero_si256())); } + + ALWAYS_INLINE GSVector8i upl16() const { return GSVector8i(_mm256_unpacklo_epi16(m, _mm256_setzero_si256())); } + ALWAYS_INLINE GSVector8i uph16() const { return GSVector8i(_mm256_unpackhi_epi16(m, _mm256_setzero_si256())); } + + ALWAYS_INLINE GSVector8i upl32() const { return GSVector8i(_mm256_unpacklo_epi32(m, _mm256_setzero_si256())); } + + ALWAYS_INLINE GSVector8i uph32() const { return GSVector8i(_mm256_unpackhi_epi32(m, _mm256_setzero_si256())); } + ALWAYS_INLINE GSVector8i upl64() const { return GSVector8i(_mm256_unpacklo_epi64(m, _mm256_setzero_si256())); } + ALWAYS_INLINE GSVector8i uph64() const { return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256())); } + + ALWAYS_INLINE GSVector8i s8to16() const { return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i s8to32() const { return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i s8to64() const { return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m))); } + + ALWAYS_INLINE GSVector8i s16to32() const { return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i s16to64() const { return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i s32to64() const { return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u8to16() const { return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u8to32() const { return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u8to64() const { return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u16to32() const { return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u16to64() const { return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); } + ALWAYS_INLINE GSVector8i u32to64() const { return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m))); } + + ALWAYS_INLINE static GSVector8i s8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi16(v.m)); } + ALWAYS_INLINE static GSVector8i s8to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi32(v.m)); } + ALWAYS_INLINE static GSVector8i s8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi8_epi64(v.m)); } + + ALWAYS_INLINE static GSVector8i s16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi32(v.m)); } + ALWAYS_INLINE static GSVector8i s16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi16_epi64(v.m)); } + ALWAYS_INLINE static GSVector8i s32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepi32_epi64(v.m)); } + ALWAYS_INLINE static GSVector8i u8to16(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu8_epi16(v.m)); } + ALWAYS_INLINE static GSVector8i u8to32(const GSVector4i& v) { return 
GSVector8i(_mm256_cvtepu8_epi32(v.m)); } + ALWAYS_INLINE static GSVector8i u8to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi64(v.m)); } + ALWAYS_INLINE static GSVector8i u16to32(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi32(v.m)); } + ALWAYS_INLINE static GSVector8i u16to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu16_epi64(v.m)); } + ALWAYS_INLINE static GSVector8i u32to64(const GSVector4i& v) { return GSVector8i(_mm256_cvtepu32_epi64(v.m)); } + + template + ALWAYS_INLINE GSVector8i srl() const + { + return GSVector8i(_mm256_srli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector8i srl(const GSVector8i& v) + { + return GSVector8i(_mm256_alignr_epi8(v.m, m, i)); + } + + template + ALWAYS_INLINE GSVector8i sll() const + { + return GSVector8i(_mm256_slli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector8i sll16() const + { + return GSVector8i(_mm256_slli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector8i sll16(s32 i) const { return GSVector8i(_mm256_sll_epi16(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i sllv16(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi16(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i srl16() const + { + return GSVector8i(_mm256_srli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector8i srl16(s32 i) const { return GSVector8i(_mm256_srl_epi16(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srlv16(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi16(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i sra16() const + { + return GSVector8i(_mm256_srai_epi16(m, i)); + } + + ALWAYS_INLINE GSVector8i sra16(s32 i) const { return GSVector8i(_mm256_sra_epi16(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srav16(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi16(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i sll32() const + { + return GSVector8i(_mm256_slli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector8i sll32(s32 i) const { return GSVector8i(_mm256_sll_epi32(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i sllv32(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi32(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i srl32() const + { + return GSVector8i(_mm256_srli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector8i srl32(s32 i) const { return GSVector8i(_mm256_srl_epi32(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srlv32(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi32(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i sra32() const + { + return GSVector8i(_mm256_srai_epi32(m, i)); + } + + ALWAYS_INLINE GSVector8i sra32(s32 i) const { return GSVector8i(_mm256_sra_epi32(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srav32(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi32(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i sll64() const + { + return GSVector8i(_mm256_slli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector8i sll64(s32 i) const { return GSVector8i(_mm256_sll_epi64(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i sllv64(const GSVector8i& v) const { return GSVector8i(_mm256_sllv_epi64(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i srl64() const + { + return GSVector8i(_mm256_srli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector8i srl64(s32 i) const { return GSVector8i(_mm256_srl_epi64(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srlv64(const GSVector8i& v) const { return GSVector8i(_mm256_srlv_epi64(m, v.m)); } + + template + ALWAYS_INLINE GSVector8i sra64() const + { 
+ return GSVector8i(_mm256_srai_epi64(m, i)); + } + + ALWAYS_INLINE GSVector8i sra64(s32 i) const { return GSVector8i(_mm256_sra_epi64(m, _mm_cvtsi32_si128(i))); } + ALWAYS_INLINE GSVector8i srav64(const GSVector8i& v) const { return GSVector8i(_mm256_srav_epi64(m, v.m)); } + + ALWAYS_INLINE GSVector8i add8(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i add16(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i add32(const GSVector8i& v) const { return GSVector8i(_mm256_add_epi32(m, v.m)); } + ALWAYS_INLINE GSVector8i adds8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i adds16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i hadds16(const GSVector8i& v) const { return GSVector8i(_mm256_hadds_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i addus8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu8(m, v.m)); } + ALWAYS_INLINE GSVector8i addus16(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector8i sub8(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i sub16(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i sub32(const GSVector8i& v) const { return GSVector8i(_mm256_sub_epi32(m, v.m)); } + ALWAYS_INLINE GSVector8i subs8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i subs16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i subus8(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu8(m, v.m)); } + ALWAYS_INLINE GSVector8i subus16(const GSVector8i& v) const { return GSVector8i(_mm256_subs_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector8i mul16hs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhi_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i mul16l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i mul16hrs(const GSVector8i& v) const { return GSVector8i(_mm256_mulhrs_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i mul32l(const GSVector8i& v) const { return GSVector8i(_mm256_mullo_epi32(m, v.m)); } + + ALWAYS_INLINE bool eq(const GSVector8i& v) const + { + const GSVector8i t = *this ^ v; + return _mm256_testz_si256(t, t) != 0; + } + + ALWAYS_INLINE GSVector8i eq8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i eq16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i eq32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi32(m, v.m)); } + ALWAYS_INLINE GSVector8i eq64(const GSVector8i& v) const { return GSVector8i(_mm256_cmpeq_epi64(m, v.m)); } + + ALWAYS_INLINE GSVector8i neq8(const GSVector8i& v) const { return ~eq8(v); } + ALWAYS_INLINE GSVector8i neq16(const GSVector8i& v) const { return ~eq16(v); } + ALWAYS_INLINE GSVector8i neq32(const GSVector8i& v) const { return ~eq32(v); } + + ALWAYS_INLINE GSVector8i gt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i gt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i gt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE 
GSVector8i ge8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(v.m, m)); } + ALWAYS_INLINE GSVector8i ge16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(v.m, m)); } + ALWAYS_INLINE GSVector8i ge32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(v.m, m)); } + + ALWAYS_INLINE GSVector8i lt8(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi8(v.m, m)); } + ALWAYS_INLINE GSVector8i lt16(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi16(v.m, m)); } + ALWAYS_INLINE GSVector8i lt32(const GSVector8i& v) const { return GSVector8i(_mm256_cmpgt_epi32(v.m, m)); } + + ALWAYS_INLINE GSVector8i le8(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector8i le16(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector8i le32(const GSVector8i& v) const { return ~GSVector8i(_mm256_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector8i andnot(const GSVector8i& v) const { return GSVector8i(_mm256_andnot_si256(v.m, m)); } + + ALWAYS_INLINE u32 mask() const { return static_cast<u32>(_mm256_movemask_epi8(m)); } + + ALWAYS_INLINE bool alltrue() const { return mask() == 0xFFFFFFFFu; } + + ALWAYS_INLINE bool allfalse() const { return _mm256_testz_si256(m, m) != 0; } + + template <s32 i> + ALWAYS_INLINE GSVector8i insert8(s32 a) const + { + return GSVector8i(_mm256_insert_epi8(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract8() const + { + return _mm256_extract_epi8(m, i); + } + + template <s32 i> + ALWAYS_INLINE GSVector8i insert16(s32 a) const + { + return GSVector8i(_mm256_insert_epi16(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract16() const + { + return _mm256_extract_epi16(m, i); + } + + template <s32 i> + ALWAYS_INLINE GSVector8i insert32(s32 a) const + { + return GSVector8i(_mm256_insert_epi32(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract32() const + { + return _mm256_extract_epi32(m, i); + } + + template <s32 i> + ALWAYS_INLINE GSVector8i insert64(s64 a) const + { + return GSVector8i(_mm256_insert_epi64(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s64 extract64() const + { + return _mm256_extract_epi64(m, i); + } + + ALWAYS_INLINE static GSVector8i zext32(s32 v) { return GSVector8i(_mm256_castsi128_si256(GSVector4i::zext32(v))); } + + ALWAYS_INLINE static GSVector8i loadnt(const void* p) + { + // Should be const, but isn't... + return GSVector8i(_mm256_stream_load_si256(const_cast<__m256i*>(static_cast<const __m256i*>(p)))); + } + + template <bool aligned> + ALWAYS_INLINE static GSVector8i load(const void* p) + { + return GSVector8i(aligned ?
_mm256_load_si256(static_cast<const __m256i*>(p)) : + _mm256_loadu_si256(static_cast<const __m256i*>(p))); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector8i& v) + { + _mm256_stream_si256(static_cast<__m256i*>(p), v.m); + } + + template <bool aligned> + ALWAYS_INLINE static void store(void* p, const GSVector8i& v) + { + if constexpr (aligned) + _mm256_store_si256(static_cast<__m256i*>(p), v.m); + else + _mm256_storeu_si256(static_cast<__m256i*>(p), v.m); + } + + template <bool aligned> + ALWAYS_INLINE static void storel(void* p, const GSVector8i& v) + { + if constexpr (aligned) + _mm_store_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m)); + else + _mm_storeu_si128(static_cast<__m128i*>(p), _mm256_castsi256_si128(v.m)); + } + + ALWAYS_INLINE GSVector8i& operator&=(const GSVector8i& v) + { + m = _mm256_and_si256(m, v); + return *this; + } + ALWAYS_INLINE GSVector8i& operator|=(const GSVector8i& v) + { + m = _mm256_or_si256(m, v); + return *this; + } + ALWAYS_INLINE GSVector8i& operator^=(const GSVector8i& v) + { + m = _mm256_xor_si256(m, v); + return *this; + } + + ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_and_si256(v1, v2)); + } + + ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_or_si256(v1, v2)); + } + + ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_xor_si256(v1, v2)); + } + + ALWAYS_INLINE friend GSVector8i operator&(const GSVector8i& v, s32 i) { return v & GSVector8i(i); } + ALWAYS_INLINE friend GSVector8i operator|(const GSVector8i& v, s32 i) { return v | GSVector8i(i); } + ALWAYS_INLINE friend GSVector8i operator^(const GSVector8i& v, s32 i) { return v ^ GSVector8i(i); } + ALWAYS_INLINE friend GSVector8i operator~(const GSVector8i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static GSVector8i zero() { return GSVector8i(_mm256_setzero_si256()); } + + ALWAYS_INLINE static GSVector8i broadcast128(const GSVector4i& v) + { + return GSVector8i(_mm256_broadcastsi128_si256(v.m)); + } + + template <bool aligned> + ALWAYS_INLINE static GSVector8i broadcast128(const void* v) + { + return broadcast128(GSVector4i::load<aligned>(v)); + } + + ALWAYS_INLINE GSVector4i low128() const { return GSVector4i(_mm256_castsi256_si128(m)); } + ALWAYS_INLINE GSVector4i high128() const { return GSVector4i(_mm256_extracti128_si256(m, 1)); } +}; + +#endif diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 750624573..c1d806508 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -2707,11 +2707,11 @@ void GPU_HW::LoadVertices() end_pos.bits = FifoPop(); } - const GSVector4i vstart_pos = GSVector4i(start_pos.x + m_drawing_offset.x, start_pos.y + m_drawing_offset.y); - const GSVector4i vend_pos = GSVector4i(end_pos.x + m_drawing_offset.x, end_pos.y + m_drawing_offset.y); - const GSVector4i bounds = vstart_pos.xyxy(vend_pos); - const GSVector4i rect = - vstart_pos.min_i32(vend_pos).xyxy(vstart_pos.max_i32(vend_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1)); + const GSVector2i vstart_pos = GSVector2i(start_pos.x + m_drawing_offset.x, start_pos.y + m_drawing_offset.y); + const GSVector2i vend_pos = GSVector2i(end_pos.x + m_drawing_offset.x, end_pos.y + m_drawing_offset.y); + const GSVector4i bounds = GSVector4i::xyxy(vstart_pos, vend_pos); + const GSVector4i rect = GSVector4i::xyxy(vstart_pos.min_i32(vend_pos), vstart_pos.max_i32(vend_pos)) + .add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_rect =
rect.rintersect(m_clamped_drawing_area); if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty()) @@ -2747,7 +2747,7 @@ void GPU_HW::LoadVertices() u32 buffer_pos = 0; const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]}; - GSVector4i start_pos = GSVector4i(start_vp.x + m_drawing_offset.x, start_vp.y + m_drawing_offset.y); + GSVector2i start_pos = GSVector2i(start_vp.x + m_drawing_offset.x, start_vp.y + m_drawing_offset.y); u32 start_color = rc.color_for_first_vertex; GPUBackendDrawLineCommand* cmd; @@ -2755,7 +2755,7 @@ void GPU_HW::LoadVertices() { cmd = m_sw_renderer->NewDrawLineCommand(num_vertices); FillDrawCommand(cmd, rc); - GSVector4i::storel(&cmd->vertices[0].x, start_pos); + GSVector2i::store(&cmd->vertices[0].x, start_pos); cmd->vertices[0].color = start_color; } else @@ -2767,10 +2767,10 @@ void GPU_HW::LoadVertices() { const u32 end_color = shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : start_color; const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]}; - const GSVector4i end_pos = GSVector4i(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y); - const GSVector4i bounds = start_pos.xyxy(end_pos); + const GSVector2i end_pos = GSVector2i(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y); + const GSVector4i bounds = GSVector4i::xyxy(start_pos, end_pos); const GSVector4i rect = - start_pos.min_i32(end_pos).xyxy(start_pos.max_i32(end_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1)); + GSVector4i::xyxy(start_pos.min_i32(end_pos), start_pos.max_i32(end_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area); if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty()) { @@ -2790,7 +2790,7 @@ void GPU_HW::LoadVertices() if (cmd) { - GSVector4i::storel(&cmd->vertices[i], end_pos); + GSVector2i::store(&cmd->vertices[i], end_pos); cmd->vertices[i].color = end_color; } } diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl index d434b49b9..ebfeb4efd 100644 --- a/src/core/gpu_sw_rasterizer.inl +++ b/src/core/gpu_sw_rasterizer.inl @@ -273,7 +273,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVecto std::memcpy(&p1, reinterpret_cast(g_vram) + o1, sizeof(p1)); std::memcpy(&p2, reinterpret_cast(g_vram) + o2, sizeof(p2)); std::memcpy(&p3, reinterpret_cast(g_vram) + o3, sizeof(p3)); - GSVector4i pixels = GSVector4i::load(p0); + GSVector4i pixels = GSVector4i::zext32(p0); pixels = pixels.insert16<2>(p1); pixels = pixels.insert16<4>(p2); pixels = pixels.insert16<6>(p3); @@ -295,7 +295,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices) std::memcpy(&p1, reinterpret_cast(g_gpu_clut) + o1, sizeof(p1)); std::memcpy(&p2, reinterpret_cast(g_gpu_clut) + o2, sizeof(p2)); std::memcpy(&p3, reinterpret_cast(g_gpu_clut) + o3, sizeof(p3)); - GSVector4i pixels = GSVector4i::load(p0); + GSVector4i pixels = GSVector4i::zext32(p0); pixels = pixels.insert16<2>(p1); pixels = pixels.insert16<4>(p2); pixels = pixels.insert16<6>(p3);
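For reference, a minimal usage sketch of the new 256-bit wrapper alongside the existing 128-bit path (illustrative only, not part of the patch). The set_mask_bits() helper and its parameters are hypothetical names; the sketch assumes AVX2 is available (GSVECTOR_HAS_256 defined) and the 16-bit VRAM pixel layout used by the software rasterizer, where bit 15 is the mask/STP bit.

// Illustrative sketch, not part of the patch: set the mask bit (bit 15) on a row
// of 16-bit pixels, 16 pixels per iteration via GSVector8i when available, then
// 8 pixels via GSVector4i, with a scalar tail. Helper name and arguments are
// hypothetical.
static void set_mask_bits(u16* row, u32 count)
{
  u32 i = 0;

#ifdef GSVECTOR_HAS_256
  static constexpr GSVector8i mask256 = GSVector8i::cxpr16(static_cast<s16>(0x8000));
  for (; (i + 16) <= count; i += 16)
  {
    const GSVector8i v = GSVector8i::load<false>(&row[i]); // unaligned 256-bit load
    GSVector8i::store<false>(&row[i], v | mask256);        // unaligned 256-bit store
  }
#endif

  static constexpr GSVector4i mask128 =
    GSVector4i::cxpr(static_cast<s32>(0x80008000u), static_cast<s32>(0x80008000u),
                     static_cast<s32>(0x80008000u), static_cast<s32>(0x80008000u));
  for (; (i + 8) <= count; i += 8)
  {
    const GSVector4i v = GSVector4i::load<false>(&row[i]); // 128-bit fallback
    GSVector4i::store<false>(&row[i], v | mask128);
  }

  for (; i < count; i++)
    row[i] |= 0x8000u; // scalar tail
}

When the 256-bit pattern comes from an existing GSVector4i rather than a constant, broadcast128() provides the equivalent splat, and low128()/high128() recover the two 128-bit halves for code that still has to drop back to GSVector4i operations.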