From 59a2309a83e6a64af37464c0b858e431be5f6d1d Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Thu, 11 Jul 2024 18:37:51 +1000
Subject: [PATCH] GPU: Use half width vector types where appropriate

---
 src/common/gsvector.h        |   37 +-
 src/common/gsvector_neon.h   | 1001 ++++++++++++++++++++++++++++++++--
 src/common/gsvector_nosimd.h |  912 +++++++++++++++++++++++++++++--
 src/common/gsvector_sse.h    |  806 +++++++++++++++++++++++++--
 src/core/gpu.h               |    6 +-
 src/core/gpu_hw.cpp          |   52 +-
 src/core/gpu_hw.h            |    2 +-
 src/core/gpu_sw.cpp          |   18 +-
 8 files changed, 2614 insertions(+), 220 deletions(-)

diff --git a/src/common/gsvector.h b/src/common/gsvector.h
index 04ff6f153..f19e92e04 100644
--- a/src/common/gsvector.h
+++ b/src/common/gsvector.h
@@ -1,45 +1,10 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
 
 #include "common/intrin.h"
 
-#include <cstring>
-
-template<class T>
-class GSVector2T
-{
-public:
-  union
-  {
-    struct
-    {
-      T x, y;
-    };
-    struct
-    {
-      T r, g;
-    };
-    struct
-    {
-      T v[2];
-    };
-  };
-
-  GSVector2T() = default;
-
-  ALWAYS_INLINE constexpr GSVector2T(T x) : x(x), y(x) {}
-  ALWAYS_INLINE constexpr GSVector2T(T x, T y) : x(x), y(y) {}
-  ALWAYS_INLINE constexpr bool operator==(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) == 0; }
-  ALWAYS_INLINE constexpr bool operator!=(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) != 0; }
-  ALWAYS_INLINE constexpr GSVector2T operator*(const GSVector2T& v) const { return {x * v.x, y * v.y}; }
-  ALWAYS_INLINE constexpr GSVector2T operator/(const GSVector2T& v) const { return {x / v.x, y / v.y}; }
-};
-
-using GSVector2 = GSVector2T<float>;
-using GSVector2i = GSVector2T<s32>;
-
 #if defined(CPU_ARCH_SSE)
 #include "common/gsvector_sse.h"
 #elif defined(CPU_ARCH_NEON)
diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h
index 6bfd89e2a..b37fbe751 100644
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@@ -1,10 +1,918 @@
 // SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
+#include "common/intrin.h"
+#include "common/types.h"
+
+#include <cstring>
+
 #define GSVECTOR_HAS_UNSIGNED 1
 #define GSVECTOR_HAS_SRLV 1
 
+class GSVector2;
+class GSVector2i;
 class GSVector4;
+class GSVector4i;
+
+class alignas(16) GSVector2i
+{
+  struct cxpr_init_tag
+  {
+  };
+  static constexpr cxpr_init_tag cxpr_init{};
+
+  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y} {}
+
+  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {}
+
+  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
+    : I8{b0, b1, b2, b3, b4, b5, b6, b7}
+  {
+  }
+
+public:
+  union
+  {
+    struct
+    {
+      s32 x, y;
+    };
+    struct
+    {
+      s32 r, g;
+    };
+    float F32[2];
+    s8 I8[8];
+    s16 I16[4];
+    s32 I32[2];
+    s64 I64[1];
+    u8 U8[8];
+    u16 U16[4];
+    u32 U32[2];
+    u64 U64[1];
+    int32x2_t v2s;
+  };
+
+  GSVector2i() = default;
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
+  {
+    return GSVector2i(cxpr_init, s0, s1, s2, s3);
+  }
+
+  ALWAYS_INLINE constexpr static GSVector2i
cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) + { + return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7); + } + + ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); } + + ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {} + + ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) + : I8{b0, b1, b2, b3, b4, b5, b6, b7} + { + } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; } + + ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {} + + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); + + ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); } + + ALWAYS_INLINE operator int32x2_t() const { return v2s; } + + ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const + { + return max_i32(min).min_i32(max); + } + + ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const + { + return max_u32(min).min_u32(max); + } + + ALWAYS_INLINE GSVector2i min_i8(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); } + + ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); } + + ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); + } + + ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const + { + return 
GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
+
+  ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
+
+  ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
+
+  ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
+
+  ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
+
+  ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
+
+  ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
+
+  ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
+
+  ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
+
+  ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
+  {
+    // duplicate the sign bit across each byte lane, then bit-select
+    uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
+    return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
+  }
+
+  template <s32 mask>
+  ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
+  {
+    static constexpr const uint16_t _mask[4] = {
+      ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
+      ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
+    return GSVector2i(
+      vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
+  }
+
+  template <s32 mask>
+  ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
+  {
+    constexpr int bit1 = ((mask & 2) * 3) << 1;
+    constexpr int bit0 = (mask & 1) * 3;
+    return blend16<bit1 | bit0>(v);
+  }
+
+  ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
+                                                  vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
+  }
+
+  ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
+
+  ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i ps16() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
+  }
+
+  ALWAYS_INLINE GSVector2i pu16() const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
+  }
+
+  ALWAYS_INLINE GSVector2i ps32() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
+  }
+
+  ALWAYS_INLINE GSVector2i pu32() const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
+  }
+
+  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
+
+  ALWAYS_INLINE GSVector2i upl8() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
+  }
+
+  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector2i i8to16() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
+  }
+
+  ALWAYS_INLINE GSVector2i u8to16() const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i srl() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i sll() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i sll16() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
+  }
+
+  ALWAYS_INLINE GSVector2i sll16(s32 i) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
+  }
+
+  ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i srl16() const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
+  }
+
+  ALWAYS_INLINE GSVector2i srl16(s32 i) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
+  }
+
+  ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
+  {
+    // logical shift right == unsigned shift left by negated amount
+    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i sra16() const
+  {
+    constexpr int count = (i & ~15) ? 15 : i;
+    return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
+  }
+
+  ALWAYS_INLINE GSVector2i sra16(s32 i) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
+  }
+
+  ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
+  {
+    // arithmetic shift right == signed shift left by negated amount
+    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i sll32() const
+  {
+    return GSVector2i(vshl_n_s32(v2s, i));
+  }
+
+  ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
+
+  ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i srl32() const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
+  }
+
+  ALWAYS_INLINE GSVector2i srl32(s32 i) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
+  }
+
+  ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i sra32() const
+  {
+    return GSVector2i(vshr_n_s32(v2s, i));
+  }
+
+  ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
+
+  ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const
+  {
+    return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
+
+  ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
+
+  ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
+
+  ALWAYS_INLINE bool eq(const GSVector2i& v) const
+  {
+    return (vmaxv_u32(vreinterpret_u32_s32(veor_s32(v2s, v.v2s))) == 0);
+  }
+
+  ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i eq64(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u64(vceq_s64(vreinterpret_s64_s32(v2s), vreinterpret_s64_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
+
+  ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
+
+  ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
+
+  ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s)));
+  }
+
+  ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
+
+  ALWAYS_INLINE int mask() const
+  {
+    // borrowed from sse2neon
+    const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
+    const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
+    const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
+    return static_cast<int>(vget_lane_u8(paired64, 0));
+  }
+
+  ALWAYS_INLINE bool alltrue() const
+  {
+    // MSB should be set in all 8-bit lanes.
+    return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
+  }
+
+  ALWAYS_INLINE bool allfalse() const
+  {
+    // MSB should be clear in all 8-bit lanes.
+    return (vmaxv_u8(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i insert8(int a) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(static_cast<u8>(a), vreinterpret_u8_s32(v2s), i)));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE int extract8() const
+  {
+    return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i insert16(int a) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(static_cast<u16>(a), vreinterpret_u16_s32(v2s), i)));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE int extract16() const
+  {
+    return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE GSVector2i insert32(int a) const
+  {
+    return GSVector2i(vset_lane_s32(a, v2s, i));
+  }
+
+  template <s32 i>
+  ALWAYS_INLINE int extract32() const
+  {
+    return vget_lane_s32(v2s, i);
+  }
+
+  ALWAYS_INLINE static GSVector2i load32(const void* p)
+  {
+    // should be ldr s0, [x0]
+    u32 val;
+    std::memcpy(&val, p, sizeof(u32));
+    return GSVector2i(vreinterpret_s32_u32(vset_lane_u32(val, vdup_n_u32(0), 0)));
+  }
+
+  ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }
+
+  ALWAYS_INLINE static GSVector2i load(int i) { return GSVector2i(vset_lane_s32(i, vdup_n_s32(0), 0)); }
+
+  ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
+  {
+    s32 val = vget_lane_s32(v.v2s, 0);
+    std::memcpy(p, &val, sizeof(s32));
+  }
+
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }
+
+  ALWAYS_INLINE static int store(const GSVector2i& v) { return vget_lane_s32(v.v2s, 0); }
+
+  ALWAYS_INLINE void operator&=(const GSVector2i& v)
+  {
+    v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
+  }
+
+  ALWAYS_INLINE void operator|=(const GSVector2i& v)
+  {
+    v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
+  }
+
+  ALWAYS_INLINE void operator^=(const GSVector2i& v)
+  {
+    v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
+  }
+
+  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
+  {
+    return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
+  }
+
+  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const
GSVector2i& v2) + { + return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2) + { + return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); } + + ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); } + + ALWAYS_INLINE GSVector2i xy() const { return *this; } + ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); } + ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); } + ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); } +}; + +class alignas(16) GSVector2 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} + + constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} + +public: + union + { + struct + { + float x, y; + }; + struct + { + float r, g; + }; + float F32[2]; + double F64[1]; + s8 I8[8]; + s16 I16[4]; + s32 I32[2]; + s64 I64[1]; + u8 U8[8]; + u16 U16[4]; + u32 U32[2]; + u64 U64[1]; + float32x2_t v2s; + }; + + GSVector2() = default; + + constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } + + constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {} + + ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {} + + ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {} + + ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); } + + ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); } + + ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); + + ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); + + ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); } + + ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; } + + ALWAYS_INLINE operator float32x2_t() const { return v2s; } + + ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); } + + ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); } + + ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); } + + ALWAYS_INLINE GSVector2 rcpnr() const + { + float32x2_t recip = vrecpe_f32(v2s); + recip = vmul_f32(recip, vrecps_f32(recip, v2s)); + return GSVector2(recip); + } + + ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } + + ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } + + ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); } + + ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), 
GSVector2(scale)); }
+
+  ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
+
+  ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
+
+  ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
+
+  template <int mask>
+  ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
+  {
+    return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
+  }
+
+  ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
+  {
+    // duplicate sign bit across and bit select
+    const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
+    return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
+  }
+
+  ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
+  {
+    return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
+  }
+
+  ALWAYS_INLINE int mask() const
+  {
+    const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
+    return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
+  }
+
+  ALWAYS_INLINE bool alltrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0xFFFFFFFFFFFFFFFFULL); }
+
+  ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0); }
+
+  ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
+
+  template <int dst, int src>
+  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
+  {
+    return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
+  }
+
+  template <int i>
+  ALWAYS_INLINE int extract32() const
+  {
+    return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
+  }
+
+  ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
+
+  ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
+
+  ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(vset_lane_f32(f, vmov_n_f32(0.0f), 0)); }
+
+  ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32((const float*)p)); }
+
+  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32((float*)p, v.v2s); }
+
+  ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
+
+  ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
+  ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
+  ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
+  ALWAYS_INLINE void operator/=(const GSVector2& v) { v2s = vdiv_f32(v2s, v.v2s); }
+
+  ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
+  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
+  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
+  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
+
+  ALWAYS_INLINE void operator&=(const GSVector2& v)
+  {
+    v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
+  }
+
+  ALWAYS_INLINE void operator|=(const GSVector2& v)
+  {
+    v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
+  }
+
+  ALWAYS_INLINE void operator^=(const GSVector2& v)
+  {
+    v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
+  }
+
+  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
+  {
+    return GSVector2(vadd_f32(v1.v2s, v2.v2s));
+  }
+
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vsub_f32(v1.v2s, v2.v2s)); + } + + ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vmul_f32(v1.v2s, v2.v2s)); + } + + ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vdiv_f32(v1.v2s, v2.v2s)); + } + + ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); } + ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); } + ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); } + ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); } + + ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s))); + } + + ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2) + { + // NEON has no != + return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s)))); + } + + ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s))); + } + + ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s))); + } + + ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s))); + } + + ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s))); + } + + ALWAYS_INLINE GSVector2 xy() const { return *this; } + ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); } + ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); } + ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); } +}; class alignas(16) GSVector4i { @@ -13,20 +921,16 @@ class alignas(16) GSVector4i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector4i(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} - constexpr GSVector4i(cxpr_init_tag, short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) : I16{s0, s1, s2, s3, s4, s5, s6, s7} { } - constexpr GSVector4i(cxpr_init_tag, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, - char b9, char b10, char b11, char b12, char b13, 
char b14, char b15) -#if !defined(__APPLE__) && !defined(_MSC_VER) - : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -#else + constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -#endif { } @@ -79,7 +983,7 @@ public: return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); } - ALWAYS_INLINE GSVector4i(int x, int y, int z, int w) + ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { GSVector4i xz = load(x).upl32(load(z)); GSVector4i yw = load(y).upl32(load(w)); @@ -87,31 +991,27 @@ public: *this = xz.upl32(yw); } - ALWAYS_INLINE GSVector4i(int x, int y) { *this = load(x).upl32(load(y)); } + ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) : I16{s0, s1, s2, s3, s4, s5, s6, s7} { } - constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, - char b10, char b11, char b12, char b13, char b14, char b15) -#if !defined(__APPLE__) && !defined(_MSC_VER) - : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -#else + constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, + s8 b13, s8 b14, s8 b15) : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -#endif { } - ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { v4s = vcombine_s32(vld1_s32(v.v), vcreate_s32(0)); } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; } + ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {} ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {} + ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true); ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); @@ -281,8 +1181,6 @@ public: ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); } - ALWAYS_INLINE static int min_i16(int a, int b) { return store(load(a).min_i16(load(b))); } - ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const @@ -997,7 +1895,7 @@ public: vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p)))); } - ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); } ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph) { @@ -1102,19 +2000,9 @@ public: ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } - ALWAYS_INLINE GSVector2i xy() const - { - GSVector2i ret; - storel(&ret, *this); - return ret; - } + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); } - ALWAYS_INLINE GSVector2i zw() const - { - GSVector2i ret; - storeh(&ret, *this); - return ret; - } + 
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); } // clang-format off @@ -1122,11 +2010,6 @@ public: #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } - // ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} - // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} - // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} - // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} - #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ @@ -1150,19 +2033,6 @@ public: VECTOR4i_SHUFFLE_1(z, 2) VECTOR4i_SHUFFLE_1(w, 3) - // TODO: Make generic like above. - ALWAYS_INLINE GSVector4i xxzzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 2, 2, 4, 4, 6, 6))); } - ALWAYS_INLINE GSVector4i yywwlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 1, 3, 3, 5, 5, 7, 7))); } - ALWAYS_INLINE GSVector4i yxwzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 5, 4, 7, 6))); } - ALWAYS_INLINE GSVector4i xxxxlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 4, 4, 4))); } - - ALWAYS_INLINE GSVector4i xxxxl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 5, 6, 7))); } - ALWAYS_INLINE GSVector4i zwxyl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 0, 1, 4, 5, 6, 7))); } - ALWAYS_INLINE GSVector4i yxwzl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 4, 5, 6, 7))); } - ALWAYS_INLINE GSVector4i zwzwl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 2, 3, 4, 5, 6, 7))); } - - ALWAYS_INLINE GSVector4i zzzzh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 6, 6, 6, 6))); } - // clang-format on }; @@ -1243,12 +2113,9 @@ public: v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0))); } - ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0)); } + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); } - ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) - { - v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0))); - } + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); } ALWAYS_INLINE constexpr explicit 
GSVector4(float32x4_t m) : v4s(m) {}
@@ -1641,6 +2508,26 @@ public:
   }
 };
 
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+{
+  v2s = truncate ? vcvt_s32_f32(v.v2s) : vreinterpret_s32_u32(vcvtn_u32_f32(v.v2s));
+}
+
+ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
+{
+  v2s = vcvt_f32_s32(v.v2s);
+}
+
+ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
+{
+  return GSVector2i(vreinterpret_s32_f32(v.v2s));
+}
+
+ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
+{
+  return GSVector2(vreinterpret_f32_s32(v.v2s));
+}
+
 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
 {
   v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h
index bf2a9a0c9..3c6d73f81 100644
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@@ -5,7 +5,6 @@
 
 #pragma once
 
-#include "common/assert.h"
 #include "common/types.h"
 
 #include <algorithm>
@@ -15,7 +14,820 @@
 #define GSVECTOR_HAS_UNSIGNED 1
 #define GSVECTOR_HAS_SRLV 1
 
+class GSVector2;
+class GSVector2i;
 class GSVector4;
+class GSVector4i;
+
+#define SSATURATE8(expr) static_cast<s8>(std::clamp(expr, -128, 127))
+#define USATURATE8(expr) static_cast<u8>(std::clamp(expr, 0, 255))
+#define SSATURATE16(expr) static_cast<s16>(std::clamp(expr, -32768, 32767))
+#define USATURATE16(expr) static_cast<u16>(std::clamp(expr, 0, 65535))
+
+#define ALL_LANES_8(expr) \
+  GSVector2i ret; \
+  for (size_t i = 0; i < 8; i++) \
+    expr; \
+  return ret;
+#define ALL_LANES_16(expr) \
+  GSVector2i ret; \
+  for (size_t i = 0; i < 4; i++) \
+    expr; \
+  return ret;
+#define ALL_LANES_32(expr) \
+  GSVector2i ret; \
+  for (size_t i = 0; i < 2; i++) \
+    expr; \
+  return ret;
+
+class alignas(16) GSVector2i
+{
+  struct cxpr_init_tag
+  {
+  };
+  static constexpr cxpr_init_tag cxpr_init{};
+
+  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y} {}
+
+  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {}
+
+  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
+    : I8{b0, b1, b2, b3, b4, b5, b6, b7}
+  {
+  }
+
+public:
+  union
+  {
+    struct
+    {
+      s32 x, y;
+    };
+    struct
+    {
+      s32 r, g;
+    };
+    float F32[2];
+    s8 I8[8];
+    s16 I16[4];
+    s32 I32[2];
+    s64 I64[1];
+    u8 U8[8];
+    u16 U16[4];
+    u32 U32[2];
+    u64 U64[1];
+  };
+
+  GSVector2i() = default;
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
+  {
+    return GSVector2i(cxpr_init, s0, s1, s2, s3);
+  }
+
+  ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
+  {
+    return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
+  }
+
+  ALWAYS_INLINE GSVector2i(s32 x, s32 y)
+  {
+    this->x = x;
+    this->y = y;
+  }
+
+  ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3)
+  {
+    I16[0] = s0;
+    I16[1] = s1;
+    I16[2] = s2;
+    I16[3] = s3;
+  }
+
+  ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
+    : I8{b0, b1, b2, b3, b4, b5, b6, b7}
+  {
+  }
+
+  ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }
+
+  // MSVC has bad codegen for the constexpr version when applied to non-constexpr things
(https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); + + ALWAYS_INLINE void operator=(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE void operator=(s32 i) + { + x = i; + y = i; + } + + ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const + { + return max_i32(min).min_i32(max); + } + + ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const + { + return max_u32(min).min_u32(max); + } + + GSVector2i min_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); } + GSVector2i max_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); } + GSVector2i min_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); } + GSVector2i max_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); } + GSVector2i min_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); } + GSVector2i max_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); } + + GSVector2i min_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); } + GSVector2i max_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); } + GSVector2i min_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); } + GSVector2i max_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); } + GSVector2i min_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); } + GSVector2i max_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); } + + u8 minv_u8() const + { + return std::min( + U8[0], + std::min(U8[1], std::min(U8[2], std::min(U8[3], std::min(U8[4], std::min(U8[5], std::min(U8[6], U8[7]))))))); + } + + u16 maxv_u8() const + { + return std::max( + U8[0], + std::max(U8[1], std::max(U8[2], std::max(U8[3], std::max(U8[4], std::max(U8[5], std::max(U8[6], U8[7]))))))); + } + + u16 minv_u16() const { return std::min(U16[0], std::min(U16[1], std::min(U16[2], U16[3]))); } + + u16 maxv_u16() const { return std::max(U16[0], std::max(U16[1], std::max(U16[2], U16[3]))); } + + s32 minv_s32() const { return std::min(x, y); } + + u32 minv_u32() const { return std::min(U32[0], U32[1]); } + + s32 maxv_s32() const { return std::max(x, y); } + + u32 maxv_u32() const { return std::max(U32[0], U32[1]); } + + ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } + + GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const + { + GSVector2i ret; + for (size_t i = 0; i < 8; i++) + ret.U8[i] = (mask.U8[i] & 0x80) ? 
v.U8[i] : U8[i];
+    return ret;
+  }
+
+  template <s32 mask>
+  GSVector2i blend16(const GSVector2i& v) const
+  {
+    GSVector2i ret;
+    for (size_t i = 0; i < 4; i++)
+      ret.U16[i] = ((mask & (1 << i)) != 0) ? v.U16[i] : U16[i];
+    return ret;
+  }
+
+  template <s32 mask>
+  GSVector2i blend32(const GSVector2i& v) const
+  {
+    GSVector2i ret;
+    for (size_t i = 0; i < 2; i++)
+      ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i];
+    return ret;
+  }
+
+  GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
+  {
+    GSVector2i ret;
+    ret.U64[0] = (U64[0] & ~mask.U64[0]) | (v.U64[0] & mask.U64[0]);
+    return ret;
+  }
+
+  ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
+
+  GSVector2i shuffle8(const GSVector2i& mask) const
+  {
+    ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0x7]));
+  }
+
+  GSVector2i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 4) ? i : (i - 4)])); }
+  GSVector2i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 4) ? i : (i - 4)])); }
+  GSVector2i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 2) ? i : (i - 2)])); }
+  GSVector2i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 2) ? i : (i - 2)])); }
+
+  GSVector2i upl8() const { return GSVector2i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0); }
+
+  GSVector2i upl16() const { return GSVector2i(I16[0], 0, I16[1], 0); }
+
+  GSVector2i upl32() const { return GSVector2i(I32[0], 0); }
+
+  GSVector2i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); }
+
+  template <s32 v>
+  GSVector2i srl() const
+  {
+    GSVector2i ret = {};
+    if constexpr (v < 8)
+    {
+      for (s32 i = 0; i < (8 - v); i++)
+        ret.U8[i] = U8[v + i];
+    }
+    return ret;
+  }
+
+  template <s32 v>
+  GSVector2i sll() const
+  {
+    GSVector2i ret = {};
+    if constexpr (v < 8)
+    {
+      for (s32 i = 0; i < (8 - v); i++)
+        ret.U8[v + i] = U8[i];
+    }
+    return ret;
+  }
+
+  template <s32 v>
+  GSVector2i sll16() const
+  {
+    ALL_LANES_16(ret.U16[i] = U16[i] << v);
+  }
+
+  GSVector2i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); }
+
+  GSVector2i sllv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); }
+
+  template <s32 v>
+  GSVector2i srl16() const
+  {
+    ALL_LANES_16(ret.U16[i] = U16[i] >> v);
+  }
+
+  GSVector2i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); }
+
+  GSVector2i srlv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); }
+
+  template <s32 v>
+  GSVector2i sra16() const
+  {
+    ALL_LANES_16(ret.I16[i] = I16[i] >> v);
+  }
+
+  GSVector2i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); }
+
+  GSVector2i srav16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); }
+
+  template <s32 v>
+  GSVector2i sll32() const
+  {
+    ALL_LANES_32(ret.U32[i] = U32[i] << v);
+  }
+
+  GSVector2i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); }
+
+  GSVector2i sllv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); }
+
+  template <s32 v>
+  GSVector2i srl32() const
+  {
+    ALL_LANES_32(ret.U32[i] = U32[i] >> v);
+  }
+
+  GSVector2i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); }
+
+  GSVector2i srlv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); }
+
+  template <s32 v>
+  GSVector2i sra32() const
+  {
+    ALL_LANES_32(ret.I32[i] = I32[i] >> v);
+  }
+
+  GSVector2i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); }
+
+  GSVector2i srav32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); }
+
+  GSVector2i add8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); }
+
GSVector2i add16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); } + + GSVector2i add32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); } + + GSVector2i adds8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); } + + GSVector2i adds16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); } + + GSVector2i addus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); } + + GSVector2i addus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); } + + GSVector2i sub8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); } + + GSVector2i sub16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); } + + GSVector2i sub32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); } + + GSVector2i subs8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); } + + GSVector2i subs16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); } + + GSVector2i subus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); } + + GSVector2i subus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); } + + GSVector2i avg8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); } + + GSVector2i avg16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } + + GSVector2i mul16l(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); } + + GSVector2i mul32l(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); } + + ALWAYS_INLINE bool eq(const GSVector2i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; } + + GSVector2i eq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); } + GSVector2i eq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); } + GSVector2i eq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); } + + GSVector2i neq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); } + GSVector2i neq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); } + GSVector2i neq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); } + + GSVector2i gt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); } + GSVector2i gt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); } + GSVector2i gt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); } + + GSVector2i ge8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); } + GSVector2i ge16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); } + GSVector2i ge32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); } + + GSVector2i lt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); } + GSVector2i lt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); } + GSVector2i lt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); } + + GSVector2i le8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? 
-1 : 0); } + GSVector2i le16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); } + GSVector2i le32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? -1 : 0); } + + ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const + { + GSVector2i ret; + ret.U64[0] = (~v.U64[0]) & U64[0]; + return ret; + } + + s32 mask() const + { + return static_cast((static_cast(U8[0] >> 7) << 0) | (static_cast(U8[1] >> 7) << 1) | + (static_cast(U8[2] >> 7) << 2) | (static_cast(U8[3] >> 7) << 3) | + (static_cast(U8[4] >> 7) << 4) | (static_cast(U8[5] >> 7) << 5) | + (static_cast(U8[6] >> 7) << 6) | (static_cast(U8[7] >> 7) << 7)); + } + + ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); } + + template + ALWAYS_INLINE GSVector2i insert8(s32 a) const + { + GSVector2i ret = *this; + ret.I8[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract8() const + { + return I8[i]; + } + + template + ALWAYS_INLINE GSVector2i insert16(s32 a) const + { + GSVector2i ret = *this; + ret.I16[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract16() const + { + return I16[i]; + } + + template + ALWAYS_INLINE GSVector2i insert32(s32 a) const + { + GSVector2i ret = *this; + ret.I32[i] = a; + return ret; + } + + template + ALWAYS_INLINE s32 extract32() const + { + return I32[i]; + } + + ALWAYS_INLINE static GSVector2i load32(const void* p) + { + GSVector2i ret; + std::memcpy(&ret.x, p, sizeof(s32)); + ret.y = 0; + return ret; + } + + ALWAYS_INLINE static GSVector2i load(const void* p) + { + GSVector2i ret; + std::memcpy(ret.I32, p, sizeof(ret.I32)); + return ret; + } + + ALWAYS_INLINE static GSVector2i load(s32 i) + { + GSVector2i ret; + ret.x = i; + return ret; + } + + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.I32, sizeof(I32)); } + + ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); } + + ALWAYS_INLINE static s32 store(const GSVector2i& v) { return v.x; } + + ALWAYS_INLINE void operator&=(const GSVector2i& v) { U64[0] &= v.U64[0]; } + ALWAYS_INLINE void operator|=(const GSVector2i& v) { U64[0] |= v.U64[0]; } + ALWAYS_INLINE void operator^=(const GSVector2i& v) { U64[0] ^= v.U64[0]; } + + ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) + { + GSVector2i ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2) + { + GSVector2i ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2) + { + GSVector2i ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static constexpr GSVector2i zero() { return GSVector2i::cxpr(0, 0); } + + ALWAYS_INLINE GSVector2i xy() const { return *this; } + ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(x, x); } + ALWAYS_INLINE GSVector2i yx() const { return 
GSVector2i(y, x); } + ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(y, y); } +}; + +class alignas(16) GSVector2 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} + + constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} + +public: + union + { + struct + { + float x, y; + }; + struct + { + float r, g; + }; + float F32[2]; + double F64[1]; + s8 I8[8]; + s16 I16[4]; + s32 I32[2]; + s64 I64[1]; + u8 U8[8]; + u16 U16[4]; + u32 U32[2]; + u64 U64[1]; + }; + + GSVector2() = default; + + constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } + + constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector2(float x, float y) + { + this->x = x; + this->y = y; + } + + ALWAYS_INLINE GSVector2(int x, int y) + { + this->x = static_cast<float>(x); + this->y = static_cast<float>(y); + } + + ALWAYS_INLINE explicit GSVector2(float f) { x = y = f; } + + ALWAYS_INLINE explicit GSVector2(int i) { x = y = static_cast<float>(i); } + + ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); + + ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); + + ALWAYS_INLINE void operator=(float f) { x = y = f; } + + GSVector2 abs() const { return GSVector2(std::fabs(x), std::fabs(y)); } + + GSVector2 neg() const { return GSVector2(-x, -y); } + + GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); } + + GSVector2 rcpnr() const + { + GSVector2 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); } + + GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); } + + GSVector2 sat(const GSVector2& min, const GSVector2& max) const + { + return GSVector2(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y)); + } + + GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); } + + GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); } + + GSVector2 min(const GSVector2& v) const { return GSVector2(std::min(x, v.x), std::min(y, v.y)); } + + GSVector2 max(const GSVector2& v) const { return GSVector2(std::max(x, v.x), std::max(y, v.y)); } + + template <int mask> + GSVector2 blend32(const GSVector2& v) const + { + return GSVector2((mask & 1) ? v.x : x, (mask & 2) ? v.y : y); + } + + ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const + { + return GSVector2((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? 
v.y : y); + } + + ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const + { + GSVector2 ret; + ret.U32[0] = ((~v.U32[0]) & U32[0]); + ret.U32[1] = ((~v.U32[1]) & U32[1]); + return ret; + } + + ALWAYS_INLINE int mask() const { return (U32[0] >> 31) | ((U32[1] >> 30) & 2); } + + ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); } + + ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); } + + template <int dst, int src> + ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const + { + GSVector2 ret = *this; + ret.F32[dst] = v.F32[src]; + return ret; + } + + template <int i> + ALWAYS_INLINE int extract32() const + { + return I32[i]; + } + + ALWAYS_INLINE static constexpr GSVector2 zero() { return GSVector2::cxpr(0.0f, 0.0f); } + + ALWAYS_INLINE static GSVector2 xffffffff() + { + GSVector2 ret = zero(); + ret.U64[0] = ~ret.U64[0]; + return ret; + } + + ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(f, f); } + + ALWAYS_INLINE static GSVector2 load(const void* p) + { + GSVector2 ret; + std::memcpy(ret.F32, p, sizeof(F32)); + return ret; + } + + ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); } + + ALWAYS_INLINE GSVector2 operator-() const { return neg(); } + + void operator+=(const GSVector2& v_) + { + x = x + v_.x; + y = y + v_.y; + } + void operator-=(const GSVector2& v_) + { + x = x - v_.x; + y = y - v_.y; + } + void operator*=(const GSVector2& v_) + { + x = x * v_.x; + y = y * v_.y; + } + void operator/=(const GSVector2& v_) + { + x = x / v_.x; + y = y / v_.y; + } + + void operator+=(const float v_) + { + x = x + v_; + y = y + v_; + } + void operator-=(const float v_) + { + x = x - v_; + y = y - v_; + } + void operator*=(const float v_) + { + x = x * v_; + y = y * v_; + } + void operator/=(const float v_) + { + x = x / v_; + y = y / v_; + } + + void operator&=(const GSVector2& v_) { U64[0] &= v_.U64[0]; } + void operator|=(const GSVector2& v_) { U64[0] |= v_.U64[0]; } + void operator^=(const GSVector2& v_) { U64[0] ^= v_.U64[0]; } + + friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x + v2.x, v1.y + v2.y); } + + friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x - v2.x, v1.y - v2.y); } + + friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x * v2.x, v1.y * v2.y); } + + friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x / v2.x, v1.y / v2.y); } + + friend GSVector2 operator+(const GSVector2& v, float f) { return GSVector2(v.x + f, v.y + f); } + + friend GSVector2 operator-(const GSVector2& v, float f) { return GSVector2(v.x - f, v.y - f); } + + friend GSVector2 operator*(const GSVector2& v, float f) { return GSVector2(v.x * f, v.y * f); } + + friend GSVector2 operator/(const GSVector2& v, float f) { return GSVector2(v.x / f, v.y / f); } + + friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + return ret; + } + + ALWAYS_INLINE friend 
GSVector2 operator==(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x == v2.x) ? -1 : 0; + ret.I32[1] = (v1.y == v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x != v2.x) ? -1 : 0; + ret.I32[1] = (v1.y != v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x > v2.x) ? -1 : 0; + ret.I32[1] = (v1.y > v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x < v2.x) ? -1 : 0; + ret.I32[1] = (v1.y < v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x >= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y >= v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2) + { + GSVector2 ret; + ret.I32[0] = (v1.x <= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y <= v2.y) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE GSVector2 xy() const { return *this; } + ALWAYS_INLINE GSVector2 xx() const { return GSVector2(x, x); } + ALWAYS_INLINE GSVector2 yx() const { return GSVector2(y, x); } + ALWAYS_INLINE GSVector2 yy() const { return GSVector2(y, y); } +}; + +#undef ALL_LANES_8 +#undef ALL_LANES_16 +#undef ALL_LANES_32 #define ALL_LANES_8(expr) \ GSVector4i ret; \ @@ -37,10 +849,6 @@ class GSVector4; for (size_t i = 0; i < 2; i++) \ expr; \ return ret; -#define SSATURATE8(expr) static_cast(std::clamp(expr, -128, 127)) -#define USATURATE8(expr) static_cast(std::clamp(expr, 0, 255)) -#define SSATURATE16(expr) static_cast(std::clamp(expr, -32768, 32767)) -#define USATURATE16(expr) static_cast(std::clamp(expr, 0, 65535)) class alignas(16) GSVector4i { @@ -139,14 +947,7 @@ public: } ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } - - ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) - { - x = v.x; - y = v.y; - z = 0; - w = 0; - } + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : I32{v.I32[0], v.I32[1], 0, 0} {} // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), // so leave the non-constexpr version default @@ -374,7 +1175,7 @@ public: { GSVector4i ret; for (size_t i = 0; i < 2; i++) - ret.U64[0] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]); + ret.U64[i] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]); return ret; } @@ -385,14 +1186,20 @@ public: ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf])); } - GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i])); } - GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[i])); } - GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE16((i < 8) ? U16[i] : v.U16[i])); } - GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[i])); } - GSVector4i ps32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 8) ? I32[i] : v.I32[i])); } - GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE8(I32[i])); } - GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 8) ? 
U32[i] : v.U32[i])); } - GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE8(U32[i])); } + GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i - 8])); } + GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 8) ? i : (i - 8)])); } + GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8((i < 8) ? U16[i] : v.U16[i - 8])); } + GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 8) ? i : (i - 8)])); } + GSVector4i ps32(const GSVector4i& v) const + { + ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 4) ? I32[i] : v.I32[i - 4])); + } + GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 4) ? i : (i - 4)])); } + GSVector4i pu32(const GSVector4i& v) const + { + ALL_LANES_16(ret.U16[i] = USATURATE16((i < 4) ? U32[i] : v.U32[i - 4])); + } + GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 4) ? i : (i - 4)])); } GSVector4i upl8(const GSVector4i& v) const { @@ -930,19 +1737,8 @@ public: ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } - ALWAYS_INLINE GSVector2i xy() const - { - GSVector2i ret; - storel(&ret, *this); - return ret; - } - - ALWAYS_INLINE GSVector2i zw() const - { - GSVector2i ret; - storeh(&ret, *this); - return ret; - } + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); } + ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); } // clang-format off // l/h/lh not implemented until needed @@ -1062,26 +1858,11 @@ public: this->w = 0.0f; } - ALWAYS_INLINE explicit GSVector4(const GSVector2& v) - { - x = v.x; - y = v.y; - z = 0.0f; - w = 0.0f; - } - - ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) - { - x = static_cast(v.x); - y = static_cast(v.y); - z = 0.0f; - w = 0.0f; - } - ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; } ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast(i); } + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : x(v.x), y(v.y), z(0.0f), w(0.0f) {} ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); @@ -1298,7 +2079,7 @@ public: template ALWAYS_INLINE static void store(void* p, const GSVector4& v) { - std::memcpy(p, &v.x, sizeof(float)); + std::memcpy(p, v.F32, sizeof(F32)); } ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; } @@ -1589,6 +2370,33 @@ public: } }; +ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate) +{ + // TODO: Truncation vs rounding... + x = static_cast(v.x); + y = static_cast(v.y); +} + +ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) +{ + x = static_cast(v.x); + y = static_cast(v.y); +} + +ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v) +{ + GSVector2i ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + +ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) +{ + GSVector2 ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) { // TODO: Truncation vs rounding... 
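A minimal usage sketch of the new half-width integer type (illustrative only; clamp_point and the 1024x512 bounds are assumptions, not identifiers from this patch):

  // Clamp a 2D point to an inclusive bounding box with the half-width type;
  // sat_i32() expands to max_i32(min).min_i32(max) on every backend.
  static GSVector2i clamp_point(const GSVector2i& p)
  {
    const GSVector2i bounds_min = GSVector2i::cxpr(0, 0);
    const GSVector2i bounds_max = GSVector2i::cxpr(1023, 511); // assumed VRAM-sized bounds
    return p.sat_i32(bounds_min, bounds_max);
  }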
diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 213c4b92f..99a48e705 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -3,7 +3,6 @@ #pragma once -#include "common/assert.h" #include "common/intrin.h" #include "common/types.h" @@ -14,7 +13,740 @@ #define GSVECTOR_HAS_SRLV 1 #endif +class GSVector2; +class GSVector2i; class GSVector4; +class GSVector4i; + +class alignas(16) GSVector2i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y, 0, 0} {} + + constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3, 0, 0, 0, 0} {} + + constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} + { + } + +public: + union + { + struct + { + s32 x, y; + }; + struct + { + s32 r, g; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128i m; + }; + + GSVector2i() = default; + + ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); } + + ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); } + + ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3) + { + return GSVector2i(cxpr_init, s0, s1, s2, s3); + } + + ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) + { + return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7); + } + + ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); } + + ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } + + ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} + { + } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); + + ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {} + + ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } + ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + + ALWAYS_INLINE operator __m128i() const { return m; } + + ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const + { + return max_i32(min).min_i32(max); + } + + ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const + { + return max_u32(min).min_u32(max); + } + + ALWAYS_INLINE GSVector2i min_i8(const 
GSVector2i& v) const { return GSVector2i(_mm_min_epi8(m, v)); } + ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const { return GSVector2i(_mm_max_epi8(m, v)); } + ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); } + ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); } + ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(_mm_min_epi32(m, v)); } + ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(_mm_max_epi32(m, v)); } + + ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); } + ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); } + ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(_mm_min_epu16(m, v)); } + ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(_mm_max_epu16(m, v)); } + ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } + ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } + + ALWAYS_INLINE u8 minv_u8() const + { + __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); + return static_cast<u8>(std::min( + static_cast<u32>(_mm_extract_epi8(vmin, 0)), + std::min(static_cast<u32>(_mm_extract_epi8(vmin, 1)), + std::min(static_cast<u32>(_mm_extract_epi8(vmin, 2)), static_cast<u32>(_mm_extract_epi8(vmin, 3)))))); + } + + ALWAYS_INLINE u8 maxv_u8() const + { + __m128i vmax = _mm_max_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); + return static_cast<u8>(std::max( + static_cast<u32>(_mm_extract_epi8(vmax, 0)), + std::max(static_cast<u32>(_mm_extract_epi8(vmax, 1)), + std::max(static_cast<u32>(_mm_extract_epi8(vmax, 2)), static_cast<u32>(_mm_extract_epi8(vmax, 3)))))); + } + + ALWAYS_INLINE u16 minv_u16() const + { + __m128i vmin = _mm_min_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); + return static_cast<u16>( + std::min(static_cast<u32>(_mm_extract_epi16(vmin, 0)), static_cast<u32>(_mm_extract_epi16(vmin, 1)))); + } + + ALWAYS_INLINE u16 maxv_u16() const + { + __m128i vmax = _mm_max_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); + return static_cast<u16>( + std::max(static_cast<u32>(_mm_extract_epi16(vmax, 0)), static_cast<u32>(_mm_extract_epi16(vmax, 1)))); + } + + ALWAYS_INLINE s32 minv_s32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } + + ALWAYS_INLINE u32 minv_u32() const { return std::min(static_cast<u32>(_mm_extract_epi32(m, 0)), static_cast<u32>(_mm_extract_epi32(m, 1))); } + + ALWAYS_INLINE s32 maxv_s32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } + + ALWAYS_INLINE u32 maxv_u32() const { return std::max(static_cast<u32>(_mm_extract_epi32(m, 0)), static_cast<u32>(_mm_extract_epi32(m, 1))); } + + ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const + { + return GSVector2i(_mm_blendv_epi8(m, v, mask)); + } + + template <s32 mask> + ALWAYS_INLINE GSVector2i blend16(const GSVector2i& v) const + { + return GSVector2i(_mm_blend_epi16(m, v, mask)); + } + + template <s32 mask> + ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const + { +#if defined(__AVX2__) + return GSVector2i(_mm_blend_epi32(m, v.m, mask)); +#else + constexpr s32 bit1 = ((mask & 2) * 3) << 1; + constexpr s32 bit0 = (mask & 1) * 3; + return blend16<bit1 | bit0>(v); +#endif + } + + ALWAYS_INLINE GSVector2i blend(const GSVector2i& 
mask) const + { + return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); + } + + ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } + + ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); } + + ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); } + ALWAYS_INLINE GSVector2i pu16() const { return GSVector2i(_mm_packus_epi16(m, m)); } + ALWAYS_INLINE GSVector2i ps32() const { return GSVector2i(_mm_packs_epi32(m, m)); } + ALWAYS_INLINE GSVector2i pu32() const { return GSVector2i(_mm_packus_epi32(m, m)); } + + ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi8(m, v)); } + ALWAYS_INLINE GSVector2i uph8(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi8(m, v)); } + ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi16(m, v)); } + ALWAYS_INLINE GSVector2i uph16(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi16(m, v)); } + ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi32(m, v)); } + ALWAYS_INLINE GSVector2i uph32(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi32(m, v)); } + + ALWAYS_INLINE GSVector2i upl8() const { return GSVector2i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector2i uph8() const { return GSVector2i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector2i upl16() const { return GSVector2i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector2i uph16() const { return GSVector2i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector2i uph32() const { return GSVector2i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector2i i8to16() const { return GSVector2i(_mm_cvtepi8_epi16(m)); } + +#ifdef CPU_ARCH_SSE41 + ALWAYS_INLINE GSVector2i u8to16() const { return GSVector2i(_mm_cvtepu8_epi16(m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i srl() const + { + return GSVector2i(_mm_srli_si128(m, i)); + } + + template <s32 i> + ALWAYS_INLINE GSVector2i sll() const + { + return GSVector2i(_mm_slli_si128(m, i)); + } + + template <s32 i> + ALWAYS_INLINE GSVector2i sll16() const + { + return GSVector2i(_mm_slli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector2i sll16(s32 i) const { return GSVector2i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi16(m, v.m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i srl16() const + { + return GSVector2i(_mm_srli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector2i srl16(s32 i) const { return GSVector2i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi16(m, v.m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i sra16() const + { + return GSVector2i(_mm_srai_epi16(m, i)); + } + + ALWAYS_INLINE GSVector2i sra16(s32 i) const { return GSVector2i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi16(m, v.m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i sll32() const + { + return GSVector2i(_mm_slli_epi32(m, i)); + } + 
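+  // Note: the templated shift overloads encode the amount as an immediate via
+  // _mm_slli_*/_mm_srli_*/_mm_srai_*, while the s32 overloads move a runtime
+  // count into the low qword with _mm_cvtsi32_si128() and use the
+  // _mm_sll_*/_mm_srl_*/_mm_sra_* forms.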
+ ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi32(m, v.m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i srl32() const + { + return GSVector2i(_mm_srli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector2i srl32(s32 i) const { return GSVector2i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi32(m, v.m)); } +#endif + + template <s32 i> + ALWAYS_INLINE GSVector2i sra32() const + { + return GSVector2i(_mm_srai_epi32(m, i)); + } + + ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi32(m, v.m)); } +#endif + + ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); } + + ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); } + + ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi8(m, v.m)); } + ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi16(m, v.m)); } + ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); } + ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); } + ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return 
~eq32(v); } + + ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(_mm_andnot_si128(v.m, m)); } + + ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); } + + ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); } + + ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); } + + template <s32 i> + ALWAYS_INLINE GSVector2i insert8(s32 a) const + { + return GSVector2i(_mm_insert_epi8(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract8() const + { + return _mm_extract_epi8(m, i); + } + + template <s32 i> + ALWAYS_INLINE GSVector2i insert16(s32 a) const + { + return GSVector2i(_mm_insert_epi16(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract16() const + { + return _mm_extract_epi16(m, i); + } + + template <s32 i> + ALWAYS_INLINE GSVector2i insert32(s32 a) const + { + return GSVector2i(_mm_insert_epi32(m, a, i)); + } + + template <s32 i> + ALWAYS_INLINE s32 extract32() const + { + if constexpr (i == 0) + return GSVector2i::store(*this); + + return _mm_extract_epi32(m, i); + } + + ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } + + ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64((__m128i*)p)); } + + ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } + + ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } + + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64((__m128i*)p, v.m); } + + ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } + + ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } + + ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } + + ALWAYS_INLINE void operator&=(const GSVector2i& v) { m = _mm_and_si128(m, v); } + ALWAYS_INLINE void operator|=(const GSVector2i& v) { m = _mm_or_si128(m, v); } + ALWAYS_INLINE void operator^=(const GSVector2i& v) { m = _mm_xor_si128(m, v); } + + ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) + { + return GSVector2i(_mm_and_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, 
const GSVector2i& v2) + { + return GSVector2i(_mm_or_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2) + { + return GSVector2i(_mm_xor_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); } + + ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); } + + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } + ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); } + ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 1))); } + ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 1, 1))); } +}; + +class alignas(16) GSVector2 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} + + constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} + +public: + union + { + struct + { + float x, y; + }; + struct + { + float r, g; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128 m; + }; + + GSVector2() = default; + + constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } + + constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } + + constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); } + + ALWAYS_INLINE GSVector2(int x, int y) + { + GSVector2i v_(x, y); + + m = _mm_cvtepi32_ps(v_.m); + } + + ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {} + + ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {} + + ALWAYS_INLINE explicit GSVector2(float f) { *this = f; } + + ALWAYS_INLINE explicit GSVector2(int i) + { +#ifdef CPU_ARCH_AVX2 + m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i))); +#else + *this = GSVector2(GSVector2i(i)); +#endif + } + + ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); + + ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); + + ALWAYS_INLINE void operator=(float f) + { +#ifdef CPU_ARCH_AVX2 + + m = _mm_broadcastss_ps(_mm_load_ss(&f)); + +#else + + m = _mm_set1_ps(f); + +#endif + } + + ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + + ALWAYS_INLINE operator __m128() const { return m; } + + ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); } + + ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); } + + ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); } + + ALWAYS_INLINE GSVector2 rcpnr() const + { + GSVector2 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + ALWAYS_INLINE GSVector2 floor() const + { + return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + } + + ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(_mm_round_ps(m, 
_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } + + ALWAYS_INLINE GSVector2 sat(const GSVector2& min, const GSVector2& max) const + { + return GSVector2(_mm_min_ps(_mm_max_ps(m, min), max)); + } + + ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); } + + ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); } + + ALWAYS_INLINE GSVector2 min(const GSVector2& v) const { return GSVector2(_mm_min_ps(m, v)); } + + ALWAYS_INLINE GSVector2 max(const GSVector2& v) const { return GSVector2(_mm_max_ps(m, v)); } + + template <int mask> + ALWAYS_INLINE GSVector2 blend32(const GSVector2& v) const + { + return GSVector2(_mm_blend_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const + { + return GSVector2(_mm_blendv_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const { return GSVector2(_mm_andnot_ps(v.m, m)); } + + ALWAYS_INLINE int mask() const { return (_mm_movemask_ps(m) & 0x3); } + + ALWAYS_INLINE bool alltrue() const { return (mask() == 0x3); } + + ALWAYS_INLINE bool allfalse() const { return (mask() == 0x0); } + + ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); } + + template <int dst, int src> + ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const + { + if constexpr (src == dst) + return GSVector2(_mm_blend_ps(m, v.m, 1 << src)); + else + return GSVector2(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); + } + + template <int i> + ALWAYS_INLINE int extract32() const + { + return _mm_extract_ps(m, i); + } + + ALWAYS_INLINE static GSVector2 zero() { return GSVector2(_mm_setzero_ps()); } + + ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } + + ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd((double*)p))); } + + ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } + + ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); } + + ALWAYS_INLINE GSVector2 operator-() const { return neg(); } + + ALWAYS_INLINE void operator+=(const GSVector2& v_) { m = _mm_add_ps(m, v_); } + ALWAYS_INLINE void operator-=(const GSVector2& v_) { m = _mm_sub_ps(m, v_); } + ALWAYS_INLINE void operator*=(const GSVector2& v_) { m = _mm_mul_ps(m, v_); } + ALWAYS_INLINE void operator/=(const GSVector2& v_) { m = _mm_div_ps(m, v_); } + + ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); } + ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); } + ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); } + ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); } + + ALWAYS_INLINE void operator&=(const GSVector2& v_) { m = _mm_and_ps(m, v_); } + ALWAYS_INLINE void operator|=(const GSVector2& v_) { m = _mm_or_ps(m, v_); } + ALWAYS_INLINE void operator^=(const GSVector2& v_) { m = _mm_xor_ps(m, v_); } + + ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_add_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_sub_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_mul_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_div_ps(v1, v2)); + } + + 
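+  // The scalar-float overloads below broadcast f into both lanes first, so
+  // (v op f) is equivalent to (v op GSVector2(f)).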
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); } + + ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); } + + ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); } + + ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); } + + ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_and_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_or_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_xor_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmpeq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmpneq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmpgt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmplt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmpge_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2) + { + return GSVector2(_mm_cmple_ps(v1, v2)); + } + + ALWAYS_INLINE GSVector2 xy() const { return *this; } + ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); } + ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); } + ALWAYS_INLINE GSVector2 yy() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 1, 1))); } +}; class alignas(16) GSVector4i { @@ -100,21 +832,20 @@ public: { } - ALWAYS_INLINE GSVector4i(const GSVector4i& v) { m = v.m; } - - ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_loadl_epi64((__m128i*)&v); } + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = v.m; } // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } + ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(const GSVector4i& v) { m = v.m; } ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } @@ -141,7 +872,6 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - // ALWAYS_INLINE u32 rgba32() const @@ -685,7 +1415,10 @@ public: return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); } - ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) + { + return 
GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m)); + } template ALWAYS_INLINE static GSVector4i load(const void* p) @@ -755,19 +1488,9 @@ public: ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } - ALWAYS_INLINE GSVector2i xy() const - { - GSVector2i ret; - storel(&ret, *this); - return ret; - } + ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } - ALWAYS_INLINE GSVector2i zw() const - { - GSVector2i ret; - storeh(&ret, *this); - return ret; - } + ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); } // clang-format off @@ -874,9 +1597,12 @@ public: m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); } - ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { m = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&v)); } + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {} - ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v)); } + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) + : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd()))) + { + } ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {} @@ -916,19 +1642,6 @@ public: ALWAYS_INLINE operator __m128() const { return m; } - /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks - /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove - /// possibly-denormal garbage data from vectors before computing with them - ALWAYS_INLINE GSVector4 noopt() - { - // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the - // future the implementation should be updated -#ifdef __clang__ - __asm__("" : "+x"(m)::); -#endif - return *this; - } - u32 rgba32() const { return GSVector4i(*this).rgba32(); } ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } @@ -948,7 +1661,10 @@ public: return (v_ + v_) - (v_ * v_) * *this; } - ALWAYS_INLINE GSVector4 floor() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } + ALWAYS_INLINE GSVector4 floor() const + { + return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + } ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } @@ -1289,6 +2005,26 @@ public: } }; +ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate) +{ + m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); +} + +ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) +{ + m = _mm_cvtepi32_ps(v); +} + +ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v) +{ + return GSVector2i(_mm_castps_si128(v.m)); +} + +ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) +{ + return GSVector2(_mm_castsi128_ps(v.m)); +} + ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) { m = truncate ? 
_mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); diff --git a/src/core/gpu.h b/src/core/gpu.h index e280af903..019138b28 100644 --- a/src/core/gpu.h +++ b/src/core/gpu.h @@ -319,15 +319,15 @@ protected: virtual void DrawRendererStats(); virtual void OnBufferSwapped(); - ALWAYS_INLINE_RELEASE void AddDrawTriangleTicks(GSVector4i v1, GSVector4i v2, GSVector4i v3, bool shaded, + ALWAYS_INLINE_RELEASE void AddDrawTriangleTicks(GSVector2i v1, GSVector2i v2, GSVector2i v3, bool shaded, bool textured, bool semitransparent) { // This will not produce the correct results for triangles which are partially outside the clip area. // However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect // the edges with the clip rectangle. // TODO: Coordinates are exclusive, so off by one here... - const GSVector4i clamp_min = m_clamped_drawing_area; // would be xyxy(), but zw isn't used. - const GSVector4i clamp_max = m_clamped_drawing_area.zwzw(); + const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x); + const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z); v1 = v1.sat_i32(clamp_min, clamp_max); v2 = v2.sat_i32(clamp_min, clamp_max); v3 = v3.sat_i32(clamp_min, clamp_max); diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index f827e7705..42743a88b 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -1962,15 +1962,15 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices) { DebugAssert(num_vertices == 3 || num_vertices == 4); - GSVector4i v0 = GSVector4i::load32(&vertices[0].u); - GSVector4i v1 = GSVector4i::load32(&vertices[1].u); - GSVector4i v2 = GSVector4i::load32(&vertices[2].u); - GSVector4i v3; - GSVector4i min = v0.min_u16(v1).min_u16(v2); - GSVector4i max = v0.max_u16(v1).max_u16(v2); + GSVector2i v0 = GSVector2i::load32(&vertices[0].u); + GSVector2i v1 = GSVector2i::load32(&vertices[1].u); + GSVector2i v2 = GSVector2i::load32(&vertices[2].u); + GSVector2i v3; + GSVector2i min = v0.min_u16(v1).min_u16(v2); + GSVector2i max = v0.max_u16(v1).max_u16(v2); if (num_vertices == 4) { - v3 = GSVector4i::load32(&vertices[3].u); + v3 = GSVector2i::load32(&vertices[3].u); min = min.min_u16(v3); max = max.max_u16(v3); } @@ -1986,7 +1986,7 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices) vertices[i].SetUVLimits(min_u, max_u, min_v, max_v); if (m_texpage_dirty != 0) - CheckForTexPageOverlap(min.upl32(max).u16to32()); + CheckForTexPageOverlap(GSVector4i(min).upl32(GSVector4i(max)).u16to32()); } void GPU_HW::SetBatchDepthBuffer(bool enabled) @@ -2157,8 +2157,6 @@ void GPU_HW::LoadVertices() const bool shaded = rc.shading_enable; const bool pgxp = g_settings.gpu_pgxp_enable; - // TODO: Using 64-bit vectors instead of 32-bit could be advantageous here, particularly for small ARM cores and - // RISC-V. const u32 first_color = rc.color_for_first_vertex; u32 num_vertices = rc.quad_polygon ? 4 : 3; std::array vertices; @@ -2240,13 +2238,13 @@ void GPU_HW::LoadVertices() } // Cull polygons which are too large. 
- const GSVector4 v0f = GSVector4::loadl(&vertices[0].x); - const GSVector4 v1f = GSVector4::loadl(&vertices[1].x); - const GSVector4 v2f = GSVector4::loadl(&vertices[2].x); - const GSVector4 min_pos_12 = v1f.min(v2f); - const GSVector4 max_pos_12 = v1f.max(v2f); - const GSVector4i draw_rect_012 = - GSVector4i(min_pos_12.min(v0f).upld(max_pos_12.max(v0f))).add32(GSVector4i::cxpr(0, 0, 1, 1)); + const GSVector2 v0f = GSVector2::load(&vertices[0].x); + const GSVector2 v1f = GSVector2::load(&vertices[1].x); + const GSVector2 v2f = GSVector2::load(&vertices[2].x); + const GSVector2 min_pos_12 = v1f.min(v2f); + const GSVector2 max_pos_12 = v1f.max(v2f); + const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f)))) + .add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_draw_rect_012 = draw_rect_012.rintersect(m_clamped_drawing_area); const bool first_tri_culled = (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT || clamped_draw_rect_012.rempty()); @@ -2265,9 +2263,8 @@ void GPU_HW::LoadVertices() ComputePolygonUVLimits(vertices.data(), num_vertices); AddDrawnRectangle(clamped_draw_rect_012); - AddDrawTriangleTicks(GSVector4i(native_vertex_positions[0]), GSVector4i(native_vertex_positions[1]), - GSVector4i(native_vertex_positions[2]), rc.shading_enable, rc.texture_enable, - rc.transparency_enable); + AddDrawTriangleTicks(native_vertex_positions[0], native_vertex_positions[1], native_vertex_positions[2], + rc.shading_enable, rc.texture_enable, rc.transparency_enable); // Expand lines to triangles (Doom, Soul Blade, etc.) if (!rc.quad_polygon && m_line_detect_mode >= GPULineDetectMode::BasicTriangles && !is_3d && @@ -2288,9 +2285,9 @@ void GPU_HW::LoadVertices() // quads if (rc.quad_polygon) { - const GSVector4 v3f = GSVector4::loadl(&vertices[3].x); - const GSVector4i draw_rect_123 = - GSVector4i(min_pos_12.min(v3f).upld(max_pos_12.max(v3f))).add32(GSVector4i::cxpr(0, 0, 1, 1)); + const GSVector2 v3f = GSVector2::load(&vertices[3].x); + const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f)))) + .add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area); // Cull polygons which are too large. @@ -2312,9 +2309,8 @@ void GPU_HW::LoadVertices() ComputePolygonUVLimits(vertices.data(), num_vertices); AddDrawnRectangle(clamped_draw_rect_123); - AddDrawTriangleTicks(GSVector4i(native_vertex_positions[2]), GSVector4i(native_vertex_positions[1]), - GSVector4i(native_vertex_positions[3]), rc.shading_enable, rc.texture_enable, - rc.transparency_enable); + AddDrawTriangleTicks(native_vertex_positions[2], native_vertex_positions[1], native_vertex_positions[3], + rc.shading_enable, rc.texture_enable, rc.transparency_enable); const u32 start_index = m_batch_vertex_count; DebugAssert(m_batch_index_space >= 3); @@ -2650,7 +2646,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect) const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode; const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 
0 : (2 - static_cast<u8>(tmode)); - const GSVector4i page_offset = GSVector4i(m_current_texture_page_offset).xyxy(); + const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy(); uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift)); // shift only goes on the x uv_rect = uv_rect.add32(page_offset); // page offset @@ -3220,7 +3216,7 @@ void GPU_HW::DispatchRenderCommand() } const GSVector4i page_rect = m_draw_mode.mode_reg.GetTexturePageRectangle(); - m_current_texture_page_offset = page_rect.xy(); + GSVector4i::storel(m_current_texture_page_offset, page_rect); u8 new_texpage_dirty = m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0; new_texpage_dirty |= m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0; diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h index fade25027..d1bc5cb67 100644 --- a/src/core/gpu_hw.h +++ b/src/core/gpu_hw.h @@ -294,7 +294,7 @@ private: GSVector4i m_vram_dirty_draw_rect = INVALID_RECT; GSVector4i m_vram_dirty_write_rect = INVALID_RECT; GSVector4i m_current_uv_rect = INVALID_RECT; - GSVector2i m_current_texture_page_offset = {}; + s32 m_current_texture_page_offset[2] = {}; std::unique_ptr<GPUPipeline> m_wireframe_pipeline; diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index 075156f72..b898be855 100644 --- a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -515,7 +515,7 @@ void GPU_SW::DispatchRenderCommand() GPUBackendDrawPolygonCommand* cmd = m_backend.NewDrawPolygonCommand(num_vertices); FillDrawCommand(cmd, rc); - std::array<GSVector4i, 4> positions; + std::array<GSVector2i, 4> positions; const u32 first_color = rc.color_for_first_vertex; const bool shaded = rc.shading_enable; const bool textured = rc.texture_enable; @@ -528,14 +528,15 @@ vert->x = m_drawing_offset.x + vp.x; vert->y = m_drawing_offset.y + vp.y; vert->texcoord = textured ? Truncate16(FifoPop()) : 0; - positions[i] = GSVector4i::loadl(&vert->x); + positions[i] = GSVector2i::load(&vert->x); } // Cull polygons which are too large. - const GSVector4i min_pos_12 = positions[1].min_i32(positions[2]); - const GSVector4i max_pos_12 = positions[1].max_i32(positions[2]); - const GSVector4i draw_rect_012 = - min_pos_12.min_i32(positions[0]).upl64(max_pos_12.max_i32(positions[0])).add32(GSVector4i::cxpr(0, 0, 1, 1)); + const GSVector2i min_pos_12 = positions[1].min_i32(positions[2]); + const GSVector2i max_pos_12 = positions[1].max_i32(positions[2]); + const GSVector4i draw_rect_012 = GSVector4i(min_pos_12.min_i32(positions[0])) + .upl64(GSVector4i(max_pos_12.max_i32(positions[0]))) + .add32(GSVector4i::cxpr(0, 0, 1, 1)); const bool first_tri_culled = (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT || !m_clamped_drawing_area.rintersects(draw_rect_012)); @@ -556,8 +557,9 @@ // quads if (rc.quad_polygon) { - const GSVector4i draw_rect_123 = - min_pos_12.min_i32(positions[3]).upl64(max_pos_12.max_i32(positions[3])).add32(GSVector4i::cxpr(0, 0, 1, 1)); + const GSVector4i draw_rect_123 = GSVector4i(min_pos_12.min_i32(positions[3])) + .upl64(GSVector4i(max_pos_12.max_i32(positions[3]))) + .add32(GSVector4i::cxpr(0, 0, 1, 1)); // Cull polygons which are too large. const bool second_tri_culled =