diff --git a/src/common-tests/gsvector_yuvtorgb_test.cpp b/src/common-tests/gsvector_yuvtorgb_test.cpp index c4f60caed..3e6bc821f 100644 --- a/src/common-tests/gsvector_yuvtorgb_test.cpp +++ b/src/common-tests/gsvector_yuvtorgb_test.cpp @@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk, - const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).i16to32(); - const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).i16to32(); + const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32(); + const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32(); const GSVector4i Y = GSVector4i::load<false>(&Yblk[y * 8]); // BT.601 YUV->RGB coefficients, rounding formula from Mednafen. diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 9b3cd6a70..874371651 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -22,12 +22,12 @@ class alignas(16) GSVector2i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y} {} + constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {} - constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {} + constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {} constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7} + : S8{b0, b1, b2, b3, b4, b5, b6, b7} { } @@ -43,10 +43,10 @@ public: s32 r, g; }; float F32[2]; - s8 I8[8]; - s16 I16[4]; - s32 I32[2]; - s64 I64[1]; + s8 S8[8]; + s16 S16[4]; + s32 S32[2]; + s64 S64[1]; u8 U8[8]; u16 U16[4]; u32 U32[2]; @@ -74,10 +74,10 @@ public: ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); } - ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {} + ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {} ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7} + : S8{b0, b1, b2, b3, b4, b5, b6, b7} { } @@ -175,6 +175,15 @@ public: return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s)))); } + ALWAYS_INLINE s32 addv_s32() const + { +#ifdef CPU_ARCH_ARM64 + return vaddv_s32(v2s); +#else + return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1); +#endif + } + #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); } @@ -1050,16 +1059,16 @@ class alignas(16) GSVector4i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {} constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) - : I16{s0, s1, s2, s3, s4, s5, s6, s7} + : S16{s0, s1, s2, s3, s4, s5, s6, s7} { } constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } @@ -1079,10 +1088,10 @@ public: int left, top, right, bottom; }; float F32[4]; - s8 I8[16]; - s16 I16[8]; - s32 I32[4]; - s64 I64[2]; + s8 S8[16]; + s16 S16[8]; + s32 S32[4]; + s64 S64[2]; u8 U8[16]; u16 U16[8]; u32 U32[4]; @@ -1123,13 +1132,13 @@ public: ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) - : I16{s0, s1, s2, s3, s4, s5, s6, s7} + : S16{s0, s1, s2, s3, s4, s5, s6, s7} { } constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4,
s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } @@ -1318,6 +1327,16 @@ public: #endif } + ALWAYS_INLINE s32 addv_s32() const + { +#ifdef CPU_ARCH_ARM64 + return vaddvq_s32(v4s); +#else + const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s)); + return vget_lane_s32(res, 0) + vget_lane_s32(res, 1); +#endif + } + #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); } @@ -1641,7 +1660,7 @@ public: } #endif - ALWAYS_INLINE GSVector4i i8to16() const + ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); } @@ -1651,7 +1670,7 @@ public: return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))); } - ALWAYS_INLINE GSVector4i i8to32() const + ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))); } @@ -1661,7 +1680,7 @@ public: return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))); } - ALWAYS_INLINE GSVector4i i8to64() const + ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(vreinterpretq_s32_s64( vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))))))); @@ -1673,14 +1692,14 @@ public: vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))))); } - ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); } + ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); } ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))); } - ALWAYS_INLINE GSVector4i i16to64() const + ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i( vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s))))))); @@ -1692,7 +1711,7 @@ public: vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))))); } - ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); } + ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); } ALWAYS_INLINE GSVector4i u32to64() const { diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index b82e77485..8f2f26472 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: LGPL-3.0+ // Implementation of GSVector4/GSVector4i when the host does not support any form of SIMD. 
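For reference, the new addv_s32() helpers added above reduce a vector to the scalar sum of its s32 lanes: a single vaddv/vaddvq on ARM64, and a pairwise add (vpadd_s32) followed by a scalar add on ARMv7. A minimal scalar model of the GSVector4i variant; the model function and test values are illustrative, not part of the patch:

```cpp
// Scalar model of the new GSVector4i::addv_s32(): the sum of all four s32
// lanes. The ARM64 path is a single vaddvq_s32; the ARMv7 fallback pairs
// lanes (vpadd_s32) and then adds the two partial sums, as in the patch.
// This harness is illustrative only.
#include <cassert>
#include <cstdint>

static int32_t addv_s32_model(const int32_t v[4])
{
  const int32_t lo = v[0] + v[1]; // pairwise add of the low half
  const int32_t hi = v[2] + v[3]; // pairwise add of the high half
  return lo + hi;                 // final scalar add
}

int main()
{
  const int32_t v[4] = {1, -2, 30, 400};
  assert(addv_s32_model(v) == 429);
  return 0;
}
```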
@@ -47,12 +47,12 @@ class alignas(16) GSVector2i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y} {} + constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {} - constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {} + constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {} constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7} + : S8{b0, b1, b2, b3, b4, b5, b6, b7} { } @@ -68,10 +68,10 @@ public: s32 r, g; }; float F32[2]; - s8 I8[8]; - s16 I16[4]; - s32 I32[2]; - s64 I64[1]; + s8 S8[8]; + s16 S16[4]; + s32 S32[2]; + s64 S64[1]; u8 U8[8]; u16 U16[4]; u32 U32[2]; @@ -104,18 +104,18 @@ public: ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { - I16[0] = s0; - I16[1] = s1; - I16[2] = s2; - I16[3] = s3; + S16[0] = s0; + S16[1] = s1; + S16[2] = s2; + S16[3] = s3; } ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7} + : S8{b0, b1, b2, b3, b4, b5, b6, b7} { } - ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), // so leave the non-constexpr version default @@ -125,7 +125,7 @@ public: ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); - ALWAYS_INLINE void operator=(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE void operator=(const GSVector2i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } ALWAYS_INLINE void operator=(s32 i) { x = i; @@ -158,12 +158,12 @@ public: return max_u32(min).min_u32(max); } - GSVector2i min_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); } - GSVector2i max_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); } - GSVector2i min_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); } - GSVector2i max_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); } - GSVector2i min_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); } - GSVector2i max_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); } + GSVector2i min_i8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = std::min(S8[i], v.S8[i])); } + GSVector2i max_i8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = std::max(S8[i], v.S8[i])); } + GSVector2i min_i16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = std::min(S16[i], v.S16[i])); } + GSVector2i max_i16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = std::max(S16[i], v.S16[i])); } + GSVector2i min_i32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = std::min(S32[i], v.S32[i])); } + GSVector2i max_i32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = std::max(S32[i], v.S32[i])); } GSVector2i min_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); } GSVector2i max_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); } @@ -172,6 +172,8 @@ public: GSVector2i min_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); } GSVector2i max_u32(const GSVector2i& v) const { 
ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); } + s32 addv_s32() const { return (S32[0] + S32[1]); } + u8 minv_u8() const { return std::min( @@ -237,21 +239,21 @@ public: GSVector2i shuffle8(const GSVector2i& mask) const { - ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf])); + ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf])); } - GSVector2i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 4) ? i : (i - 4)])); } + GSVector2i ps16() const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S16[(i < 4) ? i : (i - 4)])); } GSVector2i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 4) ? i : (i - 4)])); } - GSVector2i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 2) ? i : (i - 2)])); } + GSVector2i ps32() const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S32[(i < 2) ? i : (i - 2)])); } GSVector2i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 2) ? i : (i - 2)])); } - GSVector2i upl8() const { return GSVector2i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0); } + GSVector2i upl8() const { return GSVector2i(S8[0], 0, S8[1], 0, S8[2], 0, S8[3], 0); } - GSVector2i upl16() const { return GSVector2i(I16[0], 0, I16[1], 0); } + GSVector2i upl16() const { return GSVector2i(S16[0], 0, S16[1], 0); } - GSVector2i upl32() const { return GSVector2i(I32[0], 0); } + GSVector2i upl32() const { return GSVector2i(S32[0], 0); } - GSVector2i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); } + GSVector2i i8to16() const { ALL_LANES_16(ret.S16[i] = S8[i]); } template <s32 v> GSVector2i srl() const @@ -300,12 +302,12 @@ public: template <s32 v> GSVector2i sra16() const { - ALL_LANES_16(ret.I16[i] = I16[i] >> v); + ALL_LANES_16(ret.S16[i] = S16[i] >> v); } - GSVector2i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); } + GSVector2i sra16(s32 v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v); } - GSVector2i srav16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); } + GSVector2i srav16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v.S16[i]); } template <s32 v> GSVector2i sll32() const @@ -330,36 +332,36 @@ public: template <s32 v> GSVector2i sra32() const { - ALL_LANES_32(ret.I32[i] = I32[i] >> v); + ALL_LANES_32(ret.S32[i] = S32[i] >> v); } - GSVector2i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); } + GSVector2i sra32(s32 v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v); } - GSVector2i srav32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); } + GSVector2i srav32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v.S32[i]); } - GSVector2i add8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); } + GSVector2i add8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] + v.S8[i]); } - GSVector2i add16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); } + GSVector2i add16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] + v.S16[i]); } - GSVector2i add32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); } + GSVector2i add32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] + v.S32[i]); } - GSVector2i adds8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); } + GSVector2i adds8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] + v.S8[i])); } - GSVector2i adds16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); } + GSVector2i adds16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] + v.S16[i])); }
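The adds8()/adds16() lanes above rely on the header's SSATURATE8/SSATURATE16 macros, which clamp the result to the signed lane range instead of letting the sum wrap. A rough stand-alone sketch of the 16-bit case; the local SSATURATE16 here is a stand-in for the header macro, not the header itself:

```cpp
// Hedged sketch of the saturating-add semantics behind adds16(): sums are
// clamped to [-32768, 32767] rather than wrapping modulo 2^16.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t SSATURATE16(int32_t v)
{
  return static_cast<int16_t>(std::clamp(v, -32768, 32767));
}

int main()
{
  assert(SSATURATE16(30000 + 10000) == 32767);   // clamps instead of wrapping
  assert(SSATURATE16(-30000 - 10000) == -32768); // clamps at the low end too
  assert(SSATURATE16(100 + 23) == 123);          // in-range values pass through
  return 0;
}
```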
GSVector2i addus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); } GSVector2i addus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); } - GSVector2i sub8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); } + GSVector2i sub8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] - v.S8[i]); } - GSVector2i sub16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); } + GSVector2i sub16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] - v.S16[i]); } - GSVector2i sub32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); } + GSVector2i sub32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] - v.S32[i]); } - GSVector2i subs8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); } + GSVector2i subs8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] - v.S8[i])); } - GSVector2i subs16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); } + GSVector2i subs16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] - v.S16[i])); } GSVector2i subus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); } @@ -369,35 +371,35 @@ public: GSVector2i avg16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } - GSVector2i mul16l(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); } + GSVector2i mul16l(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); } - GSVector2i mul32l(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); } + GSVector2i mul32l(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); } - ALWAYS_INLINE bool eq(const GSVector2i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; } + ALWAYS_INLINE bool eq(const GSVector2i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; } - GSVector2i eq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); } - GSVector2i eq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); } - GSVector2i eq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); } + GSVector2i eq8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); } + GSVector2i eq16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] == v.S16[i]) ? -1 : 0); } + GSVector2i eq32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] == v.S32[i]) ? -1 : 0); } - GSVector2i neq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); } - GSVector2i neq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); } - GSVector2i neq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); } + GSVector2i neq8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] != v.S8[i]) ? -1 : 0); } + GSVector2i neq16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] != v.S16[i]) ? -1 : 0); } + GSVector2i neq32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] != v.S32[i]) ? -1 : 0); } - GSVector2i gt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ?
-1 : 0); } - GSVector2i gt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); } - GSVector2i gt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); } + GSVector2i gt8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] > v.S8[i]) ? -1 : 0); } + GSVector2i gt16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] > v.S16[i]) ? -1 : 0); } + GSVector2i gt32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] > v.S32[i]) ? -1 : 0); } - GSVector2i ge8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); } - GSVector2i ge16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); } - GSVector2i ge32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); } + GSVector2i ge8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] >= v.S8[i]) ? -1 : 0); } + GSVector2i ge16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] >= v.S16[i]) ? -1 : 0); } + GSVector2i ge32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] >= v.S32[i]) ? -1 : 0); } - GSVector2i lt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); } - GSVector2i lt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); } - GSVector2i lt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); } + GSVector2i lt8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] < v.S8[i]) ? -1 : 0); } + GSVector2i lt16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] < v.S16[i]) ? -1 : 0); } + GSVector2i lt32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] < v.S32[i]) ? -1 : 0); } - GSVector2i le8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); } - GSVector2i le16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); } - GSVector2i le32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? -1 : 0); } + GSVector2i le8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] <= v.S8[i]) ? -1 : 0); } + GSVector2i le16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] <= v.S16[i]) ? -1 : 0); } + GSVector2i le32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] <= v.S32[i]) ? 
-1 : 0); } ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { @@ -422,42 +424,42 @@ public: template <s32 i> ALWAYS_INLINE GSVector2i insert8(s32 a) const { GSVector2i ret = *this; - ret.I8[i] = static_cast<s8>(a); + ret.S8[i] = static_cast<s8>(a); return ret; } template <s32 i> ALWAYS_INLINE s32 extract8() const { - return I8[i]; + return S8[i]; } template <s32 i> ALWAYS_INLINE GSVector2i insert16(s32 a) const { GSVector2i ret = *this; - ret.I16[i] = static_cast<s16>(a); + ret.S16[i] = static_cast<s16>(a); return ret; } template <s32 i> ALWAYS_INLINE s32 extract16() const { - return I16[i]; + return S16[i]; } template <s32 i> ALWAYS_INLINE GSVector2i insert32(s32 a) const { GSVector2i ret = *this; - ret.I32[i] = a; + ret.S32[i] = a; return ret; } template <s32 i> ALWAYS_INLINE s32 extract32() const { - return I32[i]; + return S32[i]; } ALWAYS_INLINE static GSVector2i load32(const void* p) @@ -471,7 +473,7 @@ public: ALWAYS_INLINE static GSVector2i load(const void* p) { GSVector2i ret; - std::memcpy(ret.I32, p, sizeof(ret.I32)); + std::memcpy(ret.S32, p, sizeof(ret.S32)); return ret; } @@ -482,7 +484,7 @@ public: return ret; } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.I32, sizeof(I32)); } + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); } @@ -859,16 +861,16 @@ class alignas(16) GSVector4i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {} constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) - : I16{s0, s1, s2, s3, s4, s5, s6, s7} + : S16{s0, s1, s2, s3, s4, s5, s6, s7} { } constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } @@ -888,10 +890,10 @@ public: s32 left, top, right, bottom; }; float F32[4]; - s8 I8[16]; - s16 I16[8]; - s32 I32[4]; - s64 I64[2]; + s8 S8[16]; + s16 S16[8]; + s32 S32[4]; + s64 S64[2]; u8 U8[16]; u16 U16[8]; u32 U32[4]; @@ -932,24 +934,24 @@ public: ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { - I16[0] = s0; - I16[1] = s1; - I16[2] = s2; - I16[3] = s3; - I16[4] = s4; - I16[5] = s5; - I16[6] = s6; - I16[7] = s7; + S16[0] = s0; + S16[1] = s1; + S16[2] = s2; + S16[3] = s3; + S16[4] = s4; + S16[5] = s5; + S16[6] = s6; + S16[7] = s7; } ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } - ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } - ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : I32{v.I32[0], v.I32[1], 0, 0} {} + ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : S32{v.S32[0], v.S32[1], 0, 0} {} // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), // so leave the
non-constexpr version default @@ -959,7 +961,7 @@ public: ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); - ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(S32, v.S32, sizeof(S32)); } ALWAYS_INLINE void operator=(s32 i) { x = i; @@ -1050,12 +1052,12 @@ public: return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); } - GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); } - GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); } - GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); } - GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); } - GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); } - GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); } + GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = std::min(S8[i], v.S8[i])); } + GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = std::max(S8[i], v.S8[i])); } + GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = std::min(S16[i], v.S16[i])); } + GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = std::max(S16[i], v.S16[i])); } + GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = std::min(S32[i], v.S32[i])); } + GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = std::max(S32[i], v.S32[i])); } GSVector4i min_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); } GSVector4i max_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); } @@ -1066,11 +1068,13 @@ public: GSVector4i madd_s16(const GSVector4i& v) const { - ALL_LANES_32(ret.I32[i] = (I16[i * 2] * v.I16[i * 2]) + (I16[i * 2 + 1] * v.I16[i * 2 + 1])); + ALL_LANES_32(ret.S32[i] = (S16[i * 2] * v.S16[i * 2]) + (S16[i * 2 + 1] * v.S16[i * 2 + 1])); } GSVector4i addp_s32() const { return GSVector4i(x + y, z + w, 0, 0); } + s32 addv_s32() const { return (S32[0] + S32[1] + S32[2] + S32[3]); } + u8 minv_u8() const { return std::min( @@ -1185,18 +1189,18 @@ public: GSVector4i shuffle8(const GSVector4i& mask) const { - ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf])); + ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf])); } - GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i - 8])); } - GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 8) ? i : (i - 8)])); } + GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8((i < 8) ? S16[i] : v.S16[i - 8])); } + GSVector4i ps16() const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S16[(i < 8) ? i : (i - 8)])); } GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8((i < 8) ? U16[i] : v.U16[i - 8])); } GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 8) ? i : (i - 8)])); } GSVector4i ps32(const GSVector4i& v) const { - ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 4) ? I32[i] : v.I32[i - 4])); + ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 4) ? S32[i] : v.S32[i - 4])); } - GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 4) ? 
i : (i - 4)])); } + GSVector4i ps32() const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S32[(i < 4) ? i : (i - 4)])); } GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 4) ? U32[i] : v.U32[i - 4])); @@ -1205,75 +1209,75 @@ public: GSVector4i upl8(const GSVector4i& v) const { - return GSVector4i(I8[0], v.I8[0], I8[1], v.I8[1], I8[2], v.I8[2], I8[3], v.I8[3], I8[4], v.I8[4], I8[5], v.I8[5], - I8[6], v.I8[6], I8[7], v.I8[7]); + return GSVector4i(S8[0], v.S8[0], S8[1], v.S8[1], S8[2], v.S8[2], S8[3], v.S8[3], S8[4], v.S8[4], S8[5], v.S8[5], + S8[6], v.S8[6], S8[7], v.S8[7]); } GSVector4i uph8(const GSVector4i& v) const { - return GSVector4i(I8[8], v.I8[8], I8[9], v.I8[9], I8[10], v.I8[10], I8[11], v.I8[11], I8[12], v.I8[12], I8[13], - v.I8[13], I8[14], v.I8[14], I8[15], v.I8[15]); + return GSVector4i(S8[8], v.S8[8], S8[9], v.S8[9], S8[10], v.S8[10], S8[11], v.S8[11], S8[12], v.S8[12], S8[13], + v.S8[13], S8[14], v.S8[14], S8[15], v.S8[15]); } GSVector4i upl16(const GSVector4i& v) const { - return GSVector4i(I16[0], v.I16[0], I16[1], v.I16[1], I16[2], v.I16[2], I16[3], v.I16[3]); + return GSVector4i(S16[0], v.S16[0], S16[1], v.S16[1], S16[2], v.S16[2], S16[3], v.S16[3]); } GSVector4i uph16(const GSVector4i& v) const { - return GSVector4i(I16[4], v.I16[4], I16[5], v.I16[5], I16[6], v.I16[6], I16[7], v.I16[7]); + return GSVector4i(S16[4], v.S16[4], S16[5], v.S16[5], S16[6], v.S16[6], S16[7], v.S16[7]); } - GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(I32[0], v.I32[0], I32[1], v.I32[1]); } - GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(I32[2], v.I32[2], I32[3], v.I32[3]); } + GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(S32[0], v.S32[0], S32[1], v.S32[1]); } + GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(S32[2], v.S32[2], S32[3], v.S32[3]); } GSVector4i upl64(const GSVector4i& v) const { GSVector4i ret; - ret.I64[0] = I64[0]; - ret.I64[1] = v.I64[0]; + ret.S64[0] = S64[0]; + ret.S64[1] = v.S64[0]; return ret; } GSVector4i uph64(const GSVector4i& v) const { GSVector4i ret; - ret.I64[0] = I64[1]; - ret.I64[1] = v.I64[1]; + ret.S64[0] = S64[1]; + ret.S64[1] = v.S64[1]; return ret; } GSVector4i upl8() const { - return GSVector4i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0, I8[4], 0, I8[5], 0, I8[6], 0, I8[7], 0); + return GSVector4i(S8[0], 0, S8[1], 0, S8[2], 0, S8[3], 0, S8[4], 0, S8[5], 0, S8[6], 0, S8[7], 0); } GSVector4i uph8() const { - return GSVector4i(I8[8], 0, I8[9], 0, I8[10], 0, I8[11], 0, I8[12], 0, I8[13], 0, I8[14], 0, I8[15], 0); + return GSVector4i(S8[8], 0, S8[9], 0, S8[10], 0, S8[11], 0, S8[12], 0, S8[13], 0, S8[14], 0, S8[15], 0); } - GSVector4i upl16() const { return GSVector4i(I16[0], 0, I16[1], 0, I16[2], 0, I16[3], 0); } - GSVector4i uph16() const { return GSVector4i(I16[4], 0, I16[5], 0, I16[6], 0, I16[7], 0); } + GSVector4i upl16() const { return GSVector4i(S16[0], 0, S16[1], 0, S16[2], 0, S16[3], 0); } + GSVector4i uph16() const { return GSVector4i(S16[4], 0, S16[5], 0, S16[6], 0, S16[7], 0); } - GSVector4i upl32() const { return GSVector4i(I32[0], 0, I32[1], 0); } - GSVector4i uph32() const { return GSVector4i(I32[2], 0, I32[3], 0); } + GSVector4i upl32() const { return GSVector4i(S32[0], 0, S32[1], 0); } + GSVector4i uph32() const { return GSVector4i(S32[2], 0, S32[3], 0); } GSVector4i upl64() const { GSVector4i ret; - ret.I64[0] = I64[0]; - ret.I64[1] = 0; + ret.S64[0] = S64[0]; + ret.S64[1] = 0; return ret; } GSVector4i uph64() const { GSVector4i ret; - ret.I64[0] = 
I64[1]; - ret.I64[1] = 0; + ret.S64[0] = S64[1]; + ret.S64[1] = 0; return ret; } - GSVector4i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); } - GSVector4i i8to32() const { ALL_LANES_32(ret.I32[i] = I8[i]); } - GSVector4i i8to64() const { ALL_LANES_64(ret.I64[i] = I8[i]); } + GSVector4i s8to16() const { ALL_LANES_16(ret.S16[i] = S8[i]); } + GSVector4i s8to32() const { ALL_LANES_32(ret.S32[i] = S8[i]); } + GSVector4i s8to64() const { ALL_LANES_64(ret.S64[i] = S8[i]); } - GSVector4i i16to32() const { ALL_LANES_32(ret.I32[i] = I16[i]); } - GSVector4i i16to64() const { ALL_LANES_64(ret.I64[i] = I16[i]); } - GSVector4i i32to64() const { ALL_LANES_64(ret.I64[i] = I32[i]); } + GSVector4i s16to32() const { ALL_LANES_32(ret.S32[i] = S16[i]); } + GSVector4i s16to64() const { ALL_LANES_64(ret.S64[i] = S16[i]); } + GSVector4i s32to64() const { ALL_LANES_64(ret.S64[i] = S32[i]); } GSVector4i u8to16() const { ALL_LANES_16(ret.U16[i] = U8[i]); } GSVector4i u8to32() const { ALL_LANES_32(ret.U32[i] = U8[i]); } GSVector4i u8to64() const { ALL_LANES_64(ret.U64[i] = U8[i]); } @@ -1341,12 +1345,12 @@ public: template <s32 v> GSVector4i sra16() const { - ALL_LANES_16(ret.I16[i] = I16[i] >> v); + ALL_LANES_16(ret.S16[i] = S16[i] >> v); } - GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); } + GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v); } - GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); } + GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v.S16[i]); } template <s32 v> GSVector4i sll32() const @@ -1371,12 +1375,12 @@ public: template <s32 v> GSVector4i sra32() const { - ALL_LANES_32(ret.I32[i] = I32[i] >> v); + ALL_LANES_32(ret.S32[i] = S32[i] >> v); } - GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); } + GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v); } - GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); } + GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v.S32[i]); } template <s32 v> GSVector4i sll64() const @@ -1401,43 +1405,43 @@ public: template <s32 v> GSVector4i sra64() const { - ALL_LANES_64(ret.I64[i] = I64[i] >> v); + ALL_LANES_64(ret.S64[i] = S64[i] >> v); } - GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v); } + GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.S64[i] = S64[i] >> v); } - GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v.I64[i]); } + GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.S64[i] = S64[i] >> v.S64[i]); } - GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); } + GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] + v.S8[i]); } - GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); } + GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] + v.S16[i]); } - GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); } + GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] + v.S32[i]); } - GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); } + GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] + v.S8[i])); } - GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); } + GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] + v.S16[i])); }
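The i8to16()/i16to32() family renamed to s8to16()/s16to32() above is sign-extending, in contrast to the zero-extending u8to16()/u16to32() variants kept alongside it. A plain-C++ illustration of the difference on a single lane (not the header's code):

```cpp
// Illustrative contrast between the renamed s16to32() (sign-extending) and
// u16to32() (zero-extending) lane widenings.
#include <cassert>
#include <cstdint>

int main()
{
  const int16_t lane = -1;                                  // bit pattern 0xFFFF
  const int32_t signed_widen = static_cast<int32_t>(lane);  // s16to32 behaviour
  const int32_t zero_widen = static_cast<uint16_t>(lane);   // u16to32 behaviour
  assert(signed_widen == -1);   // sign bit replicated into the high half
  assert(zero_widen == 0xFFFF); // high bits cleared
  return 0;
}
```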
GSVector4i hadds16(const GSVector4i& v) const { - return GSVector4i(SSATURATE16(I16[0] + I16[1]), SSATURATE16(I16[2] + I16[3]), SSATURATE16(I16[4] + I16[5]), - SSATURATE16(I16[6] + I16[7]), SSATURATE16(v.I16[0] + v.I16[1]), SSATURATE16(v.I16[2] + v.I16[3]), - SSATURATE16(v.I16[4] + v.I16[5]), SSATURATE16(v.I16[6] + v.I16[7])); + return GSVector4i(SSATURATE16(S16[0] + S16[1]), SSATURATE16(S16[2] + S16[3]), SSATURATE16(S16[4] + S16[5]), + SSATURATE16(S16[6] + S16[7]), SSATURATE16(v.S16[0] + v.S16[1]), SSATURATE16(v.S16[2] + v.S16[3]), + SSATURATE16(v.S16[4] + v.S16[5]), SSATURATE16(v.S16[6] + v.S16[7])); } GSVector4i addus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); } GSVector4i addus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); } - GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); } + GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] - v.S8[i]); } - GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); } + GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] - v.S16[i]); } - GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); } + GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] - v.S32[i]); } - GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); } + GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] - v.S8[i])); } - GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); } + GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] - v.S16[i])); } GSVector4i subus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); } @@ -1447,15 +1451,15 @@ public: GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } - GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] * v.I16[i]) >> 16); } + GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); } GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); } - GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); } + GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); } - GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = ((I16[i] * v.I16[i]) >> 14) + 1); } + GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); } - GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); } + GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); } template <s32 shift> ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const @@ -1501,32 +1505,32 @@ public: return sll16<shift>().mul16hs(f); } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; } + ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; } - GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ?
-1 : 0); } - GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); } - GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); } - GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = (I64[i] == v.I64[i]) ? -1 : 0); } + GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); } + GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] == v.S16[i]) ? -1 : 0); } + GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] == v.S32[i]) ? -1 : 0); } + GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.S64[i] = (S64[i] == v.S64[i]) ? -1 : 0); } - GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); } - GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); } - GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); } + GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] != v.S8[i]) ? -1 : 0); } + GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] != v.S16[i]) ? -1 : 0); } + GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] != v.S32[i]) ? -1 : 0); } - GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); } - GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); } - GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); } + GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] > v.S8[i]) ? -1 : 0); } + GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] > v.S16[i]) ? -1 : 0); } + GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] > v.S32[i]) ? -1 : 0); } - GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); } - GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); } - GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); } + GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] >= v.S8[i]) ? -1 : 0); } + GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] >= v.S16[i]) ? -1 : 0); } + GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] >= v.S32[i]) ? -1 : 0); } - GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); } - GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); } - GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); } + GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] < v.S8[i]) ? -1 : 0); } + GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] < v.S16[i]) ? -1 : 0); } + GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] < v.S32[i]) ? -1 : 0); } - GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); } - GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? 
-1 : 0); } + GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] <= v.S8[i]) ? -1 : 0); } + GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] <= v.S16[i]) ? -1 : 0); } + GSVector4i le32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] <= v.S32[i]) ? -1 : 0); } ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = (~v.U64[i]) & U64[i]); } @@ -1550,62 +1554,62 @@ public: template <s32 i> ALWAYS_INLINE GSVector4i insert8(s32 a) const { GSVector4i ret = *this; - ret.I8[i] = static_cast<s8>(a); + ret.S8[i] = static_cast<s8>(a); return ret; } template <s32 i> ALWAYS_INLINE s32 extract8() const { - return I8[i]; + return S8[i]; } template <s32 i> ALWAYS_INLINE GSVector4i insert16(s32 a) const { GSVector4i ret = *this; - ret.I16[i] = static_cast<s16>(a); + ret.S16[i] = static_cast<s16>(a); return ret; } template <s32 i> ALWAYS_INLINE s32 extract16() const { - return I16[i]; + return S16[i]; } template <s32 i> ALWAYS_INLINE GSVector4i insert32(s32 a) const { GSVector4i ret = *this; - ret.I32[i] = a; + ret.S32[i] = a; return ret; } template <s32 i> ALWAYS_INLINE s32 extract32() const { - return I32[i]; + return S32[i]; } template <s32 i> ALWAYS_INLINE GSVector4i insert64(s64 a) const { GSVector4i ret = *this; - ret.I64[i] = a; + ret.S64[i] = a; return ret; } template <s32 i> ALWAYS_INLINE s64 extract64() const { - return I64[i]; + return S64[i]; } ALWAYS_INLINE static GSVector4i loadnt(const void* p) { GSVector4i ret; - std::memcpy(&ret, p, sizeof(ret.I32)); + std::memcpy(&ret, p, sizeof(ret.S32)); return ret; } @@ -1641,7 +1645,7 @@ public: ALWAYS_INLINE static GSVector4i load(const void* p) { GSVector4i ret; - std::memcpy(ret.I32, p, sizeof(ret.I32)); + std::memcpy(ret.S32, p, sizeof(ret.S32)); return ret; } @@ -1658,16 +1662,16 @@ public: ALWAYS_INLINE static GSVector4i loadq(s64 i) { GSVector4i ret; - ret.I64[0] = i; - ret.I64[1] = 0; + ret.S64[0] = i; + ret.S64[1] = 0; return ret; } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.I32, sizeof(v.I32)); } + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); } - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[0], sizeof(s32) * 2); } + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); } - ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[2], sizeof(s32) * 2); } + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); } ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) { @@ -1678,14 +1682,14 @@ public: template <bool aligned> ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { - std::memcpy(p, v.I32, sizeof(I32)); + std::memcpy(p, v.S32, sizeof(S32)); } ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); } ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.I64[0]; } + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.S64[0]; } ALWAYS_INLINE void operator&=(const GSVector4i& v) { @@ -1746,7 +1750,7 @@ public: // l/h/lh not implemented until needed #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(I32[xn], I32[yn], I32[zn], I32[wn]);}
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]);} #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 8d9501bb7..a2b31b4cc 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -1,5 +1,8 @@ // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin // SPDX-License-Identifier: LGPL-3.0+ +// +// Rewritten and NEON+No-SIMD variants added for DuckStation. +// #pragma once @@ -25,12 +28,12 @@ class alignas(16) GSVector2i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y, 0, 0} {} + constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y, 0, 0} {} - constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3, 0, 0, 0, 0} {} + constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3, 0, 0, 0, 0} {} constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} { } @@ -46,10 +49,10 @@ public: s32 r, g; }; float F32[4]; - s8 I8[16]; - s16 I16[8]; - s32 I32[4]; - s64 I64[2]; + s8 S8[16]; + s16 S16[8]; + s32 S32[4]; + s64 S64[2]; u8 U8[16]; u16 U16[8]; u32 U32[4]; @@ -80,7 +83,7 @@ public: ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} { } @@ -139,6 +142,11 @@ public: ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } + ALWAYS_INLINE s32 addv_s32() const + { + return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); + } + ALWAYS_INLINE u8 minv_u8() const { __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); @@ -757,16 +765,16 @@ class alignas(16) GSVector4i }; static constexpr cxpr_init_tag cxpr_init{}; - constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {} constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) - : I16{s0, s1, s2, s3, s4, s5, s6, s7} + : S16{s0, s1, s2, s3, s4, s5, s6, s7} { } constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } @@ -786,10 +794,10 @@ public: s32 left, top, right, bottom; }; float F32[4]; - s8 I8[16]; - s16 I16[8]; - s32 I32[4]; - s64 I64[2]; + s8 S8[16]; + s16 S16[8]; + s32 S32[4]; + s64 S64[2]; u8 U8[16]; u16 U16[8]; u32 U32[4];
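The SSE addv_s32() added above leans on _mm_hadd_epi32 (SSSE3): one horizontal add folds adjacent lanes, and a second collapses the remaining pair, leaving the total in lane 0. A minimal standalone sketch of that reduction pattern; the helper name and test values are illustrative, not the header's:

```cpp
// Two-step horizontal reduction with _mm_hadd_epi32, mirroring the new
// GSVector4i::addv_s32() on the SSE path.
#include <cassert>
#include <tmmintrin.h> // SSSE3: _mm_hadd_epi32

static int addv_s32_sse(__m128i m)
{
  const __m128i pairs = _mm_hadd_epi32(m, m);             // {x+y, z+w, x+y, z+w}
  return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs)); // x+y+z+w in lane 0
}

int main()
{
  assert(addv_s32_sse(_mm_setr_epi32(1, 2, 3, 4)) == 10);
  return 0;
}
```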
@@ -830,7 +838,7 @@ public: ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) - : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} { } @@ -954,6 +962,12 @@ public: ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); } + ALWAYS_INLINE s32 addv_s32() const + { + const __m128i pairs = _mm_hadd_epi32(m, m); + return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs)); + } + ALWAYS_INLINE u8 minv_u8() const { __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); @@ -1080,14 +1094,14 @@ public: ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); } ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); } - ALWAYS_INLINE GSVector4i i8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); } - ALWAYS_INLINE GSVector4i i8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); } - ALWAYS_INLINE GSVector4i i8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); } + ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); } + ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); } + ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); } #ifdef CPU_ARCH_SSE41 - ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); } - ALWAYS_INLINE GSVector4i i16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); } - ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); } + ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); } + ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); } + ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); } ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); } ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); } ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu8_epi64(m)); } @@ -1952,10 +1966,7 @@ public: return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))))); } - ALWAYS_INLINE GSVector4i f64toi32() const - { - return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); - } + ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); } // clang-format off diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp index 19e5f2aa7..454cc151f 100644 --- a/src/core/mdec.cpp +++ b/src/core/mdec.cpp @@ -978,8 +978,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080); for (u32 y = 0; y < 8; y++) { - const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).i16to32(); - const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).i16to32(); + const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); + const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); const GSVector4i Y = GSVector4i::load<false>(&Yblk[y * 8]); // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
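For context on the mdec.cpp hunk above: the vector code implements the usual BT.601 YCbCr-to-RGB transform. A rough floating-point sketch of its shape follows; mdec.cpp itself uses fixed-point constants with Mednafen's rounding, so the textbook coefficients, clamp, and bias below are illustrative only.

```cpp
// Illustrative BT.601 YCbCr->RGB sketch; floating-point coefficients stand in
// for the fixed-point constants and Mednafen rounding used by mdec.cpp.
#include <algorithm>
#include <cassert>
#include <cstdint>

// Clamp a signed sample to [-128, 127], then bias to unsigned, mirroring the
// 0x80808080 addval applied when data_output_signed is false.
static uint8_t clamp_bias(int v)
{
  return static_cast<uint8_t>(std::clamp(v, -128, 127) + 128);
}

static void yuv_to_rgb(int16_t y, int16_t cb, int16_t cr, uint8_t rgb[3])
{
  rgb[0] = clamp_bias(static_cast<int>(y + 1.402 * cr));                   // R
  rgb[1] = clamp_bias(static_cast<int>(y - 0.344136 * cb - 0.714136 * cr)); // G
  rgb[2] = clamp_bias(static_cast<int>(y + 1.772 * cb));                   // B
}

int main()
{
  uint8_t rgb[3];
  yuv_to_rgb(0, 0, 0, rgb); // zero YUV is mid-grey after the unsigned bias
  assert(rgb[0] == 128 && rgb[1] == 128 && rgb[2] == 128);
  return 0;
}
```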