GSVector: Add a bunch of fp64 operations
This commit is contained in:
parent
b84d3f14fc
commit
81c9bdeeef
|
@ -1,6 +1,10 @@
|
||||||
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
|
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
|
||||||
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
|
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
|
||||||
|
|
||||||
|
//
|
||||||
|
// Lightweight wrapper over native SIMD types for cross-platform vector code.
|
||||||
|
//
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "common/intrin.h"
|
#include "common/intrin.h"
|
||||||
|
|
|
@ -284,8 +284,6 @@ public:
|
||||||
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
|
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
|
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
|
||||||
{
|
{
|
||||||
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s))));
|
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s))));
|
||||||
|
@ -537,16 +535,6 @@ public:
|
||||||
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
|
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
|
|
||||||
{
|
|
||||||
return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
|
|
||||||
{
|
|
||||||
return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
|
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
|
||||||
{
|
{
|
||||||
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
|
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
|
||||||
|
@ -828,22 +816,11 @@ public:
|
||||||
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
|
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
|
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
|
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 rcpnr() const
|
|
||||||
{
|
|
||||||
float32x2_t recip = vrecpe_f32(v2s);
|
|
||||||
recip = vmul_f32(recip, vrecps_f32(recip, v2s));
|
|
||||||
return GSVector2(recip);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CPU_ARCH_ARM64
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
|
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
|
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -1160,16 +1137,8 @@ public:
|
||||||
|
|
||||||
// rect
|
// rect
|
||||||
|
|
||||||
ALWAYS_INLINE int width() const { return right - left; }
|
ALWAYS_INLINE s32 width() const { return right - left; }
|
||||||
|
ALWAYS_INLINE s32 height() const { return bottom - top; }
|
||||||
ALWAYS_INLINE int height() const { return bottom - top; }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i rsize() const
|
|
||||||
{
|
|
||||||
return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE s32 rarea() const { return width() * height(); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE bool rempty() const
|
ALWAYS_INLINE bool rempty() const
|
||||||
{
|
{
|
||||||
|
@ -1456,8 +1425,6 @@ public:
|
||||||
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
|
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
|
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
|
||||||
{
|
{
|
||||||
#ifdef CPU_ARCH_ARM64
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
@ -2004,50 +1971,6 @@ public:
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
|
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
|
||||||
|
|
||||||
template<int shift>
|
|
||||||
ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// (a - this) * f << shift + this
|
|
||||||
|
|
||||||
return add16(a.sub16(*this).modulate16<shift>(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
template<int shift>
|
|
||||||
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
|
|
||||||
{
|
|
||||||
// (a - b) * c << shift
|
|
||||||
|
|
||||||
return a.sub16(b).modulate16<shift>(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<int shift>
|
|
||||||
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
|
|
||||||
const GSVector4i& d)
|
|
||||||
{
|
|
||||||
// (a - b) * c << shift + d
|
|
||||||
|
|
||||||
return d.add16(a.sub16(b).modulate16<shift>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
|
|
||||||
|
|
||||||
return add16(a.sub16(*this).mul16l(f).sra16<4>());
|
|
||||||
}
|
|
||||||
|
|
||||||
template<int shift>
|
|
||||||
ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// a * f << shift
|
|
||||||
if (shift == 0)
|
|
||||||
{
|
|
||||||
return mul16hrs(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
return sll16<shift + 1>().mul16hs(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE bool eq(const GSVector4i& v) const
|
ALWAYS_INLINE bool eq(const GSVector4i& v) const
|
||||||
{
|
{
|
||||||
const int32x4_t res = veorq_s32(v4s, v.v4s);
|
const int32x4_t res = veorq_s32(v4s, v.v4s);
|
||||||
|
@ -2355,36 +2278,39 @@ public:
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
|
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
|
||||||
|
|
||||||
// clang-format off
|
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||||
|
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
|
||||||
|
{ \
|
||||||
|
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
|
||||||
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); }
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
|
||||||
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
|
||||||
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
|
||||||
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
#define VECTOR4i_SHUFFLE_1(xs, xn) \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
|
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
|
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
|
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
|
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
|
VECTOR4i_SHUFFLE_1(x, 0);
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
|
VECTOR4i_SHUFFLE_1(y, 1);
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
|
VECTOR4i_SHUFFLE_1(z, 2);
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
|
VECTOR4i_SHUFFLE_1(w, 3);
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
|
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_1(xs, xn) \
|
#undef VECTOR4i_SHUFFLE_1
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
|
#undef VECTOR4i_SHUFFLE_2
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
|
#undef VECTOR4i_SHUFFLE_3
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
|
#undef VECTOR4i_SHUFFLE_4
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
|
|
||||||
|
|
||||||
VECTOR4i_SHUFFLE_1(x, 0)
|
|
||||||
VECTOR4i_SHUFFLE_1(y, 1)
|
|
||||||
VECTOR4i_SHUFFLE_1(z, 2)
|
|
||||||
VECTOR4i_SHUFFLE_1(w, 3)
|
|
||||||
|
|
||||||
// clang-format on
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class alignas(16) GSVector4
|
class alignas(16) GSVector4
|
||||||
|
@ -2400,6 +2326,8 @@ class alignas(16) GSVector4
|
||||||
|
|
||||||
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
|
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
|
||||||
|
|
||||||
|
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
union
|
union
|
||||||
{
|
{
|
||||||
|
@ -2442,6 +2370,10 @@ public:
|
||||||
|
|
||||||
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
|
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
|
||||||
|
|
||||||
|
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
|
||||||
|
|
||||||
|
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
|
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
|
||||||
{
|
{
|
||||||
const float arr[4] = {x, y, z, w};
|
const float arr[4] = {x, y, z, w};
|
||||||
|
@ -2475,12 +2407,28 @@ public:
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
|
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
|
||||||
|
|
||||||
#ifdef CPU_ARCH_ARM64
|
|
||||||
ALWAYS_INLINE static GSVector4 f64(double x, double y)
|
ALWAYS_INLINE static GSVector4 f64(double x, double y)
|
||||||
{
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
|
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
|
||||||
}
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.F64[0] = x;
|
||||||
|
ret.F64[1] = y;
|
||||||
|
return ret;
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE static GSVector4 f64(double x)
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.F64[0] = ret.F64[1] = x;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
|
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
|
||||||
|
|
||||||
|
@ -2498,15 +2446,6 @@ public:
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
|
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 rcpnr() const
|
|
||||||
{
|
|
||||||
float32x4_t recip = vrecpeq_f32(v4s);
|
|
||||||
recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s));
|
|
||||||
return GSVector4(recip);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _M_ARM64
|
#ifdef _M_ARM64
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
|
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
|
||||||
|
@ -2529,27 +2468,6 @@ public:
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
|
|
||||||
{
|
|
||||||
return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
|
|
||||||
}
|
|
||||||
ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const
|
|
||||||
{
|
|
||||||
return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s));
|
|
||||||
}
|
|
||||||
ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; }
|
|
||||||
ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; }
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const
|
|
||||||
{
|
|
||||||
return a.madd(b, *this); // *this + a * b
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const
|
|
||||||
{
|
|
||||||
return a.nmadd(b, *this); // *this - a * b
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CPU_ARCH_ARM64
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
|
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
|
||||||
|
@ -2729,6 +2647,28 @@ public:
|
||||||
return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
|
return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int dst>
|
||||||
|
ALWAYS_INLINE GSVector4 insert64(double v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.F64[dst] = v;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int src>
|
||||||
|
ALWAYS_INLINE double extract64() const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
|
||||||
|
#else
|
||||||
|
return F64[src];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
|
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
|
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
|
||||||
|
@ -2903,73 +2843,219 @@ public:
|
||||||
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
|
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CPU_ARCH_ARM64
|
|
||||||
// Not in ARM32
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
|
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
|
||||||
{
|
{
|
||||||
return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
|
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
|
||||||
{
|
{
|
||||||
return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
|
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
|
||||||
{
|
{
|
||||||
return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 sqrt64() const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 sqr64() const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 floor64() const
|
||||||
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
|
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
|
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
|
||||||
{
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
|
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
|
||||||
|
#else
|
||||||
|
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
|
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
|
||||||
{
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
|
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
|
||||||
|
#else
|
||||||
|
const float* fp = static_cast<const float*>(p);
|
||||||
|
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i f64toi32() const
|
ALWAYS_INLINE GSVector4i f64toi32() const
|
||||||
{
|
{
|
||||||
|
#ifdef CPU_ARCH_ARM64
|
||||||
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
|
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
|
||||||
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
|
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
|
||||||
|
#else
|
||||||
|
const s32 low = static_cast<s32>(F64[0]);
|
||||||
|
const s32 high = static_cast<s32>(F64[1]);
|
||||||
|
#endif
|
||||||
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
|
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||||
|
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
|
||||||
|
{ \
|
||||||
|
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
|
||||||
|
} \
|
||||||
|
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
|
||||||
|
{ \
|
||||||
|
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
|
||||||
|
}
|
||||||
|
|
||||||
// clang-format off
|
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
|
||||||
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
|
||||||
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
|
||||||
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
|
||||||
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
#define VECTOR4_SHUFFLE_1(xs, xn) \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
|
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
|
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
|
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
|
VECTOR4_SHUFFLE_2(xs, xn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
|
VECTOR4_SHUFFLE_1(x, 0);
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
|
VECTOR4_SHUFFLE_1(y, 1);
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
|
VECTOR4_SHUFFLE_1(z, 2);
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
|
VECTOR4_SHUFFLE_1(w, 3);
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
|
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_1(xs, xn) \
|
#undef VECTOR4_SHUFFLE_1
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
|
#undef VECTOR4_SHUFFLE_2
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
|
#undef VECTOR4_SHUFFLE_3
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
|
#undef VECTOR4_SHUFFLE_4
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
|
|
||||||
|
|
||||||
VECTOR4_SHUFFLE_1(x, 0)
|
|
||||||
VECTOR4_SHUFFLE_1(y, 1)
|
|
||||||
VECTOR4_SHUFFLE_1(z, 2)
|
|
||||||
VECTOR4_SHUFFLE_1(w, 3)
|
|
||||||
|
|
||||||
// clang-format on
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 broadcast32() const
|
ALWAYS_INLINE GSVector4 broadcast32() const
|
||||||
{
|
{
|
||||||
|
|
|
@ -235,8 +235,6 @@ public:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
|
|
||||||
|
|
||||||
GSVector2i shuffle8(const GSVector2i& mask) const
|
GSVector2i shuffle8(const GSVector2i& mask) const
|
||||||
{
|
{
|
||||||
ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
|
ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
|
||||||
|
@ -601,15 +599,6 @@ public:
|
||||||
|
|
||||||
GSVector2 neg() const { return GSVector2(-x, -y); }
|
GSVector2 neg() const { return GSVector2(-x, -y); }
|
||||||
|
|
||||||
GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); }
|
|
||||||
|
|
||||||
GSVector2 rcpnr() const
|
|
||||||
{
|
|
||||||
GSVector2 v_ = rcp();
|
|
||||||
|
|
||||||
return (v_ + v_) - (v_ * v_) * *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); }
|
GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); }
|
||||||
|
|
||||||
GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); }
|
GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); }
|
||||||
|
@ -973,16 +962,8 @@ public:
|
||||||
// rect
|
// rect
|
||||||
|
|
||||||
ALWAYS_INLINE s32 width() const { return right - left; }
|
ALWAYS_INLINE s32 width() const { return right - left; }
|
||||||
|
|
||||||
ALWAYS_INLINE s32 height() const { return bottom - top; }
|
ALWAYS_INLINE s32 height() const { return bottom - top; }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i rsize() const
|
|
||||||
{
|
|
||||||
return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE s32 rarea() const { return width() * height(); }
|
|
||||||
|
|
||||||
ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }
|
ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }
|
||||||
|
|
||||||
// TODO: Optimize for no-simd, this generates crap code.
|
// TODO: Optimize for no-simd, this generates crap code.
|
||||||
|
@ -1185,8 +1166,6 @@ public:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
|
|
||||||
|
|
||||||
GSVector4i shuffle8(const GSVector4i& mask) const
|
GSVector4i shuffle8(const GSVector4i& mask) const
|
||||||
{
|
{
|
||||||
ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
|
ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
|
||||||
|
@ -1447,64 +1426,14 @@ public:
|
||||||
|
|
||||||
GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }
|
GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }
|
||||||
|
|
||||||
GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); }
|
|
||||||
|
|
||||||
GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); }
|
|
||||||
|
|
||||||
GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); }
|
GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); }
|
||||||
|
|
||||||
GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); }
|
|
||||||
|
|
||||||
GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); }
|
GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); }
|
||||||
|
|
||||||
GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); }
|
GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); }
|
||||||
|
|
||||||
GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); }
|
GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); }
|
||||||
|
|
||||||
template<s32 shift>
|
|
||||||
ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// (a - this) * f << shift + this
|
|
||||||
|
|
||||||
return add16(a.sub16(*this).modulate16<shift>(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
template<s32 shift>
|
|
||||||
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
|
|
||||||
{
|
|
||||||
// (a - b) * c << shift
|
|
||||||
|
|
||||||
return a.sub16(b).modulate16<shift>(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<s32 shift>
|
|
||||||
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
|
|
||||||
const GSVector4i& d)
|
|
||||||
{
|
|
||||||
// (a - b) * c << shift + d
|
|
||||||
|
|
||||||
return d.add16(a.sub16(b).modulate16<shift>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
|
|
||||||
|
|
||||||
return add16(a_.sub16(*this).mul16l(f).sra16<4>());
|
|
||||||
}
|
|
||||||
|
|
||||||
template<s32 shift>
|
|
||||||
ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
|
|
||||||
{
|
|
||||||
// a * f << shift
|
|
||||||
if constexpr (shift == 0)
|
|
||||||
{
|
|
||||||
return mul16hrs(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
return sll16<shift + 1>().mul16hs(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; }
|
ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; }
|
||||||
|
|
||||||
GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); }
|
GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); }
|
||||||
|
@ -1746,36 +1675,36 @@ public:
|
||||||
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); }
|
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); }
|
||||||
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); }
|
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); }
|
||||||
|
|
||||||
// clang-format off
|
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||||
// l/h/lh not implemented until needed
|
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]); }
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
||||||
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]);}
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
|
||||||
|
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
|
||||||
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
|
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
|
#define VECTOR4i_SHUFFLE_1(xs, xn) \
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
|
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
|
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
|
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
|
||||||
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
|
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4i_SHUFFLE_1(xs, xn) \
|
VECTOR4i_SHUFFLE_1(x, 0);
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
|
VECTOR4i_SHUFFLE_1(y, 1);
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
|
VECTOR4i_SHUFFLE_1(z, 2);
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
|
VECTOR4i_SHUFFLE_1(w, 3);
|
||||||
VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
|
|
||||||
|
|
||||||
VECTOR4i_SHUFFLE_1(x, 0)
|
#undef VECTOR4i_SHUFFLE_1
|
||||||
VECTOR4i_SHUFFLE_1(y, 1)
|
#undef VECTOR4i_SHUFFLE_2
|
||||||
VECTOR4i_SHUFFLE_1(z, 2)
|
#undef VECTOR4i_SHUFFLE_3
|
||||||
VECTOR4i_SHUFFLE_1(w, 3)
|
#undef VECTOR4i_SHUFFLE_4
|
||||||
|
|
||||||
// clang-format on
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class alignas(16) GSVector4
|
class alignas(16) GSVector4
|
||||||
|
@ -1791,6 +1720,8 @@ class alignas(16) GSVector4
|
||||||
|
|
||||||
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
|
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
|
||||||
|
|
||||||
|
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
union
|
union
|
||||||
{
|
{
|
||||||
|
@ -1832,6 +1763,10 @@ public:
|
||||||
|
|
||||||
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
|
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
|
||||||
|
|
||||||
|
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
|
||||||
|
|
||||||
|
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
|
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
|
||||||
{
|
{
|
||||||
this->x = x;
|
this->x = x;
|
||||||
|
@ -1881,6 +1816,13 @@ public:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE static GSVector4 f64(double x)
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.F64[0] = ret.F64[1] = x;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
|
ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
|
||||||
|
|
||||||
u32 rgba32() const { return GSVector4i(*this).rgba32(); }
|
u32 rgba32() const { return GSVector4i(*this).rgba32(); }
|
||||||
|
@ -1893,37 +1835,10 @@ public:
|
||||||
|
|
||||||
GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); }
|
GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); }
|
||||||
|
|
||||||
GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); }
|
|
||||||
|
|
||||||
GSVector4 rcpnr() const
|
|
||||||
{
|
|
||||||
GSVector4 v_ = rcp();
|
|
||||||
|
|
||||||
return (v_ + v_) - (v_ * v_) * *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); }
|
GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); }
|
||||||
|
|
||||||
GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); }
|
GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); }
|
||||||
|
|
||||||
GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; }
|
|
||||||
|
|
||||||
GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; }
|
|
||||||
|
|
||||||
GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; }
|
|
||||||
|
|
||||||
GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; }
|
|
||||||
|
|
||||||
GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
|
|
||||||
{
|
|
||||||
return a_.madd(b_, *this); // *this + a * b
|
|
||||||
}
|
|
||||||
|
|
||||||
GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
|
|
||||||
{
|
|
||||||
return a_.nmadd(b_, *this); // *this - a * b
|
|
||||||
}
|
|
||||||
|
|
||||||
GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); }
|
GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); }
|
||||||
|
|
||||||
GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); }
|
GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); }
|
||||||
|
@ -2045,6 +1960,20 @@ public:
|
||||||
return I32[i];
|
return I32[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int dst>
|
||||||
|
ALWAYS_INLINE GSVector4 insert64(double v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.F64[dst] = v;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int src>
|
||||||
|
ALWAYS_INLINE double extract64() const
|
||||||
|
{
|
||||||
|
return F64[src];
|
||||||
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); }
|
ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); }
|
||||||
|
|
||||||
ALWAYS_INLINE static constexpr GSVector4 xffffffff()
|
ALWAYS_INLINE static constexpr GSVector4 xffffffff()
|
||||||
|
@ -2300,6 +2229,71 @@ public:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
GSVector4 ret;
|
||||||
|
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
|
||||||
|
{
|
||||||
|
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 neg64() const {return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL(); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); }
|
||||||
|
|
||||||
|
ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); }
|
||||||
|
|
||||||
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_)
|
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_)
|
||||||
{
|
{
|
||||||
GSVector4 ret;
|
GSVector4 ret;
|
||||||
|
@ -2323,36 +2317,40 @@ public:
|
||||||
return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
|
return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// clang-format off
|
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||||
|
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \
|
||||||
|
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \
|
||||||
|
{ \
|
||||||
|
return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); \
|
||||||
|
}
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
||||||
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
|
||||||
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); }
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
|
||||||
|
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
|
||||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
|
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
|
#define VECTOR4_SHUFFLE_1(xs, xn) \
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
|
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
|
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
|
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
|
||||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
|
VECTOR4_SHUFFLE_2(xs, xn, w, 3);
|
||||||
|
|
||||||
#define VECTOR4_SHUFFLE_1(xs, xn) \
|
VECTOR4_SHUFFLE_1(x, 0);
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
|
VECTOR4_SHUFFLE_1(y, 1);
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
|
VECTOR4_SHUFFLE_1(z, 2);
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
|
VECTOR4_SHUFFLE_1(w, 3);
|
||||||
VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
|
|
||||||
|
|
||||||
VECTOR4_SHUFFLE_1(x, 0)
|
#undef VECTOR4_SHUFFLE_1
|
||||||
VECTOR4_SHUFFLE_1(y, 1)
|
#undef VECTOR4_SHUFFLE_2
|
||||||
VECTOR4_SHUFFLE_1(z, 2)
|
#undef VECTOR4_SHUFFLE_3
|
||||||
VECTOR4_SHUFFLE_1(w, 3)
|
#undef VECTOR4_SHUFFLE_4
|
||||||
|
|
||||||
// clang-format on
|
|
||||||
|
|
||||||
ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); }
|
ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); }
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue