GS: ARM64 compatibility

This commit is contained in:
Stenzek 2024-03-21 17:48:42 +10:00 committed by Connor McLaughlin
parent 71036c95a4
commit 0a4c037898
20 changed files with 6064 additions and 20 deletions

View File

@ -449,12 +449,22 @@ set(pcsx2GSSourcesUnshared
GS/Renderers/Common/GSVertexTraceFMM.cpp
GS/Renderers/HW/GSRendererHWMultiISA.cpp
GS/Renderers/SW/GSDrawScanline.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
GS/Renderers/SW/GSRasterizer.cpp
GS/Renderers/SW/GSRendererSW.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
)
if(_M_X86)
list(APPEND pcsx2GSSourcesUnshared
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
)
elseif(_M_ARM64)
list(APPEND pcsx2GSSourcesUnshared
GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp
)
endif()
set(pcsx2GSSources
GS/GS.cpp
GS/GSCapture.cpp
@ -509,10 +519,6 @@ set(pcsx2GSHeaders
GS/GSTables.h
GS/GSUtil.h
GS/GSVector.h
GS/GSVector4.h
GS/GSVector4i.h
GS/GSVector8.h
GS/GSVector8i.h
GS/GSXXH.h
GS/MultiISA.h
GS/Renderers/Common/GSDevice.h
@ -540,6 +546,20 @@ set(pcsx2GSHeaders
GS/Renderers/SW/GSVertexSW.h
)
if(_M_X86)
list(APPEND pcsx2GSHeaders
GS/GSVector4.h
GS/GSVector4i.h
GS/GSVector8.h
GS/GSVector8i.h
)
elseif(_M_ARM64)
list(APPEND pcsx2GSHeaders
GS/GSVector4_arm64.h
GS/GSVector4i_arm64.h
)
endif()
if(USE_OPENGL)
list(APPEND pcsx2GSSources
GS/Renderers/OpenGL/GLContext.cpp

View File

@ -3588,8 +3588,13 @@ __forceinline void GSState::VertexKick(u32 skip)
break;
}
#ifndef _M_ARM64
// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
skip |= test.mask() & 0xff;
#else
// mask() is slow on ARM, so just pull the bits out instead, thankfully we only care about the first 4 bytes.
skip |= (static_cast<u64>(test.extract64<0>()) & UINT64_C(0x8080808080808080)) != 0;
#endif
}
if (skip != 0)

View File

@ -207,6 +207,9 @@ GSRendererType GSUtil::GetPreferredRenderer()
#if defined(__APPLE__)
// Mac: Prefer Metal hardware.
preferred_renderer = GSRendererType::Metal;
#elif defined(_WIN32) && defined(_M_ARM64)
// Default to DX12 on Windows-on-ARM.
preferred_renderer = GSRendererType::DX12;
#elif defined(_WIN32)
// Use D3D device info to select renderer.
preferred_renderer = D3D::GetPreferredRenderer();

View File

@ -59,6 +59,8 @@ constinit const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000f
constinit const GSVector4 GSVector4::m_max = cxpr(FLT_MAX);
constinit const GSVector4 GSVector4::m_min = cxpr(FLT_MIN);
#ifdef _M_X86
constinit const GSVector8 GSVector8::m_half = cxpr(0.5f);
constinit const GSVector8 GSVector8::m_one = cxpr(1.0f);
constinit const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff);
@ -143,6 +145,8 @@ constinit const GSVector8i GSVector8i::m_x0f[33] =
cxpr(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
#endif
GSVector4i GSVector4i::fit(int arx, int ary) const
{
GSVector4i r = *this;

View File

@ -84,6 +84,7 @@ typedef GSVector2T<int> GSVector2i;
class GSVector4;
class GSVector4i;
#if defined(_M_X86)
#if _M_SSE >= 0x500
class GSVector8;
@ -102,16 +103,30 @@ class GSVector8i;
#include "GSVector8i.h"
#include "GSVector8.h"
#elif defined(_M_ARM64)
#include "GSVector4i_arm64.h"
#include "GSVector4_arm64.h"
#endif
// conversion
__forceinline_odr GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
{
#if defined(_M_X86)
m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
#elif defined(_M_ARM64)
// GS thread uses default (nearest) rounding.
v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
#endif
}
__forceinline_odr GSVector4::GSVector4(const GSVector4i& v)
{
#if defined(_M_X86)
m = _mm_cvtepi32_ps(v);
#elif defined(_M_ARM64)
v4s = vcvtq_f32_s32(v.v4s);
#endif
}
__forceinline_odr void GSVector4i::sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
@ -153,12 +168,20 @@ __forceinline_odr void GSVector8i::sw32_inv(GSVector8i& a, GSVector8i& b)
__forceinline_odr GSVector4i GSVector4i::cast(const GSVector4& v)
{
#ifndef _M_ARM64
return GSVector4i(_mm_castps_si128(v.m));
#else
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
#endif
}
__forceinline_odr GSVector4 GSVector4::cast(const GSVector4i& v)
{
#ifndef _M_ARM64
return GSVector4(_mm_castsi128_ps(v.m));
#else
return GSVector4(vreinterpretq_f32_s32(v.v4s));
#endif
}
#if _M_SSE >= 0x500

787
pcsx2/GS/GSVector4_arm64.h Normal file
View File

@ -0,0 +1,787 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
class alignas(16) GSVector4
{
struct cxpr_init_tag
{
};
static constexpr cxpr_init_tag cxpr_init{};
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w)
: F32{x, y, z, w}
{
}
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w)
: I32{x, y, z, w}
{
}
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y)
: U64{x, y}
{
}
public:
union
{
struct { float x, y, z, w; };
struct { float r, g, b, a; };
struct { float left, top, right, bottom; };
float v[4];
float F32[4];
double F64[2];
s8 I8[16];
s16 I16[8];
s32 I32[4];
s64 I64[2];
u8 U8[16];
u16 U16[8];
u32 U32[4];
u64 U64[2];
float32x4_t v4s;
};
static const GSVector4 m_ps0123;
static const GSVector4 m_ps4567;
static const GSVector4 m_half;
static const GSVector4 m_one;
static const GSVector4 m_two;
static const GSVector4 m_four;
static const GSVector4 m_x4b000000;
static const GSVector4 m_x4f800000;
static const GSVector4 m_xc1e00000000fffff;
static const GSVector4 m_max;
static const GSVector4 m_min;
GSVector4() = default;
constexpr static GSVector4 cxpr(float x, float y, float z, float w)
{
return GSVector4(cxpr_init, x, y, z, w);
}
constexpr static GSVector4 cxpr(float x)
{
return GSVector4(cxpr_init, x, x, x, x);
}
constexpr static GSVector4 cxpr(int x, int y, int z, int w)
{
return GSVector4(cxpr_init, x, y, z, w);
}
constexpr static GSVector4 cxpr(int x)
{
return GSVector4(cxpr_init, x, x, x, x);
}
constexpr static GSVector4 cxpr64(u64 x, u64 y)
{
return GSVector4(cxpr_init, x, y);
}
constexpr static GSVector4 cxpr64(u64 x)
{
return GSVector4(cxpr_init, x, x);
}
__forceinline GSVector4(float x, float y, float z, float w)
{
const float arr[4] = { x, y, z, w };
v4s = vld1q_f32(arr);
}
__forceinline GSVector4(float x, float y)
{
v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
}
__forceinline GSVector4(int x, int y, int z, int w)
{
const int arr[4] = { x, y, z, w };
v4s = vcvtq_f32_s32(vld1q_s32(arr));
}
__forceinline GSVector4(int x, int y)
{
v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
}
__forceinline explicit GSVector4(const GSVector2& v)
{
v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0));
}
__forceinline explicit GSVector4(const GSVector2i& v)
{
v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0)));
}
__forceinline constexpr explicit GSVector4(float32x4_t m)
: v4s(m)
{
}
__forceinline explicit GSVector4(float f)
{
v4s = vdupq_n_f32(f);
}
__forceinline explicit GSVector4(int i)
{
v4s = vcvtq_f32_s32(vdupq_n_s32(i));
}
__forceinline explicit GSVector4(u32 u)
{
GSVector4i v((int)u);
*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
}
__forceinline explicit GSVector4(const GSVector4i& v);
__forceinline static GSVector4 cast(const GSVector4i& v);
__forceinline static GSVector4 f64(double x, double y)
{
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
}
__forceinline void operator=(float f)
{
v4s = vdupq_n_f32(f);
}
__forceinline void operator=(float32x4_t m)
{
v4s = m;
}
__forceinline operator float32x4_t() const
{
return v4s;
}
/// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks we don't need the whole vector
/// Useful for e.g. preventing clang from optimizing shuffles that remove possibly-denormal garbage data from vectors before computing with them
__forceinline GSVector4 noopt()
{
// Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the future the implementation should be updated
#ifdef __clang__
// __asm__("":"+x"(m)::);
#endif
return *this;
}
__forceinline u32 rgba32() const
{
return GSVector4i(*this).rgba32();
}
__forceinline static GSVector4 rgba32(u32 rgba)
{
return GSVector4(GSVector4i::load((int)rgba).u8to32());
}
__forceinline static GSVector4 rgba32(u32 rgba, int shift)
{
return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift);
}
__forceinline static GSVector4 unorm8(u32 rgba)
{
return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f);
}
__forceinline GSVector4 abs() const
{
return GSVector4(vabsq_f32(v4s));
}
__forceinline GSVector4 neg() const
{
return GSVector4(vnegq_f32(v4s));
}
__forceinline GSVector4 rcp() const
{
return GSVector4(vrecpeq_f32(v4s));
}
__forceinline GSVector4 rcpnr() const
{
float32x4_t recip = vrecpeq_f32(v4s);
recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s));
return GSVector4(recip);
}
template <int mode>
__forceinline GSVector4 round() const
{
if constexpr (mode == Round_NegInf)
return floor();
else if constexpr (mode == Round_PosInf)
return ceil();
else if constexpr (mode == Round_NearestInt)
return GSVector4(vrndnq_f32(v4s));
else
return GSVector4(vrndq_f32(v4s));
}
__forceinline GSVector4 floor() const
{
return GSVector4(vrndmq_f32(v4s));
}
__forceinline GSVector4 ceil() const
{
return GSVector4(vrndpq_f32(v4s));
}
// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
#define LOG_POLY0(x, c0) GSVector4(c0)
#define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0)))
#define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0)))
#define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0)))
#define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0)))
#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0)))
__forceinline GSVector4 log2(int precision = 5) const
{
// NOTE: sign bit ignored, safe to pass negative numbers
// The idea behind this algorithm is to split the float into two parts, log2(m * 2^e) => log2(m) + log2(2^e) => log2(m) + e,
// and then approximate the logarithm of the mantissa (it's 1.x when normalized, a nice short range).
GSVector4 one = m_one;
GSVector4i i = GSVector4i::cast(*this);
GSVector4 e = GSVector4(((i << 1) >> 24) - GSVector4i::x0000007f());
GSVector4 m = GSVector4::cast((i << 9) >> 9) | one;
GSVector4 p;
// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
switch (precision)
{
case 3:
p = LOG_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
break;
case 4:
p = LOG_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
break;
default:
case 5:
p = LOG_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
break;
case 6:
p = LOG_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
break;
}
// This effectively increases the polynomial degree by one, but ensures that log2(1) == 0
p = p * (m - one);
return p + e;
}
__forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const
{
return *this * a + b;
}
__forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const
{
return *this * a - b;
}
__forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
{
return b - *this * a;
}
__forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
{
return -b - *this * a;
}
__forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const
{
return a.madd(b, *this); // *this + a * b
}
__forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const
{
return a.nmadd(b, *this); // *this - a * b
}
__forceinline GSVector4 hadd() const
{
return GSVector4(vpaddq_f32(v4s, v4s));
}
__forceinline GSVector4 hadd(const GSVector4& v) const
{
return GSVector4(vpaddq_f32(v4s, v.v4s));
}
__forceinline GSVector4 hsub() const
{
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s)));
}
__forceinline GSVector4 hsub(const GSVector4& v) const
{
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
}
__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
{
return max(a).min(b);
}
__forceinline GSVector4 sat(const GSVector4& a) const
{
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
return sat(minv, maxv);
}
__forceinline GSVector4 sat(const float scale = 255) const
{
return sat(zero(), GSVector4(scale));
}
__forceinline GSVector4 clamp(const float scale = 255) const
{
return min(GSVector4(scale));
}
__forceinline GSVector4 min(const GSVector4& a) const
{
return GSVector4(vminq_f32(v4s, a.v4s));
}
__forceinline GSVector4 max(const GSVector4& a) const
{
return GSVector4(vmaxq_f32(v4s, a.v4s));
}
template <int mask>
__forceinline GSVector4 blend32(const GSVector4& a) const
{
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2, (mask & 8) ? 7 : 3));
}
__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
{
// duplicate sign bit across and bit select
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
}
__forceinline GSVector4 upl(const GSVector4& a) const
{
return GSVector4(vzip1q_f32(v4s, a.v4s));
}
__forceinline GSVector4 uph(const GSVector4& a) const
{
return GSVector4(vzip2q_f32(v4s, a.v4s));
}
__forceinline GSVector4 upld(const GSVector4& a) const
{
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
}
__forceinline GSVector4 uphd(const GSVector4& a) const
{
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
}
__forceinline GSVector4 l2h(const GSVector4& a) const
{
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
}
__forceinline GSVector4 h2l(const GSVector4& a) const
{
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
}
__forceinline GSVector4 andnot(const GSVector4& v) const
{
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
}
__forceinline int mask() const
{
static const int32_t shifts[] = { 0, 1, 2, 3 };
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
}
__forceinline bool alltrue() const
{
// return mask() == 0xf;
return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
}
__forceinline bool allfalse() const
{
return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
}
__forceinline GSVector4 replace_nan(const GSVector4& v) const
{
return v.blend32(*this, *this == *this);
}
template <int src, int dst>
__forceinline GSVector4 insert32(const GSVector4& v) const
{
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
}
template <int i>
__forceinline int extract32() const
{
return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
}
__forceinline static GSVector4 zero()
{
return GSVector4(vdupq_n_f32(0.0f));
}
__forceinline static GSVector4 xffffffff()
{
return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu)));
}
__forceinline static GSVector4 ps0123()
{
return GSVector4(m_ps0123);
}
__forceinline static GSVector4 ps4567()
{
return GSVector4(m_ps4567);
}
__forceinline static GSVector4 loadl(const void* p)
{
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
}
__forceinline static GSVector4 load(float f)
{
return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0));
}
__forceinline static GSVector4 load(u32 u)
{
GSVector4i v = GSVector4i::load((int)u);
return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
}
template <bool aligned>
__forceinline static GSVector4 load(const void* p)
{
return GSVector4(vld1q_f32((const float*)p));
}
__forceinline static void storent(void* p, const GSVector4& v)
{
vst1q_f32((float*)p, v.v4s);
}
__forceinline static void storel(void* p, const GSVector4& v)
{
vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
}
__forceinline static void storeh(void* p, const GSVector4& v)
{
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
}
template <bool aligned>
__forceinline static void store(void* p, const GSVector4& v)
{
vst1q_f32((float*)p, v.v4s);
}
__forceinline static void store(float* p, const GSVector4& v)
{
vst1q_lane_f32(p, v.v4s, 0);
}
__forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
{
GSVector4i mask = GSVector4i::x000000ff();
a = GSVector4(v & mask);
b = GSVector4((v >> 8) & mask);
c = GSVector4((v >> 16) & mask);
d = GSVector4((v >> 24));
}
__forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
{
GSVector4 v0 = a.xyxy(b);
GSVector4 v1 = c.xyxy(d);
GSVector4 e = v0.xzxz(v1);
GSVector4 f = v0.ywyw(v1);
GSVector4 v2 = a.zwzw(b);
GSVector4 v3 = c.zwzw(d);
GSVector4 g = v2.xzxz(v3);
GSVector4 h = v2.ywyw(v3);
a = e;
b = f;
c = g;
d = h;
}
__forceinline GSVector4 operator-() const
{
return neg();
}
__forceinline void operator+=(const GSVector4& v)
{
v4s = vaddq_f32(v4s, v.v4s);
}
__forceinline void operator-=(const GSVector4& v)
{
v4s = vsubq_f32(v4s, v.v4s);
}
__forceinline void operator*=(const GSVector4& v)
{
v4s = vmulq_f32(v4s, v.v4s);
}
__forceinline void operator/=(const GSVector4& v)
{
v4s = vdivq_f32(v4s, v.v4s);
}
__forceinline void operator+=(float f)
{
*this += GSVector4(f);
}
__forceinline void operator-=(float f)
{
*this -= GSVector4(f);
}
__forceinline void operator*=(float f)
{
*this *= GSVector4(f);
}
__forceinline void operator/=(float f)
{
*this /= GSVector4(f);
}
__forceinline void operator&=(const GSVector4& v)
{
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
}
__forceinline void operator|=(const GSVector4& v)
{
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
}
__forceinline void operator^=(const GSVector4& v)
{
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
}
__forceinline friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
}
__forceinline friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
}
__forceinline friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
}
__forceinline friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
}
__forceinline friend GSVector4 operator+(const GSVector4& v, float f)
{
return v + GSVector4(f);
}
__forceinline friend GSVector4 operator-(const GSVector4& v, float f)
{
return v - GSVector4(f);
}
__forceinline friend GSVector4 operator*(const GSVector4& v, float f)
{
return v * GSVector4(f);
}
__forceinline friend GSVector4 operator/(const GSVector4& v, float f)
{
return v / GSVector4(f);
}
__forceinline friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
}
__forceinline friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
}
__forceinline friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
}
__forceinline friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
}
__forceinline friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
{
// NEON has no !=
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
}
__forceinline friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
}
__forceinline friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
}
__forceinline friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
}
__forceinline friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
{
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
}
__forceinline GSVector4 mul64(const GSVector4& v) const
{
return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
}
__forceinline GSVector4 add64(const GSVector4& v) const
{
return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
}
__forceinline GSVector4 sub64(const GSVector4& v) const
{
return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
}
__forceinline static GSVector4 f32to64(const GSVector4& v)
{
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
}
__forceinline static GSVector4 f32to64(const void* p)
{
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
}
__forceinline GSVector4i f64toi32(bool truncate = true) const
{
const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s));
const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}
// clang-format off
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
__forceinline GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
__forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
VECTOR4_SHUFFLE_1(x, 0)
VECTOR4_SHUFFLE_1(y, 1)
VECTOR4_SHUFFLE_1(z, 2)
VECTOR4_SHUFFLE_1(w, 3)
// clang-format on
__forceinline GSVector4 broadcast32() const
{
return GSVector4(vdupq_laneq_f32(v4s, 0));
}
__forceinline static GSVector4 broadcast32(const GSVector4& v)
{
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
}
__forceinline static GSVector4 broadcast32(const void* f)
{
return GSVector4(vld1q_dup_f32((const float*)f));
}
__forceinline static GSVector4 broadcast64(const void* f)
{
return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f)));
}
};

2222
pcsx2/GS/GSVector4i_arm64.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -11,6 +11,8 @@
#define strcasecmp _stricmp
#endif
#ifdef _M_X86
static ProcessorFeatures::VectorISA getCurrentISA()
{
// For debugging
@ -41,11 +43,14 @@ static ProcessorFeatures::VectorISA getCurrentISA()
return ProcessorFeatures::VectorISA::SSE4;
}
#endif
static ProcessorFeatures getProcessorFeatures()
{
cpuinfo_initialize();
ProcessorFeatures features = {};
#if defined(_M_X86)
features.vectorISA = getCurrentISA();
features.hasFMA = cpuinfo_has_x86_fma3();
if (const char* over = getenv("OVERRIDE_FMA"))
@ -74,6 +79,7 @@ static ProcessorFeatures getProcessorFeatures()
features.hasSlowGather = true;
}
}
#endif
return features;
}

View File

@ -44,10 +44,12 @@
struct ProcessorFeatures
{
#ifdef _M_X86
enum class VectorISA { SSE4, AVX, AVX2 };
VectorISA vectorISA;
bool hasFMA;
bool hasSlowGather;
#endif
};
extern const ProcessorFeatures g_cpu;

View File

@ -21,10 +21,14 @@ struct alignas(32) GSVertex
u32 FOG; // FOG:28
};
#if defined(_M_X86)
#if _M_SSE >= 0x500
__m256i mx;
#endif
__m128i m[2];
#elif defined(_M_ARM64)
int32x4_t m[2];
#endif
};
};

View File

@ -318,8 +318,6 @@ std::string D3D::GetDriverVersionFromLUID(const LUID& luid)
return ret;
}
#ifdef _M_X86
D3D::VendorID D3D::GetVendorID(IDXGIAdapter1* adapter)
{
DXGI_ADAPTER_DESC1 desc;
@ -382,6 +380,7 @@ GSRendererType D3D::GetPreferredRenderer()
Console.Error("D3D12CreateDevice() for automatic renderer failed: %08X", hr);
return device;
};
#ifdef ENABLE_VULKAN
static constexpr auto check_for_mapping_layers = []() {
PCWSTR familyName = L"Microsoft.D3DMappingLayers_8wekyb3d8bbwe";
UINT32 numPackages = 0, bufferLength = 0;
@ -391,7 +390,7 @@ GSRendererType D3D::GetPreferredRenderer()
Host::AddIconOSDMessage("VKDriverUnsupported", ICON_FA_TV,
TRANSLATE_STR("GS",
"Your system has the \"OpenCL, OpenGL, and Vulkan Compatibility Pack\" installed.\n"
"This Vulkan driver crashes PCSX2 on some GPUs.\n"
"This Vulkan driver crashes PCSX2 on some GPUs.\n"
"To use the Vulkan renderer, you should remove this app package."),
Host::OSD_WARNING_DURATION);
return true;
@ -416,6 +415,9 @@ GSRendererType D3D::GetPreferredRenderer()
" to use the Vulkan renderer."), Host::OSD_WARNING_DURATION);
return false;
};
#else
static constexpr auto check_vulkan_supported = []() { return false; };
#endif
switch (GetVendorID(adapter.get()))
{
@ -470,14 +472,16 @@ GSRendererType D3D::GetPreferredRenderer()
default:
{
// Default is D3D11
// Default is D3D11, but prefer DX12 on ARM (better drivers).
#ifdef _M_ARM64
return GSRendererType::DX12;
#else
return GSRendererType::DX11;
#endif
}
}
}
#endif // _M_X86
wil::com_ptr_nothrow<ID3DBlob> D3D::CompileShader(D3D::ShaderType type, D3D_FEATURE_LEVEL feature_level, bool debug,
const std::string_view code, const D3D_SHADER_MACRO* macros /* = nullptr */,
const char* entry_point /* = "main" */)

View File

@ -44,7 +44,6 @@ namespace D3D
// returns the driver version from the registry as a string
std::string GetDriverVersionFromLUID(const LUID& luid);
#ifdef _M_X86
// this is sort of a legacy thing that doesn't have much to do with d3d (just the easiest way)
// checks to see if the adapter at 0 is NV and thus we should prefer OpenGL
enum class VendorID
@ -57,7 +56,6 @@ namespace D3D
VendorID GetVendorID(IDXGIAdapter1* adapter);
GSRendererType GetPreferredRenderer();
#endif
// D3DCompiler wrapper.
enum class ShaderType

View File

@ -1,11 +1,18 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-FileCopyrightText: 2002-2024 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
#pragma once
#include "GS/GSState.h"
#ifdef _M_X86
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h"
#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h"
#endif
#ifdef _M_ARM64
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h"
#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h"
#endif
struct GSScanlineLocalData;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "GS/Renderers/Common/GSFunctionMap.h"
#include "GS/Renderers/SW/GSScanlineEnvironment.h"
#include "vixl/aarch64/macro-assembler-aarch64.h"
class GSDrawScanlineCodeGenerator
{
public:
GSDrawScanlineCodeGenerator(u64 key, void* code, size_t maxsize);
void Generate();
size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); }
const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress<const u8*>(); }
private:
void Init();
void Step();
void TestZ(const vixl::aarch64::VRegister& temp1, const vixl::aarch64::VRegister& temp2);
void SampleTexture();
void SampleTexture_TexelReadHelper(int mip_offset);
void Wrap(const vixl::aarch64::VRegister& uv0);
void Wrap(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1);
void SampleTextureLOD();
void WrapLOD(const vixl::aarch64::VRegister& uv,
const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2,
const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max);
void WrapLOD(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1,
const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2,
const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max);
void AlphaTFX();
void ReadMask();
void TestAlpha();
void ColorTFX();
void Fog();
void ReadFrame();
void TestDestAlpha();
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
void ReadPixel(const vixl::aarch64::VRegister& dst, const vixl::aarch64::Register& addr);
void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, const vixl::aarch64::Register& mask, bool high, bool fast, int psm, int fz);
void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, u8 i, int psm);
void ReadTexel1(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src,
const vixl::aarch64::VRegister& tmp1, int mip_offset);
void ReadTexel4(
const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1,
const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1,
const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3,
int mip_offset);
void ReadTexelImplLoadTexLOD(const vixl::aarch64::Register& addr, int lod, int mip_offset);
void ReadTexelImpl(
const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1,
const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1,
const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3,
int pixels, int mip_offset);
void ReadTexelImpl(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& addr,
u8 i, const vixl::aarch64::Register& baseRegister, bool preserveDst);
void modulate16(const vixl::aarch64::VRegister& d, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift);
void modulate16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift);
void lerp16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f, u8 shift);
void lerp16_4(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f);
void mix16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& temp);
void clamp16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& temp);
void alltrue(const vixl::aarch64::VRegister& test, const vixl::aarch64::VRegister& temp);
void blend8(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp);
void blend8r(const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp);
void split16_2x8(const vixl::aarch64::VRegister& l, const vixl::aarch64::VRegister& h, const vixl::aarch64::VRegister& src);
vixl::aarch64::MacroAssembler m_emitter;
GSScanlineSelector m_sel;
vixl::aarch64::Label m_step_label;
};

View File

@ -156,6 +156,26 @@ struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a p
struct { GSVector4i i, f; } lod; // lcm == 1
#endif
#ifdef _M_ARM64
// Mini version of constant data for ARM64, we don't need all of it
alignas(16) u32 const_test_128b[8][4] = {
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
{0xffffffff, 0x00000000, 0x00000000, 0x00000000},
{0xffffffff, 0xffffffff, 0x00000000, 0x00000000},
{0xffffffff, 0xffffffff, 0xffffffff, 0x00000000},
{0x00000000, 0xffffffff, 0xffffffff, 0xffffffff},
{0x00000000, 0x00000000, 0xffffffff, 0xffffffff},
{0x00000000, 0x00000000, 0x00000000, 0xffffffff},
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
};
alignas(16) u16 const_movemaskw_mask[8] = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000};
alignas(16) float const_log2_coef[4] = {
0.204446009836232697516f,
-1.04913055217340124191f,
2.28330284476918490682f,
1.0f};
#endif
};
struct alignas(32) GSScanlineLocalData // per prim variables, each thread has its own

View File

@ -0,0 +1,339 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h"
#include "GS/Renderers/SW/GSVertexSW.h"
#include "common/StringUtil.h"
#include "common/Perf.h"
#include <cstdint>
MULTI_ISA_UNSHARED_IMPL;
using namespace vixl::aarch64;
static const auto& _vertex = x0;
static const auto& _index = x1;
static const auto& _dscan = x2;
static const auto& _locals = x3;
static const auto& _scratchaddr = x7;
static const auto& _vscratch = v31;
#define _local(field) MemOperand(_locals, offsetof(GSScanlineLocalData, field))
#define armAsm (&m_emitter)
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize)
: m_emitter(static_cast<vixl::byte*>(code), maxsize, vixl::aarch64::PositionDependentCode)
, m_sel(key)
{
m_en.z = m_sel.zb ? 1 : 0;
m_en.f = m_sel.fb && m_sel.fge ? 1 : 0;
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
}
void GSSetupPrimCodeGenerator::Generate()
{
const bool needs_shift = ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS) || m_en.t || (m_en.c && m_sel.iip);
if (needs_shift)
{
armAsm->Mov(x4, reinterpret_cast<intptr_t>(g_const.m_shift_128b));
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
armAsm->Ldr(VRegister(3 + i, kFormat16B), MemOperand(x4, i * sizeof(g_const.m_shift_128b[0])));
}
}
Depth();
Texture();
Color();
armAsm->Ret();
armAsm->FinalizeCode();
Perf::any.RegisterKey(GetCode(), GetSize(), "GSSetupPrim_", m_sel.key);
}
void GSSetupPrimCodeGenerator::Depth()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
if (m_en.f)
{
// GSVector4 df = t.wwww();
armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, t.w));
armAsm->Ld1r(v1.V4S(), MemOperand(_scratchaddr));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
armAsm->Fmul(v2.V4S(), v1.V4S(), v3.V4S());
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H());
armAsm->Str(v2.V4S(), _local(d4.f));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H());
armAsm->Str(v2.V4S(), _local(d[i].f));
}
}
if (m_en.z)
{
// VectorF dz = VectorF::broadcast64(&dscan.p.z)
armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, p.z));
armAsm->Ld1r(_vscratch.V2D(), MemOperand(_scratchaddr));
// m_local.d4.z = dz.mul64(GSVector4::f32to64(shift));
armAsm->Fcvtl(v1.V2D(), v3.V2S());
armAsm->Fmul(v1.V2D(), v1.V2D(), _vscratch.V2D());
armAsm->Str(v1.V2D(), _local(d4.z));
armAsm->Fcvtn(v0.V2S(), _vscratch.V2D());
armAsm->Fcvtn2(v0.V4S(), _vscratch.V2D());
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
armAsm->Fmul(v1.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Str(v1.V4S(), _local(d[i].z));
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
armAsm->Ldrh(w4, MemOperand(_index, sizeof(u16)));
armAsm->Lsl(w4, w4, 6); // * sizeof(GSVertexSW)
armAsm->Add(x4, _vertex, x4);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
armAsm->Ldr(v0, MemOperand(x4, offsetof(GSVertexSW, p)));
armAsm->Fcvtzs(v1.V4S(), v0.V4S());
armAsm->Dup(v1.V8H(), v1.V8H(), 6);
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, p.f)));
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
armAsm->Add(_scratchaddr, x4, offsetof(GSVertexSW, t.w));
armAsm->Ld1r(v0.V4S(), MemOperand(_scratchaddr));
armAsm->Str(v0, MemOperand(_locals, offsetof(GSScanlineLocalData, p.z)));
}
}
}
void GSSetupPrimCodeGenerator::Texture()
{
if (!m_en.t)
{
return;
}
// GSVector4 t = dscan.t;
armAsm->Ldr(v0, MemOperand(_dscan, offsetof(GSVertexSW, t)));
armAsm->Fmul(v1.V4S(), v0.V4S(), v3.V4S());
if (m_sel.fst)
{
// m_local.d4.stq = GSVector4i(t * 4.0f);
armAsm->Fcvtzs(v1.V4S(), v1.V4S());
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
}
else
{
// m_local.d4.stq = t * 4.0f;
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
}
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
armAsm->Dup(v1.V4S(), v0.V4S(), j);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, 128, 4));
if (m_sel.fst)
{
// m_local.d[i].s/t = GSVector4i(v);
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
switch (j)
{
case 0: armAsm->Str(v2, _local(d[i].s)); break;
case 1: armAsm->Str(v2, _local(d[i].t)); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch (j)
{
case 0: armAsm->Str(v2, _local(d[i].s)); break;
case 1: armAsm->Str(v2, _local(d[i].t)); break;
case 2: armAsm->Str(v2, _local(d[i].q)); break;
}
}
}
}
}
void GSSetupPrimCodeGenerator::Color()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Rev64(_vscratch.V4S(), v2.V4S());
armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
armAsm->Dup(v0.V4S(), v16.V4S(), 0);
armAsm->Dup(v1.V4S(), v16.V4S(), 2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
armAsm->Sqxtn(v3.V4H(), v3.V4S());
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
// m_local.d[i].rb = r.upl16(b);
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
armAsm->Str(v2, _local(d[i].rb));
}
// GSVector4 c = dscan.c;
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
armAsm->Dup(v0.V4S(), v16.V4S(), 1);
armAsm->Dup(v1.V4S(), v16.V4S(), 3);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
armAsm->Sqxtn(v3.V4H(), v3.V4S());
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
// m_local.d[i].ga = g.upl16(a);
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
armAsm->Str(v2, _local(d[i].ga));
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last].c);
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
armAsm->Ldrh(w4, MemOperand(_index, sizeof(u16) * last));
armAsm->Lsl(w4, w4, 6); // * sizeof(GSVertexSW)
armAsm->Add(x4, _vertex, x4);
}
armAsm->Ldr(v0, MemOperand(x4, offsetof(GSVertexSW, c)));
armAsm->Fcvtzs(v0.V4S(), v0.V4S());
// c = c.upl16(c.zwxy());
armAsm->Ext(v1.V16B(), v0.V16B(), v0.V16B(), 8);
armAsm->Zip1(v0.V8H(), v0.V8H(), v1.V8H());
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
armAsm->Ushr(v0.V8H(), v0.V8H(), 7);
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
armAsm->Dup(v1.V4S(), v0.V4S(), 0);
armAsm->Dup(v2.V4S(), v0.V4S(), 2);
armAsm->Str(v1, _local(c.rb));
armAsm->Str(v2, _local(c.ga));
}
}

View File

@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "GS/Renderers/Common/GSFunctionMap.h"
#include "GS/Renderers/SW/GSScanlineEnvironment.h"
#include "vixl/aarch64/macro-assembler-aarch64.h"
class GSSetupPrimCodeGenerator
{
public:
GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize);
void Generate();
size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); }
const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress<const u8*>(); }
private:
void Depth();
void Texture();
void Color();
vixl::aarch64::MacroAssembler m_emitter;
GSScanlineSelector m_sel;
struct
{
u32 z : 1, f : 1, t : 1, c : 1;
} m_en;
};

View File

@ -207,6 +207,12 @@
<ClCompile Include="GS\Renderers\OpenGL\GLStreamBuffer.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GS\Renderers\Vulkan\VKBuilders.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClCompile>
@ -317,7 +323,9 @@
<ClCompile Include="GS\Renderers\Common\GSDirtyRect.cpp" />
<ClCompile Include="GS\GSDrawingContext.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanline.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GS\GSDump.cpp" />
<ClCompile Include="GS\Renderers\Common\GSFunctionMap.cpp" />
<ClCompile Include="GS\Renderers\HW\GSHwHack.cpp" />
@ -333,7 +341,9 @@
<ClCompile Include="GS\Renderers\HW\GSRendererHWMultiISA.cpp" />
<ClCompile Include="GS\Renderers\Null\GSRendererNull.cpp" />
<ClCompile Include="GS\Renderers\SW\GSRendererSW.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GS\GSState.cpp" />
<ClCompile Include="GS\GSTables.cpp" />
<ClCompile Include="GS\Renderers\Common\GSTexture.cpp" />
@ -607,6 +617,8 @@
<ClInclude Include="DEV9\Win32\pcap_io_win32_funcs.h" />
<ClInclude Include="DEV9\Win32\tap.h" />
<ClInclude Include="GameList.h" />
<ClInclude Include="GS\GSVector4i_arm64.h" />
<ClInclude Include="GS\GSVector4_arm64.h" />
<ClInclude Include="GS\Renderers\DX11\D3D11ShaderCache.h" />
<ClInclude Include="GS\Renderers\DX12\D3D12Builders.h" />
<ClInclude Include="GS\Renderers\DX12\D3D12DescriptorHeapManager.h" />
@ -628,6 +640,12 @@
<ClInclude Include="GS\Renderers\OpenGL\GLStreamBuffer.h">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\Renderers\Vulkan\VKBuilders.h">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClInclude>
@ -727,8 +745,12 @@
<ClInclude Include="GS\GSDrawingContext.h" />
<ClInclude Include="GS\GSDrawingEnvironment.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanline.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h" />
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\GSDump.h" />
<ClInclude Include="GS\Renderers\Common\GSFastList.h" />
<ClInclude Include="GS\Renderers\Common\GSFunctionMap.h" />
@ -743,7 +765,9 @@
<ClInclude Include="GS\Renderers\Null\GSRendererNull.h" />
<ClInclude Include="GS\Renderers\SW\GSRendererSW.h" />
<ClInclude Include="GS\Renderers\SW\GSScanlineEnvironment.h" />
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h" />
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GS\GSState.h" />
<ClInclude Include="GS\GSTables.h" />
<ClInclude Include="GS\Renderers\Common\GSTexture.h" />

View File

@ -1392,6 +1392,12 @@
<ClCompile Include="CDVD\FlatFileReader.cpp">
<Filter>System\ISO</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Patch.h">
@ -2303,6 +2309,18 @@
<ClInclude Include="Vif_HashBucket.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\GSVector4_arm64.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GSVector4i_arm64.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="rdebug\deci2.h">