mirror of https://github.com/PCSX2/pcsx2.git
GS: ARM64 compatibility
This commit is contained in:
parent
71036c95a4
commit
0a4c037898
|
@ -449,12 +449,22 @@ set(pcsx2GSSourcesUnshared
|
|||
GS/Renderers/Common/GSVertexTraceFMM.cpp
|
||||
GS/Renderers/HW/GSRendererHWMultiISA.cpp
|
||||
GS/Renderers/SW/GSDrawScanline.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSRasterizer.cpp
|
||||
GS/Renderers/SW/GSRendererSW.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
|
||||
)
|
||||
|
||||
if(_M_X86)
|
||||
list(APPEND pcsx2GSSourcesUnshared
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
|
||||
)
|
||||
elseif(_M_ARM64)
|
||||
list(APPEND pcsx2GSSourcesUnshared
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
set(pcsx2GSSources
|
||||
GS/GS.cpp
|
||||
GS/GSCapture.cpp
|
||||
|
@ -509,10 +519,6 @@ set(pcsx2GSHeaders
|
|||
GS/GSTables.h
|
||||
GS/GSUtil.h
|
||||
GS/GSVector.h
|
||||
GS/GSVector4.h
|
||||
GS/GSVector4i.h
|
||||
GS/GSVector8.h
|
||||
GS/GSVector8i.h
|
||||
GS/GSXXH.h
|
||||
GS/MultiISA.h
|
||||
GS/Renderers/Common/GSDevice.h
|
||||
|
@ -540,6 +546,20 @@ set(pcsx2GSHeaders
|
|||
GS/Renderers/SW/GSVertexSW.h
|
||||
)
|
||||
|
||||
if(_M_X86)
|
||||
list(APPEND pcsx2GSHeaders
|
||||
GS/GSVector4.h
|
||||
GS/GSVector4i.h
|
||||
GS/GSVector8.h
|
||||
GS/GSVector8i.h
|
||||
)
|
||||
elseif(_M_ARM64)
|
||||
list(APPEND pcsx2GSHeaders
|
||||
GS/GSVector4_arm64.h
|
||||
GS/GSVector4i_arm64.h
|
||||
)
|
||||
endif()
|
||||
|
||||
if(USE_OPENGL)
|
||||
list(APPEND pcsx2GSSources
|
||||
GS/Renderers/OpenGL/GLContext.cpp
|
||||
|
|
|
@ -3588,8 +3588,13 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
break;
|
||||
}
|
||||
|
||||
#ifndef _M_ARM64
|
||||
// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
|
||||
skip |= test.mask() & 0xff;
|
||||
#else
|
||||
// mask() is slow on ARM, so just pull the bits out instead, thankfully we only care about the first 4 bytes.
|
||||
skip |= (static_cast<u64>(test.extract64<0>()) & UINT64_C(0x8080808080808080)) != 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (skip != 0)
|
||||
|
|
|
@ -207,6 +207,9 @@ GSRendererType GSUtil::GetPreferredRenderer()
|
|||
#if defined(__APPLE__)
|
||||
// Mac: Prefer Metal hardware.
|
||||
preferred_renderer = GSRendererType::Metal;
|
||||
#elif defined(_WIN32) && defined(_M_ARM64)
|
||||
// Default to DX12 on Windows-on-ARM.
|
||||
preferred_renderer = GSRendererType::DX12;
|
||||
#elif defined(_WIN32)
|
||||
// Use D3D device info to select renderer.
|
||||
preferred_renderer = D3D::GetPreferredRenderer();
|
||||
|
|
|
@ -59,6 +59,8 @@ constinit const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000f
|
|||
constinit const GSVector4 GSVector4::m_max = cxpr(FLT_MAX);
|
||||
constinit const GSVector4 GSVector4::m_min = cxpr(FLT_MIN);
|
||||
|
||||
#ifdef _M_X86
|
||||
|
||||
constinit const GSVector8 GSVector8::m_half = cxpr(0.5f);
|
||||
constinit const GSVector8 GSVector8::m_one = cxpr(1.0f);
|
||||
constinit const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff);
|
||||
|
@ -143,6 +145,8 @@ constinit const GSVector8i GSVector8i::m_x0f[33] =
|
|||
cxpr(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
GSVector4i GSVector4i::fit(int arx, int ary) const
|
||||
{
|
||||
GSVector4i r = *this;
|
||||
|
|
|
@ -84,6 +84,7 @@ typedef GSVector2T<int> GSVector2i;
|
|||
class GSVector4;
|
||||
class GSVector4i;
|
||||
|
||||
#if defined(_M_X86)
|
||||
#if _M_SSE >= 0x500
|
||||
|
||||
class GSVector8;
|
||||
|
@ -102,16 +103,30 @@ class GSVector8i;
|
|||
#include "GSVector8i.h"
|
||||
#include "GSVector8.h"
|
||||
|
||||
#elif defined(_M_ARM64)
|
||||
#include "GSVector4i_arm64.h"
|
||||
#include "GSVector4_arm64.h"
|
||||
#endif
|
||||
|
||||
// conversion
|
||||
|
||||
__forceinline_odr GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
|
||||
{
|
||||
#if defined(_M_X86)
|
||||
m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
|
||||
#elif defined(_M_ARM64)
|
||||
// GS thread uses default (nearest) rounding.
|
||||
v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline_odr GSVector4::GSVector4(const GSVector4i& v)
|
||||
{
|
||||
#if defined(_M_X86)
|
||||
m = _mm_cvtepi32_ps(v);
|
||||
#elif defined(_M_ARM64)
|
||||
v4s = vcvtq_f32_s32(v.v4s);
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline_odr void GSVector4i::sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
|
||||
|
@ -153,12 +168,20 @@ __forceinline_odr void GSVector8i::sw32_inv(GSVector8i& a, GSVector8i& b)
|
|||
|
||||
__forceinline_odr GSVector4i GSVector4i::cast(const GSVector4& v)
|
||||
{
|
||||
#ifndef _M_ARM64
|
||||
return GSVector4i(_mm_castps_si128(v.m));
|
||||
#else
|
||||
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline_odr GSVector4 GSVector4::cast(const GSVector4i& v)
|
||||
{
|
||||
#ifndef _M_ARM64
|
||||
return GSVector4(_mm_castsi128_ps(v.m));
|
||||
#else
|
||||
return GSVector4(vreinterpretq_f32_s32(v.v4s));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x500
|
||||
|
|
|
@ -0,0 +1,787 @@
|
|||
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
|
||||
// SPDX-License-Identifier: GPL-3.0
|
||||
|
||||
class alignas(16) GSVector4
|
||||
{
|
||||
struct cxpr_init_tag
|
||||
{
|
||||
};
|
||||
static constexpr cxpr_init_tag cxpr_init{};
|
||||
|
||||
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w)
|
||||
: F32{x, y, z, w}
|
||||
{
|
||||
}
|
||||
|
||||
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w)
|
||||
: I32{x, y, z, w}
|
||||
{
|
||||
}
|
||||
|
||||
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y)
|
||||
: U64{x, y}
|
||||
{
|
||||
}
|
||||
|
||||
public:
|
||||
union
|
||||
{
|
||||
struct { float x, y, z, w; };
|
||||
struct { float r, g, b, a; };
|
||||
struct { float left, top, right, bottom; };
|
||||
float v[4];
|
||||
float F32[4];
|
||||
double F64[2];
|
||||
s8 I8[16];
|
||||
s16 I16[8];
|
||||
s32 I32[4];
|
||||
s64 I64[2];
|
||||
u8 U8[16];
|
||||
u16 U16[8];
|
||||
u32 U32[4];
|
||||
u64 U64[2];
|
||||
float32x4_t v4s;
|
||||
};
|
||||
|
||||
static const GSVector4 m_ps0123;
|
||||
static const GSVector4 m_ps4567;
|
||||
static const GSVector4 m_half;
|
||||
static const GSVector4 m_one;
|
||||
static const GSVector4 m_two;
|
||||
static const GSVector4 m_four;
|
||||
static const GSVector4 m_x4b000000;
|
||||
static const GSVector4 m_x4f800000;
|
||||
static const GSVector4 m_xc1e00000000fffff;
|
||||
static const GSVector4 m_max;
|
||||
static const GSVector4 m_min;
|
||||
|
||||
GSVector4() = default;
|
||||
|
||||
constexpr static GSVector4 cxpr(float x, float y, float z, float w)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, y, z, w);
|
||||
}
|
||||
|
||||
constexpr static GSVector4 cxpr(float x)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, x, x, x);
|
||||
}
|
||||
|
||||
constexpr static GSVector4 cxpr(int x, int y, int z, int w)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, y, z, w);
|
||||
}
|
||||
|
||||
constexpr static GSVector4 cxpr(int x)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, x, x, x);
|
||||
}
|
||||
|
||||
constexpr static GSVector4 cxpr64(u64 x, u64 y)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, y);
|
||||
}
|
||||
|
||||
constexpr static GSVector4 cxpr64(u64 x)
|
||||
{
|
||||
return GSVector4(cxpr_init, x, x);
|
||||
}
|
||||
|
||||
__forceinline GSVector4(float x, float y, float z, float w)
|
||||
{
|
||||
const float arr[4] = { x, y, z, w };
|
||||
v4s = vld1q_f32(arr);
|
||||
}
|
||||
|
||||
__forceinline GSVector4(float x, float y)
|
||||
{
|
||||
v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
|
||||
}
|
||||
|
||||
__forceinline GSVector4(int x, int y, int z, int w)
|
||||
{
|
||||
const int arr[4] = { x, y, z, w };
|
||||
v4s = vcvtq_f32_s32(vld1q_s32(arr));
|
||||
}
|
||||
|
||||
__forceinline GSVector4(int x, int y)
|
||||
{
|
||||
v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(const GSVector2& v)
|
||||
{
|
||||
v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0));
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(const GSVector2i& v)
|
||||
{
|
||||
v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0)));
|
||||
}
|
||||
|
||||
__forceinline constexpr explicit GSVector4(float32x4_t m)
|
||||
: v4s(m)
|
||||
{
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(float f)
|
||||
{
|
||||
v4s = vdupq_n_f32(f);
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(int i)
|
||||
{
|
||||
v4s = vcvtq_f32_s32(vdupq_n_s32(i));
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(u32 u)
|
||||
{
|
||||
GSVector4i v((int)u);
|
||||
|
||||
*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(const GSVector4i& v);
|
||||
|
||||
__forceinline static GSVector4 cast(const GSVector4i& v);
|
||||
|
||||
__forceinline static GSVector4 f64(double x, double y)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
|
||||
}
|
||||
|
||||
__forceinline void operator=(float f)
|
||||
{
|
||||
v4s = vdupq_n_f32(f);
|
||||
}
|
||||
|
||||
__forceinline void operator=(float32x4_t m)
|
||||
{
|
||||
v4s = m;
|
||||
}
|
||||
|
||||
__forceinline operator float32x4_t() const
|
||||
{
|
||||
return v4s;
|
||||
}
|
||||
|
||||
/// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks we don't need the whole vector
|
||||
/// Useful for e.g. preventing clang from optimizing shuffles that remove possibly-denormal garbage data from vectors before computing with them
|
||||
__forceinline GSVector4 noopt()
|
||||
{
|
||||
// Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the future the implementation should be updated
|
||||
#ifdef __clang__
|
||||
// __asm__("":"+x"(m)::);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
__forceinline u32 rgba32() const
|
||||
{
|
||||
return GSVector4i(*this).rgba32();
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 rgba32(u32 rgba)
|
||||
{
|
||||
return GSVector4(GSVector4i::load((int)rgba).u8to32());
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 rgba32(u32 rgba, int shift)
|
||||
{
|
||||
return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift);
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 unorm8(u32 rgba)
|
||||
{
|
||||
return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f);
|
||||
}
|
||||
|
||||
__forceinline GSVector4 abs() const
|
||||
{
|
||||
return GSVector4(vabsq_f32(v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 neg() const
|
||||
{
|
||||
return GSVector4(vnegq_f32(v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 rcp() const
|
||||
{
|
||||
return GSVector4(vrecpeq_f32(v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 rcpnr() const
|
||||
{
|
||||
float32x4_t recip = vrecpeq_f32(v4s);
|
||||
recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s));
|
||||
return GSVector4(recip);
|
||||
}
|
||||
|
||||
template <int mode>
|
||||
__forceinline GSVector4 round() const
|
||||
{
|
||||
if constexpr (mode == Round_NegInf)
|
||||
return floor();
|
||||
else if constexpr (mode == Round_PosInf)
|
||||
return ceil();
|
||||
else if constexpr (mode == Round_NearestInt)
|
||||
return GSVector4(vrndnq_f32(v4s));
|
||||
else
|
||||
return GSVector4(vrndq_f32(v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 floor() const
|
||||
{
|
||||
return GSVector4(vrndmq_f32(v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 ceil() const
|
||||
{
|
||||
return GSVector4(vrndpq_f32(v4s));
|
||||
}
|
||||
|
||||
// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
|
||||
|
||||
#define LOG_POLY0(x, c0) GSVector4(c0)
|
||||
#define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0)))
|
||||
#define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0)))
|
||||
#define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0)))
|
||||
#define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0)))
|
||||
#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0)))
|
||||
|
||||
__forceinline GSVector4 log2(int precision = 5) const
|
||||
{
|
||||
// NOTE: sign bit ignored, safe to pass negative numbers
|
||||
|
||||
// The idea behind this algorithm is to split the float into two parts, log2(m * 2^e) => log2(m) + log2(2^e) => log2(m) + e,
|
||||
// and then approximate the logarithm of the mantissa (it's 1.x when normalized, a nice short range).
|
||||
|
||||
GSVector4 one = m_one;
|
||||
|
||||
GSVector4i i = GSVector4i::cast(*this);
|
||||
|
||||
GSVector4 e = GSVector4(((i << 1) >> 24) - GSVector4i::x0000007f());
|
||||
GSVector4 m = GSVector4::cast((i << 9) >> 9) | one;
|
||||
|
||||
GSVector4 p;
|
||||
|
||||
// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
|
||||
|
||||
switch (precision)
|
||||
{
|
||||
case 3:
|
||||
p = LOG_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
|
||||
break;
|
||||
case 4:
|
||||
p = LOG_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
|
||||
break;
|
||||
default:
|
||||
case 5:
|
||||
p = LOG_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
|
||||
break;
|
||||
case 6:
|
||||
p = LOG_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
|
||||
break;
|
||||
}
|
||||
|
||||
// This effectively increases the polynomial degree by one, but ensures that log2(1) == 0
|
||||
|
||||
p = p * (m - one);
|
||||
|
||||
return p + e;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return *this * a + b;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return *this * a - b;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return b - *this * a;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return -b - *this * a;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return a.madd(b, *this); // *this + a * b
|
||||
}
|
||||
|
||||
__forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return a.nmadd(b, *this); // *this - a * b
|
||||
}
|
||||
|
||||
__forceinline GSVector4 hadd() const
|
||||
{
|
||||
return GSVector4(vpaddq_f32(v4s, v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 hadd(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vpaddq_f32(v4s, v.v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 hsub() const
|
||||
{
|
||||
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 hsub(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
|
||||
{
|
||||
return max(a).min(b);
|
||||
}
|
||||
|
||||
__forceinline GSVector4 sat(const GSVector4& a) const
|
||||
{
|
||||
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
|
||||
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
|
||||
return sat(minv, maxv);
|
||||
}
|
||||
|
||||
__forceinline GSVector4 sat(const float scale = 255) const
|
||||
{
|
||||
return sat(zero(), GSVector4(scale));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 clamp(const float scale = 255) const
|
||||
{
|
||||
return min(GSVector4(scale));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 min(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vminq_f32(v4s, a.v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 max(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vmaxq_f32(v4s, a.v4s));
|
||||
}
|
||||
|
||||
template <int mask>
|
||||
__forceinline GSVector4 blend32(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2, (mask & 8) ? 7 : 3));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
|
||||
{
|
||||
// duplicate sign bit across and bit select
|
||||
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
|
||||
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 upl(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vzip1q_f32(v4s, a.v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 uph(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vzip2q_f32(v4s, a.v4s));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 upld(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 uphd(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 l2h(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 h2l(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 andnot(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
|
||||
}
|
||||
|
||||
__forceinline int mask() const
|
||||
{
|
||||
static const int32_t shifts[] = { 0, 1, 2, 3 };
|
||||
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
|
||||
}
|
||||
|
||||
__forceinline bool alltrue() const
|
||||
{
|
||||
// return mask() == 0xf;
|
||||
return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
|
||||
}
|
||||
|
||||
__forceinline bool allfalse() const
|
||||
{
|
||||
return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 replace_nan(const GSVector4& v) const
|
||||
{
|
||||
return v.blend32(*this, *this == *this);
|
||||
}
|
||||
|
||||
template <int src, int dst>
|
||||
__forceinline GSVector4 insert32(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
|
||||
}
|
||||
|
||||
template <int i>
|
||||
__forceinline int extract32() const
|
||||
{
|
||||
return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 zero()
|
||||
{
|
||||
return GSVector4(vdupq_n_f32(0.0f));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 xffffffff()
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu)));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 ps0123()
|
||||
{
|
||||
return GSVector4(m_ps0123);
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 ps4567()
|
||||
{
|
||||
return GSVector4(m_ps4567);
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 loadl(const void* p)
|
||||
{
|
||||
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 load(float f)
|
||||
{
|
||||
return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 load(u32 u)
|
||||
{
|
||||
GSVector4i v = GSVector4i::load((int)u);
|
||||
|
||||
return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
|
||||
}
|
||||
|
||||
template <bool aligned>
|
||||
__forceinline static GSVector4 load(const void* p)
|
||||
{
|
||||
return GSVector4(vld1q_f32((const float*)p));
|
||||
}
|
||||
|
||||
__forceinline static void storent(void* p, const GSVector4& v)
|
||||
{
|
||||
vst1q_f32((float*)p, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline static void storel(void* p, const GSVector4& v)
|
||||
{
|
||||
vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline static void storeh(void* p, const GSVector4& v)
|
||||
{
|
||||
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
|
||||
}
|
||||
|
||||
template <bool aligned>
|
||||
__forceinline static void store(void* p, const GSVector4& v)
|
||||
{
|
||||
vst1q_f32((float*)p, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline static void store(float* p, const GSVector4& v)
|
||||
{
|
||||
vst1q_lane_f32(p, v.v4s, 0);
|
||||
}
|
||||
|
||||
__forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
|
||||
{
|
||||
GSVector4i mask = GSVector4i::x000000ff();
|
||||
|
||||
a = GSVector4(v & mask);
|
||||
b = GSVector4((v >> 8) & mask);
|
||||
c = GSVector4((v >> 16) & mask);
|
||||
d = GSVector4((v >> 24));
|
||||
}
|
||||
|
||||
__forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
|
||||
{
|
||||
GSVector4 v0 = a.xyxy(b);
|
||||
GSVector4 v1 = c.xyxy(d);
|
||||
|
||||
GSVector4 e = v0.xzxz(v1);
|
||||
GSVector4 f = v0.ywyw(v1);
|
||||
|
||||
GSVector4 v2 = a.zwzw(b);
|
||||
GSVector4 v3 = c.zwzw(d);
|
||||
|
||||
GSVector4 g = v2.xzxz(v3);
|
||||
GSVector4 h = v2.ywyw(v3);
|
||||
|
||||
a = e;
|
||||
b = f;
|
||||
c = g;
|
||||
d = h;
|
||||
}
|
||||
|
||||
__forceinline GSVector4 operator-() const
|
||||
{
|
||||
return neg();
|
||||
}
|
||||
|
||||
__forceinline void operator+=(const GSVector4& v)
|
||||
{
|
||||
v4s = vaddq_f32(v4s, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline void operator-=(const GSVector4& v)
|
||||
{
|
||||
v4s = vsubq_f32(v4s, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline void operator*=(const GSVector4& v)
|
||||
{
|
||||
v4s = vmulq_f32(v4s, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline void operator/=(const GSVector4& v)
|
||||
{
|
||||
v4s = vdivq_f32(v4s, v.v4s);
|
||||
}
|
||||
|
||||
__forceinline void operator+=(float f)
|
||||
{
|
||||
*this += GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline void operator-=(float f)
|
||||
{
|
||||
*this -= GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline void operator*=(float f)
|
||||
{
|
||||
*this *= GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline void operator/=(float f)
|
||||
{
|
||||
*this /= GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline void operator&=(const GSVector4& v)
|
||||
{
|
||||
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline void operator|=(const GSVector4& v)
|
||||
{
|
||||
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline void operator^=(const GSVector4& v)
|
||||
{
|
||||
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator+(const GSVector4& v, float f)
|
||||
{
|
||||
return v + GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator-(const GSVector4& v, float f)
|
||||
{
|
||||
return v - GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator*(const GSVector4& v, float f)
|
||||
{
|
||||
return v * GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator/(const GSVector4& v, float f)
|
||||
{
|
||||
return v / GSVector4(f);
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
// NEON has no !=
|
||||
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
|
||||
}
|
||||
|
||||
__forceinline friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 mul64(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 add64(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline GSVector4 sub64(const GSVector4& v) const
|
||||
{
|
||||
return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 f32to64(const GSVector4& v)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 f32to64(const void* p)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
|
||||
}
|
||||
|
||||
__forceinline GSVector4i f64toi32(bool truncate = true) const
|
||||
{
|
||||
const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s));
|
||||
const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
|
||||
const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
|
||||
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
|
||||
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||
__forceinline GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
|
||||
__forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
|
||||
|
||||
#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
|
||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
|
||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
|
||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
|
||||
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
|
||||
|
||||
#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
|
||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
|
||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
|
||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
|
||||
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
|
||||
|
||||
#define VECTOR4_SHUFFLE_1(xs, xn) \
|
||||
VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
|
||||
VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
|
||||
VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
|
||||
VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
|
||||
|
||||
VECTOR4_SHUFFLE_1(x, 0)
|
||||
VECTOR4_SHUFFLE_1(y, 1)
|
||||
VECTOR4_SHUFFLE_1(z, 2)
|
||||
VECTOR4_SHUFFLE_1(w, 3)
|
||||
|
||||
// clang-format on
|
||||
|
||||
__forceinline GSVector4 broadcast32() const
|
||||
{
|
||||
return GSVector4(vdupq_laneq_f32(v4s, 0));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 broadcast32(const GSVector4& v)
|
||||
{
|
||||
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 broadcast32(const void* f)
|
||||
{
|
||||
return GSVector4(vld1q_dup_f32((const float*)f));
|
||||
}
|
||||
|
||||
__forceinline static GSVector4 broadcast64(const void* f)
|
||||
{
|
||||
return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f)));
|
||||
}
|
||||
};
|
File diff suppressed because it is too large
Load Diff
|
@ -11,6 +11,8 @@
|
|||
#define strcasecmp _stricmp
|
||||
#endif
|
||||
|
||||
#ifdef _M_X86
|
||||
|
||||
static ProcessorFeatures::VectorISA getCurrentISA()
|
||||
{
|
||||
// For debugging
|
||||
|
@ -41,11 +43,14 @@ static ProcessorFeatures::VectorISA getCurrentISA()
|
|||
return ProcessorFeatures::VectorISA::SSE4;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static ProcessorFeatures getProcessorFeatures()
|
||||
{
|
||||
cpuinfo_initialize();
|
||||
|
||||
ProcessorFeatures features = {};
|
||||
#if defined(_M_X86)
|
||||
features.vectorISA = getCurrentISA();
|
||||
features.hasFMA = cpuinfo_has_x86_fma3();
|
||||
if (const char* over = getenv("OVERRIDE_FMA"))
|
||||
|
@ -74,6 +79,7 @@ static ProcessorFeatures getProcessorFeatures()
|
|||
features.hasSlowGather = true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return features;
|
||||
}
|
||||
|
||||
|
|
|
@ -44,10 +44,12 @@
|
|||
|
||||
struct ProcessorFeatures
|
||||
{
|
||||
#ifdef _M_X86
|
||||
enum class VectorISA { SSE4, AVX, AVX2 };
|
||||
VectorISA vectorISA;
|
||||
bool hasFMA;
|
||||
bool hasSlowGather;
|
||||
#endif
|
||||
};
|
||||
|
||||
extern const ProcessorFeatures g_cpu;
|
||||
|
|
|
@ -21,10 +21,14 @@ struct alignas(32) GSVertex
|
|||
u32 FOG; // FOG:28
|
||||
};
|
||||
|
||||
#if defined(_M_X86)
|
||||
#if _M_SSE >= 0x500
|
||||
__m256i mx;
|
||||
#endif
|
||||
__m128i m[2];
|
||||
#elif defined(_M_ARM64)
|
||||
int32x4_t m[2];
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
|
|
|
@ -318,8 +318,6 @@ std::string D3D::GetDriverVersionFromLUID(const LUID& luid)
|
|||
return ret;
|
||||
}
|
||||
|
||||
#ifdef _M_X86
|
||||
|
||||
D3D::VendorID D3D::GetVendorID(IDXGIAdapter1* adapter)
|
||||
{
|
||||
DXGI_ADAPTER_DESC1 desc;
|
||||
|
@ -382,6 +380,7 @@ GSRendererType D3D::GetPreferredRenderer()
|
|||
Console.Error("D3D12CreateDevice() for automatic renderer failed: %08X", hr);
|
||||
return device;
|
||||
};
|
||||
#ifdef ENABLE_VULKAN
|
||||
static constexpr auto check_for_mapping_layers = []() {
|
||||
PCWSTR familyName = L"Microsoft.D3DMappingLayers_8wekyb3d8bbwe";
|
||||
UINT32 numPackages = 0, bufferLength = 0;
|
||||
|
@ -416,6 +415,9 @@ GSRendererType D3D::GetPreferredRenderer()
|
|||
" to use the Vulkan renderer."), Host::OSD_WARNING_DURATION);
|
||||
return false;
|
||||
};
|
||||
#else
|
||||
static constexpr auto check_vulkan_supported = []() { return false; };
|
||||
#endif
|
||||
|
||||
switch (GetVendorID(adapter.get()))
|
||||
{
|
||||
|
@ -470,14 +472,16 @@ GSRendererType D3D::GetPreferredRenderer()
|
|||
|
||||
default:
|
||||
{
|
||||
// Default is D3D11
|
||||
// Default is D3D11, but prefer DX12 on ARM (better drivers).
|
||||
#ifdef _M_ARM64
|
||||
return GSRendererType::DX12;
|
||||
#else
|
||||
return GSRendererType::DX11;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // _M_X86
|
||||
|
||||
wil::com_ptr_nothrow<ID3DBlob> D3D::CompileShader(D3D::ShaderType type, D3D_FEATURE_LEVEL feature_level, bool debug,
|
||||
const std::string_view code, const D3D_SHADER_MACRO* macros /* = nullptr */,
|
||||
const char* entry_point /* = "main" */)
|
||||
|
|
|
@ -44,7 +44,6 @@ namespace D3D
|
|||
// returns the driver version from the registry as a string
|
||||
std::string GetDriverVersionFromLUID(const LUID& luid);
|
||||
|
||||
#ifdef _M_X86
|
||||
// this is sort of a legacy thing that doesn't have much to do with d3d (just the easiest way)
|
||||
// checks to see if the adapter at 0 is NV and thus we should prefer OpenGL
|
||||
enum class VendorID
|
||||
|
@ -57,7 +56,6 @@ namespace D3D
|
|||
|
||||
VendorID GetVendorID(IDXGIAdapter1* adapter);
|
||||
GSRendererType GetPreferredRenderer();
|
||||
#endif
|
||||
|
||||
// D3DCompiler wrapper.
|
||||
enum class ShaderType
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
|
||||
// SPDX-FileCopyrightText: 2002-2024 PCSX2 Dev Team
|
||||
// SPDX-License-Identifier: LGPL-3.0+
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GS/GSState.h"
|
||||
|
||||
#ifdef _M_X86
|
||||
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h"
|
||||
#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h"
|
||||
#endif
|
||||
#ifdef _M_ARM64
|
||||
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h"
|
||||
#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h"
|
||||
#endif
|
||||
|
||||
struct GSScanlineLocalData;
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,82 @@
|
|||
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
|
||||
// SPDX-License-Identifier: GPL-3.0
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GS/Renderers/Common/GSFunctionMap.h"
|
||||
#include "GS/Renderers/SW/GSScanlineEnvironment.h"
|
||||
|
||||
#include "vixl/aarch64/macro-assembler-aarch64.h"
|
||||
|
||||
class GSDrawScanlineCodeGenerator
|
||||
{
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator(u64 key, void* code, size_t maxsize);
|
||||
void Generate();
|
||||
|
||||
size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); }
|
||||
const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress<const u8*>(); }
|
||||
|
||||
private:
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const vixl::aarch64::VRegister& temp1, const vixl::aarch64::VRegister& temp2);
|
||||
void SampleTexture();
|
||||
void SampleTexture_TexelReadHelper(int mip_offset);
|
||||
void Wrap(const vixl::aarch64::VRegister& uv0);
|
||||
void Wrap(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const vixl::aarch64::VRegister& uv,
|
||||
const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2,
|
||||
const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max);
|
||||
void WrapLOD(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1,
|
||||
const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2,
|
||||
const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
void ReadPixel(const vixl::aarch64::VRegister& dst, const vixl::aarch64::Register& addr);
|
||||
void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, const vixl::aarch64::Register& mask, bool high, bool fast, int psm, int fz);
|
||||
void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, u8 i, int psm);
|
||||
|
||||
void ReadTexel1(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src,
|
||||
const vixl::aarch64::VRegister& tmp1, int mip_offset);
|
||||
void ReadTexel4(
|
||||
const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1,
|
||||
const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1,
|
||||
const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3,
|
||||
int mip_offset);
|
||||
void ReadTexelImplLoadTexLOD(const vixl::aarch64::Register& addr, int lod, int mip_offset);
|
||||
void ReadTexelImpl(
|
||||
const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1,
|
||||
const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1,
|
||||
const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImpl(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& addr,
|
||||
u8 i, const vixl::aarch64::Register& baseRegister, bool preserveDst);
|
||||
|
||||
void modulate16(const vixl::aarch64::VRegister& d, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift);
|
||||
void modulate16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift);
|
||||
void lerp16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f, u8 shift);
|
||||
void lerp16_4(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f);
|
||||
void mix16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& temp);
|
||||
void clamp16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& temp);
|
||||
void alltrue(const vixl::aarch64::VRegister& test, const vixl::aarch64::VRegister& temp);
|
||||
void blend8(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp);
|
||||
void blend8r(const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp);
|
||||
void split16_2x8(const vixl::aarch64::VRegister& l, const vixl::aarch64::VRegister& h, const vixl::aarch64::VRegister& src);
|
||||
|
||||
vixl::aarch64::MacroAssembler m_emitter;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
|
||||
vixl::aarch64::Label m_step_label;
|
||||
};
|
|
@ -156,6 +156,26 @@ struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a p
|
|||
struct { GSVector4i i, f; } lod; // lcm == 1
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM64
|
||||
// Mini version of constant data for ARM64, we don't need all of it
|
||||
alignas(16) u32 const_test_128b[8][4] = {
|
||||
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
|
||||
{0xffffffff, 0x00000000, 0x00000000, 0x00000000},
|
||||
{0xffffffff, 0xffffffff, 0x00000000, 0x00000000},
|
||||
{0xffffffff, 0xffffffff, 0xffffffff, 0x00000000},
|
||||
{0x00000000, 0xffffffff, 0xffffffff, 0xffffffff},
|
||||
{0x00000000, 0x00000000, 0xffffffff, 0xffffffff},
|
||||
{0x00000000, 0x00000000, 0x00000000, 0xffffffff},
|
||||
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
|
||||
};
|
||||
alignas(16) u16 const_movemaskw_mask[8] = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000};
|
||||
alignas(16) float const_log2_coef[4] = {
|
||||
0.204446009836232697516f,
|
||||
-1.04913055217340124191f,
|
||||
2.28330284476918490682f,
|
||||
1.0f};
|
||||
#endif
|
||||
};
|
||||
|
||||
struct alignas(32) GSScanlineLocalData // per prim variables, each thread has its own
|
||||
|
|
|
@ -0,0 +1,339 @@
|
|||
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
|
||||
// SPDX-License-Identifier: GPL-3.0
|
||||
|
||||
#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h"
|
||||
#include "GS/Renderers/SW/GSVertexSW.h"
|
||||
|
||||
#include "common/StringUtil.h"
|
||||
#include "common/Perf.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
MULTI_ISA_UNSHARED_IMPL;
|
||||
|
||||
using namespace vixl::aarch64;
|
||||
|
||||
static const auto& _vertex = x0;
|
||||
static const auto& _index = x1;
|
||||
static const auto& _dscan = x2;
|
||||
static const auto& _locals = x3;
|
||||
static const auto& _scratchaddr = x7;
|
||||
static const auto& _vscratch = v31;
|
||||
|
||||
#define _local(field) MemOperand(_locals, offsetof(GSScanlineLocalData, field))
|
||||
#define armAsm (&m_emitter)
|
||||
|
||||
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize)
|
||||
: m_emitter(static_cast<vixl::byte*>(code), maxsize, vixl::aarch64::PositionDependentCode)
|
||||
, m_sel(key)
|
||||
{
|
||||
m_en.z = m_sel.zb ? 1 : 0;
|
||||
m_en.f = m_sel.fb && m_sel.fge ? 1 : 0;
|
||||
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
|
||||
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
const bool needs_shift = ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS) || m_en.t || (m_en.c && m_sel.iip);
|
||||
if (needs_shift)
|
||||
{
|
||||
armAsm->Mov(x4, reinterpret_cast<intptr_t>(g_const.m_shift_128b));
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
armAsm->Ldr(VRegister(3 + i, kFormat16B), MemOperand(x4, i * sizeof(g_const.m_shift_128b[0])));
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
armAsm->Ret();
|
||||
|
||||
armAsm->FinalizeCode();
|
||||
|
||||
Perf::any.RegisterKey(GetCode(), GetSize(), "GSSetupPrim_", m_sel.key);
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = t.wwww();
|
||||
armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, t.w));
|
||||
armAsm->Ld1r(v1.V4S(), MemOperand(_scratchaddr));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v1.V4S(), v3.V4S());
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H());
|
||||
|
||||
armAsm->Str(v2.V4S(), _local(d4.f));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H());
|
||||
|
||||
armAsm->Str(v2.V4S(), _local(d[i].f));
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// VectorF dz = VectorF::broadcast64(&dscan.p.z)
|
||||
armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, p.z));
|
||||
armAsm->Ld1r(_vscratch.V2D(), MemOperand(_scratchaddr));
|
||||
|
||||
// m_local.d4.z = dz.mul64(GSVector4::f32to64(shift));
|
||||
armAsm->Fcvtl(v1.V2D(), v3.V2S());
|
||||
armAsm->Fmul(v1.V2D(), v1.V2D(), _vscratch.V2D());
|
||||
armAsm->Str(v1.V2D(), _local(d4.z));
|
||||
|
||||
armAsm->Fcvtn(v0.V2S(), _vscratch.V2D());
|
||||
armAsm->Fcvtn2(v0.V4S(), _vscratch.V2D());
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
|
||||
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
|
||||
|
||||
armAsm->Fmul(v1.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Str(v1.V4S(), _local(d[i].z));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
armAsm->Ldrh(w4, MemOperand(_index, sizeof(u16)));
|
||||
armAsm->Lsl(w4, w4, 6); // * sizeof(GSVertexSW)
|
||||
armAsm->Add(x4, _vertex, x4);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
armAsm->Ldr(v0, MemOperand(x4, offsetof(GSVertexSW, p)));
|
||||
|
||||
armAsm->Fcvtzs(v1.V4S(), v0.V4S());
|
||||
armAsm->Dup(v1.V8H(), v1.V8H(), 6);
|
||||
|
||||
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, p.f)));
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
armAsm->Add(_scratchaddr, x4, offsetof(GSVertexSW, t.w));
|
||||
armAsm->Ld1r(v0.V4S(), MemOperand(_scratchaddr));
|
||||
armAsm->Str(v0, MemOperand(_locals, offsetof(GSScanlineLocalData, p.z)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
armAsm->Ldr(v0, MemOperand(_dscan, offsetof(GSVertexSW, t)));
|
||||
armAsm->Fmul(v1.V4S(), v0.V4S(), v3.V4S());
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
armAsm->Fcvtzs(v1.V4S(), v1.V4S());
|
||||
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
armAsm->Dup(v1.V4S(), v0.V4S(), j);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, 128, 4));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: armAsm->Str(v2, _local(d[i].s)); break;
|
||||
case 1: armAsm->Str(v2, _local(d[i].t)); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: armAsm->Str(v2, _local(d[i].s)); break;
|
||||
case 1: armAsm->Str(v2, _local(d[i].t)); break;
|
||||
case 2: armAsm->Str(v2, _local(d[i].q)); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
armAsm->Rev64(_vscratch.V4S(), v2.V4S());
|
||||
armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
|
||||
armAsm->Sqxtn(v2.V4H(), v2.V4S());
|
||||
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
|
||||
armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
armAsm->Dup(v0.V4S(), v16.V4S(), 0);
|
||||
armAsm->Dup(v1.V4S(), v16.V4S(), 2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
armAsm->Sqxtn(v2.V4H(), v2.V4S());
|
||||
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
|
||||
armAsm->Sqxtn(v3.V4H(), v3.V4S());
|
||||
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
|
||||
armAsm->Str(v2, _local(d[i].rb));
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
armAsm->Dup(v0.V4S(), v16.V4S(), 1);
|
||||
armAsm->Dup(v1.V4S(), v16.V4S(), 3);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
|
||||
armAsm->Sqxtn(v2.V4H(), v2.V4S());
|
||||
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
|
||||
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
|
||||
armAsm->Sqxtn(v3.V4H(), v3.V4S());
|
||||
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
|
||||
armAsm->Str(v2, _local(d[i].ga));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
armAsm->Ldrh(w4, MemOperand(_index, sizeof(u16) * last));
|
||||
armAsm->Lsl(w4, w4, 6); // * sizeof(GSVertexSW)
|
||||
armAsm->Add(x4, _vertex, x4);
|
||||
}
|
||||
|
||||
armAsm->Ldr(v0, MemOperand(x4, offsetof(GSVertexSW, c)));
|
||||
armAsm->Fcvtzs(v0.V4S(), v0.V4S());
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
armAsm->Ext(v1.V16B(), v0.V16B(), v0.V16B(), 8);
|
||||
armAsm->Zip1(v0.V8H(), v0.V8H(), v1.V8H());
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
armAsm->Ushr(v0.V8H(), v0.V8H(), 7);
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
armAsm->Dup(v1.V4S(), v0.V4S(), 0);
|
||||
armAsm->Dup(v2.V4S(), v0.V4S(), 2);
|
||||
|
||||
armAsm->Str(v1, _local(c.rb));
|
||||
armAsm->Str(v2, _local(c.ga));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
|
||||
// SPDX-License-Identifier: GPL-3.0
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GS/Renderers/Common/GSFunctionMap.h"
|
||||
#include "GS/Renderers/SW/GSScanlineEnvironment.h"
|
||||
|
||||
#include "vixl/aarch64/macro-assembler-aarch64.h"
|
||||
|
||||
class GSSetupPrimCodeGenerator
|
||||
{
|
||||
public:
|
||||
GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize);
|
||||
void Generate();
|
||||
|
||||
size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); }
|
||||
const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress<const u8*>(); }
|
||||
|
||||
private:
|
||||
void Depth();
|
||||
void Texture();
|
||||
void Color();
|
||||
|
||||
vixl::aarch64::MacroAssembler m_emitter;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
|
||||
struct
|
||||
{
|
||||
u32 z : 1, f : 1, t : 1, c : 1;
|
||||
} m_en;
|
||||
};
|
|
@ -207,6 +207,12 @@
|
|||
<ClCompile Include="GS\Renderers\OpenGL\GLStreamBuffer.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\Vulkan\VKBuilders.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
|
@ -317,7 +323,9 @@
|
|||
<ClCompile Include="GS\Renderers\Common\GSDirtyRect.cpp" />
|
||||
<ClCompile Include="GS\GSDrawingContext.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanline.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\GSDump.cpp" />
|
||||
<ClCompile Include="GS\Renderers\Common\GSFunctionMap.cpp" />
|
||||
<ClCompile Include="GS\Renderers\HW\GSHwHack.cpp" />
|
||||
|
@ -333,7 +341,9 @@
|
|||
<ClCompile Include="GS\Renderers\HW\GSRendererHWMultiISA.cpp" />
|
||||
<ClCompile Include="GS\Renderers\Null\GSRendererNull.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSRendererSW.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\GSState.cpp" />
|
||||
<ClCompile Include="GS\GSTables.cpp" />
|
||||
<ClCompile Include="GS\Renderers\Common\GSTexture.cpp" />
|
||||
|
@ -607,6 +617,8 @@
|
|||
<ClInclude Include="DEV9\Win32\pcap_io_win32_funcs.h" />
|
||||
<ClInclude Include="DEV9\Win32\tap.h" />
|
||||
<ClInclude Include="GameList.h" />
|
||||
<ClInclude Include="GS\GSVector4i_arm64.h" />
|
||||
<ClInclude Include="GS\GSVector4_arm64.h" />
|
||||
<ClInclude Include="GS\Renderers\DX11\D3D11ShaderCache.h" />
|
||||
<ClInclude Include="GS\Renderers\DX12\D3D12Builders.h" />
|
||||
<ClInclude Include="GS\Renderers\DX12\D3D12DescriptorHeapManager.h" />
|
||||
|
@ -628,6 +640,12 @@
|
|||
<ClInclude Include="GS\Renderers\OpenGL\GLStreamBuffer.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\Vulkan\VKBuilders.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
|
@ -727,8 +745,12 @@
|
|||
<ClInclude Include="GS\GSDrawingContext.h" />
|
||||
<ClInclude Include="GS\GSDrawingEnvironment.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanline.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GSDump.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFastList.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFunctionMap.h" />
|
||||
|
@ -743,7 +765,9 @@
|
|||
<ClInclude Include="GS\Renderers\Null\GSRendererNull.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSRendererSW.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSScanlineEnvironment.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h">
|
||||
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GSState.h" />
|
||||
<ClInclude Include="GS\GSTables.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSTexture.h" />
|
||||
|
|
|
@ -1392,6 +1392,12 @@
|
|||
<ClCompile Include="CDVD\FlatFileReader.cpp">
|
||||
<Filter>System\ISO</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Patch.h">
|
||||
|
@ -2303,6 +2309,18 @@
|
|||
<ClInclude Include="Vif_HashBucket.h">
|
||||
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.arm64.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GSVector4_arm64.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GSVector4i_arm64.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="rdebug\deci2.h">
|
||||
|
|
Loading…
Reference in New Issue