Common: Replace x86_intrin.h with generic Intrin.h

For later Apple Silicon support.
This commit is contained in:
Stenzek 2023-12-22 23:03:25 +10:00 committed by Connor McLaughlin
parent d9abe10308
commit 0bc9c7ffa1
27 changed files with 285 additions and 218 deletions

View File

@ -111,6 +111,11 @@ if(${PCSX2_TARGET_ARCHITECTURES} MATCHES "x86_64")
endif()
list(APPEND PCSX2_DEFS _M_X86=1)
set(_M_X86 1)
# SSE4.1 is not set by MSVC, it uses _M_SSE instead.
if(MSVC)
list(APPEND PCSX2_DEFS __SSE4_1__=1)
endif()
else()
message(FATAL_ERROR "Unsupported architecture: ${PCSX2_TARGET_ARCHITECTURES}")
endif()

View File

@ -93,12 +93,14 @@ target_sources(common PRIVATE
ScopedGuard.h
SettingsInterface.h
SettingsWrapper.h
SingleRegisterTypes.h
SmallString.h
StringUtil.h
Timer.h
TextureDecompress.h
Threading.h
TraceLog.h
VectorIntrin.h
WAVWriter.h
WindowInfo.h
WrappedMemCopy.h

View File

@ -3,12 +3,13 @@
#include "General.h"
#include "Console.h"
#include "emitter/x86_intrin.h"
#include "VectorIntrin.h"
static u32 PAUSE_TIME = 0;
static void MultiPause()
{
#ifdef _M_X86
_mm_pause();
_mm_pause();
_mm_pause();
@ -17,6 +18,27 @@ static void MultiPause()
_mm_pause();
_mm_pause();
_mm_pause();
#elif defined(_M_ARM64) && defined(_MSC_VER)
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
__isb(_ARM64_BARRIER_SY);
#elif defined(_M_ARM64)
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
__asm__ __volatile__("isb");
#else
#error Unknown architecture.
#endif
}
static u32 MeasurePauseTime()

View File

@ -0,0 +1,185 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
// --------------------------------------------------------------------------------------
// r64 / r128 - Types that are guaranteed to fit in one register
// --------------------------------------------------------------------------------------
// Note: Recompilers rely on some of these types and the registers they allocate to,
// so be careful if you want to change them
#pragma once
#include "Pcsx2Defs.h"
#include "Pcsx2Types.h"
#include "VectorIntrin.h"
#include <cstring>
#if defined(_M_X86)
// Can't stick them in structs because it breaks calling convention things, yay
using r128 = __m128i;
// Calling convention setting, yay
#define RETURNS_R128 r128 __vectorcall
#define TAKES_R128 __vectorcall
// And since we can't stick them in structs, we get lots of static methods, yay!
[[maybe_unused]] __fi static r128 r128_load(const void* ptr)
{
return _mm_load_si128(reinterpret_cast<const r128*>(ptr));
}
[[maybe_unused]] __fi static void r128_store(void* ptr, r128 val)
{
return _mm_store_si128(reinterpret_cast<r128*>(ptr), val);
}
[[maybe_unused]] __fi static void r128_store_unaligned(void* ptr, r128 val)
{
return _mm_storeu_si128(reinterpret_cast<r128*>(ptr), val);
}
[[maybe_unused]] __fi static r128 r128_zero()
{
return _mm_setzero_si128();
}
/// Expects that r64 came from r64-handling code, and not from a recompiler or something
[[maybe_unused]] __fi static r128 r128_from_u64_dup(u64 val)
{
return _mm_set1_epi64x(val);
}
[[maybe_unused]] __fi static r128 r128_from_u64_zext(u64 val)
{
return _mm_set_epi64x(0, val);
}
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
{
return _mm_setr_epi32(lo0, lo1, hi0, hi1);
}
[[maybe_unused]] __fi static r128 r128_from_u128(const u128& u)
{
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(&u));
}
[[maybe_unused]] __fi static u32 r128_to_u32(r128 val)
{
return _mm_cvtsi128_si32(val);
}
[[maybe_unused]] __fi static u64 r128_to_u64(r128 val)
{
return _mm_cvtsi128_si64(val);
}
[[maybe_unused]] __fi static u128 r128_to_u128(r128 val)
{
alignas(16) u128 ret;
_mm_store_si128(reinterpret_cast<r128*>(&ret), val);
return ret;
}
[[maybe_unused]] __fi static void CopyQWC(void* dest, const void* src)
{
_mm_store_ps((float*)dest, _mm_load_ps((const float*)src));
}
[[maybe_unused]] __fi static void ZeroQWC(void* dest)
{
_mm_store_ps((float*)dest, _mm_setzero_ps());
}
[[maybe_unused]] __fi static void ZeroQWC(u128& dest)
{
_mm_store_ps((float*)&dest, _mm_setzero_ps());
}
#elif defined(_M_ARM64)
using r128 = uint32x4_t;
#define RETURNS_R128 r128 __vectorcall
#define TAKES_R128 __vectorcall
[[maybe_unused]] __fi static void CopyQWC(void* dest, const void* src)
{
vst1q_u8(static_cast<u8*>(dest), vld1q_u8(static_cast<const u8*>(src)));
}
[[maybe_unused]] __fi static void ZeroQWC(void* dest)
{
vst1q_u8(static_cast<u8*>(dest), vmovq_n_u8(0));
}
[[maybe_unused]] __fi static void ZeroQWC(u128& dest)
{
vst1q_u8(&dest._u8[0], vmovq_n_u8(0));
}
[[maybe_unused]] __fi static r128 r128_load(const void* ptr)
{
return vld1q_u32(reinterpret_cast<const uint32_t*>(ptr));
}
[[maybe_unused]] __fi static void r128_store(void* ptr, r128 value)
{
return vst1q_u32(reinterpret_cast<uint32_t*>(ptr), value);
}
[[maybe_unused]] __fi static void r128_store_unaligned(void* ptr, r128 value)
{
return vst1q_u32(reinterpret_cast<uint32_t*>(ptr), value);
}
[[maybe_unused]] __fi static r128 r128_zero()
{
return vmovq_n_u32(0);
}
/// Expects that r64 came from r64-handling code, and not from a recompiler or something
[[maybe_unused]] __fi static r128 r128_from_u64_dup(u64 val)
{
return vreinterpretq_u32_u64(vdupq_n_u64(val));
}
[[maybe_unused]] __fi static r128 r128_from_u64_zext(u64 val)
{
return vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(val), vcreate_u64(0)));
}
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
{
const u32 values[4] = {lo0, lo1, hi0, hi1};
return vld1q_u32(values);
}
[[maybe_unused]] __fi static r128 r128_from_u128(const u128& u)
{
return vld1q_u32(reinterpret_cast<const uint32_t*>(u._u32));
}
[[maybe_unused]] __fi static u32 r128_to_u32(r128 val)
{
return vgetq_lane_u32(val, 0);
}
[[maybe_unused]] __fi static u64 r128_to_u64(r128 val)
{
return vgetq_lane_u64(vreinterpretq_u64_u32(val), 0);
}
[[maybe_unused]] __fi static u128 r128_to_u128(r128 val)
{
alignas(16) u128 ret;
vst1q_u32(ret._u32, val);
return ret;
}
#else
#error Unknown architecture.
#endif

50
common/VectorIntrin.h Normal file
View File

@ -0,0 +1,50 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
// Includes appropriate intrinsic header based on platform.
#pragma once
#ifdef _MSC_VER
#include <intrin.h>
#endif
#if defined(_M_X86)
#if defined(__AVX2__)
#define _M_SSE 0x501
#elif defined(__AVX__)
#define _M_SSE 0x500
#elif defined(__SSE4_1__)
#define _M_SSE 0x401
#else
#error PCSX2 requires compiling for at least SSE 4.1
#endif
// Starting with AVX, processors have fast unaligned loads
// Reduce code duplication by not compiling multiple versions
#if _M_SSE >= 0x500
#define FAST_UNALIGNED 1
#else
#define FAST_UNALIGNED 0
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#elif defined(_M_ARM64)
#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
#ifdef __APPLE__
#include <stdlib.h> // alloca
#else
#include <malloc.h> // alloca
#endif

View File

@ -118,6 +118,8 @@
<ClInclude Include="HeapArray.h" />
<ClInclude Include="HeterogeneousContainers.h" />
<ClInclude Include="Image.h" />
<ClInclude Include="SingleRegisterTypes.h" />
<ClInclude Include="VectorIntrin.h" />
<ClInclude Include="LRUCache.h" />
<ClInclude Include="HTTPDownloader.h" />
<ClInclude Include="HTTPDownloaderCurl.h">

View File

@ -354,6 +354,10 @@
<ClInclude Include="SmallString.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="VectorIntrin.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="SingleRegisterTypes.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">

View File

@ -4,7 +4,7 @@
#include "common/General.h"
#include "common/emitter/tools.h"
#include "common/emitter/internal.h"
#include "common/emitter/x86_intrin.h"
#include "common/VectorIntrin.h"
#include <atomic>
// CPU information support

View File

@ -3,7 +3,7 @@
#include "common/emitter/internal.h"
#include "common/emitter/tools.h"
#include "common/emitter/x86_intrin.h"
#include "common/VectorIntrin.h"
// Mask of valid bit fields for the target CPU. Typically this is either 0xFFFF (SSE2
// or better) or 0xFFBF (SSE1 and earlier). Code can ensure a safe/valid MXCSR by

View File

@ -1,42 +0,0 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
#pragma once
// Because nobody can't agree on a single name !
#if defined(__GNUC__)
// Yes there are several files for the same features!
// x86intrin.h which is the general include provided by the compiler
// x86_intrin.h, this file, which is compatibility layer for severals intrinsics
#include "x86intrin.h"
#else
#include "Intrin.h"
#endif
// Rotate instruction
#if defined(__clang__) && __clang_major__ < 9
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
// Seriously what is so complicated to provided this bunch of intrinsics in clangs.
static unsigned int _rotr(unsigned int x, int s)
{
return (x >> s) | (x << (32 - s));
}
static unsigned int _rotl(unsigned int x, int s)
{
return (x << s) | (x >> (32 - s));
}
#pragma clang diagnostic pop
#endif
// Not correctly defined in GCC4.8 and below ! (dunno for VS)
#ifndef _MM_MK_INSERTPS_NDX
#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) (((srcField) << 6) | ((dstField) << 4) | (zeroMask))
#endif

View File

@ -55,7 +55,6 @@ endif()
if(WIN32)
set(MIN_WIN32 0x0A00)
target_compile_definitions(PCSX2_FLAGS INTERFACE
__SSE4_1__
WINVER=${MIN_WIN32}
_WIN32_WINNT=${MIN_WIN32}
WIN32_LEAN_AND_MEAN
@ -190,7 +189,6 @@ set(pcsx2Headers
Memory.h
MemoryTypes.h
Patch.h
PCSX2Base.h
PerformanceMetrics.h
PrecompiledHeader.h
R3000A.h
@ -200,7 +198,6 @@ set(pcsx2Headers
ShaderCacheVersion.h
Sifcmd.h
Sif.h
SingleRegisterTypes.h
SIO/Sio.h
SIO/Sio2.h
SIO/Sio0.h

View File

@ -4,7 +4,8 @@
#pragma once
#include "Common.h"
#include "SingleRegisterTypes.h"
#include "common/SingleRegisterTypes.h"
void resetCache();
void writeCache8(u32 mem, u8 value);

View File

@ -6,7 +6,8 @@
#include "Common.h"
#include "Gif.h"
#include "GS/GS.h"
#include "SingleRegisterTypes.h"
#include "common/SingleRegisterTypes.h"
extern double GetVerticalFrequency();
alignas(16) extern u8 g_RealGSMem[Ps2MemSize::GSregs];

View File

@ -5,16 +5,7 @@
#include "common/Pcsx2Defs.h"
#include "common/Assertions.h"
#include "PCSX2Base.h"
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#if _M_SSE >= 0x500
#include <immintrin.h>
#endif
#include "common/VectorIntrin.h"
#include <algorithm>
#include <cstring>

View File

@ -3,8 +3,8 @@
#pragma once
#include "PCSX2Base.h"
#include "common/Pcsx2Defs.h"
#include "common/VectorIntrin.h"
// For multiple-isa compilation
#ifdef MULTI_ISA_UNSHARED_COMPILATION

View File

@ -3,7 +3,6 @@
#pragma once
#include "PCSX2Base.h"
#include <string>
#include <vector>

View File

@ -3,33 +3,8 @@
#pragma once
#ifdef __linux__
#include <signal.h>
#endif
#include "vtlb.h"
#include "common/emitter/x86_intrin.h"
// [TODO] This *could* be replaced with an assignment operator on u128 that implicitly
// uses _mm_store and _mm_load internally. However, there are alignment concerns --
// u128 is not alignment strict. (we would need a u128 and u128a for types known to
// be strictly 128-bit aligned).
static __fi void CopyQWC( void* dest, const void* src )
{
_mm_store_ps( (float*)dest, _mm_load_ps((const float*)src) );
}
static __fi void ZeroQWC( void* dest )
{
_mm_store_ps( (float*)dest, _mm_setzero_ps() );
}
static __fi void ZeroQWC( u128& dest )
{
_mm_store_ps( (float*)&dest, _mm_setzero_ps() );
}
#define PSM(mem) (vtlb_GetPhyPtr((mem)&0x1fffffff)) //pcsx2 is a competition.The one with most hacks wins :D
#define psHs8(mem) (*(s8 *)&eeHw[(mem) & 0xffff])
@ -108,7 +83,7 @@ extern void memMapVUmicro();
#define memWrite32 vtlb_memWrite<mem32_t>
#define memWrite64 vtlb_memWrite<mem64_t>
static __fi void memRead128(u32 mem, mem128_t* out) { _mm_store_si128((__m128i*)out, vtlb_memRead128(mem)); }
static __fi void memRead128(u32 mem, mem128_t* out) { r128_store(out, vtlb_memRead128(mem)); }
static __fi void memRead128(u32 mem, mem128_t& out) { memRead128(mem, &out); }
static __fi void memWrite128(u32 mem, const mem128_t* val) { vtlb_memWrite128(mem, r128_load(val)); }

View File

@ -1,27 +0,0 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
/// Base defines and typedefs that are needed by all code in PCSX2
/// Prefer this over including Pcsx2Defs.h to make sure everyone gets all the defines, as missing defines fail silently
#pragma once
#include "common/Pcsx2Defs.h"
#if defined(__AVX2__)
#define _M_SSE 0x501
#elif defined(__AVX__)
#define _M_SSE 0x500
#elif defined(__SSE4_1__)
#define _M_SSE 0x401
#else
#error PCSX2 requires compiling for at least SSE 4.1
#endif
// Starting with AVX, processors have fast unaligned loads
// Reduce code duplication by not compiling multiple versions
#if _M_SSE >= 0x500
#define FAST_UNALIGNED 1
#else
#define FAST_UNALIGNED 0
#endif

View File

@ -43,9 +43,3 @@
// We use fmt a fair bit now.
#include "fmt/core.h"
//////////////////////////////////////////////////////////////////////////////////////////
// Begin Pcsx2 Includes: Add items here that are local to Pcsx2 but stay relatively
// unchanged for long periods of time, or happen to be used by almost everything, so they
// need a full recompile anyway, when modified (etc)
#include "PCSX2Base.h"

View File

@ -1,79 +0,0 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
// --------------------------------------------------------------------------------------
// r64 / r128 - Types that are guaranteed to fit in one register
// --------------------------------------------------------------------------------------
// Note: Recompilers rely on some of these types and the registers they allocate to,
// so be careful if you want to change them
#pragma once
#include <cstring>
#include <immintrin.h>
#include <emmintrin.h>
// Can't stick them in structs because it breaks calling convention things, yay
using r128 = __m128i;
// Calling convention setting, yay
#define RETURNS_R128 r128 __vectorcall
#define TAKES_R128 __vectorcall
// And since we can't stick them in structs, we get lots of static methods, yay!
__forceinline static r128 r128_load(const void* ptr)
{
return _mm_load_si128(reinterpret_cast<const r128*>(ptr));
}
__forceinline static void r128_store(void* ptr, r128 val)
{
return _mm_store_si128(reinterpret_cast<r128*>(ptr), val);
}
__forceinline static void r128_store_unaligned(void* ptr, r128 val)
{
return _mm_storeu_si128(reinterpret_cast<r128*>(ptr), val);
}
__forceinline static r128 r128_zero()
{
return _mm_setzero_si128();
}
/// Expects that r64 came from r64-handling code, and not from a recompiler or something
__forceinline static r128 r128_from_u64_dup(u64 val)
{
return _mm_set1_epi64x(val);
}
__forceinline static r128 r128_from_u64_zext(u64 val)
{
return _mm_set_epi64x(0, val);
}
__forceinline static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
{
return _mm_setr_epi32(lo0, lo1, hi0, hi1);
}
__forceinline static r128 r128_from_u128(const u128& u)
{
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(&u));
}
__forceinline static u32 r128_to_u32(r128 val)
{
return _mm_cvtsi128_si32(val);
}
__forceinline static u64 r128_to_u64(r128 val)
{
return _mm_cvtsi128_si64(val);
}
__forceinline static u128 r128_to_u128(r128 val)
{
alignas(16) u128 ret;
_mm_store_si128(reinterpret_cast<r128*>(&ret), val);
return ret;
}

View File

@ -22,7 +22,7 @@
#include "common/StringUtil.h"
#ifdef _M_X86
#include "common/emitter/x86_intrin.h"
#include "common/emitter/tools.h"
#endif
extern R5900cpu GSDumpReplayerCpu;

View File

@ -24,7 +24,6 @@
#include "LogSink.h"
#include "MTGS.h"
#include "MTVU.h"
#include "PCSX2Base.h"
#include "PINE.h"
#include "Patch.h"
#include "PerformanceMetrics.h"
@ -50,7 +49,6 @@
#include "common/StringUtil.h"
#include "common/Threading.h"
#include "common/Timer.h"
#include "common/emitter/tools.h"
#include "IconsFontAwesome5.h"
#include "discord_rpc.h"
@ -61,7 +59,7 @@
#include <sstream>
#ifdef _M_X86
#include "common/emitter/x86_intrin.h"
#include "common/emitter/tools.h"
#endif
#ifdef _WIN32

View File

@ -672,7 +672,6 @@
<ClInclude Include="IPU\IPUdma.h" />
<ClInclude Include="Mdec.h" />
<ClInclude Include="Patch.h" />
<ClInclude Include="PCSX2Base.h" />
<ClInclude Include="PrecompiledHeader.h" />
<ClInclude Include="ps2\pgif.h" />
<ClInclude Include="StateWrapper.h" />
@ -711,7 +710,6 @@
<ClInclude Include="Common.h" />
<ClInclude Include="Config.h" />
<ClInclude Include="SaveState.h" />
<ClInclude Include="SingleRegisterTypes.h" />
<ClInclude Include="System.h" />
<ClInclude Include="Counters.h" />
<ClInclude Include="Dmac.h" />

View File

@ -1418,9 +1418,6 @@
<ClInclude Include="Patch.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="PCSX2Base.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="PrecompiledHeader.h">
<Filter>Misc</Filter>
</ClInclude>
@ -1442,9 +1439,6 @@
<ClInclude Include="SaveState.h">
<Filter>System\Include</Filter>
</ClInclude>
<ClInclude Include="SingleRegisterTypes.h">
<Filter>System\Include</Filter>
</ClInclude>
<ClInclude Include="System.h">
<Filter>System\Include</Filter>
</ClInclude>

View File

@ -4,8 +4,8 @@
#pragma once
#include "Hw.h"
#include "SingleRegisterTypes.h"
#include "common/SingleRegisterTypes.h"
// hw read functions
template< uint page > extern mem8_t hwRead8 (u32 mem);

View File

@ -4,9 +4,10 @@
#pragma once
#include "MemoryTypes.h"
#include "SingleRegisterTypes.h"
#include "System.h"
#include "common/SingleRegisterTypes.h"
static const uptr VTLB_AllocUpperBounds = _1gb * 2;
// Specialized function pointers for each read type

View File

@ -7,10 +7,6 @@
#include "Vif_Dma.h"
#include "newVif.h"
#include "common/emitter/x86_intrin.h"
using namespace x86Emitter;
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Base
// --------------------------------------------------------------------------------------