Compile fixes for Windows-on-ARM64

This commit is contained in:
Stenzek 2019-11-26 15:31:45 +11:00
parent 6fcb1c6c46
commit d744c5a148
13 changed files with 115 additions and 64 deletions

View File

@ -364,6 +364,8 @@ void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end)
#if defined(IOS)
// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
#elif defined(WIN32)
FlushInstructionCache(GetCurrentProcess(), start, end - start);
#else
// Don't rely on GCC's __clear_cache implementation, as it caches
// icache/dcache cache line sizes, that can vary between cores on
@ -2172,6 +2174,8 @@ void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
ARM64Reg second;
if (!(num_regs & 1))
second = (ARM64Reg)(X0 + *it++);
else
second = {};
// 8 bytes per register, but 16-byte alignment, so we may have to pad one register.
// Only update the SP on the last load to avoid the dependency between those loads.
@ -4164,20 +4168,19 @@ void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative,
bool flags)
{
switch ((negative << 1) | flags)
if (!negative)
{
case 0:
ADD(Rd, Rn, imm, shift);
break;
case 1:
ADDS(Rd, Rn, imm, shift);
break;
case 2:
SUB(Rd, Rn, imm, shift);
break;
case 3:
SUBS(Rd, Rn, imm, shift);
break;
if (!flags)
ADD(Rd, Rn, imm, shift);
else
ADDS(Rd, Rn, imm, shift);
}
else
{
if (!flags)
SUB(Rd, Rn, imm, shift);
else
SUBS(Rd, Rn, imm, shift);
}
}
@ -4185,7 +4188,7 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
ARM64Reg scratch)
{
bool has_scratch = scratch != INVALID_REG;
u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL;
u64 imm_neg = Is64Bit(Rd) ? u64(-s64(imm)) : u64(-s64(imm)) & 0xFFFFFFFFuLL;
bool neg_neg = negative ? false : true;
// Fast paths, aarch64 immediate instructions
@ -4232,20 +4235,19 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
(u32)imm);
negative ^= MOVI2R2(scratch, imm, imm_neg);
switch ((negative << 1) | flags)
if (!negative)
{
case 0:
ADD(Rd, Rn, scratch);
break;
case 1:
ADDS(Rd, Rn, scratch);
break;
case 2:
SUB(Rd, Rn, scratch);
break;
case 3:
SUBS(Rd, Rn, scratch);
break;
if (!flags)
ADD(Rd, Rn, scratch);
else
ADDS(Rd, Rn, scratch);
}
else
{
if (!flags)
SUB(Rd, Rn, scratch);
else
SUBS(Rd, Rn, scratch);
}
}

View File

@ -2,13 +2,17 @@
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <asm/hwcap.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <thread>
#ifndef _WIN32
#include <asm/hwcap.h>
#include <sys/auxv.h>
#include <unistd.h>
#endif
#include <fmt/format.h>
@ -16,6 +20,8 @@
#include "Common/CommonTypes.h"
#include "Common/FileUtil.h"
#ifndef WIN32
const char procfile[] = "/proc/cpuinfo";
static std::string GetCPUString()
@ -42,6 +48,8 @@ static std::string GetCPUString()
return cpu_string;
}
#endif
CPUInfo cpu_info;
CPUInfo::CPUInfo()
@ -60,6 +68,21 @@ void CPUInfo::Detect()
Mode64bit = true;
vendor = CPUVendor::ARM;
#ifdef _WIN32
num_cores = std::thread::hardware_concurrency();
// Windows does not provide any mechanism for querying the system registers on ARMv8, unlike Linux
// which traps the register reads and emulates them in the kernel. There are environment variables
// containing some of the CPU-specific values, which we could use for a lookup table in the
// future. For now, assume all features are present as all known devices which are Windows-on-ARM
// compatible also support these extensions.
bFP = true;
bASIMD = true;
bAES = true;
bCRC32 = true;
bSHA1 = true;
bSHA2 = true;
#else
// Get the information about the CPU
num_cores = sysconf(_SC_NPROCESSORS_CONF);
strncpy(cpu_string, GetCPUString().c_str(), sizeof(cpu_string));
@ -71,6 +94,7 @@ void CPUInfo::Detect()
bCRC32 = hwcaps & HWCAP_CRC32;
bSHA1 = hwcaps & HWCAP_SHA1;
bSHA2 = hwcaps & HWCAP_SHA2;
#endif
}
// Turn the CPU info into a string we can show

View File

@ -11,7 +11,7 @@
#include "Common/CommonFuncs.h"
#include "Common/Intrinsics.h"
#ifdef _M_ARM_64
#if defined(_M_ARM_64) && !defined(_MSC_VER)
#include <arm_acle.h>
#endif

View File

@ -33,6 +33,10 @@ typedef CONTEXT SContext;
#define CTX_R14 R14
#define CTX_R15 R15
#define CTX_RIP Rip
#elif _M_ARM64
#define CTX_REG(x) X[x]
#define CTX_SP Sp
#define CTX_PC Pc
#else
#error No context definition for architecture
#endif

View File

@ -754,9 +754,9 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(msr));
FixupBranch b1 = TBNZ(WA, 13); // Test FP enabled bit
FixupBranch far = B();
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far);
SetJumpTarget(far_addr);
gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);

View File

@ -143,9 +143,9 @@ void JitArm64::bcx(UGeckoInstruction inst)
JumpIfCRFieldBit(inst.BI >> 2, 3 - (inst.BI & 3), !(inst.BO_2 & BO_BRANCH_IF_TRUE));
}
FixupBranch far = B();
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far);
SetJumpTarget(far_addr);
if (inst.LK)
{
@ -160,12 +160,12 @@ void JitArm64::bcx(UGeckoInstruction inst)
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
ARM64Reg WA2 = gpr.GetReg();
ARM64Reg XA2 = EncodeRegTo64(WA2);
MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
gpr.Unlock(WA);
MOVP2R(XA2, &CoreTiming::Idle);
BLR(XA2);
gpr.Unlock(WA2);
WriteExceptionExit(js.op->branchTo);
}
@ -260,9 +260,9 @@ void JitArm64::bclrx(UGeckoInstruction inst)
if (conditional)
{
FixupBranch far = B();
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far);
SetJumpTarget(far_addr);
}
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_LR]));

View File

@ -35,7 +35,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
bool inputs_are_singles = fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
(!use_c || fpr.IsSingle(c, !packed));
ARM64Reg VA, VB, VC, VD;
ARM64Reg VA{}, VB{}, VC{}, VD{};
if (packed)
{

View File

@ -368,7 +368,11 @@ void JitArm64::cntlzwx(UGeckoInstruction inst)
if (gpr.IsImm(s))
{
#ifdef _MSC_VER
gpr.SetImmediate(a, _CountLeadingZeros(gpr.GetImm(s)));
#else
gpr.SetImmediate(a, __builtin_clz(gpr.GetImm(s)));
#endif
if (inst.Rc)
ComputeRC0(gpr.GetImm(a));
}
@ -931,7 +935,7 @@ void JitArm64::subfex(UGeckoInstruction inst)
// d = ~a + b + carry;
if (gpr.IsImm(a))
MOVI2R(WA, ~gpr.GetImm(a));
MOVI2R(WA, u32(~gpr.GetImm(a)));
else
MVN(WA, gpr.R(a));
ADCS(gpr.R(d), WA, gpr.R(b));
@ -1187,7 +1191,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
if (inst.Rc)
ComputeRC0(imm_d);
}
else if (gpr.IsImm(b) && gpr.GetImm(b) != 0 && gpr.GetImm(b) != -1u)
else if (gpr.IsImm(b) && gpr.GetImm(b) != 0 && gpr.GetImm(b) != UINT32_C(0xFFFFFFFF))
{
ARM64Reg WA = gpr.GetReg();
MOVI2R(WA, gpr.GetImm(b));

View File

@ -408,7 +408,7 @@ void JitArm64::stX(UGeckoInstruction inst)
gpr.BindToRegister(a, false);
ARM64Reg WA = gpr.GetReg();
ARM64Reg RB;
ARM64Reg RB = {};
ARM64Reg RA = gpr.R(a);
if (regOffset != -1)
RB = gpr.R(regOffset);
@ -549,9 +549,9 @@ void JitArm64::dcbx(UGeckoInstruction inst)
LSR(value, value, addr); // move current bit to bit 0
FixupBranch bit_not_set = TBZ(value, 0);
FixupBranch far = B();
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far);
SetJumpTarget(far_addr);
BitSet32 gprs_to_push = gpr.GetCallerSavedUsed();
BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
@ -568,10 +568,10 @@ void JitArm64::dcbx(UGeckoInstruction inst)
m_float_emit.ABI_PopRegisters(fprs_to_push, X30);
ABI_PopRegisters(gprs_to_push);
FixupBranch near = B();
FixupBranch near_addr = B();
SwitchToNearCode();
SetJumpTarget(bit_not_set);
SetJumpTarget(near);
SetJumpTarget(near_addr);
gpr.Unlock(addr, value, W30);
}

View File

@ -149,6 +149,7 @@ Arm64GPRCache::GuestRegInfo Arm64GPRCache::GetGuestByIndex(size_t index)
if (index >= GUEST_CR_OFFSET && index < GUEST_CR_OFFSET + GUEST_CR_COUNT)
return GetGuestCR(index - GUEST_CR_OFFSET);
ASSERT_MSG(DYNA_REC, false, "Invalid index for guest register");
return GetGuestGPR(0);
}
void Arm64GPRCache::FlushRegister(size_t index, bool maintain_state)
@ -161,7 +162,7 @@ void Arm64GPRCache::FlushRegister(size_t index, bool maintain_state)
{
ARM64Reg host_reg = reg.GetReg();
if (reg.IsDirty())
m_emit->STR(INDEX_UNSIGNED, host_reg, PPC_REG, guest_reg.ppc_offset);
m_emit->STR(INDEX_UNSIGNED, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
if (!maintain_state)
{
@ -173,14 +174,14 @@ void Arm64GPRCache::FlushRegister(size_t index, bool maintain_state)
{
if (!reg.GetImm())
{
m_emit->STR(INDEX_UNSIGNED, bitsize == 64 ? ZR : WZR, PPC_REG, guest_reg.ppc_offset);
m_emit->STR(INDEX_UNSIGNED, bitsize == 64 ? ZR : WZR, PPC_REG, u32(guest_reg.ppc_offset));
}
else
{
ARM64Reg host_reg = bitsize != 64 ? GetReg() : EncodeRegTo64(GetReg());
m_emit->MOVI2R(host_reg, reg.GetImm());
m_emit->STR(INDEX_UNSIGNED, host_reg, PPC_REG, guest_reg.ppc_offset);
m_emit->STR(INDEX_UNSIGNED, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
UnlockRegister(DecodeReg(host_reg));
}
@ -207,7 +208,7 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
ARM64Reg RX1 = R(GetGuestByIndex(i));
ARM64Reg RX2 = R(GetGuestByIndex(i + 1));
m_emit->STP(INDEX_SIGNED, RX1, RX2, PPC_REG, ppc_offset);
m_emit->STP(INDEX_SIGNED, RX1, RX2, PPC_REG, u32(ppc_offset));
if (!maintain_state)
{
UnlockRegister(DecodeReg(RX1));
@ -285,7 +286,7 @@ ARM64Reg Arm64GPRCache::R(const GuestRegInfo& guest_reg)
ARM64Reg host_reg = bitsize != 64 ? GetReg() : EncodeRegTo64(GetReg());
reg.Load(host_reg);
reg.SetDirty(false);
m_emit->LDR(INDEX_UNSIGNED, host_reg, PPC_REG, guest_reg.ppc_offset);
m_emit->LDR(INDEX_UNSIGNED, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
return host_reg;
}
break;
@ -318,7 +319,7 @@ void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool do_load)
ARM64Reg host_reg = bitsize != 64 ? GetReg() : EncodeRegTo64(GetReg());
reg.Load(host_reg);
if (do_load)
m_emit->LDR(INDEX_UNSIGNED, host_reg, PPC_REG, guest_reg.ppc_offset);
m_emit->LDR(INDEX_UNSIGNED, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
}
}
@ -450,7 +451,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
// Load the high 64 bits from the file and insert them into the high 64 bits of the host
// register
ARM64Reg tmp_reg = GetReg();
m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps1));
m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, PPC_REG, u32(PPCSTATE_OFF(ps[preg].ps1)));
m_float_emit->INS(64, host_reg, 1, tmp_reg, 0);
UnlockRegister(tmp_reg);
@ -503,7 +504,8 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
reg.Load(host_reg, REG_LOWER_PAIR);
}
reg.SetDirty(false);
m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps0));
m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, PPC_REG,
u32(PPCSTATE_OFF(ps[preg].ps0)));
return host_reg;
}
default:
@ -551,7 +553,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
// store.
// It would take longer to do an insert to a temporary and a 64bit store than to just do this.
m_float_emit->STR(128, INDEX_UNSIGNED, flush_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps0));
m_float_emit->STR(128, INDEX_UNSIGNED, flush_reg, PPC_REG, u32(PPCSTATE_OFF(ps[preg].ps0)));
break;
case REG_DUP_SINGLE:
flush_reg = GetReg();
@ -559,7 +561,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
[[fallthrough]];
case REG_DUP:
// Store PSR1 (which is equal to PSR0) in memory.
m_float_emit->STR(64, INDEX_UNSIGNED, flush_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps1));
m_float_emit->STR(64, INDEX_UNSIGNED, flush_reg, PPC_REG, u32(PPCSTATE_OFF(ps[preg].ps1)));
break;
default:
// All other types don't store anything in PSR1.
@ -684,7 +686,10 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
store_size = 64;
if (dirty)
m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps0));
{
m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, PPC_REG,
u32(PPCSTATE_OFF(ps[preg].ps0)));
}
if (!maintain_state)
{
@ -700,8 +705,8 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
// Too bad moving them would break savestate compatibility between x86_64 and AArch64
// m_float_emit->STP(64, INDEX_SIGNED, host_reg, host_reg, PPC_REG,
// PPCSTATE_OFF(ps[preg].ps0));
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps0));
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, PPC_REG, PPCSTATE_OFF(ps[preg].ps1));
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, PPC_REG, u32(PPCSTATE_OFF(ps[preg].ps0)));
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, PPC_REG, u32(PPCSTATE_OFF(ps[preg].ps1)));
}
if (!maintain_state)

View File

@ -32,6 +32,7 @@ FixupBranch JitArm64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
return jump_if_set ? TBNZ(XA, 62) : TBZ(XA, 62);
default:
ASSERT_MSG(DYNA_REC, false, "Invalid CR bit");
return {};
}
}
@ -196,9 +197,9 @@ void JitArm64::twx(UGeckoInstruction inst)
SetJumpTarget(fixup);
}
FixupBranch far = B();
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far);
SetJumpTarget(far_addr);
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

View File

@ -379,7 +379,7 @@ void VertexLoaderARM64::GenerateVertexLoader()
bool has_tc_scale = false;
for (int i = 0; i < 8; i++)
{
has_tc |= tc[i];
has_tc |= tc[i] != 0;
has_tc_scale |= !!m_VtxAttr.texCoord[i].Frac;
}

View File

@ -17,14 +17,21 @@
#include "Core/ConfigManager.h"
#include "Core/Core.h"
// OpenGL is not available on Windows-on-ARM64
#if !defined(_WIN32) || !defined(_M_ARM64)
#define HAS_OPENGL 1
#endif
// TODO: ugly
#ifdef _WIN32
#include "VideoBackends/D3D/VideoBackend.h"
#include "VideoBackends/D3D12/VideoBackend.h"
#endif
#include "VideoBackends/Null/VideoBackend.h"
#ifdef HAS_OPENGL
#include "VideoBackends/OGL/VideoBackend.h"
#include "VideoBackends/Software/VideoBackend.h"
#endif
#include "VideoBackends/Vulkan/VideoBackend.h"
#include "VideoCommon/AsyncRequests.h"
@ -182,13 +189,17 @@ u16 VideoBackendBase::Video_GetBoundingBox(int index)
void VideoBackendBase::PopulateList()
{
// OGL > D3D11 > D3D12 > Vulkan > SW > Null
#ifdef HAS_OPENGL
g_available_video_backends.push_back(std::make_unique<OGL::VideoBackend>());
#endif
#ifdef _WIN32
g_available_video_backends.push_back(std::make_unique<DX11::VideoBackend>());
g_available_video_backends.push_back(std::make_unique<DX12::VideoBackend>());
#endif
g_available_video_backends.push_back(std::make_unique<Vulkan::VideoBackend>());
#ifdef HAS_OPENGL
g_available_video_backends.push_back(std::make_unique<SW::VideoSoftware>());
#endif
g_available_video_backends.push_back(std::make_unique<Null::VideoBackend>());
const auto iter =