From aa430608e70fe6857ef7a9d63b59525b5f261f18 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 29 Jun 2021 22:25:43 +0200 Subject: [PATCH] support allocating more registers for aarch64 JIT also some minor fixes for the x64 JIT as well --- src/ARMJIT.cpp | 8 +++- src/ARMJIT_A64/ARMJIT_Branch.cpp | 10 ++-- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 74 ++++++++++++++++++----------- src/ARMJIT_A64/ARMJIT_Compiler.h | 7 +-- src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 21 +++++--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 25 ++++++---- src/ARMJIT_x64/ARMJIT_Compiler.h | 6 +-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 14 +++--- src/ARM_InstrInfo.cpp | 7 ++- 10 files changed, 112 insertions(+), 64 deletions(-) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 38e9ddcd..361801b7 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -645,6 +645,8 @@ void CompileBlock(ARM* cpu) u32 lr; bool hasLink = false; + bool hasMemoryInstr = false; + do { r15 += thumb ? 2 : 4; @@ -707,6 +709,10 @@ void CompileBlock(ARM* cpu) } instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + hasMemoryInstr |= thumb + ? (instrs[i].Info.Kind >= ARMInstrInfo::tk_LDR_PCREL && instrs[i].Info.Kind <= ARMInstrInfo::tk_STMIA) + : (instrs[i].Info.Kind >= ARMInstrInfo::ak_STR_REG_LSL && instrs[i].Info.Kind <= ARMInstrInfo::ak_STM); + cpu->R[15] = r15; cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; @@ -915,7 +921,7 @@ void CompileBlock(ARM* cpu) #if defined(__APPLE__) && defined(__aarch64__) pthread_jit_write_protect_np(false); #endif - block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i, hasMemoryInstr); #if defined(__APPLE__) && defined(__aarch64__) pthread_jit_write_protect_np(true); #endif diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index eeabfb0e..2f640c8a 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -27,7 +27,7 @@ namespace ARMJIT { template -void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +void JumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) { cpu->JumpTo(addr, changeCPSR); } @@ -301,7 +301,7 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto bool cpsrDirty = CPSRDirty; SaveCPSR(); SaveCycles(); - PushRegs(restoreCPSR); + PushRegs(restoreCPSR, true); if (switchThumb) MOV(W1, addr); @@ -315,11 +315,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto MOV(X0, RCPU); MOVI2R(W2, restoreCPSR); if (Num == 0) - QuickCallFunction(X3, jumpToTrampoline); + QuickCallFunction(X3, JumpToTrampoline); else - QuickCallFunction(X3, jumpToTrampoline); + QuickCallFunction(X3, JumpToTrampoline); - PopRegs(restoreCPSR); + PopRegs(restoreCPSR, true); LoadCycles(); LoadCPSR(); if (CurInstr.Cond() < 0xE) diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 4fbb804b..7dc854ae 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -58,9 +58,14 @@ namespace ARMJIT template <> const ARM64Reg RegisterCache::NativeRegAllocOrder[] = - {W19, W20, W21, W22, W23, W24, W25, W26}; +{ + W19, W20, W21, W22, W23, W24, W25, + W8, W9, W10, W11, W12, W13, W14, W15 +}; template <> -const int RegisterCache::NativeRegsAvailable = 8; +const int RegisterCache::NativeRegsAvailable = 15; + +const BitSet32 CallerSavedPushRegs({W8, W9, W10, W11, W12, W13, W14, W15}); const int JitMemSize = 16 * 1024 * 1024; #ifndef __SWITCH__ @@ -164,44 +169,55 @@ void Compiler::A_Comp_MSR() MOV(W2, RCPSR); MOV(X0, RCPU); - PushRegs(true); - - QuickCallFunction(X3, (void*)&UpdateModeTrampoline); - - PopRegs(true); + PushRegs(true, true); + QuickCallFunction(X3, UpdateModeTrampoline); + PopRegs(true, true); } } } -void Compiler::PushRegs(bool saveHiRegs) + +void Compiler::PushRegs(bool saveHiRegs, bool saveRegsToBeChanged, bool allowUnload) { + BitSet32 loadedRegs(RegCache.LoadedRegs); + if (saveHiRegs) { - if (Thumb || CurInstr.Cond() == 0xE) + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) { - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - for (int reg : hiRegsLoaded) + if (Thumb || CurInstr.Cond() == 0xE) RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; } - else + } + + for (int reg : loadedRegs) + { + if (CallerSavedPushRegs[RegCache.Mapping[reg]] + && (saveRegsToBeChanged || !((1<= 8 && reg < 15) + || (CallerSavedPushRegs[RegCache.Mapping[reg]] + && (saveRegsToBeChanged || !((1<= W19 && reg <= W26))) + if (!(reg == W4 || (reg >= W8 && reg <= W15) || (reg >= W19 && reg <= W25))) continue; ARM64Reg rdMapped = (ARM64Reg)reg; PatchedStoreFuncs[consoleType][num][size][reg] = GetRXPtr(); @@ -371,7 +388,7 @@ Compiler::Compiler() { MOV(W1, rdMapped); } - ABI_PushRegisters({30}); + ABI_PushRegisters(BitSet32({30}) | CallerSavedPushRegs); if (consoleType == 0) { switch ((8 << size) | num) @@ -397,7 +414,7 @@ Compiler::Compiler() } } - ABI_PopRegisters({30}); + ABI_PopRegisters(BitSet32({30}) | CallerSavedPushRegs); RET(); for (int signextend = 0; signextend < 2; signextend++) @@ -405,7 +422,7 @@ Compiler::Compiler() PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetRXPtr(); if (num == 0) MOV(X1, RCPU); - ABI_PushRegisters({30}); + ABI_PushRegisters(BitSet32({30}) | CallerSavedPushRegs); if (consoleType == 0) { switch ((8 << size) | num) @@ -430,7 +447,7 @@ Compiler::Compiler() case 9: QuickCallFunction(X3, SlowRead7); break; } } - ABI_PopRegisters({30}); + ABI_PopRegisters(BitSet32({30}) | CallerSavedPushRegs); if (size == 32) MOV(rdMapped, W0); else if (signextend) @@ -673,7 +690,7 @@ void Compiler::Comp_BranchSpecialBehaviour(bool taken) } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount, bool hasMemInstr) { if (JitMemMainSize - GetCodeOffset() < 1024 * 16) { @@ -695,6 +712,9 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] RegCache = RegisterCache(this, instrs, instrsCount, true); CPSRDirty = false; + if (hasMemInstr) + MOVP2R(RMemBase, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + for (int i = 0; i < instrsCount; i++) { CurInstr = instrs[i]; diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index d18da933..24e730b1 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -32,6 +32,7 @@ namespace ARMJIT { +const Arm64Gen::ARM64Reg RMemBase = Arm64Gen::X26; const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27; const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28; const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29; @@ -99,8 +100,8 @@ public: Compiler(); ~Compiler(); - void PushRegs(bool saveHiRegs); - void PopRegs(bool saveHiRegs); + void PushRegs(bool saveHiRegs, bool saveRegsToBeChanged, bool allowUnload = true); + void PopRegs(bool saveHiRegs, bool saveRegsToBeChanged); Arm64Gen::ARM64Reg MapReg(int reg) { @@ -108,7 +109,7 @@ public: return RegCache.Mapping[reg]; } - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount, bool hasMemInstr); bool CanCompile(bool thumb, u16 kind); diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 4f05d4da..5ac629b1 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -194,13 +194,11 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ptrdiff_t memopStart = GetCodeOffset(); LoadStorePatch patch; - assert((rdMapped >= W19 && rdMapped <= W26) || rdMapped == W4); + assert((rdMapped >= W8 && rdMapped <= W15) || (rdMapped >= W19 && rdMapped <= W25) || rdMapped == W4); patch.PatchFunc = flags & memop_Store ? PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped] : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped]; - MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); - // take a chance at fastmem if (size > 8) ANDI2R(W1, W0, addressMask); @@ -208,11 +206,11 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) { - STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, RMemBase); } else { - LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7); + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, RMemBase); if (size == 32 && !addrIsStatic) { UBFIZ(W0, W0, 3, 2); @@ -230,12 +228,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) if (addrIsStatic) func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); + PushRegs(false, false); + if (func) { if (flags & memop_Store) MOV(W1, rdMapped); QuickCallFunction(X2, (void (*)())func); + PopRegs(false, false); + if (!(flags & memop_Store)) { if (size == 32) @@ -314,6 +316,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) } } + PopRegs(false, false); + if (!(flags & memop_Store)) { if (size == 32) @@ -515,8 +519,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ptrdiff_t fastPathStart = GetCodeOffset(); ptrdiff_t loadStoreOffsets[8]; - MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); - ADD(X1, X1, X0); + ADD(X1, RMemBase, X0); u32 offset = 0; BitSet16::Iterator it = regs.begin(); @@ -655,6 +658,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + PushRegs(false, false, !compileFastPath); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); @@ -680,6 +685,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + PopRegs(false, false); + if (!store) { if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 8ca3542d..a89cf990 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -165,7 +165,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) bool cpsrDirty = CPSRDirty; SaveCPSR(); - PushRegs(restoreCPSR); + PushRegs(restoreCPSR, true); MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), R(addr)); @@ -178,7 +178,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else CALL((void*)&ARMv4JumpToTrampoline); - PopRegs(restoreCPSR); + PopRegs(restoreCPSR, true); LoadCPSR(); // in case this instruction is skipped diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5ab8c6a2..aef8b91b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -64,7 +64,7 @@ const BitSet32 CallerSavedPushRegs({R10, R11}); const BitSet32 CallerSavedPushRegs({R9, R10, R11}); #endif -void Compiler::PushRegs(bool saveHiRegs) +void Compiler::PushRegs(bool saveHiRegs, bool saveRegsToBeChanged, bool allowUnload) { BitSet32 loadedRegs(RegCache.LoadedRegs); @@ -83,17 +83,26 @@ void Compiler::PushRegs(bool saveHiRegs) } for (int reg : loadedRegs) - if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) - SaveReg(reg, RegCache.Mapping[reg]); + { + if (CallerSavedPushRegs[RegCache.Mapping[reg]] + && (saveRegsToBeChanged || !((1<= 8 && reg < 15) - || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + || (CallerSavedPushRegs[RegCache.Mapping[reg]] + && (saveRegsToBeChanged || !((1<); break; } - PopRegs(false); + PopRegs(false, false); if (allocOffset) ADD(64, R(RSP), Imm8(allocOffset)); @@ -606,7 +606,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (allocOffset) SUB(64, R(RSP), Imm8(allocOffset)); - PushRegs(false); + PushRegs(false, false, !compileFastPath); MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); if (allocOffset) @@ -628,7 +628,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - PopRegs(false); + PopRegs(false, false); } if (compileFastPath) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 7562e78f..535436ce 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -526,7 +526,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_LoadMem) { if (res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + res.SpecialKind = special_LoadLiteral; else res.SpecialKind = special_LoadMem; } @@ -536,6 +536,11 @@ Info Decode(bool thumb, u32 num, u32 instr) u16 set = (instr & 0xFFFF); res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; + // when the instruction is executed not in usermode a banked register in memory will be written to + // but the unbanked register will still be allocated, so it is expected to carry the proper value + // thus it is a source register + if (instr & (1<<22)) + res.SrcRegs |= set & 0x7F00; } if (res.Kind == ak_STM) {