From 2f6a740b6523400776d5ba48d78590a54c3f23c1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 28 Oct 2024 01:52:06 +0100 Subject: [PATCH] works on Linux x64 still needs to be fixed for everything else --- src/ARMJIT.cpp | 13 ++++ src/ARMJIT.h | 10 +--- src/ARMJIT_CodeMem.cpp | 93 +++++++++++++++++++++++++++++ src/ARMJIT_CodeMem.h | 43 +++++++++++++ src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 52 ++++------------ src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 64 ++++++++++---------- src/CMakeLists.txt | 1 + src/NDS.cpp | 5 +- src/NDS.h | 4 +- src/dolphin/x64Emitter.h | 22 +++++++ src/frontend/qt_sdl/EmuInstance.cpp | 3 - 13 files changed, 227 insertions(+), 89 deletions(-) create mode 100644 src/ARMJIT_CodeMem.cpp create mode 100644 src/ARMJIT_CodeMem.h diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 1ebcce8e..d698b0a9 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -30,6 +30,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Memory.h" #include "ARMJIT_Compiler.h" +#include "ARMJIT_CodeMem.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -467,6 +468,18 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F +ARMJIT::ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept : + NDS(nds), + Memory(nds), + JITCompiler(nds), + MaxBlockSize(jit.has_value() ? std::clamp(jit->MaxBlockSize, 1u, 32u) : 32), + LiteralOptimizations(jit.has_value() ? jit->LiteralOptimizations : false), + BranchOptimizations(jit.has_value() ? jit->BranchOptimizations : false), + FastMemory(jit.has_value() ? 
jit->FastMemory : false) +{ + ARMJIT_CodeMem::Init(); +} + void ARMJIT::RetireJitBlock(JitBlock* block) noexcept { auto it = RestoreCandidates.find(block->InstrHash); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index a228a4dd..98bee567 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -44,15 +44,7 @@ class JitBlock; class ARMJIT { public: - ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept : - NDS(nds), - Memory(nds), - JITCompiler(nds), - MaxBlockSize(jit.has_value() ? std::clamp(jit->MaxBlockSize, 1u, 32u) : 32), - LiteralOptimizations(jit.has_value() ? jit->LiteralOptimizations : false), - BranchOptimizations(jit.has_value() ? jit->BranchOptimizations : false), - FastMemory(jit.has_value() ? jit->FastMemory : false) - {} + ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept; ~ARMJIT() noexcept; void InvalidateByAddr(u32) noexcept; void CheckAndInvalidateWVRAM(int) noexcept; diff --git a/src/ARMJIT_CodeMem.cpp b/src/ARMJIT_CodeMem.cpp new file mode 100644 index 00000000..4d5b344b --- /dev/null +++ b/src/ARMJIT_CodeMem.cpp @@ -0,0 +1,93 @@ +#include "ARMJIT_CodeMem.h" + +#ifdef _WIN32 +#include +#else +#include +#include +#endif + +#include + +#include + +namespace melonDS +{ + +namespace ARMJIT_CodeMem +{ + +std::mutex codeMemoryMutex; + +static constexpr size_t NumCodeMemSlices = 4; + +// aligned to 16 KB so the array is page-aligned as long as pages are no larger than that (I haven't heard of pages larger than 16 KB) +alignas(16*1024) u8 CodeMemory[NumCodeMemSlices * CodeMemorySliceSize]; + +u32 AvailableCodeMemSlices = (1 << NumCodeMemSlices) - 1; + +int RefCounter = 0; + +void* Allocate() +{ + std::lock_guard guard(codeMemoryMutex); + + if (AvailableCodeMemSlices) + { + int slice = __builtin_ctz(AvailableCodeMemSlices); + AvailableCodeMemSlices &= ~(1 << slice); + //printf("allocating slice %d\n", slice); + return &CodeMemory[slice * CodeMemorySliceSize]; + } + + // no static slice is free: allocate a new slice dynamically +#ifdef _WIN32 + // FIXME: no dynamic allocation on Windows yet; this path reaches the end of the function without returning a value +#else + //printf("mmaping...\n"); + return mmap(NULL, CodeMemorySliceSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 
0); +#endif +} + +void Free(void* codeMem) +{ + std::lock_guard guard(codeMemoryMutex); + + for (int i = 0; i < NumCodeMemSlices; i++) + { + if (codeMem == &CodeMemory[CodeMemorySliceSize * i]) + { + //printf("freeing slice\n"); + AvailableCodeMemSlices |= 1 << i; + return; + } + } + +#ifdef _WIN32 + // FIXME: slices that are not part of CodeMemory are not released on Windows yet +#else + munmap(codeMem, CodeMemorySliceSize); +#endif +} + +void Init() +{ + std::lock_guard guard(codeMemoryMutex); + + RefCounter++; + if (RefCounter == 1) + { + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(CodeMemory, sizeof(CodeMemory), PAGE_EXECUTE_READWRITE, &dummy); + #elif defined(__APPLE__) + // Apple always uses dynamic allocation - NOTE(review): Allocate() has no __APPLE__ check and still hands out slices of CodeMemory, whose protection is left unchanged here; confirm + #else + mprotect(CodeMemory, sizeof(CodeMemory), PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + } +} + +} + +} diff --git a/src/ARMJIT_CodeMem.h b/src/ARMJIT_CodeMem.h new file mode 100644 index 00000000..9f0f2f81 --- /dev/null +++ b/src/ARMJIT_CodeMem.h @@ -0,0 +1,43 @@ +/* + Copyright 2016-2024 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. 
+*/ + +#ifndef ARMJIT_CODEMEM_H +#define ARMJIT_CODEMEM_H + +#include "types.h" + +#include + +namespace melonDS +{ + +namespace ARMJIT_CodeMem +{ + +static constexpr size_t CodeMemorySliceSize = 1024*1024*32; /* size of one code memory slice in bytes (32 MiB) */ + +void Init(); /* on the first call makes the static code memory executable (does nothing on Apple) */ + +void* Allocate(); /* returns a free static slice, otherwise mmap()s a new one; NOTE(review): returns nothing on Windows once the static slices are used up */ +void Free(void* codeMem); /* gives back a slice obtained from Allocate() */ + +} + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c32e2b73..bd73ae71 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -176,9 +176,9 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) - CALL((void*)&ARMv5JumpToTrampoline); + ABI_CallFunction(ARMv5JumpToTrampoline); else - CALL((void*)&ARMv4JumpToTrampoline); + ABI_CallFunction(ARMv4JumpToTrampoline); PopRegs(restoreCPSR, true); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ba6c0fb4..619e74d0 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -21,19 +21,13 @@ #include "../ARMJIT.h" #include "../ARMInterpreter.h" #include "../NDS.h" +#include "../ARMJIT_CodeMem.h" #include #include #include "../dolphin/CommonFuncs.h" -#ifdef _WIN32 -#include -#else -#include -#include -#endif - using namespace Gen; using namespace Common; @@ -222,46 +216,19 @@ void Compiler::A_Comp_MSR() MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((void*)&UpdateModeTrampoline); + ABI_CallFunction(UpdateModeTrampoline); PopRegs(true, true); } } } -/* - We'll repurpose this .bss memory - - */ -u8 CodeMemory[1024 * 1024 * 32]; - Compiler::Compiler(melonDS::NDS& nds) : XEmitter(), NDS(nds) { - { - #ifdef _WIN32 - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); + CodeMemBase = static_cast(ARMJIT_CodeMem::Allocate()); /* NOTE(review): the result is used unchecked; the mmap() fallback inside Allocate() can fail - confirm */ + CodeMemSize = ARMJIT_CodeMem::CodeMemorySliceSize; - u64 pageSize = (u64)sysInfo.dwPageSize; - #else - u64 
pageSize = sysconf(_SC_PAGE_SIZE); - #endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; - - #ifdef _WIN32 - DWORD dummy; - VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); - #elif defined(__APPLE__) - pageAligned = (u8*)mmap(NULL, 1024*1024*32, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS ,-1, 0); - #else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); - #endif - - ResetStart = pageAligned; - CodeMemSize = alignedSize; - } + ResetStart = CodeMemBase; Reset(); @@ -475,6 +442,11 @@ Compiler::Compiler(melonDS::NDS& nds) : XEmitter(), NDS(nds) FarSize = (ResetStart + CodeMemSize) - FarStart; } +Compiler::~Compiler() +{ + ARMJIT_CodeMem::Free(CodeMemBase); /* hand the slice taken in the constructor back */ +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -684,7 +656,7 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) if (ConstantCycles) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - JMP((u8*)&ARM_Ret, true); + ABI_TailCall(ARM_Ret); } } @@ -846,7 +818,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (ConstantCycles) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - JMP((u8*)ARM_Ret, true); + ABI_TailCall(ARM_Ret); #ifdef JIT_PROFILING_ENABLED CreateMethod("JIT_Block_%d_%d_%08X", (void*)res, Num, Thumb, instrs[0].Addr); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3965e882..c714a6ba 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -84,6 +84,7 @@ class Compiler : public Gen::XEmitter { public: explicit Compiler(melonDS::NDS& nds); + ~Compiler(); void Reset(); @@ -256,6 +257,7 @@ public: std::unordered_map LoadStorePatches {}; + u8* CodeMemBase; /* start of the slice from ARMJIT_CodeMem::Allocate(); assigned in the constructor */ u8* ResetStart {}; u32 CodeMemSize {}; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp 
b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 219c7271..71cd0770 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -316,24 +316,24 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 8: CALL((void*)&SlowWrite9); break; - case 33: CALL((void*)&SlowWrite9); break; - case 17: CALL((void*)&SlowWrite9); break; - case 9: CALL((void*)&SlowWrite9); break; + case 32: ABI_CallFunction(SlowWrite9); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 8: ABI_CallFunction(&SlowWrite9); break; + case 33: ABI_CallFunction(&SlowWrite9); break; + case 17: ABI_CallFunction(&SlowWrite9); break; + case 9: ABI_CallFunction(&SlowWrite9); break; } } else { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - case 33: CALL((void*)&SlowRead9); break; - case 17: CALL((void*)&SlowRead9); break; - case 9: CALL((void*)&SlowRead9); break; + case 32: ABI_CallFunction(&SlowRead9); break; + case 16: ABI_CallFunction(&SlowRead9); break; + case 8: ABI_CallFunction(&SlowRead9); break; + case 33: ABI_CallFunction(&SlowRead9); break; + case 17: ABI_CallFunction(&SlowRead9); break; + case 9: ABI_CallFunction(&SlowRead9); break; } } } @@ -347,24 +347,24 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - case 33: CALL((void*)&SlowWrite7); break; - case 17: CALL((void*)&SlowWrite7); break; - case 9: CALL((void*)&SlowWrite7); break; + case 32: ABI_CallFunction(&SlowWrite7); break; + case 16: ABI_CallFunction(&SlowWrite7); break; + case 8: ABI_CallFunction(&SlowWrite7); break; + case 33: 
ABI_CallFunction(&SlowWrite7); break; + case 17: ABI_CallFunction(&SlowWrite7); break; + case 9: ABI_CallFunction(&SlowWrite7); break; } } else { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - case 33: CALL((void*)&SlowRead7); break; - case 17: CALL((void*)&SlowRead7); break; - case 9: CALL((void*)&SlowRead7); break; + case 32: ABI_CallFunction(&SlowRead7); break; + case 16: ABI_CallFunction(&SlowRead7); break; + case 8: ABI_CallFunction(&SlowRead7); break; + case 33: ABI_CallFunction(&SlowRead7); break; + case 17: ABI_CallFunction(&SlowRead7); break; + case 9: ABI_CallFunction(&SlowRead7); break; } } } @@ -526,10 +526,10 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc switch (Num * 2 | NDS.ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: ABI_CallFunction(&SlowBlockTransfer9); break; + case 1: ABI_CallFunction(&SlowBlockTransfer9); break; + case 2: ABI_CallFunction(&SlowBlockTransfer7); break; + case 3: ABI_CallFunction(&SlowBlockTransfer7); break; } PopRegs(false, false); @@ -630,10 +630,10 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc switch (Num * 2 | NDS.ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: ABI_CallFunction(&SlowBlockTransfer9); break; + case 1: ABI_CallFunction(&SlowBlockTransfer9); break; + case 2: ABI_CallFunction(&SlowBlockTransfer7); break; + case 3: ABI_CallFunction(&SlowBlockTransfer7); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1f947d11..2296c1c4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -97,6 +97,7 @@ if (ENABLE_JIT) ARMJIT.cpp ARMJIT_Memory.cpp + ARMJIT_CodeMem.cpp dolphin/CommonFuncs.cpp) diff --git a/src/NDS.cpp b/src/NDS.cpp index 1023d3c0..3d88233f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -74,7 +74,7 @@ const s32 kIterationCycleMargin = 8; // // timings for GBA slot and wifi are set up at runtime -NDS* NDS::Current = nullptr; +thread_local NDS* NDS::Current = nullptr; /* one pointer per thread, null until RunFrame() sets it */ NDS::NDS() noexcept : NDS( @@ -130,6 +130,7 @@ NDS::NDS(NDSArgs&& args, int type, void* userdata) noexcept : MainRAM = JIT.Memory.GetMainRAM(); SharedWRAM = JIT.Memory.GetSharedWRAM(); ARM7WRAM = JIT.Memory.GetARM7WRAM(); + } NDS::~NDS() noexcept @@ -892,6 +893,8 @@ void NDS::RunSystemSleep(u64 timestamp) template u32 NDS::RunFrame() { + Current = this; /* record this instance as the one running on the calling thread (Current is thread_local) */ + FrameStartTimestamp = SysTimestamp; GPU.TotalScanlines = 0; diff --git a/src/NDS.h b/src/NDS.h index b2bfb385..8e5de45b 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -536,8 +536,8 @@ public: NDS& operator=(const NDS&) = delete; NDS(NDS&&) = delete; NDS& operator=(NDS&&) = delete; - // The frontend should set and unset this manually after creating and destroying the NDS object. 
- [[deprecated("Temporary workaround until JIT code generation is revised to accommodate multiple NDS objects.")]] static NDS* Current; + + static thread_local NDS* Current; /* per-thread pointer to an NDS instance; no longer set by the frontend */ protected: explicit NDS(NDSArgs&& args, int type, void* userdata) noexcept; virtual void DoSavestateExtra(Savestate* file) {} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h index 36603218..d83a41f8 100644 --- a/src/dolphin/x64Emitter.h +++ b/src/dolphin/x64Emitter.h @@ -1019,6 +1019,28 @@ public: CALL(ptr); } } + template + void ABI_TailCall(FunctionPointer func) /* emits a jump (not a call) to func: a direct JMP when the target is in range, otherwise an indirect jump through RAX */ + { + static_assert(std::is_pointer() && + std::is_function>(), + "Supplied type must be a function pointer."); + + const u8* ptr = reinterpret_cast(func); + const u64 address = reinterpret_cast(ptr); + const u64 distance = address - (reinterpret_cast(code) + 5); /* +5: presumably the length of the direct JMP emitted below - confirm */ + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far jump: the distance does not fit a signed 32-bit displacement, so load the address into RAX and jump through it + MOV(64, R(RAX), Imm64(address)); + JMPptr(R(RAX)); + } + else + { + JMP(ptr, true); + } + } template void ABI_CallFunctionC16(FunctionPointer func, u16 param1) diff --git a/src/frontend/qt_sdl/EmuInstance.cpp b/src/frontend/qt_sdl/EmuInstance.cpp index 90116692..d79b6a13 100644 --- a/src/frontend/qt_sdl/EmuInstance.cpp +++ b/src/frontend/qt_sdl/EmuInstance.cpp @@ -165,7 +165,6 @@ EmuInstance::~EmuInstance() audioDeInit(); inputDeInit(); - NDS::Current = nullptr; if (nds) { saveRTCData(); @@ -1342,7 +1341,6 @@ bool EmuInstance::updateConsole(UpdateConsoleNDSArgs&& _ndsargs, UpdateConsoleGB if ((!nds) || (consoleType != nds->ConsoleType)) { - NDS::Current = nullptr; if (nds) { saveRTCData(); @@ -1354,7 +1352,6 @@ bool EmuInstance::updateConsole(UpdateConsoleNDSArgs&& _ndsargs, UpdateConsoleGB else nds = new NDS(std::move(ndsargs), this); - NDS::Current = nds; nds->Reset(); loadRTCData(); //emuThread->updateVideoRenderer(); // not actually needed?