From 99ce959913bd678781d066dc78a9e30e1daf5f05 Mon Sep 17 00:00:00 2001 From: Kemal Afzal Date: Mon, 18 Nov 2024 20:43:05 +0100 Subject: [PATCH] Multiinstance jit (#2201) * works on Linux x64 still needs to be fixed for everything else * use lots of PROT_NONE memory to reliably reserve virtual address space * multi instance fastmem on Linux * Windows * blarg * disable fastmem if the page size is not 4kb * fix fast mem dialog option * make aarch64 work as well * fastmem 16kb pages support --- src/ARMJIT.cpp | 40 ++- src/ARMJIT.h | 11 +- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 45 +-- src/ARMJIT_A64/ARMJIT_Compiler.h | 1 + src/ARMJIT_Global.cpp | 118 +++++++ src/ARMJIT_Global.h | 44 +++ src/ARMJIT_Internal.h | 4 +- src/ARMJIT_Memory.cpp | 361 ++++++++++++++++------ src/ARMJIT_Memory.h | 44 ++- src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 54 +--- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 64 ++-- src/CMakeLists.txt | 5 + src/NDS.cpp | 5 +- src/NDS.h | 4 +- src/dolphin/x64Emitter.h | 22 ++ src/frontend/qt_sdl/EmuInstance.cpp | 3 - src/frontend/qt_sdl/EmuSettingsDialog.cpp | 7 +- 19 files changed, 573 insertions(+), 265 deletions(-) create mode 100644 src/ARMJIT_Global.cpp create mode 100644 src/ARMJIT_Global.h diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 1ebcce8e..9582f7c8 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -30,6 +30,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Memory.h" #include "ARMJIT_Compiler.h" +#include "ARMJIT_Global.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -467,6 +468,16 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F +ARMJIT::ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept : + NDS(nds), + Memory(nds), + JITCompiler(nds), + MaxBlockSize(jit.has_value() ? std::clamp(jit->MaxBlockSize, 1u, 32u) : 32), + LiteralOptimizations(jit.has_value() ? jit->LiteralOptimizations : false), + BranchOptimizations(jit.has_value() ? jit->BranchOptimizations : false), + FastMemory((jit.has_value() ? 
jit->FastMemory : false) && ARMJIT_Memory::IsFastMemSupported()) +{} + void ARMJIT::RetireJitBlock(JitBlock* block) noexcept { auto it = RestoreCandidates.find(block->InstrHash); @@ -483,6 +494,7 @@ void ARMJIT::RetireJitBlock(JitBlock* block) noexcept void ARMJIT::SetJITArgs(JITArgs args) noexcept { + args.FastMemory = args.FastMemory && ARMJIT_Memory::IsFastMemSupported(); args.MaxBlockSize = std::clamp(args.MaxBlockSize, 1u, 32u); if (MaxBlockSize != args.MaxBlockSize @@ -499,36 +511,22 @@ void ARMJIT::SetJITArgs(JITArgs args) noexcept void ARMJIT::SetMaxBlockSize(int size) noexcept { - size = std::clamp(size, 1, 32); - - if (size != MaxBlockSize) - ResetBlockCache(); - - MaxBlockSize = size; + SetJITArgs(JITArgs{static_cast(size), LiteralOptimizations, LiteralOptimizations, FastMemory}); } void ARMJIT::SetLiteralOptimizations(bool enabled) noexcept { - if (LiteralOptimizations != enabled) - ResetBlockCache(); - - LiteralOptimizations = enabled; + SetJITArgs(JITArgs{static_cast(MaxBlockSize), enabled, BranchOptimizations, FastMemory}); } void ARMJIT::SetBranchOptimizations(bool enabled) noexcept { - if (BranchOptimizations != enabled) - ResetBlockCache(); - - BranchOptimizations = enabled; + SetJITArgs(JITArgs{static_cast(MaxBlockSize), LiteralOptimizations, enabled, FastMemory}); } void ARMJIT::SetFastMemory(bool enabled) noexcept { - if (FastMemory != enabled) - ResetBlockCache(); - - FastMemory = enabled; + SetJITArgs(JITArgs{static_cast(MaxBlockSize), LiteralOptimizations, BranchOptimizations, enabled}); } void ARMJIT::CompileBlock(ARM* cpu) noexcept @@ -918,7 +916,7 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; - if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000 & ~(Memory.PageSize - 1)) / 512], Memory.PageSize)) Memory.SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; @@ -971,7 +969,7 @@ void ARMJIT::InvalidateByAddr(u32 localAddr) noexcept range->Blocks.Remove(i); if (range->Blocks.Length == 0 - && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) + && !PageContainsCode(®ion[(localAddr & 0x7FFF000 & ~(Memory.PageSize - 1)) / 512], Memory.PageSize)) { Memory.SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); } @@ -1005,7 +1003,7 @@ void ARMJIT::InvalidateByAddr(u32 localAddr) noexcept if (otherRange->Blocks.Length == 0) { - if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000 & ~(Memory.PageSize - 1)) / 512], Memory.PageSize)) Memory.SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); otherRange->Code = 0; diff --git a/src/ARMJIT.h b/src/ARMJIT.h index a228a4dd..309aa8e8 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -44,15 +44,7 @@ class JitBlock; class ARMJIT { public: - ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept : - NDS(nds), - Memory(nds), - JITCompiler(nds), - MaxBlockSize(jit.has_value() ? std::clamp(jit->MaxBlockSize, 1u, 32u) : 32), - LiteralOptimizations(jit.has_value() ? jit->LiteralOptimizations : false), - BranchOptimizations(jit.has_value() ? jit->BranchOptimizations : false), - FastMemory(jit.has_value() ? 
jit->FastMemory : false) - {} + ARMJIT(melonDS::NDS& nds, std::optional jit) noexcept; ~ARMJIT() noexcept; void InvalidateByAddr(u32) noexcept; void CheckAndInvalidateWVRAM(int) noexcept; @@ -80,6 +72,7 @@ private: bool LiteralOptimizations = false; bool BranchOptimizations = false; bool FastMemory = false; + public: melonDS::NDS& NDS; TinyVector InvalidLiterals {}; diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index f05de448..2cb0bf3b 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -22,17 +22,7 @@ #include "../ARMInterpreter.h" #include "../ARMJIT.h" #include "../NDS.h" - -#if defined(__SWITCH__) -#include - -extern char __start__; -#elif defined(_WIN32) -#include -#else -#include -#include -#endif +#include "../ARMJIT_Global.h" #include @@ -66,11 +56,6 @@ const int RegisterCache::NativeRegsAvailable = 15; const BitSet32 CallerSavedPushRegs({W8, W9, W10, W11, W12, W13, W14, W15}); -const int JitMemSize = 16 * 1024 * 1024; -#ifndef __SWITCH__ -u8 JitMem[JitMemSize]; -#endif - void Compiler::MovePC() { ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); @@ -260,29 +245,12 @@ Compiler::Compiler(melonDS::NDS& nds) : Arm64Gen::ARM64XEmitter(), NDS(nds) SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); JitMemMainSize = JitMemSize; #else - #ifdef _WIN32 - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); + ARMJIT_Global::Init(); - u64 pageSize = (u64)sysInfo.dwPageSize; - #else - u64 pageSize = sysconf(_SC_PAGE_SIZE); - #endif - u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); - u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; + CodeMemBase = ARMJIT_Global::AllocateCodeMem(); - #if defined(_WIN32) - DWORD dummy; - VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); - #elif defined(__APPLE__) - pageAligned = (u8*)mmap(NULL, 1024*1024*16, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT,-1, 0); - nds.JIT.JitEnableWrite(); - #else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); - #endif - - SetCodeBase(pageAligned, pageAligned); - JitMemMainSize = alignedSize; + SetCodeBase(reinterpret_cast(CodeMemBase), reinterpret_cast(CodeMemBase)); + JitMemMainSize = ARMJIT_Global::CodeMemorySliceSize; #endif SetCodePtr(0); @@ -493,6 +461,9 @@ Compiler::~Compiler() free(JitRWBase); } #endif + + ARMJIT_Global::FreeCodeMem(CodeMemBase); + ARMJIT_Global::DeInit(); } void Compiler::LoadCycles() diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index a7b567f6..44886b13 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -275,6 +275,7 @@ public: void* JitRWStart; void* JitRXStart; #endif + void* CodeMemBase; void* ReadBanked, *WriteBanked; diff --git a/src/ARMJIT_Global.cpp b/src/ARMJIT_Global.cpp new file mode 100644 index 00000000..b6510379 --- /dev/null +++ b/src/ARMJIT_Global.cpp @@ -0,0 +1,118 @@ +#include "ARMJIT_Global.h" +#include "ARMJIT_Memory.h" + +#ifdef _WIN32 +#include +#else +#include +#include +#endif + +#include +#include + +#include + +namespace melonDS +{ + +namespace ARMJIT_Global +{ + +std::mutex globalMutex; + +#ifndef __APPLE__ +static constexpr size_t NumCodeMemSlices = 4; +static constexpr size_t CodeMemoryAlignedSize = NumCodeMemSlices * CodeMemorySliceSize; + +// I haven't heard of pages larger than 16 KB +u8 CodeMemory[CodeMemoryAlignedSize + 16*1024]; + +u32 AvailableCodeMemSlices = (1 << NumCodeMemSlices) - 1; + +u8* 
GetAlignedCodeMemoryStart() +{ + return reinterpret_cast((reinterpret_cast(CodeMemory) + (16*1024-1)) & ~static_cast(16*1024-1)); +} +#endif + +int RefCounter = 0; + +void* AllocateCodeMem() +{ + std::lock_guard guard(globalMutex); + +#ifndef __APPLE__ + if (AvailableCodeMemSlices) + { + int slice = __builtin_ctz(AvailableCodeMemSlices); + AvailableCodeMemSlices &= ~(1 << slice); + //printf("allocating slice %d\n", slice); + return &GetAlignedCodeMemoryStart()[slice * CodeMemorySliceSize]; + } +#endif + + // allocate +#ifdef _WIN32 + return VirtualAlloc(nullptr, CodeMemorySliceSize, MEM_RESERVE|MEM_COMMIT, PAGE_EXECUTE_READWRITE); +#else + //printf("mmaping...\n"); + return mmap(nullptr, CodeMemorySliceSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#endif +} + +void FreeCodeMem(void* codeMem) +{ + std::lock_guard guard(globalMutex); + + for (int i = 0; i < NumCodeMemSlices; i++) + { + if (codeMem == &GetAlignedCodeMemoryStart()[CodeMemorySliceSize * i]) + { + //printf("freeing slice\n"); + AvailableCodeMemSlices |= 1 << i; + return; + } + } + +#ifdef _WIN32 + VirtualFree(codeMem, CodeMemorySliceSize, MEM_RELEASE|MEM_DECOMMIT); +#else + munmap(codeMem, CodeMemorySliceSize); +#endif +} + +void Init() +{ + std::lock_guard guard(globalMutex); + + RefCounter++; + if (RefCounter == 1) + { + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(GetAlignedCodeMemoryStart(), CodeMemoryAlignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #elif defined(__APPLE__) + // Apple always uses dynamic allocation + #else + mprotect(GetAlignedCodeMemoryStart(), CodeMemoryAlignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + ARMJIT_Memory::RegisterFaultHandler(); + } +} + +void DeInit() +{ + std::lock_guard guard(globalMutex); + + RefCounter--; + if (RefCounter == 0) + { + ARMJIT_Memory::UnregisterFaultHandler(); + } +} + +} + +} diff --git a/src/ARMJIT_Global.h b/src/ARMJIT_Global.h new file mode 100644 index 00000000..299d71a6 --- /dev/null +++ b/src/ARMJIT_Global.h @@ -0,0 +1,44 @@ +/* + Copyright 2016-2024 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. 
+*/ + +#ifndef ARMJIT_GLOBAL_H +#define ARMJIT_GLOBAL_H + +#include "types.h" + +#include + +namespace melonDS +{ + +namespace ARMJIT_Global +{ + +static constexpr size_t CodeMemorySliceSize = 1024*1024*32; + +void Init(); +void DeInit(); + +void* AllocateCodeMem(); +void FreeCodeMem(void* codeMem); + +} + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 5b393903..12591d30 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -85,9 +85,9 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -inline bool PageContainsCode(const AddressRange* range) +inline bool PageContainsCode(const AddressRange* range, u32 pageSize) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < pageSize / 512; i++) { if (range[i].Blocks.Length > 0) return true; diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index 51e022d1..ae8391bb 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -39,6 +39,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Compiler.h" +#include "ARMJIT_Global.h" #include "DSi.h" #include "GPU.h" @@ -100,6 +101,9 @@ namespace melonDS { +static constexpr u64 AddrSpaceSize = 0x100000000; +static constexpr u64 VirtmemAreaSize = AddrSpaceSize * 2 + MemoryTotalSize; + using Platform::Log; using Platform::LogLevel; @@ -152,6 +156,15 @@ void __libnx_exception_handler(ThreadExceptionDump* ctx) #elif defined(_WIN32) +static LPVOID ExceptionHandlerHandle = nullptr; +static HMODULE KernelBaseDll = nullptr; + +using VirtualAlloc2Type = PVOID WINAPI (*)(HANDLE Process, PVOID BaseAddress, SIZE_T Size, ULONG AllocationType, ULONG PageProtection, MEM_EXTENDED_PARAMETER* ExtendedParameters, ULONG ParameterCount); +using MapViewOfFile3Type = PVOID WINAPI (*)(HANDLE FileMapping, HANDLE Process, PVOID BaseAddress, ULONG64 Offset, SIZE_T ViewSize, ULONG AllocationType, ULONG PageProtection, MEM_EXTENDED_PARAMETER* ExtendedParameters, ULONG ParameterCount); + +static VirtualAlloc2Type virtualAlloc2Ptr; +static MapViewOfFile3Type mapViewOfFile3Ptr; + LONG ARMJIT_Memory::ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) { if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) @@ -170,6 +183,7 @@ LONG ARMJIT_Memory::ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) return EXCEPTION_CONTINUE_EXECUTION; } + Log(LogLevel::Debug, "it all returns to nothing\n"); return EXCEPTION_CONTINUE_SEARCH; } @@ -261,18 +275,61 @@ enum memstate_MappedProtected, }; - +#define CHECK_ALIGNED(value) assert(((value) & (PageSize-1)) == 0) bool ARMJIT_Memory::MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) noexcept { + CHECK_ALIGNED(addr); + CHECK_ALIGNED(offset); + CHECK_ALIGNED(size); + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; #ifdef __SWITCH__ Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size)); return R_SUCCEEDED(r); #elif defined(_WIN32) - bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; - return r; + uintptr_t uintptrDst = reinterpret_cast(dst); + for (auto it = VirtmemPlaceholders.begin(); it != VirtmemPlaceholders.end(); it++) + { + if (uintptrDst >= it->Start && uintptrDst+size <= it->Start+it->Size) + { + //Log(LogLevel::Debug, "found mapping %llx %llx %llx %llx\n", uintptrDst, size, it->Start, it->Size); + // we split this place holder so that we have a fitting place holder for the mapping + if (uintptrDst != it->Start || size != it->Size) + { + if (!VirtualFree(dst, size, MEM_RELEASE|MEM_PRESERVE_PLACEHOLDER)) + { + Log(LogLevel::Debug, "VirtualFree failed with %x\n", GetLastError()); + return false; + } + } + + VirtmemPlaceholder splitPlaceholder = *it; + VirtmemPlaceholders.erase(it); + if (uintptrDst > splitPlaceholder.Start) + { + //Log(LogLevel::Debug, "splitting on the left %llx\n", uintptrDst - splitPlaceholder.Start); + VirtmemPlaceholders.push_back({splitPlaceholder.Start, uintptrDst - splitPlaceholder.Start}); + } + if (uintptrDst+size < splitPlaceholder.Start+splitPlaceholder.Size) + { + //Log(LogLevel::Debug, "splitting on the right %llx\n", (splitPlaceholder.Start+splitPlaceholder.Size)-(uintptrDst+size)); + VirtmemPlaceholders.push_back({uintptrDst+size, (splitPlaceholder.Start+splitPlaceholder.Size)-(uintptrDst+size)}); + } + + if (!mapViewOfFile3Ptr(MemoryFile, nullptr, dst, offset, size, MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, nullptr, 0)) + { + Log(LogLevel::Debug, "MapViewOfFile3 failed with %x\n", GetLastError()); + return false; + } + + return true; + } + } + + Log(LogLevel::Debug, "no mapping at all found??? %p %x %p\n", dst, size, MemoryBase); + return false; #else return mmap(dst, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, MemoryFile, offset) != MAP_FAILED; #endif @@ -280,21 +337,68 @@ bool ARMJIT_Memory::MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) noexce bool ARMJIT_Memory::UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) noexcept { + CHECK_ALIGNED(addr); + CHECK_ALIGNED(offset); + CHECK_ALIGNED(size); + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; #ifdef __SWITCH__ Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size); return R_SUCCEEDED(r); #elif defined(_WIN32) - return UnmapViewOfFile(dst); + if (!UnmapViewOfFileEx(dst, MEM_PRESERVE_PLACEHOLDER)) + { + Log(LogLevel::Debug, "UnmapViewOfFileEx failed %x\n", GetLastError()); + return false; + } + + uintptr_t uintptrDst = reinterpret_cast(dst); + uintptr_t coalesceStart = uintptrDst; + size_t coalesceSize = size; + + for (auto it = VirtmemPlaceholders.begin(); it != VirtmemPlaceholders.end();) + { + if (it->Start+it->Size == uintptrDst) + { + //Log(LogLevel::Debug, "Coalescing to the left\n"); + coalesceStart = it->Start; + coalesceSize += it->Size; + it = VirtmemPlaceholders.erase(it); + } + else if (it->Start == uintptrDst+size) + { + //Log(LogLevel::Debug, "Coalescing to the right\n"); + coalesceSize += it->Size; + it = VirtmemPlaceholders.erase(it); + } + else + { + it++; + } + } + + if (coalesceStart != uintptrDst || coalesceSize != size) + { + if (!VirtualFree(reinterpret_cast(coalesceStart), coalesceSize, MEM_RELEASE|MEM_COALESCE_PLACEHOLDERS)) + return false; + + } + VirtmemPlaceholders.push_back({coalesceStart, coalesceSize}); + //Log(LogLevel::Debug, "Adding coalesced region %llx %llx", coalesceStart, coalesceSize); + + return true; #else - return munmap(dst, size) == 0; + return mmap(dst, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0) != MAP_FAILED; #endif } #ifndef __SWITCH__ void ARMJIT_Memory::SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) noexcept { + CHECK_ALIGNED(addr); + CHECK_ALIGNED(size); + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; #if defined(_WIN32) DWORD winProtection, oldProtection; @@ -305,6 +409,10 @@ void ARMJIT_Memory::SetCodeProtectionRange(u32 addr, u32 size, u32 num, int prot else winProtection = PAGE_READWRITE; bool success = VirtualProtect(dst, size, winProtection, &oldProtection); + if (!success) + { + Log(LogLevel::Debug, "VirtualProtect failed with %x\n", GetLastError()); + } assert(success); #else int posixProt; @@ -335,14 +443,14 @@ void ARMJIT_Memory::Mapping::Unmap(int region, melonDS::NDS& nds) noexcept else { u32 segmentOffset = offset; - u8 status = statuses[(Addr + offset) >> 12]; - while (statuses[(Addr + offset) >> 12] == status + u8 status = statuses[(Addr + offset) >> PageShift]; + while (statuses[(Addr + offset) >> PageShift] == status && offset < Size && (!skipDTCM || Addr + offset != dtcmStart)) { - assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); - statuses[(Addr + offset) >> 12] = memstate_Unmapped; - offset += 0x1000; + assert(statuses[(Addr + offset) >> PageShift] != memstate_Unmapped); + statuses[(Addr + offset) >> PageShift] = memstate_Unmapped; + offset += PageSize; } #ifdef __SWITCH__ @@ -358,7 +466,6 @@ void ARMJIT_Memory::Mapping::Unmap(int region, melonDS::NDS& nds) noexcept } #ifndef __SWITCH__ -#ifndef _WIN32 u32 dtcmEnd = dtcmStart + dtcmSize; if (Num == 0 && dtcmEnd >= Addr @@ -378,7 +485,6 @@ void ARMJIT_Memory::Mapping::Unmap(int region, melonDS::NDS& nds) noexcept } } else -#endif { bool succeded = nds.JIT.Memory.UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); assert(succeded); @@ -388,7 +494,7 @@ void ARMJIT_Memory::Mapping::Unmap(int region, melonDS::NDS& nds) noexcept void ARMJIT_Memory::SetCodeProtection(int region, u32 offset, bool protect) noexcept { - offset &= ~0xFFF; + offset &= ~(PageSize - 1); //printf("set code 
protection %d %x %d\n", region, offset, protect); for (int i = 0; i < Mappings[region].Length; i++) @@ -406,9 +512,9 @@ void ARMJIT_Memory::SetCodeProtection(int region, u32 offset, bool protect) noex u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); - //printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); - assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); - states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; + //printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> PageShift]); + assert(states[effectiveAddr >> PageShift] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> PageShift] = protect ? memstate_MappedProtected : memstate_MappedRW; #if defined(__SWITCH__) bool success; @@ -418,7 +524,7 @@ void ARMJIT_Memory::SetCodeProtection(int region, u32 offset, bool protect) noex success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); assert(success); #else - SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 1 : 2); + SetCodeProtectionRange(effectiveAddr, PageSize, mapping.Num, protect ? 1 : 2); #endif } } @@ -543,11 +649,19 @@ bool ARMJIT_Memory::MapAtAddress(u32 addr) noexcept u32 dtcmSize = ~NDS.ARM9.DTCMMask + 1; u32 dtcmEnd = dtcmStart + dtcmSize; #ifndef __SWITCH__ -#ifndef _WIN32 if (num == 0 && dtcmEnd >= mirrorStart && dtcmStart < mirrorStart + mirrorSize) { + if (dtcmSize < PageSize) + { + // we could technically mask out the DTCM by setting a hole to access permissions + // but realistically there isn't much of a point in mapping less than 16kb of DTCM + // so it isn't worth more complex support + Log(LogLevel::Info, "DTCM size smaller than 16kb skipping mapping entirely"); + return false; + } + bool success; if (dtcmStart > mirrorStart) { @@ -562,7 +676,6 @@ bool ARMJIT_Memory::MapAtAddress(u32 addr) noexcept } } else -#endif { bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); assert(succeded); @@ -579,22 +692,19 @@ bool ARMJIT_Memory::MapAtAddress(u32 addr) noexcept { if (skipDTCM && mirrorStart + offset == dtcmStart) { -#ifdef _WIN32 - SetCodeProtectionRange(dtcmStart, dtcmSize, 0, 0); -#endif offset += dtcmSize; } else { u32 sectionOffset = offset; - bool hasCode = isExecutable && PageContainsCode(&range[offset / 512]); + bool hasCode = isExecutable && PageContainsCode(&range[offset / 512], PageSize); while (offset < mirrorSize - && (!isExecutable || PageContainsCode(&range[offset / 512]) == hasCode) + && (!isExecutable || PageContainsCode(&range[offset / 512], PageSize) == hasCode) && (!skipDTCM || mirrorStart + offset != NDS.ARM9.DTCMBase)) { - assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); - states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; - offset += 0x1000; + assert(states[(mirrorStart + offset) >> PageShift] == memstate_Unmapped); + states[(mirrorStart + offset) >> PageShift] = hasCode ? 
memstate_MappedProtected : memstate_MappedRW; + offset += PageSize; } u32 sectionSize = offset - sectionOffset; @@ -624,6 +734,86 @@ bool ARMJIT_Memory::MapAtAddress(u32 addr) noexcept return true; } +u32 ARMJIT_Memory::PageSize = 0; +u32 ARMJIT_Memory::PageShift = 0; + +bool ARMJIT_Memory::IsFastMemSupported() +{ +#ifdef __APPLE__ + return false; +#else + static bool initialised = false; + static bool isSupported = false; + if (!initialised) + { +#ifdef _WIN32 + ARMJIT_Global::Init(); + isSupported = virtualAlloc2Ptr != nullptr; + ARMJIT_Global::DeInit(); + + PageSize = RegularPageSize; +#else + PageSize = __sysconf(_SC_PAGESIZE); + isSupported = PageShift == RegularPageSize || PageSize == LargePageSize; +#endif + PageShift = __builtin_ctz(PageSize); + initialised = true; + } + return isSupported; +#endif +} + +void ARMJIT_Memory::RegisterFaultHandler() +{ +#ifdef _WIN32 + ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + + KernelBaseDll = LoadLibrary("KernelBase.dll"); + if (KernelBaseDll) + { + virtualAlloc2Ptr = reinterpret_cast(GetProcAddress(KernelBaseDll, "VirtualAlloc2")); + mapViewOfFile3Ptr = reinterpret_cast(GetProcAddress(KernelBaseDll, "MapViewOfFile3")); + } + + if (!virtualAlloc2Ptr) + { + Log(LogLevel::Error, "Could not load new Windows virtual memory functions, fast memory is disabled.\n"); + } +#else + struct sigaction sa; + sa.sa_handler = nullptr; + sa.sa_sigaction = &SigsegvHandler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sigaction(SIGSEGV, &sa, &OldSaSegv); +#ifdef __APPLE__ + sigaction(SIGBUS, &sa, &OldSaBus); +#endif +#endif +} + +void ARMJIT_Memory::UnregisterFaultHandler() +{ +#ifdef _WIN32 + if (ExceptionHandlerHandle) + { + RemoveVectoredExceptionHandler(ExceptionHandlerHandle); + ExceptionHandlerHandle = nullptr; + } + + if (KernelBaseDll) + { + FreeLibrary(KernelBaseDll); + KernelBaseDll = nullptr; + } +#else + sigaction(SIGSEGV, &OldSaSegv, nullptr); +#ifdef __APPLE__ + sigaction(SIGBUS, &OldSaBus, nullptr); +#endif +#endif +} + bool ARMJIT_Memory::FaultHandler(FaultDescription& faultDesc, melonDS::NDS& nds) { if (nds.JIT.JITCompiler.IsJITFault(faultDesc.FaultPC)) @@ -632,7 +822,7 @@ bool ARMJIT_Memory::FaultHandler(FaultDescription& faultDesc, melonDS::NDS& nds) u8* memStatus = nds.CurCPU == 0 ? 
nds.JIT.Memory.MappingStatus9 : nds.JIT.Memory.MappingStatus7; - if (memStatus[faultDesc.EmulatedFaultAddr >> 12] == memstate_Unmapped) + if (memStatus[faultDesc.EmulatedFaultAddr >> PageShift] == memstate_Unmapped) rewriteToSlowPath = !nds.JIT.Memory.MapAtAddress(faultDesc.EmulatedFaultAddr); if (rewriteToSlowPath) @@ -643,10 +833,9 @@ bool ARMJIT_Memory::FaultHandler(FaultDescription& faultDesc, melonDS::NDS& nds) return false; } -const u64 AddrSpaceSize = 0x100000000; - ARMJIT_Memory::ARMJIT_Memory(melonDS::NDS& nds) : NDS(nds) { + ARMJIT_Global::Init(); #if defined(__SWITCH__) MemoryBase = (u8*)aligned_alloc(0x1000, MemoryTotalSize); virtmemLock(); @@ -671,33 +860,27 @@ ARMJIT_Memory::ARMJIT_Memory(melonDS::NDS& nds) : NDS(nds) u8* basePtr = MemoryBaseCodeMem; #elif defined(_WIN32) - ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + if (virtualAlloc2Ptr) + { + MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, 0, MemoryTotalSize, nullptr); - MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL); + MemoryBase = reinterpret_cast(virtualAlloc2Ptr(nullptr, nullptr, VirtmemAreaSize, + MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, + PAGE_NOACCESS, + nullptr, 0)); + // split off placeholder and map base mapping + VirtualFree(MemoryBase, MemoryTotalSize, MEM_RELEASE|MEM_PRESERVE_PLACEHOLDER); + mapViewOfFile3Ptr(MemoryFile, nullptr, MemoryBase, 0, MemoryTotalSize, MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, nullptr, 0); - MemoryBase = (u8*)VirtualAlloc(NULL, AddrSpaceSize*4, MEM_RESERVE, PAGE_READWRITE); - VirtualFree(MemoryBase, 0, MEM_RELEASE); - // this is incredible hacky - // but someone else is trying to go into our address space! - // Windows will very likely give them virtual memory starting at the same address - // as it is giving us now. - // That's why we don't use this address, but instead 4gb inwards - // I know this is terrible - FastMem9Start = MemoryBase + AddrSpaceSize; - FastMem7Start = MemoryBase + AddrSpaceSize*2; - MemoryBase = MemoryBase + AddrSpaceSize*3; - - MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase); + VirtmemPlaceholders.push_back({reinterpret_cast(MemoryBase)+MemoryTotalSize, AddrSpaceSize*2}); + } + else + { + // old Windows version + MemoryBase = new u8[MemoryTotalSize]; + } #else - // this used to be allocated with three different mmaps - // The idea was to give the OS more freedom where to position the buffers, - // but something was bad about this so instead we take this vmem eating monster - // which seems to work better. - MemoryBase = (u8*)mmap(NULL, AddrSpaceSize*4, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); - munmap(MemoryBase, AddrSpaceSize*4); - FastMem9Start = MemoryBase; - FastMem7Start = MemoryBase + AddrSpaceSize; - MemoryBase = MemoryBase + AddrSpaceSize*2; + MemoryBase = (u8*)mmap(nullptr, VirtmemAreaSize, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); #if defined(__ANDROID__) Libandroid = Platform::DynamicLibrary_Load("libandroid.so"); @@ -730,20 +913,10 @@ ARMJIT_Memory::ARMJIT_Memory(melonDS::NDS& nds) : NDS(nds) Log(LogLevel::Error, "Failed to allocate memory using ftruncate! 
(%s)", strerror(errno)); } - struct sigaction sa; - sa.sa_handler = nullptr; - sa.sa_sigaction = &SigsegvHandler; - sa.sa_flags = SA_SIGINFO; - sigemptyset(&sa.sa_mask); - sigaction(SIGSEGV, &sa, &OldSaSegv); -#ifdef __APPLE__ - sigaction(SIGBUS, &sa, &OldSaBus); -#endif - mmap(MemoryBase, MemoryTotalSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, MemoryFile, 0); - - u8* basePtr = MemoryBase; #endif + FastMem9Start = MemoryBase+MemoryTotalSize; + FastMem7Start = static_cast(FastMem9Start)+AddrSpaceSize; } ARMJIT_Memory::~ARMJIT_Memory() noexcept @@ -764,34 +937,37 @@ ARMJIT_Memory::~ARMJIT_Memory() noexcept free(MemoryBase); MemoryBase = nullptr; #elif defined(_WIN32) - if (MemoryBase) + if (virtualAlloc2Ptr) { - bool viewUnmapped = UnmapViewOfFile(MemoryBase); - assert(viewUnmapped); - MemoryBase = nullptr; - FastMem9Start = nullptr; - FastMem7Start = nullptr; - } + if (MemoryBase) + { + bool viewUnmapped = UnmapViewOfFileEx(MemoryBase, MEM_PRESERVE_PLACEHOLDER); + assert(viewUnmapped); + bool viewCoalesced = VirtualFree(MemoryBase, VirtmemAreaSize, MEM_RELEASE|MEM_COALESCE_PLACEHOLDERS); + assert(viewCoalesced); + bool freeEverything = VirtualFree(MemoryBase, 0, MEM_RELEASE); + assert(freeEverything); - if (MemoryFile) - { - CloseHandle(MemoryFile); - MemoryFile = INVALID_HANDLE_VALUE; - } + MemoryBase = nullptr; + FastMem9Start = nullptr; + FastMem7Start = nullptr; + printf("unmappinged everything\n"); + } - if (ExceptionHandlerHandle) + if (MemoryFile) + { + CloseHandle(MemoryFile); + MemoryFile = INVALID_HANDLE_VALUE; + } + } + else { - RemoveVectoredExceptionHandler(ExceptionHandlerHandle); - ExceptionHandlerHandle = nullptr; + delete[] MemoryBase; } #else - sigaction(SIGSEGV, &OldSaSegv, nullptr); -#ifdef __APPLE__ - sigaction(SIGBUS, &OldSaBus, nullptr); -#endif if (MemoryBase) { - munmap(MemoryBase, MemoryTotalSize); + munmap(MemoryBase, VirtmemAreaSize); MemoryBase = nullptr; FastMem9Start = nullptr; FastMem7Start = nullptr; @@ -812,6 +988,8 @@ ARMJIT_Memory::~ARMJIT_Memory() noexcept #endif #endif + + ARMJIT_Global::DeInit(); } void ARMJIT_Memory::Reset() noexcept @@ -834,17 +1012,6 @@ void ARMJIT_Memory::Reset() noexcept bool ARMJIT_Memory::IsFastmemCompatible(int region) const noexcept { -#ifdef _WIN32 - /* - TODO: with some hacks, the smaller shared WRAM regions - could be mapped in some occaisons as well - */ - if (region == memregion_DTCM - || region == memregion_SharedWRAM - || region == memregion_NewSharedWRAM_B - || region == memregion_NewSharedWRAM_C) - return false; -#endif return OffsetsPerRegion[region] != UINT32_MAX; } diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h index 88e647d5..cac9dc62 100644 --- a/src/ARMJIT_Memory.h +++ b/src/ARMJIT_Memory.h @@ -23,6 +23,7 @@ #include "MemConstants.h" #ifdef JIT_ENABLED +# include # include "TinyVector.h" # include "ARM.h" # if defined(__SWITCH__) @@ -48,23 +49,22 @@ class Compiler; class ARMJIT; #endif +static constexpr u32 LargePageSize = 0x4000; +static constexpr u32 RegularPageSize = 0x1000; + constexpr u32 RoundUp(u32 size) noexcept { -#ifdef _WIN32 - return (size + 0xFFFF) & ~0xFFFF; -#else - return size; -#endif + return (size + LargePageSize - 1) & ~(LargePageSize - 1); } -const u32 MemBlockMainRAMOffset = 0; -const u32 MemBlockSWRAMOffset = RoundUp(MainRAMMaxSize); -const u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(SharedWRAMSize); -const u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(ARM7WRAMSize); -const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + 
RoundUp(DTCMPhysicalSize); -const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(NWRAMSize); -const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(NWRAMSize); -const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(NWRAMSize); +static constexpr u32 MemBlockMainRAMOffset = 0; +static constexpr u32 MemBlockSWRAMOffset = RoundUp(MainRAMMaxSize); +static constexpr u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(SharedWRAMSize); +static constexpr u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(ARM7WRAMSize); +static constexpr u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +static constexpr u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(NWRAMSize); +static constexpr u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(NWRAMSize); +static constexpr u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(NWRAMSize); class ARMJIT_Memory { @@ -137,6 +137,14 @@ public: bool IsFastmemCompatible(int region) const noexcept; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) const noexcept; bool MapAtAddress(u32 addr) noexcept; + + static bool IsFastMemSupported(); + + static void RegisterFaultHandler(); + static void UnregisterFaultHandler(); + + static u32 PageSize; + static u32 PageShift; private: friend class Compiler; struct Mapping @@ -162,14 +170,22 @@ private: void* FastMem9Start; void* FastMem7Start; u8* MemoryBase = nullptr; + #if defined(__SWITCH__) VirtmemReservation* FastMem9Reservation, *FastMem7Reservation; u8* MemoryBaseCodeMem; #elif defined(_WIN32) + struct VirtmemPlaceholder + { + uintptr_t Start; + size_t Size; + }; + std::vector VirtmemPlaceholders; + static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo); HANDLE MemoryFile = INVALID_HANDLE_VALUE; - LPVOID ExceptionHandlerHandle = nullptr; #else + static void SigsegvHandler(int sig, siginfo_t* info, void* rawContext); int MemoryFile = -1; #endif diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c32e2b73..bd73ae71 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -176,9 +176,9 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) - CALL((void*)&ARMv5JumpToTrampoline); + ABI_CallFunction(ARMv5JumpToTrampoline); else - CALL((void*)&ARMv4JumpToTrampoline); + ABI_CallFunction(ARMv4JumpToTrampoline); PopRegs(restoreCPSR, true); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ba6c0fb4..6de4caf6 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -21,19 +21,13 @@ #include "../ARMJIT.h" #include "../ARMInterpreter.h" #include "../NDS.h" +#include "../ARMJIT_Global.h" #include #include #include "../dolphin/CommonFuncs.h" -#ifdef _WIN32 -#include -#else -#include -#include -#endif - using namespace Gen; using namespace Common; @@ -222,46 +216,21 @@ void Compiler::A_Comp_MSR() MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((void*)&UpdateModeTrampoline); + ABI_CallFunction(UpdateModeTrampoline); PopRegs(true, true); } } } -/* - We'll repurpose this .bss memory - - */ -u8 CodeMemory[1024 * 1024 * 32]; - Compiler::Compiler(melonDS::NDS& nds) : XEmitter(), NDS(nds) { - { - #ifdef _WIN32 - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); + ARMJIT_Global::Init(); - u64 pageSize = (u64)sysInfo.dwPageSize; - #else - u64 pageSize = 
sysconf(_SC_PAGE_SIZE); - #endif + CodeMemBase = static_cast(ARMJIT_Global::AllocateCodeMem()); + CodeMemSize = ARMJIT_Global::CodeMemorySliceSize; - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; - - #ifdef _WIN32 - DWORD dummy; - VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); - #elif defined(__APPLE__) - pageAligned = (u8*)mmap(NULL, 1024*1024*32, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS ,-1, 0); - #else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); - #endif - - ResetStart = pageAligned; - CodeMemSize = alignedSize; - } + ResetStart = CodeMemBase; Reset(); @@ -475,6 +444,13 @@ Compiler::Compiler(melonDS::NDS& nds) : XEmitter(), NDS(nds) FarSize = (ResetStart + CodeMemSize) - FarStart; } +Compiler::~Compiler() +{ + ARMJIT_Global::FreeCodeMem(CodeMemBase); + + ARMJIT_Global::DeInit(); +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -684,7 +660,7 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) if (ConstantCycles) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - JMP((u8*)&ARM_Ret, true); + ABI_TailCall(ARM_Ret); } } @@ -846,7 +822,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (ConstantCycles) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - JMP((u8*)ARM_Ret, true); + ABI_TailCall(ARM_Ret); #ifdef JIT_PROFILING_ENABLED CreateMethod("JIT_Block_%d_%d_%08X", (void*)res, Num, Thumb, instrs[0].Addr); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3965e882..c714a6ba 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -84,6 +84,7 @@ class Compiler : public Gen::XEmitter { public: explicit Compiler(melonDS::NDS& nds); + ~Compiler(); void Reset(); @@ -256,6 +257,7 @@ public: std::unordered_map LoadStorePatches {}; + u8* CodeMemBase; u8* ResetStart {}; u32 CodeMemSize {}; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 219c7271..71cd0770 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -316,24 +316,24 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 8: CALL((void*)&SlowWrite9); break; - case 33: CALL((void*)&SlowWrite9); break; - case 17: CALL((void*)&SlowWrite9); break; - case 9: CALL((void*)&SlowWrite9); break; + case 32: ABI_CallFunction(SlowWrite9); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 8: ABI_CallFunction(&SlowWrite9); break; + case 33: ABI_CallFunction(&SlowWrite9); break; + case 17: ABI_CallFunction(&SlowWrite9); break; + case 9: ABI_CallFunction(&SlowWrite9); break; } } else { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - case 33: CALL((void*)&SlowRead9); break; - case 17: CALL((void*)&SlowRead9); break; - case 9: CALL((void*)&SlowRead9); break; + case 32: ABI_CallFunction(&SlowRead9); break; + case 16: ABI_CallFunction(&SlowRead9); break; + case 8: ABI_CallFunction(&SlowRead9); break; + case 33: ABI_CallFunction(&SlowRead9); break; + case 17: ABI_CallFunction(&SlowRead9); break; + case 9: ABI_CallFunction(&SlowRead9); break; } } 
} @@ -347,24 +347,24 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - case 33: CALL((void*)&SlowWrite7); break; - case 17: CALL((void*)&SlowWrite7); break; - case 9: CALL((void*)&SlowWrite7); break; + case 32: ABI_CallFunction(&SlowWrite7); break; + case 16: ABI_CallFunction(&SlowWrite7); break; + case 8: ABI_CallFunction(&SlowWrite7); break; + case 33: ABI_CallFunction(&SlowWrite7); break; + case 17: ABI_CallFunction(&SlowWrite7); break; + case 9: ABI_CallFunction(&SlowWrite7); break; } } else { switch (size | NDS.ConsoleType) { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - case 33: CALL((void*)&SlowRead7); break; - case 17: CALL((void*)&SlowRead7); break; - case 9: CALL((void*)&SlowRead7); break; + case 32: ABI_CallFunction(&SlowRead7); break; + case 16: ABI_CallFunction(&SlowRead7); break; + case 8: ABI_CallFunction(&SlowRead7); break; + case 33: ABI_CallFunction(&SlowRead7); break; + case 17: ABI_CallFunction(&SlowRead7); break; + case 9: ABI_CallFunction(&SlowRead7); break; } } } @@ -526,10 +526,10 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc switch (Num * 2 | NDS.ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: ABI_CallFunction(&SlowBlockTransfer9); break; + case 1: ABI_CallFunction(&SlowBlockTransfer9); break; + case 2: ABI_CallFunction(&SlowBlockTransfer7); break; + case 3: ABI_CallFunction(&SlowBlockTransfer7); break; } PopRegs(false, false); @@ -630,10 +630,10 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc switch (Num * 2 | NDS.ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: ABI_CallFunction(&SlowBlockTransfer9); break; + case 1: ABI_CallFunction(&SlowBlockTransfer9); break; + case 2: ABI_CallFunction(&SlowBlockTransfer7); break; + case 3: ABI_CallFunction(&SlowBlockTransfer7); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a4cb6f1e..fa8d475c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -97,8 +97,13 @@ if (ENABLE_JIT) ARMJIT.cpp ARMJIT_Memory.cpp + ARMJIT_Global.cpp dolphin/CommonFuncs.cpp) + + if (WIN32) + target_link_libraries(core PRIVATE onecore) + endif() if (ARCHITECTURE STREQUAL x86_64) target_sources(core PRIVATE diff --git a/src/NDS.cpp b/src/NDS.cpp index d8e1d216..b9370c6b 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -74,7 +74,7 @@ const s32 kIterationCycleMargin = 8; // // timings for GBA slot and wifi are set up at runtime -NDS* NDS::Current = nullptr; +thread_local NDS* NDS::Current = nullptr; NDS::NDS() noexcept : NDS( @@ -128,6 +128,7 @@ NDS::NDS(NDSArgs&& args, int type, void* userdata) noexcept : MainRAM = JIT.Memory.GetMainRAM(); SharedWRAM = JIT.Memory.GetSharedWRAM(); ARM7WRAM = JIT.Memory.GetARM7WRAM(); + } NDS::~NDS() noexcept @@ -894,6 +895,8 @@ void NDS::RunSystemSleep(u64 timestamp) template u32 NDS::RunFrame() { + Current = this; + FrameStartTimestamp = SysTimestamp; GPU.TotalScanlines = 0; diff --git a/src/NDS.h b/src/NDS.h index da68799f..8dfbf28b 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -541,8 +541,8 @@ public: NDS& operator=(const NDS&) = delete; NDS(NDS&&) = delete; NDS& operator=(NDS&&) = delete; - // The frontend should set and unset this manually after creating and destroying the NDS object. - [[deprecated("Temporary workaround until JIT code generation is revised to accommodate multiple NDS objects.")]] static NDS* Current; + + static thread_local NDS* Current; protected: explicit NDS(NDSArgs&& args, int type, void* userdata) noexcept; virtual void DoSavestateExtra(Savestate* file) {} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h index 36603218..d83a41f8 100644 --- a/src/dolphin/x64Emitter.h +++ b/src/dolphin/x64Emitter.h @@ -1019,6 +1019,28 @@ public: CALL(ptr); } } + template + void ABI_TailCall(FunctionPointer func) + { + static_assert(std::is_pointer() && + std::is_function>(), + "Supplied type must be a function pointer."); + + const u8* ptr = reinterpret_cast(func); + const u64 address = reinterpret_cast(ptr); + const u64 distance = address - (reinterpret_cast(code) + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far call + MOV(64, R(RAX), Imm64(address)); + JMPptr(R(RAX)); + } + else + { + JMP(ptr, true); + } + } template void ABI_CallFunctionC16(FunctionPointer func, u16 param1) diff --git a/src/frontend/qt_sdl/EmuInstance.cpp b/src/frontend/qt_sdl/EmuInstance.cpp index 1af8a887..a1d106b8 100644 --- a/src/frontend/qt_sdl/EmuInstance.cpp +++ b/src/frontend/qt_sdl/EmuInstance.cpp @@ -165,7 +165,6 @@ EmuInstance::~EmuInstance() audioDeInit(); inputDeInit(); - NDS::Current = nullptr; if (nds) { saveRTCData(); @@ -1339,7 +1338,6 @@ bool EmuInstance::updateConsole() noexcept renderLock.lock(); if ((!nds) || (consoleType != nds->ConsoleType)) { - NDS::Current = nullptr; if (nds) { saveRTCData(); @@ -1351,7 +1349,6 @@ bool EmuInstance::updateConsole() noexcept else nds = new NDS(std::move(ndsargs), this); - NDS::Current = nds; nds->Reset(); loadRTCData(); //emuThread->updateVideoRenderer(); // not actually needed? 
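A note on the Windows fast-memory changes above: the patch swaps the old fixed-address MapViewOfFileEx trick for placeholder reservations, resolving VirtualAlloc2 and MapViewOfFile3 from KernelBase.dll at runtime and linking against onecore. The block below is a minimal standalone sketch of that placeholder scheme, not code taken from the patch; reserveSize, viewSize and the direct (non-GetProcAddress) calls are illustrative assumptions, it needs a recent Windows SDK plus onecore.lib, and error handling is omitted.

#include <windows.h>

int main()
{
    const SIZE_T reserveSize = 64 * 1024 * 1024; // illustrative: the arena to reserve
    const SIZE_T viewSize    = 1024 * 1024;      // illustrative: one mapped window

    // Backing store shared by every view (stands in for the emulated RAM block).
    HANDLE file = CreateFileMappingW(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE,
                                     0, (DWORD)viewSize, nullptr);

    // 1. Reserve the whole arena as a single inaccessible placeholder.
    void* base = VirtualAlloc2(nullptr, nullptr, reserveSize,
                               MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS,
                               nullptr, 0);

    // 2. Split a view-sized placeholder off the front and replace it with a view
    //    of the backing file.
    VirtualFree(base, viewSize, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER);
    void* view = MapViewOfFile3(file, nullptr, base, 0, viewSize,
                                MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, nullptr, 0);
    ((char*)view)[0] = 1; // the window is now ordinary read/write memory

    // 3. Undo it: the view becomes a placeholder again and is coalesced back into
    //    the reservation, ready to be reused for a different mapping later.
    UnmapViewOfFileEx(view, MEM_PRESERVE_PLACEHOLDER);
    VirtualFree(base, reserveSize, MEM_RELEASE | MEM_COALESCE_PLACEHOLDERS);

    VirtualFree(base, 0, MEM_RELEASE);
    CloseHandle(file);
    return 0;
}

The point of the placeholders is that the reserved range never stops belonging to this process, so several emulator instances can each hold their own address-space mirrors without racing for the same virtual addresses.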
diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp
index b37f7118..ed5eba10 100644
--- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp
+++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp
@@ -82,9 +82,6 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new
     ui->chkJITBranchOptimisations->setChecked(cfg.GetBool("JIT.BranchOptimisations"));
     ui->chkJITLiteralOptimisations->setChecked(cfg.GetBool("JIT.LiteralOptimisations"));
     ui->chkJITFastMemory->setChecked(cfg.GetBool("JIT.FastMemory"));
-    #ifdef __APPLE__
-    ui->chkJITFastMemory->setDisabled(true);
-    #endif
     ui->spnJITMaximumBlockSize->setValue(cfg.GetInt("JIT.MaxBlockSize"));
 #else
     ui->chkEnableJIT->setDisabled(true);
@@ -541,9 +538,7 @@ void EmuSettingsDialog::on_chkEnableJIT_toggled()
     bool disabled = !ui->chkEnableJIT->isChecked();
     ui->chkJITBranchOptimisations->setDisabled(disabled);
     ui->chkJITLiteralOptimisations->setDisabled(disabled);
-    #ifndef __APPLE__
-    ui->chkJITFastMemory->setDisabled(disabled);
-    #endif
+    ui->chkJITFastMemory->setDisabled(disabled || !ARMJIT_Memory::IsFastMemSupported());
     ui->spnJITMaximumBlockSize->setDisabled(disabled);
 
     on_cbGdbEnabled_toggled();
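For the POSIX side, the same patch reserves the whole fastmem arena once with PROT_NONE and then maps or unmaps file-backed windows in place with MAP_FIXED; unmapping re-installs PROT_NONE memory instead of punching a hole (see the new UnmapFromRange). Below is a minimal sketch of that idea, assuming Linux with memfd_create available; the sizes, the memfd name and the missing error handling are illustrative simplifications, not the patch's actual setup code.

#include <sys/mman.h>
#include <unistd.h>
#include <cstring>

int main()
{
    const size_t reserveSize = 64 * 1024 * 1024; // illustrative reservation
    const size_t viewSize    = 1024 * 1024;      // illustrative window

    // Shared backing store (stands in for the emulated RAM block; the real code
    // sets its backing file up elsewhere).
    int fd = memfd_create("fastmem-backing", 0);
    ftruncate(fd, viewSize);

    // 1. Reserve the address range without committing memory; PROT_NONE keeps any
    //    other allocation out of it, which is what makes multiple instances safe.
    void* base = mmap(nullptr, reserveSize, PROT_NONE,
                      MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

    // 2. Punch a read/write window of the backing file into the reservation.
    void* view = mmap(base, viewSize, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_FIXED, fd, 0);
    std::memcpy(view, "hello", 6);

    // 3. "Unmap" by putting PROT_NONE anonymous memory back in place (as the new
    //    UnmapFromRange does), so the reservation never develops a reusable hole.
    mmap(base, viewSize, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);

    munmap(base, reserveSize);
    close(fd);
    return 0;
}

Keeping the whole range reserved is also what makes the SIGSEGV fast path safe: a fault inside it can only come from an emulated memory access, which MapAtAddress then backs lazily.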