From 86c1f6e1e7e2abcc3f42e1182134e632b218b6d3 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 1 Jan 2023 20:18:03 +0100 Subject: [PATCH] Jit: Don't use a second stack This second stack leads to JNI problems on Android, because ART fetches the address and size of the original stack using pthread functions (see GetThreadStack in art/runtime/thread.cc), and (presumably) treats stack addresses outside of the original stack as invalid. (What I don't understand is why some JNI operations on the CPU thread work fine despite this but others don't.) Instead of creating a second stack, let's borrow the approach ART uses: Use pthread functions to find out the stack's address and size, then install guard pages at an appropriate location. This lets us get rid of a workaround we had in the MsgAlert function. Because we're no longer choosing the stack size ourselves, I've made some tweaks to where to put the guard pages. Previously we had a stack of 2 MiB and a safe zone of 512 KiB. We now accept stacks as small as 512 KiB (used on macOS) and use a safe zone of 256 KiB. I feel like this should be fine, but haven't done much testing beyond "it seems to work". By the way, on Windows it was already the case that we didn't create a second stack... But there was a bug in the implementation! The code for protecting the stack has to run on the CPU thread, since it's the CPU thread's stack we want to protect, but it was actually running on EmuThread. This commit fixes that, since now this bug matters on other operating systems too. 
--- Source/Android/jni/MainAndroid.cpp | 26 ++--- Source/Core/Common/Thread.cpp | 36 +++++++ Source/Core/Common/Thread.h | 9 ++ Source/Core/Core/PowerPC/Jit64/Jit.cpp | 100 +++++++++++++------ Source/Core/Core/PowerPC/Jit64/Jit.h | 6 +- Source/Core/Core/PowerPC/Jit64/JitAsm.cpp | 28 ++---- Source/Core/Core/PowerPC/Jit64/JitAsm.h | 3 +- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 89 +++++++++++------ Source/Core/Core/PowerPC/JitArm64/Jit.h | 8 +- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 19 +--- 10 files changed, 203 insertions(+), 121 deletions(-) diff --git a/Source/Android/jni/MainAndroid.cpp b/Source/Android/jni/MainAndroid.cpp index 7902a9973f..01728ce014 100644 --- a/Source/Android/jni/MainAndroid.cpp +++ b/Source/Android/jni/MainAndroid.cpp @@ -195,26 +195,18 @@ std::unique_ptr Host_CreateGBAHost(std::weak_ptrCallStaticBooleanMethod( + IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no, + style == Common::MsgType::Warning, s_need_nonblocking_alert_msg); - jstring j_caption = ToJString(env, caption); - jstring j_text = ToJString(env, text); - - // Execute the Java method. 
- result = env->CallStaticBooleanMethod( - IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no, - style == Common::MsgType::Warning, s_need_nonblocking_alert_msg); - - env->DeleteLocalRef(j_caption); - env->DeleteLocalRef(j_text); - }).join(); + env->DeleteLocalRef(j_caption); + env->DeleteLocalRef(j_text); return result != JNI_FALSE; } diff --git a/Source/Core/Common/Thread.cpp b/Source/Core/Common/Thread.cpp index 7bbc27e5f4..810db15d78 100644 --- a/Source/Core/Common/Thread.cpp +++ b/Source/Core/Common/Thread.cpp @@ -7,6 +7,7 @@ #include #include #else +#include #include #endif @@ -185,6 +186,41 @@ void SetCurrentThreadName(const char* name) #endif } +std::tuple GetCurrentThreadStack() +{ + void* stack_addr; + size_t stack_size; + + pthread_t self = pthread_self(); + +#ifdef __APPLE__ + stack_size = pthread_get_stacksize_np(self); + stack_addr = reinterpret_cast(pthread_get_stackaddr_np(self)) - stack_size; +#elif defined __OpenBSD__ + stack_t stack; + pthread_stackseg_np(self, &stack); + + stack_addr = reinterpret_cast(stack.ss_sp) - stack.ss_size; + stack_size = stack.ss_size; +#else + pthread_attr_t attr; + +#ifdef __FreeBSD__ + pthread_attr_init(&attr); + pthread_attr_get_np(self, &attr); +#else + // Linux and NetBSD + pthread_getattr_np(self, &attr); +#endif + + pthread_attr_getstack(&attr, &stack_addr, &stack_size); + + pthread_attr_destroy(&attr); +#endif + + return std::make_tuple(stack_addr, stack_size); +} + #endif } // namespace Common diff --git a/Source/Core/Common/Thread.h b/Source/Core/Common/Thread.h index 7239df131b..fbba27a22c 100644 --- a/Source/Core/Common/Thread.h +++ b/Source/Core/Common/Thread.h @@ -5,6 +5,10 @@ #include +#ifndef _WIN32 +#include +#endif + // Don't include Common.h here as it will break LogManager #include "Common/CommonTypes.h" @@ -35,4 +39,9 @@ inline void YieldCPU() void SetCurrentThreadName(const char* name); +#ifndef _WIN32 +// Returns the lowest address of the stack and the 
size of the stack +std::tuple GetCurrentThreadStack(); +#endif + } // namespace Common diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index fd855b3893..6c9a616913 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -10,11 +10,14 @@ #include #include -// for the PROFILER stuff #ifdef _WIN32 #include +#include +#else +#include #endif +#include "Common/Align.h" #include "Common/CommonTypes.h" #include "Common/GekkoDisassembler.h" #include "Common/IOFile.h" @@ -23,6 +26,7 @@ #include "Common/PerformanceCounter.h" #include "Common/StringUtil.h" #include "Common/Swap.h" +#include "Common/Thread.h" #include "Common/x64ABI.h" #include "Core/Core.h" #include "Core/CoreTiming.h" @@ -140,15 +144,16 @@ using namespace PowerPC; // But when windows reaches the last guard page, it raises a "Stack Overflow" // exception which we can hook into, however by default it leaves you with less // than 4kb of stack. So we use SetThreadStackGuarantee to trigger the Stack -// Overflow early while we still have 512kb of stack remaining. +// Overflow early while we still have 256kb of stack remaining. // After resetting the stack to the top, we call _resetstkoflw() to restore -// the guard page at the 512kb mark. +// the guard page at the 256kb mark. 
enum { - STACK_SIZE = 2 * 1024 * 1024, - SAFE_STACK_SIZE = 512 * 1024, - GUARD_SIZE = 0x10000, // two guards - bottom (permanent) and middle (see above) + SAFE_STACK_SIZE = 256 * 1024, + MIN_UNSAFE_STACK_SIZE = 192 * 1024, + MIN_STACK_SIZE = SAFE_STACK_SIZE + MIN_UNSAFE_STACK_SIZE, + GUARD_SIZE = 64 * 1024, GUARD_OFFSET = SAFE_STACK_SIZE - GUARD_SIZE, }; @@ -158,27 +163,57 @@ Jit64::Jit64() : QuantizedMemoryRoutines(*this) Jit64::~Jit64() = default; -void Jit64::AllocStack() +void Jit64::ProtectStack() { -#ifndef _WIN32 - m_stack = static_cast(Common::AllocateMemoryPages(STACK_SIZE)); - Common::ReadProtectMemory(m_stack, GUARD_SIZE); - Common::ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE); -#else - // For windows we just keep using the system stack and reserve a large amount of memory at the end - // of the stack. + if (!m_enable_blr_optimization) + return; + +#ifdef _WIN32 ULONG reserveSize = SAFE_STACK_SIZE; SetThreadStackGuarantee(&reserveSize); +#else + auto [stack_addr, stack_size] = Common::GetCurrentThreadStack(); + + const uintptr_t stack_base_addr = reinterpret_cast(stack_addr); + const uintptr_t stack_middle_addr = reinterpret_cast(&stack_addr); + if (stack_middle_addr < stack_base_addr || stack_middle_addr >= stack_base_addr + stack_size) + { + PanicAlertFmt("Failed to get correct stack base"); + m_enable_blr_optimization = false; + return; + } + + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) + { + PanicAlertFmt("Failed to get page size"); + m_enable_blr_optimization = false; + return; + } + + const uintptr_t stack_guard_addr = Common::AlignUp(stack_base_addr + GUARD_OFFSET, page_size); + if (stack_guard_addr >= stack_middle_addr || + stack_middle_addr - stack_guard_addr < GUARD_SIZE + MIN_UNSAFE_STACK_SIZE) + { + PanicAlertFmt("Stack is too small for BLR optimization (size {:x}, base {:x}, current stack " + "pointer {:x}, alignment {:x})", + stack_size, stack_base_addr, stack_middle_addr, page_size); + 
m_enable_blr_optimization = false; + return; + } + + m_stack_guard = reinterpret_cast(stack_guard_addr); + Common::ReadProtectMemory(m_stack_guard, GUARD_SIZE); #endif } -void Jit64::FreeStack() +void Jit64::UnprotectStack() { #ifndef _WIN32 - if (m_stack) + if (m_stack_guard) { - Common::FreeMemoryPages(m_stack, STACK_SIZE); - m_stack = nullptr; + Common::UnWriteProtectMemory(m_stack_guard, GUARD_SIZE); + m_stack_guard = nullptr; } #endif } @@ -194,11 +229,9 @@ bool Jit64::HandleStackFault() WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program."); + UnprotectStack(); m_enable_blr_optimization = false; -#ifndef _WIN32 - // Windows does this automatically. - Common::UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE); -#endif + // We're going to need to clear the whole cache to get rid of the bad // CALLs, but we can't yet. Fake the downcount so we're forced to the // dispatcher (no block linking), and clear the cache so we're sent to @@ -214,11 +247,13 @@ bool Jit64::HandleStackFault() bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx) { - uintptr_t stack = (uintptr_t)m_stack; - uintptr_t diff = access_address - stack; + const uintptr_t stack_guard = reinterpret_cast(m_stack_guard); // In the trap region? - if (m_enable_blr_optimization && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE) + if (m_enable_blr_optimization && access_address >= stack_guard && + access_address < stack_guard + GUARD_SIZE) + { return HandleStackFault(); + } // This generates some fairly heavy trampolines, but it doesn't really hurt. // Only instructions that access I/O will get these, and there won't be that @@ -370,12 +405,10 @@ void Jit64::Init() m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging; m_cleanup_after_stackfault = false; - m_stack = nullptr; - if (m_enable_blr_optimization) - AllocStack(); + m_stack_guard = nullptr; blocks.Init(); - asm_routines.Init(m_stack ? 
(m_stack + STACK_SIZE) : nullptr); + asm_routines.Init(); // important: do this *after* generating the global asm routines, because we can't use farcode in // them. @@ -415,7 +448,6 @@ void Jit64::ResetFreeMemoryRanges() void Jit64::Shutdown() { - FreeStack(); FreeCodeSpace(); auto& system = Core::System::GetInstance(); @@ -735,14 +767,22 @@ void Jit64::WriteExternalExceptionExit() void Jit64::Run() { + ProtectStack(); + CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code; pExecAddr(); + + UnprotectStack(); } void Jit64::SingleStep() { + ProtectStack(); + CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code; pExecAddr(); + + UnprotectStack(); } void Jit64::Trace() diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 521c8f5f35..4d96ea518d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -255,8 +255,8 @@ private: bool HandleFunctionHooking(u32 address); - void AllocStack(); - void FreeStack(); + void ProtectStack(); + void UnprotectStack(); void ResetFreeMemoryRanges(); @@ -270,7 +270,7 @@ private: bool m_enable_blr_optimization = false; bool m_cleanup_after_stackfault = false; - u8* m_stack = nullptr; + u8* m_stack_guard = nullptr; HyoutaUtilities::RangeSizeSet m_free_ranges_near; HyoutaUtilities::RangeSizeSet m_free_ranges_far; diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 3dc98a317e..61f6f43d7b 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -24,10 +24,9 @@ Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(j { } -void Jit64AsmRoutineManager::Init(u8* stack_top) +void Jit64AsmRoutineManager::Init() { m_const_pool.Init(AllocChildCodeSpace(4096), 4096); - m_stack_top = stack_top; Generate(); WriteProtect(); } @@ -50,17 +49,8 @@ void Jit64AsmRoutineManager::Generate() // MOV(64, R(RMEM), 
Imm64((u64)Memory::physical_base)); MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80)); - if (m_stack_top) - { - // Pivot the stack to our custom one. - MOV(64, R(RSCRATCH), R(RSP)); - MOV(64, R(RSP), ImmPtr(m_stack_top - 0x20)); - MOV(64, MDisp(RSP, 0x18), R(RSCRATCH)); - } - else - { - MOV(64, PPCSTATE(stored_stack_pointer), R(RSP)); - } + MOV(64, PPCSTATE(stored_stack_pointer), R(RSP)); + // something that can't pass the BLR test MOV(64, MDisp(RSP, 8), Imm32((u32)-1)); @@ -209,12 +199,9 @@ void Jit64AsmRoutineManager::Generate() if (enable_debugging) SetJumpTarget(dbg_exit); + // Reset the stack pointer, since the BLR optimization may have pushed things onto the stack + // without popping them. ResetStack(*this); - if (m_stack_top) - { - ADD(64, R(RSP), Imm8(0x18)); - POP(RSP); - } ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); RET(); @@ -226,10 +213,7 @@ void Jit64AsmRoutineManager::Generate() void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter) { - if (m_stack_top) - emitter.MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20)); - else - emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer)); + emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer)); } void Jit64AsmRoutineManager::GenerateCommon() diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.h b/Source/Core/Core/PowerPC/Jit64/JitAsm.h index b0d44cb867..06713b1766 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.h +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.h @@ -36,7 +36,7 @@ public: explicit Jit64AsmRoutineManager(Jit64& jit); - void Init(u8* stack_top); + void Init(); void ResetStack(Gen::X64CodeBlock& emitter); @@ -44,6 +44,5 @@ private: void Generate(); void GenerateCommon(); - u8* m_stack_top = nullptr; JitBase& m_jit; }; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 5770e30acf..78b837bfaf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -5,6 +5,13 @@ 
#include +#ifdef _WIN32 +#include +#else +#include +#endif + +#include "Common/Align.h" #include "Common/Arm64Emitter.h" #include "Common/CommonTypes.h" #include "Common/Logging/Log.h" @@ -12,6 +19,7 @@ #include "Common/MsgHandler.h" #include "Common/PerformanceCounter.h" #include "Common/StringUtil.h" +#include "Common/Thread.h" #include "Core/ConfigManager.h" #include "Core/Core.h" @@ -38,9 +46,10 @@ constexpr size_t CODE_SIZE = 1024 * 1024 * 32; constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64; constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64; -constexpr size_t STACK_SIZE = 2 * 1024 * 1024; -constexpr size_t SAFE_STACK_SIZE = 512 * 1024; -constexpr size_t GUARD_SIZE = 64 * 1024; // two guards - bottom (permanent) and middle (see above) +constexpr size_t SAFE_STACK_SIZE = 256 * 1024; +constexpr size_t MIN_UNSAFE_STACK_SIZE = 192 * 1024; +constexpr size_t MIN_STACK_SIZE = SAFE_STACK_SIZE + MIN_UNSAFE_STACK_SIZE; +constexpr size_t GUARD_SIZE = 64 * 1024; constexpr size_t GUARD_OFFSET = SAFE_STACK_SIZE - GUARD_SIZE; JitArm64::JitArm64() : m_float_emit(this) @@ -74,7 +83,6 @@ void JitArm64::Init() m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging; m_cleanup_after_stackfault = false; - AllocStack(); GenerateAsm(); ResetFreeMemoryRanges(); @@ -117,9 +125,8 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx) bool success = false; // Handle BLR stack faults, may happen in C++ code. - uintptr_t stack = (uintptr_t)m_stack_base; - uintptr_t diff = access_address - stack; - if (diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE) + const uintptr_t stack_guard = reinterpret_cast(m_stack_guard); + if (access_address >= stack_guard && access_address < stack_guard + GUARD_SIZE) success = HandleStackFault(); // If the fault is in JIT code space, look for fastmem areas. 
@@ -162,10 +169,10 @@ bool JitArm64::HandleStackFault() return false; ERROR_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program."); + + UnprotectStack(); m_enable_blr_optimization = false; -#ifndef _WIN32 - Common::UnWriteProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE); -#endif + GetBlockCache()->InvalidateICache(0, 0xffffffff, true); Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0); m_cleanup_after_stackfault = true; @@ -205,7 +212,6 @@ void JitArm64::Shutdown() memory.ShutdownFastmemArena(); FreeCodeSpace(); blocks.Shutdown(); - FreeStack(); } void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) @@ -337,37 +343,56 @@ void JitArm64::ResetStack() ADD(ARM64Reg::SP, ARM64Reg::X0, 0); } -void JitArm64::AllocStack() +void JitArm64::ProtectStack() { if (!m_enable_blr_optimization) return; -#ifndef _WIN32 - m_stack_base = static_cast(Common::AllocateMemoryPages(STACK_SIZE)); - if (!m_stack_base) +#ifdef _WIN32 + ULONG reserveSize = SAFE_STACK_SIZE; + SetThreadStackGuarantee(&reserveSize); +#else + auto [stack_addr, stack_size] = Common::GetCurrentThreadStack(); + + const uintptr_t stack_base_addr = reinterpret_cast(stack_addr); + const uintptr_t stack_middle_addr = reinterpret_cast(&stack_addr); + if (stack_middle_addr < stack_base_addr || stack_middle_addr >= stack_base_addr + stack_size) { + PanicAlertFmt("Failed to get correct stack base"); m_enable_blr_optimization = false; return; } - m_stack_pointer = m_stack_base + STACK_SIZE; - Common::ReadProtectMemory(m_stack_base, GUARD_SIZE); - Common::ReadProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE); -#else - // For windows we just keep using the system stack and reserve a large amount of memory at the end - // of the stack. 
- ULONG reserveSize = SAFE_STACK_SIZE; - SetThreadStackGuarantee(&reserveSize); + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) + { + PanicAlertFmt("Failed to get page size"); + m_enable_blr_optimization = false; + return; + } + + const uintptr_t stack_guard_addr = Common::AlignUp(stack_base_addr + GUARD_OFFSET, page_size); + if (stack_guard_addr >= stack_middle_addr || + stack_middle_addr - stack_guard_addr < GUARD_SIZE + MIN_UNSAFE_STACK_SIZE) + { + PanicAlertFmt("Stack is too small for BLR optimization (size {:x}, base {:x}, current stack " + "pointer {:x}, alignment {:x})", + stack_size, stack_base_addr, stack_middle_addr, page_size); + m_enable_blr_optimization = false; + return; + } + + m_stack_guard = reinterpret_cast(stack_guard_addr); + Common::ReadProtectMemory(m_stack_guard, GUARD_SIZE); #endif } -void JitArm64::FreeStack() +void JitArm64::UnprotectStack() { #ifndef _WIN32 - if (m_stack_base) - Common::FreeMemoryPages(m_stack_base, STACK_SIZE); - m_stack_base = nullptr; - m_stack_pointer = nullptr; + if (m_stack_guard) + Common::UnWriteProtectMemory(m_stack_guard, GUARD_SIZE); + m_stack_guard = nullptr; #endif } @@ -696,14 +721,22 @@ void JitArm64::EndTimeProfile(JitBlock* b) void JitArm64::Run() { + ProtectStack(); + CompiledCode pExecAddr = (CompiledCode)enter_code; pExecAddr(); + + UnprotectStack(); } void JitArm64::SingleStep() { + ProtectStack(); + CompiledCode pExecAddr = (CompiledCode)enter_code; pExecAddr(); + + UnprotectStack(); } void JitArm64::Trace() diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 665f234490..ca8fd80a30 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -288,8 +288,8 @@ protected: void DoDownCount(); void Cleanup(); void ResetStack(); - void AllocStack(); - void FreeStack(); + void ProtectStack(); + void UnprotectStack(); void ResetFreeMemoryRanges(); @@ -365,9 +365,7 @@ protected: bool 
m_enable_blr_optimization = false; bool m_cleanup_after_stackfault = false; - u8* m_stack_base = nullptr; - u8* m_stack_pointer = nullptr; - u8* m_saved_stack_pointer = nullptr; + u8* m_stack_guard = nullptr; HyoutaUtilities::RangeSizeSet m_free_ranges_near; HyoutaUtilities::RangeSizeSet m_free_ranges_far; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 7d4a25f7f8..bfc0de7449 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -45,23 +45,14 @@ void JitArm64::GenerateAsm() MOVP2R(PPC_REG, &PowerPC::ppcState); - // Swap the stack pointer, so we have proper guard pages. + // Store the stack pointer, so we can reset it if the BLR optimization fails. ADD(ARM64Reg::X0, ARM64Reg::SP, 0); - STR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, - MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer)); - LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, MOVPage2R(ARM64Reg::X1, &m_stack_pointer)); - FixupBranch no_fake_stack = CBZ(ARM64Reg::X0); - ADD(ARM64Reg::SP, ARM64Reg::X0, 0); - SetJumpTarget(no_fake_stack); + STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); // Push {nullptr; -1} as invalid destination on the stack. MOVI2R(ARM64Reg::X0, 0xFFFFFFFF); STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X0, ARM64Reg::SP, -16); - // Store the stack pointer, so we can reset it if the BLR optimization fails. - ADD(ARM64Reg::X0, ARM64Reg::SP, 0); - STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); - // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance(). // Advance() does an exception check so we don't know what PC to use until afterwards. FixupBranch to_start_of_timing_slice = B(); @@ -204,9 +195,9 @@ void JitArm64::GenerateAsm() if (enable_debugging) SetJumpTarget(debug_exit); - // Reset the stack pointer, as the BLR optimization have touched it. 
- LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, - MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer)); + // Reset the stack pointer, since the BLR optimization may have pushed things onto the stack + // without popping them. + LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); ADD(ARM64Reg::SP, ARM64Reg::X0, 0); m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X30);