Merge pull request #11399 from JosJuice/jit-one-stack

Jit: Don't use a second stack
This commit is contained in:
JosJuice 2023-03-03 22:27:16 +01:00 committed by GitHub
commit 95ce41ac56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 248 additions and 258 deletions

View File

@ -195,26 +195,18 @@ std::unique_ptr<GBAHostInterface> Host_CreateGBAHost(std::weak_ptr<HW::GBA::Core
static bool MsgAlert(const char* caption, const char* text, bool yes_no, Common::MsgType style) static bool MsgAlert(const char* caption, const char* text, bool yes_no, Common::MsgType style)
{ {
// If a panic alert happens very early in the execution of a game, we can crash here with JNIEnv* env = IDCache::GetEnvForThread();
// the error "JNI NewString called with pending exception java.lang.StackOverflowError".
// As a workaround, let's put the call on a new thread with a brand new stack.
jboolean result; jstring j_caption = ToJString(env, caption);
jstring j_text = ToJString(env, text);
std::thread([&] { // Execute the Java method.
JNIEnv* env = IDCache::GetEnvForThread(); jboolean result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);
jstring j_caption = ToJString(env, caption); env->DeleteLocalRef(j_caption);
jstring j_text = ToJString(env, text); env->DeleteLocalRef(j_text);
// Execute the Java method.
result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);
env->DeleteLocalRef(j_caption);
env->DeleteLocalRef(j_text);
}).join();
return result != JNI_FALSE; return result != JNI_FALSE;
} }

View File

@ -7,6 +7,7 @@
#include <Windows.h> #include <Windows.h>
#include <processthreadsapi.h> #include <processthreadsapi.h>
#else #else
#include <pthread.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
@ -185,6 +186,41 @@ void SetCurrentThreadName(const char* name)
#endif #endif
} }
// Queries the calling thread's stack through the platform-specific pthread extension.
//
// Returns a tuple of (lowest address of the stack, size of the stack in bytes).
// If the platform query reports an error, (nullptr, 0) is returned instead of
// uninitialized values, so callers can detect the failure with a simple range check.
std::tuple<void*, size_t> GetCurrentThreadStack()
{
  void* stack_addr = nullptr;
  size_t stack_size = 0;

  pthread_t self = pthread_self();

#ifdef __APPLE__
  // The size is subtracted from the reported address to obtain the lowest address.
  stack_size = pthread_get_stacksize_np(self);
  stack_addr = reinterpret_cast<u8*>(pthread_get_stackaddr_np(self)) - stack_size;
#elif defined __OpenBSD__
  stack_t stack;
  if (pthread_stackseg_np(self, &stack) == 0)
  {
    // `stack` is a stack_t object (not a pointer), so its members are accessed with '.'.
    stack_addr = reinterpret_cast<u8*>(stack.ss_sp) - stack.ss_size;
    stack_size = stack.ss_size;
  }
#else
  pthread_attr_t attr;

#ifdef __FreeBSD__
  // Here the attribute object is initialized by us before it is filled in,
  // so it must be destroyed whether or not the query succeeds.
  if (pthread_attr_init(&attr) != 0)
    return std::make_tuple(stack_addr, stack_size);

  if (pthread_attr_get_np(self, &attr) == 0)
    pthread_attr_getstack(&attr, &stack_addr, &stack_size);

  pthread_attr_destroy(&attr);
#else
  // Linux and NetBSD
  // Only touch (and destroy) attr if the query succeeded.
  if (pthread_getattr_np(self, &attr) == 0)
  {
    pthread_attr_getstack(&attr, &stack_addr, &stack_size);
    pthread_attr_destroy(&attr);
  }
#endif
#endif

  return std::make_tuple(stack_addr, stack_size);
}
#endif #endif
} // namespace Common } // namespace Common

View File

@ -5,6 +5,10 @@
#include <thread> #include <thread>
#ifndef _WIN32
#include <tuple>
#endif
// Don't include Common.h here as it will break LogManager // Don't include Common.h here as it will break LogManager
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
@ -35,4 +39,9 @@ inline void YieldCPU()
void SetCurrentThreadName(const char* name); void SetCurrentThreadName(const char* name);
#ifndef _WIN32
// Returns the lowest address of the stack and the size of the stack
std::tuple<void*, size_t> GetCurrentThreadStack();
#endif
} // namespace Common } // namespace Common

View File

@ -19,7 +19,6 @@
#include "Common/GekkoDisassembler.h" #include "Common/GekkoDisassembler.h"
#include "Common/IOFile.h" #include "Common/IOFile.h"
#include "Common/Logging/Log.h" #include "Common/Logging/Log.h"
#include "Common/MemoryUtil.h"
#include "Common/PerformanceCounter.h" #include "Common/PerformanceCounter.h"
#include "Common/StringUtil.h" #include "Common/StringUtil.h"
#include "Common/Swap.h" #include "Common/Swap.h"
@ -117,108 +116,21 @@ using namespace PowerPC;
and such, but it's currently limited to integer ops only. This can definitely be made better. and such, but it's currently limited to integer ops only. This can definitely be made better.
*/ */
// The BLR optimization is nice, but it means that JITted code can overflow the
// native stack by repeatedly running BL. (The chance of this happening in any
// retail game is close to 0, but correctness is correctness...) Also, the
// overflow might not happen directly in the JITted code but in a C++ function
// called from it, so we can't just adjust RSP in the case of a fault.
// Instead, we have to have extra stack space preallocated under the fault
// point which allows the code to continue, after wiping the JIT cache so we
// can reset things at a safe point. Once this condition trips, the
// optimization is permanently disabled, under the assumption this will never
// happen in practice.
// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
// handle it the same way as fastmem faults. It's safe to take a fault with a
// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
// on a separate thread.
// Windows is... under-documented.
// It already puts guard pages so it can automatically grow the stack and it
// doesn't look like there is a way to hook into a guard page fault and implement
// our own logic.
// But when windows reaches the last guard page, it raises a "Stack Overflow"
// exception which we can hook into, however by default it leaves you with less
// than 4kb of stack. So we use SetThreadStackGuarantee to trigger the Stack
// Overflow early while we still have 512kb of stack remaining.
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 512kb mark.
enum
{
STACK_SIZE = 2 * 1024 * 1024,
SAFE_STACK_SIZE = 512 * 1024,
GUARD_SIZE = 0x10000, // two guards - bottom (permanent) and middle (see above)
GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE,
};
Jit64::Jit64() : QuantizedMemoryRoutines(*this) Jit64::Jit64() : QuantizedMemoryRoutines(*this)
{ {
} }
Jit64::~Jit64() = default; Jit64::~Jit64() = default;
void Jit64::AllocStack()
{
#ifndef _WIN32
m_stack = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
Common::ReadProtectMemory(m_stack, GUARD_SIZE);
Common::ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#else
// For windows we just keep using the system stack and reserve a large amount of memory at the end
// of the stack.
ULONG reserveSize = SAFE_STACK_SIZE;
SetThreadStackGuarantee(&reserveSize);
#endif
}
void Jit64::FreeStack()
{
#ifndef _WIN32
if (m_stack)
{
Common::FreeMemoryPages(m_stack, STACK_SIZE);
m_stack = nullptr;
}
#endif
}
bool Jit64::HandleStackFault()
{
// It's possible the stack fault might have been caused by something other than
// the BLR optimization. If the fault was triggered from another thread, or
// when BLR optimization isn't enabled then there is nothing we can do about the fault.
// Return false so the regular stack overflow handler can trigger (which crashes)
if (!m_enable_blr_optimization || !Core::IsCPUThread())
return false;
WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");
m_enable_blr_optimization = false;
#ifndef _WIN32
// Windows does this automatically.
Common::UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#endif
// We're going to need to clear the whole cache to get rid of the bad
// CALLs, but we can't yet. Fake the downcount so we're forced to the
// dispatcher (no block linking), and clear the cache so we're sent to
// Jit. In the case of Windows, we will also need to call _resetstkoflw()
// to reset the guard page.
// Yeah, it's kind of gross.
GetBlockCache()->InvalidateICache(0, 0xffffffff, true);
Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0);
m_cleanup_after_stackfault = true;
return true;
}
bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx) bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx)
{ {
uintptr_t stack = (uintptr_t)m_stack; const uintptr_t stack_guard = reinterpret_cast<uintptr_t>(m_stack_guard);
uintptr_t diff = access_address - stack;
// In the trap region? // In the trap region?
if (m_enable_blr_optimization && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE) if (m_enable_blr_optimization && access_address >= stack_guard &&
access_address < stack_guard + GUARD_SIZE)
{
return HandleStackFault(); return HandleStackFault();
}
// This generates some fairly heavy trampolines, but it doesn't really hurt. // This generates some fairly heavy trampolines, but it doesn't really hurt.
// Only instructions that access I/O will get these, and there won't be that // Only instructions that access I/O will get these, and there won't be that
@ -365,17 +277,10 @@ void Jit64::Init()
m_const_pool.Init(AllocChildCodeSpace(constpool_size), constpool_size); m_const_pool.Init(AllocChildCodeSpace(constpool_size), constpool_size);
ResetCodePtr(); ResetCodePtr();
// BLR optimization has the same consequences as block linking, as well as m_stack_guard = nullptr;
// depending on the fault handler to be safe in the event of excessive BL.
m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
m_cleanup_after_stackfault = false;
m_stack = nullptr;
if (m_enable_blr_optimization)
AllocStack();
blocks.Init(); blocks.Init();
asm_routines.Init(m_stack ? (m_stack + STACK_SIZE) : nullptr); asm_routines.Init();
// important: do this *after* generating the global asm routines, because we can't use farcode in // important: do this *after* generating the global asm routines, because we can't use farcode in
// them. // them.
@ -415,7 +320,6 @@ void Jit64::ResetFreeMemoryRanges()
void Jit64::Shutdown() void Jit64::Shutdown()
{ {
FreeStack();
FreeCodeSpace(); FreeCodeSpace();
auto& system = Core::System::GetInstance(); auto& system = Core::System::GetInstance();
@ -735,14 +639,22 @@ void Jit64::WriteExternalExceptionExit()
void Jit64::Run() void Jit64::Run()
{ {
ProtectStack();
CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code; CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr(); pExecAddr();
UnprotectStack();
} }
void Jit64::SingleStep() void Jit64::SingleStep()
{ {
ProtectStack();
CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code; CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr(); pExecAddr();
UnprotectStack();
} }
void Jit64::Trace() void Jit64::Trace()
@ -779,15 +691,7 @@ void Jit64::Jit(u32 em_address)
void Jit64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) void Jit64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
{ {
if (m_cleanup_after_stackfault) CleanUpAfterStackFault();
{
ClearCache();
m_cleanup_after_stackfault = false;
#ifdef _WIN32
// The stack is in an invalid state with no guard page, reset it.
_resetstkoflw();
#endif
}
if (trampolines.IsAlmostFull() || SConfig::GetInstance().bJITNoBlockCache) if (trampolines.IsAlmostFull() || SConfig::GetInstance().bJITNoBlockCache)
{ {

View File

@ -50,7 +50,6 @@ public:
void Shutdown() override; void Shutdown() override;
bool HandleFault(uintptr_t access_address, SContext* ctx) override; bool HandleFault(uintptr_t access_address, SContext* ctx) override;
bool HandleStackFault() override;
bool BackPatch(SContext* ctx); bool BackPatch(SContext* ctx);
void EnableOptimization(); void EnableOptimization();
@ -255,9 +254,6 @@ private:
bool HandleFunctionHooking(u32 address); bool HandleFunctionHooking(u32 address);
void AllocStack();
void FreeStack();
void ResetFreeMemoryRanges(); void ResetFreeMemoryRanges();
JitBlockCache blocks{*this}; JitBlockCache blocks{*this};
@ -268,10 +264,6 @@ private:
Jit64AsmRoutineManager asm_routines{*this}; Jit64AsmRoutineManager asm_routines{*this};
bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack = nullptr;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
}; };

View File

@ -24,10 +24,9 @@ Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(j
{ {
} }
void Jit64AsmRoutineManager::Init(u8* stack_top) void Jit64AsmRoutineManager::Init()
{ {
m_const_pool.Init(AllocChildCodeSpace(4096), 4096); m_const_pool.Init(AllocChildCodeSpace(4096), 4096);
m_stack_top = stack_top;
Generate(); Generate();
WriteProtect(); WriteProtect();
} }
@ -50,17 +49,8 @@ void Jit64AsmRoutineManager::Generate()
// MOV(64, R(RMEM), Imm64((u64)Memory::physical_base)); // MOV(64, R(RMEM), Imm64((u64)Memory::physical_base));
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80)); MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));
if (m_stack_top) MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));
{
// Pivot the stack to our custom one.
MOV(64, R(RSCRATCH), R(RSP));
MOV(64, R(RSP), ImmPtr(m_stack_top - 0x20));
MOV(64, MDisp(RSP, 0x18), R(RSCRATCH));
}
else
{
MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));
}
// something that can't pass the BLR test // something that can't pass the BLR test
MOV(64, MDisp(RSP, 8), Imm32((u32)-1)); MOV(64, MDisp(RSP, 8), Imm32((u32)-1));
@ -209,12 +199,9 @@ void Jit64AsmRoutineManager::Generate()
if (enable_debugging) if (enable_debugging)
SetJumpTarget(dbg_exit); SetJumpTarget(dbg_exit);
// Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
// without popping them.
ResetStack(*this); ResetStack(*this);
if (m_stack_top)
{
ADD(64, R(RSP), Imm8(0x18));
POP(RSP);
}
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET(); RET();
@ -226,10 +213,7 @@ void Jit64AsmRoutineManager::Generate()
void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter) void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter)
{ {
if (m_stack_top) emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
emitter.MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
else
emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
} }
void Jit64AsmRoutineManager::GenerateCommon() void Jit64AsmRoutineManager::GenerateCommon()

View File

@ -36,7 +36,7 @@ public:
explicit Jit64AsmRoutineManager(Jit64& jit); explicit Jit64AsmRoutineManager(Jit64& jit);
void Init(u8* stack_top); void Init();
void ResetStack(Gen::X64CodeBlock& emitter); void ResetStack(Gen::X64CodeBlock& emitter);
@ -44,6 +44,5 @@ private:
void Generate(); void Generate();
void GenerateCommon(); void GenerateCommon();
u8* m_stack_top = nullptr;
JitBase& m_jit; JitBase& m_jit;
}; };

View File

@ -38,11 +38,6 @@ constexpr size_t CODE_SIZE = 1024 * 1024 * 32;
constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64; constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64;
constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64; constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64;
constexpr size_t STACK_SIZE = 2 * 1024 * 1024;
constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
constexpr size_t GUARD_SIZE = 64 * 1024; // two guards - bottom (permanent) and middle (see above)
constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;
JitArm64::JitArm64() : m_float_emit(this) JitArm64::JitArm64() : m_float_emit(this)
{ {
} }
@ -71,10 +66,6 @@ void JitArm64::Init()
code_block.m_gpa = &js.gpa; code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa; code_block.m_fpa = &js.fpa;
m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
m_cleanup_after_stackfault = false;
AllocStack();
GenerateAsm(); GenerateAsm();
ResetFreeMemoryRanges(); ResetFreeMemoryRanges();
@ -117,9 +108,8 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
bool success = false; bool success = false;
// Handle BLR stack faults, may happen in C++ code. // Handle BLR stack faults, may happen in C++ code.
uintptr_t stack = (uintptr_t)m_stack_base; const uintptr_t stack_guard = reinterpret_cast<uintptr_t>(m_stack_guard);
uintptr_t diff = access_address - stack; if (access_address >= stack_guard && access_address < stack_guard + GUARD_SIZE)
if (diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
success = HandleStackFault(); success = HandleStackFault();
// If the fault is in JIT code space, look for fastmem areas. // If the fault is in JIT code space, look for fastmem areas.
@ -156,23 +146,6 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
return success; return success;
} }
bool JitArm64::HandleStackFault()
{
if (!m_enable_blr_optimization)
return false;
ERROR_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");
m_enable_blr_optimization = false;
#ifndef _WIN32
Common::UnWriteProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE);
#endif
GetBlockCache()->InvalidateICache(0, 0xffffffff, true);
Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0);
m_cleanup_after_stackfault = true;
return true;
}
void JitArm64::ClearCache() void JitArm64::ClearCache()
{ {
m_fault_to_handler.clear(); m_fault_to_handler.clear();
@ -205,7 +178,6 @@ void JitArm64::Shutdown()
memory.ShutdownFastmemArena(); memory.ShutdownFastmemArena();
FreeCodeSpace(); FreeCodeSpace();
blocks.Shutdown(); blocks.Shutdown();
FreeStack();
} }
void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
@ -337,40 +309,6 @@ void JitArm64::ResetStack()
ADD(ARM64Reg::SP, ARM64Reg::X0, 0); ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
} }
void JitArm64::AllocStack()
{
if (!m_enable_blr_optimization)
return;
#ifndef _WIN32
m_stack_base = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
if (!m_stack_base)
{
m_enable_blr_optimization = false;
return;
}
m_stack_pointer = m_stack_base + STACK_SIZE;
Common::ReadProtectMemory(m_stack_base, GUARD_SIZE);
Common::ReadProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE);
#else
// For windows we just keep using the system stack and reserve a large amount of memory at the end
// of the stack.
ULONG reserveSize = SAFE_STACK_SIZE;
SetThreadStackGuarantee(&reserveSize);
#endif
}
void JitArm64::FreeStack()
{
#ifndef _WIN32
if (m_stack_base)
Common::FreeMemoryPages(m_stack_base, STACK_SIZE);
m_stack_base = nullptr;
m_stack_pointer = nullptr;
#endif
}
void JitArm64::IntializeSpeculativeConstants() void JitArm64::IntializeSpeculativeConstants()
{ {
// If the block depends on an input register which looks like a gather pipe or MMIO related // If the block depends on an input register which looks like a gather pipe or MMIO related
@ -696,14 +634,22 @@ void JitArm64::EndTimeProfile(JitBlock* b)
void JitArm64::Run() void JitArm64::Run()
{ {
ProtectStack();
CompiledCode pExecAddr = (CompiledCode)enter_code; CompiledCode pExecAddr = (CompiledCode)enter_code;
pExecAddr(); pExecAddr();
UnprotectStack();
} }
void JitArm64::SingleStep() void JitArm64::SingleStep()
{ {
ProtectStack();
CompiledCode pExecAddr = (CompiledCode)enter_code; CompiledCode pExecAddr = (CompiledCode)enter_code;
pExecAddr(); pExecAddr();
UnprotectStack();
} }
void JitArm64::Trace() void JitArm64::Trace()
@ -740,15 +686,7 @@ void JitArm64::Jit(u32 em_address)
void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
{ {
if (m_cleanup_after_stackfault) CleanUpAfterStackFault();
{
ClearCache();
m_cleanup_after_stackfault = false;
#ifdef _WIN32
// The stack is in an invalid state with no guard page, reset it.
_resetstkoflw();
#endif
}
if (SConfig::GetInstance().bJITNoBlockCache) if (SConfig::GetInstance().bJITNoBlockCache)
ClearCache(); ClearCache();

View File

@ -32,7 +32,6 @@ public:
bool IsInCodeSpace(const u8* ptr) const { return IsInSpace(ptr); } bool IsInCodeSpace(const u8* ptr) const { return IsInSpace(ptr); }
bool HandleFault(uintptr_t access_address, SContext* ctx) override; bool HandleFault(uintptr_t access_address, SContext* ctx) override;
void DoBacktrace(uintptr_t access_address, SContext* ctx); void DoBacktrace(uintptr_t access_address, SContext* ctx);
bool HandleStackFault() override;
bool HandleFastmemFault(SContext* ctx); bool HandleFastmemFault(SContext* ctx);
void ClearCache() override; void ClearCache() override;
@ -288,8 +287,6 @@ protected:
void DoDownCount(); void DoDownCount();
void Cleanup(); void Cleanup();
void ResetStack(); void ResetStack();
void AllocStack();
void FreeStack();
void ResetFreeMemoryRanges(); void ResetFreeMemoryRanges();
@ -363,12 +360,6 @@ protected:
u8* m_near_code_end = nullptr; u8* m_near_code_end = nullptr;
bool m_near_code_write_failed = false; bool m_near_code_write_failed = false;
bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack_base = nullptr;
u8* m_stack_pointer = nullptr;
u8* m_saved_stack_pointer = nullptr;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
}; };

View File

@ -45,23 +45,14 @@ void JitArm64::GenerateAsm()
MOVP2R(PPC_REG, &PowerPC::ppcState); MOVP2R(PPC_REG, &PowerPC::ppcState);
// Swap the stack pointer, so we have proper guard pages. // Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(ARM64Reg::X0, ARM64Reg::SP, 0); ADD(ARM64Reg::X0, ARM64Reg::SP, 0);
STR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer));
LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, MOVPage2R(ARM64Reg::X1, &m_stack_pointer));
FixupBranch no_fake_stack = CBZ(ARM64Reg::X0);
ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
SetJumpTarget(no_fake_stack);
// Push {nullptr; -1} as invalid destination on the stack. // Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(ARM64Reg::X0, 0xFFFFFFFF); MOVI2R(ARM64Reg::X0, 0xFFFFFFFF);
STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X0, ARM64Reg::SP, -16); STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X0, ARM64Reg::SP, -16);
// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(ARM64Reg::X0, ARM64Reg::SP, 0);
STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance(). // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards. // Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B(); FixupBranch to_start_of_timing_slice = B();
@ -204,9 +195,9 @@ void JitArm64::GenerateAsm()
if (enable_debugging) if (enable_debugging)
SetJumpTarget(debug_exit); SetJumpTarget(debug_exit);
// Reset the stack pointer, as the BLR optimization have touched it. // Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, // without popping them.
MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer)); LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(ARM64Reg::SP, ARM64Reg::X0, 0); ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X30); m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X30);

View File

@ -3,15 +3,53 @@
#include "Core/PowerPC/JitCommon/JitBase.h" #include "Core/PowerPC/JitCommon/JitBase.h"
#include "Common/Align.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/MemoryUtil.h"
#include "Common/Thread.h"
#include "Core/Config/MainSettings.h" #include "Core/Config/MainSettings.h"
#include "Core/ConfigManager.h" #include "Core/ConfigManager.h"
#include "Core/Core.h" #include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/HW/CPU.h" #include "Core/HW/CPU.h"
#include "Core/PowerPC/PPCAnalyst.h" #include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PowerPC.h" #include "Core/PowerPC/PowerPC.h"
#include "Core/System.h" #include "Core/System.h"
#ifdef _WIN32
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <unistd.h>
#endif
// The BLR optimization is nice, but it means that JITted code can overflow the
// native stack by repeatedly running BL. (The chance of this happening in any
// retail game is close to 0, but correctness is correctness...) Also, the
// overflow might not happen directly in the JITted code but in a C++ function
// called from it, so we can't just adjust RSP in the case of a fault.
// Instead, we have to have extra stack space preallocated under the fault
// point which allows the code to continue, after wiping the JIT cache so we
// can reset things at a safe point. Once this condition trips, the
// optimization is permanently disabled, under the assumption this will never
// happen in practice.
// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
// handle it the same way as fastmem faults. It's safe to take a fault with a
// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
// on a separate thread.
// Windows is... under-documented.
// It already puts guard pages so it can automatically grow the stack and it
// doesn't look like there is a way to hook into a guard page fault and implement
// our own logic.
// But when Windows reaches the last guard page, it raises a "Stack Overflow"
// exception which we can hook into, however by default it leaves you with less
// than 4kb of stack. So we use SetThreadStackGuarantee to trigger the Stack
// Overflow early while we still have 256kb of stack remaining.
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 256kb mark.
const u8* JitBase::Dispatch(JitBase& jit) const u8* JitBase::Dispatch(JitBase& jit)
{ {
return jit.GetBlockCache()->Dispatch(); return jit.GetBlockCache()->Dispatch();
@ -72,6 +110,107 @@ void JitBase::RefreshConfig()
analyzer.SetDivByZeroExceptionsEnabled(m_enable_div_by_zero_exceptions); analyzer.SetDivByZeroExceptionsEnabled(m_enable_div_by_zero_exceptions);
} }
// Decides whether the BLR optimization is used and clears any pending
// stack-fault cleanup request.
void JitBase::InitBLROptimization()
{
  // The optimization is only enabled when block linking and fastmem are on
  // and debugging is off.
  const bool blr_usable = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;

  m_enable_blr_optimization = blr_usable;
  m_cleanup_after_stackfault = false;
}
// Arms the stack-overflow trap used by the BLR optimization on the calling thread.
//
// Does nothing when the BLR optimization is disabled. On Windows it requests a
// stack guarantee of SAFE_STACK_SIZE bytes. Elsewhere it read-protects a
// GUARD_SIZE region of the current thread's stack and remembers it in
// m_stack_guard; if the stack cannot be located or is too small, the BLR
// optimization is disabled after showing a panic alert.
void JitBase::ProtectStack()
{
  if (!m_enable_blr_optimization)
    return;

#ifdef _WIN32
  ULONG guaranteed_bytes = SAFE_STACK_SIZE;
  SetThreadStackGuarantee(&guaranteed_bytes);
#else
  auto [lowest_addr, total_size] = Common::GetCurrentThreadStack();

  const uintptr_t base_addr = reinterpret_cast<uintptr_t>(lowest_addr);
  // The address of a local serves as an approximation of the current stack position.
  const uintptr_t current_addr = reinterpret_cast<uintptr_t>(&lowest_addr);

  // Sanity check: the local we just created must live inside the reported stack.
  const bool inside_stack = current_addr >= base_addr && current_addr < base_addr + total_size;
  if (!inside_stack)
  {
    PanicAlertFmt("Failed to get correct stack base");
    m_enable_blr_optimization = false;
    return;
  }

  const long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0)
  {
    PanicAlertFmt("Failed to get page size");
    m_enable_blr_optimization = false;
    return;
  }

  // The guard starts GUARD_OFFSET bytes above the lowest stack address, rounded up to a page.
  const uintptr_t guard_addr = Common::AlignUp(base_addr + GUARD_OFFSET, page_size);

  // The guard must lie below the current position, with room for the guard
  // itself plus MIN_UNSAFE_STACK_SIZE bytes in between.
  const bool enough_room =
      guard_addr < current_addr && current_addr - guard_addr >= GUARD_SIZE + MIN_UNSAFE_STACK_SIZE;
  if (!enough_room)
  {
    PanicAlertFmt("Stack is too small for BLR optimization (size {:x}, base {:x}, current stack "
                  "pointer {:x}, alignment {:x})",
                  total_size, base_addr, current_addr, page_size);
    m_enable_blr_optimization = false;
    return;
  }

  m_stack_guard = reinterpret_cast<u8*>(guard_addr);
  Common::ReadProtectMemory(m_stack_guard, GUARD_SIZE);
#endif
}
// Removes the guard region installed by ProtectStack(), if there is one.
// Compiles to an empty function on Windows.
void JitBase::UnprotectStack()
{
#ifndef _WIN32
  if (m_stack_guard == nullptr)
    return;

  // Make the guard region accessible again and forget about it.
  Common::UnWriteProtectMemory(m_stack_guard, GUARD_SIZE);
  m_stack_guard = nullptr;
#endif
}
// Reacts to a fault inside the stack guard region.
//
// Returns false when the fault cannot be attributed to the BLR optimization
// (the optimization is off, or the fault came from a thread other than the CPU
// thread), so that the regular stack overflow handling takes over. Otherwise
// disables the optimization, schedules a cache cleanup and returns true.
bool JitBase::HandleStackFault()
{
  const bool caused_by_blr = m_enable_blr_optimization && Core::IsCPUThread();
  if (!caused_by_blr)
    return false;

  WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");

  UnprotectStack();
  m_enable_blr_optimization = false;

  // The whole cache has to be cleared to get rid of the bad CALLs, but that
  // can't be done yet. Invalidate the icache so execution is sent to Jit, and
  // fake the downcount so we're forced back to the dispatcher (no block
  // linking). On Windows, _resetstkoflw() also has to be called later to
  // reset the guard page.
  GetBlockCache()->InvalidateICache(0, 0xffffffff, true);

  auto&& core_timing = Core::System::GetInstance().GetCoreTiming();
  core_timing.ForceExceptionCheck(0);

  m_cleanup_after_stackfault = true;
  return true;
}
// Performs the deferred cleanup requested by HandleStackFault(), if any:
// clears the JIT cache and, on Windows, restores the stack guard page.
void JitBase::CleanUpAfterStackFault()
{
  if (!m_cleanup_after_stackfault)
    return;

  ClearCache();
  m_cleanup_after_stackfault = false;

#ifdef _WIN32
  // The stack is in an invalid state with no guard page, reset it.
  _resetstkoflw();
#endif
}
bool JitBase::CanMergeNextInstructions(int count) const bool JitBase::CanMergeNextInstructions(int count) const
{ {
if (CPU::IsStepping() || js.instructionsLeft < count) if (CPU::IsStepping() || js.instructionsLeft < count)

View File

@ -54,6 +54,12 @@ protected:
#endif #endif
}; };
// Stack layout constants for the BLR optimization's stack-overflow trap.
// Size passed to SetThreadStackGuarantee on Windows; also the basis for GUARD_OFFSET.
static constexpr size_t SAFE_STACK_SIZE = 256 * 1024;
// Minimum distance required between the start of the guard region and the current
// stack position, in addition to GUARD_SIZE, for the optimization to stay enabled.
static constexpr size_t MIN_UNSAFE_STACK_SIZE = 192 * 1024;
// Sum of the two sizes above.
static constexpr size_t MIN_STACK_SIZE = SAFE_STACK_SIZE + MIN_UNSAFE_STACK_SIZE;
// Size of the read-protected guard region.
static constexpr size_t GUARD_SIZE = 64 * 1024;
// Offset of the guard region from the lowest stack address (before page alignment).
static constexpr size_t GUARD_OFFSET = SAFE_STACK_SIZE - GUARD_SIZE;
struct JitOptions struct JitOptions
{ {
bool enableBlocklink; bool enableBlocklink;
@ -138,8 +144,17 @@ protected:
bool m_pause_on_panic_enabled = false; bool m_pause_on_panic_enabled = false;
bool m_accurate_cpu_cache_enabled = false; bool m_accurate_cpu_cache_enabled = false;
bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack_guard = nullptr;
void RefreshConfig(); void RefreshConfig();
void InitBLROptimization();
void ProtectStack();
void UnprotectStack();
void CleanUpAfterStackFault();
bool CanMergeNextInstructions(int count) const; bool CanMergeNextInstructions(int count) const;
void UpdateMemoryAndExceptionOptions(); void UpdateMemoryAndExceptionOptions();
@ -160,7 +175,7 @@ public:
virtual const CommonAsmRoutinesBase* GetAsmRoutines() = 0; virtual const CommonAsmRoutinesBase* GetAsmRoutines() = 0;
virtual bool HandleFault(uintptr_t access_address, SContext* ctx) = 0; virtual bool HandleFault(uintptr_t access_address, SContext* ctx) = 0;
virtual bool HandleStackFault() { return false; } bool HandleStackFault();
static constexpr std::size_t code_buffer_size = 32000; static constexpr std::size_t code_buffer_size = 32000;