diff --git a/.gitignore b/.gitignore index 34791bda2..27fe839a6 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,5 @@ node_modules/.bin/ /tools/shader-playground/*.dll /profile_print_times.py /profile_times.txt +/cache1 +/cache0 diff --git a/src/xenia/base/console_win.cc b/src/xenia/base/console_win.cc index 2549a46ed..612e195fc 100644 --- a/src/xenia/base/console_win.cc +++ b/src/xenia/base/console_win.cc @@ -35,13 +35,15 @@ static bool has_shell_environment_variable() { } void AttachConsole() { - bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE; + +bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE; +#if 0 if (!has_console || !has_shell_environment_variable()) { // We weren't launched from a console, so just return. has_console_attached_ = false; return; } - + #endif AllocConsole(); has_console_attached_ = true; diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 7b9063084..b1ab4d82b 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -410,34 +410,7 @@ static float ArchReciprocal(float den) { return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); } -#if 0 -using ArchFloatMask = float; - -XE_FORCEINLINE -static ArchFloatMask ArchCmpneqFloatMask(float x, float y) { - return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y))); -} -XE_FORCEINLINE -static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y))); -} -XE_FORCEINLINE -static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y))); -} - -XE_FORCEINLINE -static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y))); -} - -XE_FORCEINLINE -static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { - return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x))); -} - -constexpr ArchFloatMask floatmask_zero = .0f; -#else + using ArchFloatMask = __m128; XE_FORCEINLINE @@ -464,7 +437,7 @@ static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { } constexpr ArchFloatMask floatmask_zero{.0f}; -#endif + #else static float ArchMin(float x, float y) { return std::min(x, y); } static float ArchMax(float x, float y) { return std::max(x, y); } @@ -610,17 +583,17 @@ union IDivExtraInfo { } info; }; // returns magicnum multiplier -static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { - IDivExtraInfo extra; +static constexpr uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { + IDivExtraInfo extra{}; uint32_t d = _denom; - int p; - uint32_t nc, delta, q1, r1, q2, r2; + int p=0; + uint32_t nc=0, delta=0, q1=0, r1=0, q2=0, r2=0; struct { unsigned M; int a; int s; - } magu; + } magu{}; magu.a = 0; nc = -1 - ((uint32_t) - (int32_t)d) % d; p = 31; @@ -660,13 +633,13 @@ static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { return static_cast<uint32_t>(q2 + 1); } -static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, +static constexpr uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, uint32_t extradata) { - IDivExtraInfo extra; + IDivExtraInfo extra{}; extra.value_ = extradata; - uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32; + uint32_t result = static_cast<uint32_t>((static_cast<uint64_t>(num) * static_cast<uint64_t>(mul)) >> 32); if (extra.info.add_) { uint32_t addend = result + num; addend = ((addend < result ?
0x80000000 : 0) | addend); @@ -675,7 +648,7 @@ static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, return result >> extra.info.shift_; } -static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, +static constexpr uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, uint32_t extradata, uint32_t original) { uint32_t dived = ApplyUint32Div(num, mul, extradata); unsigned result = num - (dived * original); @@ -686,12 +659,12 @@ static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, struct MagicDiv { uint32_t multiplier_; uint32_t extradata_; - MagicDiv() : multiplier_(0), extradata_(0) {} - MagicDiv(uint32_t original) { + constexpr MagicDiv() : multiplier_(0), extradata_(0) {} + constexpr MagicDiv(uint32_t original) : MagicDiv() { multiplier_ = PregenerateUint32Div(original, extradata_); } - uint32_t Apply(uint32_t numerator) const { + constexpr uint32_t Apply(uint32_t numerator) const { return ApplyUint32Div(numerator, multiplier_, extradata_); } }; diff --git a/src/xenia/base/memory_win.cc b/src/xenia/base/memory_win.cc index 807e3911c..580e5fd05 100644 --- a/src/xenia/base/memory_win.cc +++ b/src/xenia/base/memory_win.cc @@ -28,6 +28,9 @@ namespace xe { namespace memory { size_t page_size() { +#if XE_ARCH_AMD64 == 1 + return 4096; +#else static size_t value = 0; if (!value) { SYSTEM_INFO si; @@ -35,9 +38,13 @@ size_t page_size() { value = si.dwPageSize; } return value; +#endif } size_t allocation_granularity() { +#if XE_ARCH_AMD64 == 1 && XE_PLATFORM_WIN32 == 1 + return 65536; +#else static size_t value = 0; if (!value) { SYSTEM_INFO si; @@ -45,6 +52,7 @@ size_t allocation_granularity() { value = si.dwAllocationGranularity; } return value; +#endif } DWORD ToWin32ProtectFlags(PageAccess access) { diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index a608f04b4..d342a05b5 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -37,7 +37,7 @@ #define XE_USE_NTDLL_FUNCTIONS 1 //chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine // -#define XE_USE_KUSER_SHARED 0 +#define XE_USE_KUSER_SHARED 1 #if XE_USE_NTDLL_FUNCTIONS == 1 /* ntdll versions of functions often skip through a lot of extra garbage in diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index fce3410d7..2e167b0f8 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -67,7 +67,22 @@ class Backend { // up until the start of ctx may be used by the backend to store whatever data // they want virtual void InitializeBackendContext(void* ctx) {} + + /* + Free any dynamically allocated data/resources that the backendcontext uses + */ + virtual void DeinitializeBackendContext(void* ctx) {} virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){}; + /* + called by KeSetCurrentStackPointers in xboxkrnl_threading.cc just prior + to calling XThread::Reenter this is an opportunity for a backend to clear any + data related to the guest stack + + in the case of the X64 backend, it means we reset the stackpoint index + to 0, since its a new stack and all of our old entries are invalid now + + * */ + virtual void PrepareForReentry(void* ctx) {} protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 7b3e63222..99576ea85 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -31,7 +31,16 @@ 
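// --- Illustrative sketch, not part of the patch: how the now-constexpr MagicDiv
// helpers from src/xenia/base/math.h above can be used. MagicDiv precomputes a
// magic multiplier for a fixed divisor so the later division becomes a
// multiply-high plus shift instead of a hardware divide. The divisor 80 and the
// wrapper name below are made-up example values; only MagicDiv/Apply come from
// the patch, and the xe namespace is assumed from the surrounding header.
// #include "xenia/base/math.h"
constexpr xe::MagicDiv kDivideBy80(80u);
// Same result as value / 80u, and usable in constant expressions now that the
// constructor and Apply() are constexpr.
constexpr uint32_t FastDivBy80(uint32_t value) {
  return kDivideBy80.Apply(value);
}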
DEFINE_bool(record_mmio_access_exceptions, true, "For guest addresses records whether we caught any mmio accesses " "for them. This info can then be used on a subsequent run to " "instruct the recompiler to emit checks", - "CPU"); + "x64"); + +DEFINE_int64(max_stackpoints, 65536, + "Max number of host->guest stack mappings we can record.", "x64"); + +DEFINE_bool(enable_host_guest_stack_synchronization, true, + "Records entries for guest/host stack mappings at function starts " + "and checks for reentry at return sites. Has slight performance " + "impact, but fixes crashes in games that use setjmp/longjmp.", + "x64"); #if XE_X64_PROFILER_AVAILABLE == 1 DECLARE_bool(instrument_call_times); #endif @@ -41,15 +50,29 @@ namespace cpu { namespace backend { namespace x64 { -class X64ThunkEmitter : public X64Emitter { +class X64HelperEmitter : public X64Emitter { public: - X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator); - ~X64ThunkEmitter() override; + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + }; + X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator); + ~X64HelperEmitter() override; HostToGuestThunk EmitHostToGuestThunk(); GuestToHostThunk EmitGuestToHostThunk(); ResolveFunctionThunk EmitResolveFunctionThunk(); + void* EmitGuestAndHostSynchronizeStackHelper(); + // 1 for loading byte, 2 for halfword and 4 for word. + // these specialized versions save space in the caller + void* EmitGuestAndHostSynchronizeStackSizeLoadThunk( + void* sync_func, unsigned stack_element_size); private: + void* EmitCurrentForOffsets(const _code_offsets& offsets, + size_t stack_size = 0); // The following four functions provide save/load functionality for registers. // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been // allocated on the stack. @@ -184,11 +207,26 @@ bool X64Backend::Initialize(Processor* processor) { // Generate thunks used to transition between jitted code and host code. XbyakAllocator allocator; - X64ThunkEmitter thunk_emitter(this, &allocator); + X64HelperEmitter thunk_emitter(this, &allocator); host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk(); guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk(); resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk(); + if (cvars::enable_host_guest_stack_synchronization) { + synchronize_guest_and_host_stack_helper_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackHelper(); + + synchronize_guest_and_host_stack_helper_size8_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 1); + synchronize_guest_and_host_stack_helper_size16_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 2); + synchronize_guest_and_host_stack_helper_size32_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 4); + } + // Set the code cache to use the ResolveFunction thunk for default // indirections. 
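// --- Illustrative sketch, not part of the patch: the caller-side pattern the
// three size-load thunks created above exist for. The emitter calls the thunk
// whose inline literal is just wide enough to hold the caller's stack size and
// places that literal directly after the call, where the thunk reads it back.
// This mirrors X64Emitter::EnsureSynchronizedGuestAndHostStack() later in this
// diff; the free-function wrapper and its name are invented for illustration.
static void EmitSyncStackCall(X64Emitter& e, X64Backend* backend,
                              uint32_t stack_size) {
  if (stack_size < 256) {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1));
    e.db(stack_size);  // one-byte stack-size literal
  } else if (stack_size < 65536) {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2));
    e.dw(stack_size);  // two-byte literal
  } else {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4));
    e.dd(stack_size);  // four-byte literal
  }
}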
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); @@ -203,9 +241,10 @@ bool X64Backend::Initialize(Processor* processor) { // Setup exception callback ExceptionHandler::Install(&ExceptionCallbackThunk, this); - - processor->memory()->SetMMIOExceptionRecordingCallback( - ForwardMMIOAccessForRecording, (void*)this); + if (cvars::record_mmio_access_exceptions) { + processor->memory()->SetMMIOExceptionRecordingCallback( + ForwardMMIOAccessForRecording, (void*)this); + } #if XE_X64_PROFILER_AVAILABLE == 1 if (cvars::instrument_call_times) { @@ -509,23 +548,32 @@ bool X64Backend::ExceptionCallback(Exception* ex) { return processor()->OnThreadBreakpointHit(ex); } -X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator) +X64HelperEmitter::X64HelperEmitter(X64Backend* backend, + XbyakAllocator* allocator) : X64Emitter(backend, allocator) {} -X64ThunkEmitter::~X64ThunkEmitter() {} +X64HelperEmitter::~X64HelperEmitter() {} +void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets, + size_t stack_size) { + EmitFunctionInfo func_info = {}; + func_info.code_size.total = getSize(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = getSize() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; -HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { + void* fn = Emplace(func_info); + return fn; +} +HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() { // rcx = target // rdx = arg0 (context) // r8 = arg1 (guest return address) - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -576,19 +624,13 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { return (HostToGuestThunk)fn; } -GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { +GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() { // rcx = target function // rdx = arg0 // r8 = arg1 // r9 = arg2 - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -635,17 +677,11 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { // X64Emitter handles actually resolving functions. 
uint64_t ResolveFunction(void* raw_context, uint64_t target_address); -ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { +ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() { // ebx = target PPC address // rcx = context - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -688,8 +724,116 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { void* fn = Emplace(func_info); return (ResolveFunctionThunk)fn; } +// r11 = size of callers stack, r8 = return address w/ adjustment +//i'm not proud of this code, but it shouldn't be executed frequently at all +void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + mov(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth))); -void X64ThunkEmitter::EmitSaveVolatileRegs() { + lea(ecx, ptr[eax - 1]); + mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + + Xbyak::Label looper{}; + Xbyak::Label loopout{}; + Xbyak::Label signed_underflow{}; + xor_(r12d, r12d); + + //todo: should use Loop instruction here if hasFastLoop, + //currently xbyak does not support it but its super easy to modify xbyak to have it + L(looper); + imul(edx, ecx, sizeof(X64BackendStackpoint)); + mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]); + + cmp(r10d, r9d); + + jge(loopout, T_NEAR); + + inc(r12d); + + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + dec(ecx); + } else { + sub(ecx, 1); + } + js(signed_underflow, T_NEAR); // should be impossible!! + + + jmp(looper, T_NEAR); + L(loopout); + Xbyak::Label skip_adjust{}; + cmp(r12d, 1);//should never happen? + jle(skip_adjust, T_NEAR); + mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]); + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + inc(ecx); + } else { + add(ecx, 1); + } + + // this->DebugBreak(); + sub(rsp, r11); // adjust stack + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + ecx); // set next stackpoint index to be after the one we restored to + L(skip_adjust); + + jmp(r8); // return to caller + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + + L(signed_underflow); + //find a good, compact way to signal error here + // maybe an invalid opcode that we execute, then detect in an exception handler? 
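// --- Illustrative sketch, not part of the patch: a C-level model of what the
// code emitted by EmitGuestAndHostSynchronizeStackHelper above does at runtime,
// using the X64BackendContext/X64BackendStackpoint types this patch adds in
// x64_backend.h. guest_r1 and caller_stack_size stand in for the values the
// real helper receives in r9d and r11; the function itself is hypothetical.
static uint64_t SynchronizeStackModel(X64BackendContext* bctx, uint64_t host_rsp,
                                      uint32_t guest_r1,
                                      uint64_t caller_stack_size) {
  // Walk back from the newest stackpoint to the first one whose recorded guest
  // stack pointer is >= the current guest r1.
  uint32_t i = bctx->current_stackpoint_depth - 1;
  uint32_t frames_skipped = 0;
  while (bctx->stackpoints[i].guest_stack_ < guest_r1) {
    --i;  // the emitted code branches to an error path if this underflows
    ++frames_skipped;
  }
  if (frames_skipped > 1) {
    // Longjmp-style unwind: restore the host rsp recorded for that frame,
    // re-apply the caller's own frame allocation, and make the next stackpoint
    // push land just after the entry we restored to.
    host_rsp = bctx->stackpoints[i].host_stack_ - caller_stack_size;
    bctx->current_stackpoint_depth = i + 1;
  }
  return host_rsp;  // the real helper then jmps back to the return address in r8
}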
+ + this->DebugBreak(); + // stack unwinding, take first entry + //actually, no reason to have this + + /*mov(rsp, ptr[rbx + offsetof(X64BackendStackpoint, host_stack_)]); + mov(ptr[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r9d); + sub(rsp, r11); + xor_(eax, eax); + inc(eax); + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + eax); + + jmp(r8);*/ + // this->DebugBreak(); // err, add an xe::FatalError to call for this + + return EmitCurrentForOffsets(code_offsets); +} + +void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( + void* sync_func, unsigned stack_element_size) { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + pop(r8); // return address + + switch (stack_element_size) { + case 4: + mov(r11d, ptr[r8]); + break; + case 2: + movzx(r11d, word[r8]); + break; + case 1: + movzx(r11d, byte[r8]); + break; + } + add(r8, stack_element_size); + jmp(sync_func, T_NEAR); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} +void X64HelperEmitter::EmitSaveVolatileRegs() { // Save off volatile registers. // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); @@ -711,7 +855,7 @@ void X64ThunkEmitter::EmitSaveVolatileRegs() { vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); } -void X64ThunkEmitter::EmitLoadVolatileRegs() { +void X64HelperEmitter::EmitLoadVolatileRegs() { // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); @@ -732,7 +876,7 @@ void X64ThunkEmitter::EmitLoadVolatileRegs() { vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); } -void X64ThunkEmitter::EmitSaveNonvolatileRegs() { +void X64HelperEmitter::EmitSaveNonvolatileRegs() { mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx); mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp); #if XE_PLATFORM_WIN32 @@ -760,7 +904,7 @@ void X64ThunkEmitter::EmitSaveNonvolatileRegs() { #endif } -void X64ThunkEmitter::EmitLoadNonvolatileRegs() { +void X64HelperEmitter::EmitLoadNonvolatileRegs() { mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); #if XE_PLATFORM_WIN32 @@ -788,16 +932,41 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() { } void X64Backend::InitializeBackendContext(void* ctx) { X64BackendContext* bctx = BackendContextForGuestContext(ctx); - bctx->ResolveFunction_Ptr = reinterpret_cast(&ResolveFunction); bctx->mxcsr_fpu = DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the // rounding on ppc is at startup + + /* + todo: stackpoint arrays should be pooled virtual memory at the very + least there may be some fancy virtual address tricks we can do here + + */ + + bctx->stackpoints = cvars::enable_host_guest_stack_synchronization + ? 
new X64BackendStackpoint[cvars::max_stackpoints] + : nullptr; + bctx->current_stackpoint_depth = 0; bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR; bctx->flags = 0; // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); } +void X64Backend::DeinitializeBackendContext(void* ctx) { + X64BackendContext* bctx = BackendContextForGuestContext(ctx); + + if (bctx->stackpoints) { + delete[] bctx->stackpoints; + bctx->stackpoints = nullptr; + } +} + +void X64Backend::PrepareForReentry(void* ctx) { + X64BackendContext* bctx = BackendContextForGuestContext(ctx); + + bctx->current_stackpoint_depth = 0; +} + const uint32_t mxcsr_table[8] = { 0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80, }; diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index cb5a375ec..92ee0f7a4 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -24,7 +24,8 @@ #endif DECLARE_int64(x64_extension_mask); - +DECLARE_int64(max_stackpoints); +DECLARE_bool(enable_host_guest_stack_synchronization); namespace xe { class Exception; } // namespace xe @@ -41,14 +42,25 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); typedef void (*ResolveFunctionThunk)(); +struct X64BackendStackpoint { + uint64_t host_stack_; + unsigned guest_stack_; + // pad to 16 bytes so we never end up having a 64 bit load/store for + // host_stack_ straddling two lines. Consider this field reserved for future + // use + unsigned unused_; +}; // located prior to the ctx register // some things it would be nice to have be per-emulator instance instead of per // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { - void* ResolveFunction_Ptr; // cached pointer to resolvefunction + // guest_tick_count is used if inline_loadclock is used uint64_t* guest_tick_count; + // records mapping of host_stack to guest_stack + X64BackendStackpoint* stackpoints; + unsigned int current_stackpoint_depth; unsigned int mxcsr_fpu; // currently, the way we implement rounding mode // affects both vmx and the fpu unsigned int mxcsr_vmx; @@ -81,6 +93,19 @@ class X64Backend : public Backend { return resolve_function_thunk_; } + void* synchronize_guest_and_host_stack_helper() const { + return synchronize_guest_and_host_stack_helper_; + } + void* synchronize_guest_and_host_stack_helper_for_size(size_t sz) const { + switch (sz) { + case 1: + return synchronize_guest_and_host_stack_helper_size8_; + case 2: + return synchronize_guest_and_host_stack_helper_size16_; + default: + return synchronize_guest_and_host_stack_helper_size32_; + } + } bool Initialize(Processor* processor) override; void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override; @@ -97,7 +122,8 @@ class X64Backend : public Backend { void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override; void UninstallBreakpoint(Breakpoint* breakpoint) override; virtual void InitializeBackendContext(void* ctx) override; - + virtual void DeinitializeBackendContext(void* ctx) override; + virtual void PrepareForReentry(void* ctx) override; X64BackendContext* BackendContextForGuestContext(void* ctx) { return reinterpret_cast( reinterpret_cast(ctx) - sizeof(X64BackendContext)); @@ -120,7 +146,12 @@ class X64Backend : public 
Backend { HostToGuestThunk host_to_guest_thunk_; GuestToHostThunk guest_to_host_thunk_; ResolveFunctionThunk resolve_function_thunk_; + void* synchronize_guest_and_host_stack_helper_ = nullptr; + // loads stack sizes 1 byte, 2 bytes or 4 bytes + void* synchronize_guest_and_host_stack_helper_size8_ = nullptr; + void* synchronize_guest_and_host_stack_helper_size16_ = nullptr; + void* synchronize_guest_and_host_stack_helper_size32_ = nullptr; #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; #endif diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index bc9224ab6..463b245d0 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -213,6 +213,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { func_info.stack_size = stack_size; stack_size_ = stack_size; + PushStackpoint(); sub(rsp, (uint32_t)stack_size); code_offsets.prolog_stack_alloc = getSize(); @@ -271,6 +272,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { */ // Body. auto block = builder->first_block(); + synchronize_stack_on_next_instruction_ = false; while (block) { ForgetMxcsrMode(); // at start of block, mxcsr mode is undefined @@ -287,6 +289,12 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { // Process instructions. const Instr* instr = block->instr_head; while (instr) { + if (synchronize_stack_on_next_instruction_) { + if (instr->GetOpcodeNum() != hir::OPCODE_SOURCE_OFFSET) { + synchronize_stack_on_next_instruction_ = false; + EnsureSynchronizedGuestAndHostStack(); + } + } const Instr* new_tail = instr; if (!SelectSequence(this, instr, &new_tail)) { // No sequence found! @@ -314,6 +322,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { EmitProfilerEpilogue(); add(rsp, (uint32_t)stack_size); + PopStackpoint(); ret(); // todo: do some kind of sorting by alignment? for (auto&& tail_item : tail_code_) { @@ -453,12 +462,186 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { - auto thread_state = - reinterpret_cast(raw_context)->thread_state; + auto guest_context = reinterpret_cast(raw_context); + + auto thread_state = guest_context->thread_state; // TODO(benvanik): required? assert_not_zero(target_address); + /* + todo: refactor this! + + The purpose of this code is to allow guest longjmp to call into + the body of an existing host function. There are a lot of conditions we + have to check here to ensure that we do not mess up a normal call to a + function + + The address must be within an XexModule (may need to make some changes + to instructionaddressflags to remove this limitation) The target address + must be a known return site. The guest address must be part of a function + that was already translated. 
+ + */ + + if (cvars::enable_host_guest_stack_synchronization) { + auto processor = thread_state->processor(); + auto module_for_address = + processor->LookupModule(static_cast<uint32_t>(target_address)); + + if (module_for_address) { + XexModule* xexmod = dynamic_cast<XexModule*>(module_for_address); + if (xexmod) { + InfoCacheFlags* flags = xexmod->GetInstructionAddressFlags( + static_cast<uint32_t>(target_address)); + if (flags) { + if (flags->is_return_site) { + auto ones_with_address = processor->FindFunctionsWithAddress( + static_cast<uint32_t>(target_address)); + if (ones_with_address.size() != 0) { + // this loop to find a host address for the guest address is + // necessary because FindFunctionsWithAddress works via a range + // check, but if the function consists of multiple blocks + // scattered around with "holes" of unreachable instructions in + // between, the instructions inside those holes will incorrectly + // be considered members of the function + + X64Function* candidate = nullptr; + uintptr_t host_address = 0; + for (auto&& entry : ones_with_address) { + X64Function* xfunc = static_cast<X64Function*>(entry); + + host_address = xfunc->MapGuestAddressToMachineCode( + static_cast<uint32_t>(target_address)); + // the host address does exist within the function, and it is + // not the start of the function; it is instead somewhere within + // its existing body + // i originally did not have this (xfunc->machine_code() != + // reinterpret_cast<const uint8_t*>(host_address)) condition + // here when i distributed builds for testing, no issues arose + // related to it but i wanted to be more explicit + if (host_address && + xfunc->machine_code() != + reinterpret_cast<const uint8_t*>(host_address)) { + candidate = xfunc; + break; + } + } + // we found an existing X64Function, and a return site within that + // function that has a host address w/ native code + if (candidate && host_address) { + X64Backend* backend = + static_cast<X64Backend*>(processor->backend()); + // grab the backend context; next we have to check whether the + // guest and host stack are out of sync. if they aren't, it's fine + // for the backend to create a new function for the guest + // address we're resolving. if they are, it means that the reason + // we're resolving this address is because context is being + // restored (probably by longjmp) + X64BackendContext* backend_context = + backend->BackendContextForGuestContext(guest_context); + + uint32_t current_stackpoint_index = + backend_context->current_stackpoint_depth; + + --current_stackpoint_index; + + X64BackendStackpoint* stackpoints = + backend_context->stackpoints; + + uint32_t current_guest_stackpointer = + static_cast<uint32_t>(guest_context->r[1]); + uint32_t num_frames_bigger = 0; + + /* + if the current guest stack pointer is bigger than the + recorded pointer for this stack that's fine, plenty of + functions restore the original stack pointer early + + if more than 1 is bigger... we're longjmping and sure of it at + this point (jumping to a return site that has already been + emitted) + */ + while (current_stackpoint_index != 0xFFFFFFFF) { + if (current_guest_stackpointer > + stackpoints[current_stackpoint_index].guest_stack_) { + --current_stackpoint_index; + ++num_frames_bigger; + + } else { + break; + } + } + /* + DEFINITELY a longjmp, return original + host address. 
returning the existing host address is going to + set off some extra machinery we have set up to support this + + to break it down, our caller (us being + this ResolveFunction that this comment is in) is + X64Backend::resolve_function_thunk_ which is implemented in + x64_backend.cc X64HelperEmitter::EmitResolveFunctionThunk, or + a call from the resolver table + + the x64 fastcall abi dictates that the + stack must always be 16 byte aligned. We select our stack + size for functions to ensure that we keep rsp aligned to 16 + bytes + + but by calling into the body of an + existing function we've pushed our return address onto the + stack (dont worry about this return address, it gets + discarded in a later step) + + this means that the stack is no longer + 16 byte aligned, (rsp % 16) now == 8, and this is the only + time outside of the prolog or epilog of a function that this + will be the case + + so, after all direct or indirect + function calls we set + X64Emitter::synchronize_stack_on_next_instruction_ to true. + On the next instruction that is not + OPCODE_SOURCE_OFFSET we will emit a check when we see + synchronize_stack_on_next_instruction_ is true. We have to + skip OPCODE_SOURCE_OFFSET because its not a "real" + instruction and if we emit on it the return address of the + function call will point to AFTER our check, so itll never be + executed. + + our check is just going to do test esp, + 15 to see if the stack is misaligned. (using esp instead of + rsp saves 1 byte). We tail emit the handling for when the + check succeeds because in 99.99999% of function calls it will + be aligned, in the end the runtime cost of these checks is 5 + bytes for the test instruction which ought to be one cycle + and 5 bytes for the jmp with no cycles taken for the jump + which will be predicted not taken. + + Our handling for the check is implemented in X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper. we don't call it directly though, + instead we go through backend()->synchronize_guest_and_host_stack_helper_for_size(num_bytes_needed_to_represent_stack_size). we place the stack size after the + call instruction so we can load it in the helper and readjust the return address to point after the literal value. + + The helper is going to search the array of stackpoints to find the first one that is greater than or equal to the current stack pointer, when it finds + the entry it will set the currently host rsp to the host stack pointer value in the entry, and then subtract the stack size of the caller from that. + the current stackpoint index is adjusted to point to the one after the stackpoint we restored to. + + The helper then jumps back to the function that was longjmp'ed to, with the host stack in its proper state. it just works! 
+ + + + */ + + if (num_frames_bigger > 1) { + return host_address; + } + } + } + } + } + } + } + } auto fn = thread_state->processor()->ResolveFunction( static_cast(target_address)); assert_not_null(fn); @@ -479,7 +662,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call((void*)fn->machine_code()); - + synchronize_stack_on_next_instruction_ = true; } else { // tail call EmitTraceUserCallReturn(); @@ -488,8 +671,10 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp((void*)fn->machine_code(), T_NEAR); } + return; } else if (code_cache_->has_indirection_table()) { // Load the pointer to the indirection table maintained in X64CodeCache. @@ -513,12 +698,14 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp(rax); } else { // Return address is from the previous SET_RETURN_ADDRESS. mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call(rax); + synchronize_stack_on_next_instruction_ = true; } } @@ -557,12 +744,14 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp(rax); } else { // Return address is from the previous SET_RETURN_ADDRESS. mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call(rax); + synchronize_stack_on_next_instruction_ = true; } } @@ -1458,6 +1647,126 @@ Xbyak::Address X64Emitter::GetBackendFlagsPtr() const { pt.setBit(32); return pt; } + +void X64Emitter::HandleStackpointOverflowError(ppc::PPCContext* context) { + // context->lr + // todo: show lr in message? + xe::FatalError( + "Overflowed stackpoints! 
Please report this error for this title to " + "Xenia developers."); +} + +void X64Emitter::PushStackpoint() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // push the current host and guest stack pointers + // this is done before a stack frame is set up or any guest instructions are + // executed this code is probably the most intrusive part of the stackpoint + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + mov(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth))); + + mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + + imul(r9d, eax, sizeof(X64BackendStackpoint)); + add(rbx, r9); + + mov(qword[rbx + offsetof(X64BackendStackpoint, host_stack_)], rsp); + mov(dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r8d); + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + inc(eax); + } else { + add(eax, 1); + } + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + eax); + + cmp(eax, (uint32_t)cvars::max_stackpoints); + + Xbyak::Label& overflowed_stackpoints = + AddToTail([](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + // we never subtracted anything from rsp, so our stack is misaligned and + // will fault in guesttohostthunk + // e.sub(e.rsp, 8); + e.push(e.rax); // easier realign, 1 byte opcode vs 4 bytes for sub + + e.CallNativeSafe((void*)X64Emitter::HandleStackpointOverflowError); + }); + jge(overflowed_stackpoints, T_NEAR); +} +void X64Emitter::PopStackpoint() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // todo: maybe verify that rsp and r1 == the stackpoint? + Xbyak::Address stackpoint_pos_pointer = + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)); + stackpoint_pos_pointer.setBit(32); + dec(stackpoint_pos_pointer); +} + +void X64Emitter::EnsureSynchronizedGuestAndHostStack() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // chrispy: keeping this old slower test here in case in the future changes + // need to be made + // that result in the stack not being 8 byte misaligned on context reentry + +#if 0 + Xbyak::Label skip{}; + mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + imul(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + sizeof(X64BackendStackpoint)); + sub(eax, sizeof(X64BackendStackpoint)); + add(rbx, rax); + + cmp(r8d, dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)]); + jle(skip, T_NEAR); + Xbyak::Label skip{}; + mov(r11d, stack_size()); + call(backend_->synchronize_guest_and_host_stack_helper()); + L(skip); +#endif + + Xbyak::Label& return_from_sync = this->NewCachedLabel(); + + // if we got here somehow from setjmp or the like we ought to have a + // misaligned stack right now! this provides us with a very fast pretest for + // this condition + test(esp, 15); + + Xbyak::Label& sync_label = this->AddToTail( + [&return_from_sync](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + uint32_t stack32 = static_cast(e.stack_size()); + auto backend = e.backend(); + + if (stack32 < 256) { + e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1)); + e.db(stack32); + + } else if (stack32 < 65536) { + e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2)); + e.dw(stack32); + } else { + // ought to be impossible, a host stack bigger than 65536?? 
+ e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4)); + e.dd(stack32); + } + e.jmp(return_from_sync, T_NEAR); + }); + + jnz(sync_label, T_NEAR); + + L(return_from_sync); +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 155994bf9..4fdeab4a4 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -299,6 +299,11 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0); Xbyak::Label& NewCachedLabel(); + + void PushStackpoint(); + void PopStackpoint(); + + void EnsureSynchronizedGuestAndHostStack(); FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } @@ -381,13 +386,14 @@ class X64Emitter : public Xbyak::CodeGenerator { bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info); void EmitGetCurrentThreadId(); void EmitTraceUserCallReturn(); - + static void HandleStackpointOverflowError(ppc::PPCContext* context); protected: Processor* processor_ = nullptr; X64Backend* backend_ = nullptr; X64CodeCache* code_cache_ = nullptr; XbyakAllocator* allocator_ = nullptr; XexModule* guest_module_ = nullptr; + bool synchronize_stack_on_next_instruction_ = false; Xbyak::util::Cpu cpu_; uint64_t feature_flags_ = 0; uint32_t current_guest_function_ = 0; diff --git a/src/xenia/cpu/entry_table.cc b/src/xenia/cpu/entry_table.cc index 840706171..4b9181be7 100644 --- a/src/xenia/cpu/entry_table.cc +++ b/src/xenia/cpu/entry_table.cc @@ -56,6 +56,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) { if (entry) { // If we aren't ready yet spin and wait. if (entry->status == Entry::STATUS_COMPILING) { + // chrispy: i think this is dead code, if we are compiling we're holding + // the global lock, arent we? so we wouldnt be executing here // Still compiling, so spin. do { global_lock.unlock(); diff --git a/src/xenia/cpu/function.cc b/src/xenia/cpu/function.cc index ebd8c5ba1..828c5f94e 100644 --- a/src/xenia/cpu/function.cc +++ b/src/xenia/cpu/function.cc @@ -110,8 +110,13 @@ uint32_t GuestFunction::MapGuestAddressToMachineCodeOffset( uintptr_t GuestFunction::MapGuestAddressToMachineCode( uint32_t guest_address) const { auto entry = LookupGuestAddress(guest_address); - return reinterpret_cast(machine_code()) + - (entry ? entry->code_offset : 0); + + if (entry) { + return reinterpret_cast(machine_code()) + entry->code_offset; + } else { + return 0; + + } } uint32_t GuestFunction::MapMachineCodeToGuestAddress( diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 867651c32..b36c36e68 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -27,18 +27,13 @@ #include "xenia/cpu/ppc/ppc_frontend.h" #include "xenia/cpu/ppc/ppc_opcode_info.h" #include "xenia/cpu/processor.h" - +#include "xenia/cpu/xex_module.h" DEFINE_bool( break_on_unimplemented_instructions, true, "Break to the host debugger (or crash if no debugger attached) if an " "unimplemented PowerPC instruction is encountered.", "CPU"); -DEFINE_bool( - emit_useless_fpscr_updates, false, - "Emit useless fpscr update instructions (pre-10/30/2022 behavior). 
", - "CPU"); - namespace xe { namespace cpu { namespace ppc { @@ -94,8 +89,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) { function_ = function; start_address_ = function_->address(); - //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice - //i've also seen ones with a start and end address that are the same... + // chrispy: i've seen this one happen, not sure why but i think from trying to + // precompile twice i've also seen ones with a start and end address that are + // the same... assert_true(function_->address() <= function_->end_address()); instr_count_ = (function_->end_address() - function_->address()) / 4 + 1; @@ -250,7 +246,8 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) { } void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) { - //chrispy: label->name is unused, it would be nice to be able to remove the field and this code + // chrispy: label->name is unused, it would be nice to be able to remove the + // field and this code char name_buffer[13]; auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address); name_buffer[format_result.size] = '\0'; @@ -457,37 +454,38 @@ void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) { // TODO(benvanik): detect overflow and nan cases. // fx and vx are the most important. /* - chrispy: stubbed this out because right now all it does is waste - memory and CPU time + chrispy: i stubbed this out at one point because all it does is waste + memory and CPU time, however, this introduced issues with raiden + (substitute w/ titleid later) which probably means they stash stuff in the + fpscr? + */ - if (cvars::emit_useless_fpscr_updates) { - Value* fx = LoadConstantInt8(0); - Value* fex = LoadConstantInt8(0); - Value* vx = LoadConstantInt8(0); - Value* ox = LoadConstantInt8(0); - if (update_cr1) { - // Store into the CR1 field. - // We do this instead of just calling CopyFPSCRToCR1 so that we don't - // have to read back the bits and do shifting work. - StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); - StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); - StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); - StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); - } + Value* fx = LoadConstantInt8(0); + Value* fex = LoadConstantInt8(0); + Value* vx = LoadConstantInt8(0); + Value* ox = LoadConstantInt8(0); - // Generate our new bits. - Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); - new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); - new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); - new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); - - // Mix into fpscr while preserving sticky bits (FX and OX). - Value* bits = LoadFPSCR(); - bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); - StoreFPSCR(bits); + if (update_cr1) { + // Store into the CR1 field. + // We do this instead of just calling CopyFPSCRToCR1 so that we don't + // have to read back the bits and do shifting work. + StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); + StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); + StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); + StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); } + // Generate our new bits. 
+ Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); + new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); + new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); + new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); + + // Mix into fpscr while preserving sticky bits (FX and OX). + Value* bits = LoadFPSCR(); + bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); + StoreFPSCR(bits); } void PPCHIRBuilder::CopyFPSCRToCR1() { @@ -587,7 +585,24 @@ void PPCHIRBuilder::StoreReserved(Value* val) { Value* PPCHIRBuilder::LoadReserved() { return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE); } +void PPCHIRBuilder::SetReturnAddress(Value* value) { + /* + Record the address as being a possible target of a return. This is + needed for longjmp emulation. See x64_emitter.cc's ResolveFunction + */ + Module* mod = this->function_->module(); + if (value && value->IsConstant()) { + if (mod) { + XexModule* xexmod = dynamic_cast(mod); + if (xexmod) { + auto flags = xexmod->GetInstructionAddressFlags(value->AsUint32()); + flags->is_return_site = true; + } + } + } + HIRBuilder::SetReturnAddress(value); +} } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.h b/src/xenia/cpu/ppc/ppc_hir_builder.h index a7eb6fc4a..ad99b63d8 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.h +++ b/src/xenia/cpu/ppc/ppc_hir_builder.h @@ -80,7 +80,8 @@ class PPCHIRBuilder : public hir::HIRBuilder { void StoreReserved(Value* val); Value* LoadReserved(); - + //calls original impl in hirbuilder, but also records the is_return_site bit into flags in the guestmodule + void SetReturnAddress(Value* value); private: void MaybeBreakOnInstruction(uint32_t address); void AnnotateLabel(uint32_t address, Label* label); diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index aa9f83013..cb6105464 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -263,12 +263,11 @@ Function* Processor::ResolveFunction(uint32_t address) { return nullptr; } - if (!DemandFunction(function)) { entry->status = Entry::STATUS_FAILED; return nullptr; } - //only add it to the list of resolved functions if resolving succeeded + //only add it to the list of resolved functions if resolving succeeded auto module_for = function->module(); auto xexmod = dynamic_cast(module_for); @@ -291,23 +290,23 @@ Function* Processor::ResolveFunction(uint32_t address) { return nullptr; } } - +Module* Processor::LookupModule(uint32_t address) { + auto global_lock = global_critical_region_.Acquire(); + // TODO(benvanik): sort by code address (if contiguous) so can bsearch. + // TODO(benvanik): cache last module low/high, as likely to be in there. + for (const auto& module : modules_) { + if (module->ContainsAddress(address)) { + return module.get(); + } + } + return nullptr; +} Function* Processor::LookupFunction(uint32_t address) { // TODO(benvanik): fast reject invalid addresses/log errors. // Find the module that contains the address. - Module* code_module = nullptr; - { - auto global_lock = global_critical_region_.Acquire(); - // TODO(benvanik): sort by code address (if contiguous) so can bsearch. - // TODO(benvanik): cache last module low/high, as likely to be in there. - for (const auto& module : modules_) { - if (module->ContainsAddress(address)) { - code_module = module.get(); - break; - } - } - } + Module* code_module = LookupModule(address); + if (!code_module) { // No module found that could contain the address. 
return nullptr; diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index eaa958d3d..630cf4633 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -115,6 +115,7 @@ class Processor { void RemoveFunctionByAddress(uint32_t address); Function* LookupFunction(uint32_t address); + Module* LookupModule(uint32_t address); Function* LookupFunction(Module* module, uint32_t address); Function* ResolveFunction(uint32_t address); diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index fe9467dd8..08338a0a0 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -78,7 +78,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, // Allocate with 64b alignment. context_ = reinterpret_cast( - AllocateContext()); + AllocateContext()); processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -105,9 +105,9 @@ ThreadState::~ThreadState() { thread_state_ = nullptr; } if (context_) { + processor_->backend()->DeinitializeBackendContext(context_); FreeContext(reinterpret_cast(context_)); } - // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) { diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 449827cc5..4e997579f 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -38,9 +38,10 @@ DEFINE_bool(disable_instruction_infocache, false, "CPU"); DEFINE_bool( - disable_early_precompilation, false, - "Disables pre-compiling guest functions that we know we've called/that " - "we've recognized as being functions via simple heuristics.", + enable_early_precompilation, false, + "Enable pre-compiling guest functions that we know we've called/that " + "we've recognized as being functions via simple heuristics, good for error " + "finding/stress testing with the JIT", "CPU"); static const uint8_t xe_xex2_retail_key[16] = { @@ -1115,6 +1116,7 @@ void XexModule::Precompile() { if (!FindSaveRest()) { return; } + info_cache_.Init(this); PrecompileDiscoveredFunctions(); } @@ -1343,22 +1345,26 @@ void XexInfoCache::Init(XexModule* xexmod) { num_codebytes += 3; // round up to nearest multiple of 4 num_codebytes &= ~3; - bool did_exist = true; - if (!std::filesystem::exists(infocache_path)) { - recreate: - xe::filesystem::CreateEmptyFile(infocache_path); - did_exist = false; - } + auto try_open = [this, &infocache_path, num_codebytes]() { + bool did_exist = true; - // todo: prepopulate with stuff from pdata, dll exports + if (!std::filesystem::exists(infocache_path)) { + xe::filesystem::CreateEmptyFile(infocache_path); + did_exist = false; + } - this->executable_addr_flags_ = std::move(xe::MappedMemory::Open( - infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, - sizeof(InfoCacheFlagsHeader) + - (sizeof(InfoCacheFlags) * - (num_codebytes / - 4)))); // one infocacheflags entry for each PPC instr-sized addr + // todo: prepopulate with stuff from pdata, dll exports + this->executable_addr_flags_ = std::move(xe::MappedMemory::Open( + infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, + sizeof(InfoCacheFlagsHeader) + + (sizeof(InfoCacheFlags) * + (num_codebytes / + 4)))); // one infocacheflags entry for each PPC instr-sized addr + return did_exist; + }; + + bool did_exist = try_open(); if (!did_exist) { GetHeader()->version = CURRENT_INFOCACHE_VERSION; @@ -1366,7 +1372,7 @@ void XexInfoCache::Init(XexModule* xexmod) { if (GetHeader()->version != 
CURRENT_INFOCACHE_VERSION) { this->executable_addr_flags_->Close(); std::filesystem::remove(infocache_path); - goto recreate; + try_open(); } } } @@ -1380,7 +1386,7 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) { return info_cache_.LookupFlags(guest_addr); } void XexModule::PrecompileDiscoveredFunctions() { - if (cvars::disable_early_precompilation) { + if (!cvars::enable_early_precompilation) { return; } auto others = PreanalyzeCode(); @@ -1397,7 +1403,7 @@ void XexModule::PrecompileDiscoveredFunctions() { } } void XexModule::PrecompileKnownFunctions() { - if (cvars::disable_early_precompilation) { + if (!cvars::enable_early_precompilation) { return; } uint32_t start = 0; @@ -1435,18 +1441,14 @@ static bool IsOpcodeBL(unsigned w) { std::vector XexModule::PreanalyzeCode() { uint32_t low_8_aligned = xe::align(low_address_, 8); - - uint32_t highest_exec_addr = 0; for (auto&& sec : pe_sections_) { if ((sec.flags & kXEPESectionContainsCode)) { - - - highest_exec_addr = + highest_exec_addr = std::max(highest_exec_addr, sec.address + sec.size); - } + } } uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1); uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8; @@ -1476,7 +1478,7 @@ std::vector XexModule::PreanalyzeCode() { uint32_t mfspr_r12_lr32 = *reinterpret_cast(&mfspr_r12_lr[0]); - auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) { + auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) { funcstart_candidate_stack[stack_pos++] = addr; }; /* @@ -1926,7 +1928,7 @@ bool XexModule::FindSaveRest() { address += 2 * 4; } } - if (!cvars::disable_early_precompilation) { + if (cvars::enable_early_precompilation) { for (auto&& to_ensure_precompiled : resolve_on_exit) { // we want to make sure an address for these functions is available before // any other functions are compiled for code generation purposes but we do diff --git a/src/xenia/cpu/xex_module.h b/src/xenia/cpu/xex_module.h index bec6b7e0f..ded57c6f8 100644 --- a/src/xenia/cpu/xex_module.h +++ b/src/xenia/cpu/xex_module.h @@ -29,23 +29,27 @@ constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1"); constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2"); constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F'); - class Runtime; struct InfoCacheFlags { uint32_t was_resolved : 1; // has this address ever been called/requested // via resolvefunction? 
uint32_t accessed_mmio : 1; uint32_t is_syscall_func : 1; - uint32_t reserved : 29; + uint32_t is_return_site : 1; // address can be reached from another function + // by returning + uint32_t reserved : 28; }; +static_assert(sizeof(InfoCacheFlags) == 4, + "InfoCacheFlags size should be equal to sizeof ppc instruction."); + struct XexInfoCache { - //increment this to invalidate all user infocaches - static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 1; + // increment this to invalidate all user infocaches + static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 4; struct InfoCacheFlagsHeader { uint32_t version; - unsigned char reserved[252]; + unsigned char reserved[252]; InfoCacheFlags* LookupFlags(unsigned offset) { return &reinterpret_cast(&this[1])[offset]; @@ -228,7 +232,8 @@ class XexModule : public xe::cpu::Module { InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); - virtual void Precompile() override; + virtual void Precompile() override; + protected: std::unique_ptr CreateFunction(uint32_t address) override; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 62191477e..c99afd595 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1911,21 +1911,8 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { - RingBuffer::ReadRange range = - ring->BeginRead(num_registers * sizeof(uint32_t)); - - XE_LIKELY_IF(!range.second) { - uint32_t num_regs_firstrange = - static_cast(range.first_length / sizeof(uint32_t)); - - D3D12CommandProcessor::WriteRegistersFromMem( - base, reinterpret_cast(const_cast(range.first)), - num_regs_firstrange); - ring->EndRead(range); - } - else { - return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers); - } + WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, + num_registers); } template @@ -2042,7 +2029,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound( RingBuffer::ReadRange range = ring->BeginRead(num_registers * sizeof(uint32_t)); - XE_LIKELY_IF(!range.second) { WriteRegisterRangeFromMem_WithKnownBound( @@ -2710,9 +2696,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } if (vfetch_current_queued) { - // so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though - // pre-acquire the critical region so we're not repeatedly re-acquiring it - // in requestrange + // so far, i have never seen vfetch_current_queued > 4. 1 is most common, + // 2 happens occasionally. 
did not test many games though pre-acquire the + // critical region so we're not repeatedly re-acquiring it in requestrange auto shared_memory_request_range_hoisted = global_critical_region::Acquire(); @@ -4351,7 +4337,8 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); + float_constant_map_entry = + xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + (float_constant_index << 2)] @@ -4382,7 +4369,8 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); + float_constant_map_entry = + xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + (float_constant_index << 2)] diff --git a/src/xenia/kernel/xam/xam_input.cc b/src/xenia/kernel/xam/xam_input.cc index 242ee1cfa..f9d9fa40f 100644 --- a/src/xenia/kernel/xam/xam_input.cc +++ b/src/xenia/kernel/xam/xam_input.cc @@ -41,10 +41,23 @@ DECLARE_XAM_EXPORT1(XamEnableInactivityProcessing, kInput, kStub); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetcapabilities(v=vs.85).aspx dword_result_t XamInputGetCapabilities_entry( - dword_t user_index, dword_t flags, pointer_t caps) { + dword_t user_index, dword_t _flags, pointer_t caps) { + unsigned flags = _flags; + //chrispy: actually, it appears that caps is never checked for null, it is memset at the start regardless if (!caps) { return X_ERROR_BAD_ARGUMENTS; } + if ((flags & 0x40000000) != 0) { + //should trap + } + + if ((flags & 4) != 0) { + //should trap + } + if (!flags) { + flags = 3; + } + if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) { // Ignore any query for other types of devices. 
@@ -118,7 +131,7 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags, DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx -dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk, +dword_result_t XamInputSetState_entry(dword_t user_index, dword_t flags, /* flags, as far as i can see, is not used*/ pointer_t vibration) { if (user_index >= 4) { return X_E_DEVICE_NOT_CONNECTED; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc index fefe2df4e..928ec780f 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc @@ -508,7 +508,16 @@ dword_result_t RtlInitializeCriticalSectionAndSpinCount_entry( DECLARE_XBOXKRNL_EXPORT1(RtlInitializeCriticalSectionAndSpinCount, kNone, kImplemented); +static void CriticalSectionPrefetchW(const void* vp) { +#if XE_ARCH_AMD64 == 1 + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(vp); + } +#endif +} + void RtlEnterCriticalSection_entry(pointer_t cs) { + CriticalSectionPrefetchW(&cs->lock_count); uint32_t cur_thread = XThread::GetCurrentThread()->guest_object(); uint32_t spin_count = cs->header.absolute * 256; @@ -544,6 +553,7 @@ DECLARE_XBOXKRNL_EXPORT2(RtlEnterCriticalSection, kNone, kImplemented, dword_result_t RtlTryEnterCriticalSection_entry( pointer_t cs) { + CriticalSectionPrefetchW(&cs->lock_count); uint32_t thread = XThread::GetCurrentThread()->guest_object(); if (xe::atomic_cas(-1, 0, &cs->lock_count)) { diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 574a91585..14179939e 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -7,6 +7,7 @@ ****************************************************************************** */ +#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h" #include #include #include "xenia/base/atomic.h" @@ -18,7 +19,6 @@ #include "xenia/kernel/user_module.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h" -#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h" #include "xenia/kernel/xevent.h" #include "xenia/kernel/xmutant.h" #include "xenia/kernel/xsemaphore.h" @@ -165,8 +165,16 @@ dword_result_t NtResumeThread_entry(dword_t handle, uint32_t suspend_count = 0; auto thread = kernel_state()->object_table()->LookupObject(handle); + if (thread) { - result = thread->Resume(&suspend_count); + if (thread->type() == XObject::Type::Thread) { + result = thread->Resume(&suspend_count); + + } else { + return X_STATUS_OBJECT_TYPE_MISMATCH; + } + } else { + return X_STATUS_INVALID_HANDLE; } if (suspend_count_ptr) { *suspend_count_ptr = suspend_count; @@ -190,15 +198,27 @@ dword_result_t KeResumeThread_entry(lpvoid_t thread_ptr) { DECLARE_XBOXKRNL_EXPORT1(KeResumeThread, kThreading, kImplemented); dword_result_t NtSuspendThread_entry(dword_t handle, - lpdword_t suspend_count_ptr) { + lpdword_t suspend_count_ptr, + const ppc_context_t& context) { X_RESULT result = X_STATUS_SUCCESS; uint32_t suspend_count = 0; auto thread = kernel_state()->object_table()->LookupObject(handle); if (thread) { - result = thread->Suspend(&suspend_count); + if (thread->type() == XObject::Type::Thread) { + auto current_pcr = context->TranslateVirtualGPR(context->r[13]); + + if 
(current_pcr->current_thread == thread->guest_object() || + !thread->guest_object()->terminated) { + result = thread->Suspend(&suspend_count); + } else { + return X_STATUS_THREAD_IS_TERMINATING; + } + } else { + return X_STATUS_OBJECT_TYPE_MISMATCH; + } } else { - result = X_STATUS_INVALID_HANDLE; + return X_STATUS_INVALID_HANDLE; } if (suspend_count_ptr) { @@ -213,23 +233,23 @@ void KeSetCurrentStackPointers_entry(lpvoid_t stack_ptr, pointer_t thread, lpvoid_t stack_alloc_base, lpvoid_t stack_base, - lpvoid_t stack_limit) { + lpvoid_t stack_limit, const ppc_context_t& context) { auto current_thread = XThread::GetCurrentThread(); - auto context = current_thread->thread_state()->context(); - auto pcr = kernel_memory()->TranslateVirtual( - static_cast(context->r[13])); + auto pcr = context->TranslateVirtualGPR(context->r[13]); + thread->stack_alloc_base = stack_alloc_base.value(); thread->stack_base = stack_base.value(); thread->stack_limit = stack_limit.value(); pcr->stack_base_ptr = stack_base.guest_address(); pcr->stack_end_ptr = stack_limit.guest_address(); context->r[1] = stack_ptr.guest_address(); - + // If a fiber is set, and the thread matches, reenter to avoid issues with // host stack overflowing. if (thread->fiber_ptr && current_thread->guest_object() == thread.guest_address()) { + context->processor->backend()->PrepareForReentry(context.value()); current_thread->Reenter(static_cast(context->lr)); } } @@ -1018,7 +1038,8 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - while (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { + while (!xe::atomic_cas( + 0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great // either... 
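Reviewer note (not part of the patch): the spin-lock hunks above and below operate on a guest lock word that, as the surrounding code suggests, holds 0 when free and the byte-swapped PCR pointer (r13) of the owner while held. A minimal host-side sketch of that acquire pattern, using std::atomic in place of xe::atomic_cas purely for illustration:

#include <atomic>
#include <cstdint>

// Guest memory is big-endian, so the owner value is stored byte-swapped.
static uint32_t ByteSwap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

// Spin until the free value (0) can be exchanged for this CPU's PCR address.
static void AcquireGuestSpinLock(std::atomic<uint32_t>* lock_word,
                                 uint32_t r13_pcr) {
  const uint32_t owner_be = ByteSwap32(r13_pcr);
  uint32_t expected = 0;
  while (!lock_word->compare_exchange_weak(expected, owner_be,
                                           std::memory_order_acquire)) {
    expected = 0;  // compare_exchange_weak replaced it with the current owner
  }
}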
@@ -1038,7 +1059,8 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( auto lock = reinterpret_cast(lock_ptr.host_address()); assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - if (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { + if (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), + lock)) { return 0; } return 1; @@ -1281,7 +1303,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr, const ppc_context_t& ppc_context) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count) { @@ -1318,7 +1341,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading, void ExAcquireReadWriteLockShared_entry(pointer_t lock_ptr, const ppc_context_t& ppc_context) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count || diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index 6d548220a..df5991d09 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -33,8 +33,15 @@ DEFINE_bool(ignore_thread_priorities, true, DEFINE_bool(ignore_thread_affinities, true, "Ignores game-specified thread affinities.", "Kernel"); + +#if 0 +DEFINE_int64(stack_size_multiplier_hack, 1, + "A hack for games with setjmp/longjmp issues.", "Kernel"); +DEFINE_int64(main_xthread_stack_size_multiplier_hack, 1, + "A hack for games with setjmp/longjmp issues.", "Kernel"); +#endif namespace xe { -namespace kernel { + namespace kernel { const uint32_t XAPC::kSize; const uint32_t XAPC::kDummyKernelRoutine; @@ -373,8 +380,23 @@ X_STATUS XThread::Create() { RetainHandle(); xe::threading::Thread::CreationParameters params; - params.stack_size = 16_MiB; // Allocate a big host stack. + + + params.create_suspended = true; + + #if 0 + uint64_t stack_size_mult = cvars::stack_size_multiplier_hack; + + if (main_thread_) { + stack_size_mult = + static_cast(cvars::main_xthread_stack_size_multiplier_hack); + + } + #else + uint64_t stack_size_mult = 1; + #endif + params.stack_size = 16_MiB * stack_size_mult; // Allocate a big host stack. thread_ = xe::threading::Thread::Create(params, [this]() { // Set thread ID override. This is used by logging. xe::threading::set_current_thread_id(handle()); @@ -433,6 +455,9 @@ X_STATUS XThread::Create() { X_STATUS XThread::Exit(int exit_code) { // This may only be called on the thread itself. assert_true(XThread::GetCurrentThread() == this); + //TODO(chrispy): not sure if this order is correct, should it come after apcs? + guest_object()->terminated = 1; + // TODO(benvanik): dispatch events? waiters? etc? 
RundownAPCs(); diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 75e790ebd..898aea006 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -121,7 +121,7 @@ struct X_KTHREAD { uint8_t unk_B4[0x8]; // 0xB4 uint8_t suspend_count; // 0xBC uint8_t unk_BD; // 0xBD - uint8_t unk_BE; // 0xBE + uint8_t terminated; // 0xBE uint8_t current_cpu; // 0xBF uint8_t unk_C0[0x10]; // 0xC0 xe::be stack_alloc_base; // 0xD0 diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index b160696f4..6384f4996 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -316,8 +316,8 @@ void Memory::Reset() { heaps_.v90000000.Reset(); heaps_.physical.Reset(); } -//clang does not like non-standard layout offsetof -#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0 +// clang does not like non-standard layout offsetof +#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0 XE_NOALIAS const BaseHeap* Memory::LookupHeap(uint32_t address) const { #define HEAP_INDEX(name) \ @@ -359,7 +359,6 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const { #else XE_NOALIAS const BaseHeap* Memory::LookupHeap(uint32_t address) const { - if (address < 0x40000000) { return &heaps_.v00000000; } else if (address < 0x7F000000) { @@ -964,6 +963,14 @@ bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size, return true; } +template +static inline T QuickMod(T value, uint32_t modv) { + if (xe::is_pow2(modv)) { + return value & (modv - 1); + } else { + return value % modv; + } +} bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, uint32_t size, uint32_t alignment, @@ -976,8 +983,9 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, low_address = std::max(heap_base_, xe::align(low_address, alignment)); high_address = std::min(heap_base_ + (heap_size_ - 1), xe::align(high_address, alignment)); - uint32_t low_page_number = (low_address - heap_base_) / page_size_; - uint32_t high_page_number = (high_address - heap_base_) / page_size_; + + uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_; + uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_; low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number); high_page_number = std::min(uint32_t(page_table_.size()) - 1, high_page_number); @@ -995,8 +1003,10 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, // TODO(benvanik): optimized searching (free list buckets, bitmap, etc). uint32_t start_page_number = UINT_MAX; uint32_t end_page_number = UINT_MAX; - uint32_t page_scan_stride = alignment / page_size_; - high_page_number = high_page_number - (high_page_number % page_scan_stride); + // chrispy:todo, page_scan_stride is probably always a power of two... 
+ uint32_t page_scan_stride = alignment >> page_size_shift_; + high_page_number = + high_page_number - QuickMod(high_page_number, page_scan_stride); if (top_down) { for (int64_t base_page_number = high_page_number - xe::round_up(page_count, page_scan_stride); @@ -1024,7 +1034,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, base_page_number = -1; } else { base_page_number = page_number - page_count; - base_page_number -= base_page_number % page_scan_stride; + base_page_number -= QuickMod(base_page_number, page_scan_stride); base_page_number += page_scan_stride; // cancel out loop logic } break; @@ -1072,7 +1082,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) { // Out of memory. XELOGE("BaseHeap::Alloc failed to find contiguous range"); - //assert_always("Heap exhausted!"); + // assert_always("Heap exhausted!"); return false; } @@ -1084,15 +1094,15 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, ? xe::memory::AllocationType::kCommit : xe::memory::AllocationType::kReserve; void* result = xe::memory::AllocFixed( - TranslateRelative(start_page_number * page_size_), - page_count * page_size_, alloc_type, ToPageAccess(protect)); + TranslateRelative(start_page_number << page_size_shift_), + page_count << page_size_shift_, alloc_type, ToPageAccess(protect)); if (!result) { XELOGE("BaseHeap::Alloc failed to alloc range from host"); return false; } if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) { - std::memset(result, 0xCD, page_count * page_size_); + std::memset(result, 0xCD, page_count << page_size_shift_); } } @@ -1108,7 +1118,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, unreserved_page_count_--; } - *out_address = heap_base_ + (start_page_number * page_size_); + *out_address = heap_base_ + (start_page_number << page_size_shift_); return true; } @@ -1719,8 +1729,7 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first); uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last); - uint32_t guest_one = - SystemPagenumToGuestPagenum(1); + uint32_t guest_one = SystemPagenumToGuestPagenum(1); uint32_t system_one = GuestPagenumToSystemPagenum(1); for (; i <= system_page_last; ++i) { @@ -1755,7 +1764,6 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( #endif uint32_t guest_page_number = SystemPagenumToGuestPagenum(i); - //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]); xe::memory::PageAccess current_page_access = ToPageAccess(page_table_ptr[guest_page_number].current_protect); bool protect_system_page = false; diff --git a/src/xenia/ui/windowed_app_main_win.cc b/src/xenia/ui/windowed_app_main_win.cc index 114d36fc0..115eb259f 100644 --- a/src/xenia/ui/windowed_app_main_win.cc +++ b/src/xenia/ui/windowed_app_main_win.cc @@ -19,11 +19,96 @@ DEFINE_bool(enable_console, false, "Open a console window with the main window", "General"); +#if XE_ARCH_AMD64 == 1 +DEFINE_bool(enable_rdrand_ntdll_patch, true, + "Hot-patches ntdll at the start of the process to not use rdrand " + "as part of the RNG for heap randomization. 
Can reduce CPU usage " + "significantly, but is untested on all Windows versions.", + "Win32"); +// begin ntdll hack +#include +static bool g_didfailtowrite = false; +static void write_process_memory(HANDLE process, uintptr_t offset, + unsigned size, const unsigned char* bvals) { + if (!WriteProcessMemory(process, (void*)offset, bvals, size, nullptr)) { + if (!g_didfailtowrite) { + MessageBoxA(nullptr, "Failed to write to process!", "Failed", MB_OK); + g_didfailtowrite = true; + } + } +} +static const unsigned char pattern_cmp_processorfeature_28_[] = { + 0x80, 0x3C, 0x25, 0x90, + 0x02, 0xFE, 0x7F, 0x00}; // cmp byte ptr ds:7FFE0290h, 0 +static const unsigned char pattern_replacement[] = { + 0x48, 0x39, 0xe4, // cmp rsp, rsp = always Z + 0x0F, 0x1F, 0x44, 0x00, 0x00 // 5byte nop +}; +static void patch_ntdll_instance(HANDLE process, uintptr_t ntdll_base) { + MODULEINFO modinfo; + + GetModuleInformation(process, (HMODULE)ntdll_base, &modinfo, + sizeof(MODULEINFO)); + + std::vector possible_places{}; + + unsigned char* strt = (unsigned char*)modinfo.lpBaseOfDll; + + for (unsigned i = 0; i < modinfo.SizeOfImage; ++i) { + for (unsigned j = 0; j < sizeof(pattern_cmp_processorfeature_28_); ++j) { + if (strt[i + j] != pattern_cmp_processorfeature_28_[j]) { + goto miss; + } + } + possible_places.push_back((uintptr_t)(&strt[i])); + miss:; + } + + for (auto&& place : possible_places) { + write_process_memory(process, place, sizeof(pattern_replacement), + pattern_replacement); + } +} + +static void do_ntdll_hack_this_process() { + patch_ntdll_instance(GetCurrentProcess(), + (uintptr_t)GetModuleHandleA("ntdll.dll")); +} +#endif +// end ntdll hack +LONG _UnhandledExceptionFilter(_EXCEPTION_POINTERS* ExceptionInfo) { + PVOID exception_addr = ExceptionInfo->ExceptionRecord->ExceptionAddress; + + DWORD64 last_stackpointer = ExceptionInfo->ContextRecord->Rsp; + + DWORD64 last_rip = ExceptionInfo->ContextRecord->Rip; + + DWORD except_code = ExceptionInfo->ExceptionRecord->ExceptionCode; + + DWORD last_error = GetLastError(); + + NTSTATUS stat = __readgsdword(0x1250); + + int last_errno_value = errno; + + + + char except_message_buf[1024]; + + sprintf_s(except_message_buf, + "Exception encountered!\nException address: %p\nStackpointer: " + "%p\nInstruction pointer: %p\nExceptionCode: 0x%X\nLast Win32 " + "Error: 0x%X\nLast NTSTATUS: 0x%X\nLast errno value: 0x%X\n", + exception_addr, (void*)last_stackpointer, (void*)last_rip, except_code, + last_error, stat, last_errno_value); + MessageBoxA(nullptr, except_message_buf, "Unhandled Exception", MB_ICONERROR); + return EXCEPTION_CONTINUE_SEARCH; +} int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, LPWSTR command_line, int show_cmd) { int result; - + SetUnhandledExceptionFilter(_UnhandledExceptionFilter); { xe::ui::Win32WindowedAppContext app_context(hinstance, show_cmd); // TODO(Triang3l): Initialize creates a window. Set DPI awareness via the @@ -40,13 +125,6 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, return EXIT_FAILURE; } - // TODO(Triang3l): Rework this, need to initialize the console properly, - // disable has_console_attached_ by default in windowed apps, and attach - // only if needed. - if (cvars::enable_console) { - xe::AttachConsole(); - } - // Initialize COM on the UI thread with the apartment-threaded concurrency // model, so dialogs can be used. 
if (FAILED(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED))) { @@ -55,8 +133,22 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, xe::InitializeWin32App(app->GetName()); - result = - app->OnInitialize() ? app_context.RunMainMessageLoop() : EXIT_FAILURE; + if (app->OnInitialize()) { +#if XE_ARCH_AMD64 == 1 + if (cvars::enable_rdrand_ntdll_patch) { + do_ntdll_hack_this_process(); + } +#endif + // TODO(Triang3l): Rework this, need to initialize the console properly, + // disable has_console_attached_ by default in windowed apps, and attach + // only if needed. + if (cvars::enable_console) { + xe::AttachConsole(); + } + result = app_context.RunMainMessageLoop(); + } else { + result = EXIT_FAILURE; + } app->InvokeOnDestroy(); } diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index 574501788..349e40886 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -61,6 +61,7 @@ typedef uint32_t X_STATUS; #define X_STATUS_OBJECT_NAME_COLLISION ((X_STATUS)0xC0000035L) #define X_STATUS_INVALID_PAGE_PROTECTION ((X_STATUS)0xC0000045L) #define X_STATUS_MUTANT_NOT_OWNED ((X_STATUS)0xC0000046L) +#define X_STATUS_THREAD_IS_TERMINATING ((X_STATUS)0xC000004BL) #define X_STATUS_PROCEDURE_NOT_FOUND ((X_STATUS)0xC000007AL) #define X_STATUS_INSUFFICIENT_RESOURCES ((X_STATUS)0xC000009AL) #define X_STATUS_MEMORY_NOT_ALLOCATED ((X_STATUS)0xC00000A0L)
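Closing note (not part of the patch): several of the memory.cc hunks above replace divisions and modulo operations on page counts with shifts and a power-of-two-aware modulo. A minimal sketch of that technique, assuming a page_size_shift_ of 12 (4 KiB guest pages); QuickModSketch mirrors the QuickMod helper added above but is written standalone for illustration:

#include <cassert>
#include <cstdint>

// Use a mask when the modulus is a power of two, fall back to % otherwise.
static uint32_t QuickModSketch(uint32_t value, uint32_t mod) {
  return (mod & (mod - 1)) == 0 ? (value & (mod - 1)) : (value % mod);
}

int main() {
  constexpr uint32_t kPageSizeShift = 12;  // assumed: 4 KiB pages
  const uint32_t heap_base = 0x40000000u;
  const uint32_t address = 0x40012345u;
  // (address - heap_base) / page_size_ becomes a right shift.
  const uint32_t page_number = (address - heap_base) >> kPageSizeShift;
  assert(page_number == 0x12u);
  assert(QuickModSketch(37u, 8u) == 5u);   // power-of-two fast path
  assert(QuickModSketch(37u, 12u) == 1u);  // generic fallback
  return 0;
}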