Merge pull request #96 from chrisps/host_guest_stack_synchronization
Host/Guest stack sync, exception messagebox, kernel improvements, minor opt
commit 0674b68143
@@ -103,3 +103,5 @@ node_modules/.bin/
/tools/shader-playground/*.dll
/profile_print_times.py
/profile_times.txt
/cache1
/cache0
@@ -35,13 +35,15 @@ static bool has_shell_environment_variable() {
}

void AttachConsole() {
  bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE;

  bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE;
#if 0
  if (!has_console || !has_shell_environment_variable()) {
    // We weren't launched from a console, so just return.
    has_console_attached_ = false;
    return;
  }

#endif
  AllocConsole();

  has_console_attached_ = true;
@@ -410,34 +410,7 @@ static float ArchReciprocal(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#if 0
using ArchFloatMask = float;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
}

constexpr ArchFloatMask floatmask_zero = .0f;
#else

using ArchFloatMask = __m128;

XE_FORCEINLINE

@@ -464,7 +437,7 @@ static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
}

constexpr ArchFloatMask floatmask_zero{.0f};
#endif

#else
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
@@ -610,17 +583,17 @@ union IDivExtraInfo {
  } info;
};
// returns magicnum multiplier
static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  IDivExtraInfo extra;
static constexpr uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  IDivExtraInfo extra{};

  uint32_t d = _denom;
  int p;
  uint32_t nc, delta, q1, r1, q2, r2;
  int p = 0;
  uint32_t nc = 0, delta = 0, q1 = 0, r1 = 0, q2 = 0, r2 = 0;
  struct {
    unsigned M;
    int a;
    int s;
  } magu;
  } magu{};
  magu.a = 0;
  nc = -1 - ((uint32_t) - (int32_t)d) % d;
  p = 31;

@@ -660,13 +633,13 @@ static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  return static_cast<uint64_t>(q2 + 1);
}

static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
static constexpr uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
                                         uint32_t extradata) {
  IDivExtraInfo extra;
  IDivExtraInfo extra{};

  extra.value_ = extradata;

  uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
  uint32_t result = static_cast<uint32_t>(
      (static_cast<uint64_t>(num) * static_cast<uint64_t>(mul)) >> 32);
  if (extra.info.add_) {
    uint32_t addend = result + num;
    addend = ((addend < result ? 0x80000000 : 0) | addend);

@@ -675,7 +648,7 @@ static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
  return result >> extra.info.shift_;
}

static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
static constexpr uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
                                          uint32_t extradata, uint32_t original) {
  uint32_t dived = ApplyUint32Div(num, mul, extradata);
  unsigned result = num - (dived * original);

@@ -686,12 +659,12 @@ static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
struct MagicDiv {
  uint32_t multiplier_;
  uint32_t extradata_;
  MagicDiv() : multiplier_(0), extradata_(0) {}
  MagicDiv(uint32_t original) {
  constexpr MagicDiv() : multiplier_(0), extradata_(0) {}
  constexpr MagicDiv(uint32_t original) : MagicDiv() {
    multiplier_ = PregenerateUint32Div(original, extradata_);
  }

  uint32_t Apply(uint32_t numerator) const {
  constexpr uint32_t Apply(uint32_t numerator) const {
    return ApplyUint32Div(numerator, multiplier_, extradata_);
  }
};
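Reviewer note: the constexpr-ified MagicDiv above is the classic Granlund–Montgomery "magic number" division: replace n / d by a widening multiply and a shift that can be hoisted out of hot loops. A minimal standalone sketch of the identity it relies on — not the PR's exact magic-number search (which also handles the add_/shift_ fixup cases); d, s, and the sampling stride here are arbitrary illustrative choices, and __uint128_t assumes GCC/Clang:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t d = 641;  // arbitrary denominator
  const int s = 42;        // 32 + ceil(log2(d)) bits of reciprocal precision
  // ceil(2^s / d): the "magic number" multiplier.
  const uint64_t M = static_cast<uint64_t>(
      ((static_cast<__uint128_t>(1) << s) + d - 1) / d);
  for (uint64_t n = 0; n <= 0xFFFFFFFFull; n += 104729) {  // sampled numerators
    const uint64_t q = static_cast<uint64_t>((static_cast<__uint128_t>(M) * n) >> s);
    if (q != n / d) {
      std::printf("mismatch at n=%llu\n", static_cast<unsigned long long>(n));
      return 1;
    }
  }
  std::puts("multiply+shift agrees with division for all sampled n");
  return 0;
}

The over-allocated precision (s = 32 + ceil(log2(d))) makes the plain multiply+shift exact for all 32-bit numerators; PregenerateUint32Div instead finds the minimal multiplier and records an add/shift correction in extradata_ when one is needed.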
@@ -28,6 +28,9 @@ namespace xe {
namespace memory {

size_t page_size() {
#if XE_ARCH_AMD64 == 1
  return 4096;
#else
  static size_t value = 0;
  if (!value) {
    SYSTEM_INFO si;

@@ -35,9 +38,13 @@ size_t page_size() {
    value = si.dwPageSize;
  }
  return value;
#endif
}

size_t allocation_granularity() {
#if XE_ARCH_AMD64 == 1 && XE_PLATFORM_WIN32 == 1
  return 65536;
#else
  static size_t value = 0;
  if (!value) {
    SYSTEM_INFO si;

@@ -45,6 +52,7 @@ size_t allocation_granularity() {
    value = si.dwAllocationGranularity;
  }
  return value;
#endif
}

DWORD ToWin32ProtectFlags(PageAccess access) {
@@ -37,7 +37,7 @@
#define XE_USE_NTDLL_FUNCTIONS 1
//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
//
#define XE_USE_KUSER_SHARED 0
#define XE_USE_KUSER_SHARED 1
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
  ntdll versions of functions often skip through a lot of extra garbage in
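Reviewer note: for context on the XE_USE_KUSER_SHARED toggle above — KUSER_SHARED_DATA is a read-only page the NT kernel maps into every process at a fixed address, so timing values can be read without a syscall. A hedged sketch; the 0x7FFE0008 InterruptTime offset comes from public ntddk headers, not from this diff, and the single 64-bit load of LowPart/High1Time is an x64-only shortcut:

#include <cstdint>

// Illustrative only: read the NT interrupt time straight from the shared page.
inline uint64_t read_kuser_interrupt_time() {
  return *reinterpret_cast<const volatile uint64_t*>(0x7FFE0008ull);
}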
@@ -67,7 +67,22 @@ class Backend {
  // up until the start of ctx may be used by the backend to store whatever data
  // they want
  virtual void InitializeBackendContext(void* ctx) {}

  /*
    Free any dynamically allocated data/resources that the backendcontext uses
  */
  virtual void DeinitializeBackendContext(void* ctx) {}
  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){};
  /*
    called by KeSetCurrentStackPointers in xboxkrnl_threading.cc just prior
    to calling XThread::Reenter this is an opportunity for a backend to clear any
    data related to the guest stack

    in the case of the X64 backend, it means we reset the stackpoint index
    to 0, since its a new stack and all of our old entries are invalid now
  */
  virtual void PrepareForReentry(void* ctx) {}

 protected:
  Processor* processor_ = nullptr;
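Reviewer note: a minimal sketch of how a backend is expected to implement the new reentry hook. This mirrors the X64 implementation later in this diff; HypotheticalBackend is illustrative:

// On KeSetCurrentStackPointers -> XThread::Reenter the guest gets a fresh
// stack, so any recorded host<->guest stack mappings are stale.
void HypotheticalBackend::PrepareForReentry(void* ctx) {
  auto* bctx = BackendContextForGuestContext(ctx);
  bctx->current_stackpoint_depth = 0;  // drop all stale stackpoints
}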
@@ -31,7 +31,16 @@ DEFINE_bool(record_mmio_access_exceptions, true,
            "For guest addresses records whether we caught any mmio accesses "
            "for them. This info can then be used on a subsequent run to "
            "instruct the recompiler to emit checks",
            "CPU");
            "x64");

DEFINE_int64(max_stackpoints, 65536,
             "Max number of host->guest stack mappings we can record.", "x64");

DEFINE_bool(enable_host_guest_stack_synchronization, true,
            "Records entries for guest/host stack mappings at function starts "
            "and checks for reentry at return sites. Has slight performance "
            "impact, but fixes crashes in games that use setjmp/longjmp.",
            "x64");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif
@@ -41,15 +50,29 @@ namespace cpu {
namespace backend {
namespace x64 {

class X64ThunkEmitter : public X64Emitter {
class X64HelperEmitter : public X64Emitter {
 public:
  X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator);
  ~X64ThunkEmitter() override;
  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  };
  X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator);
  ~X64HelperEmitter() override;
  HostToGuestThunk EmitHostToGuestThunk();
  GuestToHostThunk EmitGuestToHostThunk();
  ResolveFunctionThunk EmitResolveFunctionThunk();
  void* EmitGuestAndHostSynchronizeStackHelper();
  // 1 for loading byte, 2 for halfword and 4 for word.
  // these specialized versions save space in the caller
  void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
      void* sync_func, unsigned stack_element_size);

 private:
  void* EmitCurrentForOffsets(const _code_offsets& offsets,
                              size_t stack_size = 0);
  // The following four functions provide save/load functionality for registers.
  // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been
  // allocated on the stack.
@@ -184,11 +207,26 @@ bool X64Backend::Initialize(Processor* processor) {

  // Generate thunks used to transition between jitted code and host code.
  XbyakAllocator allocator;
  X64ThunkEmitter thunk_emitter(this, &allocator);
  X64HelperEmitter thunk_emitter(this, &allocator);
  host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk();
  guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk();
  resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk();

  if (cvars::enable_host_guest_stack_synchronization) {
    synchronize_guest_and_host_stack_helper_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackHelper();

    synchronize_guest_and_host_stack_helper_size8_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 1);
    synchronize_guest_and_host_stack_helper_size16_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 2);
    synchronize_guest_and_host_stack_helper_size32_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 4);
  }

  // Set the code cache to use the ResolveFunction thunk for default
  // indirections.
  assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);

@@ -203,9 +241,10 @@ bool X64Backend::Initialize(Processor* processor) {

  // Setup exception callback
  ExceptionHandler::Install(&ExceptionCallbackThunk, this);

  processor->memory()->SetMMIOExceptionRecordingCallback(
      ForwardMMIOAccessForRecording, (void*)this);
  if (cvars::record_mmio_access_exceptions) {
    processor->memory()->SetMMIOExceptionRecordingCallback(
        ForwardMMIOAccessForRecording, (void*)this);
  }

#if XE_X64_PROFILER_AVAILABLE == 1
  if (cvars::instrument_call_times) {
@@ -509,23 +548,32 @@ bool X64Backend::ExceptionCallback(Exception* ex) {
  return processor()->OnThreadBreakpointHit(ex);
}

X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator)
X64HelperEmitter::X64HelperEmitter(X64Backend* backend,
                                   XbyakAllocator* allocator)
    : X64Emitter(backend, allocator) {}

X64ThunkEmitter::~X64ThunkEmitter() {}
X64HelperEmitter::~X64HelperEmitter() {}
void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets,
                                              size_t stack_size) {
  EmitFunctionInfo func_info = {};
  func_info.code_size.total = getSize();
  func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
  func_info.code_size.body = code_offsets.epilog - code_offsets.body;
  func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
  func_info.code_size.tail = getSize() - code_offsets.tail;
  func_info.prolog_stack_alloc_offset =
      code_offsets.prolog_stack_alloc - code_offsets.prolog;
  func_info.stack_size = stack_size;

HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
  void* fn = Emplace(func_info);
  return fn;
}
HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() {
  // rcx = target
  // rdx = arg0 (context)
  // r8 = arg1 (guest return address)

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -576,19 +624,13 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
  return (HostToGuestThunk)fn;
}

GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() {
  // rcx = target function
  // rdx = arg0
  // r8 = arg1
  // r9 = arg2

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -635,17 +677,11 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
// X64Emitter handles actually resolving functions.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address);

ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() {
  // ebx = target PPC address
  // rcx = context

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -688,8 +724,116 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
  void* fn = Emplace(func_info);
  return (ResolveFunctionThunk)fn;
}
// r11 = size of callers stack, r8 = return address w/ adjustment
// i'm not proud of this code, but it shouldn't be executed frequently at all
void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  mov(eax,
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));

void X64ThunkEmitter::EmitSaveVolatileRegs() {
  lea(ecx, ptr[eax - 1]);
  mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);

  Xbyak::Label looper{};
  Xbyak::Label loopout{};
  Xbyak::Label signed_underflow{};
  xor_(r12d, r12d);

  // todo: should use Loop instruction here if hasFastLoop,
  // currently xbyak does not support it but its super easy to modify xbyak to have it
  L(looper);
  imul(edx, ecx, sizeof(X64BackendStackpoint));
  mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]);

  cmp(r10d, r9d);

  jge(loopout, T_NEAR);

  inc(r12d);

  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    dec(ecx);
  } else {
    sub(ecx, 1);
  }
  js(signed_underflow, T_NEAR);  // should be impossible!!

  jmp(looper, T_NEAR);
  L(loopout);
  Xbyak::Label skip_adjust{};
  cmp(r12d, 1);  // should never happen?
  jle(skip_adjust, T_NEAR);
  mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]);
  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    inc(ecx);
  } else {
    add(ecx, 1);
  }

  // this->DebugBreak();
  sub(rsp, r11);  // adjust stack

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      ecx);  // set next stackpoint index to be after the one we restored to
  L(skip_adjust);

  jmp(r8);  // return to caller
  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();

  L(signed_underflow);
  // find a good, compact way to signal error here
  // maybe an invalid opcode that we execute, then detect in an exception handler?

  this->DebugBreak();
  // stack unwinding, take first entry
  // actually, no reason to have this

  /*mov(rsp, ptr[rbx + offsetof(X64BackendStackpoint, host_stack_)]);
  mov(ptr[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r9d);
  sub(rsp, r11);
  xor_(eax, eax);
  inc(eax);
  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      eax);

  jmp(r8);*/
  // this->DebugBreak(); // err, add an xe::FatalError to call for this

  return EmitCurrentForOffsets(code_offsets);
}
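Reviewer note: in plain C++ the search loop this helper emits is roughly the following. A hedged model — X64BackendStackpoint mirrors the struct this diff adds in x64_backend.h; everything else is illustrative:

#include <cstdint>

struct X64BackendStackpoint {  // mirrors the diff's struct
  uint64_t host_stack_;
  unsigned guest_stack_;
  unsigned unused_;
};

// Walk from the most recent entry down until one records a guest stack pointer
// >= the current guest r1; that entry's host_stack_ (minus the caller's stack
// size, which the emitted version receives in r11) becomes the new host rsp.
uint32_t FindRestorePoint(const X64BackendStackpoint* points, uint32_t depth,
                          uint32_t current_guest_r1) {
  for (uint32_t i = depth; i-- != 0;) {
    if (points[i].guest_stack_ >= current_guest_r1) {
      return i;
    }
  }
  return UINT32_MAX;  // underflow; the emitted code treats this as impossible
}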

void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
    void* sync_func, unsigned stack_element_size) {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  pop(r8);  // return address

  switch (stack_element_size) {
    case 4:
      mov(r11d, ptr[r8]);
      break;
    case 2:
      movzx(r11d, word[r8]);
      break;
    case 1:
      movzx(r11d, byte[r8]);
      break;
  }
  add(r8, stack_element_size);
  jmp(sync_func, T_NEAR);
  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
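Reviewer note: the call-site protocol for these size-load thunks, paraphrased from the emitter code and the long comment in ResolveFunction below — the caller's stack size is embedded as an inline literal right after the call instruction, so the thunk pops the return address, loads the literal through it, then skips past it before tail-jumping to the shared helper. Illustrative shape of an emitted return-site check (not emitted verbatim):

//   test esp, 15                 ; misaligned only on longjmp-style reentry
//   jnz  tail                    ; rare path lives in tail-emitted code
// tail:
//   call sync_size_load_thunk    ; return address points at the literal below
//   .byte <caller stack size>    ; 1/2/4 bytes depending on the thunk variant
//   ...                          ; execution resumes here with rsp restored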
void X64HelperEmitter::EmitSaveVolatileRegs() {
  // Save off volatile registers.
  // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx);

@@ -711,7 +855,7 @@ void X64ThunkEmitter::EmitSaveVolatileRegs() {
  vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5);
}

void X64ThunkEmitter::EmitLoadVolatileRegs() {
void X64HelperEmitter::EmitLoadVolatileRegs() {
  // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
  mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
  mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);

@@ -732,7 +876,7 @@ void X64ThunkEmitter::EmitLoadVolatileRegs() {
  vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
}

void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
void X64HelperEmitter::EmitSaveNonvolatileRegs() {
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx);
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp);
#if XE_PLATFORM_WIN32

@@ -760,7 +904,7 @@ void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
#endif
}

void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
void X64HelperEmitter::EmitLoadNonvolatileRegs() {
  mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
  mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
#if XE_PLATFORM_WIN32

@@ -788,16 +932,41 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
}
void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->mxcsr_fpu =
      DEFAULT_FPU_MXCSR;  // idk if this is right, check on rgh what the
                          // rounding on ppc is at startup

  /*
    todo: stackpoint arrays should be pooled virtual memory at the very
    least there may be some fancy virtual address tricks we can do here
  */

  bctx->stackpoints = cvars::enable_host_guest_stack_synchronization
                          ? new X64BackendStackpoint[cvars::max_stackpoints]
                          : nullptr;
  bctx->current_stackpoint_depth = 0;
  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);

  if (bctx->stackpoints) {
    delete[] bctx->stackpoints;
    bctx->stackpoints = nullptr;
  }
}

void X64Backend::PrepareForReentry(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);

  bctx->current_stackpoint_depth = 0;
}

const uint32_t mxcsr_table[8] = {
    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
};
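Reviewer note: the bookkeeping that PushStackpoint/PopStackpoint (in x64_emitter.cc further down) maintain against this per-thread context, modeled in plain C++. A hedged sketch — X64BackendContext and X64BackendStackpoint are the types from x64_backend.h in this diff; the emitted version additionally checks cvars::max_stackpoints and reports overflow:

// Conceptually executed at every guest function entry, before the host frame
// is allocated; the matching pop just decrements the depth on return.
void PushStackpoint(X64BackendContext* bctx, uint64_t host_rsp,
                    uint32_t guest_r1) {
  X64BackendStackpoint& sp = bctx->stackpoints[bctx->current_stackpoint_depth];
  sp.host_stack_ = host_rsp;   // host rsp prior to "sub rsp, stack_size"
  sp.guest_stack_ = guest_r1;  // guest stack pointer (r1) at entry
  bctx->current_stackpoint_depth++;  // overflow -> HandleStackpointOverflowError
}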
@@ -24,7 +24,8 @@
#endif

DECLARE_int64(x64_extension_mask);

DECLARE_int64(max_stackpoints);
DECLARE_bool(enable_host_guest_stack_synchronization);
namespace xe {
class Exception;
}  // namespace xe

@@ -41,14 +42,25 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

struct X64BackendStackpoint {
  uint64_t host_stack_;
  unsigned guest_stack_;
  // pad to 16 bytes so we never end up having a 64 bit load/store for
  // host_stack_ straddling two lines. Consider this field reserved for future
  // use
  unsigned unused_;
};
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  // guest_tick_count is used if inline_loadclock is used
  uint64_t* guest_tick_count;
  // records mapping of host_stack to guest_stack
  X64BackendStackpoint* stackpoints;

  unsigned int current_stackpoint_depth;
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  unsigned int mxcsr_vmx;

@@ -81,6 +93,19 @@ class X64Backend : public Backend {
    return resolve_function_thunk_;
  }

  void* synchronize_guest_and_host_stack_helper() const {
    return synchronize_guest_and_host_stack_helper_;
  }
  void* synchronize_guest_and_host_stack_helper_for_size(size_t sz) const {
    switch (sz) {
      case 1:
        return synchronize_guest_and_host_stack_helper_size8_;
      case 2:
        return synchronize_guest_and_host_stack_helper_size16_;
      default:
        return synchronize_guest_and_host_stack_helper_size32_;
    }
  }
  bool Initialize(Processor* processor) override;

  void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override;

@@ -97,7 +122,8 @@ class X64Backend : public Backend {
  void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
  void UninstallBreakpoint(Breakpoint* breakpoint) override;
  virtual void InitializeBackendContext(void* ctx) override;

  virtual void DeinitializeBackendContext(void* ctx) override;
  virtual void PrepareForReentry(void* ctx) override;
  X64BackendContext* BackendContextForGuestContext(void* ctx) {
    return reinterpret_cast<X64BackendContext*>(
        reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));

@@ -120,7 +146,12 @@ class X64Backend : public Backend {
  HostToGuestThunk host_to_guest_thunk_;
  GuestToHostThunk guest_to_host_thunk_;
  ResolveFunctionThunk resolve_function_thunk_;
  void* synchronize_guest_and_host_stack_helper_ = nullptr;

  // loads stack sizes 1 byte, 2 bytes or 4 bytes
  void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;
#endif
@@ -213,6 +213,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  func_info.stack_size = stack_size;
  stack_size_ = stack_size;

  PushStackpoint();
  sub(rsp, (uint32_t)stack_size);

  code_offsets.prolog_stack_alloc = getSize();

@@ -271,6 +272,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  */
  // Body.
  auto block = builder->first_block();
  synchronize_stack_on_next_instruction_ = false;
  while (block) {
    ForgetMxcsrMode();  // at start of block, mxcsr mode is undefined

@@ -287,6 +289,12 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
    // Process instructions.
    const Instr* instr = block->instr_head;
    while (instr) {
      if (synchronize_stack_on_next_instruction_) {
        if (instr->GetOpcodeNum() != hir::OPCODE_SOURCE_OFFSET) {
          synchronize_stack_on_next_instruction_ = false;
          EnsureSynchronizedGuestAndHostStack();
        }
      }
      const Instr* new_tail = instr;
      if (!SelectSequence(this, instr, &new_tail)) {
        // No sequence found!

@@ -314,6 +322,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  EmitProfilerEpilogue();

  add(rsp, (uint32_t)stack_size);
  PopStackpoint();
  ret();
  // todo: do some kind of sorting by alignment?
  for (auto&& tail_item : tail_code_) {

@@ -453,12 +462,186 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {

// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
  auto thread_state =
      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
  auto guest_context = reinterpret_cast<ppc::PPCContext_s*>(raw_context);

  auto thread_state = guest_context->thread_state;

  // TODO(benvanik): required?
  assert_not_zero(target_address);

  /*
    todo: refactor this!

    The purpose of this code is to allow guest longjmp to call into
    the body of an existing host function. There are a lot of conditions we
    have to check here to ensure that we do not mess up a normal call to a
    function.

    The address must be within an XexModule (may need to make some changes
    to instructionaddressflags to remove this limitation). The target address
    must be a known return site. The guest address must be part of a function
    that was already translated.
  */

  if (cvars::enable_host_guest_stack_synchronization) {
    auto processor = thread_state->processor();
    auto module_for_address =
        processor->LookupModule(static_cast<uint32_t>(target_address));

    if (module_for_address) {
      XexModule* xexmod = dynamic_cast<XexModule*>(module_for_address);
      if (xexmod) {
        InfoCacheFlags* flags = xexmod->GetInstructionAddressFlags(
            static_cast<uint32_t>(target_address));
        if (flags) {
          if (flags->is_return_site) {
            auto ones_with_address = processor->FindFunctionsWithAddress(
                static_cast<uint32_t>(target_address));
            if (ones_with_address.size() != 0) {
              // this loop to find a host address for the guest address is
              // necessary because FindFunctionsWithAddress works via a range
              // check, but if the function consists of multiple blocks
              // scattered around with "holes" of instructions that cannot be
              // reached in between those holes, the instructions that cannot
              // be reached will incorrectly be considered members of the
              // function

              X64Function* candidate = nullptr;
              uintptr_t host_address = 0;
              for (auto&& entry : ones_with_address) {
                X64Function* xfunc = static_cast<X64Function*>(entry);

                host_address = xfunc->MapGuestAddressToMachineCode(
                    static_cast<uint32_t>(target_address));
                // host address does exist within the function, and that host
                // address is not the start of the function, it is instead
                // somewhere within its existing body
                // i originally did not have this (xfunc->machine_code() !=
                // reinterpret_cast<const uint8_t*>(host_address)) condition
                // here when i distributed builds for testing, no issues arose
                // related to it but i wanted to be more explicit
                if (host_address &&
                    xfunc->machine_code() !=
                        reinterpret_cast<const uint8_t*>(host_address)) {
                  candidate = xfunc;
                  break;
                }
              }
              // we found an existing X64Function, and a return site within
              // that function that has a host address w/ native code
              if (candidate && host_address) {
                X64Backend* backend =
                    static_cast<X64Backend*>(processor->backend());
                // grab the backend context; next we have to check whether the
                // guest and host stack are out of sync. if they arent, its
                // fine for the backend to create a new function for the guest
                // address we're resolving. if they are, it means that the
                // reason we're resolving this address is because context is
                // being restored (probably by longjmp)
                X64BackendContext* backend_context =
                    backend->BackendContextForGuestContext(guest_context);

                uint32_t current_stackpoint_index =
                    backend_context->current_stackpoint_depth;

                --current_stackpoint_index;

                X64BackendStackpoint* stackpoints =
                    backend_context->stackpoints;

                uint32_t current_guest_stackpointer =
                    static_cast<uint32_t>(guest_context->r[1]);
                uint32_t num_frames_bigger = 0;

                /*
                  if the current guest stack pointer is bigger than the
                  recorded pointer for one stack frame thats fine, plenty of
                  functions restore the original stack pointer early

                  if more than 1... we're longjmping and sure of it at this
                  point (jumping to a return site that has already been
                  emitted)
                */
                while (current_stackpoint_index != 0xFFFFFFFF) {
                  if (current_guest_stackpointer >
                      stackpoints[current_stackpoint_index].guest_stack_) {
                    --current_stackpoint_index;
                    ++num_frames_bigger;
                  } else {
                    break;
                  }
                }
                /*
                  DEFINITELY a longjmp, return the original host address.
                  returning the existing host address is going to set off some
                  extra machinery we have set up to support this.

                  to break it down: our caller (us being this ResolveFunction
                  that this comment is in) is
                  X64Backend::resolve_function_thunk_, which is implemented in
                  x64_backend.cc X64HelperEmitter::EmitResolveFunctionThunk,
                  or a call from the resolver table.

                  the x64 fastcall abi dictates that the stack must always be
                  16 byte aligned. We select our stack size for functions to
                  ensure that we keep rsp aligned to 16 bytes.

                  but by calling into the body of an existing function we've
                  pushed our return address onto the stack (dont worry about
                  this return address, it gets discarded in a later step).

                  this means that the stack is no longer 16 byte aligned,
                  (rsp % 16) now == 8, and this is the only time outside of
                  the prolog or epilog of a function that this will be the
                  case.

                  so, after all direct or indirect function calls we set
                  X64Emitter::synchronize_stack_on_next_instruction_ to true.
                  On the next instruction that is not OPCODE_SOURCE_OFFSET we
                  emit a check when we see
                  synchronize_stack_on_next_instruction_ is true. We have to
                  skip OPCODE_SOURCE_OFFSET because its not a "real"
                  instruction, and if we emit on it the return address of the
                  function call will point to AFTER our check, so itll never
                  be executed.

                  our check is just going to do test esp, 15 to see if the
                  stack is misaligned (using esp instead of rsp saves 1 byte).
                  We tail emit the handling for when the check succeeds,
                  because in 99.99999% of function calls it will be aligned;
                  in the end the runtime cost of these checks is 5 bytes for
                  the test instruction, which ought to be one cycle, and 5
                  bytes for the jmp, with no cycles taken for the jump which
                  will be predicted not taken.

                  Our handling for the check is implemented in
                  X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper. we
                  don't call it directly though; instead we go through
                  backend()->synchronize_guest_and_host_stack_helper_for_size(
                  num_bytes_needed_to_represent_stack_size). we place the
                  stack size after the call instruction so we can load it in
                  the helper and readjust the return address to point after
                  the literal value.

                  The helper is going to search the array of stackpoints to
                  find the first one that is greater than or equal to the
                  current guest stack pointer. when it finds the entry it will
                  set the current host rsp to the host stack pointer value in
                  the entry, and then subtract the stack size of the caller
                  from that. the current stackpoint index is adjusted to point
                  to the one after the stackpoint we restored to.

                  The helper then jumps back to the function that was
                  longjmp'ed to, with the host stack in its proper state. it
                  just works!
                */

                if (num_frames_bigger > 1) {
                  return host_address;
                }
              }
            }
          }
        }
      }
    }
  }
  auto fn = thread_state->processor()->ResolveFunction(
      static_cast<uint32_t>(target_address));
  assert_not_null(fn);
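Reviewer note: the frame-counting heuristic above, condensed into a hedged standalone model. Names are illustrative; the real code additionally requires the target to be a flagged return site inside an already-translated XexModule function:

// More than one recorded frame strictly below the current guest r1 means the
// guest discarded frames without returning through them: a longjmp.
bool LooksLikeLongjmp(const X64BackendStackpoint* stackpoints, uint32_t depth,
                      uint32_t current_guest_r1) {
  uint32_t num_frames_bigger = 0;
  for (uint32_t i = depth - 1;
       i != 0xFFFFFFFF && current_guest_r1 > stackpoints[i].guest_stack_;
       --i) {
    ++num_frames_bigger;
  }
  return num_frames_bigger > 1;
}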
@@ -479,7 +662,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
      mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

      call((void*)fn->machine_code());

      synchronize_stack_on_next_instruction_ = true;
    } else {
      // tail call
      EmitTraceUserCallReturn();

@@ -488,8 +671,10 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
      mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

      add(rsp, static_cast<uint32_t>(stack_size()));
      PopStackpoint();
      jmp((void*)fn->machine_code(), T_NEAR);
    }

    return;
  } else if (code_cache_->has_indirection_table()) {
    // Load the pointer to the indirection table maintained in X64CodeCache.

@@ -513,12 +698,14 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
    mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

    add(rsp, static_cast<uint32_t>(stack_size()));
    PopStackpoint();
    jmp(rax);
  } else {
    // Return address is from the previous SET_RETURN_ADDRESS.
    mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

    call(rax);
    synchronize_stack_on_next_instruction_ = true;
  }
}

@@ -557,12 +744,14 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
    mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

    add(rsp, static_cast<uint32_t>(stack_size()));
    PopStackpoint();
    jmp(rax);
  } else {
    // Return address is from the previous SET_RETURN_ADDRESS.
    mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

    call(rax);
    synchronize_stack_on_next_instruction_ = true;
  }
}
@@ -1458,6 +1647,126 @@ Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
  pt.setBit(32);
  return pt;
}

void X64Emitter::HandleStackpointOverflowError(ppc::PPCContext* context) {
  // context->lr
  // todo: show lr in message?
  xe::FatalError(
      "Overflowed stackpoints! Please report this error for this title to "
      "Xenia developers.");
}

void X64Emitter::PushStackpoint() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // push the current host and guest stack pointers.
  // this is done before a stack frame is set up or any guest instructions are
  // executed. this code is probably the most intrusive part of the stackpoint
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  mov(eax,
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));

  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);

  imul(r9d, eax, sizeof(X64BackendStackpoint));
  add(rbx, r9);

  mov(qword[rbx + offsetof(X64BackendStackpoint, host_stack_)], rsp);
  mov(dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r8d);
  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    inc(eax);
  } else {
    add(eax, 1);
  }

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      eax);

  cmp(eax, (uint32_t)cvars::max_stackpoints);

  Xbyak::Label& overflowed_stackpoints =
      AddToTail([](X64Emitter& e, Xbyak::Label& our_tail_label) {
        e.L(our_tail_label);
        // we never subtracted anything from rsp, so our stack is misaligned
        // and will fault in guesttohostthunk
        // e.sub(e.rsp, 8);
        e.push(e.rax);  // easier realign, 1 byte opcode vs 4 bytes for sub

        e.CallNativeSafe((void*)X64Emitter::HandleStackpointOverflowError);
      });
  jge(overflowed_stackpoints, T_NEAR);
}
void X64Emitter::PopStackpoint() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // todo: maybe verify that rsp and r1 == the stackpoint?
  Xbyak::Address stackpoint_pos_pointer =
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth));
  stackpoint_pos_pointer.setBit(32);
  dec(stackpoint_pos_pointer);
}

void X64Emitter::EnsureSynchronizedGuestAndHostStack() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // chrispy: keeping this old slower test here in case in the future changes
  // need to be made that result in the stack not being 8 byte misaligned on
  // context reentry
#if 0
  Xbyak::Label skip{};
  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  imul(eax,
       GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
       sizeof(X64BackendStackpoint));
  sub(eax, sizeof(X64BackendStackpoint));
  add(rbx, rax);

  cmp(r8d, dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)]);
  jle(skip, T_NEAR);
  mov(r11d, stack_size());
  call(backend_->synchronize_guest_and_host_stack_helper());
  L(skip);
#endif

  Xbyak::Label& return_from_sync = this->NewCachedLabel();

  // if we got here somehow from setjmp or the like we ought to have a
  // misaligned stack right now! this provides us with a very fast pretest for
  // this condition
  test(esp, 15);

  Xbyak::Label& sync_label = this->AddToTail(
      [&return_from_sync](X64Emitter& e, Xbyak::Label& our_tail_label) {
        e.L(our_tail_label);

        uint32_t stack32 = static_cast<uint32_t>(e.stack_size());
        auto backend = e.backend();

        if (stack32 < 256) {
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1));
          e.db(stack32);
        } else if (stack32 < 65536) {
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2));
          e.dw(stack32);
        } else {
          // ought to be impossible, a host stack bigger than 65536??
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4));
          e.dd(stack32);
        }
        e.jmp(return_from_sync, T_NEAR);
      });

  jnz(sync_label, T_NEAR);

  L(return_from_sync);
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu
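Reviewer note: a compact model of the pretest EnsureSynchronizedGuestAndHostStack emits. Hedged — the emitted code does this with "test esp, 15" plus a tail-emitted jnz target rather than a C++ branch:

// After a normal call/ret pair rsp is 16-byte aligned again; entering a return
// site via the resolver (the longjmp path) leaves one extra return address
// pushed, so (rsp & 15) == 8 exactly in the case that needs fixing.
inline bool NeedsHostStackSync(uint64_t host_rsp) {
  return (host_rsp & 15) != 0;
}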
@@ -299,6 +299,11 @@ class X64Emitter : public Xbyak::CodeGenerator {

  Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
  Xbyak::Label& NewCachedLabel();

  void PushStackpoint();
  void PopStackpoint();

  void EnsureSynchronizedGuestAndHostStack();
  FunctionDebugInfo* debug_info() const { return debug_info_; }

  size_t stack_size() const { return stack_size_; }

@@ -381,13 +386,14 @@ class X64Emitter : public Xbyak::CodeGenerator {
  bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info);
  void EmitGetCurrentThreadId();
  void EmitTraceUserCallReturn();

  static void HandleStackpointOverflowError(ppc::PPCContext* context);

 protected:
  Processor* processor_ = nullptr;
  X64Backend* backend_ = nullptr;
  X64CodeCache* code_cache_ = nullptr;
  XbyakAllocator* allocator_ = nullptr;
  XexModule* guest_module_ = nullptr;
  bool synchronize_stack_on_next_instruction_ = false;
  Xbyak::util::Cpu cpu_;
  uint64_t feature_flags_ = 0;
  uint32_t current_guest_function_ = 0;
@@ -56,6 +56,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
  if (entry) {
    // If we aren't ready yet spin and wait.
    if (entry->status == Entry::STATUS_COMPILING) {
      // chrispy: i think this is dead code, if we are compiling we're holding
      // the global lock, arent we? so we wouldnt be executing here
      // Still compiling, so spin.
      do {
        global_lock.unlock();
@@ -110,8 +110,13 @@ uint32_t GuestFunction::MapGuestAddressToMachineCodeOffset(
uintptr_t GuestFunction::MapGuestAddressToMachineCode(
    uint32_t guest_address) const {
  auto entry = LookupGuestAddress(guest_address);
  return reinterpret_cast<uintptr_t>(machine_code()) +
         (entry ? entry->code_offset : 0);

  if (entry) {
    return reinterpret_cast<uintptr_t>(machine_code()) + entry->code_offset;
  } else {
    return 0;
  }
}

uint32_t GuestFunction::MapMachineCodeToGuestAddress(
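Reviewer note: a behavior change worth calling out — on a failed lookup this function now returns 0 instead of the machine-code base address, so callers (like ResolveFunction above) can use the result as a found/not-found signal. A hedged usage sketch:

// 0 now means "guest address is not mapped inside this function's code".
uintptr_t host_address =
    xfunc->MapGuestAddressToMachineCode(static_cast<uint32_t>(target_address));
if (!host_address) {
  // keep searching other candidate functions
}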
@@ -27,18 +27,13 @@
#include "xenia/cpu/ppc/ppc_frontend.h"
#include "xenia/cpu/ppc/ppc_opcode_info.h"
#include "xenia/cpu/processor.h"

#include "xenia/cpu/xex_module.h"
DEFINE_bool(
    break_on_unimplemented_instructions, true,
    "Break to the host debugger (or crash if no debugger attached) if an "
    "unimplemented PowerPC instruction is encountered.",
    "CPU");

DEFINE_bool(
    emit_useless_fpscr_updates, false,
    "Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
    "CPU");

namespace xe {
namespace cpu {
namespace ppc {

@@ -94,8 +89,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {

  function_ = function;
  start_address_ = function_->address();
  //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice
  //i've also seen ones with a start and end address that are the same...
  // chrispy: i've seen this one happen, not sure why but i think from trying to
  // precompile twice i've also seen ones with a start and end address that are
  // the same...
  assert_true(function_->address() <= function_->end_address());
  instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;

@@ -250,7 +246,8 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
}

void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
  //chrispy: label->name is unused, it would be nice to be able to remove the field and this code
  // chrispy: label->name is unused, it would be nice to be able to remove the
  // field and this code
  char name_buffer[13];
  auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
  name_buffer[format_result.size] = '\0';

@@ -457,37 +454,38 @@ void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
  // TODO(benvanik): detect overflow and nan cases.
  // fx and vx are the most important.
  /*
    chrispy: stubbed this out because right now all it does is waste
    memory and CPU time
    chrispy: i stubbed this out at one point because all it does is waste
    memory and CPU time, however, this introduced issues with raiden
    (substitute w/ titleid later) which probably means they stash stuff in the
    fpscr?
  */
  if (cvars::emit_useless_fpscr_updates) {
    Value* fx = LoadConstantInt8(0);
    Value* fex = LoadConstantInt8(0);
    Value* vx = LoadConstantInt8(0);
    Value* ox = LoadConstantInt8(0);

    if (update_cr1) {
      // Store into the CR1 field.
      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
      // have to read back the bits and do shifting work.
      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
    }
  Value* fx = LoadConstantInt8(0);
  Value* fex = LoadConstantInt8(0);
  Value* vx = LoadConstantInt8(0);
  Value* ox = LoadConstantInt8(0);

  // Generate our new bits.
  Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
  new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
  new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
  new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

  // Mix into fpscr while preserving sticky bits (FX and OX).
  Value* bits = LoadFPSCR();
  bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
  StoreFPSCR(bits);
    if (update_cr1) {
      // Store into the CR1 field.
      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
      // have to read back the bits and do shifting work.
      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
    }

    // Generate our new bits.
    Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
    new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
    new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
    new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

    // Mix into fpscr while preserving sticky bits (FX and OX).
    Value* bits = LoadFPSCR();
    bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
    StoreFPSCR(bits);
  }

void PPCHIRBuilder::CopyFPSCRToCR1() {

@@ -587,7 +585,24 @@ void PPCHIRBuilder::StoreReserved(Value* val) {
Value* PPCHIRBuilder::LoadReserved() {
  return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE);
}
void PPCHIRBuilder::SetReturnAddress(Value* value) {
  /*
    Record the address as being a possible target of a return. This is
    needed for longjmp emulation. See x64_emitter.cc's ResolveFunction
  */
  Module* mod = this->function_->module();
  if (value && value->IsConstant()) {
    if (mod) {
      XexModule* xexmod = dynamic_cast<XexModule*>(mod);
      if (xexmod) {
        auto flags = xexmod->GetInstructionAddressFlags(value->AsUint32());
        flags->is_return_site = true;
      }
    }
  }

  HIRBuilder::SetReturnAddress(value);
}
}  // namespace ppc
}  // namespace cpu
}  // namespace xe
@@ -80,7 +80,8 @@ class PPCHIRBuilder : public hir::HIRBuilder {

  void StoreReserved(Value* val);
  Value* LoadReserved();

  // calls original impl in hirbuilder, but also records the is_return_site bit
  // into flags in the guestmodule
  void SetReturnAddress(Value* value);

 private:
  void MaybeBreakOnInstruction(uint32_t address);
  void AnnotateLabel(uint32_t address, Label* label);
@@ -263,12 +263,11 @@ Function* Processor::ResolveFunction(uint32_t address) {
      return nullptr;
    }

    if (!DemandFunction(function)) {
      entry->status = Entry::STATUS_FAILED;
      return nullptr;
    }
    // only add it to the list of resolved functions if resolving succeeded
    auto module_for = function->module();

    auto xexmod = dynamic_cast<XexModule*>(module_for);

@@ -291,23 +290,23 @@ Function* Processor::ResolveFunction(uint32_t address) {
    return nullptr;
  }
}

Module* Processor::LookupModule(uint32_t address) {
  auto global_lock = global_critical_region_.Acquire();
  // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
  // TODO(benvanik): cache last module low/high, as likely to be in there.
  for (const auto& module : modules_) {
    if (module->ContainsAddress(address)) {
      return module.get();
    }
  }
  return nullptr;
}
Function* Processor::LookupFunction(uint32_t address) {
  // TODO(benvanik): fast reject invalid addresses/log errors.

  // Find the module that contains the address.
  Module* code_module = nullptr;
  {
    auto global_lock = global_critical_region_.Acquire();
    // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
    // TODO(benvanik): cache last module low/high, as likely to be in there.
    for (const auto& module : modules_) {
      if (module->ContainsAddress(address)) {
        code_module = module.get();
        break;
      }
    }
  }
  Module* code_module = LookupModule(address);

  if (!code_module) {
    // No module found that could contain the address.
    return nullptr;
@@ -115,6 +115,7 @@ class Processor {
  void RemoveFunctionByAddress(uint32_t address);

  Function* LookupFunction(uint32_t address);
  Module* LookupModule(uint32_t address);
  Function* LookupFunction(Module* module, uint32_t address);
  Function* ResolveFunction(uint32_t address);
@@ -78,7 +78,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
  // Allocate with 64b alignment.

  context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext());
  processor->backend()->InitializeBackendContext(context_);
  assert_true(((uint64_t)context_ & 0x3F) == 0);
  std::memset(context_, 0, sizeof(ppc::PPCContext));

@@ -105,9 +105,9 @@ ThreadState::~ThreadState() {
    thread_state_ = nullptr;
  }
  if (context_) {
    processor_->backend()->DeinitializeBackendContext(context_);
    FreeContext(reinterpret_cast<void*>(context_));
  }
  // memory::AlignedFree(context_);
}

void ThreadState::Bind(ThreadState* thread_state) {
@ -38,9 +38,10 @@ DEFINE_bool(disable_instruction_infocache, false,
|
|||
"CPU");
|
||||
|
||||
DEFINE_bool(
|
||||
disable_early_precompilation, false,
|
||||
"Disables pre-compiling guest functions that we know we've called/that "
|
||||
"we've recognized as being functions via simple heuristics.",
|
||||
enable_early_precompilation, false,
|
||||
"Enable pre-compiling guest functions that we know we've called/that "
|
||||
"we've recognized as being functions via simple heuristics, good for error "
|
||||
"finding/stress testing with the JIT",
|
||||
"CPU");
|
||||
|
||||
static const uint8_t xe_xex2_retail_key[16] = {
|
||||
|
@ -1115,6 +1116,7 @@ void XexModule::Precompile() {
|
|||
if (!FindSaveRest()) {
|
||||
return;
|
||||
}
|
||||
|
||||
info_cache_.Init(this);
|
||||
PrecompileDiscoveredFunctions();
|
||||
}
|
||||
|
@ -1343,22 +1345,26 @@ void XexInfoCache::Init(XexModule* xexmod) {
|
|||
num_codebytes += 3; // round up to nearest multiple of 4
|
||||
num_codebytes &= ~3;
|
||||
|
||||
bool did_exist = true;
|
||||
if (!std::filesystem::exists(infocache_path)) {
|
||||
recreate:
|
||||
xe::filesystem::CreateEmptyFile(infocache_path);
|
||||
did_exist = false;
|
||||
}
|
||||
auto try_open = [this, &infocache_path, num_codebytes]() {
|
||||
bool did_exist = true;
|
||||
|
||||
// todo: prepopulate with stuff from pdata, dll exports
|
||||
if (!std::filesystem::exists(infocache_path)) {
|
||||
xe::filesystem::CreateEmptyFile(infocache_path);
|
||||
did_exist = false;
|
||||
}
|
||||
|
||||
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
||||
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
||||
sizeof(InfoCacheFlagsHeader) +
|
||||
(sizeof(InfoCacheFlags) *
|
||||
(num_codebytes /
|
||||
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
||||
// todo: prepopulate with stuff from pdata, dll exports
|
||||
|
||||
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
||||
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
||||
sizeof(InfoCacheFlagsHeader) +
|
||||
(sizeof(InfoCacheFlags) *
|
||||
(num_codebytes /
|
||||
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
||||
return did_exist;
|
||||
};
|
||||
|
||||
bool did_exist = try_open();
|
||||
if (!did_exist) {
|
||||
GetHeader()->version = CURRENT_INFOCACHE_VERSION;
|
||||
|
||||
|
@ -1366,7 +1372,7 @@ void XexInfoCache::Init(XexModule* xexmod) {
|
|||
if (GetHeader()->version != CURRENT_INFOCACHE_VERSION) {
|
||||
this->executable_addr_flags_->Close();
|
||||
std::filesystem::remove(infocache_path);
|
||||
goto recreate;
|
||||
try_open();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1380,7 +1386,7 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
|
|||
return info_cache_.LookupFlags(guest_addr);
|
||||
}
|
||||
void XexModule::PrecompileDiscoveredFunctions() {
|
||||
if (cvars::disable_early_precompilation) {
|
||||
if (!cvars::enable_early_precompilation) {
|
||||
return;
|
||||
}
|
||||
auto others = PreanalyzeCode();
|
||||
|
@ -1397,7 +1403,7 @@ void XexModule::PrecompileDiscoveredFunctions() {
|
|||
}
|
||||
}
|
||||
void XexModule::PrecompileKnownFunctions() {
|
||||
if (cvars::disable_early_precompilation) {
|
||||
if (!cvars::enable_early_precompilation) {
|
||||
return;
|
||||
}
|
||||
uint32_t start = 0;
|
||||
|
@@ -1435,18 +1441,14 @@ static bool IsOpcodeBL(unsigned w) {

 std::vector<uint32_t> XexModule::PreanalyzeCode() {
   uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
-
-
-
   uint32_t highest_exec_addr = 0;

   for (auto&& sec : pe_sections_) {
     if ((sec.flags & kXEPESectionContainsCode)) {
-
       highest_exec_addr =
           std::max<uint32_t>(highest_exec_addr, sec.address + sec.size);
     }
   }
   uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1);
   uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
@@ -1476,7 +1478,7 @@ std::vector<uint32_t> XexModule::PreanalyzeCode() {
   uint32_t mfspr_r12_lr32 =
       *reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);

   auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) {
     funcstart_candidate_stack[stack_pos++] = addr;
   };
   /*
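The candidate scan works by matching known prologue instruction words such as mfspr r12, lr against aligned addresses in the executable sections and pushing hits onto a candidate stack. A hedged, self-contained sketch of that kind of scan (buffer handling illustrative; only the opcode constant and the general approach are taken from the surrounding code):

// Sketch of a prologue-pattern scan over big-endian PPC code. The opcode
// constant encodes mfspr r12, lr (mflr r12); inputs are illustrative.
#include <cstddef>
#include <cstdint>
#include <vector>

static uint32_t LoadBE32(const uint8_t* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

std::vector<uint32_t> FindCandidates(const uint8_t* code, std::size_t size,
                                     uint32_t image_base) {
  constexpr uint32_t kMfsprR12Lr = 0x7D8802A6;  // mfspr r12, lr
  std::vector<uint32_t> candidates;
  for (std::size_t i = 0; i + 4 <= size; i += 4) {
    if (LoadBE32(code + i) == kMfsprR12Lr) {
      candidates.push_back(image_base + uint32_t(i));  // likely function start
    }
  }
  return candidates;
}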
@@ -1926,7 +1928,7 @@ bool XexModule::FindSaveRest() {
       address += 2 * 4;
     }
   }
-  if (!cvars::disable_early_precompilation) {
+  if (cvars::enable_early_precompilation) {
     for (auto&& to_ensure_precompiled : resolve_on_exit) {
       // we want to make sure an address for these functions is available before
       // any other functions are compiled for code generation purposes but we do
@@ -29,23 +29,27 @@ constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1");
 constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
 constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');

 class Runtime;
 struct InfoCacheFlags {
   uint32_t was_resolved : 1;  // has this address ever been called/requested
                               // via resolvefunction?
   uint32_t accessed_mmio : 1;
   uint32_t is_syscall_func : 1;
-  uint32_t reserved : 29;
+  uint32_t is_return_site : 1;  // address can be reached from another function
+                                // by returning
+  uint32_t reserved : 28;
 };
 static_assert(sizeof(InfoCacheFlags) == 4,
               "InfoCacheFlags size should be equal to sizeof ppc instruction.");

 struct XexInfoCache {
-  //increment this to invalidate all user infocaches
-  static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 1;
+  // increment this to invalidate all user infocaches
+  static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 4;

   struct InfoCacheFlagsHeader {
     uint32_t version;

     unsigned char reserved[252];

     InfoCacheFlags* LookupFlags(unsigned offset) {
       return &reinterpret_cast<InfoCacheFlags*>(&this[1])[offset];
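All of the per-instruction booleans must pack into one 32-bit word for the static_assert above to hold, which is what keeps the cache at one record per instruction. A minimal mirror of that packing, runnable on its own (field names copied from the diff; everything else illustrative):

// Minimal mirror of the flag-packing idea: the bit widths must sum to at
// most 32 so the whole record stays instruction-sized (4 bytes).
#include <cstdint>

struct FlagsMirror {
  uint32_t was_resolved : 1;
  uint32_t accessed_mmio : 1;
  uint32_t is_syscall_func : 1;
  uint32_t is_return_site : 1;
  uint32_t reserved : 28;  // 1+1+1+1+28 == 32 bits total
};
static_assert(sizeof(FlagsMirror) == 4, "must stay instruction-sized");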
@@ -228,7 +232,8 @@ class XexModule : public xe::cpu::Module {

   InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);

   virtual void Precompile() override;

  protected:
   std::unique_ptr<Function> CreateFunction(uint32_t address) override;
@@ -1911,21 +1911,8 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
 void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                        uint32_t base,
                                                        uint32_t num_registers) {
-  RingBuffer::ReadRange range =
-      ring->BeginRead(num_registers * sizeof(uint32_t));
-
-  XE_LIKELY_IF(!range.second) {
-    uint32_t num_regs_firstrange =
-        static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
-
-    D3D12CommandProcessor::WriteRegistersFromMem(
-        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
-        num_regs_firstrange);
-    ring->EndRead(range);
-  }
-  else {
-    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
-  }
+  WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
+                                                       num_registers);
 }

 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
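BeginRead hands back up to two spans because a read may cross the end of the ring's backing store; a non-null second span is the wrapped tail, which is why the fast path checks !range.second and the slow path goes to the wraparound handler. A simplified sketch of that contract (not the actual xe::RingBuffer API):

// Simplified two-span ring read: if the request crosses the end of the
// backing buffer, the read comes back as two pieces (head + wrapped tail).
#include <cstddef>
#include <cstdint>

struct ReadRange {
  const uint8_t* first;
  std::size_t first_length;
  const uint8_t* second;  // null unless the read wrapped
  std::size_t second_length;
};

struct Ring {
  const uint8_t* data;
  std::size_t capacity;
  std::size_t read_offset;

  ReadRange BeginRead(std::size_t count) {
    std::size_t until_end = capacity - read_offset;
    if (count <= until_end) {
      return {data + read_offset, count, nullptr, 0};  // contiguous fast path
    }
    // Wraparound: the tail continues at the start of the buffer.
    return {data + read_offset, until_end, data, count - until_end};
  }

  void EndRead(const ReadRange& range) {
    read_offset =
        (read_offset + range.first_length + range.second_length) % capacity;
  }
};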
@@ -2042,7 +2029,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound(
   RingBuffer::ReadRange range =
       ring->BeginRead(num_registers * sizeof(uint32_t));
-
   XE_LIKELY_IF(!range.second) {
     WriteRegisterRangeFromMem_WithKnownBound<register_lower_bound,
                                              register_upper_bound>(
@@ -2710,9 +2696,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   }

   if (vfetch_current_queued) {
-    // so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though
-    // pre-acquire the critical region so we're not repeatedly re-acquiring it
-    // in requestrange
+    // so far, i have never seen vfetch_current_queued > 4. 1 is most common,
+    // 2 happens occasionally. did not test many games though pre-acquire the
+    // critical region so we're not repeatedly re-acquiring it in requestrange
     auto shared_memory_request_range_hoisted =
         global_critical_region::Acquire();
@@ -4351,7 +4337,8 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
+        float_constant_map_entry =
+            xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
                           (float_constant_index << 2)]
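The float-constant loop above is the standard iterate-set-bits idiom: find the lowest set bit, use its index, then clear it with x & (x - 1). The same idiom in portable C++20, with std::countr_zero standing in for xe::bit_scan_forward:

// Iterate the set bits of a mask, lowest first. Clearing the lowest set
// bit is x & (x - 1); countr_zero gives the index of that bit.
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t mask = 0b1010010;  // bits 1, 4, 6 set
  while (mask) {
    unsigned index = std::countr_zero(mask);
    mask &= mask - 1;  // clear lowest set bit
    printf("bit %u set\n", index);
  }
}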
@@ -4382,7 +4369,8 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
+        float_constant_map_entry =
+            xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
                           (float_constant_index << 2)]
@@ -41,10 +41,23 @@ DECLARE_XAM_EXPORT1(XamEnableInactivityProcessing, kInput, kStub);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetcapabilities(v=vs.85).aspx
 dword_result_t XamInputGetCapabilities_entry(
-    dword_t user_index, dword_t flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+    dword_t user_index, dword_t _flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+  unsigned flags = _flags;
+  // chrispy: actually, it appears that caps is never checked for null, it is
+  // memset at the start regardless
   if (!caps) {
     return X_ERROR_BAD_ARGUMENTS;
   }
+  if ((flags & 0x40000000) != 0) {
+    // should trap
+  }
+
+  if ((flags & 4) != 0) {
+    // should trap
+  }
+  if (!flags) {
+    flags = 3;
+  }

   if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
     // Ignore any query for other types of devices.
@@ -118,7 +131,7 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
 DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
-dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
+dword_result_t XamInputSetState_entry(dword_t user_index,
+                                      dword_t flags, /* flags, as far as i can see, is not used */
                                       pointer_t<X_INPUT_VIBRATION> vibration) {
   if (user_index >= 4) {
     return X_E_DEVICE_NOT_CONNECTED;
@@ -508,7 +508,16 @@ dword_result_t RtlInitializeCriticalSectionAndSpinCount_entry(
 DECLARE_XBOXKRNL_EXPORT1(RtlInitializeCriticalSectionAndSpinCount, kNone,
                          kImplemented);

+static void CriticalSectionPrefetchW(const void* vp) {
+#if XE_ARCH_AMD64 == 1
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
+    swcache::PrefetchW(vp);
+  }
+#endif
+}
+
 void RtlEnterCriticalSection_entry(pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t cur_thread = XThread::GetCurrentThread()->guest_object();
   uint32_t spin_count = cs->header.absolute * 256;
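PREFETCHW requests the target cache line in exclusive (writable) state ahead of the CAS, avoiding a second ownership transition when the locked write lands. A hedged sketch with compiler builtins (__builtin_prefetch is the GCC/Clang spelling; the amd64 feature-flag gate above is Xenia-specific):

// Sketch: hint the line for writing, then attempt the compare-and-swap.
// __builtin_prefetch(p, 1, ...) is the GCC/Clang write-prefetch hint; on
// MSVC, _m_prefetchw is the closest equivalent.
#include <atomic>
#include <cstdint>

static bool TryAcquire(std::atomic<uint32_t>* lock, uint32_t owner) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(lock, /*rw=*/1, /*locality=*/3);
#endif
  uint32_t expected = 0;  // unowned
  return lock->compare_exchange_strong(expected, owner,
                                       std::memory_order_acquire);
}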
@@ -544,6 +553,7 @@ DECLARE_XBOXKRNL_EXPORT2(RtlEnterCriticalSection, kNone, kImplemented,

 dword_result_t RtlTryEnterCriticalSection_entry(
     pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t thread = XThread::GetCurrentThread()->guest_object();

   if (xe::atomic_cas(-1, 0, &cs->lock_count)) {
@@ -7,6 +7,7 @@
 ******************************************************************************
 */

+#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include <algorithm>
 #include <vector>
 #include "xenia/base/atomic.h"
@@ -18,7 +19,6 @@
 #include "xenia/kernel/user_module.h"
 #include "xenia/kernel/util/shim_utils.h"
 #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
-#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include "xenia/kernel/xevent.h"
 #include "xenia/kernel/xmutant.h"
 #include "xenia/kernel/xsemaphore.h"
@@ -165,8 +165,16 @@ dword_result_t NtResumeThread_entry(dword_t handle,
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);

   if (thread) {
-    result = thread->Resume(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      result = thread->Resume(&suspend_count);
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
   } else {
     return X_STATUS_INVALID_HANDLE;
   }
   if (suspend_count_ptr) {
     *suspend_count_ptr = suspend_count;
@@ -190,15 +198,27 @@ dword_result_t KeResumeThread_entry(lpvoid_t thread_ptr) {
 DECLARE_XBOXKRNL_EXPORT1(KeResumeThread, kThreading, kImplemented);

 dword_result_t NtSuspendThread_entry(dword_t handle,
-                                     lpdword_t suspend_count_ptr) {
+                                     lpdword_t suspend_count_ptr,
+                                     const ppc_context_t& context) {
   X_RESULT result = X_STATUS_SUCCESS;
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);
   if (thread) {
-    result = thread->Suspend(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      auto current_pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);
+
+      if (current_pcr->current_thread == thread->guest_object() ||
+          !thread->guest_object<X_KTHREAD>()->terminated) {
+        result = thread->Suspend(&suspend_count);
+      } else {
+        return X_STATUS_THREAD_IS_TERMINATING;
+      }
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
   } else {
-    result = X_STATUS_INVALID_HANDLE;
+    return X_STATUS_INVALID_HANDLE;
   }

   if (suspend_count_ptr) {
@@ -213,23 +233,23 @@ void KeSetCurrentStackPointers_entry(lpvoid_t stack_ptr,
                                      pointer_t<X_KTHREAD> thread,
                                      lpvoid_t stack_alloc_base,
                                      lpvoid_t stack_base,
-                                     lpvoid_t stack_limit) {
+                                     lpvoid_t stack_limit, const ppc_context_t& context) {
   auto current_thread = XThread::GetCurrentThread();
-  auto context = current_thread->thread_state()->context();
-  auto pcr = kernel_memory()->TranslateVirtual<X_KPCR*>(
-      static_cast<uint32_t>(context->r[13]));

+  auto pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);

   thread->stack_alloc_base = stack_alloc_base.value();
   thread->stack_base = stack_base.value();
   thread->stack_limit = stack_limit.value();
   pcr->stack_base_ptr = stack_base.guest_address();
   pcr->stack_end_ptr = stack_limit.guest_address();
   context->r[1] = stack_ptr.guest_address();

+  // If a fiber is set, and the thread matches, reenter to avoid issues with
+  // host stack overflowing.
+  if (thread->fiber_ptr &&
+      current_thread->guest_object() == thread.guest_address()) {
+    context->processor->backend()->PrepareForReentry(context.value());
+    current_thread->Reenter(static_cast<uint32_t>(context->lr));
+  }
 }
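The reentry above exists because naively emulating a guest stack switch would keep deepening the host thread's own call stack. One conceptual illustration, heavily simplified, using setjmp/longjmp to unwind back to a fixed dispatch point (Xenia uses its backend's fiber machinery, not this literal mechanism):

// Conceptual sketch only: unwind the host stack to a fixed reentry point
// whenever the guest swaps stacks, instead of recursing deeper.
#include <csetjmp>
#include <cstdint>
#include <cstdio>

static std::jmp_buf g_reentry_point;
static uint32_t g_resume_guest_pc;

static void EmulateFrom(uint32_t guest_pc) {
  printf("executing guest code at %08X\n", guest_pc);
}

static void OnGuestStackSwitch(uint32_t return_pc) {
  g_resume_guest_pc = return_pc;
  std::longjmp(g_reentry_point, 1);  // discard the deep host frames
}

int main() {
  if (setjmp(g_reentry_point)) {
    // Re-entered with a fresh (shallow) host stack.
    EmulateFrom(g_resume_guest_pc);
    return 0;
  }
  EmulateFrom(0x82000000);
  OnGuestStackSwitch(0x82001000);
}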
@@ -1018,7 +1038,8 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
   assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));

   PrefetchForCAS(lock);
-  while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
+  while (!xe::atomic_cas(
+      0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
 #if XE_ARCH_AMD64 == 1
     // todo: this is just a nop if they don't have SMT, which is not great
     // either...
@@ -1038,7 +1059,8 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
   assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
   PrefetchForCAS(lock);
-  if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
+  if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])),
+                      lock)) {
     return 0;
   }
   return 1;
@@ -1281,7 +1303,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);

 void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                            const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count) {
@@ -1318,7 +1341,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,

 void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                         const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count ||
@@ -33,8 +33,15 @@ DEFINE_bool(ignore_thread_priorities, true,
 DEFINE_bool(ignore_thread_affinities, true,
             "Ignores game-specified thread affinities.", "Kernel");

+#if 0
+DEFINE_int64(stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+DEFINE_int64(main_xthread_stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+#endif
 namespace xe {
 namespace kernel {

 const uint32_t XAPC::kSize;
 const uint32_t XAPC::kDummyKernelRoutine;
@@ -373,8 +380,23 @@ X_STATUS XThread::Create() {
   RetainHandle();

   xe::threading::Thread::CreationParameters params;
-  params.stack_size = 16_MiB;  // Allocate a big host stack.
   params.create_suspended = true;

+#if 0
+  uint64_t stack_size_mult = cvars::stack_size_multiplier_hack;
+
+  if (main_thread_) {
+    stack_size_mult =
+        static_cast<uint64_t>(cvars::main_xthread_stack_size_multiplier_hack);
+  }
+#else
+  uint64_t stack_size_mult = 1;
+#endif
+  params.stack_size = 16_MiB * stack_size_mult;  // Allocate a big host stack.
   thread_ = xe::threading::Thread::Create(params, [this]() {
     // Set thread ID override. This is used by logging.
     xe::threading::set_current_thread_id(handle());
@@ -433,6 +455,9 @@ X_STATUS XThread::Create() {
 X_STATUS XThread::Exit(int exit_code) {
   // This may only be called on the thread itself.
   assert_true(XThread::GetCurrentThread() == this);
+  // TODO(chrispy): not sure if this order is correct, should it come after
+  // apcs?
+  guest_object<X_KTHREAD>()->terminated = 1;

   // TODO(benvanik): dispatch events? waiters? etc?
   RundownAPCs();
@@ -121,7 +121,7 @@ struct X_KTHREAD {
   uint8_t unk_B4[0x8];     // 0xB4
   uint8_t suspend_count;   // 0xBC
   uint8_t unk_BD;          // 0xBD
-  uint8_t unk_BE;          // 0xBE
+  uint8_t terminated;      // 0xBE
   uint8_t current_cpu;     // 0xBF
   uint8_t unk_C0[0x10];    // 0xC0
   xe::be<uint32_t> stack_alloc_base;  // 0xD0
@@ -316,8 +316,8 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
-//clang does not like non-standard layout offsetof
-#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0
+// clang does not like non-standard layout offsetof
+#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #define HEAP_INDEX(name) \
@@ -359,7 +359,6 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #else
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
-
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {
@@ -964,6 +963,14 @@ bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size,

   return true;
 }
+template <typename T>
+static inline T QuickMod(T value, uint32_t modv) {
+  if (xe::is_pow2(modv)) {
+    return value & (modv - 1);
+  } else {
+    return value % modv;
+  }
+}

 bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
                           uint32_t size, uint32_t alignment,
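QuickMod falls back to real modulo only when the divisor is not a power of two; for pow2 divisors, x % m == x & (m - 1). A quick standalone check of that equivalence (std::has_single_bit standing in for xe::is_pow2):

// x % m == x & (m - 1) holds exactly when m is a power of two.
#include <bit>
#include <cassert>
#include <cstdint>

template <typename T>
static T QuickMod(T value, uint32_t modv) {
  return std::has_single_bit(modv) ? value & (modv - 1) : value % modv;
}

int main() {
  for (uint32_t x = 0; x < 1000; ++x) {
    assert(QuickMod(x, 8u) == x % 8);    // pow2 path: single AND
    assert(QuickMod(x, 12u) == x % 12);  // general path: real modulo
  }
}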
@@ -976,8 +983,9 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   low_address = std::max(heap_base_, xe::align(low_address, alignment));
   high_address = std::min(heap_base_ + (heap_size_ - 1),
                           xe::align(high_address, alignment));
-  uint32_t low_page_number = (low_address - heap_base_) / page_size_;
-  uint32_t high_page_number = (high_address - heap_base_) / page_size_;
+
+  uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
+  uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
   low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number);
   high_page_number =
       std::min(uint32_t(page_table_.size()) - 1, high_page_number);
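Since page_size_ is a power of two, dividing by it and shifting right by page_size_shift_ are interchangeable for unsigned values; the shift form just bakes in log2(page_size_). A small sketch of deriving and validating the shift (assuming the member is initialized from the page size somewhere; std::countr_zero used here):

// For pow2 page sizes, division by page_size equals a right shift by
// log2(page_size); countr_zero of a pow2 value is exactly that log2.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t page_size = 0x1000;                             // 4 KiB
  const uint32_t page_size_shift = std::countr_zero(page_size);  // 12

  uint32_t heap_base = 0x40000000;
  uint32_t address = 0x40005234;
  assert(((address - heap_base) >> page_size_shift) ==
         ((address - heap_base) / page_size));  // both yield page 5
}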
@@ -995,8 +1003,10 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   // TODO(benvanik): optimized searching (free list buckets, bitmap, etc).
   uint32_t start_page_number = UINT_MAX;
   uint32_t end_page_number = UINT_MAX;
-  uint32_t page_scan_stride = alignment / page_size_;
-  high_page_number = high_page_number - (high_page_number % page_scan_stride);
+  // chrispy: todo, page_scan_stride is probably always a power of two...
+  uint32_t page_scan_stride = alignment >> page_size_shift_;
+  high_page_number =
+      high_page_number - QuickMod(high_page_number, page_scan_stride);
   if (top_down) {
     for (int64_t base_page_number =
              high_page_number - xe::round_up(page_count, page_scan_stride);
@@ -1024,7 +1034,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
           base_page_number = -1;
         } else {
           base_page_number = page_number - page_count;
-          base_page_number -= base_page_number % page_scan_stride;
+          base_page_number -= QuickMod(base_page_number, page_scan_stride);
           base_page_number += page_scan_stride;  // cancel out loop logic
         }
         break;
@@ -1072,7 +1082,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) {
     // Out of memory.
     XELOGE("BaseHeap::Alloc failed to find contiguous range");
-    //assert_always("Heap exhausted!");
+    // assert_always("Heap exhausted!");
     return false;
   }
@@ -1084,15 +1094,15 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
             ? xe::memory::AllocationType::kCommit
             : xe::memory::AllocationType::kReserve;
     void* result = xe::memory::AllocFixed(
-        TranslateRelative(start_page_number * page_size_),
-        page_count * page_size_, alloc_type, ToPageAccess(protect));
+        TranslateRelative(start_page_number << page_size_shift_),
+        page_count << page_size_shift_, alloc_type, ToPageAccess(protect));
     if (!result) {
       XELOGE("BaseHeap::Alloc failed to alloc range from host");
       return false;
     }

     if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) {
-      std::memset(result, 0xCD, page_count * page_size_);
+      std::memset(result, 0xCD, page_count << page_size_shift_);
     }
   }
@@ -1108,7 +1118,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
     unreserved_page_count_--;
   }

-  *out_address = heap_base_ + (start_page_number * page_size_);
+  *out_address = heap_base_ + (start_page_number << page_size_shift_);
   return true;
 }
@@ -1719,8 +1729,7 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
   uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
   uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);

-  uint32_t guest_one =
-      SystemPagenumToGuestPagenum(1);
+  uint32_t guest_one = SystemPagenumToGuestPagenum(1);

   uint32_t system_one = GuestPagenumToSystemPagenum(1);
   for (; i <= system_page_last; ++i) {
@@ -1755,7 +1764,6 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
 #endif

       uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
-      // swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
       xe::memory::PageAccess current_page_access =
           ToPageAccess(page_table_ptr[guest_page_number].current_protect);
       bool protect_system_page = false;
@@ -19,11 +19,96 @@

 DEFINE_bool(enable_console, false, "Open a console window with the main window",
             "General");
+#if XE_ARCH_AMD64 == 1
+DEFINE_bool(enable_rdrand_ntdll_patch, true,
+            "Hot-patches ntdll at the start of the process to not use rdrand "
+            "as part of the RNG for heap randomization. Can reduce CPU usage "
+            "significantly, but is untested on all Windows versions.",
+            "Win32");
+// begin ntdll hack
+#include <psapi.h>
+static bool g_didfailtowrite = false;
+static void write_process_memory(HANDLE process, uintptr_t offset,
+                                 unsigned size, const unsigned char* bvals) {
+  if (!WriteProcessMemory(process, (void*)offset, bvals, size, nullptr)) {
+    if (!g_didfailtowrite) {
+      MessageBoxA(nullptr, "Failed to write to process!", "Failed", MB_OK);
+      g_didfailtowrite = true;
+    }
+  }
+}
+
+static const unsigned char pattern_cmp_processorfeature_28_[] = {
+    0x80, 0x3C, 0x25, 0x90,
+    0x02, 0xFE, 0x7F, 0x00};  // cmp byte ptr ds:7FFE0290h, 0
+static const unsigned char pattern_replacement[] = {
+    0x48, 0x39, 0xE4,             // cmp rsp, rsp = always Z
+    0x0F, 0x1F, 0x44, 0x00, 0x00  // 5-byte nop
+};
+static void patch_ntdll_instance(HANDLE process, uintptr_t ntdll_base) {
+  MODULEINFO modinfo;
+
+  GetModuleInformation(process, (HMODULE)ntdll_base, &modinfo,
+                       sizeof(MODULEINFO));
+
+  std::vector<uintptr_t> possible_places{};
+
+  unsigned char* strt = (unsigned char*)modinfo.lpBaseOfDll;
+
+  // Stop the scan one pattern-length before the end of the image so the
+  // inner comparison cannot read past the mapping.
+  for (unsigned i = 0;
+       i + sizeof(pattern_cmp_processorfeature_28_) <= modinfo.SizeOfImage;
+       ++i) {
+    for (unsigned j = 0; j < sizeof(pattern_cmp_processorfeature_28_); ++j) {
+      if (strt[i + j] != pattern_cmp_processorfeature_28_[j]) {
+        goto miss;
+      }
+    }
+    possible_places.push_back((uintptr_t)(&strt[i]));
+  miss:;
+  }
+
+  for (auto&& place : possible_places) {
+    write_process_memory(process, place, sizeof(pattern_replacement),
+                         pattern_replacement);
+  }
+}
+
+static void do_ntdll_hack_this_process() {
+  patch_ntdll_instance(GetCurrentProcess(),
+                       (uintptr_t)GetModuleHandleA("ntdll.dll"));
+}
+#endif
+// end ntdll hack
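The signature scan above is a plain byte-pattern search over ntdll's in-memory image; the same thing can be expressed with std::search. A hedged alternative sketch (buffers illustrative; offsets are relative to the image base):

// Equivalent signature scan using std::search: find every occurrence of
// `pattern` in `image` and record its offset from the image base.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::size_t> FindPattern(const uint8_t* image,
                                     std::size_t image_size,
                                     const uint8_t* pattern,
                                     std::size_t pattern_size) {
  std::vector<std::size_t> hits;
  const uint8_t* end = image + image_size;
  const uint8_t* cursor = image;
  while (true) {
    const uint8_t* found =
        std::search(cursor, end, pattern, pattern + pattern_size);
    if (found == end) break;
    hits.push_back(std::size_t(found - image));
    cursor = found + 1;  // continue past this hit, like the nested loop above
  }
  return hits;
}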
+LONG _UnhandledExceptionFilter(_EXCEPTION_POINTERS* ExceptionInfo) {
+  PVOID exception_addr = ExceptionInfo->ExceptionRecord->ExceptionAddress;
+
+  DWORD64 last_stackpointer = ExceptionInfo->ContextRecord->Rsp;
+
+  DWORD64 last_rip = ExceptionInfo->ContextRecord->Rip;
+
+  DWORD except_code = ExceptionInfo->ExceptionRecord->ExceptionCode;
+
+  DWORD last_error = GetLastError();
+
+  // gs:[0x1250] is LastStatusValue in the x64 TEB (assumed offset).
+  NTSTATUS stat = __readgsdword(0x1250);
+
+  int last_errno_value = errno;
+
+  char except_message_buf[1024];
+
+  sprintf_s(except_message_buf,
+            "Exception encountered!\nException address: %p\nStackpointer: "
+            "%p\nInstruction pointer: %p\nExceptionCode: 0x%X\nLast Win32 "
+            "Error: 0x%X\nLast NTSTATUS: 0x%X\nLast errno value: 0x%X\n",
+            exception_addr, (void*)last_stackpointer, (void*)last_rip,
+            except_code, last_error, stat, last_errno_value);
+  MessageBoxA(nullptr, except_message_buf, "Unhandled Exception", MB_ICONERROR);
+  return EXCEPTION_CONTINUE_SEARCH;
+}
 int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
                     LPWSTR command_line, int show_cmd) {
   int result;

+  SetUnhandledExceptionFilter(_UnhandledExceptionFilter);
   {
     xe::ui::Win32WindowedAppContext app_context(hinstance, show_cmd);
     // TODO(Triang3l): Initialize creates a window. Set DPI awareness via the
@@ -40,13 +125,6 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
       return EXIT_FAILURE;
     }

-    // TODO(Triang3l): Rework this, need to initialize the console properly,
-    // disable has_console_attached_ by default in windowed apps, and attach
-    // only if needed.
-    if (cvars::enable_console) {
-      xe::AttachConsole();
-    }
-
     // Initialize COM on the UI thread with the apartment-threaded concurrency
     // model, so dialogs can be used.
     if (FAILED(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED))) {
@@ -55,8 +133,22 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,

     xe::InitializeWin32App(app->GetName());

-    result =
-        app->OnInitialize() ? app_context.RunMainMessageLoop() : EXIT_FAILURE;
+    if (app->OnInitialize()) {
+#if XE_ARCH_AMD64 == 1
+      if (cvars::enable_rdrand_ntdll_patch) {
+        do_ntdll_hack_this_process();
+      }
+#endif
+      // TODO(Triang3l): Rework this, need to initialize the console properly,
+      // disable has_console_attached_ by default in windowed apps, and attach
+      // only if needed.
+      if (cvars::enable_console) {
+        xe::AttachConsole();
+      }
+      result = app_context.RunMainMessageLoop();
+    } else {
+      result = EXIT_FAILURE;
+    }

     app->InvokeOnDestroy();
   }
@@ -61,6 +61,7 @@ typedef uint32_t X_STATUS;
 #define X_STATUS_OBJECT_NAME_COLLISION          ((X_STATUS)0xC0000035L)
 #define X_STATUS_INVALID_PAGE_PROTECTION        ((X_STATUS)0xC0000045L)
 #define X_STATUS_MUTANT_NOT_OWNED               ((X_STATUS)0xC0000046L)
+#define X_STATUS_THREAD_IS_TERMINATING          ((X_STATUS)0xC000004BL)
 #define X_STATUS_PROCEDURE_NOT_FOUND            ((X_STATUS)0xC000007AL)
 #define X_STATUS_INSUFFICIENT_RESOURCES         ((X_STATUS)0xC000009AL)
 #define X_STATUS_MEMORY_NOT_ALLOCATED           ((X_STATUS)0xC00000A0L)