diff --git a/.gitignore b/.gitignore index 34791bda2..27fe839a6 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,5 @@ node_modules/.bin/ /tools/shader-playground/*.dll /profile_print_times.py /profile_times.txt +/cache1 +/cache0 diff --git a/src/xenia/base/console_win.cc b/src/xenia/base/console_win.cc index 2549a46ed..612e195fc 100644 --- a/src/xenia/base/console_win.cc +++ b/src/xenia/base/console_win.cc @@ -35,13 +35,15 @@ static bool has_shell_environment_variable() { } void AttachConsole() { - bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE; + +bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE; +#if 0 if (!has_console || !has_shell_environment_variable()) { // We weren't launched from a console, so just return. has_console_attached_ = false; return; } - + #endif AllocConsole(); has_console_attached_ = true; diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 7b9063084..b1ab4d82b 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -410,34 +410,7 @@ static float ArchReciprocal(float den) { return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); } -#if 0 -using ArchFloatMask = float; - -XE_FORCEINLINE -static ArchFloatMask ArchCmpneqFloatMask(float x, float y) { - return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y))); -} -XE_FORCEINLINE -static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y))); -} -XE_FORCEINLINE -static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y))); -} - -XE_FORCEINLINE -static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) { - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y))); -} - -XE_FORCEINLINE -static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { - return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x))); -} - -constexpr ArchFloatMask floatmask_zero = .0f; -#else + using ArchFloatMask = __m128; XE_FORCEINLINE @@ -464,7 +437,7 @@ static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { } constexpr ArchFloatMask floatmask_zero{.0f}; -#endif + #else static float ArchMin(float x, float y) { return std::min(x, y); } static float ArchMax(float x, float y) { return std::max(x, y); } @@ -610,17 +583,17 @@ union IDivExtraInfo { } info; }; // returns magicnum multiplier -static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { - IDivExtraInfo extra; +static constexpr uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { + IDivExtraInfo extra{}; uint32_t d = _denom; - int p; - uint32_t nc, delta, q1, r1, q2, r2; + int p=0; + uint32_t nc=0, delta=0, q1=0, r1=0, q2=0, r2=0; struct { unsigned M; int a; int s; - } magu; + } magu{}; magu.a = 0; nc = -1 - ((uint32_t) - (int32_t)d) % d; p = 31; @@ -660,13 +633,13 @@ static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { return static_cast<uint32_t>(q2 + 1); } -static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, +static constexpr uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, uint32_t extradata) { - IDivExtraInfo extra; + IDivExtraInfo extra{}; extra.value_ = extradata; - uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32; + uint32_t result = static_cast<uint32_t>((static_cast<uint64_t>(num) * static_cast<uint64_t>(mul)) >> 32); if (extra.info.add_) { uint32_t addend = result + num; addend = ((addend < result ?
0x80000000 : 0) | addend); @@ -675,7 +648,7 @@ static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, return result >> extra.info.shift_; } -static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, +static constexpr uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, uint32_t extradata, uint32_t original) { uint32_t dived = ApplyUint32Div(num, mul, extradata); unsigned result = num - (dived * original); @@ -686,12 +659,12 @@ static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, struct MagicDiv { uint32_t multiplier_; uint32_t extradata_; - MagicDiv() : multiplier_(0), extradata_(0) {} - MagicDiv(uint32_t original) { + constexpr MagicDiv() : multiplier_(0), extradata_(0) {} + constexpr MagicDiv(uint32_t original) : MagicDiv() { multiplier_ = PregenerateUint32Div(original, extradata_); } - uint32_t Apply(uint32_t numerator) const { + constexpr uint32_t Apply(uint32_t numerator) const { return ApplyUint32Div(numerator, multiplier_, extradata_); } }; diff --git a/src/xenia/base/memory_win.cc b/src/xenia/base/memory_win.cc index 807e3911c..580e5fd05 100644 --- a/src/xenia/base/memory_win.cc +++ b/src/xenia/base/memory_win.cc @@ -28,6 +28,9 @@ namespace xe { namespace memory { size_t page_size() { +#if XE_ARCH_AMD64 == 1 + return 4096; +#else static size_t value = 0; if (!value) { SYSTEM_INFO si; @@ -35,9 +38,13 @@ size_t page_size() { value = si.dwPageSize; } return value; +#endif } size_t allocation_granularity() { +#if XE_ARCH_AMD64 == 1 && XE_PLATFORM_WIN32 == 1 + return 65536; +#else static size_t value = 0; if (!value) { SYSTEM_INFO si; @@ -45,6 +52,7 @@ size_t allocation_granularity() { value = si.dwAllocationGranularity; } return value; +#endif } DWORD ToWin32ProtectFlags(PageAccess access) { diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index a608f04b4..d342a05b5 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -37,7 +37,7 @@ #define XE_USE_NTDLL_FUNCTIONS 1 //chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine // -#define XE_USE_KUSER_SHARED 0 +#define XE_USE_KUSER_SHARED 1 #if XE_USE_NTDLL_FUNCTIONS == 1 /* ntdll versions of functions often skip through a lot of extra garbage in diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index fce3410d7..2e167b0f8 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -67,7 +67,22 @@ class Backend { // up until the start of ctx may be used by the backend to store whatever data // they want virtual void InitializeBackendContext(void* ctx) {} + + /* + Free any dynamically allocated data/resources that the backendcontext uses + */ + virtual void DeinitializeBackendContext(void* ctx) {} virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){}; + /* + called by KeSetCurrentStackPointers in xboxkrnl_threading.cc just prior + to calling XThread::Reenter this is an opportunity for a backend to clear any + data related to the guest stack + + in the case of the X64 backend, it means we reset the stackpoint index + to 0, since its a new stack and all of our old entries are invalid now + + * */ + virtual void PrepareForReentry(void* ctx) {} protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 7b3e63222..99576ea85 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -31,7 +31,16 @@ 
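// --- Illustrative sketch, not part of the patch: how the now-constexpr MagicDiv
// helpers from src/xenia/base/math.h above can be used. MagicDiv precomputes a
// magic multiplier for a fixed divisor so the later division becomes a
// multiply-high plus shift instead of a hardware divide. The divisor 80 and the
// wrapper name below are made-up example values; only MagicDiv/Apply come from
// the patch, and the xe namespace is assumed from the surrounding header.
// #include "xenia/base/math.h"
constexpr xe::MagicDiv kDivideBy80(80u);
// Same result as value / 80u, and usable in constant expressions now that the
// constructor and Apply() are constexpr.
constexpr uint32_t FastDivBy80(uint32_t value) {
  return kDivideBy80.Apply(value);
}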
DEFINE_bool(record_mmio_access_exceptions, true, "For guest addresses records whether we caught any mmio accesses " "for them. This info can then be used on a subsequent run to " "instruct the recompiler to emit checks", - "CPU"); + "x64"); + +DEFINE_int64(max_stackpoints, 65536, + "Max number of host->guest stack mappings we can record.", "x64"); + +DEFINE_bool(enable_host_guest_stack_synchronization, true, + "Records entries for guest/host stack mappings at function starts " + "and checks for reentry at return sites. Has slight performance " + "impact, but fixes crashes in games that use setjmp/longjmp.", + "x64"); #if XE_X64_PROFILER_AVAILABLE == 1 DECLARE_bool(instrument_call_times); #endif @@ -41,15 +50,29 @@ namespace cpu { namespace backend { namespace x64 { -class X64ThunkEmitter : public X64Emitter { +class X64HelperEmitter : public X64Emitter { public: - X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator); - ~X64ThunkEmitter() override; + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + }; + X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator); + ~X64HelperEmitter() override; HostToGuestThunk EmitHostToGuestThunk(); GuestToHostThunk EmitGuestToHostThunk(); ResolveFunctionThunk EmitResolveFunctionThunk(); + void* EmitGuestAndHostSynchronizeStackHelper(); + // 1 for loading byte, 2 for halfword and 4 for word. + // these specialized versions save space in the caller + void* EmitGuestAndHostSynchronizeStackSizeLoadThunk( + void* sync_func, unsigned stack_element_size); private: + void* EmitCurrentForOffsets(const _code_offsets& offsets, + size_t stack_size = 0); // The following four functions provide save/load functionality for registers. // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been // allocated on the stack. @@ -184,11 +207,26 @@ bool X64Backend::Initialize(Processor* processor) { // Generate thunks used to transition between jitted code and host code. XbyakAllocator allocator; - X64ThunkEmitter thunk_emitter(this, &allocator); + X64HelperEmitter thunk_emitter(this, &allocator); host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk(); guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk(); resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk(); + if (cvars::enable_host_guest_stack_synchronization) { + synchronize_guest_and_host_stack_helper_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackHelper(); + + synchronize_guest_and_host_stack_helper_size8_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 1); + synchronize_guest_and_host_stack_helper_size16_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 2); + synchronize_guest_and_host_stack_helper_size32_ = + thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( + synchronize_guest_and_host_stack_helper_, 4); + } + // Set the code cache to use the ResolveFunction thunk for default // indirections. 
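// --- Illustrative sketch, not part of the patch: the caller-side pattern the
// three size-load thunks created above exist for. The emitter calls the thunk
// whose inline literal is just wide enough to hold the caller's stack size and
// places that literal directly after the call, where the thunk reads it back.
// This mirrors X64Emitter::EnsureSynchronizedGuestAndHostStack() later in this
// diff; the free-function wrapper and its name are invented for illustration.
static void EmitSyncStackCall(X64Emitter& e, X64Backend* backend,
                              uint32_t stack_size) {
  if (stack_size < 256) {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1));
    e.db(stack_size);  // one-byte stack-size literal
  } else if (stack_size < 65536) {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2));
    e.dw(stack_size);  // two-byte literal
  } else {
    e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4));
    e.dd(stack_size);  // four-byte literal
  }
}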
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); @@ -203,9 +241,10 @@ bool X64Backend::Initialize(Processor* processor) { // Setup exception callback ExceptionHandler::Install(&ExceptionCallbackThunk, this); - - processor->memory()->SetMMIOExceptionRecordingCallback( - ForwardMMIOAccessForRecording, (void*)this); + if (cvars::record_mmio_access_exceptions) { + processor->memory()->SetMMIOExceptionRecordingCallback( + ForwardMMIOAccessForRecording, (void*)this); + } #if XE_X64_PROFILER_AVAILABLE == 1 if (cvars::instrument_call_times) { @@ -509,23 +548,32 @@ bool X64Backend::ExceptionCallback(Exception* ex) { return processor()->OnThreadBreakpointHit(ex); } -X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator) +X64HelperEmitter::X64HelperEmitter(X64Backend* backend, + XbyakAllocator* allocator) : X64Emitter(backend, allocator) {} -X64ThunkEmitter::~X64ThunkEmitter() {} +X64HelperEmitter::~X64HelperEmitter() {} +void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets, + size_t stack_size) { + EmitFunctionInfo func_info = {}; + func_info.code_size.total = getSize(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = getSize() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; -HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { + void* fn = Emplace(func_info); + return fn; +} +HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() { // rcx = target // rdx = arg0 (context) // r8 = arg1 (guest return address) - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -576,19 +624,13 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { return (HostToGuestThunk)fn; } -GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { +GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() { // rcx = target function // rdx = arg0 // r8 = arg1 // r9 = arg2 - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -635,17 +677,11 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { // X64Emitter handles actually resolving functions. 
uint64_t ResolveFunction(void* raw_context, uint64_t target_address); -ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { +ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() { // ebx = target PPC address // rcx = context - struct _code_offsets { - size_t prolog; - size_t prolog_stack_alloc; - size_t body; - size_t epilog; - size_t tail; - } code_offsets = {}; + _code_offsets code_offsets = {}; const size_t stack_size = StackLayout::THUNK_STACK_SIZE; @@ -688,8 +724,116 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { void* fn = Emplace(func_info); return (ResolveFunctionThunk)fn; } +// r11 = size of callers stack, r8 = return address w/ adjustment +//i'm not proud of this code, but it shouldn't be executed frequently at all +void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + mov(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth))); -void X64ThunkEmitter::EmitSaveVolatileRegs() { + lea(ecx, ptr[eax - 1]); + mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + + Xbyak::Label looper{}; + Xbyak::Label loopout{}; + Xbyak::Label signed_underflow{}; + xor_(r12d, r12d); + + //todo: should use Loop instruction here if hasFastLoop, + //currently xbyak does not support it but its super easy to modify xbyak to have it + L(looper); + imul(edx, ecx, sizeof(X64BackendStackpoint)); + mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]); + + cmp(r10d, r9d); + + jge(loopout, T_NEAR); + + inc(r12d); + + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + dec(ecx); + } else { + sub(ecx, 1); + } + js(signed_underflow, T_NEAR); // should be impossible!! + + + jmp(looper, T_NEAR); + L(loopout); + Xbyak::Label skip_adjust{}; + cmp(r12d, 1);//should never happen? + jle(skip_adjust, T_NEAR); + mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]); + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + inc(ecx); + } else { + add(ecx, 1); + } + + // this->DebugBreak(); + sub(rsp, r11); // adjust stack + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + ecx); // set next stackpoint index to be after the one we restored to + L(skip_adjust); + + jmp(r8); // return to caller + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + + L(signed_underflow); + //find a good, compact way to signal error here + // maybe an invalid opcode that we execute, then detect in an exception handler? 
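// --- Illustrative sketch, not part of the patch: a C-level model of what the
// code emitted by EmitGuestAndHostSynchronizeStackHelper above does at runtime,
// using the X64BackendContext/X64BackendStackpoint types this patch adds in
// x64_backend.h. guest_r1 and caller_stack_size stand in for the values the
// real helper receives in r9d and r11; the function itself is hypothetical.
static uint64_t SynchronizeStackModel(X64BackendContext* bctx, uint64_t host_rsp,
                                      uint32_t guest_r1,
                                      uint64_t caller_stack_size) {
  // Walk back from the newest stackpoint to the first one whose recorded guest
  // stack pointer is >= the current guest r1.
  uint32_t i = bctx->current_stackpoint_depth - 1;
  uint32_t frames_skipped = 0;
  while (bctx->stackpoints[i].guest_stack_ < guest_r1) {
    --i;  // the emitted code branches to an error path if this underflows
    ++frames_skipped;
  }
  if (frames_skipped > 1) {
    // Longjmp-style unwind: restore the host rsp recorded for that frame,
    // re-apply the caller's own frame allocation, and make the next stackpoint
    // push land just after the entry we restored to.
    host_rsp = bctx->stackpoints[i].host_stack_ - caller_stack_size;
    bctx->current_stackpoint_depth = i + 1;
  }
  return host_rsp;  // the real helper then jmps back to the return address in r8
}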
+ + this->DebugBreak(); + // stack unwinding, take first entry + //actually, no reason to have this + + /*mov(rsp, ptr[rbx + offsetof(X64BackendStackpoint, host_stack_)]); + mov(ptr[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r9d); + sub(rsp, r11); + xor_(eax, eax); + inc(eax); + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + eax); + + jmp(r8);*/ + // this->DebugBreak(); // err, add an xe::FatalError to call for this + + return EmitCurrentForOffsets(code_offsets); +} + +void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( + void* sync_func, unsigned stack_element_size) { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + pop(r8); // return address + + switch (stack_element_size) { + case 4: + mov(r11d, ptr[r8]); + break; + case 2: + movzx(r11d, word[r8]); + break; + case 1: + movzx(r11d, byte[r8]); + break; + } + add(r8, stack_element_size); + jmp(sync_func, T_NEAR); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} +void X64HelperEmitter::EmitSaveVolatileRegs() { // Save off volatile registers. // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); @@ -711,7 +855,7 @@ void X64ThunkEmitter::EmitSaveVolatileRegs() { vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); } -void X64ThunkEmitter::EmitLoadVolatileRegs() { +void X64HelperEmitter::EmitLoadVolatileRegs() { // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); @@ -732,7 +876,7 @@ void X64ThunkEmitter::EmitLoadVolatileRegs() { vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); } -void X64ThunkEmitter::EmitSaveNonvolatileRegs() { +void X64HelperEmitter::EmitSaveNonvolatileRegs() { mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx); mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp); #if XE_PLATFORM_WIN32 @@ -760,7 +904,7 @@ void X64ThunkEmitter::EmitSaveNonvolatileRegs() { #endif } -void X64ThunkEmitter::EmitLoadNonvolatileRegs() { +void X64HelperEmitter::EmitLoadNonvolatileRegs() { mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); #if XE_PLATFORM_WIN32 @@ -788,16 +932,41 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() { } void X64Backend::InitializeBackendContext(void* ctx) { X64BackendContext* bctx = BackendContextForGuestContext(ctx); - bctx->ResolveFunction_Ptr = reinterpret_cast(&ResolveFunction); bctx->mxcsr_fpu = DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the // rounding on ppc is at startup + + /* + todo: stackpoint arrays should be pooled virtual memory at the very + least there may be some fancy virtual address tricks we can do here + + */ + + bctx->stackpoints = cvars::enable_host_guest_stack_synchronization + ? 
new X64BackendStackpoint[cvars::max_stackpoints] + : nullptr; + bctx->current_stackpoint_depth = 0; bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR; bctx->flags = 0; // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); } +void X64Backend::DeinitializeBackendContext(void* ctx) { + X64BackendContext* bctx = BackendContextForGuestContext(ctx); + + if (bctx->stackpoints) { + delete[] bctx->stackpoints; + bctx->stackpoints = nullptr; + } +} + +void X64Backend::PrepareForReentry(void* ctx) { + X64BackendContext* bctx = BackendContextForGuestContext(ctx); + + bctx->current_stackpoint_depth = 0; +} + const uint32_t mxcsr_table[8] = { 0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80, }; diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index cb5a375ec..92ee0f7a4 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -24,7 +24,8 @@ #endif DECLARE_int64(x64_extension_mask); - +DECLARE_int64(max_stackpoints); +DECLARE_bool(enable_host_guest_stack_synchronization); namespace xe { class Exception; } // namespace xe @@ -41,14 +42,25 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); typedef void (*ResolveFunctionThunk)(); +struct X64BackendStackpoint { + uint64_t host_stack_; + unsigned guest_stack_; + // pad to 16 bytes so we never end up having a 64 bit load/store for + // host_stack_ straddling two lines. Consider this field reserved for future + // use + unsigned unused_; +}; // located prior to the ctx register // some things it would be nice to have be per-emulator instance instead of per // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { - void* ResolveFunction_Ptr; // cached pointer to resolvefunction + // guest_tick_count is used if inline_loadclock is used uint64_t* guest_tick_count; + // records mapping of host_stack to guest_stack + X64BackendStackpoint* stackpoints; + unsigned int current_stackpoint_depth; unsigned int mxcsr_fpu; // currently, the way we implement rounding mode // affects both vmx and the fpu unsigned int mxcsr_vmx; @@ -81,6 +93,19 @@ class X64Backend : public Backend { return resolve_function_thunk_; } + void* synchronize_guest_and_host_stack_helper() const { + return synchronize_guest_and_host_stack_helper_; + } + void* synchronize_guest_and_host_stack_helper_for_size(size_t sz) const { + switch (sz) { + case 1: + return synchronize_guest_and_host_stack_helper_size8_; + case 2: + return synchronize_guest_and_host_stack_helper_size16_; + default: + return synchronize_guest_and_host_stack_helper_size32_; + } + } bool Initialize(Processor* processor) override; void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override; @@ -97,7 +122,8 @@ class X64Backend : public Backend { void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override; void UninstallBreakpoint(Breakpoint* breakpoint) override; virtual void InitializeBackendContext(void* ctx) override; - + virtual void DeinitializeBackendContext(void* ctx) override; + virtual void PrepareForReentry(void* ctx) override; X64BackendContext* BackendContextForGuestContext(void* ctx) { return reinterpret_cast( reinterpret_cast(ctx) - sizeof(X64BackendContext)); @@ -120,7 +146,12 @@ class X64Backend : public 
Backend { HostToGuestThunk host_to_guest_thunk_; GuestToHostThunk guest_to_host_thunk_; ResolveFunctionThunk resolve_function_thunk_; + void* synchronize_guest_and_host_stack_helper_ = nullptr; + // loads stack sizes 1 byte, 2 bytes or 4 bytes + void* synchronize_guest_and_host_stack_helper_size8_ = nullptr; + void* synchronize_guest_and_host_stack_helper_size16_ = nullptr; + void* synchronize_guest_and_host_stack_helper_size32_ = nullptr; #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; #endif diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index bc9224ab6..463b245d0 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -213,6 +213,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { func_info.stack_size = stack_size; stack_size_ = stack_size; + PushStackpoint(); sub(rsp, (uint32_t)stack_size); code_offsets.prolog_stack_alloc = getSize(); @@ -271,6 +272,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { */ // Body. auto block = builder->first_block(); + synchronize_stack_on_next_instruction_ = false; while (block) { ForgetMxcsrMode(); // at start of block, mxcsr mode is undefined @@ -287,6 +289,12 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { // Process instructions. const Instr* instr = block->instr_head; while (instr) { + if (synchronize_stack_on_next_instruction_) { + if (instr->GetOpcodeNum() != hir::OPCODE_SOURCE_OFFSET) { + synchronize_stack_on_next_instruction_ = false; + EnsureSynchronizedGuestAndHostStack(); + } + } const Instr* new_tail = instr; if (!SelectSequence(this, instr, &new_tail)) { // No sequence found! @@ -314,6 +322,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { EmitProfilerEpilogue(); add(rsp, (uint32_t)stack_size); + PopStackpoint(); ret(); // todo: do some kind of sorting by alignment? for (auto&& tail_item : tail_code_) { @@ -453,12 +462,186 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { - auto thread_state = - reinterpret_cast(raw_context)->thread_state; + auto guest_context = reinterpret_cast(raw_context); + + auto thread_state = guest_context->thread_state; // TODO(benvanik): required? assert_not_zero(target_address); + /* + todo: refactor this! + + The purpose of this code is to allow guest longjmp to call into + the body of an existing host function. There are a lot of conditions we + have to check here to ensure that we do not mess up a normal call to a + function + + The address must be within an XexModule (may need to make some changes + to instructionaddressflags to remove this limitation) The target address + must be a known return site. The guest address must be part of a function + that was already translated. 
+ + */ + + if (cvars::enable_host_guest_stack_synchronization) { + auto processor = thread_state->processor(); + auto module_for_address = + processor->LookupModule(static_cast<uint32_t>(target_address)); + + if (module_for_address) { + XexModule* xexmod = dynamic_cast<XexModule*>(module_for_address); + if (xexmod) { + InfoCacheFlags* flags = xexmod->GetInstructionAddressFlags( + static_cast<uint32_t>(target_address)); + if (flags) { + if (flags->is_return_site) { + auto ones_with_address = processor->FindFunctionsWithAddress( + static_cast<uint32_t>(target_address)); + if (ones_with_address.size() != 0) { + // this loop to find a host address for the guest address is + // necessary because FindFunctionsWithAddress works via a range + // check, but if the function consists of multiple blocks + // scattered around with "holes" of unreachable instructions in + // between, the instructions inside those holes will incorrectly + // be considered members of the function + + X64Function* candidate = nullptr; + uintptr_t host_address = 0; + for (auto&& entry : ones_with_address) { + X64Function* xfunc = static_cast<X64Function*>(entry); + + host_address = xfunc->MapGuestAddressToMachineCode( + static_cast<uint32_t>(target_address)); + // the host address does exist within the function, and it is + // not the start of the function; it is instead somewhere within + // its existing body + // i originally did not have this (xfunc->machine_code() != + // reinterpret_cast<const uint8_t*>(host_address)) condition + // here when i distributed builds for testing, no issues arose + // related to it but i wanted to be more explicit + if (host_address && + xfunc->machine_code() != + reinterpret_cast<const uint8_t*>(host_address)) { + candidate = xfunc; + break; + } + } + // we found an existing X64Function, and a return site within that + // function that has a host address w/ native code + if (candidate && host_address) { + X64Backend* backend = + static_cast<X64Backend*>(processor->backend()); + // grab the backend context; next we have to check whether the + // guest and host stack are out of sync. if they aren't, it's fine + // for the backend to create a new function for the guest + // address we're resolving. if they are, it means that the reason + // we're resolving this address is because context is being + // restored (probably by longjmp) + X64BackendContext* backend_context = + backend->BackendContextForGuestContext(guest_context); + + uint32_t current_stackpoint_index = + backend_context->current_stackpoint_depth; + + --current_stackpoint_index; + + X64BackendStackpoint* stackpoints = + backend_context->stackpoints; + + uint32_t current_guest_stackpointer = + static_cast<uint32_t>(guest_context->r[1]); + uint32_t num_frames_bigger = 0; + + /* + if the current guest stack pointer is bigger than the + recorded pointer for this stack that's fine, plenty of + functions restore the original stack pointer early + + if more than 1 is bigger... we're longjmping and sure of it at + this point (jumping to a return site that has already been + emitted) + */ + while (current_stackpoint_index != 0xFFFFFFFF) { + if (current_guest_stackpointer > + stackpoints[current_stackpoint_index].guest_stack_) { + --current_stackpoint_index; + ++num_frames_bigger; + + } else { + break; + } + } + /* + DEFINITELY a longjmp, return original + host address. 
returning the existing host address is going to + set off some extra machinery we have set up to support this + + to break it down, our caller (us being + this ResolveFunction that this comment is in) is + X64Backend::resolve_function_thunk_ which is implemented in + x64_backend.cc X64HelperEmitter::EmitResolveFunctionThunk, or + a call from the resolver table + + the x64 fastcall abi dictates that the + stack must always be 16 byte aligned. We select our stack + size for functions to ensure that we keep rsp aligned to 16 + bytes + + but by calling into the body of an + existing function we've pushed our return address onto the + stack (dont worry about this return address, it gets + discarded in a later step) + + this means that the stack is no longer + 16 byte aligned, (rsp % 16) now == 8, and this is the only + time outside of the prolog or epilog of a function that this + will be the case + + so, after all direct or indirect + function calls we set + X64Emitter::synchronize_stack_on_next_instruction_ to true. + On the next instruction that is not + OPCODE_SOURCE_OFFSET we will emit a check when we see + synchronize_stack_on_next_instruction_ is true. We have to + skip OPCODE_SOURCE_OFFSET because its not a "real" + instruction and if we emit on it the return address of the + function call will point to AFTER our check, so itll never be + executed. + + our check is just going to do test esp, + 15 to see if the stack is misaligned. (using esp instead of + rsp saves 1 byte). We tail emit the handling for when the + check succeeds because in 99.99999% of function calls it will + be aligned, in the end the runtime cost of these checks is 5 + bytes for the test instruction which ought to be one cycle + and 5 bytes for the jmp with no cycles taken for the jump + which will be predicted not taken. + + Our handling for the check is implemented in X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper. we don't call it directly though, + instead we go through backend()->synchronize_guest_and_host_stack_helper_for_size(num_bytes_needed_to_represent_stack_size). we place the stack size after the + call instruction so we can load it in the helper and readjust the return address to point after the literal value. + + The helper is going to search the array of stackpoints to find the first one that is greater than or equal to the current stack pointer, when it finds + the entry it will set the currently host rsp to the host stack pointer value in the entry, and then subtract the stack size of the caller from that. + the current stackpoint index is adjusted to point to the one after the stackpoint we restored to. + + The helper then jumps back to the function that was longjmp'ed to, with the host stack in its proper state. it just works! 
+ + + + */ + + if (num_frames_bigger > 1) { + return host_address; + } + } + } + } + } + } + } + } auto fn = thread_state->processor()->ResolveFunction( static_cast(target_address)); assert_not_null(fn); @@ -479,7 +662,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call((void*)fn->machine_code()); - + synchronize_stack_on_next_instruction_ = true; } else { // tail call EmitTraceUserCallReturn(); @@ -488,8 +671,10 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp((void*)fn->machine_code(), T_NEAR); } + return; } else if (code_cache_->has_indirection_table()) { // Load the pointer to the indirection table maintained in X64CodeCache. @@ -513,12 +698,14 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp(rax); } else { // Return address is from the previous SET_RETURN_ADDRESS. mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call(rax); + synchronize_stack_on_next_instruction_ = true; } } @@ -557,12 +744,14 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); + PopStackpoint(); jmp(rax); } else { // Return address is from the previous SET_RETURN_ADDRESS. mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); call(rax); + synchronize_stack_on_next_instruction_ = true; } } @@ -1458,6 +1647,126 @@ Xbyak::Address X64Emitter::GetBackendFlagsPtr() const { pt.setBit(32); return pt; } + +void X64Emitter::HandleStackpointOverflowError(ppc::PPCContext* context) { + // context->lr + // todo: show lr in message? + xe::FatalError( + "Overflowed stackpoints! 
Please report this error for this title to " + "Xenia developers."); +} + +void X64Emitter::PushStackpoint() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // push the current host and guest stack pointers + // this is done before a stack frame is set up or any guest instructions are + // executed this code is probably the most intrusive part of the stackpoint + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + mov(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth))); + + mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + + imul(r9d, eax, sizeof(X64BackendStackpoint)); + add(rbx, r9); + + mov(qword[rbx + offsetof(X64BackendStackpoint, host_stack_)], rsp); + mov(dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r8d); + if (IsFeatureEnabled(kX64FlagsIndependentVars)) { + inc(eax); + } else { + add(eax, 1); + } + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + eax); + + cmp(eax, (uint32_t)cvars::max_stackpoints); + + Xbyak::Label& overflowed_stackpoints = + AddToTail([](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + // we never subtracted anything from rsp, so our stack is misaligned and + // will fault in guesttohostthunk + // e.sub(e.rsp, 8); + e.push(e.rax); // easier realign, 1 byte opcode vs 4 bytes for sub + + e.CallNativeSafe((void*)X64Emitter::HandleStackpointOverflowError); + }); + jge(overflowed_stackpoints, T_NEAR); +} +void X64Emitter::PopStackpoint() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // todo: maybe verify that rsp and r1 == the stackpoint? + Xbyak::Address stackpoint_pos_pointer = + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)); + stackpoint_pos_pointer.setBit(32); + dec(stackpoint_pos_pointer); +} + +void X64Emitter::EnsureSynchronizedGuestAndHostStack() { + if (!cvars::enable_host_guest_stack_synchronization) { + return; + } + // chrispy: keeping this old slower test here in case in the future changes + // need to be made + // that result in the stack not being 8 byte misaligned on context reentry + +#if 0 + Xbyak::Label skip{}; + mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]); + mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints))); + imul(eax, + GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)), + sizeof(X64BackendStackpoint)); + sub(eax, sizeof(X64BackendStackpoint)); + add(rbx, rax); + + cmp(r8d, dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)]); + jle(skip, T_NEAR); + Xbyak::Label skip{}; + mov(r11d, stack_size()); + call(backend_->synchronize_guest_and_host_stack_helper()); + L(skip); +#endif + + Xbyak::Label& return_from_sync = this->NewCachedLabel(); + + // if we got here somehow from setjmp or the like we ought to have a + // misaligned stack right now! this provides us with a very fast pretest for + // this condition + test(esp, 15); + + Xbyak::Label& sync_label = this->AddToTail( + [&return_from_sync](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + uint32_t stack32 = static_cast(e.stack_size()); + auto backend = e.backend(); + + if (stack32 < 256) { + e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1)); + e.db(stack32); + + } else if (stack32 < 65536) { + e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2)); + e.dw(stack32); + } else { + // ought to be impossible, a host stack bigger than 65536?? 
+ e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4)); + e.dd(stack32); + } + e.jmp(return_from_sync, T_NEAR); + }); + + jnz(sync_label, T_NEAR); + + L(return_from_sync); +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 155994bf9..4fdeab4a4 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -299,6 +299,11 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0); Xbyak::Label& NewCachedLabel(); + + void PushStackpoint(); + void PopStackpoint(); + + void EnsureSynchronizedGuestAndHostStack(); FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } @@ -381,13 +386,14 @@ class X64Emitter : public Xbyak::CodeGenerator { bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info); void EmitGetCurrentThreadId(); void EmitTraceUserCallReturn(); - + static void HandleStackpointOverflowError(ppc::PPCContext* context); protected: Processor* processor_ = nullptr; X64Backend* backend_ = nullptr; X64CodeCache* code_cache_ = nullptr; XbyakAllocator* allocator_ = nullptr; XexModule* guest_module_ = nullptr; + bool synchronize_stack_on_next_instruction_ = false; Xbyak::util::Cpu cpu_; uint64_t feature_flags_ = 0; uint32_t current_guest_function_ = 0; diff --git a/src/xenia/cpu/entry_table.cc b/src/xenia/cpu/entry_table.cc index 840706171..4b9181be7 100644 --- a/src/xenia/cpu/entry_table.cc +++ b/src/xenia/cpu/entry_table.cc @@ -56,6 +56,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) { if (entry) { // If we aren't ready yet spin and wait. if (entry->status == Entry::STATUS_COMPILING) { + // chrispy: i think this is dead code, if we are compiling we're holding + // the global lock, arent we? so we wouldnt be executing here // Still compiling, so spin. do { global_lock.unlock(); diff --git a/src/xenia/cpu/function.cc b/src/xenia/cpu/function.cc index ebd8c5ba1..828c5f94e 100644 --- a/src/xenia/cpu/function.cc +++ b/src/xenia/cpu/function.cc @@ -110,8 +110,13 @@ uint32_t GuestFunction::MapGuestAddressToMachineCodeOffset( uintptr_t GuestFunction::MapGuestAddressToMachineCode( uint32_t guest_address) const { auto entry = LookupGuestAddress(guest_address); - return reinterpret_cast(machine_code()) + - (entry ? entry->code_offset : 0); + + if (entry) { + return reinterpret_cast(machine_code()) + entry->code_offset; + } else { + return 0; + + } } uint32_t GuestFunction::MapMachineCodeToGuestAddress( diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 867651c32..b36c36e68 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -27,18 +27,13 @@ #include "xenia/cpu/ppc/ppc_frontend.h" #include "xenia/cpu/ppc/ppc_opcode_info.h" #include "xenia/cpu/processor.h" - +#include "xenia/cpu/xex_module.h" DEFINE_bool( break_on_unimplemented_instructions, true, "Break to the host debugger (or crash if no debugger attached) if an " "unimplemented PowerPC instruction is encountered.", "CPU"); -DEFINE_bool( - emit_useless_fpscr_updates, false, - "Emit useless fpscr update instructions (pre-10/30/2022 behavior). 
", - "CPU"); - namespace xe { namespace cpu { namespace ppc { @@ -94,8 +89,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) { function_ = function; start_address_ = function_->address(); - //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice - //i've also seen ones with a start and end address that are the same... + // chrispy: i've seen this one happen, not sure why but i think from trying to + // precompile twice i've also seen ones with a start and end address that are + // the same... assert_true(function_->address() <= function_->end_address()); instr_count_ = (function_->end_address() - function_->address()) / 4 + 1; @@ -250,7 +246,8 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) { } void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) { - //chrispy: label->name is unused, it would be nice to be able to remove the field and this code + // chrispy: label->name is unused, it would be nice to be able to remove the + // field and this code char name_buffer[13]; auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address); name_buffer[format_result.size] = '\0'; @@ -457,37 +454,38 @@ void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) { // TODO(benvanik): detect overflow and nan cases. // fx and vx are the most important. /* - chrispy: stubbed this out because right now all it does is waste - memory and CPU time + chrispy: i stubbed this out at one point because all it does is waste + memory and CPU time, however, this introduced issues with raiden + (substitute w/ titleid later) which probably means they stash stuff in the + fpscr? + */ - if (cvars::emit_useless_fpscr_updates) { - Value* fx = LoadConstantInt8(0); - Value* fex = LoadConstantInt8(0); - Value* vx = LoadConstantInt8(0); - Value* ox = LoadConstantInt8(0); - if (update_cr1) { - // Store into the CR1 field. - // We do this instead of just calling CopyFPSCRToCR1 so that we don't - // have to read back the bits and do shifting work. - StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); - StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); - StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); - StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); - } + Value* fx = LoadConstantInt8(0); + Value* fex = LoadConstantInt8(0); + Value* vx = LoadConstantInt8(0); + Value* ox = LoadConstantInt8(0); - // Generate our new bits. - Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); - new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); - new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); - new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); - - // Mix into fpscr while preserving sticky bits (FX and OX). - Value* bits = LoadFPSCR(); - bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); - StoreFPSCR(bits); + if (update_cr1) { + // Store into the CR1 field. + // We do this instead of just calling CopyFPSCRToCR1 so that we don't + // have to read back the bits and do shifting work. + StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); + StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); + StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); + StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); } + // Generate our new bits. 
+ Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); + new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); + new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); + new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); + + // Mix into fpscr while preserving sticky bits (FX and OX). + Value* bits = LoadFPSCR(); + bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); + StoreFPSCR(bits); } void PPCHIRBuilder::CopyFPSCRToCR1() { @@ -587,7 +585,24 @@ void PPCHIRBuilder::StoreReserved(Value* val) { Value* PPCHIRBuilder::LoadReserved() { return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE); } +void PPCHIRBuilder::SetReturnAddress(Value* value) { + /* + Record the address as being a possible target of a return. This is + needed for longjmp emulation. See x64_emitter.cc's ResolveFunction + */ + Module* mod = this->function_->module(); + if (value && value->IsConstant()) { + if (mod) { + XexModule* xexmod = dynamic_cast(mod); + if (xexmod) { + auto flags = xexmod->GetInstructionAddressFlags(value->AsUint32()); + flags->is_return_site = true; + } + } + } + HIRBuilder::SetReturnAddress(value); +} } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.h b/src/xenia/cpu/ppc/ppc_hir_builder.h index a7eb6fc4a..ad99b63d8 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.h +++ b/src/xenia/cpu/ppc/ppc_hir_builder.h @@ -80,7 +80,8 @@ class PPCHIRBuilder : public hir::HIRBuilder { void StoreReserved(Value* val); Value* LoadReserved(); - + //calls original impl in hirbuilder, but also records the is_return_site bit into flags in the guestmodule + void SetReturnAddress(Value* value); private: void MaybeBreakOnInstruction(uint32_t address); void AnnotateLabel(uint32_t address, Label* label); diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index aa9f83013..cb6105464 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -263,12 +263,11 @@ Function* Processor::ResolveFunction(uint32_t address) { return nullptr; } - if (!DemandFunction(function)) { entry->status = Entry::STATUS_FAILED; return nullptr; } - //only add it to the list of resolved functions if resolving succeeded + //only add it to the list of resolved functions if resolving succeeded auto module_for = function->module(); auto xexmod = dynamic_cast(module_for); @@ -291,23 +290,23 @@ Function* Processor::ResolveFunction(uint32_t address) { return nullptr; } } - +Module* Processor::LookupModule(uint32_t address) { + auto global_lock = global_critical_region_.Acquire(); + // TODO(benvanik): sort by code address (if contiguous) so can bsearch. + // TODO(benvanik): cache last module low/high, as likely to be in there. + for (const auto& module : modules_) { + if (module->ContainsAddress(address)) { + return module.get(); + } + } + return nullptr; +} Function* Processor::LookupFunction(uint32_t address) { // TODO(benvanik): fast reject invalid addresses/log errors. // Find the module that contains the address. - Module* code_module = nullptr; - { - auto global_lock = global_critical_region_.Acquire(); - // TODO(benvanik): sort by code address (if contiguous) so can bsearch. - // TODO(benvanik): cache last module low/high, as likely to be in there. - for (const auto& module : modules_) { - if (module->ContainsAddress(address)) { - code_module = module.get(); - break; - } - } - } + Module* code_module = LookupModule(address); + if (!code_module) { // No module found that could contain the address. 
return nullptr; diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index eaa958d3d..630cf4633 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -115,6 +115,7 @@ class Processor { void RemoveFunctionByAddress(uint32_t address); Function* LookupFunction(uint32_t address); + Module* LookupModule(uint32_t address); Function* LookupFunction(Module* module, uint32_t address); Function* ResolveFunction(uint32_t address); diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index fe9467dd8..08338a0a0 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -78,7 +78,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, // Allocate with 64b alignment. context_ = reinterpret_cast( - AllocateContext()); + AllocateContext()); processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -105,9 +105,9 @@ ThreadState::~ThreadState() { thread_state_ = nullptr; } if (context_) { + processor_->backend()->DeinitializeBackendContext(context_); FreeContext(reinterpret_cast(context_)); } - // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) { diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 449827cc5..4e997579f 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -38,9 +38,10 @@ DEFINE_bool(disable_instruction_infocache, false, "CPU"); DEFINE_bool( - disable_early_precompilation, false, - "Disables pre-compiling guest functions that we know we've called/that " - "we've recognized as being functions via simple heuristics.", + enable_early_precompilation, false, + "Enable pre-compiling guest functions that we know we've called/that " + "we've recognized as being functions via simple heuristics, good for error " + "finding/stress testing with the JIT", "CPU"); static const uint8_t xe_xex2_retail_key[16] = { @@ -1115,6 +1116,7 @@ void XexModule::Precompile() { if (!FindSaveRest()) { return; } + info_cache_.Init(this); PrecompileDiscoveredFunctions(); } @@ -1343,22 +1345,26 @@ void XexInfoCache::Init(XexModule* xexmod) { num_codebytes += 3; // round up to nearest multiple of 4 num_codebytes &= ~3; - bool did_exist = true; - if (!std::filesystem::exists(infocache_path)) { - recreate: - xe::filesystem::CreateEmptyFile(infocache_path); - did_exist = false; - } + auto try_open = [this, &infocache_path, num_codebytes]() { + bool did_exist = true; - // todo: prepopulate with stuff from pdata, dll exports + if (!std::filesystem::exists(infocache_path)) { + xe::filesystem::CreateEmptyFile(infocache_path); + did_exist = false; + } - this->executable_addr_flags_ = std::move(xe::MappedMemory::Open( - infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, - sizeof(InfoCacheFlagsHeader) + - (sizeof(InfoCacheFlags) * - (num_codebytes / - 4)))); // one infocacheflags entry for each PPC instr-sized addr + // todo: prepopulate with stuff from pdata, dll exports + this->executable_addr_flags_ = std::move(xe::MappedMemory::Open( + infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, + sizeof(InfoCacheFlagsHeader) + + (sizeof(InfoCacheFlags) * + (num_codebytes / + 4)))); // one infocacheflags entry for each PPC instr-sized addr + return did_exist; + }; + + bool did_exist = try_open(); if (!did_exist) { GetHeader()->version = CURRENT_INFOCACHE_VERSION; @@ -1366,7 +1372,7 @@ void XexInfoCache::Init(XexModule* xexmod) { if (GetHeader()->version != 
CURRENT_INFOCACHE_VERSION) { this->executable_addr_flags_->Close(); std::filesystem::remove(infocache_path); - goto recreate; + try_open(); } } } @@ -1380,7 +1386,7 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) { return info_cache_.LookupFlags(guest_addr); } void XexModule::PrecompileDiscoveredFunctions() { - if (cvars::disable_early_precompilation) { + if (!cvars::enable_early_precompilation) { return; } auto others = PreanalyzeCode(); @@ -1397,7 +1403,7 @@ void XexModule::PrecompileDiscoveredFunctions() { } } void XexModule::PrecompileKnownFunctions() { - if (cvars::disable_early_precompilation) { + if (!cvars::enable_early_precompilation) { return; } uint32_t start = 0; @@ -1435,18 +1441,14 @@ static bool IsOpcodeBL(unsigned w) { std::vector XexModule::PreanalyzeCode() { uint32_t low_8_aligned = xe::align(low_address_, 8); - - uint32_t highest_exec_addr = 0; for (auto&& sec : pe_sections_) { if ((sec.flags & kXEPESectionContainsCode)) { - - - highest_exec_addr = + highest_exec_addr = std::max(highest_exec_addr, sec.address + sec.size); - } + } } uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1); uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8; @@ -1476,7 +1478,7 @@ std::vector XexModule::PreanalyzeCode() { uint32_t mfspr_r12_lr32 = *reinterpret_cast(&mfspr_r12_lr[0]); - auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) { + auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) { funcstart_candidate_stack[stack_pos++] = addr; }; /* @@ -1926,7 +1928,7 @@ bool XexModule::FindSaveRest() { address += 2 * 4; } } - if (!cvars::disable_early_precompilation) { + if (cvars::enable_early_precompilation) { for (auto&& to_ensure_precompiled : resolve_on_exit) { // we want to make sure an address for these functions is available before // any other functions are compiled for code generation purposes but we do diff --git a/src/xenia/cpu/xex_module.h b/src/xenia/cpu/xex_module.h index bec6b7e0f..ded57c6f8 100644 --- a/src/xenia/cpu/xex_module.h +++ b/src/xenia/cpu/xex_module.h @@ -29,23 +29,27 @@ constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1"); constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2"); constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F'); - class Runtime; struct InfoCacheFlags { uint32_t was_resolved : 1; // has this address ever been called/requested // via resolvefunction? 
uint32_t accessed_mmio : 1; uint32_t is_syscall_func : 1; - uint32_t reserved : 29; + uint32_t is_return_site : 1; // address can be reached from another function + // by returning + uint32_t reserved : 28; }; +static_assert(sizeof(InfoCacheFlags) == 4, + "InfoCacheFlags size should be equal to sizeof ppc instruction."); + struct XexInfoCache { - //increment this to invalidate all user infocaches - static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 1; + // increment this to invalidate all user infocaches + static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 4; struct InfoCacheFlagsHeader { uint32_t version; - unsigned char reserved[252]; + unsigned char reserved[252]; InfoCacheFlags* LookupFlags(unsigned offset) { return &reinterpret_cast(&this[1])[offset]; @@ -228,7 +232,8 @@ class XexModule : public xe::cpu::Module { InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); - virtual void Precompile() override; + virtual void Precompile() override; + protected: std::unique_ptr CreateFunction(uint32_t address) override; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 62191477e..c99afd595 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1911,21 +1911,8 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { - RingBuffer::ReadRange range = - ring->BeginRead(num_registers * sizeof(uint32_t)); - - XE_LIKELY_IF(!range.second) { - uint32_t num_regs_firstrange = - static_cast(range.first_length / sizeof(uint32_t)); - - D3D12CommandProcessor::WriteRegistersFromMem( - base, reinterpret_cast(const_cast(range.first)), - num_regs_firstrange); - ring->EndRead(range); - } - else { - return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers); - } + WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, + num_registers); } template @@ -2042,7 +2029,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound( RingBuffer::ReadRange range = ring->BeginRead(num_registers * sizeof(uint32_t)); - XE_LIKELY_IF(!range.second) { WriteRegisterRangeFromMem_WithKnownBound( @@ -2710,9 +2696,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } if (vfetch_current_queued) { - // so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though - // pre-acquire the critical region so we're not repeatedly re-acquiring it - // in requestrange + // so far, i have never seen vfetch_current_queued > 4. 1 is most common, + // 2 happens occasionally. 
did not test many games though pre-acquire the + // critical region so we're not repeatedly re-acquiring it in requestrange auto shared_memory_request_range_hoisted = global_critical_region::Acquire(); @@ -4351,7 +4337,8 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); + float_constant_map_entry = + xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + (float_constant_index << 2)] @@ -4382,7 +4369,8 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); + float_constant_map_entry = + xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + (float_constant_index << 2)] diff --git a/src/xenia/kernel/xam/xam_input.cc b/src/xenia/kernel/xam/xam_input.cc index 242ee1cfa..f9d9fa40f 100644 --- a/src/xenia/kernel/xam/xam_input.cc +++ b/src/xenia/kernel/xam/xam_input.cc @@ -41,10 +41,23 @@ DECLARE_XAM_EXPORT1(XamEnableInactivityProcessing, kInput, kStub); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetcapabilities(v=vs.85).aspx dword_result_t XamInputGetCapabilities_entry( - dword_t user_index, dword_t flags, pointer_t caps) { + dword_t user_index, dword_t _flags, pointer_t caps) { + unsigned flags = _flags; + //chrispy: actually, it appears that caps is never checked for null, it is memset at the start regardless if (!caps) { return X_ERROR_BAD_ARGUMENTS; } + if ((flags & 0x40000000) != 0) { + //should trap + } + + if ((flags & 4) != 0) { + //should trap + } + if (!flags) { + flags = 3; + } + if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) { // Ignore any query for other types of devices. 
@@ -118,7 +131,7 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags, DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx -dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk, +dword_result_t XamInputSetState_entry(dword_t user_index, dword_t flags, /* flags, as far as i can see, is not used*/ pointer_t vibration) { if (user_index >= 4) { return X_E_DEVICE_NOT_CONNECTED; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc index fefe2df4e..928ec780f 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc @@ -508,7 +508,16 @@ dword_result_t RtlInitializeCriticalSectionAndSpinCount_entry( DECLARE_XBOXKRNL_EXPORT1(RtlInitializeCriticalSectionAndSpinCount, kNone, kImplemented); +static void CriticalSectionPrefetchW(const void* vp) { +#if XE_ARCH_AMD64 == 1 + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(vp); + } +#endif +} + void RtlEnterCriticalSection_entry(pointer_t cs) { + CriticalSectionPrefetchW(&cs->lock_count); uint32_t cur_thread = XThread::GetCurrentThread()->guest_object(); uint32_t spin_count = cs->header.absolute * 256; @@ -544,6 +553,7 @@ DECLARE_XBOXKRNL_EXPORT2(RtlEnterCriticalSection, kNone, kImplemented, dword_result_t RtlTryEnterCriticalSection_entry( pointer_t cs) { + CriticalSectionPrefetchW(&cs->lock_count); uint32_t thread = XThread::GetCurrentThread()->guest_object(); if (xe::atomic_cas(-1, 0, &cs->lock_count)) { diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 574a91585..14179939e 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -7,6 +7,7 @@ ****************************************************************************** */ +#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h" #include #include #include "xenia/base/atomic.h" @@ -18,7 +19,6 @@ #include "xenia/kernel/user_module.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h" -#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h" #include "xenia/kernel/xevent.h" #include "xenia/kernel/xmutant.h" #include "xenia/kernel/xsemaphore.h" @@ -165,8 +165,16 @@ dword_result_t NtResumeThread_entry(dword_t handle, uint32_t suspend_count = 0; auto thread = kernel_state()->object_table()->LookupObject(handle); + if (thread) { - result = thread->Resume(&suspend_count); + if (thread->type() == XObject::Type::Thread) { + result = thread->Resume(&suspend_count); + + } else { + return X_STATUS_OBJECT_TYPE_MISMATCH; + } + } else { + return X_STATUS_INVALID_HANDLE; } if (suspend_count_ptr) { *suspend_count_ptr = suspend_count; @@ -190,15 +198,27 @@ dword_result_t KeResumeThread_entry(lpvoid_t thread_ptr) { DECLARE_XBOXKRNL_EXPORT1(KeResumeThread, kThreading, kImplemented); dword_result_t NtSuspendThread_entry(dword_t handle, - lpdword_t suspend_count_ptr) { + lpdword_t suspend_count_ptr, + const ppc_context_t& context) { X_RESULT result = X_STATUS_SUCCESS; uint32_t suspend_count = 0; auto thread = kernel_state()->object_table()->LookupObject(handle); if (thread) { - result = thread->Suspend(&suspend_count); + if (thread->type() == XObject::Type::Thread) { + auto current_pcr = context->TranslateVirtualGPR(context->r[13]); + + if 
(current_pcr->current_thread == thread->guest_object() || + !thread->guest_object()->terminated) { + result = thread->Suspend(&suspend_count); + } else { + return X_STATUS_THREAD_IS_TERMINATING; + } + } else { + return X_STATUS_OBJECT_TYPE_MISMATCH; + } } else { - result = X_STATUS_INVALID_HANDLE; + return X_STATUS_INVALID_HANDLE; } if (suspend_count_ptr) { @@ -213,23 +233,23 @@ void KeSetCurrentStackPointers_entry(lpvoid_t stack_ptr, pointer_t thread, lpvoid_t stack_alloc_base, lpvoid_t stack_base, - lpvoid_t stack_limit) { + lpvoid_t stack_limit, const ppc_context_t& context) { auto current_thread = XThread::GetCurrentThread(); - auto context = current_thread->thread_state()->context(); - auto pcr = kernel_memory()->TranslateVirtual( - static_cast(context->r[13])); + auto pcr = context->TranslateVirtualGPR(context->r[13]); + thread->stack_alloc_base = stack_alloc_base.value(); thread->stack_base = stack_base.value(); thread->stack_limit = stack_limit.value(); pcr->stack_base_ptr = stack_base.guest_address(); pcr->stack_end_ptr = stack_limit.guest_address(); context->r[1] = stack_ptr.guest_address(); - + // If a fiber is set, and the thread matches, reenter to avoid issues with // host stack overflowing. if (thread->fiber_ptr && current_thread->guest_object() == thread.guest_address()) { + context->processor->backend()->PrepareForReentry(context.value()); current_thread->Reenter(static_cast(context->lr)); } } @@ -1018,7 +1038,8 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - while (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { + while (!xe::atomic_cas( + 0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great // either... 
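Reviewer note (not part of the patch): the spin-lock hunks above and below operate on a guest lock word that, as the surrounding code suggests, holds 0 when free and the byte-swapped PCR pointer (r13) of the owner while held. A minimal host-side sketch of that acquire pattern, using std::atomic in place of xe::atomic_cas purely for illustration:

#include <atomic>
#include <cstdint>

// Guest memory is big-endian, so the owner value is stored byte-swapped.
static uint32_t ByteSwap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

// Spin until the free value (0) can be exchanged for this CPU's PCR address.
static void AcquireGuestSpinLock(std::atomic<uint32_t>* lock_word,
                                 uint32_t r13_pcr) {
  const uint32_t owner_be = ByteSwap32(r13_pcr);
  uint32_t expected = 0;
  while (!lock_word->compare_exchange_weak(expected, owner_be,
                                           std::memory_order_acquire)) {
    expected = 0;  // compare_exchange_weak replaced it with the current owner
  }
}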
@@ -1038,7 +1059,8 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( auto lock = reinterpret_cast(lock_ptr.host_address()); assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - if (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { + if (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), + lock)) { return 0; } return 1; @@ -1281,7 +1303,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr, const ppc_context_t& ppc_context) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count) { @@ -1318,7 +1341,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading, void ExAcquireReadWriteLockShared_entry(pointer_t lock_ptr, const ppc_context_t& ppc_context) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count || diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index 6d548220a..df5991d09 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -33,8 +33,15 @@ DEFINE_bool(ignore_thread_priorities, true, DEFINE_bool(ignore_thread_affinities, true, "Ignores game-specified thread affinities.", "Kernel"); + +#if 0 +DEFINE_int64(stack_size_multiplier_hack, 1, + "A hack for games with setjmp/longjmp issues.", "Kernel"); +DEFINE_int64(main_xthread_stack_size_multiplier_hack, 1, + "A hack for games with setjmp/longjmp issues.", "Kernel"); +#endif namespace xe { -namespace kernel { + namespace kernel { const uint32_t XAPC::kSize; const uint32_t XAPC::kDummyKernelRoutine; @@ -373,8 +380,23 @@ X_STATUS XThread::Create() { RetainHandle(); xe::threading::Thread::CreationParameters params; - params.stack_size = 16_MiB; // Allocate a big host stack. + + + params.create_suspended = true; + + #if 0 + uint64_t stack_size_mult = cvars::stack_size_multiplier_hack; + + if (main_thread_) { + stack_size_mult = + static_cast(cvars::main_xthread_stack_size_multiplier_hack); + + } + #else + uint64_t stack_size_mult = 1; + #endif + params.stack_size = 16_MiB * stack_size_mult; // Allocate a big host stack. thread_ = xe::threading::Thread::Create(params, [this]() { // Set thread ID override. This is used by logging. xe::threading::set_current_thread_id(handle()); @@ -433,6 +455,9 @@ X_STATUS XThread::Create() { X_STATUS XThread::Exit(int exit_code) { // This may only be called on the thread itself. assert_true(XThread::GetCurrentThread() == this); + //TODO(chrispy): not sure if this order is correct, should it come after apcs? + guest_object()->terminated = 1; + // TODO(benvanik): dispatch events? waiters? etc? 
RundownAPCs(); diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 75e790ebd..898aea006 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -121,7 +121,7 @@ struct X_KTHREAD { uint8_t unk_B4[0x8]; // 0xB4 uint8_t suspend_count; // 0xBC uint8_t unk_BD; // 0xBD - uint8_t unk_BE; // 0xBE + uint8_t terminated; // 0xBE uint8_t current_cpu; // 0xBF uint8_t unk_C0[0x10]; // 0xC0 xe::be stack_alloc_base; // 0xD0 diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index b160696f4..6384f4996 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -316,8 +316,8 @@ void Memory::Reset() { heaps_.v90000000.Reset(); heaps_.physical.Reset(); } -//clang does not like non-standard layout offsetof -#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0 +// clang does not like non-standard layout offsetof +#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0 XE_NOALIAS const BaseHeap* Memory::LookupHeap(uint32_t address) const { #define HEAP_INDEX(name) \ @@ -359,7 +359,6 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const { #else XE_NOALIAS const BaseHeap* Memory::LookupHeap(uint32_t address) const { - if (address < 0x40000000) { return &heaps_.v00000000; } else if (address < 0x7F000000) { @@ -964,6 +963,14 @@ bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size, return true; } +template +static inline T QuickMod(T value, uint32_t modv) { + if (xe::is_pow2(modv)) { + return value & (modv - 1); + } else { + return value % modv; + } +} bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, uint32_t size, uint32_t alignment, @@ -976,8 +983,9 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, low_address = std::max(heap_base_, xe::align(low_address, alignment)); high_address = std::min(heap_base_ + (heap_size_ - 1), xe::align(high_address, alignment)); - uint32_t low_page_number = (low_address - heap_base_) / page_size_; - uint32_t high_page_number = (high_address - heap_base_) / page_size_; + + uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_; + uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_; low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number); high_page_number = std::min(uint32_t(page_table_.size()) - 1, high_page_number); @@ -995,8 +1003,10 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, // TODO(benvanik): optimized searching (free list buckets, bitmap, etc). uint32_t start_page_number = UINT_MAX; uint32_t end_page_number = UINT_MAX; - uint32_t page_scan_stride = alignment / page_size_; - high_page_number = high_page_number - (high_page_number % page_scan_stride); + // chrispy:todo, page_scan_stride is probably always a power of two... 
+ uint32_t page_scan_stride = alignment >> page_size_shift_; + high_page_number = + high_page_number - QuickMod(high_page_number, page_scan_stride); if (top_down) { for (int64_t base_page_number = high_page_number - xe::round_up(page_count, page_scan_stride); @@ -1024,7 +1034,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, base_page_number = -1; } else { base_page_number = page_number - page_count; - base_page_number -= base_page_number % page_scan_stride; + base_page_number -= QuickMod(base_page_number, page_scan_stride); base_page_number += page_scan_stride; // cancel out loop logic } break; @@ -1072,7 +1082,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) { // Out of memory. XELOGE("BaseHeap::Alloc failed to find contiguous range"); - //assert_always("Heap exhausted!"); + // assert_always("Heap exhausted!"); return false; } @@ -1084,15 +1094,15 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, ? xe::memory::AllocationType::kCommit : xe::memory::AllocationType::kReserve; void* result = xe::memory::AllocFixed( - TranslateRelative(start_page_number * page_size_), - page_count * page_size_, alloc_type, ToPageAccess(protect)); + TranslateRelative(start_page_number << page_size_shift_), + page_count << page_size_shift_, alloc_type, ToPageAccess(protect)); if (!result) { XELOGE("BaseHeap::Alloc failed to alloc range from host"); return false; } if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) { - std::memset(result, 0xCD, page_count * page_size_); + std::memset(result, 0xCD, page_count << page_size_shift_); } } @@ -1108,7 +1118,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, unreserved_page_count_--; } - *out_address = heap_base_ + (start_page_number * page_size_); + *out_address = heap_base_ + (start_page_number << page_size_shift_); return true; } @@ -1719,8 +1729,7 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first); uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last); - uint32_t guest_one = - SystemPagenumToGuestPagenum(1); + uint32_t guest_one = SystemPagenumToGuestPagenum(1); uint32_t system_one = GuestPagenumToSystemPagenum(1); for (; i <= system_page_last; ++i) { @@ -1755,7 +1764,6 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner( #endif uint32_t guest_page_number = SystemPagenumToGuestPagenum(i); - //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]); xe::memory::PageAccess current_page_access = ToPageAccess(page_table_ptr[guest_page_number].current_protect); bool protect_system_page = false; diff --git a/src/xenia/ui/windowed_app_main_win.cc b/src/xenia/ui/windowed_app_main_win.cc index 114d36fc0..115eb259f 100644 --- a/src/xenia/ui/windowed_app_main_win.cc +++ b/src/xenia/ui/windowed_app_main_win.cc @@ -19,11 +19,96 @@ DEFINE_bool(enable_console, false, "Open a console window with the main window", "General"); +#if XE_ARCH_AMD64 == 1 +DEFINE_bool(enable_rdrand_ntdll_patch, true, + "Hot-patches ntdll at the start of the process to not use rdrand " + "as part of the RNG for heap randomization. 
Can reduce CPU usage " + "significantly, but is untested on all Windows versions.", + "Win32"); +// begin ntdll hack +#include +static bool g_didfailtowrite = false; +static void write_process_memory(HANDLE process, uintptr_t offset, + unsigned size, const unsigned char* bvals) { + if (!WriteProcessMemory(process, (void*)offset, bvals, size, nullptr)) { + if (!g_didfailtowrite) { + MessageBoxA(nullptr, "Failed to write to process!", "Failed", MB_OK); + g_didfailtowrite = true; + } + } +} +static const unsigned char pattern_cmp_processorfeature_28_[] = { + 0x80, 0x3C, 0x25, 0x90, + 0x02, 0xFE, 0x7F, 0x00}; // cmp byte ptr ds:7FFE0290h, 0 +static const unsigned char pattern_replacement[] = { + 0x48, 0x39, 0xe4, // cmp rsp, rsp = always Z + 0x0F, 0x1F, 0x44, 0x00, 0x00 // 5byte nop +}; +static void patch_ntdll_instance(HANDLE process, uintptr_t ntdll_base) { + MODULEINFO modinfo; + + GetModuleInformation(process, (HMODULE)ntdll_base, &modinfo, + sizeof(MODULEINFO)); + + std::vector possible_places{}; + + unsigned char* strt = (unsigned char*)modinfo.lpBaseOfDll; + + for (unsigned i = 0; i < modinfo.SizeOfImage; ++i) { + for (unsigned j = 0; j < sizeof(pattern_cmp_processorfeature_28_); ++j) { + if (strt[i + j] != pattern_cmp_processorfeature_28_[j]) { + goto miss; + } + } + possible_places.push_back((uintptr_t)(&strt[i])); + miss:; + } + + for (auto&& place : possible_places) { + write_process_memory(process, place, sizeof(pattern_replacement), + pattern_replacement); + } +} + +static void do_ntdll_hack_this_process() { + patch_ntdll_instance(GetCurrentProcess(), + (uintptr_t)GetModuleHandleA("ntdll.dll")); +} +#endif +// end ntdll hack +LONG _UnhandledExceptionFilter(_EXCEPTION_POINTERS* ExceptionInfo) { + PVOID exception_addr = ExceptionInfo->ExceptionRecord->ExceptionAddress; + + DWORD64 last_stackpointer = ExceptionInfo->ContextRecord->Rsp; + + DWORD64 last_rip = ExceptionInfo->ContextRecord->Rip; + + DWORD except_code = ExceptionInfo->ExceptionRecord->ExceptionCode; + + DWORD last_error = GetLastError(); + + NTSTATUS stat = __readgsdword(0x1250); + + int last_errno_value = errno; + + + + char except_message_buf[1024]; + + sprintf_s(except_message_buf, + "Exception encountered!\nException address: %p\nStackpointer: " + "%p\nInstruction pointer: %p\nExceptionCode: 0x%X\nLast Win32 " + "Error: 0x%X\nLast NTSTATUS: 0x%X\nLast errno value: 0x%X\n", + exception_addr, (void*)last_stackpointer, (void*)last_rip, except_code, + last_error, stat, last_errno_value); + MessageBoxA(nullptr, except_message_buf, "Unhandled Exception", MB_ICONERROR); + return EXCEPTION_CONTINUE_SEARCH; +} int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, LPWSTR command_line, int show_cmd) { int result; - + SetUnhandledExceptionFilter(_UnhandledExceptionFilter); { xe::ui::Win32WindowedAppContext app_context(hinstance, show_cmd); // TODO(Triang3l): Initialize creates a window. Set DPI awareness via the @@ -40,13 +125,6 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, return EXIT_FAILURE; } - // TODO(Triang3l): Rework this, need to initialize the console properly, - // disable has_console_attached_ by default in windowed apps, and attach - // only if needed. - if (cvars::enable_console) { - xe::AttachConsole(); - } - // Initialize COM on the UI thread with the apartment-threaded concurrency // model, so dialogs can be used. 
if (FAILED(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED))) { @@ -55,8 +133,22 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev, xe::InitializeWin32App(app->GetName()); - result = - app->OnInitialize() ? app_context.RunMainMessageLoop() : EXIT_FAILURE; + if (app->OnInitialize()) { +#if XE_ARCH_AMD64 == 1 + if (cvars::enable_rdrand_ntdll_patch) { + do_ntdll_hack_this_process(); + } +#endif + // TODO(Triang3l): Rework this, need to initialize the console properly, + // disable has_console_attached_ by default in windowed apps, and attach + // only if needed. + if (cvars::enable_console) { + xe::AttachConsole(); + } + result = app_context.RunMainMessageLoop(); + } else { + result = EXIT_FAILURE; + } app->InvokeOnDestroy(); } diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index 574501788..349e40886 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -61,6 +61,7 @@ typedef uint32_t X_STATUS; #define X_STATUS_OBJECT_NAME_COLLISION ((X_STATUS)0xC0000035L) #define X_STATUS_INVALID_PAGE_PROTECTION ((X_STATUS)0xC0000045L) #define X_STATUS_MUTANT_NOT_OWNED ((X_STATUS)0xC0000046L) +#define X_STATUS_THREAD_IS_TERMINATING ((X_STATUS)0xC000004BL) #define X_STATUS_PROCEDURE_NOT_FOUND ((X_STATUS)0xC000007AL) #define X_STATUS_INSUFFICIENT_RESOURCES ((X_STATUS)0xC000009AL) #define X_STATUS_MEMORY_NOT_ALLOCATED ((X_STATUS)0xC00000A0L)
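Closing note (not part of the patch): several of the memory.cc hunks above replace divisions and modulo operations on page counts with shifts and a power-of-two-aware modulo. A minimal sketch of that technique, assuming a page_size_shift_ of 12 (4 KiB guest pages); QuickModSketch mirrors the QuickMod helper added above but is written standalone for illustration:

#include <cassert>
#include <cstdint>

// Use a mask when the modulus is a power of two, fall back to % otherwise.
static uint32_t QuickModSketch(uint32_t value, uint32_t mod) {
  return (mod & (mod - 1)) == 0 ? (value & (mod - 1)) : (value % mod);
}

int main() {
  constexpr uint32_t kPageSizeShift = 12;  // assumed: 4 KiB pages
  const uint32_t heap_base = 0x40000000u;
  const uint32_t address = 0x40012345u;
  // (address - heap_base) / page_size_ becomes a right shift.
  const uint32_t page_number = (address - heap_base) >> kPageSizeShift;
  assert(page_number == 0x12u);
  assert(QuickModSketch(37u, 8u) == 5u);   // power-of-two fast path
  assert(QuickModSketch(37u, 12u) == 1u);  // generic fallback
  return 0;
}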