Changing the way the global lock works. Some things are better, I think.

Regressions are likely.
Ben Vanik 2015-09-01 09:45:14 -07:00
parent 5355183590
commit f5e374f9b5
14 changed files with 199 additions and 128 deletions

View File

@ -13,6 +13,7 @@
#include <cstdint>
#include <string>
#include "xenia/base/mutex.h"
#include "xenia/base/vec128.h"
namespace xe {
@ -404,8 +405,9 @@ typedef struct alignas(64) PPCContext_s {
// Thread ID assigned to this context.
uint32_t thread_id;
// Reserve address for load acquire/store release. Shared.
uint64_t* reserve_address;
// Global interrupt lock, held while interrupts are disabled or interrupts are
// executing. This is shared among all threads and comes from the processor.
xe::recursive_mutex* global_mutex;
// Used to shuttle data into externs. Contents volatile.
uint64_t scratch;
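The shape of the change in miniature: the processor owns a single recursive mutex and every per-thread context only points at it, so "disabling interrupts" on one guest thread serializes against all other guest threads and against host interrupt dispatch. A standalone model of that ownership, with std::recursive_mutex standing in for xe::recursive_mutex and purely illustrative names:

#include <memory>
#include <mutex>
#include <vector>

// Illustrative stand-ins for Processor and PPCContext.
struct ModelContext {
  std::recursive_mutex* global_mutex;  // shared; owned by the processor
};

struct ModelProcessor {
  std::recursive_mutex global_mutex_;
  std::vector<std::unique_ptr<ModelContext>> contexts_;

  ModelContext* CreateThreadContext() {
    auto context = std::make_unique<ModelContext>();
    context->global_mutex = &global_mutex_;  // every context shares one mutex
    contexts_.push_back(std::move(context));
    return contexts_.back().get();
  }
};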

View File

@ -11,6 +11,7 @@
#include "xenia/base/assert.h"
#include "xenia/cpu/frontend/ppc_context.h"
#include "xenia/cpu/frontend/ppc_frontend.h"
#include "xenia/cpu/frontend/ppc_hir_builder.h"
namespace xe {
@ -699,14 +700,28 @@ XEEMITTER(mtspr, 0x7C0003A6, XFX)(PPCHIRBuilder& f, InstrData& i) {
// without the lock here threads can livelock.
XEEMITTER(mfmsr, 0x7C0000A6, X)(PPCHIRBuilder& f, InstrData& i) {
f.StoreGPR(i.X.RT, f.LoadMSR());
// bit 48 = EE; interrupt enabled
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked (interrupts enabled), else 0
f.CallExtern(f.builtins()->check_global_lock);
f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
return 0;
}
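A minimal standalone model of what the guest now observes through mfmsr, assuming the lock-count scheme defined in the frontend below; std::recursive_mutex stands in for xe::recursive_mutex and the Model* names are illustrative:

#include <cstdint>
#include <mutex>

std::recursive_mutex model_global_mutex;
int32_t model_global_lock_count = 0;  // > 0 while "interrupts are disabled"

// Models CheckGlobalLock: mfmsr reads back 0x8000 (EE set) only when no
// guest thread is currently inside an interrupts-disabled region.
uint64_t ModelReadMSR() {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  return model_global_lock_count ? 0 : 0x8000;
}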
XEEMITTER(mtmsr, 0x7C000124, X)(PPCHIRBuilder& f, InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
f.StoreMSR(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
// iff storing from r13
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
f.CallExtern(f.builtins()->enter_global_lock);
} else {
// Otherwise we are restoring interrupts (probably).
f.CallExtern(f.builtins()->leave_global_lock);
}
return 0;
} else {
// L = 0
@ -718,7 +733,15 @@ XEEMITTER(mtmsr, 0x7C000124, X)(PPCHIRBuilder& f, InstrData& i) {
XEEMITTER(mtmsrd, 0x7C000164, X)(PPCHIRBuilder& f, InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
f.StoreMSR(f.LoadGPR(i.X.RT));
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
f.CallExtern(f.builtins()->enter_global_lock);
} else {
// Otherwise we are restoring interrupts (probably).
f.CallExtern(f.builtins()->leave_global_lock);
}
return 0;
} else {
// L = 0
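Continuing the model above, the L = 1 paths of mtmsr/mtmsrd reduce to the following host-side behavior; the r13 test mirrors the commit's heuristic that guest code disables interrupts by moving r13 into the MSR, and ModelWriteMSR is an illustrative name:

// Uses model_global_mutex and model_global_lock_count from the mfmsr sketch.
// Writing the MSR from r13 takes the lock; writing it from any other
// register releases it.
void ModelWriteMSR(uint32_t rt) {
  if (rt == 13) {
    model_global_mutex.lock();   // EnterGlobalLock: interrupts "disabled"
    ++model_global_lock_count;
  } else {
    --model_global_lock_count;   // LeaveGlobalLock: interrupts "restored"
    model_global_mutex.unlock();
  }
}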

View File

@ -683,8 +683,16 @@ XEEMITTER(ldarx, 0x7C0000A8, X)(PPCHIRBuilder& f, InstrData& i) {
// RESERVE_LENGTH <- 8
// RESERVE_ADDR <- real_addr(EA)
// RT <- MEM(EA, 8)
// NOTE: we assume we are within a global lock.
// We could assert here that the block (or its parent) has already taken the
// global lock, but so far I have only seen interrupt callbacks (which always
// run under the global lock) do that.
// We issue a memory barrier here to make sure we observe the latest values
// written by other threads.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadAcquire(ea, INT64_TYPE));
Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
return 0;
}
@ -699,9 +707,16 @@ XEEMITTER(lwarx, 0x7C000028, X)(PPCHIRBuilder& f, InstrData& i) {
// RESERVE_LENGTH <- 4
// RESERVE_ADDR <- real_addr(EA)
// RT <- i32.0 || MEM(EA, 4)
// NOTE: we assume we are within a global lock.
// We could assert here that the block (or its parent) has already taken the
// global lock, but so far I have only seen interrupt callbacks (which always
// run under the global lock) do that.
// We issue a memory barrier here to make sure we observe the latest values
// written by other threads.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadAcquire(ea, INT32_TYPE)), INT64_TYPE);
Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
return 0;
}
@ -716,9 +731,22 @@ XEEMITTER(stdcx, 0x7C0001AD, X)(PPCHIRBuilder& f, InstrData& i) {
// MEM(EA, 8) <- (RS)
// n <- 1 if store performed
// CR0[LT GT EQ SO] = 0b00 || n || XER[SO]
// NOTE: we assume we are within a global lock.
// As we have been executing exclusively this entire time, no one else could
// have touched the memory, so the store must always succeed.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
f.StoreRelease(ea, rt); // also updates cr0
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue a memory barrier so that other threads see our updates once we leave
// the lock.
f.MemoryBarrier();
return 0;
}
@ -732,9 +760,22 @@ XEEMITTER(stwcx, 0x7C00012D, X)(PPCHIRBuilder& f, InstrData& i) {
// MEM(EA, 4) <- (RS)[32:63]
// n <- 1 if store performed
// CR0[LT GT EQ SO] = 0b00 || n || XER[SO]
// NOTE: we assume we are within a global lock.
// As we have been executing exclusively this entire time, no one else could
// have touched the memory, so the store must always succeed.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
f.StoreRelease(ea, rt); // also updates cr0
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue a memory barrier so that other threads see our updates once we leave
// the lock.
f.MemoryBarrier();
return 0;
}
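Continuing the model, a host-side analogue of what an lwarx/stwcx. pair now amounts to. The lock_guard stands for the global lock the guest is assumed to already hold (taken via mtmsr r13 or by interrupt dispatch), not something the emitted code acquires itself; the increment payload is just an example:

#include <atomic>

// With the global lock held for the whole reserved sequence, the reservation
// can never be lost: a plain load and store bracketed by barriers suffice,
// and the conditional store "always succeeds" (cr0.eq is set to 1).
uint32_t ModelAtomicIncrement(uint32_t* guest_word) {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  std::atomic_thread_fence(std::memory_order_seq_cst);  // barrier at lwarx
  uint32_t old_value = *guest_word;                      // plain load
  *guest_word = old_value + 1;                           // plain store (stwcx.)
  std::atomic_thread_fence(std::memory_order_seq_cst);  // barrier at stwcx.
  return old_value;
}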

View File

@ -57,32 +57,41 @@ PPCFrontend::~PPCFrontend() {
Memory* PPCFrontend::memory() const { return processor_->memory(); }
// Checks the state of the global lock and sets scratch to the current MSR
// value.
void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
ppc_context->scratch = 0x8000;
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
std::lock_guard<xe::recursive_mutex> lock(*global_mutex);
ppc_context->scratch = *global_lock_count ? 0 : 0x8000;
}
void HandleGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_lock = reinterpret_cast<xe::mutex*>(arg0);
volatile bool* global_lock_taken = reinterpret_cast<bool*>(arg1);
uint64_t value = ppc_context->scratch;
if (value == 0x8000) {
global_lock->lock();
*global_lock_taken = false;
global_lock->unlock();
} else if (value == ppc_context->r[13]) {
global_lock->lock();
*global_lock_taken = true;
global_lock->unlock();
}
// Enters the global lock. Safe for recursion.
void EnterGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
global_mutex->lock();
*global_lock_count = *global_lock_count + 1;
}
// Leaves the global lock. Safe for recursion.
void LeaveGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
*global_lock_count = *global_lock_count - 1;
assert_true(*global_lock_count >= 0);
global_mutex->unlock();
}
bool PPCFrontend::Initialize() {
void* arg0 = reinterpret_cast<void*>(&builtins_.global_lock);
void* arg1 = reinterpret_cast<void*>(&builtins_.global_lock_taken);
void* arg0 = reinterpret_cast<void*>(processor_->global_mutex());
void* arg1 = reinterpret_cast<void*>(&builtins_.global_lock_count);
builtins_.check_global_lock =
processor_->DefineBuiltin("CheckGlobalLock", CheckGlobalLock, arg0, arg1);
builtins_.handle_global_lock = processor_->DefineBuiltin(
"HandleGlobalLock", HandleGlobalLock, arg0, arg1);
builtins_.enter_global_lock =
processor_->DefineBuiltin("EnterGlobalLock", EnterGlobalLock, arg0, arg1);
builtins_.leave_global_lock =
processor_->DefineBuiltin("LeaveGlobalLock", LeaveGlobalLock, arg0, arg1);
return true;
}
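A quick driver for the model functions above, showing why the pair is recursion-safe: the recursive mutex lets the same thread nest critical regions, and the count keeps CheckGlobalLock reporting "locked" until the outermost leave:

#include <cassert>

int main() {
  assert(ModelReadMSR() == 0x8000);  // unlocked: interrupts "enabled"
  ModelWriteMSR(13);                 // outer enter (mtmsr r13)
  ModelWriteMSR(13);                 // nested enter on the same thread: no deadlock
  assert(ModelReadMSR() == 0);       // still reported as "disabled"
  ModelWriteMSR(10);                 // inner leave (restore from another register)
  ModelWriteMSR(10);                 // outer leave
  assert(ModelReadMSR() == 0x8000);  // back to "enabled"
  return 0;
}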

View File

@ -32,10 +32,10 @@ namespace frontend {
class PPCTranslator;
struct PPCBuiltins {
xe::mutex global_lock;
bool global_lock_taken;
int32_t global_lock_count;
Function* check_global_lock;
Function* handle_global_lock;
Function* enter_global_lock;
Function* leave_global_lock;
};
class PPCFrontend {

View File

@ -39,6 +39,8 @@ PPCHIRBuilder::PPCHIRBuilder(PPCFrontend* frontend)
PPCHIRBuilder::~PPCHIRBuilder() = default;
PPCBuiltins* PPCHIRBuilder::builtins() const { return frontend_->builtins(); }
void PPCHIRBuilder::Reset() {
function_ = nullptr;
start_address_ = 0;
@ -345,20 +347,6 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) {
// TODO(benvanik): trace CR.
}
Value* PPCHIRBuilder::LoadMSR() {
// bit 48 = EE; interrupt enabled
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked, else 0
CallExtern(frontend_->builtins()->check_global_lock);
return LoadContext(offsetof(PPCContext, scratch), INT64_TYPE);
}
void PPCHIRBuilder::StoreMSR(Value* value) {
// if & 0x8000 == 0, lock, else unlock
StoreContext(offsetof(PPCContext, scratch), ZeroExtend(value, INT64_TYPE));
CallExtern(frontend_->builtins()->handle_global_lock);
}
Value* PPCHIRBuilder::LoadFPSCR() {
return LoadContext(offsetof(PPCContext, fpscr), INT64_TYPE);
}
@ -444,30 +432,6 @@ void PPCHIRBuilder::StoreVR(uint32_t reg, Value* value) {
trace_reg.value = value;
}
Value* PPCHIRBuilder::LoadAcquire(Value* address, TypeName type,
uint32_t load_flags) {
AtomicExchange(LoadContext(offsetof(PPCContext, reserve_address), INT64_TYPE),
Truncate(address, INT32_TYPE));
return Load(address, type, load_flags);
}
Value* PPCHIRBuilder::StoreRelease(Value* address, Value* value,
uint32_t store_flags) {
Value* old_address = AtomicExchange(
LoadContext(offsetof(PPCContext, reserve_address), INT64_TYPE),
LoadZeroInt32());
// Ensure the reservation addresses match.
Value* eq = CompareEQ(Truncate(address, INT32_TYPE), old_address);
StoreContext(offsetof(PPCContext, cr0.cr0_eq), eq);
StoreContext(offsetof(PPCContext, cr0.cr0_lt), LoadZeroInt8());
StoreContext(offsetof(PPCContext, cr0.cr0_gt), LoadZeroInt8());
auto skip_label = NewLabel();
BranchFalse(eq, skip_label, BRANCH_UNLIKELY);
Store(address, value, store_flags);
MarkLabel(skip_label);
return eq;
}
} // namespace frontend
} // namespace cpu
} // namespace xe

View File

@ -18,6 +18,7 @@ namespace xe {
namespace cpu {
namespace frontend {
struct PPCBuiltins;
class PPCFrontend;
class PPCHIRBuilder : public hir::HIRBuilder {
@ -29,6 +30,8 @@ class PPCHIRBuilder : public hir::HIRBuilder {
explicit PPCHIRBuilder(PPCFrontend* frontend);
~PPCHIRBuilder() override;
PPCBuiltins* builtins() const;
void Reset() override;
enum EmitFlags {
@ -54,8 +57,6 @@ class PPCHIRBuilder : public hir::HIRBuilder {
void UpdateCR(uint32_t n, Value* lhs, bool is_signed = true);
void UpdateCR(uint32_t n, Value* lhs, Value* rhs, bool is_signed = true);
void UpdateCR6(Value* src_value);
Value* LoadMSR();
void StoreMSR(Value* value);
Value* LoadFPSCR();
void StoreFPSCR(Value* value);
Value* LoadXER();
@ -75,10 +76,6 @@ class PPCHIRBuilder : public hir::HIRBuilder {
Value* LoadVR(uint32_t reg);
void StoreVR(uint32_t reg, Value* value);
Value* LoadAcquire(Value* address, hir::TypeName type,
uint32_t load_flags = 0);
Value* StoreRelease(Value* address, Value* value, uint32_t store_flags = 0);
private:
void AnnotateLabel(uint32_t address, Label* label);

View File

@ -327,6 +327,39 @@ uint64_t Processor::Execute(ThreadState* thread_state, uint32_t address,
return context->r[3];
}
uint64_t Processor::ExecuteInterrupt(ThreadState* thread_state,
uint32_t address, uint64_t args[],
size_t arg_count) {
SCOPE_profile_cpu_f("cpu");
// Hold the global lock during interrupt dispatch.
// This will block if any code is in a critical region (has interrupts
// disabled) or if any other interrupt is executing.
std::lock_guard<xe::recursive_mutex> lock(global_mutex_);
PPCContext* context = thread_state->context();
assert_true(arg_count <= 5);
for (size_t i = 0; i < arg_count; ++i) {
context->r[3 + i] = args[i];
}
// The TLS ptr must be zero during interrupts. Some games check it and
// early-exit their routines when running under an interrupt.
auto pcr_address =
memory_->TranslateVirtual(static_cast<uint32_t>(context->r[13]));
uint32_t old_tls_ptr = xe::load_and_swap<uint32_t>(pcr_address);
xe::store_and_swap<uint32_t>(pcr_address, 0);
if (!Execute(thread_state, address)) {
return 0xDEADBABE;
}
// Restores TLS ptr.
xe::store_and_swap<uint32_t>(pcr_address, old_tls_ptr);
return context->r[3];
}
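The payoff of routing interrupts through ExecuteInterrupt is that dispatch and guest critical regions now contend on the same recursive mutex. Continuing the model, a sketch of that blocking behavior; ModelDispatchInterrupt is illustrative, and the real path also marshals the r3+ arguments and zeroes the PCR's TLS slot as shown above:

#include <functional>

// Runs on the host interrupt-delivery thread. If another guest thread is
// inside an interrupts-disabled region, this blocks until that region is
// left, which is the behavior a real EE=0 window provides.
void ModelDispatchInterrupt(const std::function<void()>& guest_handler) {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  guest_handler();  // the guest interrupt callback runs with the lock held
}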
Irql Processor::RaiseIrql(Irql new_value) {
return static_cast<Irql>(
xe::atomic_exchange(static_cast<uint32_t>(new_value),

View File

@ -11,7 +11,6 @@
#define XENIA_CPU_PROCESSOR_H_
#include <memory>
#include <mutex>
#include <string>
#include <vector>
@ -84,7 +83,10 @@ class Processor {
bool Execute(ThreadState* thread_state, uint32_t address);
uint64_t Execute(ThreadState* thread_state, uint32_t address, uint64_t args[],
size_t arg_count);
uint64_t ExecuteInterrupt(ThreadState* thread_state, uint32_t address,
uint64_t args[], size_t arg_count);
xe::recursive_mutex* global_mutex() { return &global_mutex_; }
Irql RaiseIrql(Irql new_value);
void LowerIrql(Irql old_value);
@ -108,6 +110,7 @@ class Processor {
uint32_t next_builtin_address_ = 0xFFFF0000u;
Irql irql_;
xe::recursive_mutex global_mutex_;
};
} // namespace cpu

View File

@ -90,7 +90,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
std::memset(context_, 0, sizeof(PPCContext));
// Stash pointers to common structures that callbacks may need.
context_->reserve_address = memory_->reserve_address();
context_->global_mutex = processor_->global_mutex();
context_->virtual_membase = memory_->virtual_membase();
context_->physical_membase = memory_->physical_membase();
context_->processor = processor_;

View File

@ -80,8 +80,8 @@ void GraphicsSystem::DispatchInterruptCallback(uint32_t source, uint32_t cpu) {
interrupt_callback_, source, cpu);
uint64_t args[] = {source, interrupt_callback_data_};
processor_->Execute(thread->thread_state(), interrupt_callback_, args,
xe::countof(args));
processor_->ExecuteInterrupt(thread->thread_state(), interrupt_callback_,
args, xe::countof(args));
}
} // namespace gpu

View File

@ -403,7 +403,6 @@ SHIM_CALL XamShowDirtyDiscErrorUI_shim(PPCContext* ppc_context,
// This is death, and should never return.
// TODO(benvanik): cleaner exit.
assert_always();
exit(1);
}

View File

@ -1279,23 +1279,23 @@ pointer_result_t InterlockedPushEntrySList(
assert_not_null(plist_ptr);
assert_not_null(entry);
alignas(8) X_SLIST_HEADER old_hdr = {0};
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
uint32_t old_head = 0;
new_hdr.depth = old_hdr.depth + 1;
new_hdr.sequence = old_hdr.sequence + 1;
do {
// Kill the guest reservation
xe::atomic_exchange(0, kernel_memory()->reserve_address());
uint32_t old_head = old_hdr.next.next;
entry->next = old_hdr.next.next;
new_hdr.next.next = entry.guest_address();
old_hdr = *plist_ptr;
new_hdr.depth = old_hdr.depth + 1;
new_hdr.sequence = old_hdr.sequence + 1;
old_head = old_hdr.next.next;
entry->next = old_hdr.next.next;
new_hdr.next.next = entry.guest_address();
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return old_head;
}
@ -1305,27 +1305,29 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPushEntrySList,
pointer_result_t InterlockedPopEntrySList(pointer_t<X_SLIST_HEADER> plist_ptr) {
assert_not_null(plist_ptr);
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
uint32_t popped = 0;
alignas(8) X_SLIST_HEADER old_hdr = {0};
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
do {
// Kill the guest reservation (guest InterlockedPushEntrySList uses this)
xe::atomic_exchange(0, kernel_memory()->reserve_address());
auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
old_hdr.next.next);
if (!old_hdr.next.next) {
return 0;
}
popped = old_hdr.next.next;
old_hdr = *plist_ptr;
auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
old_hdr.next.next);
if (!old_hdr.next.next) {
return 0;
}
popped = old_hdr.next.next;
new_hdr.depth = old_hdr.depth - 1;
new_hdr.next.next = next->next;
new_hdr.sequence = old_hdr.sequence;
new_hdr.depth = old_hdr.depth - 1;
new_hdr.next.next = next->next;
new_hdr.sequence = old_hdr.sequence;
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return popped;
}
@ -1333,22 +1335,23 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPopEntrySList,
ExportTag::kImplemented | ExportTag::kHighFrequency);
pointer_result_t InterlockedFlushSList(pointer_t<X_SLIST_HEADER> plist_ptr) {
alignas(8) X_SLIST_HEADER old_hdr = {0};
assert_not_null(plist_ptr);
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
uint32_t first = 0;
uint32_t first = old_hdr.next.next;
new_hdr.next.next = 0;
new_hdr.depth = 0;
new_hdr.sequence = 0;
do {
// Kill the guest reservation
xe::atomic_exchange(0, kernel_memory()->reserve_address());
old_hdr = *plist_ptr;
first = old_hdr.next.next;
new_hdr.next.next = 0;
new_hdr.depth = 0;
new_hdr.sequence = 0;
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return first;
}
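The three exports above share one pattern: host code that mutates a structure the guest also updates no longer spins on the guest reservation, it simply holds the processor's global mutex for the duration. A hedged helper sketch of that pattern; WithGlobalLock is hypothetical and not part of the commit, while kernel_state(), processor() and global_mutex() are the accessors already used above:

#include <functional>
#include <mutex>

// Serialize a host-side mutation of guest-shared data against guest
// critical regions and interrupt dispatch by holding the global mutex.
void WithGlobalLock(const std::function<void()>& body) {
  std::lock_guard<xe::recursive_mutex> lock(
      *kernel_state()->processor()->global_mutex());
  body();
}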

View File

@ -181,8 +181,6 @@ class Memory {
(guest_address & 0x1FFFFFFF));
}
inline uint64_t* reserve_address() { return &reserve_address_; }
// TODO(benvanik): make poly memory utils for these.
void Zero(uint32_t address, uint32_t size);
void Fill(uint32_t address, uint32_t size, uint8_t value);
@ -219,7 +217,6 @@ class Memory {
uint32_t system_page_size_ = 0;
uint8_t* virtual_membase_ = nullptr;
uint8_t* physical_membase_ = nullptr;
uint64_t reserve_address_ = 0;
xe::memory::FileMappingHandle mapping_ = nullptr;
uint8_t* mapping_base_ = nullptr;