From 7fb4b4cd419b9f4b00ab910840737bc19dcc2c46 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sat, 15 Apr 2023 16:06:07 -0400
Subject: [PATCH] Attempt to emulate reserved load/store more closely.

Can't do anything about stores of the same value that are done via a non-reserved store to a reserved location.

Uses a bitmap that splits the memory space into 64KiB blocks, one bit per block. It currently keys on the guest virtual address, but it should be using physical addresses instead.

Currently, if a guest reserves one location and then does a reserved store to a totally different location, we trigger a breakpoint. This should never happen.

Also removed the NEGATED_MUL_ADD/SUB operations. They weren't necessary; nothing special is needed for the negated result variants.

Added a log message for when watched physical memory has a race; it would just be nice to know when it happens and in which games.
---
 src/xenia/cpu/backend/x64/x64_backend.cc    | 127 +++++++++++++++++++-
 src/xenia/cpu/backend/x64/x64_backend.h     |  29 ++++-
 src/xenia/cpu/backend/x64/x64_seq_memory.cc |  79 +++++++++++-
 src/xenia/cpu/backend/x64/x64_sequences.cc  | 103 +---------------
 src/xenia/cpu/hir/hir_builder.cc            |  43 +++----
 src/xenia/cpu/hir/hir_builder.h             |   8 +-
 src/xenia/cpu/hir/opcodes.h                 |   7 +-
 src/xenia/cpu/hir/opcodes.inl               |  32 ++---
 src/xenia/cpu/mmio_handler.cc               |   3 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc       |   2 +-
 src/xenia/cpu/ppc/ppc_emit_fpu.cc           |  33 +++--
 src/xenia/cpu/ppc/ppc_emit_memory.cc        |  74 +++++++++---
 12 files changed, 353 insertions(+), 187 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index b1c1ff40e..e0918f89b 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter { void* EmitGuestAndHostSynchronizeStackSizeLoadThunk( void* sync_func, unsigned stack_element_size); + void* EmitTryAcquireReservationHelper(); + void* EmitReservedStoreHelper(bool bit64 = false); + private: void* EmitCurrentForOffsets(const _code_offsets& offsets, size_t stack_size = 0); @@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) { thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( synchronize_guest_and_host_stack_helper_, 4); } + try_acquire_reservation_helper_ = + thunk_emitter.EmitTryAcquireReservationHelper(); + reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false); + reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true); // Set the code cache to use the ResolveFunction thunk for default // indirections.
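For orientation, the reservation bitmap introduced here covers the 4 GiB guest space with one bit per 64 KiB block; the emitted helpers reduce a guest address to a qword slot and a bit position inside ReserveHelper::blocks. A minimal C++ sketch of that indexing math follows (illustrative only; the constants mirror RESERVE_BLOCK_SHIFT and RESERVE_NUM_ENTRIES from the patch, while the struct and function names are hypothetical):

#include <cstdint>

// Mirrors the patch's constants: 4 GiB of guest address space, one bit per
// 64 KiB block (RESERVE_BLOCK_SHIFT == 16).
constexpr uint32_t kReserveBlockShift = 16;
constexpr uint64_t kReserveNumEntries =
    (4ULL * 1024 * 1024 * 1024) >> kReserveBlockShift;       // 65536 bits
constexpr uint64_t kReserveQwords = kReserveNumEntries / 64;  // 1024 qwords
static_assert(kReserveQwords * sizeof(uint64_t) == 8192, "bitmap is 8 KiB");

struct ReserveIndex {
  uint64_t qword;  // which element of ReserveHelper::blocks
  uint32_t bit;    // which bit inside that element
};

// The same shr/and reduction the emitted helpers perform before lock bts/btr.
inline ReserveIndex IndexForGuestAddress(uint32_t guest_addr) {
  uint32_t block = guest_addr >> kReserveBlockShift;  // 64 KiB block number
  return {block >> 6, block & 63};                    // divide by 64, modulo 64
}

Each qword of the bitmap therefore covers 4 MiB of guest address space, and because the key is the block number rather than the exact address, two different addresses inside the same 64 KiB block share one reservation bit.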
@@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() { inc(ecx); jmp(checkbp, T_NEAR); L(we_good); - //we're popping this return address, so go down by one + // we're popping this return address, so go down by one sub(edx, sizeof(X64BackendStackpoint)); dec(ecx); L(checkbp); @@ -857,6 +864,123 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( code_offsets.tail = getSize(); return EmitCurrentForOffsets(code_offsets); } +void* X64HelperEmitter::EmitTryAcquireReservationHelper() { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + + Xbyak::Label already_has_a_reservation; + Xbyak::Label acquire_new_reservation; + + btr(GetBackendFlagsPtr(), 1); + mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); + jc(already_has_a_reservation); + + shr(ecx, RESERVE_BLOCK_SHIFT); + xor_(r9d, r9d); + mov(edx, ecx); + shr(edx, 6); // divide by 64 + lea(rdx, ptr[r8 + rdx * 8]); + and_(ecx, 64 - 1); + + lock(); + bts(qword[rdx], rcx); + // DebugBreak(); + // set flag on local backend context for thread to indicate our previous + // attempt to get the reservation succeeded + setnc(r9b); // success = bitmap did not have a set bit at the idx + shl(r9b, 1); + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), + rdx); + mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx); + + or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d); + ret(); + L(already_has_a_reservation); + DebugBreak(); + + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} +// ecx=guest addr +// r9 = host addr +// r8 = value +// if ZF is set and CF is set, we succeeded +void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + Xbyak::Label done; + Xbyak::Label reservation_isnt_for_our_addr; + // carry must be set + zero flag must be set + + btr(GetBackendFlagsPtr(), 1); + + jnc(done); + + // mov(edx, i.src1.reg().cvt32()); + mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); + + shr(ecx, RESERVE_BLOCK_SHIFT); + mov(edx, ecx); + shr(edx, 6); // divide by 64 + lea(rdx, ptr[rax + rdx * 8]); + // begin acquiring exclusive access to cacheline containing our bit + prefetchw(ptr[rdx]); + + cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), + rdx); + jnz(reservation_isnt_for_our_addr); + + mov(rax, + GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_))); + + // we need modulo bitsize, it turns out bittests' modulus behavior for the + // bitoffset only applies for register operands, for memory ones we bug out + // todo: actually, the above note may not be true, double check it + and_(ecx, 64 - 1); + cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx); + jnz(reservation_isnt_for_our_addr); + + // was our memory modified by kernel code or something? + lock(); + if (bit64) { + cmpxchg(ptr[r9], r8); + + } else { + cmpxchg(ptr[r9], r8d); + } + // the ZF flag is unaffected by BTR! 
we exploit this for the retval + + // cancel our lock on the 65k block + lock(); + btr(qword[rdx], rcx); + + // Xbyak::Label check_fucky; + jc(done); + DebugBreak(); + + // L(check_fucky); + + L(done); + + // i don't care that theres a dependency on the prev value of rax atm + // sadly theres no CF&ZF condition code + setz(al); + setc(ah); + cmp(ax, 0x0101); + ret(); + L(reservation_isnt_for_our_addr); + DebugBreak(); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + void X64HelperEmitter::EmitSaveVolatileRegs() { // Save off volatile registers. // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); @@ -975,6 +1099,7 @@ void X64Backend::InitializeBackendContext(void* ctx) { // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); + bctx->reserve_helper_ = &reserve_helper_; } void X64Backend::DeinitializeBackendContext(void* ctx) { X64BackendContext* bctx = BackendContextForGuestContext(ctx); diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 79f635722..55ef7da6d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); typedef void (*ResolveFunctionThunk)(); +#define RESERVE_BLOCK_SHIFT 16 + +#define RESERVE_NUM_ENTRIES \ + ((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT) +// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations +struct ReserveHelper { + uint64_t blocks[RESERVE_NUM_ENTRIES / 64]; + + ReserveHelper() { memset(blocks, 0, sizeof(blocks)); } +}; + struct X64BackendStackpoint { uint64_t host_stack_; unsigned guest_stack_; @@ -55,16 +66,21 @@ struct X64BackendStackpoint { // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { + ReserveHelper* reserve_helper_; + uint64_t cached_reserve_value_; // guest_tick_count is used if inline_loadclock is used uint64_t* guest_tick_count; // records mapping of host_stack to guest_stack X64BackendStackpoint* stackpoints; - + uint64_t cached_reserve_offset; + uint32_t cached_reserve_bit; unsigned int current_stackpoint_depth; unsigned int mxcsr_fpu; // currently, the way we implement rounding mode // affects both vmx and the fpu unsigned int mxcsr_vmx; - unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx + // bit 0 = 0 if mxcsr is fpu, else it is vmx + // bit 1 = got reserve + unsigned int flags; unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted // add of it by... 
2 bytes lol }; @@ -152,9 +168,18 @@ class X64Backend : public Backend { void* synchronize_guest_and_host_stack_helper_size8_ = nullptr; void* synchronize_guest_and_host_stack_helper_size16_ = nullptr; void* synchronize_guest_and_host_stack_helper_size32_ = nullptr; + + public: + void* try_acquire_reservation_helper_ = nullptr; + void* reserved_store_32_helper = nullptr; + void* reserved_store_64_helper = nullptr; + + private: #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; #endif + + alignas(64) ReserveHelper reserve_helper_; }; } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 2cee66ece..75986b355 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -387,7 +387,6 @@ struct LVL_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128); - struct LVR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label endpoint{}; @@ -483,6 +482,84 @@ struct STVR_V128 : Sequence> { } }; EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128); + +struct RESERVED_LOAD_INT32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // should use phys addrs, not virtual addrs! + + // try_acquire_reservation_helper_ doesnt spoil rax + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + // begin acquiring exclusive access to the location + // we will do a load first, but we'll need exclusive access once we do our + // atomic op in the store + e.prefetchw(e.ptr[e.rax]); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.call(e.backend()->try_acquire_reservation_helper_); + e.mov(i.dest, e.dword[e.rax]); + + e.mov( + e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)), + i.dest.reg().cvt64()); + } +}; + +struct RESERVED_LOAD_INT64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // try_acquire_reservation_helper_ doesnt spoil rax + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.ecx, i.src1.reg().cvt32()); + // begin acquiring exclusive access to the location + // we will do a load first, but we'll need exclusive access once we do our + // atomic op in the store + e.prefetchw(e.ptr[e.rax]); + + e.call(e.backend()->try_acquire_reservation_helper_); + e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]); + + e.mov( + e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)), + i.dest.reg()); + } +}; + +EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32, + RESERVED_LOAD_INT64); + +// address, value + +struct RESERVED_STORE_INT32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // edx=guest addr + // r9 = host addr + // r8 = value + // if ZF is set and CF is set, we succeeded + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.r8d, i.src2); + e.call(e.backend()->reserved_store_32_helper); + e.setz(i.dest); + } +}; + +struct RESERVED_STORE_INT64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.r8, i.src2); + e.call(e.backend()->reserved_store_64_helper); + e.setz(i.dest); + } +}; + +EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32, + RESERVED_STORE_INT64); + // ============================================================================ // OPCODE_ATOMIC_COMPARE_EXCHANGE // 
============================================================================ diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 554906660..5f428ad6c 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32 e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( - e, i, - [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomiss(src1, src2); }); } @@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64 e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( - e, i, - [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomisd(src1, src2); }); } @@ -1935,53 +1933,6 @@ struct MUL_ADD_V128 }; EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); -struct NEGATED_MUL_ADD_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovapd(e.xmm3, src1); - e.vfmadd213sd(e.xmm3, src2, src3); - e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); - } else { - // todo: might need to use x87 in this case... - e.vmulsd(e.xmm3, src1, src2); - e.vaddsd(i.dest, e.xmm3, src3); - e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); - } - } -}; -struct NEGATED_MUL_ADD_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Vmx); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovaps(e.xmm3, src1); - e.vfmadd213ps(e.xmm3, src2, src3); - e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); - } else { - // todo: might need to use x87 in this case... - e.vmulps(e.xmm3, src1, src2); - e.vaddps(i.dest, e.xmm3, src3); - e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64, - NEGATED_MUL_ADD_V128); - // ============================================================================ // OPCODE_MUL_SUB // ============================================================================ @@ -2038,53 +1989,6 @@ struct MUL_SUB_V128 }; EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128); -struct NEGATED_MUL_SUB_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovapd(e.xmm3, src1); - e.vfmsub213sd(e.xmm3, src2, src3); - e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); - } else { - // todo: might need to use x87 in this case... 
- e.vmulsd(e.xmm3, src1, src2); - e.vsubsd(i.dest, e.xmm3, src3); - e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); - } - } -}; -struct NEGATED_MUL_SUB_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Vmx); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovaps(e.xmm3, src1); - e.vfmsub213ps(e.xmm3, src2, src3); - e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); - } else { - // todo: might need to use x87 in this case... - e.vmulps(e.xmm3, src1, src2); - e.vsubps(i.dest, e.xmm3, src3); - e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64, - NEGATED_MUL_SUB_V128); - // ============================================================================ // OPCODE_NEG // ============================================================================ @@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) { // src1 constant. // `and` instruction only supports up to 32-bit immediate constants // 64-bit constants will need a temp register - //only possible with 64 bit inputs, andc is the only instruction that generates this + // only possible with 64 bit inputs, andc is the only instruction that + // generates this auto temp = GetTempReg(e); e.mov(temp, i.src1.constant()); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 7a5935001..00634a5e6 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) { return i->dest; } +Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) { + ASSERT_ADDRESS_TYPE(address); + + Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + + return i->dest; +} + +Value* HIRBuilder::StoreWithReserve(Value* address, Value* value, + TypeName type) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE)); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; + return i->dest; +} void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) { ASSERT_ADDRESS_TYPE(address); Instr* i = AppendInstr(OPCODE_STORE_info, store_flags); @@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) { return i->dest; } -Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) { - ASSERT_TYPES_EQUAL(value1, value2); - ASSERT_TYPES_EQUAL(value1, value3); - - Instr* i = - AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type)); - i->set_src1(value1); - i->set_src2(value2); - i->set_src3(value3); - return i->dest; -} - -Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) { - ASSERT_TYPES_EQUAL(value1, value2); - ASSERT_TYPES_EQUAL(value1, value3); - - Instr* i = - AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type)); - i->set_src1(value1); - i->set_src2(value2); - i->set_src3(value3); - return i->dest; -} - Value* HIRBuilder::Neg(Value* value) { Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type)); i->set_src1(value); diff --git a/src/xenia/cpu/hir/hir_builder.h 
b/src/xenia/cpu/hir/hir_builder.h index d83806cd9..b33f18aaf 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -189,6 +189,9 @@ class HIRBuilder { uint32_t store_flags = 0); Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); + // create a reserve on an address, + Value* LoadWithReserve(Value* address, TypeName type); + Value* StoreWithReserve(Value* address, Value* value, TypeName type); Value* LoadVectorLeft(Value* address); Value* LoadVectorRight(Value* address); @@ -242,10 +245,7 @@ class HIRBuilder { Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3 Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3 - Value* NegatedMulAdd(Value* value1, Value* value2, - Value* value3); // -((1 * 2) + 3) - Value* NegatedMulSub(Value* value1, Value* value2, - Value* value3); // -((1 * 2) - 3) + Value* Neg(Value* value); Value* Abs(Value* value); Value* Sqrt(Value* value); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 1bd85cae9..5a1bdc53b 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -248,9 +248,7 @@ enum Opcode { OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type. OPCODE_DIV, OPCODE_MUL_ADD, - OPCODE_NEGATED_MUL_ADD, OPCODE_MUL_SUB, - OPCODE_NEGATED_MUL_SUB, OPCODE_NEG, OPCODE_ABS, OPCODE_SQRT, @@ -292,7 +290,10 @@ enum Opcode { // as we already have OPCODE_ROUND. round double to float ( // ppc "single" fpu instruction result rounding behavior ) OPCODE_SET_NJM, - OPCODE_DELAY_EXECUTION, //for db16cyc + OPCODE_DELAY_EXECUTION, // for db16cyc + OPCODE_RESERVED_LOAD, + OPCODE_RESERVED_STORE, + __OPCODE_MAX_VALUE, // Keep at end. 
}; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index e27f30b46..c5c089e85 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -218,7 +218,12 @@ DEFINE_OPCODE( "context_barrier", OPCODE_SIG_X, 0) -DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0) + +DEFINE_OPCODE( + OPCODE_DELAY_EXECUTION, + "delay_execution", + OPCODE_SIG_X, + 0) DEFINE_OPCODE( OPCODE_LOAD_MMIO, "load_mmio", @@ -453,19 +458,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) -DEFINE_OPCODE( - OPCODE_NEGATED_MUL_ADD, - "negated_mul_add", - OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) - -DEFINE_OPCODE( - OPCODE_NEGATED_MUL_SUB, - "negated_mul_sub", - OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) - - DEFINE_OPCODE( OPCODE_NEG, "neg", @@ -719,3 +711,15 @@ DEFINE_OPCODE( "storev_right", OPCODE_SIG_X_V_V, OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_RESERVED_LOAD, + "reserved_load", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_RESERVED_STORE, + "reserved_store", + OPCODE_SIG_V_V_V, + OPCODE_FLAG_MEMORY) \ No newline at end of file diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index b1e2d2964..d676c4ada 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p, uint8_t rex_b = rex & 0b0001; uint8_t rex_x = rex & 0b0010; uint8_t rex_r = rex & 0b0100; - //uint8_t rex_w = rex & 0b1000; + // uint8_t rex_w = rex & 0b1000; // http://www.sandpile.org/x86/opc_rm.htm // http://www.sandpile.org/x86/opc_sib.htm @@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { if (cur_access != memory::PageAccess::kNoAccess && (!is_write || cur_access != memory::PageAccess::kReadOnly)) { // Another thread has cleared this watch. Abort. 
+ XELOGD("Race condition on watch, was already cleared by another thread!"); return true; } // The address is not found within any range, so either a write watch or an diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 386abf4bd..0ea2fb4ad 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, Value* b = f.VectorDenormFlush(f.LoadVR(vb)); Value* c = f.VectorDenormFlush(f.LoadVR(vc)); - Value* v = f.NegatedMulSub(a, c, b); + Value* v = f.Neg(f.MulSub(a, c, b)); f.StoreVR(vd, v); return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc index c491ad09a..79c4240e1 100644 --- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc @@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); @@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) - Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) - Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); @@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) { f.StoreFPR(i.X.RT, v); /* The contents of frB with bit 0 cleared are placed into frD. -Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign -bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR. -Other registers altered: -• Condition Register (CR1 field): +Note that the fabs instruction treats NaNs just like any other kind of value. +That is, the sign bit of a NaN may be altered by fabs. This instruction does not +alter the FPSCR. 
Other registers altered: • Condition Register (CR1 field): Affected: FX, FEX, VX, OX (if Rc = 1) */ - // f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { // todo } @@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- !abs(frB) Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB))); f.StoreFPR(i.X.RT, v); - //f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { - //todo + // todo } return 0; } @@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) { // frD <- ¬ frB[0] || frB[1-63] Value* v = f.Neg(f.LoadFPR(i.X.RB)); f.StoreFPR(i.X.RT, v); - //f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { - //todo + // todo } return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 69c7fdf9e..b4bdabb49 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -22,6 +22,12 @@ DEFINE_bool( "instructions were written with the Xbox 360's cache in mind, and modern " "processors do their own automatic prefetching.", "CPU"); + +DEFINE_bool(no_reserved_ops, false, + "For testing whether a game may have races with a broken reserved " + "load/store impl", + "CPU"); + namespace xe { namespace cpu { namespace ppc { @@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) { // already, but I haven't see anything but interrupt callbacks (which are // always under a global lock) do that yet. // We issue a memory barrier here to make sure that we get good values. - f.MemoryBarrier(); - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE)); - f.StoreReserved(rt); - f.StoreGPR(i.X.RT, rt); + + if (cvars::no_reserved_ops) { + f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE))); + + } else { + f.MemoryBarrier(); + + Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE)); + f.StoreGPR(i.X.RT, rt); + } return 0; } @@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) { // already, but I haven't see anything but interrupt callbacks (which are // always under a global lock) do that yet. // We issue a memory barrier here to make sure that we get good values. - f.MemoryBarrier(); Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); - f.StoreReserved(rt); - f.StoreGPR(i.X.RT, rt); + if (cvars::no_reserved_ops) { + f.StoreGPR(i.X.RT, + f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE)); + + } else { + f.MemoryBarrier(); + + Value* rt = + f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE); + f.StoreGPR(i.X.RT, rt); + } return 0; } @@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT)); - Value* res = f.ByteSwap(f.LoadReserved()); - Value* v = f.AtomicCompareExchange(ea, res, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + + if (cvars::no_reserved_ops) { + f.Store(ea, rt); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + } else { + Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + } f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); // Issue memory barrier for when we go out of lock and want others to see our // updates. 
- - f.MemoryBarrier(); - + if (!cvars::no_reserved_ops) { + f.MemoryBarrier(); + } return 0; } @@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) { // This will always succeed if under the global lock, however. Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); - Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE)); - Value* v = f.AtomicCompareExchange(ea, res, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + + if (cvars::no_reserved_ops) { + f.Store(ea, rt); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + } else { + Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + } + f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); // Issue memory barrier for when we go out of lock and want others to see our // updates. - f.MemoryBarrier(); + if (!cvars::no_reserved_ops) { + f.MemoryBarrier(); + } return 0; } - // Floating-point load (A-19) int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {
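Taken together, the ppc_emit_memory changes route lwarx/ldarx through LoadWithReserve and stwcx./stdcx. through StoreWithReserve, and the x64 helpers approximate the reservation with the per-block bit plus a cmpxchg against the value cached at reserve time. Below is a rough C++ model of the store-conditional path with hypothetical names; the real logic is the emitted EmitReservedStoreHelper above, which additionally verifies that the cached reservation matches the address being stored to.

#include <atomic>
#include <cstdint>

// Illustrative model only: succeed when memory still holds the value seen by
// the reserved load, then drop the per-block reservation bit either way.
bool ReservedStore32(std::atomic<uint64_t>& block_bits, uint32_t bit,
                     std::atomic<uint32_t>& host_word, uint32_t cached_value,
                     uint32_t new_value) {
  uint32_t expected = cached_value;
  // lock cmpxchg in the helper: only store if nobody changed the word.
  bool stored = host_word.compare_exchange_strong(expected, new_value);
  // lock btr in the helper: release our claim on the 64 KiB block.
  block_bits.fetch_and(~(1ULL << bit));
  return stored;  // feeds cr0.eq for stwcx./stdcx.
}

This model also shows why the commit message calls out same-value non-reserved stores: success is judged by comparing against the value cached at reserve time rather than by tracking every intervening write, so a store that writes back the identical value between the reserve and the conditional store is indistinguishable from no store at all.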