Merge pull request #149 from chrisps/canary_experimental

reimplement reserved load/store
chrisps 2023-04-15 17:23:22 -04:00 committed by GitHub
commit 26dc48f695
12 changed files with 355 additions and 187 deletions


@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
void* sync_func, unsigned stack_element_size);
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
synchronize_guest_and_host_stack_helper_, 4);
}
try_acquire_reservation_helper_ =
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
inc(ecx);
jmp(checkbp, T_NEAR);
L(we_good);
// we're popping this return address, so go down by one
sub(edx, sizeof(X64BackendStackpoint));
dec(ecx);
L(checkbp);
@ -857,6 +864,125 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
shr(ecx, RESERVE_BLOCK_SHIFT);
xor_(r9d, r9d);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[r8 + rdx * 8]);
and_(ecx, 64 - 1);
lock();
bts(qword[rdx], rcx);
// set a flag in this thread's backend context recording whether the bts above
// acquired the reservation
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
ret();
L(already_has_a_reservation);
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
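// Hedged C++ sketch of what the emitted helper above does (illustrative only,
// not part of the backend; assumes the X64BackendContext/ReserveHelper
// definitions from x64_backend.h, <atomic>/<cstdint>, and C++20
// std::atomic_ref). On entry ecx holds the guest address; success is reported
// by setting bit 1 of X64BackendContext::flags.
static void TryAcquireReservationSketch(X64BackendContext* bctx,
                                        uint32_t guest_addr) {
  uint32_t block = guest_addr >> RESERVE_BLOCK_SHIFT;  // one bit per 64 KiB block
  uint64_t* word = &bctx->reserve_helper_->blocks[block >> 6];
  uint32_t bit = block & 63;
  // lock bts: set the bit and observe whether someone already owned the block
  uint64_t prev = std::atomic_ref<uint64_t>(*word).fetch_or(1ull << bit);
  bctx->cached_reserve_offset = reinterpret_cast<uint64_t>(word);
  bctx->cached_reserve_bit = bit;
  if (!(prev & (1ull << bit))) {
    bctx->flags |= 2;  // bit 1 = got reserve
  }
}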
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label done;
Xbyak::Label reservation_isnt_for_our_addr;
Xbyak::Label somehow_double_cleared;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
jnc(done);
mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
shr(ecx, RESERVE_BLOCK_SHIFT);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[rax + rdx * 8]);
// begin acquiring exclusive access to cacheline containing our bit
prefetchw(ptr[rdx]);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
jnz(reservation_isnt_for_our_addr);
mov(rax,
GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));
// mask to modulo 64 ourselves: bt/btr only apply the modulus to the bit offset
// when the operand is a register; with a memory operand the offset can index
// beyond the addressed qword
and_(ecx, 64 - 1);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
jnz(reservation_isnt_for_our_addr);
// was our memory modified by kernel code or something?
lock();
if (bit64) {
cmpxchg(ptr[r9], r8);
} else {
cmpxchg(ptr[r9], r8d);
}
// ZF is unaffected by btr, so the cmpxchg result survives for the return value
// release our claim on the 64 KiB block
lock();
btr(qword[rdx], rcx);
jnc(somehow_double_cleared);
L(done);
// don't care about the dependency on the previous value of rax here
// sadly there's no condition code for CF & ZF together, so combine them manually
setz(al);
setc(ah);
cmp(ax, 0x0101);
ret();
// these could share one label, but separate labels tell us which check failed
// when a break is hit
L(reservation_isnt_for_our_addr);
DebugBreak();
L(somehow_double_cleared); // somehow, something else cleared our reserve??
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
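// Matching hedged sketch of the store helper above (illustrative only, same
// assumptions as the acquire sketch; std::abort() stands in for DebugBreak()).
// ecx = guest address, r9 = host address, r8 = value; the emitted code reports
// success through ZF+CF, here it is simply a bool.
static bool ReservedStore64Sketch(X64BackendContext* bctx, uint32_t guest_addr,
                                  uint64_t* host_addr, uint64_t value) {
  if (!(bctx->flags & 2)) {
    return false;  // no reservation was ever acquired
  }
  bctx->flags &= ~2u;  // consume the reservation
  uint32_t block = guest_addr >> RESERVE_BLOCK_SHIFT;
  uint64_t* word = &bctx->reserve_helper_->blocks[block >> 6];
  uint32_t bit = block & 63;
  // the reservation must have been taken on this exact block
  if (bctx->cached_reserve_offset != reinterpret_cast<uint64_t>(word) ||
      bctx->cached_reserve_bit != bit) {
    std::abort();  // reservation_isnt_for_our_addr
  }
  // publish the new value only if memory still holds what reserved_load saw
  uint64_t expected = bctx->cached_reserve_value_;
  bool unchanged = std::atomic_ref<uint64_t>(*host_addr)
                       .compare_exchange_strong(expected, value);
  // release our claim on the 64 KiB block
  uint64_t prev = std::atomic_ref<uint64_t>(*word).fetch_and(~(1ull << bit));
  if (!(prev & (1ull << bit))) {
    std::abort();  // somehow_double_cleared
  }
  return unchanged;
}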
void X64HelperEmitter::EmitSaveVolatileRegs() {
// Save off volatile registers.
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
@ -975,6 +1101,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
bctx->reserve_helper_ = &reserve_helper_;
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);


@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
#define RESERVE_BLOCK_SHIFT 16
#define RESERVE_NUM_ENTRIES \
((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT)
// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations
struct ReserveHelper {
uint64_t blocks[RESERVE_NUM_ENTRIES / 64];
ReserveHelper() { memset(blocks, 0, sizeof(blocks)); }
};
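// Sizing, for reference: RESERVE_NUM_ENTRIES = 4 GiB >> 16 = 65536, i.e. one bit
// per 64 KiB block of guest address space, so blocks[] holds 65536 / 64 = 1024
// uint64_t words and the whole reservation bitmap occupies 8 KiB.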
struct X64BackendStackpoint {
uint64_t host_stack_;
unsigned guest_stack_;
@ -55,16 +66,21 @@ struct X64BackendStackpoint {
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
uint64_t* guest_tick_count;
// records mapping of host_stack to guest_stack
X64BackendStackpoint* stackpoints;
uint64_t cached_reserve_offset;
uint32_t cached_reserve_bit;
unsigned int current_stackpoint_depth;
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 1 = got reserve
unsigned int flags;
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
@ -152,9 +168,18 @@ class X64Backend : public Backend {
void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
public:
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
#endif
alignas(64) ReserveHelper reserve_helper_;
};
} // namespace x64


@ -387,7 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label endpoint{};
@ -483,6 +482,84 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
struct RESERVED_LOAD_INT32
: Sequence<RESERVED_LOAD_INT32, I<OPCODE_RESERVED_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// should use physical addresses, not virtual addresses!
// try_acquire_reservation_helper_ doesn't clobber rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.mov(e.ecx, i.src1.reg().cvt32());
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.dword[e.rax]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg().cvt64());
}
};
struct RESERVED_LOAD_INT64
: Sequence<RESERVED_LOAD_INT64, I<OPCODE_RESERVED_LOAD, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// try_acquire_reservation_helper_ doesn't clobber rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.ecx, i.src1.reg().cvt32());
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg());
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32,
RESERVED_LOAD_INT64);
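// Hedged sketch of the 32-bit sequence above in C++ terms (illustrative only;
// reuses the hypothetical TryAcquireReservationSketch shown with the backend
// helpers). The loaded value is cached so the store helper can later verify
// that memory was not modified between the reserve and the conditional store:
static uint32_t ReservedLoad32Sketch(X64BackendContext* bctx, uint32_t guest_addr,
                                     uint32_t* host_addr) {
  // the real sequence also issues prefetchw here to start pulling the line in
  // exclusive state, since a conditional store is expected to follow
  TryAcquireReservationSketch(bctx, guest_addr);
  uint32_t value = *host_addr;
  bctx->cached_reserve_value_ = value;  // zero-extended into the 64-bit slot
  return value;
}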
// address, value
struct RESERVED_STORE_INT32
: Sequence<RESERVED_STORE_INT32,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// ecx = guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8d, i.src2);
e.call(e.backend()->reserved_store_32_helper);
e.setz(i.dest);
}
};
struct RESERVED_STORE_INT64
: Sequence<RESERVED_STORE_INT64,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8, i.src2);
e.call(e.backend()->reserved_store_64_helper);
e.setz(i.dest);
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32,
RESERVED_STORE_INT64);
// ============================================================================
// OPCODE_ATOMIC_COMPARE_EXCHANGE
// ============================================================================


@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomiss(src1, src2);
});
}
@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomisd(src1, src2);
});
}
@ -1935,53 +1933,6 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
struct NEGATED_MUL_ADD_F64
: Sequence<NEGATED_MUL_ADD_F64,
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmadd213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vaddsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_ADD_V128
: Sequence<NEGATED_MUL_ADD_V128,
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmadd213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vaddps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
NEGATED_MUL_ADD_V128);
// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@ -2038,53 +1989,6 @@ struct MUL_SUB_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
struct NEGATED_MUL_SUB_F64
: Sequence<NEGATED_MUL_SUB_F64,
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmsub213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vsubsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_SUB_V128
: Sequence<NEGATED_MUL_SUB_V128,
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmsub213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vsubps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
NEGATED_MUL_SUB_V128);
// ============================================================================
// OPCODE_NEG
// ============================================================================
@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) {
// src1 constant.
// `and` instruction only supports up to 32-bit immediate constants
// 64-bit constants will need a temp register
// only possible with 64-bit inputs; andc is the only instruction that
// generates this
auto temp = GetTempReg<typename decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());


@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) {
return i->dest;
}
Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::StoreWithReserve(Value* address, Value* value,
TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE));
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
return i->dest;
}
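// Usage sketch (illustrative; mirrors the ldarx/stdcx changes further below):
// a 64-bit load-reserve / store-conditional pair built with the new methods,
// where `f` is a PPCHIRBuilder and `ea` a guest effective address:
//   Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
//   ...
//   Value* ok = f.StoreWithReserve(ea, f.ByteSwap(new_value), INT64_TYPE);
//   f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), ok);  // INT8 1 on success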
void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STORE_info, store_flags);
@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
return i->dest;
}
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::Neg(Value* value) {
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
i->set_src1(value);


@ -189,6 +189,9 @@ class HIRBuilder {
uint32_t store_flags = 0);
Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
// create a reservation on an address and load through it; the paired
// StoreWithReserve only succeeds if the reservation is still held
Value* LoadWithReserve(Value* address, TypeName type);
Value* StoreWithReserve(Value* address, Value* value, TypeName type);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
@ -242,10 +245,7 @@ class HIRBuilder {
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
Value* NegatedMulAdd(Value* value1, Value* value2,
Value* value3); // -((1 * 2) + 3)
Value* NegatedMulSub(Value* value1, Value* value2,
Value* value3); // -((1 * 2) - 3)
Value* Neg(Value* value);
Value* Abs(Value* value);
Value* Sqrt(Value* value);


@ -248,9 +248,7 @@ enum Opcode {
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
OPCODE_DIV,
OPCODE_MUL_ADD,
OPCODE_NEGATED_MUL_ADD,
OPCODE_MUL_SUB,
OPCODE_NEGATED_MUL_SUB,
OPCODE_NEG,
OPCODE_ABS,
OPCODE_SQRT,
@ -292,7 +290,10 @@ enum Opcode {
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
OPCODE_SET_NJM,
OPCODE_DELAY_EXECUTION,  // for db16cyc
OPCODE_RESERVED_LOAD,
OPCODE_RESERVED_STORE,
__OPCODE_MAX_VALUE, // Keep at end.
};


@ -218,7 +218,12 @@ DEFINE_OPCODE(
"context_barrier",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_DELAY_EXECUTION,
"delay_execution",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_LOAD_MMIO,
"load_mmio",
@ -453,19 +458,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_ADD,
"negated_mul_add",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_SUB,
"negated_mul_sub",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
@ -719,3 +711,15 @@ DEFINE_OPCODE(
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_LOAD,
"reserved_load",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_STORE,
"reserved_store",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_MEMORY)


@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
uint8_t rex_b = rex & 0b0001;
uint8_t rex_x = rex & 0b0010;
uint8_t rex_r = rex & 0b0100;
// uint8_t rex_w = rex & 0b1000;
// http://www.sandpile.org/x86/opc_rm.htm
// http://www.sandpile.org/x86/opc_sib.htm
@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
if (cur_access != memory::PageAccess::kNoAccess &&
(!is_write || cur_access != memory::PageAccess::kReadOnly)) {
// Another thread has cleared this watch. Abort.
XELOGD("Race condition on watch, was already cleared by another thread!");
return true;
}
// The address is not found within any range, so either a write watch or an


@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.NegatedMulSub(a, c, b);
Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}


@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
f.StoreFPR(i.X.RT, v);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value.
That is, the sign bit of a NaN may be altered by fabs. This instruction does not
alter the FPSCR. Other registers altered: Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}


@ -22,6 +22,12 @@ DEFINE_bool(
"instructions were written with the Xbox 360's cache in mind, and modern "
"processors do their own automatic prefetching.",
"CPU");
DEFINE_bool(no_reserved_ops, false,
"Emit reserved load/store instructions as plain loads and stores, for "
"testing whether a game's apparent races come from a broken reserved "
"load/store implementation.",
"CPU");
namespace xe {
namespace cpu {
namespace ppc {
@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't seen anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE));
f.StoreReserved(rt);
f.StoreGPR(i.X.RT, rt);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE)));
} else {
f.MemoryBarrier();
Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't seen anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);
f.StoreReserved(rt);
f.StoreGPR(i.X.RT, rt);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT,
f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE));
} else {
f.MemoryBarrier();
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
Value* res = f.ByteSwap(f.LoadReserved());
Value* v = f.AtomicCompareExchange(ea, res, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
f.MemoryBarrier();
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) {
// This will always succeed if under the global lock, however.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE));
Value* v = f.AtomicCompareExchange(ea, res, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
f.MemoryBarrier();
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
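// For context, the guest-side pattern these opcodes implement: lwarx takes the
// reservation and loads, stwcx. stores only if the reservation survived, and the
// guest loops on cr0.eq otherwise - morally a compare-exchange loop. A hedged,
// emulator-independent C++ equivalent (assumes <atomic>, <cstdint>):
static uint32_t GuestAtomicAddSketch(std::atomic<uint32_t>& word, uint32_t n) {
  uint32_t old_value = word.load();
  // compare_exchange_weak reloads old_value on failure, like re-running lwarx
  while (!word.compare_exchange_weak(old_value, old_value + n)) {
  }
  return old_value;
}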
// Floating-point load (A-19)
int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {