Add special cases to DOT_PRODUCT_3/4 that detect whether they're calculating a squared length (a dot product of a value with itself)

Add an alternate path to DOT_PRODUCT_3/4 for use_fast_dot_product that skips all of the status-register handling and just remaps inf to QNaN
Add OPCODE_TO_SINGLE to replace the CONVERT_F32_F64 - CONVERT_F64_F32 round-trip we used to emit, with the idea that a backend could implement more correct rounding behavior if its architecture allows (see the first sketch after this list)
Remove some impossible sequences like MUL_HI_I8/I16, MUL_ADD_F32, DIV_V128; these instructions have no equivalent in PPC. Many other instructions are unused/dead code and should be removed to make the x64 backend a better reference for future backends
Add backend_flags to Instr: a flags field that a backend can use for whatever it wants when generating code
Add a backend instruction flag to x64 that tells it not to generate code for an instruction; this lets sequences consume subsequent instructions (sketched after this list)
Generate actual x64 code for the VSL instruction instead of using CallNativeSafe
Detect repeated COMPARE instructions with identical operands and reuse the result already in FLAGS if so; this eliminates a ton of redundant compare/set instructions
If a COMPARE instruction's destination is stored to the context with no intervening instruction and no additional uses besides the store, do setcc [ctx address] directly
Detect PREFETCHW support and use it in CACHE_CONTROL when a prefetch for write is requested, instead of prefetching to all cache levels
Fixed an accident in an earlier commit of mine: VECTOR_DENORMFLUSH was not being emitted at all, so denormal inputs to MUL_ADD_V128 were not being flushed to zero and neither were outputs from DOT_PRODUCT_X. I believe this introduced a bug into RDR where a wagon wouldn't spawn (https://discord.com/channels/308194948048486401/308207592482668545/1000443975817252874)
Compute fresx in double precision using RECIP_F64 and then round to single, instead of doing (double)(1.0f / (float)value), which matches the original behavior better
Refactor some of ppc_emit_fpu; many of the InstrEmit functions are identical except for whether they round to single or not
Added "tail emitters" to X64Emitter. These are callbacks that get invoked with their label and the X64Emitter after the epilog code, which lets us move cold code out of the critical path and, in the future, place constant pools near functions (see the sketch after this list)
guest_to_host_thunk/host_to_guest_thunk now gets a direct rel32 call, instead of a mov into a register followed by an indirect call
Add the X64BackendContext structure, which represents data placed immediately before the start of the PPCContext
Instead of doing a branchless sequence, do a compare and a jump to tail-emitted fixup code for address translation. This makes converting addresses a 3-uop affair in most cases (sketched after this list)
Do the QNaN move for dot product in a tail emitter
Detect whether EFLAGS bits are treated as independent variables by the current CPU (not really detecting it, just checking whether it's Zen) and, if so, generate inc/dec for add/sub by 1
Detect whether the low 32 bits of the membase are 0. If they are, we can use membasereg.cvt32() in place of an immediate 0 in many places, particularly in stores (see the sketch after this list)
Detect the LOAD-MODIFY-STORE pattern for context variables (currently only done for 64-bit ones) and turn it into a single modify of [context ptr]. This is done for add, sub, and, or, xor, not, and neg (sketched after this list)
Tail-emit the error handling for TRAP opcodes
Stub out unused trap opcodes like TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_I16 (the call_true/return_true opcodes for these types are also probably unused)
Remove BackpropTruncations. It was poorly written and caused crashes in Viva Piñata (https://discord.com/channels/308194948048486401/701111856600711208/1000249460451983420)
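
A minimal standalone sketch of the rounding OPCODE_TO_SINGLE performs, matching the constant-folding Value::ToSingle() does further down in the diff; the function and variable names here are illustrative only.

#include <cstdio>

// Round a double to single precision and widen back to double, which is the
// behavior Value::ToSingle() applies to constants.
static double ToSingleRounding(double v) {
  return static_cast<double>(static_cast<float>(v));
}

int main() {
  double v = 1.0 / 3.0;
  std::printf("%.17g -> %.17g\n", v, ToSingleRounding(v));
  return 0;
}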
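
A simplified model of how the new Instr::backend_flags field and the x64-specific INSTR_X64_FLAGS_ELIMINATED flag are meant to interact; Instr here is a stand-in carrying only the new field, and EmitBlock/EmitSequenceFor are hypothetical names.

#include <cstdint>
#include <vector>

// Stand-in for hir::Instr with only the field added by this commit.
struct Instr {
  uint32_t backend_flags = 0;
};

// Mirrors the flag added in x64_emitter.h: a sequence that already covered a
// later instruction marks it so no code is generated for it.
constexpr uint32_t INSTR_X64_FLAGS_ELIMINATED = 1;

static void EmitBlock(std::vector<Instr>& block) {
  for (Instr& i : block) {
    if (i.backend_flags & INSTR_X64_FLAGS_ELIMINATED) {
      continue;  // consumed by an earlier sequence, e.g. the
                 // load-modify-store fold sketched below
    }
    // EmitSequenceFor(i);  // hypothetical: normal per-instruction codegen
  }
}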
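
A simplified sketch of the tail-emitter mechanism; Label and Emitter stand in for Xbyak::Label and X64Emitter, and std::list is used here only so the returned label reference stays stable (the real emitter keeps a std::vector<TailEmitter>).

#include <cstdint>
#include <functional>
#include <list>
#include <utility>

struct Label {};  // placeholder for Xbyak::Label
class Emitter;
using TailEmitCallback = std::function<void(Emitter&, Label&)>;

struct TailEmitter {
  Label label;
  uint32_t alignment;
  TailEmitCallback func;
};

class Emitter {  // placeholder for X64Emitter
 public:
  // Called from a sequence on the hot path: queue cold code and get back a
  // label that can be jumped to immediately, even though the code itself is
  // emitted later.
  Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0) {
    tail_code_.push_back(TailEmitter{Label{}, alignment, std::move(callback)});
    return tail_code_.back().label;
  }

  // Called once after the epilog: emit all queued cold code out of line.
  void EmitTailCode() {
    for (TailEmitter& item : tail_code_) {
      // if (item.alignment) align(item.alignment);
      item.func(*this, item.label);  // callback binds its label and emits
    }
    tail_code_.clear();
  }

 private:
  std::list<TailEmitter> tail_code_;
};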
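
A behavioral model of the new address translation, not the emitted instructions: on hosts whose allocation granularity is larger than 4 KiB, guest addresses in the 0xE0000000+ range need an extra 0x1000 before indexing the membase. The threshold and constant come from the comments in the memory sequences below; the exact comparison the JIT emits differs, and the function name is made up.

#include <cstdint>

// Hot path is now roughly mov/cmp/jae; the rarely taken +0x1000 fixup lives
// in tail-emitted code (Do0x1000Add) that jumps straight back.
static uint64_t GuestToHostOffset(uint32_t guest_address) {
  uint64_t offset = guest_address;      // mov eax, guest
  if (guest_address >= 0xE0000000u) {   // cmp + jae to the cold tail stub
    offset += 0x1000;                   // Do0x1000Add in the stub
  }
  return offset;                        // then addressed as [membase + rax]
}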
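
A sketch of the check behind CanUseMembaseLow32As0(), mirroring the may_use_membase32_as_zero_reg_ computation in x64_emitter.cc; the free-function form is illustrative.

#include <cstdint>

// True when the low 32 bits of the host membase pointer are zero. In that
// case "mov dword [mem], membase.cvt32()" stores the same value as
// "mov dword [mem], 0" but without spending 4 bytes on the immediate.
static bool MayUseMembase32AsZero(const void* virtual_membase) {
  return static_cast<uint32_t>(
             reinterpret_cast<uintptr_t>(virtual_membase)) == 0;
}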
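
A sketch of what the load-modify-store fold buys for a 64-bit context field; the struct, field, and function are hypothetical, and only the HIR pattern and the resulting read-modify-write idea come from the diff.

#include <cstdint>

// Pattern detected by GetLoadModStoreContext():
//   v0 = load_context +off
//   v1 = add v0, 1              (also sub/and/or/xor/not/neg)
//   store_context +off, v1
// Old codegen: mov rax, [ctx+off] / add rax, 1 / mov [ctx+off], rax
// New codegen: inc qword [ctx+off] (or add qword [ctx+off], imm), with the
// modify and store instructions marked INSTR_X64_FLAGS_ELIMINATED.
struct FakeContext {
  uint64_t some_counter;  // hypothetical 64-bit context field
};

static void BumpCounter(FakeContext* ctx) {
  ctx->some_counter += 1;  // a single read-modify-write on context memory
}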
chss95cs@gmail.com 2022-07-23 12:10:07 -07:00
parent 1fcac00924
commit 33a6cfc0a7
23 changed files with 1299 additions and 786 deletions

View File

@ -688,7 +688,12 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
#endif
}
void X64Backend::InitializeBackendContext(void* ctx) {
X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
bctx->Ox1000 = 0x1000;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -31,6 +31,16 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
void* ResolveFunction_Ptr; // cached pointer to resolvefunction
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
class X64Backend : public Backend {
public:
static const uint32_t kForceReturnAddress = 0x9FFF0000u;
@ -65,6 +75,7 @@ class X64Backend : public Backend {
void InstallBreakpoint(Breakpoint* breakpoint) override;
void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
void UninstallBreakpoint(Breakpoint* breakpoint) override;
virtual void InitializeBackendContext(void* ctx) override;
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);

View File

@ -105,6 +105,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
@ -121,6 +122,10 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
bool is_zennish = cpu_.displayFamily >= 0x17;
if (is_zennish) {
// ik that i heard somewhere that this is the case for zen, but i need to
// verify. cant find my original source for that.
// todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
feature_flags_ |= kX64FastJrcx;
if (cpu_.displayFamily > 0x17) {
@ -132,6 +137,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
// for my cpu, which is ripper90
}
}
may_use_membase32_as_zero_reg_ =
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
processor()->memory()->virtual_membase())) == 0;
}
X64Emitter::~X64Emitter() = default;
@ -210,6 +218,11 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
top_ = old_address;
reset();
call_sites_.clear();
tail_code_.clear();
for (auto&& cached_label : label_cache_) {
delete cached_label;
}
label_cache_.clear();
return new_execute_address;
}
@ -261,13 +274,14 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
xor_(eax, eax);
/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0
// Safe now to do some tracing.
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
@ -343,6 +357,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
add(rsp, (uint32_t)stack_size);
ret();
// todo: do some kind of sorting by alignment?
for (auto&& tail_item : tail_code_) {
if (tail_item.alignment) {
align(tail_item.alignment);
}
tail_item.func(*this, tail_item.label);
}
code_offsets.tail = getSize();
@ -605,12 +626,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(builtin_function->handler()));
mov(rdx, reinterpret_cast<uint64_t>(builtin_function->arg0()));
mov(r8, reinterpret_cast<uint64_t>(builtin_function->arg1()));
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
} else if (function->behavior() == Function::Behavior::kExtern) {
@ -621,12 +640,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(extern_function->extern_handler()));
mov(rdx,
qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]);
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
}
@ -656,10 +673,8 @@ void X64Emitter::CallNativeSafe(void* fn) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(fn));
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
@ -715,24 +730,50 @@ bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
}
return false;
}
/*
WARNING: do not use any regs here, addr is often produced by
ComputeAddressOffset, which may use rax/rdx/rcx in its addr expression
*/
void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
uint32_t lowpart = static_cast<uint32_t>(v);
uint32_t highpart = static_cast<uint32_t>(v >> 32);
// check whether the constant coincidentally collides with our membase
if (v == (uintptr_t)processor()->memory()->virtual_membase()) {
mov(qword[addr], GetMembaseReg());
} else if ((v & ~0x7FFFFFFF) == 0) {
// Fits under 31 bits, so just load using normal mov.
mov(qword[addr], v);
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// Negative number that fits in 32bits.
mov(qword[addr], v);
} else if (!(v >> 32)) {
} else if (!highpart) {
// All high bits are zero. It'd be nice if we had a way to load a 32bit
// immediate without sign extending!
// TODO(benvanik): this is super common, find a better way.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], 0);
if (lowpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr], GetMembaseReg().cvt32());
} else {
mov(dword[addr], static_cast<uint32_t>(v));
}
if (CanUseMembaseLow32As0()) {
mov(dword[addr + 4], GetMembaseReg().cvt32());
} else {
mov(dword[addr + 4], 0);
}
} else {
// 64bit number that needs double movs.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
if (lowpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr], GetMembaseReg().cvt32());
} else {
mov(dword[addr], lowpart);
}
if (highpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr + 4], GetMembaseReg().cvt32());
} else {
mov(dword[addr + 4], highpart);
}
}
}
static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
@ -893,7 +934,13 @@ static const vec128_t xmm_consts[] = {
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};
vec128f(65504),
/*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@ -1300,6 +1347,27 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
return SimdDomain::DONTCARE;
}
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
/*
index context ptr negatively to get to backend ctx field
*/
ptrdiff_t delta = (-static_cast<ptrdiff_t>(sizeof(X64BackendContext))) +
offset_in_x64backendctx;
return ptr[GetContextReg() + static_cast<int>(delta)];
}
Xbyak::Label& X64Emitter::AddToTail(TailEmitCallback callback,
uint32_t alignment) {
TailEmitter emitter{};
emitter.func = std::move(callback);
emitter.alignment = alignment;
tail_code_.push_back(std::move(emitter));
return tail_code_.back().label;
}
Xbyak::Label& X64Emitter::NewCachedLabel() {
Xbyak::Label* tmp = new Xbyak::Label;
label_cache_.push_back(tmp);
return *tmp;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -155,7 +155,15 @@ enum XmmConst {
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
XMMXenosF16ExtRangeStart,
XMMVSRShlByteshuf,
XMMVSRMask
};
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
INSTR_X64_FLAGS_ELIMINATED =
1, // another sequence marked this instruction as not needing codegen,
// meaning they likely already handled it
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@ -185,7 +193,13 @@ enum X64EmitterFeatureFlags {
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop =
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
kX64EmitAVX512VBMI = 1 << 14,
kX64FlagsIndependentVars =
1 << 15, // if true, instructions that only modify some flags (like
// inc/dec) do not introduce false dependencies on EFLAGS
// because the individual flags are treated as different vars by
// the processor. (this applies to zen)
kX64EmitPrefetchW = 1 << 16
};
class ResolvableGuestCall {
public:
@ -194,6 +208,13 @@ class ResolvableGuestCall {
// rgcid
unsigned offset_;
};
class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter {
Xbyak::Label label;
uint32_t alignment;
TailEmitCallback func;
};
class X64Emitter : public Xbyak::CodeGenerator {
public:
@ -264,7 +285,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
void ReloadMembase();
void nop(size_t length = 1);
@ -274,6 +295,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
Xbyak::Address GetXmmConstPtr(XmmConst id);
Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);
void LoadConstantXmm(Xbyak::Xmm dest, float v);
void LoadConstantXmm(Xbyak::Xmm dest, double v);
void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
@ -289,6 +312,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
return (feature_flags_ & feature_flag) == feature_flag;
}
Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
Xbyak::Label& NewCachedLabel();
FunctionDebugInfo* debug_info() const { return debug_info_; }
size_t stack_size() const { return stack_size_; }
@ -324,6 +349,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
static const uint32_t xmm_reg_map_[XMM_COUNT];
uint32_t current_rgc_id_ = 0xEEDDF00F;
std::vector<ResolvableGuestCall> call_sites_;
/*
set to true if the low 32 bits of membase == 0.
only really advantageous if you are storing 32 bit 0 to a displaced address,
which would have to represent 0 as 4 bytes
*/
bool may_use_membase32_as_zero_reg_;
std::vector<TailEmitter> tail_code_;
std::vector<Xbyak::Label*>
label_cache_; // for creating labels that need to be referenced much
// later by tail emitters
};
} // namespace x64

View File

@ -109,7 +109,6 @@ struct DEBUG_BREAK_TRUE_I32
: Sequence<DEBUG_BREAK_TRUE_I32,
I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;
@ -187,77 +186,48 @@ EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP);
struct TRAP_TRUE_I8
: Sequence<TRAP_TRUE_I8, I<OPCODE_TRAP_TRUE, VoidOp, I8Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label& after = e.NewCachedLabel();
unsigned flags = i.instr->flags;
Xbyak::Label& dotrap =
e.AddToTail([flags, &after](X64Emitter& e, Xbyak::Label& me) {
e.L(me);
e.Trap(flags);
// does Trap actually return control to the guest?
e.jmp(after, X64Emitter::T_NEAR);
});
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
e.jnz(dotrap, X64Emitter::T_NEAR);
e.L(after);
}
};
struct TRAP_TRUE_I16
: Sequence<TRAP_TRUE_I16, I<OPCODE_TRAP_TRUE, VoidOp, I16Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_I16);
}
};
struct TRAP_TRUE_I32
: Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;
e.jrcxz(skip);
e.Trap(i.instr->flags);
e.L(skip);
} else {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
}
assert_impossible_sequence(TRAP_TRUE_I32);
}
};
struct TRAP_TRUE_I64
: Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.rcx, i.src1);
Xbyak::Label skip;
e.jrcxz(skip);
e.Trap(i.instr->flags);
e.L(skip);
} else {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
}
assert_impossible_sequence(TRAP_TRUE_I64);
}
};
struct TRAP_TRUE_F32
: Sequence<TRAP_TRUE_F32, I<OPCODE_TRAP_TRUE, VoidOp, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_F32);
}
};
struct TRAP_TRUE_F64
: Sequence<TRAP_TRUE_F64, I<OPCODE_TRAP_TRUE, VoidOp, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_F64);
}
};
EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16,
@ -333,6 +303,7 @@ struct CALL_TRUE_F32
e.L(skip);
}
};
struct CALL_TRUE_F64
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
@ -388,7 +359,6 @@ struct CALL_INDIRECT_TRUE_I32
: Sequence<CALL_INDIRECT_TRUE_I32,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;

View File

@ -14,6 +14,7 @@
#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_backend.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
@ -28,8 +29,127 @@ namespace cpu {
namespace backend {
namespace x64 {
struct LoadModStore {
const hir::Instr* load;
hir::Instr* modify;
hir::Instr* store;
bool is_constant[3];
void Consume();
};
void LoadModStore::Consume() {
modify->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
store->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
}
static bool GetLoadModStore(const hir::Instr* loadinsn, LoadModStore* out) {
if (IsTracingData()) {
return false;
}
// if (!loadinsn->dest->HasSingleUse()) {
// allow the value to be used multiple times, as long as it is by the same
// instruction
if (!loadinsn->dest->AllUsesByOneInsn()) {
return false;
}
hir::Instr* use = loadinsn->dest->use_head->instr;
if (!use->dest || !use->dest->HasSingleUse() ||
use->GetNonFakePrev() != loadinsn) {
return false;
}
hir::Instr* shouldbstore = use->dest->use_head->instr;
if (shouldbstore->dest || shouldbstore->GetNonFakePrev() != use) {
return false; // store insns have no destination
}
use->VisitValueOperands([out](Value* v, uint32_t idx) {
out->is_constant[idx] = v->IsConstant();
});
out->load = loadinsn;
out->modify = use;
out->store = shouldbstore;
return true;
}
struct LoadModStoreContext : public LoadModStore {
uint64_t offset; // ctx offset
TypeName type;
Opcode op;
bool is_commutative;
bool is_unary;
bool is_binary;
bool
binary_uses_twice; // true if binary_other == our value. (for instance,
// add r11, r10, r10, which can be gen'ed for r10 * 2)
hir::Value* binary_other;
hir::Value::ConstantValue* other_const;
uint32_t other_index;
};
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
LoadModStoreContext* out) {
if (!GetLoadModStore(loadinsn, out)) {
return false;
}
if (out->load->opcode->num != OPCODE_LOAD_CONTEXT ||
out->store->opcode->num != OPCODE_STORE_CONTEXT) {
return false;
}
if (out->modify->opcode->flags &
(OPCODE_FLAG_VOLATILE | OPCODE_FLAG_MEMORY)) {
return false;
}
uint64_t offs = out->load->src1.offset;
if (offs != out->store->src1.offset) {
return false;
}
TypeName typ = out->load->dest->type;
// can happen if op is a conversion
if (typ != out->store->src2.value->type) {
return false;
}
/*
set up a whole bunch of convenience fields for the caller
*/
out->offset = offs;
out->type = typ;
const OpcodeInfo& opinf = *out->modify->opcode;
out->op = opinf.num;
out->is_commutative = opinf.flags & OPCODE_FLAG_COMMUNATIVE;
out->is_unary = IsOpcodeUnaryValue(opinf.signature);
out->is_binary = IsOpcodeBinaryValue(opinf.signature);
out->binary_uses_twice = false;
out->binary_other = nullptr;
out->other_const = nullptr;
out->other_index = ~0U;
if (out->is_binary) {
if (out->modify->src1.value == out->load->dest) {
out->binary_other = out->modify->src2.value;
out->other_index = 1;
} else {
out->binary_other = out->modify->src1.value;
out->other_index = 0;
}
if (out->binary_other && out->is_constant[out->other_index]) {
out->other_const = &out->binary_other->constant;
}
if (out->binary_other == out->load->dest) {
out->binary_uses_twice = true;
}
}
return true;
}
volatile int anchor_memory = 0;
static void Do0x1000Add(X64Emitter& e, Reg32 reg) {
e.add(reg, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000)));
// e.add(reg, 0x1000);
}
// Note: all types are always aligned in the context.
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
return e.GetContextReg() + offset.value;
@ -58,51 +178,6 @@ static bool is_definitely_not_eo(const T& v) {
return is_eo_def(v.value);
}
template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
const T& offset) {
assert_true(offset.is_constant);
int32_t offset_const = static_cast<int32_t>(offset.constant());
if (guest.is_constant) {
uint32_t address = static_cast<uint32_t>(guest.constant());
address += offset_const;
if (address < 0x80000000) {
return e.GetMembaseReg() + address;
} else {
if (address >= 0xE0000000 &&
xe::memory::allocation_granularity() > 0x1000) {
e.mov(e.eax, address + 0x1000);
} else {
e.mov(e.eax, address);
}
return e.GetMembaseReg() + e.rax;
}
} else {
if (xe::memory::allocation_granularity() > 0x1000 &&
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
e.xor_(e.eax, e.eax);
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, e.edx);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
e.mov(e.eax, guest.reg().cvt32());
}
return e.GetMembaseReg() + e.rax + offset_const;
}
}
// Note: most *should* be aligned, but needs to be checked!
template <typename T>
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
@ -127,11 +202,23 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.xor_(e.eax, e.eax);
Xbyak::Label& jmpback = e.NewCachedLabel();
e.mov(e.eax, guest.reg().cvt32());
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
Xbyak::Label& fixup_label =
e.AddToTail([&jmpback](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.eax);
e.jmp(jmpback, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(jmpback);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -140,6 +227,64 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
return e.GetMembaseReg() + e.rax;
}
}
template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
const T& offset) {
assert_true(offset.is_constant);
int32_t offset_const = static_cast<int32_t>(offset.constant());
if (offset_const == 0) {
return ComputeMemoryAddress(e, guest);
}
if (guest.is_constant) {
uint32_t address = static_cast<uint32_t>(guest.constant());
address += offset_const;
if (address < 0x80000000) {
return e.GetMembaseReg() + address;
} else {
if (address >= 0xE0000000 &&
xe::memory::allocation_granularity() > 0x1000) {
e.mov(e.eax, address + 0x1000);
} else {
e.mov(e.eax, address);
}
return e.GetMembaseReg() + e.rax;
}
} else {
if (xe::memory::allocation_granularity() > 0x1000 &&
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
Xbyak::Label& tmplbl = e.NewCachedLabel();
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
Xbyak::Label& fixup_label =
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.edx);
e.jmp(tmplbl, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(tmplbl);
return e.GetMembaseReg() + e.rdx;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
e.mov(e.eax, guest.reg().cvt32());
}
return e.GetMembaseReg() + e.rax + offset_const;
}
}
// ============================================================================
// OPCODE_ATOMIC_EXCHANGE
@ -214,11 +359,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.mov(e.ecx, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
e.add(e.ecx, i.src1.reg().cvt32());
Xbyak::Label& backtous = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.ecx);
e.jmp(backtous, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(backtous);
} else {
e.mov(e.ecx, i.src1.reg().cvt32());
}
@ -235,11 +389,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.mov(e.ecx, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
e.add(e.ecx, i.src1.reg().cvt32());
Xbyak::Label& backtous = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.ecx);
e.jmp(backtous, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(backtous);
} else {
e.mov(e.ecx, i.src1.reg().cvt32());
}
@ -319,25 +482,44 @@ struct STORE_LOCAL_I8
e.mov(e.byte[e.rsp + i.src1.constant()], i.src2);
}
};
template <typename T>
static bool LocalStoreMayUseMembaseLow(X64Emitter& e, const T& i) {
return i.src2.is_constant && i.src2.constant() == 0 &&
e.CanUseMembaseLow32As0();
}
struct STORE_LOCAL_I16
: Sequence<STORE_LOCAL_I16, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I16Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
if (LocalStoreMayUseMembaseLow(e, i)) {
e.mov(e.word[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_I32
: Sequence<STORE_LOCAL_I32, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
if (LocalStoreMayUseMembaseLow(e, i)) {
e.mov(e.dword[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_I64
: Sequence<STORE_LOCAL_I64, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
if (i.src2.is_constant && i.src2.constant() == 0) {
e.xor_(e.eax, e.eax);
e.mov(e.qword[e.rsp + i.src1.constant()], e.rax);
} else {
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_F32
@ -404,10 +586,133 @@ struct LOAD_CONTEXT_I32
}
}
};
template <typename EmitArgType>
static bool HandleLMS64Binary(X64Emitter& e, const EmitArgType& i,
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
uint64_t other_const_val = 0;
bool const_fits_in_insn = false;
if (lms.other_const) {
other_const_val = lms.other_const->u64;
const_fits_in_insn = e.ConstantFitsIn32Reg(other_const_val);
}
/*
this check is here because we currently cannot handle other variables
with this
*/
if (!lms.other_const && !lms.binary_uses_twice) {
return false;
}
if (lms.op == OPCODE_ADD) {
if (lms.other_const) {
if (const_fits_in_insn) {
if (other_const_val == 1 &&
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.qword[addr]);
} else {
e.add(e.qword[addr], (uint32_t)other_const_val);
}
} else {
e.mov(e.rax, other_const_val);
e.add(e.qword[addr], e.rax);
}
return true;
} else if (lms.binary_uses_twice) {
// we're being added to ourselves, we are a multiply by 2
e.shl(e.qword[addr], 1);
return true;
} else if (lms.binary_other) {
return false; // cannot handle other variables right now.
}
} else if (lms.op == OPCODE_SUB) {
if (lms.other_index != 1) {
return false; // if we are the second operand, we cant combine memory
// access and operation
}
if (lms.other_const) {
if (const_fits_in_insn) {
if (other_const_val == 1 &&
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.dec(e.qword[addr]);
} else {
e.sub(e.qword[addr], (uint32_t)other_const_val);
}
} else {
e.mov(e.rax, other_const_val);
e.sub(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_AND) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.and_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.and_(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_OR) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.or_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.or_(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_XOR) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.xor_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.xor_(e.qword[addr], e.rax);
}
return true;
}
}
return false;
}
template <typename EmitArgType>
static bool HandleLMS64Unary(X64Emitter& e, const EmitArgType& i,
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
Opcode op = lms.op;
if (op == OPCODE_NOT) {
e.not_(e.qword[addr]);
return true;
} else if (op == OPCODE_NEG) {
e.neg(e.qword[addr]);
return true;
}
return false;
}
struct LOAD_CONTEXT_I64
: Sequence<LOAD_CONTEXT_I64, I<OPCODE_LOAD_CONTEXT, I64Op, OffsetOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
LoadModStoreContext lms{};
if (GetLoadModStoreContext(i.instr, &lms)) {
if (lms.is_binary && HandleLMS64Binary(e, i, lms, addr)) {
lms.Consume();
return;
} else if (lms.is_unary && HandleLMS64Unary(e, i, lms, addr)) {
lms.Consume();
return;
}
}
e.mov(i.dest, e.qword[addr]);
if (IsTracingData()) {
e.mov(e.GetNativeParam(1), e.qword[addr]);
@ -483,7 +788,11 @@ struct STORE_CONTEXT_I16
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.mov(e.word[addr], i.src2.constant());
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[addr], i.src2.constant());
}
} else {
e.mov(e.word[addr], i.src2);
}
@ -500,7 +809,11 @@ struct STORE_CONTEXT_I32
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.mov(e.dword[addr], i.src2.constant());
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[addr], i.src2.constant());
}
} else {
e.mov(e.dword[addr], i.src2);
}
@ -569,9 +882,14 @@ struct STORE_CONTEXT_V128
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vmovaps(e.ptr[addr], e.xmm0);
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
e.vmovaps(e.ptr[addr], i.src2);
SimdDomain domain = e.DeduceSimdDomain(i.src2.value);
if (domain == SimdDomain::FLOATING) {
e.vmovaps(e.ptr[addr], i.src2);
} else {
e.vmovdqa(e.ptr[addr], i.src2);
}
}
if (IsTracingData()) {
e.lea(e.GetNativeParam(1), e.ptr[addr]);
@ -735,7 +1053,11 @@ struct STORE_OFFSET_I16
}
} else {
if (i.src3.is_constant) {
e.mov(e.word[addr], i.src3.constant());
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[addr], i.src3.constant());
}
} else {
e.mov(e.word[addr], i.src3);
}
@ -757,7 +1079,11 @@ struct STORE_OFFSET_I32
}
} else {
if (i.src3.is_constant) {
e.mov(e.dword[addr], i.src3.constant());
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[addr], i.src3.constant());
}
} else {
e.mov(e.dword[addr], i.src3);
}
@ -895,7 +1221,7 @@ struct LOAD_V128 : Sequence<LOAD_V128, I<OPCODE_LOAD, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddress(e, i.src1);
// TODO(benvanik): we should try to stick to movaps if possible.
e.vmovups(i.dest, e.ptr[addr]);
e.vmovdqa(i.dest, e.ptr[addr]);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
// TODO(benvanik): find a way to do this without the memory load.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask));
@ -1054,13 +1380,15 @@ struct STORE_V128
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src2.is_constant);
e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask));
e.vmovaps(e.ptr[addr], e.xmm0);
// changed from vmovaps, the penalty on the vpshufb is unavoidable but
// we dont need to incur another here too
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vmovaps(e.ptr[addr], e.xmm0);
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
e.vmovaps(e.ptr[addr], i.src2);
e.vmovdqa(e.ptr[addr], i.src2);
}
}
if (IsTracingData()) {
@ -1081,10 +1409,12 @@ struct CACHE_CONTROL
: Sequence<CACHE_CONTROL,
I<OPCODE_CACHE_CONTROL, VoidOp, I64Op, OffsetOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
bool is_clflush = false, is_prefetch = false;
bool is_clflush = false, is_prefetch = false, is_prefetchw = false;
switch (CacheControlType(i.instr->flags)) {
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE:
is_prefetchw = true;
break;
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
is_prefetch = true;
break;
case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE:
@ -1095,6 +1425,11 @@ struct CACHE_CONTROL
assert_unhandled_case(CacheControlType(i.instr->flags));
return;
}
if (is_prefetchw && !e.IsFeatureEnabled(kX64EmitPrefetchW)) {
is_prefetchw = false;
is_prefetch = true; // cant prefetchw, cpu doesnt have it (unlikely to
// happen). just prefetcht0
}
size_t cache_line_size = i.src2.value;
RegExp addr;
@ -1117,13 +1452,24 @@ struct CACHE_CONTROL
}
} else {
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't
// do it via memory mapping.
e.mov(e.eax, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.movzx(e.eax, e.al);
e.shl(e.eax, 12);
e.add(e.eax, i.src1.reg().cvt32());
Xbyak::Label& tmplbl = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.eax);
e.jmp(tmplbl, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(tmplbl);
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -1131,12 +1477,17 @@ struct CACHE_CONTROL
}
addr = e.GetMembaseReg() + e.rax;
}
// todo: use clflushopt + sfence on cpus that support it
if (is_clflush) {
e.clflush(e.ptr[addr]);
}
if (is_prefetch) {
e.prefetcht0(e.ptr[addr]);
}
if (is_prefetchw) {
e.prefetchw(e.ptr[addr]);
}
if (cache_line_size >= 128) {
// Prefetch the other 64 bytes of the 128-byte cache line.
@ -1151,6 +1502,9 @@ struct CACHE_CONTROL
if (is_prefetch) {
e.prefetcht0(e.ptr[addr]);
}
if (is_prefetchw) {
e.prefetchw(e.ptr[addr]);
}
assert_true(cache_line_size == 128);
}
}
@ -1178,20 +1532,24 @@ struct MEMSET_I64_I8_I64
assert_true(i.src2.constant() == 0);
e.vpxor(e.xmm0, e.xmm0);
auto addr = ComputeMemoryAddress(e, i.src1);
/*
chrispy: changed to vmovdqa, the mismatch between vpxor and vmovaps
was causing a 1 cycle stall before the first store
*/
switch (i.src3.constant()) {
case 32:
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
e.vmovdqa(e.ptr[addr], e.ymm0);
break;
case 128:
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0);
// probably should lea the address beforehand
e.vmovdqa(e.ptr[addr + 0 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 2 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 4 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 6 * 16], e.ymm0);
break;
default:
assert_unhandled_case(i.src3.constant());

File diff suppressed because one or more lines are too long

View File

@ -13,6 +13,8 @@
#include "xenia/cpu/hir/instr.h"
#include <unordered_map>
#define assert_impossible_sequence(name) \
assert_always("impossible sequence hit" #name);
namespace xe {
namespace cpu {

View File

@ -749,7 +749,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_PERMUTE: {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant() &&
@ -760,17 +760,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
else if (i->src2.value->IsConstantZero() &&
i->src3.value->IsConstantZero() &&
i->flags == INT8_TYPE /*probably safe for int16 too*/) {
/*
chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
chrispy: hoisted this check here from x64_seq_vector where if
src1 is not constant, but src2 and src3 are zero, then we know
the result will always be zero
*/
v->set_zero(VEC128_TYPE);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:
@ -930,6 +933,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_TO_SINGLE:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->ToSingle();
i->Remove();
result = true;
}
break;
default:
// Ignored.
break;

View File

@ -10,6 +10,7 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/logging.h"
#include "xenia/base/profiling.h"
namespace xe {
namespace cpu {
@ -82,7 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= SimplifyBitArith(builder);
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= BackpropTruncations(builder);
result |= iter_result;
} while (iter_result);
return true;
@ -1207,71 +1208,6 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
return result;
}
struct TruncateSimplifier {
TypeName type_from, type_to;
uint32_t sizeof_from, sizeof_to;
uint32_t bit_sizeof_from, bit_sizeof_to;
uint64_t typemask_from, typemask_to;
hir::HIRBuilder* builder;
hir::Instr* truncate_instr;
hir::Value* truncated_value;
hir::Instr* truncated_value_def;
};
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
hir::HIRBuilder* builder) {
if (i->opcode != &OPCODE_TRUNCATE_info) {
return false;
}
TypeName type_from = i->src1.value->type;
TypeName type_to = i->dest->type;
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
Instr* input_def = i->src1.value->GetDefSkipAssigns();
if (!input_def) {
return false;
}
Opcode input_opc = input_def->opcode->num;
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
uint32_t src2_shift = input_def->src2.value->AsUint32();
if (src2_shift < (sizeof_to * CHAR_BIT)) {
Value* truncated_preshift =
builder->Truncate(input_def->src1.value, type_to);
truncated_preshift->def->MoveBefore(i);
i->Replace(&OPCODE_SHL_info, 0);
i->set_src1(truncated_preshift);
i->set_src2(input_def->src2.value);
return true;
}
}
if (input_opc == OPCODE_LOAD_CONTEXT) {
if (sizeof_from == 8 && sizeof_to == 4) {
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
loadof->def->MoveBefore(input_def);
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(loadof);
return true;
}
}
return false;
}
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
result |= BackpropTruncations(i, builder);
i = i->next;
}
block = block->next;
}
return result;
}
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
auto def = value->def;
if (def && def->opcode == &OPCODE_ASSIGN_info) {

View File

@ -32,8 +32,6 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
bool BackpropTruncations(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -692,6 +692,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
instr->block = block;
instr->opcode = &opcode_info;
instr->flags = flags;
instr->backend_flags = 0;
instr->dest = dest;
instr->src1.value = instr->src2.value = instr->src3.value = NULL;
instr->src1_use = instr->src2_use = instr->src3_use = NULL;
@ -1492,7 +1493,6 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
part_type);
}
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
return value1;
ASSERT_VECTOR_TYPE(value1);
Instr* i =
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
@ -1501,6 +1501,14 @@ Value* HIRBuilder::VectorDenormFlush(Value* value1) {
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::ToSingle(Value* value) {
assert_true(value->type == FLOAT64_TYPE);
Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
i->set_src1(value);
i->src2.value = nullptr;
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::Add(Value* value1, Value* value2,
uint32_t arithmetic_flags) {
ASSERT_TYPES_EQUAL(value1, value2);
@ -1720,7 +1728,6 @@ Value* HIRBuilder::Log2(Value* value) {
return i->dest;
}
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value1);
ASSERT_VECTOR_TYPE(value2);

View File

@ -200,7 +200,7 @@ class HIRBuilder {
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorDenormFlush(Value* value1);
Value* ToSingle(Value* value);
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
uint32_t arithmetic_flags = 0);

View File

@ -180,6 +180,26 @@ exit_loop:
*tunnel_flags = traversed_types;
return current_def;
}
bool Instr::IsFake() const {
Opcode num = opcode->num;
switch (num) {
case OPCODE_NOP:
case OPCODE_COMMENT:
case OPCODE_CONTEXT_BARRIER:
case OPCODE_SOURCE_OFFSET:
return true;
}
return false;
}
const Instr* Instr::GetNonFakePrev() const {
const Instr* curr = prev;
while (curr && curr->IsFake()) {
curr = curr->prev;
}
return curr;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -42,6 +42,7 @@ class Instr {
const OpcodeInfo* opcode;
uint16_t flags;
uint16_t backend_flags; // backends may do whatever they wish with this
uint32_t ordinal;
typedef union {
@ -158,6 +159,11 @@ if both are constant, return nullptr, nullptr
call_for_values(src3.value, 2);
}
}
bool IsFake() const;
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
// OPCODE_SOURCE_OFFSET
const hir::Instr* GetNonFakePrev() const;
};
} // namespace hir

View File

@ -281,7 +281,10 @@ enum Opcode {
OPCODE_ATOMIC_COMPARE_EXCHANGE,
OPCODE_SET_ROUNDING_MODE,
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
__OPCODE_MAX_VALUE, // Keep at end.
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
__OPCODE_MAX_VALUE, // Keep at end.
};
enum OpcodeFlags {
@ -352,7 +355,9 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
return (signature & ~(0x7)) ==
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}
static bool IsOpcodeUnaryValue(uint32_t signature) {
return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3));
}
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
OpcodeSignatureType& src1,
OpcodeSignatureType& src2,

View File

@ -679,4 +679,11 @@ DEFINE_OPCODE(
"vector_denormflush",
OPCODE_SIG_V_V,
0
)
DEFINE_OPCODE(
OPCODE_TO_SINGLE,
"to_single",
OPCODE_SIG_V_V,
0
)

View File

@ -1643,6 +1643,11 @@ void Value::DenormalFlush() {
constant.v128.u32[i] = current_element;
}
}
void Value::ToSingle() {
assert_true(type == FLOAT64_TYPE);
constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
}
void Value::CountLeadingZeros(const Value* other) {
switch (other->type) {
case INT8_TYPE:
@ -1805,6 +1810,25 @@ hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
return nullptr;
}
}
// does the value only have one instr that uses it?
bool Value::HasSingleUse() const {
return use_head && use_head->next == nullptr;
}
bool Value::AllUsesByOneInsn() const {
if (!use_head) {
return false;
}
const Use* first_use = use_head;
const Instr* should_match = first_use->instr;
for (const Use* current_use = first_use->next; current_use;
current_use = current_use->next) {
if (current_use->instr != should_match) {
return false;
}
}
return true;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -226,6 +226,15 @@ class Value {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
inline bool IsEqual(const Value* other) const {
if (this == other) {
return true;
} else if ((this->flags & other->flags) & VALUE_IS_CONSTANT) {
return this->IsConstantEQ(other);
}
return false;
}
bool IsConstantTrue() const {
if (type == VEC128_TYPE) {
assert_always();
@ -327,7 +336,7 @@ class Value {
return false;
}
}
bool IsConstantEQ(Value* other) const {
bool IsConstantEQ(const Value* other) const {
if (type == VEC128_TYPE) {
assert_always();
}
@ -594,13 +603,19 @@ class Value {
bool saturate);
void ByteSwap();
void DenormalFlush();
void ToSingle();
void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();
// tunnel_flags is updated to the kinds we actually traversed
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
// does the value only have one instr that uses it?
bool HasSingleUse() const;
// returns true if every single use is as an operand to a single instruction
// (add var2, var1, var1)
bool AllUsesByOneInsn() const;
private:
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
static bool CompareInt16(Opcode opcode, Value* a, Value* b);

View File

@ -379,7 +379,7 @@ typedef struct alignas(64) PPCContext_s {
uint64_t lr; // 0x10 Link register
double f[32]; // 0x120 Floating-point registers
vec128_t v[128]; // 0x220 VMX128 vector registers
vec128_t vscr_vec;
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca;
@ -422,7 +422,7 @@ typedef struct alignas(64) PPCContext_s {
// Value of last reserved load
uint64_t reserved_val;
ThreadState* thread_state;
uint8_t* virtual_membase;
uint8_t* virtual_membase;
static std::string GetRegisterName(PPCRegister reg);
std::string GetStringFromValue(PPCRegister reg) const;
void SetValueFromString(PPCRegister reg, std::string value);
@ -432,6 +432,7 @@ typedef struct alignas(64) PPCContext_s {
std::string& result) const;
} PPCContext;
#pragma pack(pop)
constexpr size_t ppcctx_size = sizeof(PPCContext);
static_assert(sizeof(PPCContext) % 64 == 0, "64b padded");
} // namespace ppc

View File

@ -355,13 +355,18 @@ int InstrEmit_stvrxl128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
// is this the right format?
f.StoreVR(i.VX128_1.RB,
f.LoadContext(offsetof(PPCContext, vscr_vec), VEC128_TYPE));
return 0;
}
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
// is this the right format?
Value* v = f.LoadVR(i.VX128_1.RB);
f.StoreContext(offsetof(PPCContext, vscr_vec), v);
return 0;
}
int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
@ -1105,7 +1110,7 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZ.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
//chrispy: denormal outputs for Dot product are unconditionally made 0
// chrispy: denormal outputs for Dot product are unconditionally made 0
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;

View File

@ -336,6 +336,7 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
INT64_TYPE);
@ -353,6 +354,7 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
Value* v = f.ZeroExtend(
f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),

View File

@ -46,7 +46,7 @@ int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) + (frB)
Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -63,7 +63,7 @@ int InstrEmit_fdivx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fdivsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- frA / frB
Value* v = f.Div(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -80,7 +80,7 @@ int InstrEmit_fmulx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) x (frC)
Value* v = f.Mul(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -88,9 +88,9 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- 1.0 / (frB)
Value* v = f.Convert(f.Div(f.LoadConstantFloat32(1.0f),
f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)),
FLOAT64_TYPE);
Value* v = f.Recip(f.LoadFPR(i.A.FRB));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -116,7 +116,7 @@ int InstrEmit_fsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) - (frB)
Value* v = f.Sub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -132,64 +132,63 @@ int InstrEmit_fselx(PPCHIRBuilder& f, const InstrData& i) {
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
// Double precision:
static int InstrEmit_fsqrt(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- sqrt(frB)
Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_fsqrt(f, i, false);
}
int InstrEmit_fsqrtsx(PPCHIRBuilder& f, const InstrData& i) {
// Single precision:
// frD <- sqrt(frB)
Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fsqrt(f, i, true);
}
// Floating-point multiply-add (A-9)
int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
static int InstrEmit_fmadd(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- (frA x frC) + frB
Value* v =
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_fmadd(f, i, false);
}
int InstrEmit_fmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) + frB
return InstrEmit_fmadd(f, i, true);
}
static int InstrEmit_fmsub(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- (frA x frC) - frB
Value* v =
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) - frB
Value* v =
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fmsub(f, i, false);
}
int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) - frB
Value* v =
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fmsub(f, i, true);
}
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
@ -205,7 +204,7 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -224,7 +223,7 @@ int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;