Merge branch 'canary_experimental' of https://github.com/xenia-canary/xenia-canary into canary_experimental

commit 6730ffb7d3
@@ -688,7 +688,12 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
  vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
#endif
}

void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
      reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->Ox1000 = 0x1000;
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu

@@ -31,6 +31,16 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                        // add of it by... 2 bytes lol
};

class X64Backend : public Backend {
 public:
  static const uint32_t kForceReturnAddress = 0x9FFF0000u;
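The comment above describes the layout this struct relies on: the backend context is allocated immediately before the guest context, so JIT'd code can reach it at a negative displacement from the context register (see InitializeBackendContext earlier in this diff and GetBackendCtxPtr further down). A standalone sketch of that addressing, with the hypothetical helper name BackendFromGuestCtx standing in for the backend's real plumbing:

  #include <cstdint>

  struct X64BackendContext {
    void* ResolveFunction_Ptr;
    unsigned int Ox1000;
  };

  // Assumed layout: [X64BackendContext][guest PPCContext ...]
  // The JIT keeps a register pointing at the *guest* context, so backend
  // fields are found at negative offsets from that pointer.
  inline X64BackendContext* BackendFromGuestCtx(void* guest_ctx) {
    return reinterpret_cast<X64BackendContext*>(
        reinterpret_cast<intptr_t>(guest_ctx) - sizeof(X64BackendContext));
  }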
@@ -65,6 +75,7 @@ class X64Backend : public Backend {
  void InstallBreakpoint(Breakpoint* breakpoint) override;
  void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
  void UninstallBreakpoint(Breakpoint* breakpoint) override;
  virtual void InitializeBackendContext(void* ctx) override;

 private:
  static bool ExceptionCallbackThunk(Exception* ex, void* data);

@@ -105,6 +105,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
  /*
    fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
@@ -121,6 +122,10 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  bool is_zennish = cpu_.displayFamily >= 0x17;

  if (is_zennish) {
    // ik that i heard somewhere that this is the case for zen, but i need to
    // verify. cant find my original source for that.
    // todo: ask agner?
    feature_flags_ |= kX64FlagsIndependentVars;
    feature_flags_ |= kX64FastJrcx;

    if (cpu_.displayFamily > 0x17) {
@@ -132,6 +137,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
      // for my cpu, which is ripper90
    }
  }
  may_use_membase32_as_zero_reg_ =
      static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          processor()->memory()->virtual_membase())) == 0;
}

X64Emitter::~X64Emitter() = default;
@@ -210,6 +218,11 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
    top_ = old_address;
    reset();
    call_sites_.clear();
    tail_code_.clear();
    for (auto&& cached_label : label_cache_) {
      delete cached_label;
    }
    label_cache_.clear();
    return new_execute_address;
  }

@@ -261,13 +274,14 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();

  xor_(eax, eax);
  /*
   * chrispy: removed this, it serves no purpose
  mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
  */
  mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
  mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

  mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax);  // 0

  // Safe now to do some tracing.
  if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
@@ -343,6 +357,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

  add(rsp, (uint32_t)stack_size);
  ret();
  // todo: do some kind of sorting by alignment?
  for (auto&& tail_item : tail_code_) {
    if (tail_item.alignment) {
      align(tail_item.alignment);
    }
    tail_item.func(*this, tail_item.label);
  }

  code_offsets.tail = getSize();

@@ -605,12 +626,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
    // rdx = arg0
    // r8 = arg1
    // r9 = arg2
    auto thunk = backend()->guest_to_host_thunk();
    mov(rax, reinterpret_cast<uint64_t>(thunk));
    mov(rcx, reinterpret_cast<uint64_t>(builtin_function->handler()));
    mov(rdx, reinterpret_cast<uint64_t>(builtin_function->arg0()));
    mov(r8, reinterpret_cast<uint64_t>(builtin_function->arg1()));
    call(rax);
    call(backend()->guest_to_host_thunk());
    // rax = host return
  }
} else if (function->behavior() == Function::Behavior::kExtern) {
@@ -621,12 +640,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
    // rdx = arg0
    // r8 = arg1
    // r9 = arg2
    auto thunk = backend()->guest_to_host_thunk();
    mov(rax, reinterpret_cast<uint64_t>(thunk));
    mov(rcx, reinterpret_cast<uint64_t>(extern_function->extern_handler()));
    mov(rdx,
        qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]);
    call(rax);
    call(backend()->guest_to_host_thunk());
    // rax = host return
  }
}
@@ -656,10 +673,8 @@ void X64Emitter::CallNativeSafe(void* fn) {
  // rdx = arg0
  // r8 = arg1
  // r9 = arg2
  auto thunk = backend()->guest_to_host_thunk();
  mov(rax, reinterpret_cast<uint64_t>(thunk));
  mov(rcx, reinterpret_cast<uint64_t>(fn));
  call(rax);
  call(backend()->guest_to_host_thunk());
  // rax = host return
}

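The GuestToHostThunk typedef in x64_backend.h takes (target, arg0, arg1), and the call sites above load rcx, rdx and r8 with exactly those values before calling the thunk, so on the host side the emitted sequence behaves like the plain call below (a simplified illustration, not the backend's actual dispatch code; CallThroughThunk is a made-up name):

  // What the emitted "mov rcx/rdx/r8, ... ; call thunk" amounts to:
  typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);

  static void* CallThroughThunk(GuestToHostThunk thunk, void* handler,
                                void* arg0, void* arg1) {
    // rcx = handler, rdx = arg0, r8 = arg1; rax carries the host return.
    return thunk(handler, arg0, arg1);
  }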
@@ -715,24 +730,50 @@ bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
  }
  return false;
}

/*
  WARNING: do not use any regs here, addr is often produced by
  ComputeAddressOffset, which may use rax/rdx/rcx in its addr expression
*/
void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
  if ((v & ~0x7FFFFFFF) == 0) {
  uint32_t lowpart = static_cast<uint32_t>(v);
  uint32_t highpart = static_cast<uint32_t>(v >> 32);
  // check whether the constant coincidentally collides with our membase
  if (v == (uintptr_t)processor()->memory()->virtual_membase()) {
    mov(qword[addr], GetMembaseReg());
  } else if ((v & ~0x7FFFFFFF) == 0) {
    // Fits under 31 bits, so just load using normal mov.

    mov(qword[addr], v);
  } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
    // Negative number that fits in 32bits.
    mov(qword[addr], v);
  } else if (!(v >> 32)) {
  } else if (!highpart) {
    // All high bits are zero. It'd be nice if we had a way to load a 32bit
    // immediate without sign extending!
    // TODO(benvanik): this is super common, find a better way.
    mov(dword[addr], static_cast<uint32_t>(v));
    mov(dword[addr + 4], 0);
    if (lowpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr], static_cast<uint32_t>(v));
    }
    if (CanUseMembaseLow32As0()) {
      mov(dword[addr + 4], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr + 4], 0);
    }
  } else {
    // 64bit number that needs double movs.
    mov(dword[addr], static_cast<uint32_t>(v));
    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));

    if (lowpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr], lowpart);
    }
    if (highpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr + 4], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr + 4], highpart);
    }
  }
}
static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
@@ -893,7 +934,13 @@ static const vec128_t xmm_consts[] = {
    /* XMMThreeFloatMask */
    vec128i(~0U, ~0U, ~0U, 0U),
    /*XMMXenosF16ExtRangeStart*/
    vec128f(65504)};
    vec128f(65504),
    /*XMMVSRShlByteshuf*/
    v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
    // XMMVSRMask
    vec128b(1)

};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
@@ -1300,6 +1347,27 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {

  return SimdDomain::DONTCARE;
}
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
  /*
    index context ptr negatively to get to backend ctx field
  */
  ptrdiff_t delta = (-static_cast<ptrdiff_t>(sizeof(X64BackendContext))) +
                    offset_in_x64backendctx;
  return ptr[GetContextReg() + static_cast<int>(delta)];
}
Xbyak::Label& X64Emitter::AddToTail(TailEmitCallback callback,
                                    uint32_t alignment) {
  TailEmitter emitter{};
  emitter.func = std::move(callback);
  emitter.alignment = alignment;
  tail_code_.push_back(std::move(emitter));
  return tail_code_.back().label;
}
Xbyak::Label& X64Emitter::NewCachedLabel() {
  Xbyak::Label* tmp = new Xbyak::Label;
  label_cache_.push_back(tmp);
  return *tmp;
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu

@@ -155,7 +155,15 @@ enum XmmConst {
  XMMLVSRTableBase,
  XMMSingleDenormalMask,
  XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
  XMMXenosF16ExtRangeStart
  XMMXenosF16ExtRangeStart,
  XMMVSRShlByteshuf,
  XMMVSRMask
};
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
  INSTR_X64_FLAGS_ELIMINATED =
      1,  // another sequence marked this instruction as not needing codegen,
          // meaning they likely already handled it
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -185,7 +193,13 @@ enum X64EmitterFeatureFlags {
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitPrefetchW = 1 << 16
};
class ResolvableGuestCall {
 public:
@@ -194,6 +208,13 @@ class ResolvableGuestCall {
  // rgcid
  unsigned offset_;
};
class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter {
  Xbyak::Label label;
  uint32_t alignment;
  TailEmitCallback func;
};

class X64Emitter : public Xbyak::CodeGenerator {
 public:
@@ -264,7 +285,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

  Xbyak::Reg64 GetContextReg();
  Xbyak::Reg64 GetMembaseReg();

  bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
  void ReloadMembase();

  void nop(size_t length = 1);
@@ -274,6 +295,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
  void MovMem64(const Xbyak::RegExp& addr, uint64_t v);

  Xbyak::Address GetXmmConstPtr(XmmConst id);
  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);

  void LoadConstantXmm(Xbyak::Xmm dest, float v);
  void LoadConstantXmm(Xbyak::Xmm dest, double v);
  void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
@@ -289,6 +312,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
    return (feature_flags_ & feature_flag) == feature_flag;
  }

  Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
  Xbyak::Label& NewCachedLabel();
  FunctionDebugInfo* debug_info() const { return debug_info_; }

  size_t stack_size() const { return stack_size_; }
@@ -324,6 +349,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
  static const uint32_t xmm_reg_map_[XMM_COUNT];
  uint32_t current_rgc_id_ = 0xEEDDF00F;
  std::vector<ResolvableGuestCall> call_sites_;
  /*
    set to true if the low 32 bits of membase == 0.
    only really advantageous if you are storing 32 bit 0 to a displaced address,
    which would have to represent 0 as 4 bytes
  */
  bool may_use_membase32_as_zero_reg_;
  std::vector<TailEmitter> tail_code_;
  std::vector<Xbyak::Label*>
      label_cache_;  // for creating labels that need to be referenced much
                     // later by tail emitters
};

}  // namespace x64

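A note on the may_use_membase32_as_zero_reg_ / CanUseMembaseLow32As0 trick declared above: when the low 32 bits of the membase pointer are known to be zero, storing that register's low half is a shorter encoding than storing a 4-byte zero immediate. A rough sketch of the size difference (register names and displacement are illustrative only, not the emitter's actual assignments):

  // mov dword [rcx+0x20], 0     -> C7 41 20 00 00 00 00  (7 bytes, imm32)
  // mov dword [rcx+0x20], ebp   -> 89 69 20              (3 bytes, reg source)
  //
  // so, when membase's low 32 bits are known to be 0, the sequences below
  // prefer:
  //   e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
  // over:
  //   e.mov(e.dword[addr], 0);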
@ -109,7 +109,6 @@ struct DEBUG_BREAK_TRUE_I32
|
|||
: Sequence<DEBUG_BREAK_TRUE_I32,
|
||||
I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
|
@ -187,77 +186,48 @@ EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP);
|
|||
struct TRAP_TRUE_I8
|
||||
: Sequence<TRAP_TRUE_I8, I<OPCODE_TRAP_TRUE, VoidOp, I8Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
Xbyak::Label& after = e.NewCachedLabel();
|
||||
unsigned flags = i.instr->flags;
|
||||
Xbyak::Label& dotrap =
|
||||
e.AddToTail([flags, &after](X64Emitter& e, Xbyak::Label& me) {
|
||||
e.L(me);
|
||||
e.Trap(flags);
|
||||
// does Trap actually return control to the guest?
|
||||
e.jmp(after, X64Emitter::T_NEAR);
|
||||
});
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
e.jnz(dotrap, X64Emitter::T_NEAR);
|
||||
e.L(after);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I16
|
||||
: Sequence<TRAP_TRUE_I16, I<OPCODE_TRAP_TRUE, VoidOp, I16Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_I16);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I32
|
||||
: Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jrcxz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
}
|
||||
assert_impossible_sequence(TRAP_TRUE_I32);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I64
|
||||
: Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.rcx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jrcxz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
}
|
||||
assert_impossible_sequence(TRAP_TRUE_I64);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_F32
|
||||
: Sequence<TRAP_TRUE_F32, I<OPCODE_TRAP_TRUE, VoidOp, F32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.vptest(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_F32);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_F64
|
||||
: Sequence<TRAP_TRUE_F64, I<OPCODE_TRAP_TRUE, VoidOp, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.vptest(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_F64);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16,
|
||||
|
@ -333,6 +303,7 @@ struct CALL_TRUE_F32
|
|||
e.L(skip);
|
||||
}
|
||||
};
|
||||
|
||||
struct CALL_TRUE_F64
|
||||
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -388,7 +359,6 @@ struct CALL_INDIRECT_TRUE_I32
|
|||
: Sequence<CALL_INDIRECT_TRUE_I32,
|
||||
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
|
|
|
@@ -14,6 +14,7 @@

#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_backend.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
@@ -28,8 +29,127 @@ namespace cpu {
namespace backend {
namespace x64 {

struct LoadModStore {
  const hir::Instr* load;
  hir::Instr* modify;
  hir::Instr* store;

  bool is_constant[3];
  void Consume();
};
void LoadModStore::Consume() {
  modify->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
  store->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
}
static bool GetLoadModStore(const hir::Instr* loadinsn, LoadModStore* out) {
  if (IsTracingData()) {
    return false;
  }
  // if (!loadinsn->dest->HasSingleUse()) {
  // allow the value to be used multiple times, as long as it is by the same
  // instruction
  if (!loadinsn->dest->AllUsesByOneInsn()) {
    return false;
  }
  hir::Instr* use = loadinsn->dest->use_head->instr;

  if (!use->dest || !use->dest->HasSingleUse() ||
      use->GetNonFakePrev() != loadinsn) {
    return false;
  }

  hir::Instr* shouldbstore = use->dest->use_head->instr;

  if (shouldbstore->dest || shouldbstore->GetNonFakePrev() != use) {
    return false;  // store insns have no destination
  }
  use->VisitValueOperands([out](Value* v, uint32_t idx) {
    out->is_constant[idx] = v->IsConstant();
  });
  out->load = loadinsn;
  out->modify = use;
  out->store = shouldbstore;
  return true;
}
struct LoadModStoreContext : public LoadModStore {
  uint64_t offset;  // ctx offset
  TypeName type;
  Opcode op;
  bool is_commutative;
  bool is_unary;
  bool is_binary;
  bool
      binary_uses_twice;  // true if binary_other == our value. (for instance,
                          // add r11, r10, r10, which can be gen'ed for r10 * 2)
  hir::Value* binary_other;

  hir::Value::ConstantValue* other_const;
  uint32_t other_index;
};
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
                                   LoadModStoreContext* out) {
  if (!GetLoadModStore(loadinsn, out)) {
    return false;
  }

  if (out->load->opcode->num != OPCODE_LOAD_CONTEXT ||
      out->store->opcode->num != OPCODE_STORE_CONTEXT) {
    return false;
  }

  if (out->modify->opcode->flags &
      (OPCODE_FLAG_VOLATILE | OPCODE_FLAG_MEMORY)) {
    return false;
  }
  uint64_t offs = out->load->src1.offset;

  if (offs != out->store->src1.offset) {
    return false;
  }

  TypeName typ = out->load->dest->type;
  // can happen if op is a conversion
  if (typ != out->store->src2.value->type) {
    return false;
  }
  /*
    set up a whole bunch of convenience fields for the caller
  */
  out->offset = offs;
  out->type = typ;
  const OpcodeInfo& opinf = *out->modify->opcode;
  out->op = opinf.num;
  out->is_commutative = opinf.flags & OPCODE_FLAG_COMMUNATIVE;
  out->is_unary = IsOpcodeUnaryValue(opinf.signature);
  out->is_binary = IsOpcodeBinaryValue(opinf.signature);
  out->binary_uses_twice = false;
  out->binary_other = nullptr;
  out->other_const = nullptr;
  out->other_index = ~0U;
  if (out->is_binary) {
    if (out->modify->src1.value == out->load->dest) {
      out->binary_other = out->modify->src2.value;
      out->other_index = 1;
    } else {
      out->binary_other = out->modify->src1.value;
      out->other_index = 0;
    }
    if (out->binary_other && out->is_constant[out->other_index]) {
      out->other_const = &out->binary_other->constant;
    }
    if (out->binary_other == out->load->dest) {
      out->binary_uses_twice = true;
    }
  }
  return true;
}
volatile int anchor_memory = 0;

static void Do0x1000Add(X64Emitter& e, Reg32 reg) {
  e.add(reg, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000)));
  // e.add(reg, 0x1000);
}

// Note: all types are always aligned in the context.
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
  return e.GetContextReg() + offset.value;
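To make the intent of these load/modify/store helpers concrete: they let the LOAD_CONTEXT_I64 sequence further down collapse a load-context / ALU-op / store-context triple into one read-modify-write x86 instruction on the context block. A sketch of the transformation, using a made-up context offset:

  // HIR before the peephole:
  //   v0 = load_context +0x128 (i64)
  //   v1 = add v0, 1
  //   store_context +0x128, v1
  //
  // With GetLoadModStoreContext matching and kX64FlagsIndependentVars set,
  // HandleLMS64Binary (later in this diff) emits a single RMW instead:
  //   inc qword [ctx + 0x128]
  // and LoadModStore::Consume() tags the add and the store with
  // INSTR_X64_FLAGS_ELIMINATED so their own sequences skip codegen.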
@ -58,51 +178,6 @@ static bool is_definitely_not_eo(const T& v) {
|
|||
|
||||
return is_eo_def(v.value);
|
||||
}
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
|
||||
const T& offset) {
|
||||
assert_true(offset.is_constant);
|
||||
int32_t offset_const = static_cast<int32_t>(offset.constant());
|
||||
|
||||
if (guest.is_constant) {
|
||||
uint32_t address = static_cast<uint32_t>(guest.constant());
|
||||
address += offset_const;
|
||||
if (address < 0x80000000) {
|
||||
return e.GetMembaseReg() + address;
|
||||
} else {
|
||||
if (address >= 0xE0000000 &&
|
||||
xe::memory::allocation_granularity() > 0x1000) {
|
||||
e.mov(e.eax, address + 0x1000);
|
||||
} else {
|
||||
e.mov(e.eax, address);
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000 &&
|
||||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
|
||||
// todo: do branching or use an alt membase and cmov
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
|
||||
|
||||
e.cmp(e.edx, e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, e.edx);
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax + offset_const;
|
||||
}
|
||||
}
|
||||
// Note: most *should* be aligned, but needs to be checked!
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
||||
|
@ -127,11 +202,23 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
|||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.xor_(e.eax, e.eax);
|
||||
Xbyak::Label& jmpback = e.NewCachedLabel();
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
|
||||
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, guest.reg().cvt32());
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&jmpback](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
Do0x1000Add(e, e.eax);
|
||||
e.jmp(jmpback, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
|
||||
e.L(jmpback);
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
@ -140,6 +227,64 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
|||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
|
||||
const T& offset) {
|
||||
assert_true(offset.is_constant);
|
||||
int32_t offset_const = static_cast<int32_t>(offset.constant());
|
||||
if (offset_const == 0) {
|
||||
return ComputeMemoryAddress(e, guest);
|
||||
}
|
||||
if (guest.is_constant) {
|
||||
uint32_t address = static_cast<uint32_t>(guest.constant());
|
||||
address += offset_const;
|
||||
if (address < 0x80000000) {
|
||||
return e.GetMembaseReg() + address;
|
||||
} else {
|
||||
if (address >= 0xE0000000 &&
|
||||
xe::memory::allocation_granularity() > 0x1000) {
|
||||
e.mov(e.eax, address + 0x1000);
|
||||
} else {
|
||||
e.mov(e.eax, address);
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000 &&
|
||||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
|
||||
// todo: do branching or use an alt membase and cmov
|
||||
|
||||
Xbyak::Label& tmplbl = e.NewCachedLabel();
|
||||
|
||||
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
|
||||
|
||||
e.cmp(e.edx, e.GetContextReg().cvt32());
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.edx);
|
||||
|
||||
e.jmp(tmplbl, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
|
||||
e.L(tmplbl);
|
||||
return e.GetMembaseReg() + e.rdx;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax + offset_const;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_ATOMIC_EXCHANGE
|
||||
|
@ -214,11 +359,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
e.add(e.ecx, i.src1.reg().cvt32());
|
||||
Xbyak::Label& backtous = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.ecx);
|
||||
|
||||
e.jmp(backtous, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(backtous);
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
}
|
||||
|
@ -235,11 +389,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
e.add(e.ecx, i.src1.reg().cvt32());
|
||||
Xbyak::Label& backtous = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.ecx);
|
||||
|
||||
e.jmp(backtous, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(backtous);
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
}
|
||||
|
@ -319,25 +482,44 @@ struct STORE_LOCAL_I8
|
|||
e.mov(e.byte[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static bool LocalStoreMayUseMembaseLow(X64Emitter& e, const T& i) {
|
||||
return i.src2.is_constant && i.src2.constant() == 0 &&
|
||||
e.CanUseMembaseLow32As0();
|
||||
}
|
||||
struct STORE_LOCAL_I16
|
||||
: Sequence<STORE_LOCAL_I16, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I16Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
|
||||
if (LocalStoreMayUseMembaseLow(e, i)) {
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_I32
|
||||
: Sequence<STORE_LOCAL_I32, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
|
||||
if (LocalStoreMayUseMembaseLow(e, i)) {
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_I64
|
||||
: Sequence<STORE_LOCAL_I64, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
|
||||
if (i.src2.is_constant && i.src2.constant() == 0) {
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], e.rax);
|
||||
} else {
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_F32
|
||||
|
@ -404,10 +586,133 @@ struct LOAD_CONTEXT_I32
|
|||
}
|
||||
}
|
||||
};
|
||||
template <typename EmitArgType>
|
||||
static bool HandleLMS64Binary(X64Emitter& e, const EmitArgType& i,
|
||||
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
|
||||
uint64_t other_const_val = 0;
|
||||
bool const_fits_in_insn = false;
|
||||
if (lms.other_const) {
|
||||
other_const_val = lms.other_const->u64;
|
||||
const_fits_in_insn = e.ConstantFitsIn32Reg(other_const_val);
|
||||
}
|
||||
|
||||
/*
|
||||
this check is here because we currently cannot handle other variables
|
||||
with this
|
||||
*/
|
||||
if (!lms.other_const && !lms.binary_uses_twice) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (lms.op == OPCODE_ADD) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
if (other_const_val == 1 &&
|
||||
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.qword[addr]);
|
||||
} else {
|
||||
e.add(e.qword[addr], (uint32_t)other_const_val);
|
||||
}
|
||||
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.add(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
} else if (lms.binary_uses_twice) {
|
||||
// we're being added to ourselves, we are a multiply by 2
|
||||
|
||||
e.shl(e.qword[addr], 1);
|
||||
return true;
|
||||
} else if (lms.binary_other) {
|
||||
return false; // cannot handle other variables right now.
|
||||
}
|
||||
} else if (lms.op == OPCODE_SUB) {
|
||||
if (lms.other_index != 1) {
|
||||
return false; // if we are the second operand, we cant combine memory
|
||||
// access and operation
|
||||
}
|
||||
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
if (other_const_val == 1 &&
|
||||
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.dec(e.qword[addr]);
|
||||
} else {
|
||||
e.sub(e.qword[addr], (uint32_t)other_const_val);
|
||||
}
|
||||
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.sub(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} else if (lms.op == OPCODE_AND) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.and_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.and_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} else if (lms.op == OPCODE_OR) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.or_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.or_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} else if (lms.op == OPCODE_XOR) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.xor_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.xor_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
template <typename EmitArgType>
|
||||
static bool HandleLMS64Unary(X64Emitter& e, const EmitArgType& i,
|
||||
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
|
||||
Opcode op = lms.op;
|
||||
|
||||
if (op == OPCODE_NOT) {
|
||||
e.not_(e.qword[addr]);
|
||||
return true;
|
||||
} else if (op == OPCODE_NEG) {
|
||||
e.neg(e.qword[addr]);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
struct LOAD_CONTEXT_I64
|
||||
: Sequence<LOAD_CONTEXT_I64, I<OPCODE_LOAD_CONTEXT, I64Op, OffsetOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
LoadModStoreContext lms{};
|
||||
if (GetLoadModStoreContext(i.instr, &lms)) {
|
||||
if (lms.is_binary && HandleLMS64Binary(e, i, lms, addr)) {
|
||||
lms.Consume();
|
||||
return;
|
||||
} else if (lms.is_unary && HandleLMS64Unary(e, i, lms, addr)) {
|
||||
lms.Consume();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
e.mov(i.dest, e.qword[addr]);
|
||||
if (IsTracingData()) {
|
||||
e.mov(e.GetNativeParam(1), e.qword[addr]);
|
||||
|
@ -483,7 +788,11 @@ struct STORE_CONTEXT_I16
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.mov(e.word[addr], i.src2.constant());
|
||||
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src2.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src2);
|
||||
}
|
||||
|
@ -500,7 +809,11 @@ struct STORE_CONTEXT_I32
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.mov(e.dword[addr], i.src2.constant());
|
||||
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src2.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src2);
|
||||
}
|
||||
|
@ -569,9 +882,14 @@ struct STORE_CONTEXT_V128
|
|||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
SimdDomain domain = e.DeduceSimdDomain(i.src2.value);
|
||||
if (domain == SimdDomain::FLOATING) {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[addr], i.src2);
|
||||
}
|
||||
}
|
||||
if (IsTracingData()) {
|
||||
e.lea(e.GetNativeParam(1), e.ptr[addr]);
|
||||
|
@ -735,7 +1053,11 @@ struct STORE_OFFSET_I16
|
|||
}
|
||||
} else {
|
||||
if (i.src3.is_constant) {
|
||||
e.mov(e.word[addr], i.src3.constant());
|
||||
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src3.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src3);
|
||||
}
|
||||
|
@ -757,7 +1079,11 @@ struct STORE_OFFSET_I32
|
|||
}
|
||||
} else {
|
||||
if (i.src3.is_constant) {
|
||||
e.mov(e.dword[addr], i.src3.constant());
|
||||
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src3.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src3);
|
||||
}
|
||||
|
@ -895,7 +1221,7 @@ struct LOAD_V128 : Sequence<LOAD_V128, I<OPCODE_LOAD, V128Op, I64Op>> {
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeMemoryAddress(e, i.src1);
|
||||
// TODO(benvanik): we should try to stick to movaps if possible.
|
||||
e.vmovups(i.dest, e.ptr[addr]);
|
||||
e.vmovdqa(i.dest, e.ptr[addr]);
|
||||
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
|
||||
// TODO(benvanik): find a way to do this without the memory load.
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask));
|
||||
|
@ -1054,13 +1380,15 @@ struct STORE_V128
|
|||
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
|
||||
assert_false(i.src2.is_constant);
|
||||
e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask));
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
// changed from vmovaps, the penalty on the vpshufb is unavoidable but
|
||||
// we dont need to incur another here too
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
e.vmovdqa(e.ptr[addr], i.src2);
|
||||
}
|
||||
}
|
||||
if (IsTracingData()) {
|
||||
|
@ -1081,10 +1409,12 @@ struct CACHE_CONTROL
|
|||
: Sequence<CACHE_CONTROL,
|
||||
I<OPCODE_CACHE_CONTROL, VoidOp, I64Op, OffsetOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
bool is_clflush = false, is_prefetch = false;
|
||||
bool is_clflush = false, is_prefetch = false, is_prefetchw = false;
|
||||
switch (CacheControlType(i.instr->flags)) {
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE:
|
||||
is_prefetchw = true;
|
||||
break;
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
|
||||
is_prefetch = true;
|
||||
break;
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE:
|
||||
|
@ -1095,6 +1425,11 @@ struct CACHE_CONTROL
|
|||
assert_unhandled_case(CacheControlType(i.instr->flags));
|
||||
return;
|
||||
}
|
||||
if (is_prefetchw && !e.IsFeatureEnabled(kX64EmitPrefetchW)) {
|
||||
is_prefetchw = false;
|
||||
is_prefetch = true; // cant prefetchw, cpu doesnt have it (unlikely to
|
||||
// happen). just prefetcht0
|
||||
}
|
||||
size_t cache_line_size = i.src2.value;
|
||||
|
||||
RegExp addr;
|
||||
|
@ -1117,13 +1452,24 @@ struct CACHE_CONTROL
|
|||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't
|
||||
// do it via memory mapping.
|
||||
e.mov(e.eax, i.src1.reg().cvt32());
|
||||
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.movzx(e.eax, e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, i.src1.reg().cvt32());
|
||||
|
||||
Xbyak::Label& tmplbl = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.eax);
|
||||
|
||||
e.jmp(tmplbl, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(tmplbl);
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
@ -1131,12 +1477,17 @@ struct CACHE_CONTROL
|
|||
}
|
||||
addr = e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
// todo: use clflushopt + sfence on cpus that support it
|
||||
if (is_clflush) {
|
||||
e.clflush(e.ptr[addr]);
|
||||
}
|
||||
|
||||
if (is_prefetch) {
|
||||
e.prefetcht0(e.ptr[addr]);
|
||||
}
|
||||
if (is_prefetchw) {
|
||||
e.prefetchw(e.ptr[addr]);
|
||||
}
|
||||
|
||||
if (cache_line_size >= 128) {
|
||||
// Prefetch the other 64 bytes of the 128-byte cache line.
|
||||
|
@ -1151,6 +1502,9 @@ struct CACHE_CONTROL
|
|||
if (is_prefetch) {
|
||||
e.prefetcht0(e.ptr[addr]);
|
||||
}
|
||||
if (is_prefetchw) {
|
||||
e.prefetchw(e.ptr[addr]);
|
||||
}
|
||||
assert_true(cache_line_size == 128);
|
||||
}
|
||||
}
|
||||
|
@ -1178,20 +1532,24 @@ struct MEMSET_I64_I8_I64
|
|||
assert_true(i.src2.constant() == 0);
|
||||
e.vpxor(e.xmm0, e.xmm0);
|
||||
auto addr = ComputeMemoryAddress(e, i.src1);
|
||||
/*
|
||||
chrispy: changed to vmovdqa, the mismatch between vpxor and vmovaps
|
||||
was causing a 1 cycle stall before the first store
|
||||
*/
|
||||
switch (i.src3.constant()) {
|
||||
case 32:
|
||||
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr], e.ymm0);
|
||||
break;
|
||||
case 128:
|
||||
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0);
|
||||
// probably should lea the address beforehand
|
||||
e.vmovdqa(e.ptr[addr + 0 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 2 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 4 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 6 * 16], e.ymm0);
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(i.src3.constant());
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -13,6 +13,8 @@
|
|||
#include "xenia/cpu/hir/instr.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#define assert_impossible_sequence(name) \
|
||||
assert_always("impossible sequence hit" #name);
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
|
|
@@ -749,7 +749,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;


        case OPCODE_PERMUTE: {
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              i->src3.value->IsConstant() &&
@@ -760,17 +760,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }

          else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
          else if (i->src2.value->IsConstantZero() &&
                   i->src3.value->IsConstantZero() &&
                   i->flags == INT8_TYPE /*probably safe for int16 too*/) {
            /*
              chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
              chrispy: hoisted this check here from x64_seq_vector where if
              src1 is not constant, but src2 and src3 are zero, then we know
              the result will always be zero
            */

            v->set_zero(VEC128_TYPE);
            i->Remove();
            result = true;
          }


          break;
        }
        case OPCODE_INSERT:
@@ -930,6 +933,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;
        case OPCODE_TO_SINGLE:
          if (i->src1.value->IsConstant()) {
            v->set_from(i->src1.value);
            v->ToSingle();
            i->Remove();
            result = true;
          }
          break;
        default:
          // Ignored.
          break;

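For reference on the OPCODE_TO_SINGLE folding added above: the opcode (defined near the end of this diff) models PPC single-precision result rounding, and its constant evaluation in Value::ToSingle is just a double-to-float-to-double round trip. A minimal standalone equivalent of that folding step:

  #include <cstdio>

  // Same rounding the pass applies to a constant FLOAT64 value when it folds
  // OPCODE_TO_SINGLE (see Value::ToSingle later in this diff).
  static double ToSingle(double v) {
    return static_cast<double>(static_cast<float>(v));
  }

  int main() {
    // 1/3 is not exactly representable; rounding to float drops precision.
    std::printf("%.17g -> %.17g\n", 1.0 / 3.0, ToSingle(1.0 / 3.0));
    return 0;
  }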
@ -10,6 +10,7 @@
|
|||
#include "xenia/cpu/compiler/passes/simplification_pass.h"
|
||||
|
||||
#include "xenia/base/byte_order.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -82,7 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= SimplifyBitArith(builder);
|
||||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= BackpropTruncations(builder);
|
||||
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
return true;
|
||||
|
@ -1207,71 +1208,6 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
|
|||
return result;
|
||||
}
|
||||
|
||||
struct TruncateSimplifier {
|
||||
TypeName type_from, type_to;
|
||||
uint32_t sizeof_from, sizeof_to;
|
||||
uint32_t bit_sizeof_from, bit_sizeof_to;
|
||||
uint64_t typemask_from, typemask_to;
|
||||
hir::HIRBuilder* builder;
|
||||
hir::Instr* truncate_instr;
|
||||
hir::Value* truncated_value;
|
||||
hir::Instr* truncated_value_def;
|
||||
};
|
||||
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
|
||||
hir::HIRBuilder* builder) {
|
||||
if (i->opcode != &OPCODE_TRUNCATE_info) {
|
||||
return false;
|
||||
}
|
||||
TypeName type_from = i->src1.value->type;
|
||||
TypeName type_to = i->dest->type;
|
||||
|
||||
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
|
||||
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
|
||||
|
||||
Instr* input_def = i->src1.value->GetDefSkipAssigns();
|
||||
if (!input_def) {
|
||||
return false;
|
||||
}
|
||||
Opcode input_opc = input_def->opcode->num;
|
||||
|
||||
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
|
||||
uint32_t src2_shift = input_def->src2.value->AsUint32();
|
||||
if (src2_shift < (sizeof_to * CHAR_BIT)) {
|
||||
Value* truncated_preshift =
|
||||
builder->Truncate(input_def->src1.value, type_to);
|
||||
|
||||
truncated_preshift->def->MoveBefore(i);
|
||||
i->Replace(&OPCODE_SHL_info, 0);
|
||||
i->set_src1(truncated_preshift);
|
||||
i->set_src2(input_def->src2.value);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (input_opc == OPCODE_LOAD_CONTEXT) {
|
||||
if (sizeof_from == 8 && sizeof_to == 4) {
|
||||
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
|
||||
loadof->def->MoveBefore(input_def);
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(loadof);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
|
||||
bool result = false;
|
||||
auto block = builder->first_block();
|
||||
while (block) {
|
||||
auto i = block->instr_head;
|
||||
while (i) {
|
||||
result |= BackpropTruncations(i, builder);
|
||||
i = i->next;
|
||||
}
|
||||
block = block->next;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
|
||||
auto def = value->def;
|
||||
if (def && def->opcode == &OPCODE_ASSIGN_info) {
|
||||
|
|
|
@ -32,8 +32,6 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool SimplifyAssignments(hir::HIRBuilder* builder);
|
||||
hir::Value* CheckValue(hir::Value* value, bool& result);
|
||||
bool SimplifyBitArith(hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::HIRBuilder* builder);
|
||||
// handle either or or xor with 0
|
||||
bool CheckOrXorZero(hir::Instr* i);
|
||||
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
|
|
@ -692,6 +692,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
|
|||
instr->block = block;
|
||||
instr->opcode = &opcode_info;
|
||||
instr->flags = flags;
|
||||
instr->backend_flags = 0;
|
||||
instr->dest = dest;
|
||||
instr->src1.value = instr->src2.value = instr->src3.value = NULL;
|
||||
instr->src1_use = instr->src2_use = instr->src3_use = NULL;
|
||||
|
@ -1492,7 +1493,6 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
|
|||
part_type);
|
||||
}
|
||||
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
|
||||
return value1;
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
|
||||
|
@ -1501,6 +1501,14 @@ Value* HIRBuilder::VectorDenormFlush(Value* value1) {
|
|||
i->src3.value = nullptr;
|
||||
return i->dest;
|
||||
}
|
||||
Value* HIRBuilder::ToSingle(Value* value) {
|
||||
assert_true(value->type == FLOAT64_TYPE);
|
||||
Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
|
||||
i->set_src1(value);
|
||||
i->src2.value = nullptr;
|
||||
i->src3.value = nullptr;
|
||||
return i->dest;
|
||||
}
|
||||
Value* HIRBuilder::Add(Value* value1, Value* value2,
|
||||
uint32_t arithmetic_flags) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
@ -1720,7 +1728,6 @@ Value* HIRBuilder::Log2(Value* value) {
|
|||
return i->dest;
|
||||
}
|
||||
|
||||
|
||||
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
ASSERT_VECTOR_TYPE(value2);
|
||||
|
|
|
@ -200,7 +200,7 @@ class HIRBuilder {
|
|||
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorDenormFlush(Value* value1);
|
||||
|
||||
Value* ToSingle(Value* value);
|
||||
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
|
||||
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
|
||||
uint32_t arithmetic_flags = 0);
|
||||
|
|
|
@ -180,6 +180,26 @@ exit_loop:
|
|||
*tunnel_flags = traversed_types;
|
||||
return current_def;
|
||||
}
|
||||
bool Instr::IsFake() const {
|
||||
Opcode num = opcode->num;
|
||||
switch (num) {
|
||||
case OPCODE_NOP:
|
||||
case OPCODE_COMMENT:
|
||||
case OPCODE_CONTEXT_BARRIER:
|
||||
case OPCODE_SOURCE_OFFSET:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const Instr* Instr::GetNonFakePrev() const {
|
||||
const Instr* curr = prev;
|
||||
|
||||
while (curr && curr->IsFake()) {
|
||||
curr = curr->prev;
|
||||
}
|
||||
return curr;
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -42,6 +42,7 @@ class Instr {
|
|||
|
||||
const OpcodeInfo* opcode;
|
||||
uint16_t flags;
|
||||
uint16_t backend_flags; // backends may do whatever they wish with this
|
||||
uint32_t ordinal;
|
||||
|
||||
typedef union {
|
||||
|
@ -158,6 +159,11 @@ if both are constant, return nullptr, nullptr
|
|||
call_for_values(src3.value, 2);
|
||||
}
|
||||
}
|
||||
bool IsFake() const;
|
||||
|
||||
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
|
||||
// OPCODE_SOURCE_OFFSET
|
||||
const hir::Instr* GetNonFakePrev() const;
|
||||
};
|
||||
|
||||
} // namespace hir
|
||||
|
|
|
@ -281,7 +281,10 @@ enum Opcode {
|
|||
OPCODE_ATOMIC_COMPARE_EXCHANGE,
|
||||
OPCODE_SET_ROUNDING_MODE,
|
||||
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
|
||||
// as we already have OPCODE_ROUND. round double to float (
|
||||
// ppc "single" fpu instruction result rounding behavior )
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
enum OpcodeFlags {
|
||||
|
@ -352,7 +355,9 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
|
|||
return (signature & ~(0x7)) ==
|
||||
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
|
||||
}
|
||||
|
||||
static bool IsOpcodeUnaryValue(uint32_t signature) {
|
||||
return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3));
|
||||
}
|
||||
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
|
||||
OpcodeSignatureType& src1,
|
||||
OpcodeSignatureType& src2,
|
||||
|
|
|
@ -679,4 +679,11 @@ DEFINE_OPCODE(
|
|||
"vector_denormflush",
|
||||
OPCODE_SIG_V_V,
|
||||
0
|
||||
)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_TO_SINGLE,
|
||||
"to_single",
|
||||
OPCODE_SIG_V_V,
|
||||
0
|
||||
)
|
|
@ -1643,6 +1643,11 @@ void Value::DenormalFlush() {
|
|||
constant.v128.u32[i] = current_element;
|
||||
}
|
||||
}
|
||||
void Value::ToSingle() {
|
||||
assert_true(type == FLOAT64_TYPE);
|
||||
|
||||
constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
|
||||
}
|
||||
void Value::CountLeadingZeros(const Value* other) {
|
||||
switch (other->type) {
|
||||
case INT8_TYPE:
|
||||
|
@ -1805,6 +1810,25 @@ hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
|
|||
return nullptr;
|
||||
}
|
||||
}
|
||||
// does the value only have one instr that uses it?
|
||||
bool Value::HasSingleUse() const {
|
||||
return use_head && use_head->next == nullptr;
|
||||
}
|
||||
bool Value::AllUsesByOneInsn() const {
|
||||
if (!use_head) {
|
||||
return false;
|
||||
}
|
||||
const Use* first_use = use_head;
|
||||
const Instr* should_match = first_use->instr;
|
||||
|
||||
for (const Use* current_use = first_use->next; current_use;
|
||||
current_use = current_use->next) {
|
||||
if (current_use->instr != should_match) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -226,6 +226,15 @@ class Value {
|
|||
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
|
||||
}
|
||||
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
|
||||
|
||||
inline bool IsEqual(const Value* other) const {
|
||||
if (this == other) {
|
||||
return true;
|
||||
} else if ((this->flags & other->flags) & VALUE_IS_CONSTANT) {
|
||||
return this->IsConstantEQ(other);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool IsConstantTrue() const {
|
||||
if (type == VEC128_TYPE) {
|
||||
assert_always();
|
||||
|
@ -327,7 +336,7 @@ class Value {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
bool IsConstantEQ(Value* other) const {
|
||||
bool IsConstantEQ(const Value* other) const {
|
||||
if (type == VEC128_TYPE) {
|
||||
assert_always();
|
||||
}
|
||||
|
@ -594,13 +603,19 @@ class Value {
|
|||
bool saturate);
|
||||
void ByteSwap();
|
||||
void DenormalFlush();
|
||||
|
||||
void ToSingle();
|
||||
void CountLeadingZeros(const Value* other);
|
||||
bool Compare(Opcode opcode, Value* other);
|
||||
hir::Instr* GetDefSkipAssigns();
|
||||
// tunnel_flags is updated to the kinds we actually traversed
|
||||
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
|
||||
|
||||
// does the value only have one instr that uses it?
|
||||
bool HasSingleUse() const;
|
||||
// returns true if every single use is as an operand to a single instruction
|
||||
// (add var2, var1, var1)
|
||||
bool AllUsesByOneInsn() const;
|
||||
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -379,7 +379,7 @@ typedef struct alignas(64) PPCContext_s {
|
|||
uint64_t lr; // 0x10 Link register
|
||||
double f[32]; // 0x120 Floating-point registers
|
||||
vec128_t v[128]; // 0x220 VMX128 vector registers
|
||||
|
||||
vec128_t vscr_vec;
|
||||
// XER register:
|
||||
// Split to make it easier to do individual updates.
|
||||
uint8_t xer_ca;
|
||||
|
@@ -422,7 +422,7 @@ typedef struct alignas(64) PPCContext_s {
  // Value of last reserved load
  uint64_t reserved_val;
  ThreadState* thread_state;
  uint8_t* virtual_membase;
  static std::string GetRegisterName(PPCRegister reg);
  std::string GetStringFromValue(PPCRegister reg) const;
  void SetValueFromString(PPCRegister reg, std::string value);
@@ -432,6 +432,7 @@ typedef struct alignas(64) PPCContext_s {
                        std::string& result) const;
} PPCContext;
#pragma pack(pop)
constexpr size_t ppcctx_size = sizeof(PPCContext);
static_assert(sizeof(PPCContext) % 64 == 0, "64b padded");

}  // namespace ppc
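If the backend ends up touching the new vscr_vec slot with aligned 128-bit moves, a compile-time guard on its offset would be cheap insurance; a sketch under that assumption (not part of the commit):

#include <cstddef>

#include "xenia/cpu/ppc/ppc_context.h"

// PPCContext is alignas(64), so this only constrains the field's position
// inside the struct.
static_assert(offsetof(xe::cpu::ppc::PPCContext, vscr_vec) % 16 == 0,
              "vscr_vec should stay 16-byte aligned for aligned vector access");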
@@ -355,13 +355,18 @@ int InstrEmit_stvrxl128(PPCHIRBuilder& f, const InstrData& i) {
}

int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
  XEINSTRNOTIMPLEMENTED();
  return 1;
  // is this the right format?

  f.StoreVR(i.VX128_1.RB,
            f.LoadContext(offsetof(PPCContext, vscr_vec), VEC128_TYPE));
  return 0;
}

int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
  XEINSTRNOTIMPLEMENTED();
  return 1;
  // is this the right format?
  Value* v = f.LoadVR(i.VX128_1.RB);
  f.StoreContext(offsetof(PPCContext, vscr_vec), v);
  return 0;
}

int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
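A minimal host-side sketch of the semantics these two emitters implement, assuming VSCR simply round-trips through the context slot (the stand-in types below are illustrative, not the real vec128_t/PPCContext):

#include <cassert>
#include <cstdint>
#include <cstring>

struct Vec128 { uint32_t u32[4]; };   // stand-in for the real vec128_t
struct Context { Vec128 vscr_vec; };  // stand-in for PPCContext

int main() {
  Context ctx{};
  Vec128 vr3 = {{0x00010000u, 0u, 0u, 0u}};  // some 128-bit register value
  ctx.vscr_vec = vr3;                        // mtvscr v3
  Vec128 vr4 = ctx.vscr_vec;                 // mfvscr v4
  assert(std::memcmp(&vr3, &vr4, sizeof(vr3)) == 0);  // value is preserved
  return 0;
}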
@@ -1105,7 +1110,7 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
  // Dot product XYZ.
  // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
  Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
  //chrispy: denormal outputs for Dot product are unconditionally made 0
  // chrispy: denormal outputs for Dot product are unconditionally made 0
  v = f.VectorDenormFlush(v);
  f.StoreVR(VX128_VD128, v);
  return 0;
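For reference, "denormal outputs are unconditionally made 0" means any lane whose result has a zero exponent field is replaced by a signed zero; a host-side model of a single lane (illustrative, not the emitter's actual code path):

#include <cassert>
#include <cstdint>
#include <cstring>

static float FlushDenormalToZero(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7F800000u) == 0) {  // exponent == 0: zero or denormal
    bits &= 0x80000000u;            // keep only the sign -> signed zero
  }
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  assert(FlushDenormalToZero(1e-45f) == 0.0f);  // denormal flushed to +0
  assert(FlushDenormalToZero(1.5f) == 1.5f);    // normal values pass through
  return 0;
}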
@@ -336,6 +336,7 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
    XEINSTRNOTIMPLEMENTED();
    return 1;
  }

  Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
                                  f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
                          INT64_TYPE);

@@ -353,6 +354,7 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
    XEINSTRNOTIMPLEMENTED();
    return 1;
  }

  Value* v = f.ZeroExtend(
      f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
              f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
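A host-side reference for what the emitted HIR computes here (illustrative, not from this commit): mulhw/mulhwu produce the high 32 bits of a 32x32 multiply, signed and unsigned respectively.

#include <cassert>
#include <cstdint>

static int32_t mulhw(int32_t ra, int32_t rb) {
  return static_cast<int32_t>(
      (static_cast<int64_t>(ra) * static_cast<int64_t>(rb)) >> 32);
}

static uint32_t mulhwu(uint32_t ra, uint32_t rb) {
  return static_cast<uint32_t>(
      (static_cast<uint64_t>(ra) * static_cast<uint64_t>(rb)) >> 32);
}

int main() {
  assert(mulhw(-2, 0x40000000) == -1);    // high word of -0x80000000
  assert(mulhwu(0x80000000u, 2u) == 1u);  // high word of 0x1'00000000
  return 0;
}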
@@ -46,7 +46,7 @@ int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) + (frB)
  Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
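f.ToSingle presumably wraps the new OPCODE_TO_SINGLE in the same unary-op pattern the other HIRBuilder wrappers use; a sketch under that assumption (the actual builder body is not shown in this diff):

Value* HIRBuilder::ToSingle(Value* value) {
  // Assumed to mirror existing unary wrappers such as Convert():
  // allocate a FLOAT64 result and append a single OPCODE_TO_SINGLE instr.
  Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
  i->set_src1(value);
  i->src2.value = i->src3.value = NULL;
  return i->dest;
}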
@@ -63,7 +63,7 @@ int InstrEmit_fdivx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fdivsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- frA / frB
  Value* v = f.Div(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -80,7 +80,7 @@ int InstrEmit_fmulx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) x (frC)
  Value* v = f.Mul(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -88,9 +88,9 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- 1.0 / (frB)
  Value* v = f.Convert(f.Div(f.LoadConstantFloat32(1.0f),
                             f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)),
                       FLOAT64_TYPE);

  Value* v = f.Recip(f.LoadFPR(i.A.FRB));
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -116,7 +116,7 @@ int InstrEmit_fsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) - (frB)
  Value* v = f.Sub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
@@ -132,64 +132,63 @@ int InstrEmit_fselx(PPCHIRBuilder& f, const InstrData& i) {
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
  // Double precision:
static int InstrEmit_fsqrt(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- sqrt(frB)
  Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fsqrt(f, i, false);
}

int InstrEmit_fsqrtsx(PPCHIRBuilder& f, const InstrData& i) {
  // Single precision:
  // frD <- sqrt(frB)
  Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fsqrt(f, i, true);
}

// Floating-point multiply-add (A-9)

int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
static int InstrEmit_fmadd(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- (frA x frC) + frB
  Value* v =
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fmadd(f, i, false);
}

int InstrEmit_fmaddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) + frB
  return InstrEmit_fmadd(f, i, true);
}

static int InstrEmit_fmsub(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fmsubx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fmsub(f, i, false);
}

int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fmsub(f, i, true);
}

int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
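The "shared static helper plus a single flag" shape used for fsqrt/fmadd/fmsub above could fold the remaining paired emitters the same way; a hypothetical sketch for fadd/fadds (not part of this commit):

static int InstrEmit_fadd_common(PPCHIRBuilder& f, const InstrData& i,
                                 bool single) {
  // frD <- (frA) + (frB)
  Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fadd_common(f, i, false);
}

int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fadd_common(f, i, true);
}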
@@ -205,7 +204,7 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] + frB)
  Value* v = f.Neg(
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -224,7 +223,7 @@ int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] - frB)
  Value* v = f.Neg(
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;