diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index 054d7e752..aa9097602 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -63,6 +63,10 @@ class Backend { virtual void InstallBreakpoint(Breakpoint* breakpoint) {} virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {} virtual void UninstallBreakpoint(Breakpoint* breakpoint) {} + // ctx points to the start of a ppccontext, ctx - page_allocation_granularity + // up until the start of ctx may be used by the backend to store whatever data + // they want + virtual void InitializeBackendContext(void* ctx) {} protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index c6f2d6180..6d5690c2f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -32,6 +32,9 @@ #include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/function.h" #include "xenia/cpu/function_debug_info.h" +#include "xenia/cpu/hir/instr.h" +#include "xenia/cpu/hir/opcodes.h" +#include "xenia/cpu/hir/value.h" #include "xenia/cpu/processor.h" #include "xenia/cpu/symbol.h" #include "xenia/cpu/thread_state.h" @@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() { } uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; uint32_t str_ptr = uint32_t(thread_state->context()->r[3]); // uint16_t str_len = uint16_t(thread_state->context()->r[4]); auto str = thread_state->memory()->TranslateVirtual(str_ptr); @@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { } uint64_t TrapDebugBreak(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; XELOGE("tw/td forced trap hit! This should be a crash!"); if (cvars::break_on_debugbreak) { xe::debugging::Break(); @@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; // TODO(benvanik): required? assert_not_zero(target_address); @@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) { MovMem64(addr + 8, v.high); return ptr[addr]; } +static bool IsVectorCompare(const Instr* i) { + hir::Opcode op = i->opcode->num; + return op >= hir::OPCODE_VECTOR_COMPARE_EQ && + op <= hir::OPCODE_VECTOR_COMPARE_UGE; +} +static bool IsFlaggedVectorOp(const Instr* i) { + if (IsVectorCompare(i)) { + return true; + } + hir::Opcode op = i->opcode->num; + using namespace hir; + switch (op) { + case OPCODE_VECTOR_SUB: + case OPCODE_VECTOR_ADD: + case OPCODE_SWIZZLE: + return true; + } + return false; +} + +static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) { + switch (df->flags) { // check what datatype we compared as + case hir::INT16_TYPE: + case hir::INT32_TYPE: + case hir::INT8_TYPE: + case hir::INT64_TYPE: + return SimdDomain::INTEGER; + case hir::FLOAT32_TYPE: + case hir::FLOAT64_TYPE: // pretty sure float64 doesnt occur with vectors. 
+ // here for completeness + return SimdDomain::FLOATING; + default: + return SimdDomain::DONTCARE; + } + return SimdDomain::DONTCARE; +} +// this list is incomplete +static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_LOAD_VECTOR_SHL: + case OPCODE_LOAD_VECTOR_SHR: + case OPCODE_VECTOR_CONVERT_F2I: + case OPCODE_VECTOR_MIN: // there apparently is no FLOAT32_TYPE for min/maxs + // flags + case OPCODE_VECTOR_MAX: + case OPCODE_VECTOR_SHL: + case OPCODE_VECTOR_SHR: + case OPCODE_VECTOR_SHA: + case OPCODE_VECTOR_ROTATE_LEFT: + case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this + case OPCODE_EXTRACT: + case OPCODE_INSERT: // apparently no f32 type for these two + return true; + } + return false; +} +static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_VECTOR_CONVERT_I2F: + case OPCODE_VECTOR_DENORMFLUSH: + case OPCODE_DOT_PRODUCT_3: + case OPCODE_DOT_PRODUCT_4: + case OPCODE_LOG2: + case OPCODE_POW2: + case OPCODE_RECIP: + case OPCODE_ROUND: + case OPCODE_SQRT: + case OPCODE_MUL: + case OPCODE_MUL_SUB: + case OPCODE_MUL_ADD: + case OPCODE_ABS: + return true; + } + return false; +} + +SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) { + hir::Instr* df = for_value->def; + if (!df) { + // todo: visit uses to figure out domain + return SimdDomain::DONTCARE; + + } else { + SimdDomain result = SimdDomain::DONTCARE; + + if (IsFlaggedVectorOp(df)) { + result = GetDomainForFlaggedVectorOp(df); + } else if (IsDefiniteIntegerDomainOpcode(df->opcode->num)) { + result = SimdDomain::INTEGER; + } else if (IsDefiniteFloatingDomainOpcode(df->opcode->num)) { + result = SimdDomain::FLOATING; + } + + // todo: check if still dontcare, if so, visit uses of the value to figure + // it out + return result; + } + + return SimdDomain::DONTCARE; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index d73d86fe1..519bc629a 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -44,7 +44,39 @@ enum RegisterFlags { REG_DEST = (1 << 0), REG_ABCD = (1 << 1), }; +/* + SSE/AVX/AVX512 has seperate move instructions/shuffle instructions for float + data and int data for a reason most processors implement two distinct + pipelines, one for the integer domain and one for the floating point domain + currently, xenia makes no distinction between the two. Crossing domains is + expensive. On Zen processors the penalty is one cycle each time you cross, + plus the two pipelines need to synchronize Often xenia will emit an integer + instruction, then a floating instruction, then integer again. 
this + effectively adds at least two cycles to the time taken These values will in + the future be used as tags to operations that tell them which domain to + operate in, if its at all possible to avoid crossing +*/ +enum class SimdDomain : uint32_t { + FLOATING, + INTEGER, + DONTCARE, + CONFLICTING // just used as a special result for PickDomain, different from + // dontcare (dontcare means we just dont know the domain, + // CONFLICTING means its used in multiple domains) +}; +static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) { + if (dom1 == dom2) { + return dom1; + } + if (dom1 == SimdDomain::DONTCARE) { + return dom2; + } + if (dom2 == SimdDomain::DONTCARE) { + return dom1; + } + return SimdDomain::CONFLICTING; +} enum XmmConst { XMMZero = 0, XMMOne, @@ -122,7 +154,7 @@ enum XmmConst { XMMLVSLTableBase, XMMLVSRTableBase, XMMSingleDenormalMask, - XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3 + XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMXenosF16ExtRangeStart }; @@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL, kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, - kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) - kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) + kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1) + kX64FastLoop = + 1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2) kX64EmitAVX512VBMI = 1 << 14 }; class ResolvableGuestCall { @@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator { FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } + SimdDomain DeduceSimdDomain(const hir::Value* for_value); protected: void* Emplace(const EmitFunctionInfo& func_info, diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 0646fdb39..33919d466 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -12,11 +12,11 @@ #include #include +#include "xenia/base/cvar.h" #include "xenia/base/memory.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/ppc/ppc_context.h" -#include "xenia/base/cvar.h" DEFINE_bool( elide_e0_check, false, @@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. + + // todo: do branching or use an alt membase and cmov e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const); + e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]); + + e.cmp(e.edx, e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); - e.add(e.eax, guest.reg().cvt32()); + e.add(e.eax, e.edx); + return e.GetMembaseReg() + e.rax; + } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. @@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. 
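    // GetContextReg().cvt32() works as the 0xE0000000 boundary check below
    // because AllocateContext() in thread_state.cc guarantees every PPCContext
    // is placed so that the low 32 bits of its address are exactly 0xE0000000.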
e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000); + e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); e.add(e.eax, guest.reg().cvt32()); @@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -1113,7 +1119,7 @@ struct CACHE_CONTROL if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.movzx(e.eax, e.al); e.shl(e.eax, 12); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 1cca6469f..7c55300db 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -1826,7 +1826,7 @@ struct PERMUTE_I32 } } }; -//todo: use this on const src1 +// todo: use this on const src1 static vec128_t FixupConstantShuf8(vec128_t input) { for (uint32_t i = 0; i < 16; ++i) { input.u8[i] ^= 0x03; @@ -1984,7 +1984,11 @@ struct SWIZZLE } else { src1 = i.src1; } - e.vpshufd(i.dest, src1, swizzle_mask); + if (element_type == INT32_TYPE) { + e.vpshufd(i.dest, src1, swizzle_mask); + } else if (element_type == FLOAT32_TYPE) { + e.vshufps(i.dest, src1, src1, swizzle_mask); + } } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { assert_always(); } else { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5af242118..73e2d646b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -717,6 +717,9 @@ struct SELECT_V128_I8 static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. // dest = src1 != 0 ? 
src2 : src3 + /* + chrispy: this is dead code, this sequence is never emitted + */ e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vpbroadcastd(e.xmm1, e.xmm1); @@ -737,11 +740,46 @@ struct SELECT_V128_I8 e.vpor(i.dest, e.xmm1); } }; + +enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps }; +static bool IsVectorCompare(const Instr* i) { + Opcode op = i->opcode->num; + return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE; +} +/* + OPCODE_SELECT does a bit by bit selection, however, if the selector is the + result of a comparison or if each element may only be 0xff or 0 we may use a + blend instruction instead +*/ +static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) { + const Instr* df = src1v->def; + if (!df) { + return PermittedBlend::NotPermitted; + } else { + if (!IsVectorCompare(df)) { + return PermittedBlend::NotPermitted; // todo: check ors, ands of + // condition + } else { + switch (df->flags) { // check what datatype we compared as + case INT16_TYPE: + case INT32_TYPE: + case INT8_TYPE: + return PermittedBlend::Int8; // use vpblendvb + case FLOAT32_TYPE: + return PermittedBlend::Ps; // use vblendvps + default: // unknown type! just ignore + return PermittedBlend::NotPermitted; + } + } + } +} struct SELECT_V128_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1; + PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value); + //todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps if (i.src1.is_constant) { e.LoadConstantXmm(src1, i.src1.constant()); } @@ -756,10 +794,16 @@ struct SELECT_V128_V128 e.LoadConstantXmm(src3, i.src3.constant()); } - // src1 ? src2 : src3; - e.vpandn(e.xmm3, src1, src2); - e.vpand(i.dest, src1, src3); - e.vpor(i.dest, i.dest, e.xmm3); + if (mayblend == PermittedBlend::Int8) { + e.vpblendvb(i.dest, src2, src3, src1); + } else if (mayblend == PermittedBlend::Ps) { + e.vblendvps(i.dest, src2, src3, src1); + } else { + // src1 ? src2 : src3; + e.vpandn(e.xmm3, src1, src2); + e.vpand(i.dest, src1, src3); + e.vpor(i.dest, i.dest, e.xmm3); + } } }; EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, @@ -2122,7 +2166,8 @@ struct MUL_ADD_V128 // TODO(benvanik): the vfmadd sequence produces slightly different results // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. - if (false && e.IsFeatureEnabled(kX64EmitFMA)) { + // chrispy: reenabled, i have added the DAZ behavior that was missing + if (true && e.IsFeatureEnabled(kX64EmitFMA)) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { @@ -2139,7 +2184,11 @@ struct MUL_ADD_V128 e.vfmadd231ps(i.dest, src1, src2); } else { // Dest not equal to anything - e.vmovdqa(i.dest, src1); + // e.vmovdqa(i.dest, + // src1); + // chrispy: vmovdqa was a domain pipeline + // hazard + e.vmovaps(i.dest, src1); e.vfmadd213ps(i.dest, src2, src3); } }); @@ -2152,7 +2201,8 @@ struct MUL_ADD_V128 // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm1, i.src3); + // e.vmovdqa(e.xmm1, i.src3); + e.vmovaps(e.xmm1, i.src3); src3 = e.xmm1; } } @@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, // ============================================================================ struct ABS_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; struct ABS_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); + e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); } }; struct ABS_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); @@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128 */ e.vstmxcsr(mxcsr_storage); + e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.mov(e.eax, 8); auto src1v = e.xmm0; @@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128 // so that in the future this could be optimized away if the top is known to // be zero. Right now im not sure that happens often though and its // currently not worth it also, maybe pre-and if constant - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm3, src1v, e.xmm2); + e.vandps(e.xmm2, src2v, e.xmm2); e.and_(mxcsr_storage, e.eax); e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to @@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); - // e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128 e.vcvtps2pd(e.ymm0, src1v); e.vcvtps2pd(e.ymm1, src2v); - /* - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.and_(mxcsr_storage, e.eax); - e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to - // go - - e.vcvtps2pd(e.ymm0, e.xmm3); - e.vcvtps2pd(e.ymm1, e.xmm2); - - - e.vmulpd(e.ymm5, e.ymm0, e.ymm1); - e.vextractf128(e.xmm4, e.ymm5, 1); - e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3 - e.vaddsd(e.xmm5, e.xmm5, e.xmm4); - e.not_(e.eax); - e.vaddsd(e.xmm2, e.xmm5, e.xmm3); - e.vcvtsd2ss(e.xmm1, e.xmm2); - - */ e.vmulpd(e.ymm3, e.ymm0, e.ymm1); e.vextractf128(e.xmm2, e.ymm3, 1); e.vaddpd(e.xmm3, e.xmm3, e.xmm2); @@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in - // e.vshufps(i.dest, e.xmm1, e.xmm1, 0); - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2846,10 +2876,17 @@ struct AND_I64 : Sequence> { }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpand(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + 
e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandps(dest, src2, src1); + } else { + e.vpand(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); @@ -2948,10 +2985,17 @@ struct AND_NOT_I64 struct AND_NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpandn(dest, src2, src1); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandnps(dest, src2, src1); + } else { + e.vpandn(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32, @@ -2994,10 +3038,17 @@ struct OR_I64 : Sequence> { }; struct OR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vorps(dest, src1, src2); + } else { + e.vpor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128); @@ -3039,10 +3090,17 @@ struct XOR_I64 : Sequence> { }; struct XOR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpxor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vxorps(dest, src1, src2); + } else { + e.vpxor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128); @@ -3078,8 +3136,15 @@ struct NOT_I64 : Sequence> { }; struct NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // dest = src ^ 0xFFFF... - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + + SimdDomain domain = + e.DeduceSimdDomain(i.src1.value); + if (domain == SimdDomain::FLOATING) { + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } else { + // dest = src ^ 0xFFFF... + e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } } }; EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128); @@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence> { } e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); - e.vmovaps(i.dest, e.xmm0); + e.vmovdqa(i.dest, e.xmm0); } static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. 
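The blend-based OPCODE_SELECT path above rests on the observation that a lane-wise blend and the bitwise vpandn/vpand/vpor select produce identical results whenever the selector comes out of a vector compare, i.e. every lane is all-ones or all-zeros. A small standalone intrinsics sketch of that equivalence — purely illustrative, not part of the patch — standing in for the emitted vpblendvb and the bitwise fallback with their SSE intrinsic counterparts:

#include <emmintrin.h>  // SSE2: compares, and/andnot/or
#include <smmintrin.h>  // SSE4.1: _mm_blendv_epi8
#include <cassert>
#include <cstring>

// Mirrors the vpandn/vpand/vpor fallback: dest = mask ? b : a, bit by bit.
static __m128i BitwiseSelect(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

int main() {
  __m128i x = _mm_set_epi32(1, 2, 3, 4);
  __m128i y = _mm_set_epi32(5, 2, 7, 4);
  // Compare result: each 32-bit lane is either 0 or 0xFFFFFFFF.
  __m128i mask = _mm_cmpeq_epi32(x, y);
  __m128i a = _mm_set_epi32(10, 20, 30, 40);
  __m128i b = _mm_set_epi32(50, 60, 70, 80);

  __m128i sel = BitwiseSelect(mask, a, b);
  // Blend picks b wherever the mask byte's MSB is set; with a full-lane
  // compare mask that is exactly the bitwise selection above.
  __m128i blend = _mm_blendv_epi8(a, b, mask);

  assert(std::memcmp(&sel, &blend, sizeof(sel)) == 0);
  return 0;
}

The same argument is why GetPermittedBlendForSelectV128 refuses the blend when it cannot prove the selector is a compare result: for an arbitrary bit pattern the two sequences diverge.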
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 025b4114e..6a6a56330 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { i->Remove(); result = true; } + + else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() && + i->flags == INT8_TYPE /*probably safe for int16 too*/) { + /* + chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero + */ + + v->set_zero(VEC128_TYPE); + i->Remove(); + result = true; + } + break; } case OPCODE_INSERT: diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 10862fd54..8c1cc18c2 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -9,6 +9,7 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" +#include <__msvc_int128.hpp> #include "xenia/base/byte_order.h" #include "xenia/base/profiling.h" namespace xe { @@ -22,6 +23,52 @@ using namespace xe::cpu::hir; using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; +using vmask_portion_t = uint64_t; +template +struct Valuemask_t { + vmask_portion_t bits[Ndwords]; + + static Valuemask_t create_empty(vmask_portion_t fill = 0) { + Valuemask_t result; + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = fill; + } + return result; + } + template + Valuemask_t operate(TCallable&& oper) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i]); + } + return result; + } + template + Valuemask_t operate(TCallable&& oper, Valuemask_t other) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i], other.bits[i]); + } + return result; + } + Valuemask_t operator&(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; }, + other); + } + Valuemask_t operator|(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; }, + other); + } + Valuemask_t operator^(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; }, + other); + } + Valuemask_t operator~() const { + return operate([](vmask_portion_t x) { return ~x; }, other); + } +}; SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} @@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= SimplifyBitArith(builder); iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); + iter_result |= BackpropTruncations(builder); result |= iter_result; } while (iter_result); return true; @@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) { } return false; } +bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* xored) { + unsigned tunflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX; + + Instr* xordef = xored->GetDefTunnelMovs(&tunflags); + if (!xordef) { + return false; + } + + Opcode xorop = xordef->opcode->num; + bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; + + Value* new_value = 
nullptr; + if (xorop == OPCODE_IS_FALSE) { + new_value = builder->IsTrue(xordef->src1.value); + + } else if (xorop == OPCODE_IS_TRUE) { + new_value = builder->IsFalse(xordef->src1.value); + } else if (xorop == OPCODE_COMPARE_EQ) { + new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); + + } else if (xorop == OPCODE_COMPARE_NE) { + new_value = builder->CompareEQ(xordef->src1.value, xordef->src2.value); + } // todo: other conds + + if (!new_value) { + return false; + } + + new_value->def->MoveBefore(i); + + i->Replace(need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info, 0); + i->set_src1(new_value); + + return true; +} + +bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2) { + // todo: implement + return false; +} bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { if (CheckOrXorZero(i)) { return true; } else { - if (i->src1.value == i->src2.value) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + if (SameValueOrEqualConstant(src1, src2)) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(builder->LoadZero(i->dest->type)); return true; } - uint64_t type_mask = GetScalarTypeMask(i->dest->type); - auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + + if ((nzm1 & nzm2) == + 0) { // no bits of the two sources overlap, this ought to be an OR + // cs:optimizing + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + + i->opcode = &OPCODE_OR_info; + + return true; + } + + if (nzm1 == 1ULL && nzm2 == 1ULL) { + if (constant_value) { + return CheckBooleanXor1(i, builder, variable_value); + } else { + return CheckXorOfTwoBools(i, builder, src1, src2); + } + } + + uint64_t type_mask = GetScalarTypeMask(i->dest->type); if (!constant_value) return false; @@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i, } bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) { retry_and_simplification: + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); if (!constant_value) { // added this for srawi - uint64_t nzml = GetScalarNZM(i->src1.value); - uint64_t nzmr = GetScalarNZM(i->src2.value); + ScalarNZM nzml = GetScalarNZM(i->src1.value); + ScalarNZM nzmr = GetScalarNZM(i->src2.value); if ((nzml & nzmr) == 0) { i->Replace(&OPCODE_ASSIGN_info, 0); @@ -524,9 +642,15 @@ retry_and_simplification: // todo: check if masking with mask that covers all of zero extension source uint64_t type_mask = GetScalarTypeMask(i->dest->type); - // if masking with entire width, pointless instruction so become an assign - if (constant_value->AsUint64() == type_mask) { + ScalarNZM nzm = GetScalarNZM(variable_value); + // if masking with entire width, pointless instruction so become an assign + // chrispy: changed this to use the nzm instead, this optimizes away many and + // instructions + // chrispy: changed this again. detecting if nzm is a subset of and mask, if + // so eliminate ex: (bool value) & 0xff = (bool value). 
the nzm is not equal + // to the mask, but it is a subset so can be elimed + if ((constant_value->AsUint64() & nzm) == nzm) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(variable_value); return true; @@ -555,7 +679,7 @@ retry_and_simplification: Value* or_left = true_variable_def->src1.value; Value* or_right = true_variable_def->src2.value; - uint64_t left_nzm = GetScalarNZM(or_left); + ScalarNZM left_nzm = GetScalarNZM(or_left); // use the other or input instead of the or output if ((constant_value->AsUint64() & left_nzm) == 0) { @@ -565,7 +689,7 @@ retry_and_simplification: return true; } - uint64_t right_nzm = GetScalarNZM(or_right); + ScalarNZM right_nzm = GetScalarNZM(or_right); if ((constant_value->AsUint64() & right_nzm) == 0) { i->Replace(&OPCODE_AND_info, 0); @@ -593,6 +717,21 @@ retry_and_simplification: return false; } bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry + // from any bits to any others, make this an OR + + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + i->opcode = &OPCODE_OR_info; + return true; + } + auto [definition, added_constant] = i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info); @@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, return false; } - uint64_t nzm_for_var = GetScalarNZM(variable); + ScalarNZM nzm_for_var = GetScalarNZM(variable); Opcode cmpop = i->opcode->num; uint64_t constant_unpacked = constant_value->AsUint64(); uint64_t signbit_for_var = GetScalarSignbitMask(variable->type); @@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, i->set_src1(variable); return true; } + + if (cmpop == OPCODE_COMPARE_ULE && + constant_unpacked == + 0) { // less than or equal to zero = (== 0) = IS_FALSE + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(variable); + return true; + } // todo: OPCODE_COMPARE_NE too? if (cmpop == OPCODE_COMPARE_EQ && def_opcode == OPCODE_NOT) { // i see this a lot around addic insns @@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, return false; } - uint64_t input_nzm = GetScalarNZM(input); + ScalarNZM input_nzm = GetScalarNZM(input); if (istrue && input_nzm == 1) { // doing istrue on a value thats already a bool bitwise @@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, input_def = input_def->GetDestDefSkipAssigns();*/ return false; } +bool SimplificationPass::CheckSHRByConst(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* variable, + unsigned int shift) { + if (shift >= 3 && shift <= 6) { + // is possible shift of lzcnt res, do some tunneling + + unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX | + MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX | + MOVTUNNEL_AND32FF; + + Instr* vardef = variable->def; + + hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags); + + if (var_def && var_def->opcode == &OPCODE_CNTLZ_info) { + Value* lz_input = var_def->src1.value; + TypeName type_of_lz_input = lz_input->type; + size_t shift_for_zero = + xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT); + + if (shift == shift_for_zero) { + // we ought to be OPCODE_IS_FALSE! 
+ /* + explanation: if an input to lzcnt is zero, the result will be the + bit size of the input type, which is always a power of two any + nonzero result will be less than the bit size so you can test for + zero by doing, for instance with a 32 bit value, lzcnt32(input) >> 5 + this is a very common way of testing for zero without branching on + ppc, and the xb360 ppc compiler used it a lot we optimize this away + for simplicity and to enable further optimizations, but actually this + is also quite fast on modern x86 processors as well, for instance on + zen 2 the rcp through of lzcnt is 0.25, meaning four can be executed + in one cycle + + */ + + if (variable->type != INT8_TYPE) { + Value* isfalsetest = builder->IsFalse(lz_input); + + isfalsetest->def->MoveBefore(i); + i->Replace(&OPCODE_ZERO_EXTEND_info, 0); + i->set_src1(isfalsetest); + + } else { + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(lz_input); + } + return true; + } + } + } + return false; +} +bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* shr_lhs = i->src1.value; + Value* shr_rhs = i->src2.value; + if (!shr_lhs || !shr_rhs) return false; + if (shr_rhs->IsConstant()) { + return CheckSHRByConst(i, builder, shr_lhs, shr_rhs->AsUint32()); + } + + return false; +} + +bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* l = i->src1.value; + Value* r = i->src2.value; + ScalarNZM l_nzm = GetScalarNZM(l); + uint64_t signbit_mask = GetScalarSignbitMask(l->type); + size_t typesize = GetTypeSize(l->type); + + /* + todo: folding this requires the mask of constant bits + if (r->IsConstant()) { + uint32_t const_r = r->AsUint32(); + + if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to + fill the result with the signbit of the input. + + + } + }*/ + if ((l_nzm & signbit_mask) == 0) { // signbit will never be set, might as + // well be an SHR. (this does happen) + i->opcode = &OPCODE_SHR_info; + + return true; + } + + return false; +} bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { bool result = false; auto block = builder->first_block(); @@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { // vector types use the same opcodes as scalar ones for AND/OR/XOR! 
we // don't handle these in our simplifications, so skip if (i->dest && IsScalarIntegralType(i->dest->type)) { - if (i->opcode == &OPCODE_OR_info) { + Opcode iop = i->opcode->num; + + if (iop == OPCODE_OR) { result |= CheckOr(i, builder); - } else if (i->opcode == &OPCODE_XOR_info) { + } else if (iop == OPCODE_XOR) { result |= CheckXor(i, builder); - } else if (i->opcode == &OPCODE_AND_info) { + } else if (iop == OPCODE_AND) { result |= CheckAnd(i, builder); - } else if (i->opcode == &OPCODE_ADD_info) { + } else if (iop == OPCODE_ADD) { result |= CheckAdd(i, builder); - } else if (IsScalarBasicCmp(i->opcode->num)) { + } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (i->opcode == &OPCODE_IS_FALSE_info || - i->opcode == &OPCODE_IS_TRUE_info) { + } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); + } else if (iop == OPCODE_SHR) { + result |= CheckSHR(i, builder); + } else if (iop == OPCODE_SHA) { + result |= CheckSAR(i, builder); } } @@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) { } return false; } - bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // Run over the instructions and rename assigned variables: // v1 = v0 @@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { while (block) { auto i = block->instr_head; while (i) { - uint32_t signature = i->opcode->signature; - if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) { bool modified = false; - i->set_src1(CheckValue(i->src1.value, modified)); + i->set_srcN(CheckValue(value, modified), idx); result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src2(CheckValue(i->src2.value, modified)); - result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src3(CheckValue(i->src3.value, modified)); - result |= modified; - } + }); i = i->next; } @@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { return result; } +struct TruncateSimplifier { + TypeName type_from, type_to; + uint32_t sizeof_from, sizeof_to; + uint32_t bit_sizeof_from, bit_sizeof_to; + uint64_t typemask_from, typemask_to; + hir::HIRBuilder* builder; + hir::Instr* truncate_instr; + hir::Value* truncated_value; + hir::Instr* truncated_value_def; +}; +bool SimplificationPass::BackpropTruncations(hir::Instr* i, + hir::HIRBuilder* builder) { + if (i->opcode != &OPCODE_TRUNCATE_info) { + return false; + } + TypeName type_from = i->src1.value->type; + TypeName type_to = i->dest->type; + + uint32_t sizeof_from = static_cast(GetTypeSize(type_from)); + uint32_t sizeof_to = static_cast(GetTypeSize(type_to)); + + Instr* input_def = i->src1.value->GetDefSkipAssigns(); + if (!input_def) { + return false; + } + Opcode input_opc = input_def->opcode->num; + + if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) { + uint32_t src2_shift = input_def->src2.value->AsUint32(); + if (src2_shift < (sizeof_to * CHAR_BIT)) { + Value* truncated_preshift = + builder->Truncate(input_def->src1.value, type_to); + + truncated_preshift->def->MoveBefore(i); + i->Replace(&OPCODE_SHL_info, 0); + i->set_src1(truncated_preshift); + i->set_src2(input_def->src2.value); + return true; + } + } + if (input_opc == OPCODE_LOAD_CONTEXT) { + if (sizeof_from == 8 && sizeof_to == 4) { + Value* 
loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE); + loadof->def->MoveBefore(input_def); + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(loadof); + return true; + } + } + + return false; +} +bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + result |= BackpropTruncations(i, builder); + i = i->next; + } + block = block->next; + } + return result; +} Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index d805ea27c..fe8de8474 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); bool SimplifyBitArith(hir::HIRBuilder* builder); + bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder); + bool BackpropTruncations(hir::HIRBuilder* builder); // handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); @@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass { bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder); bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder); bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* variable, unsigned int shift); + + bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder); + // called by CheckXor, handles transforming a 1 bit value xored against 1 + bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* xored); + bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2); + // for rlwinm bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder); bool TransformANDROLORSHLSeq( diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 4096d8e4a..118895719 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -14,38 +14,15 @@ namespace xe { namespace cpu { namespace hir { - -void Instr::set_src1(Value* value) { - if (src1.value == value) { +void Instr::set_srcN(Value* value, uint32_t idx) { + if (srcs[idx].value == value) { return; } - if (src1_use) { - src1.value->RemoveUse(src1_use); + if (srcs_use[idx]) { + srcs[idx].value->RemoveUse(srcs_use[idx]); } - src1.value = value; - src1_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src2(Value* value) { - if (src2.value == value) { - return; - } - if (src2_use) { - src2.value->RemoveUse(src2_use); - } - src2.value = value; - src2_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src3(Value* value) { - if (src3.value == value) { - return; - } - if (src3_use) { - src3.value->RemoveUse(src3_use); - } - src3.value = value; - src3_use = value ? value->AddUse(block->arena, this) : NULL; + srcs[idx].value = value; + srcs_use[idx] = value ? 
value->AddUse(block->arena, this) : nullptr; } void Instr::MoveBefore(Instr* other) { @@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() { } return current_def; } +Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) { + unsigned int traversed_types = 0; + unsigned int in_flags = *tunnel_flags; + Instr* current_def = this; + + while (true) { + Opcode op = current_def->opcode->num; + + switch (op) { + case OPCODE_ASSIGN: { + if ((in_flags & MOVTUNNEL_ASSIGNS)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_ASSIGNS; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_ZERO_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVZX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVZX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_SIGN_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVSX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVSX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_TRUNCATE: { + if ((in_flags & MOVTUNNEL_TRUNCATE)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_TRUNCATE; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_AND: { + if ((in_flags & MOVTUNNEL_AND32FF)) { + auto [constant, nonconst] = + current_def->BinaryValueArrangeAsConstAndVar(); + if (!constant || constant->AsUint64() != 0xFFFFFFFF) { + goto exit_loop; + } + current_def = nonconst->def; + traversed_types |= MOVTUNNEL_AND32FF; + + } else { + goto exit_loop; + } + break; + } + default: + goto exit_loop; + } + if (!current_def) { + goto exit_loop; + } + } +exit_loop: + *tunnel_flags = traversed_types; + return current_def; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 1f09ee341..db3c78922 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -25,6 +25,14 @@ namespace hir { class Block; class Label; +// todo: better name +enum MovTunnel { + MOVTUNNEL_ASSIGNS = 1, + MOVTUNNEL_MOVZX = 2, + MOVTUNNEL_MOVSX = 4, + MOVTUNNEL_TRUNCATE = 8, + MOVTUNNEL_AND32FF = 16, // tunnel through and with 0xFFFFFFFF +}; class Instr { public: @@ -44,17 +52,28 @@ class Instr { } Op; Value* dest; - Op src1; - Op src2; - Op src3; + union { + struct { + Op src1; + Op src2; + Op src3; + }; + Op srcs[3]; + }; + union { + struct { + Value::Use* src1_use; + Value::Use* src2_use; + Value::Use* src3_use; + }; + Value::Use* srcs_use[3]; + }; + void set_srcN(Value* value, uint32_t idx); + void set_src1(Value* value) { set_srcN(value, 0); } - Value::Use* src1_use; - Value::Use* src2_use; - Value::Use* src3_use; + void set_src2(Value* value) { set_srcN(value, 1); } - void set_src1(Value* value); - void set_src2(Value* value); - void set_src3(Value* value); + void set_src3(Value* value) { set_srcN(value, 2); } void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); @@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr } Instr* GetDestDefSkipAssigns(); + Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags); + // returns [def op, constant] std::pair BinaryValueArrangeByDefOpAndConstant( const OpcodeInfo* op_ptr) { @@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr } return result; } + /* + Invokes the provided lambda callback on each operand that is a Value. 
Callback + is invoked with Value*, uint32_t index +*/ + template + void VisitValueOperands(TCallable&& call_for_values) { + uint32_t signature = opcode->signature; + + OpcodeSignatureType t_dest, t_src1, t_src2, t_src3; + + UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3); + + if (t_src1 == OPCODE_SIG_TYPE_V) { + call_for_values(src1.value, 0); + } + if (t_src2 == OPCODE_SIG_TYPE_V) { + call_for_values(src2.value, 1); + } + if (t_src3 == OPCODE_SIG_TYPE_V) { + call_for_values(src3.value, 2); + } + } }; } // namespace hir diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 211cd18f9..c4ebdeb2c 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() { return nullptr; } } +hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) { + if (def) { + return def->GetDestDefTunnelMovs(tunnel_flags); + } else { + return nullptr; + } +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 1d8963b64..84d121a26 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -598,6 +598,8 @@ class Value { void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); + // tunnel_flags is updated to the kinds we actually traversed + hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags); private: static bool CompareInt8(Opcode opcode, Value* a, Value* b); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 4acdaed3c..777ef568a 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -246,30 +246,7 @@ enum class PPCRegister { }; #pragma pack(push, 8) -typedef struct PPCContext_s { - // Must be stored at 0x0 for now. - // TODO(benvanik): find a nice way to describe this to the JIT. - ThreadState* thread_state; // 0x0 - // TODO(benvanik): this is getting nasty. Must be here. - uint8_t* virtual_membase; // 0x8 - - // Most frequently used registers first. - uint64_t lr; // 0x10 Link register - uint64_t ctr; // 0x18 Count register - uint64_t r[32]; // 0x20 General purpose registers - double f[32]; // 0x120 Floating-point registers - vec128_t v[128]; // 0x220 VMX128 vector registers - - // XER register: - // Split to make it easier to do individual updates. - uint8_t xer_ca; // 0xA20 - uint8_t xer_ov; // 0xA21 - uint8_t xer_so; // 0xA22 - - // Condition registers: - // These are split to make it easier to do DCE on unused stores. - uint64_t cr() const; - void set_cr(uint64_t value); +typedef struct alignas(64) PPCContext_s { union { uint32_t value; struct { @@ -395,6 +372,25 @@ typedef struct PPCContext_s { } bits; } fpscr; // Floating-point status and control register + // Most frequently used registers first. + + uint64_t r[32]; // 0x20 General purpose registers + uint64_t ctr; // 0x18 Count register + uint64_t lr; // 0x10 Link register + double f[32]; // 0x120 Floating-point registers + vec128_t v[128]; // 0x220 VMX128 vector registers + + // XER register: + // Split to make it easier to do individual updates. + uint8_t xer_ca; + uint8_t xer_ov; + uint8_t xer_so; + + // Condition registers: + // These are split to make it easier to do DCE on unused stores. 
+ uint64_t cr() const; + void set_cr(uint64_t value); + uint8_t vscr_sat; // uint32_t get_fprf() { @@ -425,7 +421,8 @@ typedef struct PPCContext_s { // Value of last reserved load uint64_t reserved_val; - + ThreadState* thread_state; + uint8_t* virtual_membase; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; void SetValueFromString(PPCRegister reg, std::string value); diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index 3816446fc..1383646e1 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -18,12 +18,50 @@ #include "xenia/cpu/processor.h" #include "xenia/xbox.h" - namespace xe { namespace cpu { thread_local ThreadState* thread_state_ = nullptr; +static void* AllocateContext() { + size_t granularity = xe::memory::allocation_granularity(); + for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) { + /* + we want our register which points to the context to have 0xE0000000 in + the low 32 bits, for checking for whether we need the 4k offset, but also + if we allocate starting from the page before we allow backends to index + negatively to get to their own backend specific data, which makes full + use of int8 displacement + + + the downside is we waste most of one granula and probably a fair bit of + the one starting at 0xE0 by using a direct virtual memory allocation + instead of malloc + */ + uintptr_t context_pre = + ((static_cast(pos32) << 32) | 0xE0000000) - granularity; + + void* p = memory::AllocFixed( + (void*)context_pre, granularity + sizeof(ppc::PPCContext), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + if (p) { + return reinterpret_cast(p) + + granularity; // now we have a ctx ptr with the e0 constant in low, + // and one page allocated before it + } + } + + assert_always("giving up on allocating context, likely leaking contexts"); + return nullptr; +} + +static void FreeContext(void* ctx) { + char* true_start_of_ctx = &reinterpret_cast( + ctx)[-static_cast(xe::memory::allocation_granularity())]; + memory::DeallocFixed(true_start_of_ctx, 0, + memory::DeallocationType::kRelease); +} + ThreadState::ThreadState(Processor* processor, uint32_t thread_id, uint32_t stack_base, uint32_t pcr_address) : processor_(processor), @@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, backend_data_ = processor->backend()->AllocThreadData(); // Allocate with 64b alignment. - context_ = memory::AlignedAlloc(64); + + context_ = reinterpret_cast(AllocateContext()); // memory::AlignedAlloc(64); + processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -62,8 +102,10 @@ ThreadState::~ThreadState() { if (thread_state_ == this) { thread_state_ = nullptr; } - - memory::AlignedFree(context_); + if (context_) { + FreeContext(reinterpret_cast(context_)); + } + // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) {
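The allocator above gives every PPCContext two properties the rest of the patch leans on: the low 32 bits of the context address are always 0xE0000000, which is what lets the memory sequences compare guest addresses against GetContextReg().cvt32() instead of an immediate, and the allocation granule reserved directly below the context is backend-private storage reachable with small negative displacements through the new Backend::InitializeBackendContext() hook. A minimal sketch of how a backend might use that hook; the BackendScratch layout is purely illustrative and not something the patch defines:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical backend-private data kept in the granule below the context.
struct BackendScratch {
  uint64_t saved_host_rsp;
  uint64_t scratch[7];
};

static BackendScratch* GetBackendScratch(void* ctx) {
  // ctx points at the start of the PPCContext; the bytes immediately below it
  // belong to the backend, so a small negative displacement reaches them.
  return reinterpret_cast<BackendScratch*>(static_cast<uint8_t*>(ctx) -
                                           sizeof(BackendScratch));
}

// What an InitializeBackendContext() override might look like.
void InitializeBackendContextSketch(void* ctx) {
  // Low 32 bits of the context address are 0xE0000000 by construction.
  assert((reinterpret_cast<uintptr_t>(ctx) & 0xFFFFFFFFull) == 0xE0000000ull);
  std::memset(GetBackendScratch(ctx), 0, sizeof(BackendScratch));
}

In emitted code the same storage would then be addressable as ptr[GetContextReg() - disp8], which is the point of reserving it immediately below the context rather than allocating it separately.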