Merge pull request #52 from chrisps/canary_experimental

Fix previous batch of CPU changes
Radosław Gliński 2022-07-18 09:20:35 +02:00 committed by GitHub
commit 3757580f45
15 changed files with 856 additions and 170 deletions

View File

@ -63,6 +63,10 @@ class Backend {
virtual void InstallBreakpoint(Breakpoint* breakpoint) {}
virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {}
virtual void UninstallBreakpoint(Breakpoint* breakpoint) {}
// ctx points to the start of a PPCContext. The region from
// ctx - page_allocation_granularity up to the start of ctx may be used by the
// backend to store whatever data it wants.
virtual void InitializeBackendContext(void* ctx) {}
protected:
Processor* processor_ = nullptr;
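For context on the new InitializeBackendContext hook, a minimal standalone sketch (not part of this commit) of how a backend might place per-thread data in the granule just below ctx; BackendScratch and GetBackendScratch are hypothetical names used only for illustration.

#include <cstdint>

// Hypothetical per-thread data a backend might keep immediately below the
// guest context; the layout is an assumption for illustration only.
struct BackendScratch {
  uint64_t resolve_scratch;
  uint8_t flags;
};

// ctx points at the PPCContext, but the allocation actually begins one
// allocation granule earlier, so small negative offsets from ctx are valid.
inline BackendScratch* GetBackendScratch(void* ctx) {
  return reinterpret_cast<BackendScratch*>(static_cast<uint8_t*>(ctx) -
                                           sizeof(BackendScratch));
}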

View File

@ -32,6 +32,9 @@
#include "xenia/cpu/cpu_flags.h"
#include "xenia/cpu/function.h"
#include "xenia/cpu/function_debug_info.h"
#include "xenia/cpu/hir/instr.h"
#include "xenia/cpu/hir/opcodes.h"
#include "xenia/cpu/hir/value.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/symbol.h"
#include "xenia/cpu/thread_state.h"
@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() {
}
uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
uint32_t str_ptr = uint32_t(thread_state->context()->r[3]);
// uint16_t str_len = uint16_t(thread_state->context()->r[4]);
auto str = thread_state->memory()->TranslateVirtual<const char*>(str_ptr);
@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
}
uint64_t TrapDebugBreak(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
XELOGE("tw/td forced trap hit! This should be a crash!");
if (cvars::break_on_debugbreak) {
xe::debugging::Break();
@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
// TODO(benvanik): required?
assert_not_zero(target_address);
@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) {
MovMem64(addr + 8, v.high);
return ptr[addr];
}
static bool IsVectorCompare(const Instr* i) {
hir::Opcode op = i->opcode->num;
return op >= hir::OPCODE_VECTOR_COMPARE_EQ &&
op <= hir::OPCODE_VECTOR_COMPARE_UGE;
}
static bool IsFlaggedVectorOp(const Instr* i) {
if (IsVectorCompare(i)) {
return true;
}
hir::Opcode op = i->opcode->num;
using namespace hir;
switch (op) {
case OPCODE_VECTOR_SUB:
case OPCODE_VECTOR_ADD:
case OPCODE_SWIZZLE:
return true;
}
return false;
}
static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) {
switch (df->flags) { // check what datatype we compared as
case hir::INT16_TYPE:
case hir::INT32_TYPE:
case hir::INT8_TYPE:
case hir::INT64_TYPE:
return SimdDomain::INTEGER;
case hir::FLOAT32_TYPE:
case hir::FLOAT64_TYPE: // pretty sure float64 doesn't occur with vectors;
// here for completeness
return SimdDomain::FLOATING;
default:
return SimdDomain::DONTCARE;
}
return SimdDomain::DONTCARE;
}
// this list is incomplete
static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) {
using namespace hir;
switch (opc) {
case OPCODE_LOAD_VECTOR_SHL:
case OPCODE_LOAD_VECTOR_SHR:
case OPCODE_VECTOR_CONVERT_F2I:
case OPCODE_VECTOR_MIN: // there is apparently no FLOAT32_TYPE for min/max
// flags
case OPCODE_VECTOR_MAX:
case OPCODE_VECTOR_SHL:
case OPCODE_VECTOR_SHR:
case OPCODE_VECTOR_SHA:
case OPCODE_VECTOR_ROTATE_LEFT:
case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this
case OPCODE_EXTRACT:
case OPCODE_INSERT: // apparently no f32 type for these two
return true;
}
return false;
}
static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) {
using namespace hir;
switch (opc) {
case OPCODE_VECTOR_CONVERT_I2F:
case OPCODE_VECTOR_DENORMFLUSH:
case OPCODE_DOT_PRODUCT_3:
case OPCODE_DOT_PRODUCT_4:
case OPCODE_LOG2:
case OPCODE_POW2:
case OPCODE_RECIP:
case OPCODE_ROUND:
case OPCODE_SQRT:
case OPCODE_MUL:
case OPCODE_MUL_SUB:
case OPCODE_MUL_ADD:
case OPCODE_ABS:
return true;
}
return false;
}
SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
hir::Instr* df = for_value->def;
if (!df) {
// todo: visit uses to figure out domain
return SimdDomain::DONTCARE;
} else {
SimdDomain result = SimdDomain::DONTCARE;
if (IsFlaggedVectorOp(df)) {
result = GetDomainForFlaggedVectorOp(df);
} else if (IsDefiniteIntegerDomainOpcode(df->opcode->num)) {
result = SimdDomain::INTEGER;
} else if (IsDefiniteFloatingDomainOpcode(df->opcode->num)) {
result = SimdDomain::FLOATING;
}
// todo: check if still dontcare, if so, visit uses of the value to figure
// it out
return result;
}
return SimdDomain::DONTCARE;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -44,7 +44,39 @@ enum RegisterFlags {
REG_DEST = (1 << 0),
REG_ABCD = (1 << 1),
};
/*
SSE/AVX/AVX512 has separate move and shuffle instructions for float data and
for int data for a reason: most processors implement two distinct pipelines,
one for the integer domain and one for the floating-point domain. Currently,
xenia makes no distinction between the two. Crossing domains is expensive:
on Zen processors the penalty is one cycle each time you cross, plus the two
pipelines need to synchronize. Often xenia will emit an integer instruction,
then a floating instruction, then integer again; this effectively adds at
least two cycles to the time taken. These values will in the future be used
as tags on operations that tell them which domain to operate in, if it is at
all possible to avoid crossing.
*/
enum class SimdDomain : uint32_t {
FLOATING,
INTEGER,
DONTCARE,
CONFLICTING // just used as a special result for PickDomain, different from
// DONTCARE (DONTCARE means we just don't know the domain;
// CONFLICTING means it is used in multiple domains)
};
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
if (dom1 == dom2) {
return dom1;
}
if (dom1 == SimdDomain::DONTCARE) {
return dom2;
}
if (dom2 == SimdDomain::DONTCARE) {
return dom1;
}
return SimdDomain::CONFLICTING;
}
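To make the combination rule concrete, here is a small standalone sketch (not part of the commit) that mirrors the SimdDomain enum and PickDomain2 above and checks how two deduced operand domains resolve; the example inputs are made up.

#include <cassert>
#include <cstdint>

enum class SimdDomain : uint32_t { FLOATING, INTEGER, DONTCARE, CONFLICTING };

// Same combination rule as PickDomain2 above: agreement wins, DONTCARE defers
// to the other operand, and genuine disagreement is CONFLICTING.
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
  if (dom1 == dom2) return dom1;
  if (dom1 == SimdDomain::DONTCARE) return dom2;
  if (dom2 == SimdDomain::DONTCARE) return dom1;
  return SimdDomain::CONFLICTING;
}

int main() {
  // e.g. an AND of two values that were both produced by float compares
  assert(PickDomain2(SimdDomain::FLOATING, SimdDomain::FLOATING) ==
         SimdDomain::FLOATING);
  // one operand of unknown domain defers to the known one
  assert(PickDomain2(SimdDomain::DONTCARE, SimdDomain::INTEGER) ==
         SimdDomain::INTEGER);
  // mixed int/float usage cannot be resolved to a single domain
  assert(PickDomain2(SimdDomain::INTEGER, SimdDomain::FLOATING) ==
         SimdDomain::CONFLICTING);
  return 0;
}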
enum XmmConst {
XMMZero = 0,
XMMOne,
@ -122,7 +154,7 @@ enum XmmConst {
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};
@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop =
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
FunctionDebugInfo* debug_info() const { return debug_info_; }
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
protected:
void* Emplace(const EmitFunctionInfo& func_info,

View File

@ -12,11 +12,11 @@
#include <algorithm>
#include <cstring>
#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/base/cvar.h"
DEFINE_bool(
elide_e0_check, false,
@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
e.xor_(e.eax, e.eax);
e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const);
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
e.add(e.eax, e.edx);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.xor_(e.eax, e.eax);
e.cmp(guest.reg().cvt32(), 0xE0000000);
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
@ -1113,7 +1119,7 @@ struct CACHE_CONTROL
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.movzx(e.eax, e.al);
e.shl(e.eax, 12);
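For reference, a standalone scalar sketch (not from the commit) of the host-address computation these rewritten sequences perform; it assumes, as the thread_state.cc change later in this commit arranges, that the low 32 bits of the context register are exactly 0xE0000000.

#include <cstdint>

// Host address computed when the allocation granularity is > 4 KB and the
// 0xE0000000 mirror cannot be produced by memory mapping. context_low32
// stands in for the low 32 bits of the context register, which this commit
// guarantees to be 0xE0000000, so no separate immediate is needed.
inline uint64_t EmulateE0Offset(uint64_t membase, uint32_t guest_addr,
                                uint32_t offset_const,
                                uint32_t context_low32 = 0xE0000000u) {
  uint32_t address = guest_addr + offset_const;
  // Addresses at or above 0xE0000000 are shifted up by one 4 KB page.
  uint32_t adjust = (address >= context_low32) ? 0x1000u : 0u;
  return membase + address + adjust;
}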

View File

@ -1826,7 +1826,7 @@ struct PERMUTE_I32
}
}
};
//todo: use this on const src1
// todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
@ -1984,7 +1984,11 @@ struct SWIZZLE
} else {
src1 = i.src1;
}
if (element_type == INT32_TYPE) {
e.vpshufd(i.dest, src1, swizzle_mask);
} else if (element_type == FLOAT32_TYPE) {
e.vshufps(i.dest, src1, src1, swizzle_mask);
}
} else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) {
assert_always();
} else {

View File

@ -717,6 +717,9 @@ struct SELECT_V128_I8
static void Emit(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): find a shorter sequence.
// dest = src1 != 0 ? src2 : src3
/*
chrispy: this is dead code, this sequence is never emitted
*/
e.movzx(e.eax, i.src1);
e.vmovd(e.xmm1, e.eax);
e.vpbroadcastd(e.xmm1, e.xmm1);
@ -737,11 +740,46 @@ struct SELECT_V128_I8
e.vpor(i.dest, e.xmm1);
}
};
enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps };
static bool IsVectorCompare(const Instr* i) {
Opcode op = i->opcode->num;
return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE;
}
/*
OPCODE_SELECT does a bit-by-bit selection; however, if the selector is the
result of a comparison, or if each element may only be all ones or 0, we may
use a blend instruction instead.
*/
static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) {
const Instr* df = src1v->def;
if (!df) {
return PermittedBlend::NotPermitted;
} else {
if (!IsVectorCompare(df)) {
return PermittedBlend::NotPermitted; // todo: check ors, ands of
// condition
} else {
switch (df->flags) { // check what datatype we compared as
case INT16_TYPE:
case INT32_TYPE:
case INT8_TYPE:
return PermittedBlend::Int8; // use vpblendvb
case FLOAT32_TYPE:
return PermittedBlend::Ps; // use vblendvps
default: // unknown type! just ignore
return PermittedBlend::NotPermitted;
}
}
}
}
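A small standalone model (not part of the commit) of why the blend is safe when every lane of the selector is all-ones or all-zeros, as it is for a vector-compare result: the bitwise select emitted in the fallback path and a top-bit blend such as vblendvps/vpblendvb then agree. The byte-granular vpblendvb is fine for 16- and 32-bit compares because every byte within a compared lane carries the same value.

#include <cassert>
#include <cstdint>

// Scalar model over one 32-bit lane.
inline uint32_t BitwiseSelect(uint32_t mask, uint32_t a, uint32_t b) {
  // matches the fallback path: dest = (~mask & a) | (mask & b)
  return (~mask & a) | (mask & b);
}
inline uint32_t BlendByTopBit(uint32_t mask, uint32_t a, uint32_t b) {
  // matches vblendvps/vpblendvb semantics: the selector's top bit picks b
  return (mask & 0x80000000u) ? b : a;
}

int main() {
  const uint32_t a = 0x11223344u, b = 0xAABBCCDDu;
  const uint32_t masks[] = {0x00000000u, 0xFFFFFFFFu};
  for (uint32_t mask : masks) {
    assert(BitwiseSelect(mask, a, b) == BlendByTopBit(mask, a, b));
  }
  return 0;
}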
struct SELECT_V128_V128
: Sequence<SELECT_V128_V128,
I<OPCODE_SELECT, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1;
PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value);
// todo: detect whether src1 is only 0 or all ones per element and use blends
// if so; currently we only detect cmps
if (i.src1.is_constant) {
e.LoadConstantXmm(src1, i.src1.constant());
}
@ -756,11 +794,17 @@ struct SELECT_V128_V128
e.LoadConstantXmm(src3, i.src3.constant());
}
if (mayblend == PermittedBlend::Int8) {
e.vpblendvb(i.dest, src2, src3, src1);
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32,
SELECT_I64, SELECT_F32, SELECT_F64, SELECT_V128_I8,
@ -2122,7 +2166,8 @@ struct MUL_ADD_V128
// TODO(benvanik): the vfmadd sequence produces slightly different results
// than vmul+vadd and it'd be nice to know why. Until we know, it's
// disabled so tests pass.
if (false && e.IsFeatureEnabled(kX64EmitFMA)) {
// chrispy: reenabled, i have added the DAZ behavior that was missing
if (true && e.IsFeatureEnabled(kX64EmitFMA)) {
EmitCommutativeBinaryXmmOp(e, i,
[&i](X64Emitter& e, const Xmm& dest,
const Xmm& src1, const Xmm& src2) {
@ -2139,7 +2184,11 @@ struct MUL_ADD_V128
e.vfmadd231ps(i.dest, src1, src2);
} else {
// Dest not equal to anything
e.vmovdqa(i.dest, src1);
// e.vmovdqa(i.dest, src1);
// chrispy: vmovdqa was a domain pipeline hazard
e.vmovaps(i.dest, src1);
e.vfmadd213ps(i.dest, src2, src3);
}
});
@ -2152,7 +2201,8 @@ struct MUL_ADD_V128
// If i.dest == i.src3, back up i.src3 so we don't overwrite it.
src3 = i.src3;
if (i.dest == i.src3) {
e.vmovdqa(e.xmm1, i.src3);
// e.vmovdqa(e.xmm1, i.src3);
e.vmovaps(e.xmm1, i.src3);
src3 = e.xmm1;
}
}
@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32,
// ============================================================================
struct ABS_F32 : Sequence<ABS_F32, I<OPCODE_ABS, F32Op, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
}
};
struct ABS_F64 : Sequence<ABS_F64, I<OPCODE_ABS, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
}
};
struct ABS_V128 : Sequence<ABS_V128, I<OPCODE_ABS, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
}
};
EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128);
@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128
*/
e.vstmxcsr(mxcsr_storage);
e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask));
e.mov(e.eax, 8);
auto src1v = e.xmm0;
@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128
// so that in the future this could be optimized away if the top is known to
// be zero. Right now I'm not sure that happens often, though, and it's
// currently not worth it. Also, maybe pre-AND if constant.
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm3, src1v, e.xmm2);
e.vandps(e.xmm2, src2v, e.xmm2);
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan);
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
e.vbroadcastss(i.dest, e.xmm1);
e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128
e.vcvtps2pd(e.ymm0, src1v);
e.vcvtps2pd(e.ymm1, src2v);
/*
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go
e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);
e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
e.vextractf128(e.xmm4, e.ymm5, 1);
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
e.vcvtsd2ss(e.xmm1, e.xmm2);
*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.vbroadcastss(i.dest, e.xmm1);
e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2846,9 +2876,16 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vandps(dest, src2, src1);
} else {
e.vpand(dest, src2, src1);
}
});
}
};
@ -2948,9 +2985,16 @@ struct AND_NOT_I64
struct AND_NOT_V128
: Sequence<AND_NOT_V128, I<OPCODE_AND_NOT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vandnps(dest, src2, src1);
} else {
e.vpandn(dest, src2, src1);
}
});
}
};
@ -2994,9 +3038,16 @@ struct OR_I64 : Sequence<OR_I64, I<OPCODE_OR, I64Op, I64Op, I64Op>> {
};
struct OR_V128 : Sequence<OR_V128, I<OPCODE_OR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vorps(dest, src1, src2);
} else {
e.vpor(dest, src1, src2);
}
});
}
};
@ -3039,9 +3090,16 @@ struct XOR_I64 : Sequence<XOR_I64, I<OPCODE_XOR, I64Op, I64Op, I64Op>> {
};
struct XOR_V128 : Sequence<XOR_V128, I<OPCODE_XOR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vxorps(dest, src1, src2);
} else {
e.vpxor(dest, src1, src2);
}
});
}
};
@ -3078,9 +3136,16 @@ struct NOT_I64 : Sequence<NOT_I64, I<OPCODE_NOT, I64Op, I64Op>> {
};
struct NOT_V128 : Sequence<NOT_V128, I<OPCODE_NOT, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
SimdDomain domain =
e.DeduceSimdDomain(i.src1.value);
if (domain == SimdDomain::FLOATING) {
e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
} else {
// dest = src ^ 0xFFFF...
e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128);
@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
e.vmovaps(i.dest, e.xmm0);
e.vmovdqa(i.dest, e.xmm0);
}
static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
// Almost all instances are shamt = 1, but non-constant.

View File

@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
i->Remove();
result = true;
}
else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
i->flags == INT8_TYPE /*probably safe for int16 too*/) {
/*
chrispy: hoisted this check here from x64_seq_vector: if src1 is not
constant but src2 and src3 are both zero, then we know the result will
always be zero
*/
v->set_zero(VEC128_TYPE);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:

View File

@ -9,6 +9,7 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include <__msvc_int128.hpp>
#include "xenia/base/byte_order.h"
#include "xenia/base/profiling.h"
namespace xe {
@ -22,6 +23,52 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr;
using xe::cpu::hir::Value;
using vmask_portion_t = uint64_t;
template <uint32_t Ndwords>
struct Valuemask_t {
vmask_portion_t bits[Ndwords];
static Valuemask_t create_empty(vmask_portion_t fill = 0) {
Valuemask_t result;
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = fill;
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i]);
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i], other.bits[i]);
}
return result;
}
Valuemask_t operator&(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
other);
}
Valuemask_t operator|(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
other);
}
Valuemask_t operator^(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
other);
}
Valuemask_t operator~() const {
return operate([](vmask_portion_t x) { return ~x; });
}
};
SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= SimplifyBitArith(builder);
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= BackpropTruncations(builder);
result |= iter_result;
} while (iter_result);
return true;
@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) {
}
return false;
}
bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* xored) {
unsigned tunflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX;
Instr* xordef = xored->GetDefTunnelMovs(&tunflags);
if (!xordef) {
return false;
}
Opcode xorop = xordef->opcode->num;
bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0;
Value* new_value = nullptr;
if (xorop == OPCODE_IS_FALSE) {
new_value = builder->IsTrue(xordef->src1.value);
} else if (xorop == OPCODE_IS_TRUE) {
new_value = builder->IsFalse(xordef->src1.value);
} else if (xorop == OPCODE_COMPARE_EQ) {
new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value);
} else if (xorop == OPCODE_COMPARE_NE) {
new_value = builder->CompareEQ(xordef->src1.value, xordef->src2.value);
} // todo: other conds
if (!new_value) {
return false;
}
new_value->def->MoveBefore(i);
i->Replace(need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info, 0);
i->set_src1(new_value);
return true;
}
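A tiny standalone check (not from the commit) of the identity CheckBooleanXor1 relies on: XOR with 1 inverts a 0/1-valued result, turning IS_FALSE into IS_TRUE and COMPARE_EQ into COMPARE_NE.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t values[] = {0ull, 1ull, 42ull, ~0ull};
  for (uint64_t a : values) {
    for (uint64_t b : values) {
      // IS_FALSE(a) ^ 1 == IS_TRUE(a)
      uint64_t is_false = (a == 0) ? 1ull : 0ull;
      assert((is_false ^ 1ull) == ((a != 0) ? 1ull : 0ull));
      // COMPARE_EQ(a, b) ^ 1 == COMPARE_NE(a, b)
      uint64_t eq = (a == b) ? 1ull : 0ull;
      assert((eq ^ 1ull) == ((a != b) ? 1ull : 0ull));
    }
  }
  return 0;
}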
bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* b1, hir::Value* b2) {
// todo: implement
return false;
}
bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
if (CheckOrXorZero(i)) {
return true;
} else {
if (i->src1.value == i->src2.value) {
Value* src1 = i->src1.value;
Value* src2 = i->src2.value;
if (SameValueOrEqualConstant(src1, src2)) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(builder->LoadZero(i->dest->type));
return true;
}
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
auto [constant_value, variable_value] =
i->BinaryValueArrangeAsConstAndVar();
ScalarNZM nzm1 = GetScalarNZM(src1);
ScalarNZM nzm2 = GetScalarNZM(src2);
if ((nzm1 & nzm2) ==
0) { // no bits of the two sources overlap, this ought to be an OR
// cs:optimizing
/* i->Replace(&OPCODE_OR_info, 0);
i->set_src1(src1);
i->set_src2(src2);*/
i->opcode = &OPCODE_OR_info;
return true;
}
if (nzm1 == 1ULL && nzm2 == 1ULL) {
if (constant_value) {
return CheckBooleanXor1(i, builder, variable_value);
} else {
return CheckXorOfTwoBools(i, builder, src1, src2);
}
}
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
if (!constant_value) return false;
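A tiny standalone check (not part of the commit) of the identity used here, and again by CheckAdd further down: when the nonzero masks of two values are disjoint, XOR, ADD, and OR all produce the same result.

#include <cassert>
#include <cstdint>

// With disjoint NZMs, no bit position can receive contributions from both
// operands, so there is nothing to cancel (XOR) and nothing to carry (ADD).
int main() {
  const uint64_t nzm_a = 0x00000000FFFF0000ull;  // a only ever uses these bits
  const uint64_t nzm_b = 0x000000000000FFFFull;  // b only ever uses these bits
  assert((nzm_a & nzm_b) == 0);
  const uint64_t a = 0x12340000ull & nzm_a;
  const uint64_t b = 0x00005678ull & nzm_b;
  assert((a ^ b) == (a | b));
  assert((a + b) == (a | b));
  return 0;
}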
@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i,
}
bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) {
retry_and_simplification:
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) {
// added this for srawi
uint64_t nzml = GetScalarNZM(i->src1.value);
uint64_t nzmr = GetScalarNZM(i->src2.value);
ScalarNZM nzml = GetScalarNZM(i->src1.value);
ScalarNZM nzmr = GetScalarNZM(i->src2.value);
if ((nzml & nzmr) == 0) {
i->Replace(&OPCODE_ASSIGN_info, 0);
@ -524,9 +642,15 @@ retry_and_simplification:
// todo: check if masking with mask that covers all of zero extension source
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
// if masking with entire width, pointless instruction so become an assign
if (constant_value->AsUint64() == type_mask) {
ScalarNZM nzm = GetScalarNZM(variable_value);
// if masking with the entire width, the instruction is pointless, so become
// an assign
// chrispy: changed this to use the nzm instead; this optimizes away many AND
// instructions
// chrispy: changed this again: detect whether the nzm is a subset of the AND
// mask and, if so, eliminate the AND. e.g. (bool value) & 0xff == (bool
// value): the nzm is not equal to the mask, but it is a subset, so it can be
// eliminated
if ((constant_value->AsUint64() & nzm) == nzm) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
@ -555,7 +679,7 @@ retry_and_simplification:
Value* or_left = true_variable_def->src1.value;
Value* or_right = true_variable_def->src2.value;
uint64_t left_nzm = GetScalarNZM(or_left);
ScalarNZM left_nzm = GetScalarNZM(or_left);
// use the other or input instead of the or output
if ((constant_value->AsUint64() & left_nzm) == 0) {
@ -565,7 +689,7 @@ retry_and_simplification:
return true;
}
uint64_t right_nzm = GetScalarNZM(or_right);
ScalarNZM right_nzm = GetScalarNZM(or_right);
if ((constant_value->AsUint64() & right_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
@ -593,6 +717,21 @@ retry_and_simplification:
return false;
}
bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
Value* src1 = i->src1.value;
Value* src2 = i->src2.value;
ScalarNZM nzm1 = GetScalarNZM(src1);
ScalarNZM nzm2 = GetScalarNZM(src2);
if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry
// from any bits to any others, make this an OR
/* i->Replace(&OPCODE_OR_info, 0);
i->set_src1(src1);
i->set_src2(src2);*/
i->opcode = &OPCODE_OR_info;
return true;
}
auto [definition, added_constant] =
i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info);
@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
return false;
}
uint64_t nzm_for_var = GetScalarNZM(variable);
ScalarNZM nzm_for_var = GetScalarNZM(variable);
Opcode cmpop = i->opcode->num;
uint64_t constant_unpacked = constant_value->AsUint64();
uint64_t signbit_for_var = GetScalarSignbitMask(variable->type);
@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
i->set_src1(variable);
return true;
}
if (cmpop == OPCODE_COMPARE_ULE &&
constant_unpacked ==
0) { // unsigned <= 0 is the same as == 0, i.e. IS_FALSE
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
return true;
}
// todo: OPCODE_COMPARE_NE too?
if (cmpop == OPCODE_COMPARE_EQ &&
def_opcode == OPCODE_NOT) { // i see this a lot around addic insns
@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
return false;
}
uint64_t input_nzm = GetScalarNZM(input);
ScalarNZM input_nzm = GetScalarNZM(input);
if (istrue &&
input_nzm == 1) { // doing istrue on a value that's already a bool bitwise
@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
input_def = input_def->GetDestDefSkipAssigns();*/
return false;
}
bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* variable,
unsigned int shift) {
if (shift >= 3 && shift <= 6) {
// possibly a shift of an lzcnt result, do some tunneling
unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX |
MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX |
MOVTUNNEL_AND32FF;
Instr* vardef = variable->def;
hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags);
if (var_def && var_def->opcode == &OPCODE_CNTLZ_info) {
Value* lz_input = var_def->src1.value;
TypeName type_of_lz_input = lz_input->type;
size_t shift_for_zero =
xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT);
if (shift == shift_for_zero) {
// we ought to be OPCODE_IS_FALSE!
/*
explanation: if the input to lzcnt is zero, the result will be the bit size
of the input type, which is always a power of two; any nonzero input gives a
result less than the bit size. So you can test for zero by doing, for
instance with a 32-bit value, lzcnt32(input) >> 5. This is a very common way
of testing for zero without branching on PPC, and the Xbox 360 PPC compiler
used it a lot. We optimize it away for simplicity and to enable further
optimizations, but this is actually also quite fast on modern x86 processors
as well; for instance, on Zen 2 the reciprocal throughput of lzcnt is 0.25,
meaning four can be executed in one cycle.
*/
if (variable->type != INT8_TYPE) {
Value* isfalsetest = builder->IsFalse(lz_input);
isfalsetest->def->MoveBefore(i);
i->Replace(&OPCODE_ZERO_EXTEND_info, 0);
i->set_src1(isfalsetest);
} else {
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(lz_input);
}
return true;
}
}
}
return false;
}
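A small standalone verification (not part of the commit) of the identity this rewrite relies on, using std::countl_zero from C++20 as a stand-in for the guest's count-leading-zeros.

#include <bit>      // std::countl_zero (C++20)
#include <cassert>
#include <cstdint>

// For a 32-bit value, (lzcnt32(x) >> 5) is 1 exactly when x == 0, because
// only a zero input produces a count of 32 (the bit width); 32 >> 5 == 1,
// while any count in [0, 31] >> 5 == 0.
int main() {
  const uint32_t samples[] = {0u, 1u, 2u, 0x80000000u, 0xFFFFFFFFu, 12345u};
  for (uint32_t x : samples) {
    uint32_t lz = static_cast<uint32_t>(std::countl_zero(x));
    assert((lz >> 5) == (x == 0 ? 1u : 0u));
  }
  return 0;
}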
bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) {
Value* shr_lhs = i->src1.value;
Value* shr_rhs = i->src2.value;
if (!shr_lhs || !shr_rhs) return false;
if (shr_rhs->IsConstant()) {
return CheckSHRByConst(i, builder, shr_lhs, shr_rhs->AsUint32());
}
return false;
}
bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) {
Value* l = i->src1.value;
Value* r = i->src2.value;
ScalarNZM l_nzm = GetScalarNZM(l);
uint64_t signbit_mask = GetScalarSignbitMask(l->type);
size_t typesize = GetTypeSize(l->type);
/*
todo: folding this requires the mask of constant bits
if (r->IsConstant()) {
uint32_t const_r = r->AsUint32();
if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to
fill the result with the signbit of the input.
}
}*/
if ((l_nzm & signbit_mask) == 0) { // signbit will never be set, might as
// well be an SHR. (this does happen)
i->opcode = &OPCODE_SHR_info;
return true;
}
return false;
}
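A standalone illustration (not from the commit) of the CheckSAR reasoning: when the sign bit of the left operand can never be set, an arithmetic right shift and a logical right shift are identical.

#include <cassert>
#include <cstdint>

int main() {
  // sample values whose sign bit (bit 31) is clear, i.e. covered by an NZM of
  // 0x7FFFFFFF
  const uint32_t samples[] = {0u, 1u, 0x12345678u, 0x7FFFFFFFu};
  for (uint32_t x : samples) {
    for (uint32_t sh = 0; sh < 32; ++sh) {
      uint32_t logical = x >> sh;
      uint32_t arithmetic =
          static_cast<uint32_t>(static_cast<int32_t>(x) >> sh);
      assert(logical == arithmetic);
    }
  }
  return 0;
}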
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
// vector types use the same opcodes as scalar ones for AND/OR/XOR! we
// don't handle these in our simplifications, so skip
if (i->dest && IsScalarIntegralType(i->dest->type)) {
if (i->opcode == &OPCODE_OR_info) {
Opcode iop = i->opcode->num;
if (iop == OPCODE_OR) {
result |= CheckOr(i, builder);
} else if (i->opcode == &OPCODE_XOR_info) {
} else if (iop == OPCODE_XOR) {
result |= CheckXor(i, builder);
} else if (i->opcode == &OPCODE_AND_info) {
} else if (iop == OPCODE_AND) {
result |= CheckAnd(i, builder);
} else if (i->opcode == &OPCODE_ADD_info) {
} else if (iop == OPCODE_ADD) {
result |= CheckAdd(i, builder);
} else if (IsScalarBasicCmp(i->opcode->num)) {
} else if (IsScalarBasicCmp(iop)) {
result |= CheckScalarConstCmp(i, builder);
} else if (i->opcode == &OPCODE_IS_FALSE_info ||
i->opcode == &OPCODE_IS_TRUE_info) {
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
result |= CheckIsTrueIsFalse(i, builder);
} else if (iop == OPCODE_SHR) {
result |= CheckSHR(i, builder);
} else if (iop == OPCODE_SHA) {
result |= CheckSAR(i, builder);
}
}
@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) {
}
return false;
}
bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
// Run over the instructions and rename assigned variables:
// v1 = v0
@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
while (block) {
auto i = block->instr_head;
while (i) {
uint32_t signature = i->opcode->signature;
if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) {
bool modified = false;
i->set_src1(CheckValue(i->src1.value, modified));
i->set_srcN(CheckValue(value, modified), idx);
result |= modified;
}
if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
bool modified = false;
i->set_src2(CheckValue(i->src2.value, modified));
result |= modified;
}
if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
bool modified = false;
i->set_src3(CheckValue(i->src3.value, modified));
result |= modified;
}
});
i = i->next;
}
@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
return result;
}
struct TruncateSimplifier {
TypeName type_from, type_to;
uint32_t sizeof_from, sizeof_to;
uint32_t bit_sizeof_from, bit_sizeof_to;
uint64_t typemask_from, typemask_to;
hir::HIRBuilder* builder;
hir::Instr* truncate_instr;
hir::Value* truncated_value;
hir::Instr* truncated_value_def;
};
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
hir::HIRBuilder* builder) {
if (i->opcode != &OPCODE_TRUNCATE_info) {
return false;
}
TypeName type_from = i->src1.value->type;
TypeName type_to = i->dest->type;
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
Instr* input_def = i->src1.value->GetDefSkipAssigns();
if (!input_def) {
return false;
}
Opcode input_opc = input_def->opcode->num;
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
uint32_t src2_shift = input_def->src2.value->AsUint32();
if (src2_shift < (sizeof_to * CHAR_BIT)) {
Value* truncated_preshift =
builder->Truncate(input_def->src1.value, type_to);
truncated_preshift->def->MoveBefore(i);
i->Replace(&OPCODE_SHL_info, 0);
i->set_src1(truncated_preshift);
i->set_src2(input_def->src2.value);
return true;
}
}
if (input_opc == OPCODE_LOAD_CONTEXT) {
if (sizeof_from == 8 && sizeof_to == 4) {
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
loadof->def->MoveBefore(input_def);
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(loadof);
return true;
}
}
return false;
}
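A standalone check (not part of the commit) of the identity behind the first rewrite: truncating a 64-bit shift result to 32 bits equals shifting the already-truncated input, provided the shift amount is below the destination width. The LOAD_CONTEXT rewrite additionally relies on the host context being little-endian, so the low 32 bits of a 64-bit field sit at the same offset.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t samples[] = {0ull, 1ull, 0x00000000DEADBEEFull,
                              0x123456789ABCDEF0ull, ~0ull};
  for (uint64_t x : samples) {
    for (uint32_t sh = 0; sh < 32; ++sh) {
      // truncate(shl(x, sh)) to 32 bits ...
      uint32_t truncated_after = static_cast<uint32_t>(x << sh);
      // ... equals shl(truncate(x), sh), because bits above 32 can never
      // shift down into the low half
      uint32_t truncated_before = static_cast<uint32_t>(x) << sh;
      assert(truncated_after == truncated_before);
    }
  }
  return 0;
}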
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
result |= BackpropTruncations(i, builder);
i = i->next;
}
block = block->next;
}
return result;
}
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
auto def = value->def;
if (def && def->opcode == &OPCODE_ASSIGN_info) {

View File

@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
bool BackpropTruncations(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* variable, unsigned int shift);
bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder);
// called by CheckXor, handles transforming a 1 bit value xored against 1
bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* xored);
bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* b1, hir::Value* b2);
// for rlwinm
bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder);
bool TransformANDROLORSHLSeq(

View File

@ -14,38 +14,15 @@
namespace xe {
namespace cpu {
namespace hir {
void Instr::set_src1(Value* value) {
if (src1.value == value) {
void Instr::set_srcN(Value* value, uint32_t idx) {
if (srcs[idx].value == value) {
return;
}
if (src1_use) {
src1.value->RemoveUse(src1_use);
if (srcs_use[idx]) {
srcs[idx].value->RemoveUse(srcs_use[idx]);
}
src1.value = value;
src1_use = value ? value->AddUse(block->arena, this) : NULL;
}
void Instr::set_src2(Value* value) {
if (src2.value == value) {
return;
}
if (src2_use) {
src2.value->RemoveUse(src2_use);
}
src2.value = value;
src2_use = value ? value->AddUse(block->arena, this) : NULL;
}
void Instr::set_src3(Value* value) {
if (src3.value == value) {
return;
}
if (src3_use) {
src3.value->RemoveUse(src3_use);
}
src3.value = value;
src3_use = value ? value->AddUse(block->arena, this) : NULL;
srcs[idx].value = value;
srcs_use[idx] = value ? value->AddUse(block->arena, this) : nullptr;
}
void Instr::MoveBefore(Instr* other) {
@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() {
}
return current_def;
}
Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) {
unsigned int traversed_types = 0;
unsigned int in_flags = *tunnel_flags;
Instr* current_def = this;
while (true) {
Opcode op = current_def->opcode->num;
switch (op) {
case OPCODE_ASSIGN: {
if ((in_flags & MOVTUNNEL_ASSIGNS)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_ASSIGNS;
} else {
goto exit_loop;
}
break;
}
case OPCODE_ZERO_EXTEND: {
if ((in_flags & MOVTUNNEL_MOVZX)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_MOVZX;
} else {
goto exit_loop;
}
break;
}
case OPCODE_SIGN_EXTEND: {
if ((in_flags & MOVTUNNEL_MOVSX)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_MOVSX;
} else {
goto exit_loop;
}
break;
}
case OPCODE_TRUNCATE: {
if ((in_flags & MOVTUNNEL_TRUNCATE)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_TRUNCATE;
} else {
goto exit_loop;
}
break;
}
case OPCODE_AND: {
if ((in_flags & MOVTUNNEL_AND32FF)) {
auto [constant, nonconst] =
current_def->BinaryValueArrangeAsConstAndVar();
if (!constant || constant->AsUint64() != 0xFFFFFFFF) {
goto exit_loop;
}
current_def = nonconst->def;
traversed_types |= MOVTUNNEL_AND32FF;
} else {
goto exit_loop;
}
break;
}
default:
goto exit_loop;
}
if (!current_def) {
goto exit_loop;
}
}
exit_loop:
*tunnel_flags = traversed_types;
return current_def;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -25,6 +25,14 @@ namespace hir {
class Block;
class Label;
// todo: better name
enum MovTunnel {
MOVTUNNEL_ASSIGNS = 1,
MOVTUNNEL_MOVZX = 2,
MOVTUNNEL_MOVSX = 4,
MOVTUNNEL_TRUNCATE = 8,
MOVTUNNEL_AND32FF = 16, // tunnel through and with 0xFFFFFFFF
};
class Instr {
public:
@ -44,17 +52,28 @@ class Instr {
} Op;
Value* dest;
union {
struct {
Op src1;
Op src2;
Op src3;
};
Op srcs[3];
};
union {
struct {
Value::Use* src1_use;
Value::Use* src2_use;
Value::Use* src3_use;
};
Value::Use* srcs_use[3];
};
void set_srcN(Value* value, uint32_t idx);
void set_src1(Value* value) { set_srcN(value, 0); }
void set_src1(Value* value);
void set_src2(Value* value);
void set_src3(Value* value);
void set_src2(Value* value) { set_srcN(value, 1); }
void set_src3(Value* value) { set_srcN(value, 2); }
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr
}
Instr* GetDestDefSkipAssigns();
Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags);
// returns [def op, constant]
std::pair<Value*, Value*> BinaryValueArrangeByDefOpAndConstant(
const OpcodeInfo* op_ptr) {
@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr
}
return result;
}
/*
Invokes the provided lambda callback on each operand that is a Value. Callback
is invoked with Value*, uint32_t index
*/
template <typename TCallable>
void VisitValueOperands(TCallable&& call_for_values) {
uint32_t signature = opcode->signature;
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3);
if (t_src1 == OPCODE_SIG_TYPE_V) {
call_for_values(src1.value, 0);
}
if (t_src2 == OPCODE_SIG_TYPE_V) {
call_for_values(src2.value, 1);
}
if (t_src3 == OPCODE_SIG_TYPE_V) {
call_for_values(src3.value, 2);
}
}
};
} // namespace hir

View File

@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() {
return nullptr;
}
}
hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
if (def) {
return def->GetDestDefTunnelMovs(tunnel_flags);
} else {
return nullptr;
}
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -598,6 +598,8 @@ class Value {
void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();
// tunnel_flags is updated to the kinds we actually traversed
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
private:
static bool CompareInt8(Opcode opcode, Value* a, Value* b);

View File

@ -246,30 +246,7 @@ enum class PPCRegister {
};
#pragma pack(push, 8)
typedef struct PPCContext_s {
// Must be stored at 0x0 for now.
// TODO(benvanik): find a nice way to describe this to the JIT.
ThreadState* thread_state; // 0x0
// TODO(benvanik): this is getting nasty. Must be here.
uint8_t* virtual_membase; // 0x8
// Most frequently used registers first.
uint64_t lr; // 0x10 Link register
uint64_t ctr; // 0x18 Count register
uint64_t r[32]; // 0x20 General purpose registers
double f[32]; // 0x120 Floating-point registers
vec128_t v[128]; // 0x220 VMX128 vector registers
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca; // 0xA20
uint8_t xer_ov; // 0xA21
uint8_t xer_so; // 0xA22
// Condition registers:
// These are split to make it easier to do DCE on unused stores.
uint64_t cr() const;
void set_cr(uint64_t value);
typedef struct alignas(64) PPCContext_s {
union {
uint32_t value;
struct {
@ -395,6 +372,25 @@ typedef struct PPCContext_s {
} bits;
} fpscr; // Floating-point status and control register
// Most frequently used registers first.
uint64_t r[32]; // General purpose registers
uint64_t ctr; // Count register
uint64_t lr; // Link register
double f[32]; // Floating-point registers
vec128_t v[128]; // VMX128 vector registers
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca;
uint8_t xer_ov;
uint8_t xer_so;
// Condition registers:
// These are split to make it easier to do DCE on unused stores.
uint64_t cr() const;
void set_cr(uint64_t value);
uint8_t vscr_sat;
// uint32_t get_fprf() {
@ -425,7 +421,8 @@ typedef struct PPCContext_s {
// Value of last reserved load
uint64_t reserved_val;
ThreadState* thread_state;
uint8_t* virtual_membase;
static std::string GetRegisterName(PPCRegister reg);
std::string GetStringFromValue(PPCRegister reg) const;
void SetValueFromString(PPCRegister reg, std::string value);

View File

@ -18,12 +18,50 @@
#include "xenia/cpu/processor.h"
#include "xenia/xbox.h"
namespace xe {
namespace cpu {
thread_local ThreadState* thread_state_ = nullptr;
static void* AllocateContext() {
size_t granularity = xe::memory::allocation_granularity();
for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) {
/*
we want the register which points to the context to have 0xE0000000 in
the low 32 bits, for checking whether we need the 4k offset. Also, by
allocating starting from the page before, we allow backends to index
negatively to get to their own backend-specific data, which makes full
use of the int8 displacement.
the downside is we waste most of one allocation granule, and probably a
fair bit of the one starting at 0xE0000000, by using a direct virtual
memory allocation instead of malloc
*/
uintptr_t context_pre =
((static_cast<uint64_t>(pos32) << 32) | 0xE0000000) - granularity;
void* p = memory::AllocFixed(
(void*)context_pre, granularity + sizeof(ppc::PPCContext),
memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite);
if (p) {
return reinterpret_cast<char*>(p) +
granularity; // now we have a ctx ptr with the 0xE0000000 constant in its
// low 32 bits, and one granule allocated before it
}
}
assert_always("giving up on allocating context, likely leaking contexts");
return nullptr;
}
static void FreeContext(void* ctx) {
char* true_start_of_ctx = &reinterpret_cast<char*>(
ctx)[-static_cast<ptrdiff_t>(xe::memory::allocation_granularity())];
memory::DeallocFixed(true_start_of_ctx, 0,
memory::DeallocationType::kRelease);
}
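A standalone sketch (not part of the commit, no actual allocation performed) of the address arithmetic AllocateContext depends on; the 64 KB granularity is an assumption for illustration.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t granularity = 0x10000;  // assumed allocation granularity
  for (uint64_t pos32 = 0x40; pos32 < 8192; ++pos32) {
    uint64_t context_pre = ((pos32 << 32) | 0xE0000000ull) - granularity;
    uint64_t ctx = context_pre + granularity;  // what AllocateContext returns
    // the context pointer's low 32 bits are exactly 0xE0000000, which is what
    // the rewritten 0xE0000000 comparisons in the x64 backend depend on
    assert(static_cast<uint32_t>(ctx) == 0xE0000000u);
    // and a full granule [context_pre, ctx) sits below it for backend data
    assert(ctx - context_pre == granularity);
  }
  return 0;
}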
ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
uint32_t stack_base, uint32_t pcr_address)
: processor_(processor),
@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
backend_data_ = processor->backend()->AllocThreadData();
// Allocate with 64b alignment.
context_ = memory::AlignedAlloc<ppc::PPCContext>(64);
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
processor->backend()->InitializeBackendContext(context_);
assert_true(((uint64_t)context_ & 0x3F) == 0);
std::memset(context_, 0, sizeof(ppc::PPCContext));
@ -62,8 +102,10 @@ ThreadState::~ThreadState() {
if (thread_state_ == this) {
thread_state_ = nullptr;
}
memory::AlignedFree(context_);
if (context_) {
FreeContext(reinterpret_cast<void*>(context_));
}
// memory::AlignedFree(context_);
}
void ThreadState::Bind(ThreadState* thread_state) {