Merge pull request #52 from chrisps/canary_experimental

Fix previous batch of CPU changes
2022-07-18 09:20:35 +02:00 · 2022-07-18 09:20:35 +02:00 · 3757580f45
parent fd78ab4dfc 11817f0a3b
commit 3757580f45
15 changed files with 856 additions and 170 deletions
--- a/src/xenia/cpu/backend/backend.h
+++ b/src/xenia/cpu/backend/backend.h
@ -63,6 +63,10 @@ class Backend {
  virtual void InstallBreakpoint(Breakpoint* breakpoint) {}
  virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {}
  virtual void UninstallBreakpoint(Breakpoint* breakpoint) {}
  // Gives the backend a chance to set up its own data for a context. The
  // default implementation does nothing.
  // ctx points to the start of a ppccontext; the memory from
  // (ctx - page_allocation_granularity) up until the start of ctx may be used
  // by the backend to store whatever data it wants.
  virtual void InitializeBackendContext(void* ctx) {}
 protected:
  Processor* processor_ = nullptr;
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -32,6 +32,9 @@
 #include "xenia/cpu/cpu_flags.h"
 #include "xenia/cpu/function.h"
 #include "xenia/cpu/function_debug_info.h"
 #include "xenia/cpu/hir/instr.h"
 #include "xenia/cpu/hir/opcodes.h"
 #include "xenia/cpu/hir/value.h"
 #include "xenia/cpu/processor.h"
 #include "xenia/cpu/symbol.h"
 #include "xenia/cpu/thread_state.h"
@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() {
 }
 uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
-  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
+  auto thread_state =
      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
  uint32_t str_ptr = uint32_t(thread_state->context()->r[3]);
  // uint16_t str_len = uint16_t(thread_state->context()->r[4]);
  auto str = thread_state->memory()->TranslateVirtual<const char*>(str_ptr);
@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
 }
 uint64_t TrapDebugBreak(void* raw_context, uint64_t address) {
-  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
+  auto thread_state =
      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
  XELOGE("tw/td forced trap hit! This should be a crash!");
  if (cvars::break_on_debugbreak) {
    xe::debugging::Break();
@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
 // This is used by the X64ThunkEmitter's ResolveFunctionThunk.
 uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
-  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
+  auto thread_state =
      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
  // TODO(benvanik): required?
  assert_not_zero(target_address);
@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) {
  MovMem64(addr + 8, v.high);
  return ptr[addr];
 }
 static bool IsVectorCompare(const Instr* i) {
  hir::Opcode op = i->opcode->num;
  return op >= hir::OPCODE_VECTOR_COMPARE_EQ &&
         op <= hir::OPCODE_VECTOR_COMPARE_UGE;
 }
 static bool IsFlaggedVectorOp(const Instr* i) {
  if (IsVectorCompare(i)) {
    return true;
  }
  hir::Opcode op = i->opcode->num;
  using namespace hir;
  switch (op) {
    case OPCODE_VECTOR_SUB:
    case OPCODE_VECTOR_ADD:
    case OPCODE_SWIZZLE:
      return true;
  }
  return false;
 }
 // Maps the flags of a flagged vector op to a SIMD domain: integer element
 // types give INTEGER, float element types give FLOATING, and any other flags
 // value gives DONTCARE.
 // NOTE(review): this compares the whole flags field against the type
 // constants; confirm no flagged op packs extra bits into flags.
 static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) {
  SimdDomain domain = SimdDomain::DONTCARE;
  switch (df->flags) {  // the datatype the op was performed as
    case hir::INT8_TYPE:
    case hir::INT16_TYPE:
    case hir::INT32_TYPE:
    case hir::INT64_TYPE:
      domain = SimdDomain::INTEGER;
      break;
    case hir::FLOAT32_TYPE:
    case hir::FLOAT64_TYPE:  // pretty sure float64 doesnt occur with vectors.
                             // here for completeness
      domain = SimdDomain::FLOATING;
      break;
    default:
      break;
  }
  return domain;
 }
 // this list is incomplete
 // Returns true for opcodes whose result is always treated as integer-domain
 // data, regardless of instruction flags. The list is known to be incomplete.
 static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) {
  using namespace hir;
  bool is_integer = false;
  switch (opc) {
    // Shifts and rotates.
    case OPCODE_LOAD_VECTOR_SHL:
    case OPCODE_LOAD_VECTOR_SHR:
    case OPCODE_VECTOR_SHL:
    case OPCODE_VECTOR_SHR:
    case OPCODE_VECTOR_SHA:
    case OPCODE_VECTOR_ROTATE_LEFT:
    // Float -> int conversion produces integers.
    case OPCODE_VECTOR_CONVERT_F2I:
    // there apparently is no FLOAT32_TYPE for min/max/average flags
    case OPCODE_VECTOR_MIN:
    case OPCODE_VECTOR_MAX:
    case OPCODE_VECTOR_AVERAGE:
    // apparently no f32 type for these two
    case OPCODE_EXTRACT:
    case OPCODE_INSERT:
      is_integer = true;
      break;
    default:
      break;
  }
  return is_integer;
 }
 // Returns true for opcodes whose result is always treated as floating-point
 // domain data, regardless of instruction flags.
 static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) {
  using namespace hir;
  bool is_floating = false;
  switch (opc) {
    // Vector-specific float producers.
    case OPCODE_VECTOR_CONVERT_I2F:
    case OPCODE_VECTOR_DENORMFLUSH:
    case OPCODE_DOT_PRODUCT_3:
    case OPCODE_DOT_PRODUCT_4:
    // Arithmetic and transcendental ops.
    case OPCODE_MUL:
    case OPCODE_MUL_ADD:
    case OPCODE_MUL_SUB:
    case OPCODE_ABS:
    case OPCODE_SQRT:
    case OPCODE_RECIP:
    case OPCODE_ROUND:
    case OPCODE_LOG2:
    case OPCODE_POW2:
      is_floating = true;
      break;
    default:
      break;
  }
  return is_floating;
 }
 // Guesses which SIMD domain (integer or floating) a value lives in by looking
 // only at the instruction that defines it. Returns DONTCARE when the value has
 // no defining instruction or when that instruction is not recognised.
 SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
  const hir::Instr* producer = for_value->def;
  if (producer == nullptr) {
    // todo: visit uses to figure out domain
    return SimdDomain::DONTCARE;
  }

  // Flagged ops carry their element type in the instruction flags.
  if (IsFlaggedVectorOp(producer)) {
    return GetDomainForFlaggedVectorOp(producer);
  }

  const hir::Opcode opcode = producer->opcode->num;
  if (IsDefiniteIntegerDomainOpcode(opcode)) {
    return SimdDomain::INTEGER;
  }
  if (IsDefiniteFloatingDomainOpcode(opcode)) {
    return SimdDomain::FLOATING;
  }

  // todo: still dontcare here, so visit uses of the value to figure it out
  return SimdDomain::DONTCARE;
 }
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@ -44,7 +44,39 @@ enum RegisterFlags {
  REG_DEST = (1 << 0),
  REG_ABCD = (1 << 1),
 };
 /*
    SSE/AVX/AVX512 have separate move and shuffle instructions for float data
    and int data for a reason: most processors implement two distinct
    pipelines, one for the integer domain and one for the floating point
    domain. Currently, xenia makes no distinction between the two. Crossing
    domains is expensive: on Zen processors the penalty is one cycle each time
    you cross, plus the two pipelines need to synchronize. Often xenia will
    emit an integer instruction, then a floating instruction, then an integer
    one again, which effectively adds at least two cycles to the time taken.
    These values will in the future be used as tags on operations that tell
    them which domain to operate in, if it is at all possible to avoid
    crossing.
 */
 // Execution domain tag for SIMD values.
 enum class SimdDomain : uint32_t {
  FLOATING,
  INTEGER,
  DONTCARE,
  CONFLICTING  // just used as a special result for PickDomain, different from
               // dontcare (dontcare means we just dont know the domain,
               // CONFLICTING means its used in multiple domains)
 };
 // Combines the domains deduced for two operands into one:
 //  - identical domains yield that domain,
 //  - DONTCARE yields to whatever the other operand is,
 //  - anything else (two different, known domains) is CONFLICTING.
 // constexpr (and therefore implicitly inline) rather than static, so that
 // including this header does not give every translation unit its own copy.
 constexpr SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
  if (dom1 == dom2) {
    return dom1;
  }
  if (dom1 == SimdDomain::DONTCARE) {
    return dom2;
  }
  if (dom2 == SimdDomain::DONTCARE) {
    return dom1;
  }
  return SimdDomain::CONFLICTING;
 }
 enum XmmConst {
  XMMZero = 0,
  XMMOne,
@ -122,7 +154,7 @@ enum XmmConst {
  XMMLVSLTableBase,
  XMMLVSRTableBase,
  XMMSingleDenormalMask,
-  XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
+  XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
  XMMXenosF16ExtRangeStart
 };
@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags {
  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
-  kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
+  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
-  kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
+  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14
 };
 class ResolvableGuestCall {
@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
  FunctionDebugInfo* debug_info() const { return debug_info_; }
  size_t stack_size() const { return stack_size_; }
  SimdDomain DeduceSimdDomain(const hir::Value* for_value);
 protected:
  void* Emplace(const EmitFunctionInfo& func_info,
--- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
@ -12,11 +12,11 @@
 #include <algorithm>
 #include <cstring>
 #include "xenia/base/cvar.h"
 #include "xenia/base/memory.h"
 #include "xenia/cpu/backend/x64/x64_op.h"
 #include "xenia/cpu/backend/x64/x64_tracers.h"
 #include "xenia/cpu/ppc/ppc_context.h"
 #include "xenia/base/cvar.h"
 DEFINE_bool(
    elide_e0_check, false,
@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
        !is_definitely_not_eo(guest)) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      // todo: do branching or use an alt membase and cmov
      e.xor_(e.eax, e.eax);
-      e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const);
+      e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
      e.cmp(e.edx, e.GetContextReg().cvt32());
      e.setae(e.al);
      e.shl(e.eax, 12);
-      e.add(e.eax, guest.reg().cvt32());
+      e.add(e.eax, e.edx);
      return e.GetMembaseReg() + e.rax;
    } else {
      // Clear the top 32 bits, as they are likely garbage.
      // TODO(benvanik): find a way to avoid doing this.
@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      e.xor_(e.eax, e.eax);
-      e.cmp(guest.reg().cvt32(), 0xE0000000);
+      e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
      e.setae(e.al);
      e.shl(e.eax, 12);
      e.add(e.eax, guest.reg().cvt32());
@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
    if (xe::memory::allocation_granularity() > 0x1000) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
-      e.cmp(i.src1.reg().cvt32(), 0xE0000000);
+      e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
      e.setae(e.cl);
      e.movzx(e.ecx, e.cl);
      e.shl(e.ecx, 12);
@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
    if (xe::memory::allocation_granularity() > 0x1000) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
-      e.cmp(i.src1.reg().cvt32(), 0xE0000000);
+      e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
      e.setae(e.cl);
      e.movzx(e.ecx, e.cl);
      e.shl(e.ecx, 12);
@ -1113,7 +1119,7 @@ struct CACHE_CONTROL
      if (xe::memory::allocation_granularity() > 0x1000) {
        // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
        // it via memory mapping.
-        e.cmp(i.src1.reg().cvt32(), 0xE0000000);
+        e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
        e.setae(e.al);
        e.movzx(e.eax, e.al);
        e.shl(e.eax, 12);
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@ -1826,7 +1826,7 @@ struct PERMUTE_I32
    }
  }
 };
-//todo: use this on const src1
+// todo: use this on const src1
 static vec128_t FixupConstantShuf8(vec128_t input) {
  for (uint32_t i = 0; i < 16; ++i) {
    input.u8[i] ^= 0x03;
@ -1984,7 +1984,11 @@ struct SWIZZLE
      } else {
        src1 = i.src1;
      }
-      e.vpshufd(i.dest, src1, swizzle_mask);
+      if (element_type == INT32_TYPE) {
        e.vpshufd(i.dest, src1, swizzle_mask);
      } else if (element_type == FLOAT32_TYPE) {
        e.vshufps(i.dest, src1, src1, swizzle_mask);
      }
    } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) {
      assert_always();
    } else {
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@ -717,6 +717,9 @@ struct SELECT_V128_I8
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // TODO(benvanik): find a shorter sequence.
    // dest = src1 != 0 ? src2 : src3
    /*
       chrispy: this is dead code, this sequence is never emitted
    */
    e.movzx(e.eax, i.src1);
    e.vmovd(e.xmm1, e.eax);
    e.vpbroadcastd(e.xmm1, e.xmm1);
@ -737,11 +740,46 @@ struct SELECT_V128_I8
    e.vpor(i.dest, e.xmm1);
  }
 };
 // Result of GetPermittedBlendForSelectV128: which blend instruction, if any,
 // may stand in for the bitwise select sequence.
 //   NotPermitted - no blend may be used
 //   Int8         - use vpblendvb
 //   Ps           - use vblendvps
 enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps };
 static bool IsVectorCompare(const Instr* i) {
  Opcode op = i->opcode->num;
  return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE;
 }
 /*
    OPCODE_SELECT does a bit by bit selection, however, if the selector is the
   result of a comparison or if each element may only be 0xff or 0 we may use a
   blend instruction instead
 */
 // Decides whether a V128 select may be emitted as a blend, based on how the
 // selector value was produced. Only selectors defined directly by a vector
 // compare qualify; the compare's element type picks the blend flavour.
 static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) {
  const Instr* selector_def = src1v->def;
  // todo: check ors, ands of condition
  if (selector_def == nullptr || !IsVectorCompare(selector_def)) {
    return PermittedBlend::NotPermitted;
  }

  PermittedBlend blend = PermittedBlend::NotPermitted;
  switch (selector_def->flags) {  // check what datatype we compared as
    case INT8_TYPE:
    case INT16_TYPE:
    case INT32_TYPE:
      blend = PermittedBlend::Int8;  // use vpblendvb
      break;
    case FLOAT32_TYPE:
      blend = PermittedBlend::Ps;  // use vblendvps
      break;
    default:  // unknown type! just ignore
      break;
  }
  return blend;
 }
 struct SELECT_V128_V128
    : Sequence<SELECT_V128_V128,
               I<OPCODE_SELECT, V128Op, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1;
    PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value);
    //todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps
    if (i.src1.is_constant) {
      e.LoadConstantXmm(src1, i.src1.constant());
    }
@ -756,10 +794,16 @@ struct SELECT_V128_V128
      e.LoadConstantXmm(src3, i.src3.constant());
    }
-    // src1 ? src2 : src3;
+    if (mayblend == PermittedBlend::Int8) {
-    e.vpandn(e.xmm3, src1, src2);
+      e.vpblendvb(i.dest, src2, src3, src1);
-    e.vpand(i.dest, src1, src3);
+    } else if (mayblend == PermittedBlend::Ps) {
-    e.vpor(i.dest, i.dest, e.xmm3);
+      e.vblendvps(i.dest, src2, src3, src1);
    } else {
      // src1 ? src2 : src3;
      e.vpandn(e.xmm3, src1, src2);
      e.vpand(i.dest, src1, src3);
      e.vpor(i.dest, i.dest, e.xmm3);
    }
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32,
@ -2122,7 +2166,8 @@ struct MUL_ADD_V128
    // TODO(benvanik): the vfmadd sequence produces slightly different results
    // than vmul+vadd and it'd be nice to know why. Until we know, it's
    // disabled so tests pass.
-    if (false && e.IsFeatureEnabled(kX64EmitFMA)) {
+    // chrispy: reenabled, i have added the DAZ behavior that was missing
    if (true && e.IsFeatureEnabled(kX64EmitFMA)) {
      EmitCommutativeBinaryXmmOp(e, i,
                                 [&i](X64Emitter& e, const Xmm& dest,
                                      const Xmm& src1, const Xmm& src2) {
@ -2139,7 +2184,11 @@ struct MUL_ADD_V128
                                     e.vfmadd231ps(i.dest, src1, src2);
                                   } else {
                                     // Dest not equal to anything
-                                     e.vmovdqa(i.dest, src1);
+                                     //                                     e.vmovdqa(i.dest,
                                     //                                     src1);
                                     // chrispy: vmovdqa was a domain pipeline
                                     // hazard
                                     e.vmovaps(i.dest, src1);
                                     e.vfmadd213ps(i.dest, src2, src3);
                                   }
                                 });
@ -2152,7 +2201,8 @@ struct MUL_ADD_V128
        // If i.dest == i.src3, back up i.src3 so we don't overwrite it.
        src3 = i.src3;
        if (i.dest == i.src3) {
-          e.vmovdqa(e.xmm1, i.src3);
+          // e.vmovdqa(e.xmm1, i.src3);
          e.vmovaps(e.xmm1, i.src3);
          src3 = e.xmm1;
        }
      }
@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32,
 // ============================================================================
 struct ABS_F32 : Sequence<ABS_F32, I<OPCODE_ABS, F32Op, F32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
+    e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
  }
 };
 struct ABS_F64 : Sequence<ABS_F64, I<OPCODE_ABS, F64Op, F64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
+    e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
  }
 };
 struct ABS_V128 : Sequence<ABS_V128, I<OPCODE_ABS, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
+    e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128);
@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128
    */
    e.vstmxcsr(mxcsr_storage);
    e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask));
    e.mov(e.eax, 8);
    auto src1v = e.xmm0;
@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128
    // so that in the future this could be optimized away if the top is known to
    // be zero. Right now im not sure that happens often though and its
    // currently not worth it also, maybe pre-and if constant
-    e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
+    e.vandps(e.xmm3, src1v, e.xmm2);
-    e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
+    e.vandps(e.xmm2, src2v, e.xmm2);
    e.and_(mxcsr_storage, e.eax);
    e.vldmxcsr(mxcsr_storage);  // overflow flag is cleared, now we're good to
@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128
    Xbyak::Label ret_qnan;
    Xbyak::Label done;
    e.jnz(ret_qnan);
-    // e.vshufps(i.dest, e.xmm1,e.xmm1, 0);  // broadcast
+    e.vshufps(i.dest, e.xmm1, e.xmm1, 0);  // broadcast
    e.vbroadcastss(i.dest, e.xmm1);
    e.jmp(done);
    e.L(ret_qnan);
    e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128
    e.vcvtps2pd(e.ymm0, src1v);
    e.vcvtps2pd(e.ymm1, src2v);
    /*
        e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
    e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
    e.and_(mxcsr_storage, e.eax);
    e.vldmxcsr(mxcsr_storage);  // overflow flag is cleared, now we're good to
                                // go
    e.vcvtps2pd(e.ymm0, e.xmm3);
    e.vcvtps2pd(e.ymm1, e.xmm2);
    e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
    e.vextractf128(e.xmm4, e.ymm5, 1);
    e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5);  // get element [1] in xmm3
    e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
    e.not_(e.eax);
    e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
    e.vcvtsd2ss(e.xmm1, e.xmm2);
    */
    e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
    e.vextractf128(e.xmm2, e.ymm3, 1);
    e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128
    Xbyak::Label ret_qnan;
    Xbyak::Label done;
    e.jnz(ret_qnan);  // reorder these jmps later, just want to get this fix in
-                      //  e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
+    e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
    e.vbroadcastss(i.dest, e.xmm1);
    e.jmp(done);
    e.L(ret_qnan);
    e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2846,10 +2876,17 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
 };
 struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    EmitCommutativeBinaryXmmOp(e, i,
+    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
-                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+                                 e.DeduceSimdDomain(i.src2.value));
-                                 e.vpand(dest, src1, src2);
+
-                               });
+    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vandps(dest, src2, src1);
          } else {
            e.vpand(dest, src2, src1);
          }
        });
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128);
@ -2948,10 +2985,17 @@ struct AND_NOT_I64
 struct AND_NOT_V128
    : Sequence<AND_NOT_V128, I<OPCODE_AND_NOT, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    EmitCommutativeBinaryXmmOp(e, i,
+    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
-                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+                                 e.DeduceSimdDomain(i.src2.value));
-                                 e.vpandn(dest, src2, src1);
+
-                               });
+    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vandnps(dest, src2, src1);
          } else {
            e.vpandn(dest, src2, src1);
          }
        });
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32,
@ -2994,10 +3038,17 @@ struct OR_I64 : Sequence<OR_I64, I<OPCODE_OR, I64Op, I64Op, I64Op>> {
 };
 struct OR_V128 : Sequence<OR_V128, I<OPCODE_OR, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    EmitCommutativeBinaryXmmOp(e, i,
+    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
-                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+                                 e.DeduceSimdDomain(i.src2.value));
-                                 e.vpor(dest, src1, src2);
+
-                               });
+    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vorps(dest, src1, src2);
          } else {
            e.vpor(dest, src1, src2);
          }
        });
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128);
@ -3039,10 +3090,17 @@ struct XOR_I64 : Sequence<XOR_I64, I<OPCODE_XOR, I64Op, I64Op, I64Op>> {
 };
 struct XOR_V128 : Sequence<XOR_V128, I<OPCODE_XOR, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    EmitCommutativeBinaryXmmOp(e, i,
+    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
-                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+                                 e.DeduceSimdDomain(i.src2.value));
-                                 e.vpxor(dest, src1, src2);
+
-                               });
+    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vxorps(dest, src1, src2);
          } else {
            e.vpxor(dest, src1, src2);
          }
        });
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128);
@ -3078,8 +3136,15 @@ struct NOT_I64 : Sequence<NOT_I64, I<OPCODE_NOT, I64Op, I64Op>> {
 };
 struct NOT_V128 : Sequence<NOT_V128, I<OPCODE_NOT, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    // dest = src ^ 0xFFFF...
+
-    e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
+    SimdDomain domain =
        e.DeduceSimdDomain(i.src1.value);
    if (domain == SimdDomain::FLOATING) {
      e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
    } else {
      // dest = src ^ 0xFFFF...
      e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
    }
  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128);
@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
    }
    e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
    e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
-    e.vmovaps(i.dest, e.xmm0);
+    e.vmovdqa(i.dest, e.xmm0);
  }
  static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
    // Almost all instances are shamt = 1, but non-constant.
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            i->Remove();
            result = true;
          }
          else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
                   i->flags == INT8_TYPE /*probably safe for int16 too*/) {
            /*
                chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
            */
            v->set_zero(VEC128_TYPE);
            i->Remove();
            result = true;
          }
          break;
        }
        case OPCODE_INSERT:
--- a/src/xenia/cpu/compiler/passes/simplification_pass.cc
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc
@ -9,6 +9,7 @@
 #include "xenia/cpu/compiler/passes/simplification_pass.h"
 #include <__msvc_int128.hpp>
 #include "xenia/base/byte_order.h"
 #include "xenia/base/profiling.h"
 namespace xe {
@ -22,6 +23,52 @@ using namespace xe::cpu::hir;
 using xe::cpu::hir::HIRBuilder;
 using xe::cpu::hir::Instr;
 using xe::cpu::hir::Value;
 using vmask_portion_t = uint64_t;
 // Fixed-size bitmask made of Ndwords portions of type vmask_portion_t, with
 // element-wise bitwise operators.
 //
 // The binary operators take Valuemask_t (the original spelled an undeclared
 // type "ValueMask"), and operator~ uses the unary operate() overload (the
 // original passed an undeclared "other"). Both were non-dependent names, so
 // the template was ill-formed even when never instantiated.
 template <uint32_t Ndwords>
 struct Valuemask_t {
  vmask_portion_t bits[Ndwords];

  // Builds a mask with every portion set to `fill` (all zero by default).
  static Valuemask_t create_empty(vmask_portion_t fill = 0) {
    Valuemask_t result;
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = fill;
    }
    return result;
  }

  // Applies a unary callable to each portion and returns the new mask.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper) const {
    Valuemask_t result = create_empty();
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i]);
    }
    return result;
  }

  // Applies a binary callable to matching portions of *this and `other`.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
    Valuemask_t result = create_empty();
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i], other.bits[i]);
    }
    return result;
  }

  Valuemask_t operator&(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
                   other);
  }
  Valuemask_t operator|(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
                   other);
  }
  Valuemask_t operator^(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
                   other);
  }
  Valuemask_t operator~() const {
    return operate([](vmask_portion_t x) { return ~x; });
  }
 };
 SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
    iter_result |= SimplifyBitArith(builder);
    iter_result |= EliminateConversions(builder);
    iter_result |= SimplifyAssignments(builder);
    iter_result |= BackpropTruncations(builder);
    result |= iter_result;
  } while (iter_result);
  return true;
@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) {
  }
  return false;
 }
 // Tries to replace `xored ^ constant` (called from CheckXor when both
 // operands have a non-zero mask of 1) with the inverse of the condition that
 // produced `xored`: IS_FALSE <-> IS_TRUE and COMPARE_EQ <-> COMPARE_NE.
 // On success `i` becomes an ASSIGN (or ZERO_EXTEND) of the new condition and
 // true is returned; otherwise `i` is left untouched and false is returned.
 bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
                                           hir::HIRBuilder* builder,
                                           hir::Value* xored) {
  // Look for the defining instruction, allowing assigns and zero-extends to
  // be skipped. NOTE(review): the flags appear to be updated by the call to
  // report what was tunneled through - confirm against GetDefTunnelMovs.
  unsigned tunnel_flags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX;
  Instr* producer = xored->GetDefTunnelMovs(&tunnel_flags);
  if (producer == nullptr) {
    return false;
  }
  const bool need_zx = (tunnel_flags & MOVTUNNEL_MOVZX) != 0;

  // Build the inverted condition, if the producer is one we know how to flip.
  Value* inverted = nullptr;
  const Opcode producer_op = producer->opcode->num;
  switch (producer_op) {
    case OPCODE_IS_FALSE:
      inverted = builder->IsTrue(producer->src1.value);
      break;
    case OPCODE_IS_TRUE:
      inverted = builder->IsFalse(producer->src1.value);
      break;
    case OPCODE_COMPARE_EQ:
      inverted = builder->CompareNE(producer->src1.value, producer->src2.value);
      break;
    case OPCODE_COMPARE_NE:
      inverted = builder->CompareEQ(producer->src1.value, producer->src2.value);
      break;
    default:  // todo: other conds
      break;
  }
  if (inverted == nullptr) {
    return false;
  }

  // Place the new condition ahead of the xor, then turn the xor into a plain
  // move of it.
  inverted->def->MoveBefore(i);
  const auto* replacement_info =
      need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info;
  i->Replace(replacement_info, 0);
  i->set_src1(inverted);
  return true;
 }
 // Placeholder for simplifying an XOR of two non-constant operands whose
 // non-zero masks are both 1 (see the call site in CheckXor). Not implemented
 // yet: none of the arguments are used and it always returns false.
 bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i,
                                             hir::HIRBuilder* builder,
                                             hir::Value* b1, hir::Value* b2) {
  // todo: implement
  return false;
 }
 bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
  if (CheckOrXorZero(i)) {
    return true;
  } else {
-    if (i->src1.value == i->src2.value) {
+    Value* src1 = i->src1.value;
    Value* src2 = i->src2.value;
    if (SameValueOrEqualConstant(src1, src2)) {
      i->Replace(&OPCODE_ASSIGN_info, 0);
      i->set_src1(builder->LoadZero(i->dest->type));
      return true;
    }
    uint64_t type_mask = GetScalarTypeMask(i->dest->type);
    auto [constant_value, variable_value] =
        i->BinaryValueArrangeAsConstAndVar();
    ScalarNZM nzm1 = GetScalarNZM(src1);
    ScalarNZM nzm2 = GetScalarNZM(src2);
    if ((nzm1 & nzm2) ==
        0) {  // no bits of the two sources overlap, this ought to be an OR
      // cs:optimizing
      /* i->Replace(&OPCODE_OR_info, 0);
      i->set_src1(src1);
      i->set_src2(src2);*/
      i->opcode = &OPCODE_OR_info;
      return true;
    }
    if (nzm1 == 1ULL && nzm2 == 1ULL) {
      if (constant_value) {
        return CheckBooleanXor1(i, builder, variable_value);
      } else {
        return CheckXorOfTwoBools(i, builder, src1, src2);
      }
    }
    uint64_t type_mask = GetScalarTypeMask(i->dest->type);
    if (!constant_value) return false;
@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i,
 }
 bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) {
 retry_and_simplification:
  auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
  if (!constant_value) {
    // added this for srawi
-    uint64_t nzml = GetScalarNZM(i->src1.value);
+    ScalarNZM nzml = GetScalarNZM(i->src1.value);
-    uint64_t nzmr = GetScalarNZM(i->src2.value);
+    ScalarNZM nzmr = GetScalarNZM(i->src2.value);
    if ((nzml & nzmr) == 0) {
      i->Replace(&OPCODE_ASSIGN_info, 0);
@ -524,9 +642,15 @@ retry_and_simplification:
  // todo: check if masking with mask that covers all of zero extension source
  uint64_t type_mask = GetScalarTypeMask(i->dest->type);
  // if masking with entire width, pointless instruction so become an assign
-  if (constant_value->AsUint64() == type_mask) {
+  ScalarNZM nzm = GetScalarNZM(variable_value);
  // if masking with entire width, pointless instruction so become an assign
  // chrispy: changed this to use the nzm instead, this optimizes away many and
  // instructions
  // chrispy: changed this again. detecting if nzm is a subset of and mask, if
  // so eliminate ex: (bool value) & 0xff = (bool value). the nzm is not equal
  // to the mask, but it is a subset so can be elimed
  if ((constant_value->AsUint64() & nzm) == nzm) {
    i->Replace(&OPCODE_ASSIGN_info, 0);
    i->set_src1(variable_value);
    return true;
@ -555,7 +679,7 @@ retry_and_simplification:
        Value* or_left = true_variable_def->src1.value;
        Value* or_right = true_variable_def->src2.value;
-        uint64_t left_nzm = GetScalarNZM(or_left);
+        ScalarNZM left_nzm = GetScalarNZM(or_left);
        // use the other or input instead of the or output
        if ((constant_value->AsUint64() & left_nzm) == 0) {
@ -565,7 +689,7 @@ retry_and_simplification:
          return true;
        }
-        uint64_t right_nzm = GetScalarNZM(or_right);
+        ScalarNZM right_nzm = GetScalarNZM(or_right);
        if ((constant_value->AsUint64() & right_nzm) == 0) {
          i->Replace(&OPCODE_AND_info, 0);
@ -593,6 +717,21 @@ retry_and_simplification:
  return false;
 }
 bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
  Value* src1 = i->src1.value;
  Value* src2 = i->src2.value;
  ScalarNZM nzm1 = GetScalarNZM(src1);
  ScalarNZM nzm2 = GetScalarNZM(src2);
  if ((nzm1 & nzm2) == 0) {  // no bits overlap, there will never be a carry
                             // from any bits to any others, make this an OR
    /* i->Replace(&OPCODE_OR_info, 0);
    i->set_src1(src1);
    i->set_src2(src2);*/
    i->opcode = &OPCODE_OR_info;
    return true;
  }
  auto [definition, added_constant] =
      i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info);
@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
    return false;
  }
-  uint64_t nzm_for_var = GetScalarNZM(variable);
+  ScalarNZM nzm_for_var = GetScalarNZM(variable);
  Opcode cmpop = i->opcode->num;
  uint64_t constant_unpacked = constant_value->AsUint64();
  uint64_t signbit_for_var = GetScalarSignbitMask(variable->type);
@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
    i->set_src1(variable);
    return true;
  }
  if (cmpop == OPCODE_COMPARE_ULE &&
      constant_unpacked ==
          0) {  // less than or equal to zero = (== 0) = IS_FALSE
    i->Replace(&OPCODE_IS_FALSE_info, 0);
    i->set_src1(variable);
    return true;
  }
  // todo: OPCODE_COMPARE_NE too?
  if (cmpop == OPCODE_COMPARE_EQ &&
      def_opcode == OPCODE_NOT) {  // i see this a lot around addic insns
@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
    return false;
  }
-  uint64_t input_nzm = GetScalarNZM(input);
+  ScalarNZM input_nzm = GetScalarNZM(input);
  if (istrue &&
      input_nzm == 1) {  // doing istrue on a value thats already a bool bitwise
@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
   input_def = input_def->GetDestDefSkipAssigns();*/
  return false;
 }
 // Simplifies `variable >> shift` where `shift` is a constant.
 //
 // The only pattern handled is a CNTLZ result shifted right by log2 of the bit
 // width of the CNTLZ input, which is a branchless "is zero" test. The CNTLZ may
 // be reached through assigns, zero/sign extends, truncates and ANDs with
 // 0xFFFFFFFF.
 //
 //   i        - the shift instruction being simplified; rewritten in place.
 //   builder  - used to emit the replacement IS_FALSE when one is needed.
 //   variable - the value being shifted.
 //   shift    - the constant shift amount.
 //
 // Returns true if `i` was rewritten, false if nothing matched.
 bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
                                         hir::HIRBuilder* builder,
                                         hir::Value* variable,
                                         unsigned int shift) {
  // Only these shift amounts can be a shift of an lzcnt result.
  if (shift < 3 || shift > 6) {
    return false;
  }
  // is possible shift of lzcnt res, do some tunneling
  unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX |
                        MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX |
                        MOVTUNNEL_AND32FF;
  hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags);
  if (!var_def || var_def->opcode != &OPCODE_CNTLZ_info) {
    return false;
  }
  Value* lz_input = var_def->src1.value;
  TypeName type_of_lz_input = lz_input->type;
  size_t shift_for_zero =
      xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT);
  if (shift != shift_for_zero) {
    return false;
  }
  // we ought to be OPCODE_IS_FALSE!
  /*
      explanation: if an input to lzcnt is zero, the result will be the bit
     size of the input type, which is always a power of two. any nonzero
     input gives a result less than the bit size, so you can test for zero
     by doing, for instance with a 32 bit value, lzcnt32(input) >> 5.
      this is a very common way of testing for zero without branching on ppc,
     and the xb360 ppc compiler used it a lot. we optimize this away for
     simplicity and to enable further optimizations, but actually this is
     also quite fast on modern x86 processors as well, for instance on zen 2
     the reciprocal throughput of lzcnt is 0.25, meaning four can be executed
     in one cycle
  */
  if (variable->type != INT8_TYPE) {
    // The shift result is not INT8: emit the IS_FALSE just ahead of `i` and
    // turn `i` into a zero extension of it.
    Value* isfalsetest = builder->IsFalse(lz_input);
    isfalsetest->def->MoveBefore(i);
    i->Replace(&OPCODE_ZERO_EXTEND_info, 0);
    i->set_src1(isfalsetest);
  } else {
    // INT8 shift result: `i` itself becomes the IS_FALSE.
    i->Replace(&OPCODE_IS_FALSE_info, 0);
    i->set_src1(lz_input);
  }
  return true;
 }
 // Entry point for simplifying a logical shift right. Only shifts by a constant
 // amount are examined; those are forwarded to CheckSHRByConst. Returns true
 // if `i` was rewritten.
 bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) {
  Value* const shifted = i->src1.value;
  Value* const amount = i->src2.value;
  // Both operands must be present and the shift amount must be a constant.
  const bool shift_is_const = shifted && amount && amount->IsConstant();
  return shift_is_const &&
         CheckSHRByConst(i, builder, shifted, amount->AsUint32());
 }
 // Simplifies an arithmetic shift right. If the mask returned by GetScalarNZM
 // for the shifted value has no overlap with the sign bit of its type, the sign
 // bit can never be set, so the instruction is turned into OPCODE_SHR.
 // Returns true if the opcode was changed.
 bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) {
  Value* l = i->src1.value;
  ScalarNZM l_nzm = GetScalarNZM(l);
  uint64_t signbit_mask = GetScalarSignbitMask(l->type);
  /*
    todo: folding this requires the mask of constant bits
  Value* r = i->src2.value;
  size_t typesize = GetTypeSize(l->type);
  if (r->IsConstant()) {
    uint32_t const_r = r->AsUint32();
    if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to
  fill the result with the signbit of the input.
    }
  }*/
  if ((l_nzm & signbit_mask) == 0) {  // signbit will never be set, might as
                                      // well be an SHR. (this does happen)
    i->opcode = &OPCODE_SHR_info;
    return true;
  }
  return false;
 }
 bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
  bool result = false;
  auto block = builder->first_block();
@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
      // vector types use the same opcodes as scalar ones for AND/OR/XOR! we
      // don't handle these in our simplifications, so skip
      if (i->dest && IsScalarIntegralType(i->dest->type)) {
-        if (i->opcode == &OPCODE_OR_info) {
+        Opcode iop = i->opcode->num;
        if (iop == OPCODE_OR) {
          result |= CheckOr(i, builder);
-        } else if (i->opcode == &OPCODE_XOR_info) {
+        } else if (iop == OPCODE_XOR) {
          result |= CheckXor(i, builder);
-        } else if (i->opcode == &OPCODE_AND_info) {
+        } else if (iop == OPCODE_AND) {
          result |= CheckAnd(i, builder);
-        } else if (i->opcode == &OPCODE_ADD_info) {
+        } else if (iop == OPCODE_ADD) {
          result |= CheckAdd(i, builder);
-        } else if (IsScalarBasicCmp(i->opcode->num)) {
+        } else if (IsScalarBasicCmp(iop)) {
          result |= CheckScalarConstCmp(i, builder);
-        } else if (i->opcode == &OPCODE_IS_FALSE_info ||
+        } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
                   i->opcode == &OPCODE_IS_TRUE_info) {
          result |= CheckIsTrueIsFalse(i, builder);
        } else if (iop == OPCODE_SHR) {
          result |= CheckSHR(i, builder);
        } else if (iop == OPCODE_SHA) {
          result |= CheckSAR(i, builder);
        }
      }
@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) {
  }
  return false;
 }
 bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
  // Run over the instructions and rename assigned variables:
  //   v1 = v0
@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
  while (block) {
    auto i = block->instr_head;
    while (i) {
-      uint32_t signature = i->opcode->signature;
+      i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) {
      if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
        bool modified = false;
-        i->set_src1(CheckValue(i->src1.value, modified));
+        i->set_srcN(CheckValue(value, modified), idx);
        result |= modified;
-      }
+      });
      if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
        bool modified = false;
        i->set_src2(CheckValue(i->src2.value, modified));
        result |= modified;
      }
      if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
        bool modified = false;
        i->set_src3(CheckValue(i->src3.value, modified));
        result |= modified;
      }
      i = i->next;
    }
@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
  return result;
 }
 // Bundle of state describing one TRUNCATE instruction and its operand.
 // NOTE(review): nothing in the visible part of this file reads or writes this
 // struct - confirm whether it is still needed.
 struct TruncateSimplifier {
  // Source and destination types of the truncation.
  TypeName type_from;
  TypeName type_to;
  // Sizes of those types.
  uint32_t sizeof_from;
  uint32_t sizeof_to;
  uint32_t bit_sizeof_from;
  uint32_t bit_sizeof_to;
  // Masks for the source and destination types.
  uint64_t typemask_from;
  uint64_t typemask_to;
  hir::HIRBuilder* builder;
  // The truncate instruction, the value it truncates, and that value's def.
  hir::Instr* truncate_instr;
  hir::Value* truncated_value;
  hir::Instr* truncated_value_def;
 };
 // Tries to move a TRUNCATE onto the instruction that produced its input.
 //
 // Two producers are handled (looked up through any ASSIGNs):
 //  * SHL by a constant smaller than the bit width of the truncated type:
 //    the shift input is truncated first and `i` becomes the SHL.
 //  * LOAD_CONTEXT when truncating an 8 byte value to 4 bytes: a new INT32
 //    load of the same context offset is emitted and `i` becomes an ASSIGN.
 //
 // Returns true if `i` was rewritten.
 bool SimplificationPass::BackpropTruncations(hir::Instr* i,
                                             hir::HIRBuilder* builder) {
  if (i->opcode != &OPCODE_TRUNCATE_info) {
    return false;
  }
  const TypeName wide_type = i->src1.value->type;
  const TypeName narrow_type = i->dest->type;
  const uint32_t wide_bytes = static_cast<uint32_t>(GetTypeSize(wide_type));
  const uint32_t narrow_bytes = static_cast<uint32_t>(GetTypeSize(narrow_type));

  Instr* producer = i->src1.value->GetDefSkipAssigns();
  if (!producer) {
    return false;
  }

  switch (producer->opcode->num) {
    case OPCODE_SHL: {
      Value* shift_amount = producer->src2.value;
      if (!shift_amount->IsConstant()) {
        return false;
      }
      // The shift must stay inside the narrow type's bit width.
      if (shift_amount->AsUint32() >= (narrow_bytes * CHAR_BIT)) {
        return false;
      }
      Value* narrowed_input =
          builder->Truncate(producer->src1.value, narrow_type);
      narrowed_input->def->MoveBefore(i);
      i->Replace(&OPCODE_SHL_info, 0);
      i->set_src1(narrowed_input);
      i->set_src2(shift_amount);
      return true;
    }
    case OPCODE_LOAD_CONTEXT: {
      if (wide_bytes != 8 || narrow_bytes != 4) {
        return false;
      }
      // Load the 4 byte value directly from the same context offset.
      Value* narrow_load =
          builder->LoadContext(producer->src1.offset, INT32_TYPE);
      narrow_load->def->MoveBefore(producer);
      i->Replace(&OPCODE_ASSIGN_info, 0);
      i->set_src1(narrow_load);
      return true;
    }
    default:
      return false;
  }
 }
 // Runs the per-instruction BackpropTruncations over every instruction of every
 // block owned by `builder`. Returns true if any instruction was rewritten.
 bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
  bool changed = false;
  for (auto block = builder->first_block(); block; block = block->next) {
    for (auto instr = block->instr_head; instr; instr = instr->next) {
      changed |= BackpropTruncations(instr, builder);
    }
  }
  return changed;
 }
 Value* SimplificationPass::CheckValue(Value* value, bool& result) {
  auto def = value->def;
  if (def && def->opcode == &OPCODE_ASSIGN_info) {
--- a/src/xenia/cpu/compiler/passes/simplification_pass.h
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.h
@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass {
  bool SimplifyAssignments(hir::HIRBuilder* builder);
  hir::Value* CheckValue(hir::Value* value, bool& result);
  bool SimplifyBitArith(hir::HIRBuilder* builder);
  bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
  bool BackpropTruncations(hir::HIRBuilder* builder);
  // handle either or or xor with 0
  bool CheckOrXorZero(hir::Instr* i);
  bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass {
  bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder);
  bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder);
  bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder);
  bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder,
                       hir::Value* variable, unsigned int shift);
  bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder);
  bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder);
  // called by CheckXor, handles transforming a 1 bit value xored against 1
  bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder,
                        hir::Value* xored);
  bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder,
                          hir::Value* b1, hir::Value* b2);
  // for rlwinm
  bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder);
  bool TransformANDROLORSHLSeq(
--- a/src/xenia/cpu/hir/instr.cc
+++ b/src/xenia/cpu/hir/instr.cc
@ -14,38 +14,15 @@
 namespace xe {
 namespace cpu {
 namespace hir {
-
+void Instr::set_srcN(Value* value, uint32_t idx) {
-void Instr::set_src1(Value* value) {
+  if (srcs[idx].value == value) {
  if (src1.value == value) {
    return;
  }
-  if (src1_use) {
+  if (srcs_use[idx]) {
-    src1.value->RemoveUse(src1_use);
+    srcs[idx].value->RemoveUse(srcs_use[idx]);
  }
-  src1.value = value;
+  srcs[idx].value = value;
-  src1_use = value ? value->AddUse(block->arena, this) : NULL;
+  srcs_use[idx] = value ? value->AddUse(block->arena, this) : nullptr;
 }
 void Instr::set_src2(Value* value) {
  if (src2.value == value) {
    return;
  }
  if (src2_use) {
    src2.value->RemoveUse(src2_use);
  }
  src2.value = value;
  src2_use = value ? value->AddUse(block->arena, this) : NULL;
 }
 void Instr::set_src3(Value* value) {
  if (src3.value == value) {
    return;
  }
  if (src3_use) {
    src3.value->RemoveUse(src3_use);
  }
  src3.value = value;
  src3_use = value ? value->AddUse(block->arena, this) : NULL;
 }
 void Instr::MoveBefore(Instr* other) {
@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() {
  }
  return current_def;
 }
 // Starting at this instruction, follows the chain of defining instructions
 // through the kinds of "move-like" instructions enabled in *tunnel_flags
 // (MOVTUNNEL_* bits): ASSIGN, ZERO_EXTEND, SIGN_EXTEND, TRUNCATE and AND with
 // the constant 0xFFFFFFFF.
 //
 // On return *tunnel_flags holds the kinds that were actually traversed. The
 // result is the first instruction that could not be tunnelled through, or
 // nullptr when the chain reached a value that has no defining instruction.
 Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) {
  const unsigned int allowed = *tunnel_flags;
  unsigned int traversed = 0;
  Instr* cursor = this;
  while (cursor) {
    // Map the opcode to the tunnel kind it belongs to (0 = not tunnelable).
    unsigned int kind = 0;
    switch (cursor->opcode->num) {
      case OPCODE_ASSIGN:
        kind = MOVTUNNEL_ASSIGNS;
        break;
      case OPCODE_ZERO_EXTEND:
        kind = MOVTUNNEL_MOVZX;
        break;
      case OPCODE_SIGN_EXTEND:
        kind = MOVTUNNEL_MOVSX;
        break;
      case OPCODE_TRUNCATE:
        kind = MOVTUNNEL_TRUNCATE;
        break;
      case OPCODE_AND:
        kind = MOVTUNNEL_AND32FF;
        break;
      default:
        break;
    }
    if ((allowed & kind) == 0) {
      break;  // unknown opcode, or caller did not enable this kind
    }

    Value* passthrough = nullptr;
    if (kind == MOVTUNNEL_AND32FF) {
      // Only an AND against the constant 0xFFFFFFFF counts as a move.
      auto [constant, nonconst] = cursor->BinaryValueArrangeAsConstAndVar();
      if (!constant || constant->AsUint64() != 0xFFFFFFFF) {
        break;
      }
      passthrough = nonconst;
    } else {
      passthrough = cursor->src1.value;
    }
    cursor = passthrough->def;
    traversed |= kind;
  }
  *tunnel_flags = traversed;
  return cursor;
 }
 }  // namespace hir
 }  // namespace cpu
 }  // namespace xe
--- a/src/xenia/cpu/hir/instr.h
+++ b/src/xenia/cpu/hir/instr.h
@ -25,6 +25,14 @@ namespace hir {
 class Block;
 class Label;
 // todo: better name
 // Bit flags naming the instruction kinds Instr::GetDestDefTunnelMovs may skip
 // over while walking back to a value's real definition. Combine with `|`.
 enum MovTunnel {
  MOVTUNNEL_ASSIGNS = 1 << 0,   // OPCODE_ASSIGN
  MOVTUNNEL_MOVZX = 1 << 1,     // OPCODE_ZERO_EXTEND
  MOVTUNNEL_MOVSX = 1 << 2,     // OPCODE_SIGN_EXTEND
  MOVTUNNEL_TRUNCATE = 1 << 3,  // OPCODE_TRUNCATE
  MOVTUNNEL_AND32FF = 1 << 4,   // tunnel through and with 0xFFFFFFFF
 };
 class Instr {
 public:
@ -44,17 +52,28 @@ class Instr {
  } Op;
  Value* dest;
-  Op src1;
+  union {
-  Op src2;
+    struct {
-  Op src3;
+      Op src1;
      Op src2;
      Op src3;
    };
    Op srcs[3];
  };
  union {
    struct {
      Value::Use* src1_use;
      Value::Use* src2_use;
      Value::Use* src3_use;
    };
    Value::Use* srcs_use[3];
  };
  void set_srcN(Value* value, uint32_t idx);
  void set_src1(Value* value) { set_srcN(value, 0); }
-  Value::Use* src1_use;
+  void set_src2(Value* value) { set_srcN(value, 1); }
  Value::Use* src2_use;
  Value::Use* src3_use;
-  void set_src1(Value* value);
+  void set_src3(Value* value) { set_srcN(value, 2); }
  void set_src2(Value* value);
  void set_src3(Value* value);
  void MoveBefore(Instr* other);
  void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr
  }
  Instr* GetDestDefSkipAssigns();
  Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags);
  // returns [def op, constant]
  std::pair<Value*, Value*> BinaryValueArrangeByDefOpAndConstant(
      const OpcodeInfo* op_ptr) {
@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr
    }
    return result;
  }
  /*
  Invokes the provided lambda callback on each operand that is a Value. Callback
  is invoked with Value*, uint32_t index
 */
  template <typename TCallable>
  void VisitValueOperands(TCallable&& call_for_values) {
    uint32_t signature = opcode->signature;
    OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
    UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3);
    if (t_src1 == OPCODE_SIG_TYPE_V) {
      call_for_values(src1.value, 0);
    }
    if (t_src2 == OPCODE_SIG_TYPE_V) {
      call_for_values(src2.value, 1);
    }
    if (t_src3 == OPCODE_SIG_TYPE_V) {
      call_for_values(src3.value, 2);
    }
  }
 };
 }  // namespace hir
--- a/src/xenia/cpu/hir/value.cc
+++ b/src/xenia/cpu/hir/value.cc
@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() {
    return nullptr;
  }
 }
 // Forwards to Instr::GetDestDefTunnelMovs on this value's defining instruction.
 // Returns nullptr when the value has no defining instruction; in that case
 // *tunnel_flags is left untouched.
 hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
  return def ? def->GetDestDefTunnelMovs(tunnel_flags) : nullptr;
 }
 }  // namespace hir
 }  // namespace cpu
 }  // namespace xe
--- a/src/xenia/cpu/hir/value.h
+++ b/src/xenia/cpu/hir/value.h
@ -598,6 +598,8 @@ class Value {
  void CountLeadingZeros(const Value* other);
  bool Compare(Opcode opcode, Value* other);
  hir::Instr* GetDefSkipAssigns();
  // tunnel_flags is updated to the kinds we actually traversed
  hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
 private:
  static bool CompareInt8(Opcode opcode, Value* a, Value* b);
--- a/src/xenia/cpu/ppc/ppc_context.h
+++ b/src/xenia/cpu/ppc/ppc_context.h
@ -246,30 +246,7 @@ enum class PPCRegister {
 };
 #pragma pack(push, 8)
-typedef struct PPCContext_s {
+typedef struct alignas(64) PPCContext_s {
  // Must be stored at 0x0 for now.
  // TODO(benvanik): find a nice way to describe this to the JIT.
  ThreadState* thread_state;  // 0x0
  // TODO(benvanik): this is getting nasty. Must be here.
  uint8_t* virtual_membase;  // 0x8
  // Most frequently used registers first.
  uint64_t lr;      // 0x10 Link register
  uint64_t ctr;     // 0x18 Count register
  uint64_t r[32];   // 0x20 General purpose registers
  double f[32];     // 0x120 Floating-point registers
  vec128_t v[128];  // 0x220 VMX128 vector registers
  // XER register:
  // Split to make it easier to do individual updates.
  uint8_t xer_ca;  // 0xA20
  uint8_t xer_ov;  // 0xA21
  uint8_t xer_so;  // 0xA22
  // Condition registers:
  // These are split to make it easier to do DCE on unused stores.
  uint64_t cr() const;
  void set_cr(uint64_t value);
  union {
    uint32_t value;
    struct {
@ -395,6 +372,25 @@ typedef struct PPCContext_s {
    } bits;
  } fpscr;  // Floating-point status and control register
  // Most frequently used registers first.
  uint64_t r[32];   // 0x20 General purpose registers
  uint64_t ctr;     // 0x18 Count register
  uint64_t lr;      // 0x10 Link register
  double f[32];     // 0x120 Floating-point registers
  vec128_t v[128];  // 0x220 VMX128 vector registers
  // XER register:
  // Split to make it easier to do individual updates.
  uint8_t xer_ca;
  uint8_t xer_ov;
  uint8_t xer_so;
  // Condition registers:
  // These are split to make it easier to do DCE on unused stores.
  uint64_t cr() const;
  void set_cr(uint64_t value);
  uint8_t vscr_sat;
  // uint32_t get_fprf() {
@ -425,7 +421,8 @@ typedef struct PPCContext_s {
  // Value of last reserved load
  uint64_t reserved_val;
-
+  ThreadState* thread_state;
  uint8_t* virtual_membase;  
  static std::string GetRegisterName(PPCRegister reg);
  std::string GetStringFromValue(PPCRegister reg) const;
  void SetValueFromString(PPCRegister reg, std::string value);
--- a/src/xenia/cpu/thread_state.cc
+++ b/src/xenia/cpu/thread_state.cc
@ -18,12 +18,50 @@
 #include "xenia/cpu/processor.h"
 #include "xenia/xbox.h"
 namespace xe {
 namespace cpu {
 thread_local ThreadState* thread_state_ = nullptr;
 // Allocates the memory backing a PPCContext and returns a pointer to where
 // the context itself starts, or nullptr if no candidate address could be
 // allocated.
 //
 // The returned pointer has 0xE0000000 in its low 32 bits, and one allocation
 // granule directly in front of it is part of the same allocation.
 static void* AllocateContext() {
  const size_t granule = xe::memory::allocation_granularity();
  const size_t total_size = granule + sizeof(ppc::PPCContext);
  /*
      we want our register which points to the context to have 0xE0000000 in
     the low 32 bits, for checking for whether we need the 4k offset, but also
     if we allocate starting from the page before we allow backends to index
     negatively to get to their own backend specific data, which makes full
     use of int8 displacement
      the downside is we waste most of one granule and probably a fair bit of
     the one starting at 0xE0 by using a direct virtual memory allocation
     instead of malloc
  */
  // Walk the candidate upper-32-bit values until one allocation succeeds.
  for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) {
    const uintptr_t context_address =
        (static_cast<uint64_t>(pos32) << 32) | 0xE0000000;
    void* const requested_base =
        reinterpret_cast<void*>(context_address - granule);
    void* const base = memory::AllocFixed(
        requested_base, total_size, memory::AllocationType::kReserveCommit,
        memory::PageAccess::kReadWrite);
    if (!base) {
      continue;
    }
    // now we have a ctx ptr with the e0 constant in low, and one granule
    // allocated before it
    return static_cast<char*>(base) + granule;
  }
  assert_always("giving up on allocating context, likely leaking contexts");
  return nullptr;
 }
 // Releases a context obtained from AllocateContext. `ctx` is the pointer that
 // AllocateContext returned, which sits one allocation granule past the true
 // start of the allocation, so step back by that amount before releasing.
 static void FreeContext(void* ctx) {
  const size_t granule = xe::memory::allocation_granularity();
  char* const allocation_base = reinterpret_cast<char*>(ctx) - granule;
  // NOTE(review): a length of 0 matches what Win32 VirtualFree wants for
  // MEM_RELEASE; confirm DeallocFixed also copes with 0 on non-Windows hosts.
  memory::DeallocFixed(allocation_base, 0,
                       memory::DeallocationType::kRelease);
 }
 ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
                         uint32_t stack_base, uint32_t pcr_address)
    : processor_(processor),
@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
  backend_data_ = processor->backend()->AllocThreadData();
  // Allocate with 64b alignment.
-  context_ = memory::AlignedAlloc<ppc::PPCContext>(64);
+
  context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext());  // memory::AlignedAlloc<ppc::PPCContext>(64);
  processor->backend()->InitializeBackendContext(context_);
  assert_true(((uint64_t)context_ & 0x3F) == 0);
  std::memset(context_, 0, sizeof(ppc::PPCContext));
@ -62,8 +102,10 @@ ThreadState::~ThreadState() {
  if (thread_state_ == this) {
    thread_state_ = nullptr;
  }
-
+  if (context_) {
-  memory::AlignedFree(context_);
+    FreeContext(reinterpret_cast<void*>(context_));
  }
 // memory::AlignedFree(context_);
 }
 void ThreadState::Bind(ThreadState* thread_state) {