From 3717167bbecb9c99fc7f45ea28f6e9bd88b5dd5c Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sun, 17 Jul 2022 09:52:40 -0700
Subject: [PATCH 1/2] Preload ThreeFloatMask in DOT_PRODUCT_3

Use shuffle_ps instead of broadcastss; broadcastss is slower on many Intel and AMD processors and encodes to the same number of bytes as shuffle_ps.
Detect and optimize away PERMUTE with a zero src2 and src3 in constant_propagation_pass instead of in the x64 sequence.
For constant PERMUTE, do the Xor/And prior to LoadConstantXmm instead of in the generated code.
Simplified code for PERMUTE.
Added a simplification rule that detects (lzcnt(x) >> log2(bitsizeof_x)) == (x == 0) (a standalone sketch of this identity follows below).
Added set_srcN(value, idx), which can be used to set the nth source of an instruction; this makes more sense than having three different functions that only differ by the field they touch.
Added Value::VisitValueOperands for iterating all Value operands an instruction has.
Added BackpropTruncations code to simplification_pass.
Changed the (void**) dereferences of raw_context that are done to grab thread_state to instead reference PPCContext and its thread_state field.
Moved the thread_state field to the tail of PPCContext.
Moved membase to the tail of PPCContext, since it is now reloaded very infrequently.
Rearranged PPCContext so that the condition registers come first (most accesses to them can't be SSA'd), and moved lr and ctr to after the GP regs since they are not accessed as often as the main GP regs. This way the most frequently accessed registers are reachable via a rel8 displacement instead of rel32 (ideally we would place only certain CRs at the start, but xenia does pointer arithmetic on CR0's offset to get CRn).
Use alignas(64) on PPCContext to guarantee its alignment and padding.
Map PPCContext specially so that the low 32 bits of the context register are 0xE0000000, for the 4 KB page offset check. Also allocate the page before it, so that backends can store information that is not relevant to the PPCContext on that page and reference that data in the generated asm via an 8-bit or 32-bit signed displacement. Currently this page is not being utilized, but I plan on stashing some data critical to the x86 backend there.
Changed many incorrect AVX instructions; they worked, but they were not intended for the data they operated on, meaning they crossed domains and caused a 1-2 cycle stall each time.
Added SimdDomain checking/deduction to X64Emitter.
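Editor's note: a minimal standalone sketch of the identity the new lzcnt simplification rule targets, using C++20 std::countl_zero as a stand-in for the hardware lzcnt / HIR OPCODE_CNTLZ. IsZeroViaLzcnt is a hypothetical helper for illustration only, not code from this patch.

// For a 32-bit value, lzcnt(x) == 32 only when x == 0, and 32 >> 5 == 1;
// any nonzero x gives lzcnt(x) <= 31, and 31 >> 5 == 0. So the shift result
// is exactly the "x is zero" boolean, which is what the pass rewrites to
// IS_FALSE.
#include <bit>
#include <cassert>
#include <cstdint>

static bool IsZeroViaLzcnt(uint32_t x) {
  // std::countl_zero(0u) is 32 by definition, matching lzcnt's behavior.
  return (std::countl_zero(x) >> 5) != 0;
}

int main() {
  for (uint32_t x : {0u, 1u, 0x80000000u, 0xFFFFFFFFu}) {
    assert(IsZeroViaLzcnt(x) == (x == 0));
  }
  return 0;
}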
Used SimdDomain code to fix a lot of float/int domain stalls Use the low 32 bits of the context register instead of constant 0xE0000000 in ComputeAddress Special path for SELECT_V128 with result of comparison that will use a blend instruction instead of and/or Many HIR optimizations added in simp pass A bunch of other stuff running out of time to write this msg --- src/xenia/cpu/backend/backend.h | 4 + src/xenia/cpu/backend/x64/x64_emitter.cc | 114 +++++- src/xenia/cpu/backend/x64/x64_emitter.h | 40 +- src/xenia/cpu/backend/x64/x64_seq_memory.cc | 20 +- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 8 +- src/xenia/cpu/backend/x64/x64_sequences.cc | 175 ++++++--- .../passes/constant_propagation_pass.cc | 12 + .../compiler/passes/simplification_pass.cc | 363 ++++++++++++++++-- .../cpu/compiler/passes/simplification_pass.h | 13 + src/xenia/cpu/hir/instr.cc | 110 ++++-- src/xenia/cpu/hir/instr.h | 61 ++- src/xenia/cpu/hir/value.cc | 7 + src/xenia/cpu/hir/value.h | 2 + src/xenia/cpu/ppc/ppc_context.h | 47 ++- src/xenia/cpu/thread_state.cc | 50 ++- 15 files changed, 856 insertions(+), 170 deletions(-) diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index 054d7e752..aa9097602 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -63,6 +63,10 @@ class Backend { virtual void InstallBreakpoint(Breakpoint* breakpoint) {} virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {} virtual void UninstallBreakpoint(Breakpoint* breakpoint) {} + // ctx points to the start of a ppccontext, ctx - page_allocation_granularity + // up until the start of ctx may be used by the backend to store whatever data + // they want + virtual void InitializeBackendContext(void* ctx) {} protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index c6f2d6180..6d5690c2f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -32,6 +32,9 @@ #include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/function.h" #include "xenia/cpu/function_debug_info.h" +#include "xenia/cpu/hir/instr.h" +#include "xenia/cpu/hir/opcodes.h" +#include "xenia/cpu/hir/value.h" #include "xenia/cpu/processor.h" #include "xenia/cpu/symbol.h" #include "xenia/cpu/thread_state.h" @@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() { } uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; uint32_t str_ptr = uint32_t(thread_state->context()->r[3]); // uint16_t str_len = uint16_t(thread_state->context()->r[4]); auto str = thread_state->memory()->TranslateVirtual(str_ptr); @@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { } uint64_t TrapDebugBreak(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; XELOGE("tw/td forced trap hit! This should be a crash!"); if (cvars::break_on_debugbreak) { xe::debugging::Break(); @@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; // TODO(benvanik): required? 
assert_not_zero(target_address); @@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) { MovMem64(addr + 8, v.high); return ptr[addr]; } +static bool IsVectorCompare(const Instr* i) { + hir::Opcode op = i->opcode->num; + return op >= hir::OPCODE_VECTOR_COMPARE_EQ && + op <= hir::OPCODE_VECTOR_COMPARE_UGE; +} +static bool IsFlaggedVectorOp(const Instr* i) { + if (IsVectorCompare(i)) { + return true; + } + hir::Opcode op = i->opcode->num; + using namespace hir; + switch (op) { + case OPCODE_VECTOR_SUB: + case OPCODE_VECTOR_ADD: + case OPCODE_SWIZZLE: + return true; + } + return false; +} + +static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) { + switch (df->flags) { // check what datatype we compared as + case hir::INT16_TYPE: + case hir::INT32_TYPE: + case hir::INT8_TYPE: + case hir::INT64_TYPE: + return SimdDomain::INTEGER; + case hir::FLOAT32_TYPE: + case hir::FLOAT64_TYPE: // pretty sure float64 doesnt occur with vectors. + // here for completeness + return SimdDomain::FLOATING; + default: + return SimdDomain::DONTCARE; + } + return SimdDomain::DONTCARE; +} +// this list is incomplete +static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_LOAD_VECTOR_SHL: + case OPCODE_LOAD_VECTOR_SHR: + case OPCODE_VECTOR_CONVERT_F2I: + case OPCODE_VECTOR_MIN: // there apparently is no FLOAT32_TYPE for min/maxs + // flags + case OPCODE_VECTOR_MAX: + case OPCODE_VECTOR_SHL: + case OPCODE_VECTOR_SHR: + case OPCODE_VECTOR_SHA: + case OPCODE_VECTOR_ROTATE_LEFT: + case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this + case OPCODE_EXTRACT: + case OPCODE_INSERT: // apparently no f32 type for these two + return true; + } + return false; +} +static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_VECTOR_CONVERT_I2F: + case OPCODE_VECTOR_DENORMFLUSH: + case OPCODE_DOT_PRODUCT_3: + case OPCODE_DOT_PRODUCT_4: + case OPCODE_LOG2: + case OPCODE_POW2: + case OPCODE_RECIP: + case OPCODE_ROUND: + case OPCODE_SQRT: + case OPCODE_MUL: + case OPCODE_MUL_SUB: + case OPCODE_MUL_ADD: + case OPCODE_ABS: + return true; + } + return false; +} + +SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) { + hir::Instr* df = for_value->def; + if (!df) { + // todo: visit uses to figure out domain + return SimdDomain::DONTCARE; + + } else { + SimdDomain result = SimdDomain::DONTCARE; + + if (IsFlaggedVectorOp(df)) { + result = GetDomainForFlaggedVectorOp(df); + } else if (IsDefiniteIntegerDomainOpcode(df->opcode->num)) { + result = SimdDomain::INTEGER; + } else if (IsDefiniteFloatingDomainOpcode(df->opcode->num)) { + result = SimdDomain::FLOATING; + } + + // todo: check if still dontcare, if so, visit uses of the value to figure + // it out + return result; + } + + return SimdDomain::DONTCARE; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index d73d86fe1..519bc629a 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -44,7 +44,39 @@ enum RegisterFlags { REG_DEST = (1 << 0), REG_ABCD = (1 << 1), }; +/* + SSE/AVX/AVX512 has seperate move instructions/shuffle instructions for float + data and int data for a reason most processors implement two distinct + pipelines, one for the integer domain and one for the floating point domain + currently, xenia makes no distinction 
between the two. Crossing domains is + expensive. On Zen processors the penalty is one cycle each time you cross, + plus the two pipelines need to synchronize Often xenia will emit an integer + instruction, then a floating instruction, then integer again. this + effectively adds at least two cycles to the time taken These values will in + the future be used as tags to operations that tell them which domain to + operate in, if its at all possible to avoid crossing +*/ +enum class SimdDomain : uint32_t { + FLOATING, + INTEGER, + DONTCARE, + CONFLICTING // just used as a special result for PickDomain, different from + // dontcare (dontcare means we just dont know the domain, + // CONFLICTING means its used in multiple domains) +}; +static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) { + if (dom1 == dom2) { + return dom1; + } + if (dom1 == SimdDomain::DONTCARE) { + return dom2; + } + if (dom2 == SimdDomain::DONTCARE) { + return dom1; + } + return SimdDomain::CONFLICTING; +} enum XmmConst { XMMZero = 0, XMMOne, @@ -122,7 +154,7 @@ enum XmmConst { XMMLVSLTableBase, XMMLVSRTableBase, XMMSingleDenormalMask, - XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3 + XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMXenosF16ExtRangeStart }; @@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL, kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, - kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) - kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) + kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1) + kX64FastLoop = + 1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2) kX64EmitAVX512VBMI = 1 << 14 }; class ResolvableGuestCall { @@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator { FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } + SimdDomain DeduceSimdDomain(const hir::Value* for_value); protected: void* Emplace(const EmitFunctionInfo& func_info, diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 0646fdb39..33919d466 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -12,11 +12,11 @@ #include #include +#include "xenia/base/cvar.h" #include "xenia/base/memory.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/ppc/ppc_context.h" -#include "xenia/base/cvar.h" DEFINE_bool( elide_e0_check, false, @@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. + + // todo: do branching or use an alt membase and cmov e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const); + e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]); + + e.cmp(e.edx, e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); - e.add(e.eax, guest.reg().cvt32()); + e.add(e.eax, e.edx); + return e.GetMembaseReg() + e.rax; + } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. 
@@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000); + e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); e.add(e.eax, guest.reg().cvt32()); @@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -1113,7 +1119,7 @@ struct CACHE_CONTROL if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.movzx(e.eax, e.al); e.shl(e.eax, 12); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 1cca6469f..bde7e5904 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -1826,7 +1826,7 @@ struct PERMUTE_I32 } } }; -//todo: use this on const src1 +// todo: use this on const src1 static vec128_t FixupConstantShuf8(vec128_t input) { for (uint32_t i = 0; i < 16; ++i) { input.u8[i] ^= 0x03; @@ -1984,7 +1984,11 @@ struct SWIZZLE } else { src1 = i.src1; } - e.vpshufd(i.dest, src1, swizzle_mask); + if (element_type == INT32_TYPE && true) { + e.vpshufd(i.dest, src1, swizzle_mask); + } else if (element_type == FLOAT32_TYPE) { + e.vshufps(i.dest, src1, swizzle_mask); + } } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { assert_always(); } else { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5af7db24d..baf37984d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -717,6 +717,9 @@ struct SELECT_V128_I8 static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. // dest = src1 != 0 ? 
src2 : src3 + /* + chrispy: this is dead code, this sequence is never emitted + */ e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vpbroadcastd(e.xmm1, e.xmm1); @@ -737,11 +740,46 @@ struct SELECT_V128_I8 e.vpor(i.dest, e.xmm1); } }; + +enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps }; +static bool IsVectorCompare(const Instr* i) { + Opcode op = i->opcode->num; + return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE; +} +/* + OPCODE_SELECT does a bit by bit selection, however, if the selector is the + result of a comparison or if each element may only be 0xff or 0 we may use a + blend instruction instead +*/ +static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) { + const Instr* df = src1v->def; + if (!df) { + return PermittedBlend::NotPermitted; + } else { + if (!IsVectorCompare(df)) { + return PermittedBlend::NotPermitted; // todo: check ors, ands of + // condition + } else { + switch (df->flags) { // check what datatype we compared as + case INT16_TYPE: + case INT32_TYPE: + case INT8_TYPE: + return PermittedBlend::Int8; // use vpblendvb + case FLOAT32_TYPE: + return PermittedBlend::Ps; // use vblendvps + default: // unknown type! just ignore + return PermittedBlend::NotPermitted; + } + } + } +} struct SELECT_V128_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1; + PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value); + //todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps if (i.src1.is_constant) { e.LoadConstantXmm(src1, i.src1.constant()); } @@ -756,10 +794,16 @@ struct SELECT_V128_V128 e.LoadConstantXmm(src3, i.src3.constant()); } - // src1 ? src2 : src3; - e.vpandn(e.xmm3, src1, src2); - e.vpand(i.dest, src1, src3); - e.vpor(i.dest, i.dest, e.xmm3); + if (mayblend == PermittedBlend::Int8) { + e.vpblendvb(i.dest, src2, src3, src1); + } else if (mayblend == PermittedBlend::Ps) { + e.vblendvps(i.dest, src2, src3, src1); + } else { + // src1 ? src2 : src3; + e.vpandn(e.xmm3, src1, src2); + e.vpand(i.dest, src1, src3); + e.vpor(i.dest, i.dest, e.xmm3); + } } }; EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, @@ -2122,7 +2166,8 @@ struct MUL_ADD_V128 // TODO(benvanik): the vfmadd sequence produces slightly different results // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. - if (false && e.IsFeatureEnabled(kX64EmitFMA)) { + // chrispy: reenabled, i have added the DAZ behavior that was missing + if (true && e.IsFeatureEnabled(kX64EmitFMA)) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { @@ -2139,7 +2184,11 @@ struct MUL_ADD_V128 e.vfmadd231ps(i.dest, src1, src2); } else { // Dest not equal to anything - e.vmovdqa(i.dest, src1); + // e.vmovdqa(i.dest, + // src1); + // chrispy: vmovdqa was a domain pipeline + // hazard + e.vmovaps(i.dest, src1); e.vfmadd213ps(i.dest, src2, src3); } }); @@ -2152,7 +2201,8 @@ struct MUL_ADD_V128 // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm1, i.src3); + // e.vmovdqa(e.xmm1, i.src3); + e.vmovaps(e.xmm1, i.src3); src3 = e.xmm1; } } @@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, // ============================================================================ struct ABS_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; struct ABS_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); + e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); } }; struct ABS_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); @@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128 */ e.vstmxcsr(mxcsr_storage); + e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.mov(e.eax, 8); auto src1v = e.xmm0; @@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128 // so that in the future this could be optimized away if the top is known to // be zero. Right now im not sure that happens often though and its // currently not worth it also, maybe pre-and if constant - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm3, src1v, e.xmm2); + e.vandps(e.xmm2, src2v, e.xmm2); e.and_(mxcsr_storage, e.eax); e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to @@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); - // e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128 e.vcvtps2pd(e.ymm0, src1v); e.vcvtps2pd(e.ymm1, src2v); - /* - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.and_(mxcsr_storage, e.eax); - e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to - // go - - e.vcvtps2pd(e.ymm0, e.xmm3); - e.vcvtps2pd(e.ymm1, e.xmm2); - - - e.vmulpd(e.ymm5, e.ymm0, e.ymm1); - e.vextractf128(e.xmm4, e.ymm5, 1); - e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3 - e.vaddsd(e.xmm5, e.xmm5, e.xmm4); - e.not_(e.eax); - e.vaddsd(e.xmm2, e.xmm5, e.xmm3); - e.vcvtsd2ss(e.xmm1, e.xmm2); - - */ e.vmulpd(e.ymm3, e.ymm0, e.ymm1); e.vextractf128(e.xmm2, e.ymm3, 1); e.vaddpd(e.xmm3, e.xmm3, e.xmm2); @@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in - // e.vshufps(i.dest, e.xmm1, e.xmm1, 0); - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2846,10 +2876,17 @@ struct AND_I64 : Sequence> { }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpand(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + 
e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandps(dest, src2, src1); + } else { + e.vpand(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); @@ -2948,10 +2985,17 @@ struct AND_NOT_I64 struct AND_NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpandn(dest, src2, src1); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandnps(dest, src2, src1); + } else { + e.vpandn(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32, @@ -2994,10 +3038,17 @@ struct OR_I64 : Sequence> { }; struct OR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vorps(dest, src1, src2); + } else { + e.vpor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128); @@ -3039,10 +3090,17 @@ struct XOR_I64 : Sequence> { }; struct XOR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpxor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vxorps(dest, src1, src2); + } else { + e.vpxor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128); @@ -3078,8 +3136,15 @@ struct NOT_I64 : Sequence> { }; struct NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // dest = src ^ 0xFFFF... - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + + SimdDomain domain = + e.DeduceSimdDomain(i.src1.value); + if (domain == SimdDomain::FLOATING) { + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } else { + // dest = src ^ 0xFFFF... + e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } } }; EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128); @@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence> { } e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); - e.vmovaps(i.dest, e.xmm0); + e.vmovdqa(i.dest, e.xmm0); } static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. 
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 025b4114e..6a6a56330 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { i->Remove(); result = true; } + + else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() && + i->flags == INT8_TYPE /*probably safe for int16 too*/) { + /* + chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero + */ + + v->set_zero(VEC128_TYPE); + i->Remove(); + result = true; + } + break; } case OPCODE_INSERT: diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 10862fd54..8c1cc18c2 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -9,6 +9,7 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" +#include <__msvc_int128.hpp> #include "xenia/base/byte_order.h" #include "xenia/base/profiling.h" namespace xe { @@ -22,6 +23,52 @@ using namespace xe::cpu::hir; using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; +using vmask_portion_t = uint64_t; +template +struct Valuemask_t { + vmask_portion_t bits[Ndwords]; + + static Valuemask_t create_empty(vmask_portion_t fill = 0) { + Valuemask_t result; + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = fill; + } + return result; + } + template + Valuemask_t operate(TCallable&& oper) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i]); + } + return result; + } + template + Valuemask_t operate(TCallable&& oper, Valuemask_t other) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i], other.bits[i]); + } + return result; + } + Valuemask_t operator&(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; }, + other); + } + Valuemask_t operator|(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; }, + other); + } + Valuemask_t operator^(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; }, + other); + } + Valuemask_t operator~() const { + return operate([](vmask_portion_t x) { return ~x; }, other); + } +}; SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} @@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= SimplifyBitArith(builder); iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); + iter_result |= BackpropTruncations(builder); result |= iter_result; } while (iter_result); return true; @@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) { } return false; } +bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* xored) { + unsigned tunflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX; + + Instr* xordef = xored->GetDefTunnelMovs(&tunflags); + if (!xordef) { + return false; + } + + Opcode xorop = xordef->opcode->num; + bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; + + Value* new_value = 
nullptr; + if (xorop == OPCODE_IS_FALSE) { + new_value = builder->IsTrue(xordef->src1.value); + + } else if (xorop == OPCODE_IS_TRUE) { + new_value = builder->IsFalse(xordef->src1.value); + } else if (xorop == OPCODE_COMPARE_EQ) { + new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); + + } else if (xorop == OPCODE_COMPARE_NE) { + new_value = builder->CompareEQ(xordef->src1.value, xordef->src2.value); + } // todo: other conds + + if (!new_value) { + return false; + } + + new_value->def->MoveBefore(i); + + i->Replace(need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info, 0); + i->set_src1(new_value); + + return true; +} + +bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2) { + // todo: implement + return false; +} bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { if (CheckOrXorZero(i)) { return true; } else { - if (i->src1.value == i->src2.value) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + if (SameValueOrEqualConstant(src1, src2)) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(builder->LoadZero(i->dest->type)); return true; } - uint64_t type_mask = GetScalarTypeMask(i->dest->type); - auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + + if ((nzm1 & nzm2) == + 0) { // no bits of the two sources overlap, this ought to be an OR + // cs:optimizing + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + + i->opcode = &OPCODE_OR_info; + + return true; + } + + if (nzm1 == 1ULL && nzm2 == 1ULL) { + if (constant_value) { + return CheckBooleanXor1(i, builder, variable_value); + } else { + return CheckXorOfTwoBools(i, builder, src1, src2); + } + } + + uint64_t type_mask = GetScalarTypeMask(i->dest->type); if (!constant_value) return false; @@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i, } bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) { retry_and_simplification: + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); if (!constant_value) { // added this for srawi - uint64_t nzml = GetScalarNZM(i->src1.value); - uint64_t nzmr = GetScalarNZM(i->src2.value); + ScalarNZM nzml = GetScalarNZM(i->src1.value); + ScalarNZM nzmr = GetScalarNZM(i->src2.value); if ((nzml & nzmr) == 0) { i->Replace(&OPCODE_ASSIGN_info, 0); @@ -524,9 +642,15 @@ retry_and_simplification: // todo: check if masking with mask that covers all of zero extension source uint64_t type_mask = GetScalarTypeMask(i->dest->type); - // if masking with entire width, pointless instruction so become an assign - if (constant_value->AsUint64() == type_mask) { + ScalarNZM nzm = GetScalarNZM(variable_value); + // if masking with entire width, pointless instruction so become an assign + // chrispy: changed this to use the nzm instead, this optimizes away many and + // instructions + // chrispy: changed this again. detecting if nzm is a subset of and mask, if + // so eliminate ex: (bool value) & 0xff = (bool value). 
the nzm is not equal + // to the mask, but it is a subset so can be elimed + if ((constant_value->AsUint64() & nzm) == nzm) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(variable_value); return true; @@ -555,7 +679,7 @@ retry_and_simplification: Value* or_left = true_variable_def->src1.value; Value* or_right = true_variable_def->src2.value; - uint64_t left_nzm = GetScalarNZM(or_left); + ScalarNZM left_nzm = GetScalarNZM(or_left); // use the other or input instead of the or output if ((constant_value->AsUint64() & left_nzm) == 0) { @@ -565,7 +689,7 @@ retry_and_simplification: return true; } - uint64_t right_nzm = GetScalarNZM(or_right); + ScalarNZM right_nzm = GetScalarNZM(or_right); if ((constant_value->AsUint64() & right_nzm) == 0) { i->Replace(&OPCODE_AND_info, 0); @@ -593,6 +717,21 @@ retry_and_simplification: return false; } bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry + // from any bits to any others, make this an OR + + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + i->opcode = &OPCODE_OR_info; + return true; + } + auto [definition, added_constant] = i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info); @@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, return false; } - uint64_t nzm_for_var = GetScalarNZM(variable); + ScalarNZM nzm_for_var = GetScalarNZM(variable); Opcode cmpop = i->opcode->num; uint64_t constant_unpacked = constant_value->AsUint64(); uint64_t signbit_for_var = GetScalarSignbitMask(variable->type); @@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, i->set_src1(variable); return true; } + + if (cmpop == OPCODE_COMPARE_ULE && + constant_unpacked == + 0) { // less than or equal to zero = (== 0) = IS_FALSE + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(variable); + return true; + } // todo: OPCODE_COMPARE_NE too? if (cmpop == OPCODE_COMPARE_EQ && def_opcode == OPCODE_NOT) { // i see this a lot around addic insns @@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, return false; } - uint64_t input_nzm = GetScalarNZM(input); + ScalarNZM input_nzm = GetScalarNZM(input); if (istrue && input_nzm == 1) { // doing istrue on a value thats already a bool bitwise @@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, input_def = input_def->GetDestDefSkipAssigns();*/ return false; } +bool SimplificationPass::CheckSHRByConst(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* variable, + unsigned int shift) { + if (shift >= 3 && shift <= 6) { + // is possible shift of lzcnt res, do some tunneling + + unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX | + MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX | + MOVTUNNEL_AND32FF; + + Instr* vardef = variable->def; + + hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags); + + if (var_def && var_def->opcode == &OPCODE_CNTLZ_info) { + Value* lz_input = var_def->src1.value; + TypeName type_of_lz_input = lz_input->type; + size_t shift_for_zero = + xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT); + + if (shift == shift_for_zero) { + // we ought to be OPCODE_IS_FALSE! 
+ /* + explanation: if an input to lzcnt is zero, the result will be the + bit size of the input type, which is always a power of two any + nonzero result will be less than the bit size so you can test for + zero by doing, for instance with a 32 bit value, lzcnt32(input) >> 5 + this is a very common way of testing for zero without branching on + ppc, and the xb360 ppc compiler used it a lot we optimize this away + for simplicity and to enable further optimizations, but actually this + is also quite fast on modern x86 processors as well, for instance on + zen 2 the rcp through of lzcnt is 0.25, meaning four can be executed + in one cycle + + */ + + if (variable->type != INT8_TYPE) { + Value* isfalsetest = builder->IsFalse(lz_input); + + isfalsetest->def->MoveBefore(i); + i->Replace(&OPCODE_ZERO_EXTEND_info, 0); + i->set_src1(isfalsetest); + + } else { + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(lz_input); + } + return true; + } + } + } + return false; +} +bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* shr_lhs = i->src1.value; + Value* shr_rhs = i->src2.value; + if (!shr_lhs || !shr_rhs) return false; + if (shr_rhs->IsConstant()) { + return CheckSHRByConst(i, builder, shr_lhs, shr_rhs->AsUint32()); + } + + return false; +} + +bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* l = i->src1.value; + Value* r = i->src2.value; + ScalarNZM l_nzm = GetScalarNZM(l); + uint64_t signbit_mask = GetScalarSignbitMask(l->type); + size_t typesize = GetTypeSize(l->type); + + /* + todo: folding this requires the mask of constant bits + if (r->IsConstant()) { + uint32_t const_r = r->AsUint32(); + + if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to + fill the result with the signbit of the input. + + + } + }*/ + if ((l_nzm & signbit_mask) == 0) { // signbit will never be set, might as + // well be an SHR. (this does happen) + i->opcode = &OPCODE_SHR_info; + + return true; + } + + return false; +} bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { bool result = false; auto block = builder->first_block(); @@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { // vector types use the same opcodes as scalar ones for AND/OR/XOR! 
we // don't handle these in our simplifications, so skip if (i->dest && IsScalarIntegralType(i->dest->type)) { - if (i->opcode == &OPCODE_OR_info) { + Opcode iop = i->opcode->num; + + if (iop == OPCODE_OR) { result |= CheckOr(i, builder); - } else if (i->opcode == &OPCODE_XOR_info) { + } else if (iop == OPCODE_XOR) { result |= CheckXor(i, builder); - } else if (i->opcode == &OPCODE_AND_info) { + } else if (iop == OPCODE_AND) { result |= CheckAnd(i, builder); - } else if (i->opcode == &OPCODE_ADD_info) { + } else if (iop == OPCODE_ADD) { result |= CheckAdd(i, builder); - } else if (IsScalarBasicCmp(i->opcode->num)) { + } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (i->opcode == &OPCODE_IS_FALSE_info || - i->opcode == &OPCODE_IS_TRUE_info) { + } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); + } else if (iop == OPCODE_SHR) { + result |= CheckSHR(i, builder); + } else if (iop == OPCODE_SHA) { + result |= CheckSAR(i, builder); } } @@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) { } return false; } - bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // Run over the instructions and rename assigned variables: // v1 = v0 @@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { while (block) { auto i = block->instr_head; while (i) { - uint32_t signature = i->opcode->signature; - if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) { bool modified = false; - i->set_src1(CheckValue(i->src1.value, modified)); + i->set_srcN(CheckValue(value, modified), idx); result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src2(CheckValue(i->src2.value, modified)); - result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src3(CheckValue(i->src3.value, modified)); - result |= modified; - } + }); i = i->next; } @@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { return result; } +struct TruncateSimplifier { + TypeName type_from, type_to; + uint32_t sizeof_from, sizeof_to; + uint32_t bit_sizeof_from, bit_sizeof_to; + uint64_t typemask_from, typemask_to; + hir::HIRBuilder* builder; + hir::Instr* truncate_instr; + hir::Value* truncated_value; + hir::Instr* truncated_value_def; +}; +bool SimplificationPass::BackpropTruncations(hir::Instr* i, + hir::HIRBuilder* builder) { + if (i->opcode != &OPCODE_TRUNCATE_info) { + return false; + } + TypeName type_from = i->src1.value->type; + TypeName type_to = i->dest->type; + + uint32_t sizeof_from = static_cast(GetTypeSize(type_from)); + uint32_t sizeof_to = static_cast(GetTypeSize(type_to)); + + Instr* input_def = i->src1.value->GetDefSkipAssigns(); + if (!input_def) { + return false; + } + Opcode input_opc = input_def->opcode->num; + + if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) { + uint32_t src2_shift = input_def->src2.value->AsUint32(); + if (src2_shift < (sizeof_to * CHAR_BIT)) { + Value* truncated_preshift = + builder->Truncate(input_def->src1.value, type_to); + + truncated_preshift->def->MoveBefore(i); + i->Replace(&OPCODE_SHL_info, 0); + i->set_src1(truncated_preshift); + i->set_src2(input_def->src2.value); + return true; + } + } + if (input_opc == OPCODE_LOAD_CONTEXT) { + if (sizeof_from == 8 && sizeof_to == 4) { + Value* 
loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE); + loadof->def->MoveBefore(input_def); + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(loadof); + return true; + } + } + + return false; +} +bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + result |= BackpropTruncations(i, builder); + i = i->next; + } + block = block->next; + } + return result; +} Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index d805ea27c..fe8de8474 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); bool SimplifyBitArith(hir::HIRBuilder* builder); + bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder); + bool BackpropTruncations(hir::HIRBuilder* builder); // handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); @@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass { bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder); bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder); bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* variable, unsigned int shift); + + bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder); + // called by CheckXor, handles transforming a 1 bit value xored against 1 + bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* xored); + bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2); + // for rlwinm bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder); bool TransformANDROLORSHLSeq( diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 4096d8e4a..118895719 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -14,38 +14,15 @@ namespace xe { namespace cpu { namespace hir { - -void Instr::set_src1(Value* value) { - if (src1.value == value) { +void Instr::set_srcN(Value* value, uint32_t idx) { + if (srcs[idx].value == value) { return; } - if (src1_use) { - src1.value->RemoveUse(src1_use); + if (srcs_use[idx]) { + srcs[idx].value->RemoveUse(srcs_use[idx]); } - src1.value = value; - src1_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src2(Value* value) { - if (src2.value == value) { - return; - } - if (src2_use) { - src2.value->RemoveUse(src2_use); - } - src2.value = value; - src2_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src3(Value* value) { - if (src3.value == value) { - return; - } - if (src3_use) { - src3.value->RemoveUse(src3_use); - } - src3.value = value; - src3_use = value ? value->AddUse(block->arena, this) : NULL; + srcs[idx].value = value; + srcs_use[idx] = value ? 
value->AddUse(block->arena, this) : nullptr; } void Instr::MoveBefore(Instr* other) { @@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() { } return current_def; } +Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) { + unsigned int traversed_types = 0; + unsigned int in_flags = *tunnel_flags; + Instr* current_def = this; + + while (true) { + Opcode op = current_def->opcode->num; + + switch (op) { + case OPCODE_ASSIGN: { + if ((in_flags & MOVTUNNEL_ASSIGNS)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_ASSIGNS; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_ZERO_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVZX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVZX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_SIGN_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVSX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVSX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_TRUNCATE: { + if ((in_flags & MOVTUNNEL_TRUNCATE)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_TRUNCATE; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_AND: { + if ((in_flags & MOVTUNNEL_AND32FF)) { + auto [constant, nonconst] = + current_def->BinaryValueArrangeAsConstAndVar(); + if (!constant || constant->AsUint64() != 0xFFFFFFFF) { + goto exit_loop; + } + current_def = nonconst->def; + traversed_types |= MOVTUNNEL_AND32FF; + + } else { + goto exit_loop; + } + break; + } + default: + goto exit_loop; + } + if (!current_def) { + goto exit_loop; + } + } +exit_loop: + *tunnel_flags = traversed_types; + return current_def; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 1f09ee341..db3c78922 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -25,6 +25,14 @@ namespace hir { class Block; class Label; +// todo: better name +enum MovTunnel { + MOVTUNNEL_ASSIGNS = 1, + MOVTUNNEL_MOVZX = 2, + MOVTUNNEL_MOVSX = 4, + MOVTUNNEL_TRUNCATE = 8, + MOVTUNNEL_AND32FF = 16, // tunnel through and with 0xFFFFFFFF +}; class Instr { public: @@ -44,17 +52,28 @@ class Instr { } Op; Value* dest; - Op src1; - Op src2; - Op src3; + union { + struct { + Op src1; + Op src2; + Op src3; + }; + Op srcs[3]; + }; + union { + struct { + Value::Use* src1_use; + Value::Use* src2_use; + Value::Use* src3_use; + }; + Value::Use* srcs_use[3]; + }; + void set_srcN(Value* value, uint32_t idx); + void set_src1(Value* value) { set_srcN(value, 0); } - Value::Use* src1_use; - Value::Use* src2_use; - Value::Use* src3_use; + void set_src2(Value* value) { set_srcN(value, 1); } - void set_src1(Value* value); - void set_src2(Value* value); - void set_src3(Value* value); + void set_src3(Value* value) { set_srcN(value, 2); } void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); @@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr } Instr* GetDestDefSkipAssigns(); + Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags); + // returns [def op, constant] std::pair BinaryValueArrangeByDefOpAndConstant( const OpcodeInfo* op_ptr) { @@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr } return result; } + /* + Invokes the provided lambda callback on each operand that is a Value. 
Callback + is invoked with Value*, uint32_t index +*/ + template + void VisitValueOperands(TCallable&& call_for_values) { + uint32_t signature = opcode->signature; + + OpcodeSignatureType t_dest, t_src1, t_src2, t_src3; + + UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3); + + if (t_src1 == OPCODE_SIG_TYPE_V) { + call_for_values(src1.value, 0); + } + if (t_src2 == OPCODE_SIG_TYPE_V) { + call_for_values(src2.value, 1); + } + if (t_src3 == OPCODE_SIG_TYPE_V) { + call_for_values(src3.value, 2); + } + } }; } // namespace hir diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 211cd18f9..c4ebdeb2c 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() { return nullptr; } } +hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) { + if (def) { + return def->GetDestDefTunnelMovs(tunnel_flags); + } else { + return nullptr; + } +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 1d8963b64..84d121a26 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -598,6 +598,8 @@ class Value { void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); + // tunnel_flags is updated to the kinds we actually traversed + hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags); private: static bool CompareInt8(Opcode opcode, Value* a, Value* b); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 4acdaed3c..777ef568a 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -246,30 +246,7 @@ enum class PPCRegister { }; #pragma pack(push, 8) -typedef struct PPCContext_s { - // Must be stored at 0x0 for now. - // TODO(benvanik): find a nice way to describe this to the JIT. - ThreadState* thread_state; // 0x0 - // TODO(benvanik): this is getting nasty. Must be here. - uint8_t* virtual_membase; // 0x8 - - // Most frequently used registers first. - uint64_t lr; // 0x10 Link register - uint64_t ctr; // 0x18 Count register - uint64_t r[32]; // 0x20 General purpose registers - double f[32]; // 0x120 Floating-point registers - vec128_t v[128]; // 0x220 VMX128 vector registers - - // XER register: - // Split to make it easier to do individual updates. - uint8_t xer_ca; // 0xA20 - uint8_t xer_ov; // 0xA21 - uint8_t xer_so; // 0xA22 - - // Condition registers: - // These are split to make it easier to do DCE on unused stores. - uint64_t cr() const; - void set_cr(uint64_t value); +typedef struct alignas(64) PPCContext_s { union { uint32_t value; struct { @@ -395,6 +372,25 @@ typedef struct PPCContext_s { } bits; } fpscr; // Floating-point status and control register + // Most frequently used registers first. + + uint64_t r[32]; // 0x20 General purpose registers + uint64_t ctr; // 0x18 Count register + uint64_t lr; // 0x10 Link register + double f[32]; // 0x120 Floating-point registers + vec128_t v[128]; // 0x220 VMX128 vector registers + + // XER register: + // Split to make it easier to do individual updates. + uint8_t xer_ca; + uint8_t xer_ov; + uint8_t xer_so; + + // Condition registers: + // These are split to make it easier to do DCE on unused stores. 
+ uint64_t cr() const; + void set_cr(uint64_t value); + uint8_t vscr_sat; // uint32_t get_fprf() { @@ -425,7 +421,8 @@ typedef struct PPCContext_s { // Value of last reserved load uint64_t reserved_val; - + ThreadState* thread_state; + uint8_t* virtual_membase; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; void SetValueFromString(PPCRegister reg, std::string value); diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index 3816446fc..1383646e1 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -18,12 +18,50 @@ #include "xenia/cpu/processor.h" #include "xenia/xbox.h" - namespace xe { namespace cpu { thread_local ThreadState* thread_state_ = nullptr; +static void* AllocateContext() { + size_t granularity = xe::memory::allocation_granularity(); + for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) { + /* + we want our register which points to the context to have 0xE0000000 in + the low 32 bits, for checking for whether we need the 4k offset, but also + if we allocate starting from the page before we allow backends to index + negatively to get to their own backend specific data, which makes full + use of int8 displacement + + + the downside is we waste most of one granula and probably a fair bit of + the one starting at 0xE0 by using a direct virtual memory allocation + instead of malloc + */ + uintptr_t context_pre = + ((static_cast(pos32) << 32) | 0xE0000000) - granularity; + + void* p = memory::AllocFixed( + (void*)context_pre, granularity + sizeof(ppc::PPCContext), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + if (p) { + return reinterpret_cast(p) + + granularity; // now we have a ctx ptr with the e0 constant in low, + // and one page allocated before it + } + } + + assert_always("giving up on allocating context, likely leaking contexts"); + return nullptr; +} + +static void FreeContext(void* ctx) { + char* true_start_of_ctx = &reinterpret_cast( + ctx)[-static_cast(xe::memory::allocation_granularity())]; + memory::DeallocFixed(true_start_of_ctx, 0, + memory::DeallocationType::kRelease); +} + ThreadState::ThreadState(Processor* processor, uint32_t thread_id, uint32_t stack_base, uint32_t pcr_address) : processor_(processor), @@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, backend_data_ = processor->backend()->AllocThreadData(); // Allocate with 64b alignment. 
- context_ = memory::AlignedAlloc(64); + + context_ = reinterpret_cast(AllocateContext()); // memory::AlignedAlloc(64); + processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -62,8 +102,10 @@ ThreadState::~ThreadState() { if (thread_state_ == this) { thread_state_ = nullptr; } - - memory::AlignedFree(context_); + if (context_) { + FreeContext(reinterpret_cast(context_)); + } + // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) { From 11817f0a3b33f4dc74da6f22d3aa81eea8d92ad3 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sun, 17 Jul 2022 14:44:09 -0700 Subject: [PATCH 2/2] vshufps accident broke things, this fixes --- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index bde7e5904..7c55300db 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -1984,10 +1984,10 @@ struct SWIZZLE } else { src1 = i.src1; } - if (element_type == INT32_TYPE && true) { + if (element_type == INT32_TYPE) { e.vpshufd(i.dest, src1, swizzle_mask); } else if (element_type == FLOAT32_TYPE) { - e.vshufps(i.dest, src1, swizzle_mask); + e.vshufps(i.dest, src1, src1, swizzle_mask); } } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { assert_always();
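Editor's note on the follow-up fix above: a short intrinsics sketch (not the emitter's xbyak calls; SwizzleFloat/SwizzleInt are illustrative names only) of why the float swizzle has to pass src1 twice while the integer form does not.

// shufps selects its low two result lanes from the first source and its high
// two lanes from the second source, so a single-register swizzle must name
// that register as both sources. pshufd takes only one source and stays in
// the integer domain.
#include <emmintrin.h>
#include <xmmintrin.h>

__m128 SwizzleFloat(__m128 v) {
  // Equivalent of vshufps dest, v, v, 0x1B (reverses the four float lanes).
  return _mm_shuffle_ps(v, v, 0x1B);
}

__m128i SwizzleInt(__m128i v) {
  // Equivalent of vpshufd dest, v, 0x1B (reverses the four dword lanes).
  return _mm_shuffle_epi32(v, 0x1B);
}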