diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 31e1dc9fd..7cb278e5d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { EmitSaveNonvolatileRegs(); mov(rax, rcx); - mov(rsi, rdx); // context - mov(rcx, r8); // return address + mov(rsi, rdx); // context + mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase + mov(rcx, r8); // return address call(rax); - + vzeroupper(); EmitLoadNonvolatileRegs(); code_offsets.epilog = getSize(); @@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { code_offsets.prolog_stack_alloc = getSize(); code_offsets.body = getSize(); - + // chrispy: added this for proper vmsum impl, avx2 bitshifts + vzeroupper(); // Save off volatile registers. EmitSaveVolatileRegs(); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 707ab5642..c6f2d6180 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL); TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); - - - - + TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); #undef TEST_EMIT_FEATURE /* - fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak + fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in + latest version of xbyak */ unsigned int data[4]; Xbyak::util::Cpu::getCpuid(0x80000001, data); @@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) } } if (cpu_.has(Xbyak::util::Cpu::tAMD)) { - - bool is_zennish = cpu_.displayFamily >= 0x17; + bool is_zennish = cpu_.displayFamily >= 0x17; - if (is_zennish) { - feature_flags_ |= kX64FastJrcx; + if (is_zennish) { + feature_flags_ |= kX64FastJrcx; - if (cpu_.displayFamily > 0x17) { - feature_flags_ |= kX64FastLoop; + if (cpu_.displayFamily > 0x17) { + feature_flags_ |= kX64FastLoop; - } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { - feature_flags_ |= kX64FastLoop; - } // todo:figure out at model zen+ became zen2, this is just the model - // for my cpu, which is ripper90 - - } + } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { + feature_flags_ |= kX64FastLoop; + } // todo:figure out at model zen+ became zen2, this is just the model + // for my cpu, which is ripper90 + } } } @@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { code_offsets.prolog_stack_alloc = getSize(); code_offsets.body = getSize(); + /* + * chrispy: removed this, it serves no purpose mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); + */ mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); @@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { } // Load membase. - mov(GetMembaseReg(), + /* + * chrispy: removed this, as long as we load it in HostToGuestThunk we can + count on no other code modifying it. mov(GetMembaseReg(), qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]); - + */ // Body. 
auto block = builder->first_block(); while (block) { @@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { // NOTE: If you encounter this after adding a new instruction, do a full // rebuild! assert_always(); - XELOGE("Unable to process HIR opcode {}", instr->opcode->name); + XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode)); break; } instr = new_tail; @@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { L(epilog_label); epilog_label_ = nullptr; EmitTraceUserCallReturn(); + /* + * chrispy: removed this, it serves no purpose mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]); - + */ code_offsets.epilog = getSize(); add(rsp, (uint32_t)stack_size); @@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { if (cvars::emit_source_annotations) { nop(5); - } assert_zero(code_offsets.prolog); @@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) { Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; } Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; } -void X64Emitter::ReloadContext() { - mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]); -} - void X64Emitter::ReloadMembase() { mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase } -#define __NH_CONCAT(x, y) x##y -#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__) - -#define mh_concat2_m(x, y) __NH_CONCAT(x, y) - -#define DECLNOP(n, ...) \ - static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__} - -DECLNOP(1, 0x90); -DECLNOP(2, 0x66, 0x90); -DECLNOP(3, 0x0F, 0x1F, 0x00); -DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00); -DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00); -DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00); -DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00); -DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); -DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); - -static constexpr const unsigned char* const g_noptable[] = { - &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0], - &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]}; - -static constexpr unsigned LENGTHOF_NOPTABLE = - sizeof(g_noptable) / sizeof(g_noptable[0]); // Len Assembly Byte Sequence // ============================================================================ @@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE = // 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H // 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H void X64Emitter::nop(size_t length) { - while (length != 0) { - unsigned patchsize = length % LENGTHOF_NOPTABLE; - - // patch_memory(locptr, size, (char*)g_noptable[patchsize]); - - for (unsigned i = 0; i < patchsize; ++i) { - db(g_noptable[patchsize][i]); - } - - //locptr += patchsize; - length -= patchsize; + for (size_t i = 0; i < length; ++i) { + db(0x90); } } @@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = { 0x80, 0x80, 0x80, 0x80), /*XMMShortsToBytes*/ v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80) -}; + 0x80, 0x80, 0x80), + /*XMMLVSLTableBase*/ + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + /*XMMLVSRTableBase*/ + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + /* XMMSingleDenormalMask */ + vec128i(0x7f800000), + /* XMMThreeFloatMask */ + vec128i(~0U, ~0U, ~0U, 0U), + /*XMMXenosF16ExtRangeStart*/ + vec128f(65504)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& 
vec : xmm_consts) { @@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { // 1111... vpcmpeqb(dest, dest); } else { - for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) { if (xmm_consts[i] == v) { vmovapd(dest, GetXmmConstPtr((XmmConst)i)); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 66a02fcc1..d73d86fe1 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -118,7 +118,12 @@ enum XmmConst { XMM2To32, XMMFloatInf, XMMIntsToBytes, - XMMShortsToBytes + XMMShortsToBytes, + XMMLVSLTableBase, + XMMLVSRTableBase, + XMMSingleDenormalMask, + XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3 + XMMXenosF16ExtRangeStart }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. @@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) + kX64EmitAVX512VBMI = 1 << 14 }; class ResolvableGuestCall { public: @@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Reg64 GetContextReg(); Xbyak::Reg64 GetMembaseReg(); - void ReloadContext(); + void ReloadMembase(); void nop(size_t length = 1); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 299e7674f..1cca6469f 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); +struct VECTOR_DENORMFLUSH + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123 + + e.vandps(e.xmm0, i.src1, + e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123 + e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01 + e.vandps(e.xmm1, i.src1, + e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros + // must keep their signs + e.vandps(e.xmm0, i.src1, e.xmm2); // P0123 + e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs + + // if it does not equal zero, we stay + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH); + // ============================================================================ // OPCODE_LOAD_VECTOR_SHL // ============================================================================ @@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8 if (i.src1.is_constant) { auto sh = i.src1.constant(); assert_true(sh < xe::countof(lvsl_table)); - e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); + if (sh == 0) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase)); + } else { + // this is probably extremely rare + e.LoadConstantXmm(i.dest, lvsl_table[sh]); + } } else { // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsl_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + // chrispy: removed mask, ppc_emit_altivec already pre-ands it. 
+ e.vmovd(e.xmm0, i.src1.reg().cvt32()); + // broadcast byte + // dont use broadcastb with avx2, its slower than shuf + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase)); } } }; @@ -195,15 +220,23 @@ struct LOAD_VECTOR_SHR_I8 if (i.src1.is_constant) { auto sh = i.src1.constant(); assert_true(sh < xe::countof(lvsr_table)); - e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); + if (sh == 0) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase)); + } else { + e.LoadConstantXmm(i.dest, lvsr_table[sh]); + } } else { // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsr_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + + // chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed + // lookup as well, compute from LVSR base instead + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase)); + // broadcast byte + // dont use broadcastb with avx2, its slower than shuf + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + + e.vpsubb(i.dest, e.xmm1, e.xmm0); } } }; @@ -728,7 +761,7 @@ struct VECTOR_SHL_V128 } } -static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). if (e.IsFeatureEnabled(kX64EmitAVX2)) { @@ -1793,6 +1826,14 @@ struct PERMUTE_I32 } } }; +//todo: use this on const src1 +static vec128_t FixupConstantShuf8(vec128_t input) { + for (uint32_t i = 0; i < 16; ++i) { + input.u8[i] ^= 0x03; + input.u8[i] &= 0x1F; + } + return input; +} struct PERMUTE_V128 : Sequence> { @@ -1855,7 +1896,8 @@ struct PERMUTE_V128 } else { e.vpshufb(src3_shuf, i.src3, e.xmm2); } - // Build a mask with values in src2 having 0 and values in src3 having 1. + // Build a mask with values in src2 having 0 and values in src3 + // having 1. 
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index b647ff404..5af7db24d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -35,11 +35,14 @@ #include "xenia/cpu/backend/x64/x64_emitter.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" +// needed for stmxcsr +#include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" DEFINE_bool(use_fast_dot_product, false, - "Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr" + "Experimental optimization, much shorter sequence on dot products, " + "treating inf as overflow instead of using mcxsr" "four insn dotprod", "CPU"); namespace xe { @@ -1996,8 +1999,8 @@ struct DIV_V128 : Sequence> { assert_true(!i.instr->flags); EmitAssociativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - // e.vrcpps(e.xmm0, src2); - //e.vmulps(dest, src1, e.xmm0); + // e.vrcpps(e.xmm0, src2); + // e.vmulps(dest, src1, e.xmm0); e.vdivps(dest, src1, src2); }); } @@ -2607,68 +2610,84 @@ struct LOG2_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); -struct DOT_PRODUCT_V128 { - static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) { - if (cvars::use_fast_dot_product) { - e.vdpps(dest, src1, src2, imm); - e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS)); - e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf)); - e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0); - - } else { - // TODO(benvanik): apparently this is very slow - // - find alternative? - Xbyak::Label end; - e.inLocalLabel(); - - // Grab space to put MXCSR. - // TODO(gibbed): stick this in TLS or - // something? - e.sub(e.rsp, 8); - - // Grab MXCSR and mask off the overflow flag, - // because it's sticky. - e.vstmxcsr(e.dword[e.rsp]); - e.mov(e.eax, e.dword[e.rsp]); - e.and_(e.eax, uint32_t(~8)); - e.mov(e.dword[e.rsp], e.eax); - e.vldmxcsr(e.dword[e.rsp]); - - // Hey we can do the dot product now. - e.vdpps(dest, src1, src2, imm); - - // Load MXCSR... - e.vstmxcsr(e.dword[e.rsp]); - - // ..free our temporary space and get MXCSR at - // the same time - e.pop(e.rax); - - // Did we overflow? - e.test(e.al, 8); - e.jz(end); - - // Infinity? HA! Give NAN. - e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN)); - - e.L(end); - e.outLocalLabel(); - } - } -}; - // ============================================================================ // OPCODE_DOT_PRODUCT_3 // ============================================================================ struct DOT_PRODUCT_3_V128 : Sequence> { + I> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - EmitCommutativeBinaryXmmOp( - e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001); - }); + // todo: add fast_dot_product path that just checks for infinity instead of + // using mxcsr + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + + // this is going to hurt a bit... + /* + this implementation is accurate, it matches the results of xb360 vmsum3 + except that vmsum3 is often off by 1 bit, but its extremely slow. 
it is a + long, unbroken chain of dependencies, and the three uses of mxcsr all cost + about 15-20 cycles at the very least on amd zen processors. on older amd the + figures agner has are pretty horrible. it looks like its just as bad on + modern intel cpus also up until just recently. perhaps a better way of + detecting overflow would be to just compare with inf. todo: test whether cmp + with inf can replace + */ + e.vstmxcsr(mxcsr_storage); + + e.mov(e.eax, 8); + + auto src1v = e.xmm0; + auto src2v = e.xmm1; + if (i.src1.is_constant) { + src1v = e.xmm0; + e.LoadConstantXmm(src1v, i.src1.constant()); + } else { + src1v = i.src1.reg(); + } + if (i.src2.is_constant) { + src2v = e.xmm1; + e.LoadConstantXmm(src2v, i.src2.constant()); + } else { + src2v = i.src2.reg(); + } + e.not_(e.eax); + // todo: maybe the top element should be cleared by the InstrEmit_ function + // so that in the future this could be optimized away if the top is known to + // be zero. Right now im not sure that happens often though and its + // currently not worth it also, maybe pre-and if constant + e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to + // go + + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vcvtps2pd(e.ymm1, e.xmm2); + /* + ymm0 = src1 as doubles, ele 3 cleared + ymm1 = src2 as doubles, ele 3 cleared + */ + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + e.vextractf128(e.xmm2, e.ymm3, 1); + e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3 + e.vaddsd(e.xmm3, e.xmm3, e.xmm2); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm3, e.xmm0); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + // this is awful + e.vstmxcsr(mxcsr_storage); + e.test(mxcsr_storage, e.eax); + Xbyak::Label ret_qnan; + Xbyak::Label done; + e.jnz(ret_qnan); + // e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast + e.vbroadcastss(i.dest, e.xmm1); + e.jmp(done); + e.L(ret_qnan); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.L(done); } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); @@ -2678,13 +2697,81 @@ EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); // ============================================================================ struct DOT_PRODUCT_4_V128 : Sequence> { + I> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - EmitCommutativeBinaryXmmOp( - e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001); - }); + // todo: add fast_dot_product path that just checks for infinity instead of + // using mxcsr + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + + e.vstmxcsr(mxcsr_storage); + + e.mov(e.eax, 8); + + auto src1v = e.xmm3; + auto src2v = e.xmm2; + if (i.src1.is_constant) { + src1v = e.xmm3; + e.LoadConstantXmm(src1v, i.src1.constant()); + } else { + src1v = i.src1.reg(); + } + if (i.src2.is_constant) { + src2v = e.xmm2; + e.LoadConstantXmm(src2v, i.src2.constant()); + } else { + src2v = i.src2.reg(); + } + e.not_(e.eax); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); + + e.vcvtps2pd(e.ymm0, src1v); + e.vcvtps2pd(e.ymm1, src2v); + /* + e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to + // 
go + + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vcvtps2pd(e.ymm1, e.xmm2); + + + e.vmulpd(e.ymm5, e.ymm0, e.ymm1); + e.vextractf128(e.xmm4, e.ymm5, 1); + e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3 + e.vaddsd(e.xmm5, e.xmm5, e.xmm4); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm5, e.xmm3); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + */ + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + e.vextractf128(e.xmm2, e.ymm3, 1); + e.vaddpd(e.xmm3, e.xmm3, e.xmm2); + + e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm3, e.xmm0); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + e.vstmxcsr(mxcsr_storage); + + e.test(mxcsr_storage, e.eax); + + Xbyak::Label ret_qnan; + Xbyak::Label done; + e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in + // e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + e.vbroadcastss(i.dest, e.xmm1); + e.jmp(done); + e.L(ret_qnan); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.L(done); + // e.DebugBreak(); } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128); @@ -2759,7 +2846,6 @@ struct AND_I64 : Sequence> { }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vpand(dest, src1, src2); @@ -3419,7 +3505,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { return true; } } - XELOGE("No sequence match for variant {}", i->opcode->name); + XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode)); return false; } diff --git a/src/xenia/cpu/backend/x64/x64_stack_layout.h b/src/xenia/cpu/backend/x64/x64_stack_layout.h index 1736dc02a..5bd50a803 100644 --- a/src/xenia/cpu/backend/x64/x64_stack_layout.h +++ b/src/xenia/cpu/backend/x64/x64_stack_layout.h @@ -122,7 +122,8 @@ class StackLayout { * */ static const size_t GUEST_STACK_SIZE = 104; - static const size_t GUEST_CTX_HOME = 80; + //was GUEST_CTX_HOME, can't remove because that'd throw stack alignment off. 
instead, can be used as a temporary in sequences + static const size_t GUEST_SCRATCH64 = 80; static const size_t GUEST_RET_ADDR = 88; static const size_t GUEST_CALL_RET_ADDR = 96; }; diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index df76cc25d..025b4114e 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -312,13 +312,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } else if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { - // TODO: Select - // v->set_from(i->src2.value); - // v->Select(i->src3.value, i->src1.value); - // i->Remove(); + v->set_from(i->src2.value); + v->Select(i->src3.value, i->src1.value); + i->Remove(); + result = true; } } else { - // TODO: vec128 select + if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { + v->set_from(i->src2.value); + v->Select(i->src3.value, i->src1.value); + i->Remove(); + result = true; + } } } break; @@ -744,8 +749,35 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - // TODO(benvanik): INSERT/EXTRACT - // TODO(benvanik): PERMUTE/SWIZZLE + + case OPCODE_PERMUTE: { + if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && + i->src3.value->IsConstant() && + (i->flags == INT8_TYPE || i->flags == INT16_TYPE)) { + v->set_from(i->src1.value); + v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; + } + case OPCODE_INSERT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && + i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; + case OPCODE_SWIZZLE: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; case OPCODE_EXTRACT: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_zero(v->type); @@ -867,24 +899,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } break; - case OPCODE_DOT_PRODUCT_3: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->DotProduct3(i->src2.value); - i->Remove(); - result = true; - } - break; - - case OPCODE_DOT_PRODUCT_4: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->DotProduct4(i->src2.value); - i->Remove(); - result = true; - } - break; - case OPCODE_VECTOR_AVERAGE: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); @@ -896,7 +910,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - + case OPCODE_VECTOR_DENORMFLUSH: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->DenormalFlush(); + i->Remove(); + result = true; + } + break; default: // Ignored. 
break; diff --git a/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc b/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc index 56cf1c769..f1613b481 100644 --- a/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc +++ b/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc @@ -132,10 +132,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, while (outgoing_ordinal != -1) { Value* src_value = value_map[outgoing_ordinal]; assert_not_null(src_value); - if (!src_value->local_slot) { - src_value->local_slot = builder->AllocLocal(src_value->type); + if (!src_value->HasLocalSlot()) { + src_value->SetLocalSlot(builder->AllocLocal(src_value->type)); } - builder->StoreLocal(src_value->local_slot, src_value); + builder->StoreLocal(src_value->GetLocalSlot(), src_value); // If we are in the block the value was defined in: if (src_value->def->block == block) { @@ -168,10 +168,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, while (incoming_ordinal != -1) { Value* src_value = value_map[incoming_ordinal]; assert_not_null(src_value); - if (!src_value->local_slot) { - src_value->local_slot = builder->AllocLocal(src_value->type); + if (!src_value->HasLocalSlot()) { + src_value->SetLocalSlot(builder->AllocLocal(src_value->type)); } - Value* local_value = builder->LoadLocal(src_value->local_slot); + Value* local_value = builder->LoadLocal(src_value->GetLocalSlot()); builder->last_instr()->MoveBefore(block->instr_head); // Swap uses of original value with the local value. diff --git a/src/xenia/cpu/compiler/passes/register_allocation_pass.cc b/src/xenia/cpu/compiler/passes/register_allocation_pass.cc index bd7380184..439b35708 100644 --- a/src/xenia/cpu/compiler/passes/register_allocation_pass.cc +++ b/src/xenia/cpu/compiler/passes/register_allocation_pass.cc @@ -365,7 +365,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, auto new_head_use = next_use; // Allocate local. - if (spill_value->local_slot) { + if (spill_value->HasLocalSlot()) { // Value is already assigned a slot. Since we allocate in order and this is // all SSA we know the stored value will be exactly what we want. Yay, // we can prevent the redundant store! @@ -373,10 +373,10 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // use the spilled value and prevent the need for more locals. } else { // Allocate a local slot. - spill_value->local_slot = builder->AllocLocal(spill_value->type); + spill_value->SetLocalSlot(builder->AllocLocal(spill_value->type)); // Add store. - builder->StoreLocal(spill_value->local_slot, spill_value); + builder->StoreLocal(spill_value->GetLocalSlot(), spill_value); auto spill_store = builder->last_instr(); auto spill_store_use = spill_store->src2_use; assert_null(spill_store_use->prev); @@ -417,7 +417,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // use is after the instruction requesting the spill we know we haven't // done allocation for that code yet and can let that be handled // automatically when we get to it. - auto new_value = builder->LoadLocal(spill_value->local_slot); + auto new_value = builder->LoadLocal(spill_value->GetLocalSlot()); auto spill_load = builder->last_instr(); spill_load->MoveBefore(next_use->instr); // Note: implicit first use added. @@ -429,7 +429,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // Set the local slot of the new value to our existing one. 
This way we will // reuse that same memory if needed. - new_value->local_slot = spill_value->local_slot; + new_value->SetLocalSlot( spill_value->GetLocalSlot()); // Rename all future uses of the SSA value to the new value as loaded // from the local. diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 3cc2c9aaa..2665842a5 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -260,9 +260,9 @@ void HIRBuilder::Dump(StringBuffer* str) { str->Append(" = "); } if (i->flags) { - str->AppendFormat("{}.{}", info->name, i->flags); + str->AppendFormat("{}.{}", GetOpcodeName(info), i->flags); } else { - str->Append(info->name); + str->Append(GetOpcodeName(info)); } if (src1_type) { str->Append(' '); @@ -712,7 +712,6 @@ Value* HIRBuilder::AllocValue(TypeName type) { value->use_head = NULL; value->last_use = NULL; value->local_slot = NULL; - value->tag = NULL; value->reg.set = NULL; value->reg.index = -1; return value; @@ -723,12 +722,11 @@ Value* HIRBuilder::CloneValue(Value* source) { value->ordinal = next_value_ordinal_++; value->type = source->type; value->flags = source->flags; + value->local_slot = NULL; value->constant.v128 = source->constant.v128; value->def = NULL; value->use_head = NULL; value->last_use = NULL; - value->local_slot = NULL; - value->tag = NULL; value->reg.set = NULL; value->reg.index = -1; return value; @@ -1493,7 +1491,16 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2, return VectorCompareXX(OPCODE_VECTOR_COMPARE_UGE_info, value1, value2, part_type); } - +Value* HIRBuilder::VectorDenormFlush(Value* value1) { + return value1; + ASSERT_VECTOR_TYPE(value1); + Instr* i = + AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(value1); + i->src2.value = nullptr; + i->src3.value = nullptr; + return i->dest; +} Value* HIRBuilder::Add(Value* value1, Value* value2, uint32_t arithmetic_flags) { ASSERT_TYPES_EQUAL(value1, value2); @@ -1713,13 +1720,13 @@ Value* HIRBuilder::Log2(Value* value) { return i->dest; } + Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) { ASSERT_VECTOR_TYPE(value1); ASSERT_VECTOR_TYPE(value2); ASSERT_TYPES_EQUAL(value1, value2); - Instr* i = - AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(FLOAT32_TYPE)); + Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(VEC128_TYPE)); i->set_src1(value1); i->set_src2(value2); i->src3.value = NULL; @@ -1731,8 +1738,7 @@ Value* HIRBuilder::DotProduct4(Value* value1, Value* value2) { ASSERT_VECTOR_TYPE(value2); ASSERT_TYPES_EQUAL(value1, value2); - Instr* i = - AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(FLOAT32_TYPE)); + Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(VEC128_TYPE)); i->set_src1(value1); i->set_src2(value2); i->src3.value = NULL; diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index b2809d5d8..3b29867e9 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -199,6 +199,7 @@ class HIRBuilder { Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type); Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type); Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type); + Value* VectorDenormFlush(Value* value1); Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* AddWithCarry(Value* value1, Value* value2, Value* value3, diff --git a/src/xenia/cpu/hir/opcodes.cc b/src/xenia/cpu/hir/opcodes.cc index 
b3b14b198..00eb4f2f7 100644 --- a/src/xenia/cpu/hir/opcodes.cc +++ b/src/xenia/cpu/hir/opcodes.cc @@ -15,14 +15,23 @@ namespace hir { #define DEFINE_OPCODE(num, name, sig, flags) \ const OpcodeInfo num##_info = { \ + num, \ flags, \ sig, \ - name, \ - num, \ }; #include "xenia/cpu/hir/opcodes.inl" #undef DEFINE_OPCODE +const char* GetOpcodeName(Opcode num) { + switch (num) { +#define DEFINE_OPCODE(num, name, sig, flags) \ + case num: \ + return name; +#include "xenia/cpu/hir/opcodes.inl" +#undef DEFINE_OPCODE + } + return "invalid opcode"; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 8e681c757..acc61d047 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -280,7 +280,8 @@ enum Opcode { OPCODE_ATOMIC_EXCHANGE, OPCODE_ATOMIC_COMPARE_EXCHANGE, OPCODE_SET_ROUNDING_MODE, - __OPCODE_MAX_VALUE, // Keep at end. + OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector + __OPCODE_MAX_VALUE, // Keep at end. }; enum OpcodeFlags { @@ -352,17 +353,42 @@ static bool IsOpcodeBinaryValue(uint32_t signature) { ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); } +static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest, + OpcodeSignatureType& src1, + OpcodeSignatureType& src2, + OpcodeSignatureType& src3) { + dest = GET_OPCODE_SIG_TYPE_DEST(sig); + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); +} + +constexpr uint32_t GetNumOperandsForSig(uint32_t sig) { + sig >>= 3; + + uint32_t result = 0; + while (sig) { + if (sig & 0x7) { + ++result; + } + sig >>= 3; + } + return result; +} typedef struct { + Opcode num; uint32_t flags; uint32_t signature; - const char* name; - Opcode num; } OpcodeInfo; #define DEFINE_OPCODE(num, name, sig, flags) extern const OpcodeInfo num##_info; #include "xenia/cpu/hir/opcodes.inl" #undef DEFINE_OPCODE +const char* GetOpcodeName(Opcode num); +static inline const char* GetOpcodeName(const OpcodeInfo* info) { + return GetOpcodeName(info->num); +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 584b0ac55..a1ca73f7d 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -673,3 +673,10 @@ DEFINE_OPCODE( "set_rounding_mode", OPCODE_SIG_X_V, 0) + +DEFINE_OPCODE( + OPCODE_VECTOR_DENORMFLUSH, + "vector_denormflush", + OPCODE_SIG_V_V, + 0 +) \ No newline at end of file diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 2bf52a05d..211cd18f9 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -864,10 +864,112 @@ void Value::Extract(Value* vec, Value* index) { break; } } +void Value::Permute(Value* src1, Value* src2, TypeName type) { + if (type == INT8_TYPE) { + uint8_t table[32]; + for (uint32_t i = 0; i < 16; ++i) { + table[i] = src1->constant.v128.u8[i]; + table[i + 16] = src2->constant.v128.u8[i]; + } + + for (uint32_t i = 0; i < 16; ++i) { + constant.v128.u8[i] = table[(constant.v128.u8[i] ^ 3) & 0x1f]; + } + } else if (type == INT16_TYPE) { + vec128_t perm = (constant.v128 & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; + } + auto lod = [](const vec128_t& v) { + return _mm_loadu_si128((const __m128i*)&v); + }; + auto sto = [](vec128_t& v, __m128i x) { + return _mm_storeu_si128((__m128i*)&v, x); + }; + + __m128i xmm1 = lod(src1->constant.v128); + __m128i xmm2 = lod(src2->constant.v128); + xmm1 = _mm_shuffle_epi8(xmm1, lod(perm)); + xmm2 = _mm_shuffle_epi8(xmm2, lod(perm)); + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + + vec128_t unp_mask = vec128b(0); + for (int i = 0; i < 8; i++) { + if (mask & (1 << i)) { + unp_mask.u16[i] = 0xFFFF; + } + } + + sto(constant.v128, _mm_blendv_epi8(xmm1, xmm2, lod(unp_mask))); + + } else { + assert_unhandled_case(type); + } +} +void Value::Insert(Value* index, Value* part, TypeName type) { + vec128_t* me = &constant.v128; + + switch (type) { + case INT8_TYPE: + me->u8[index->constant.u8 ^ 3] = part->constant.u8; + break; + case INT16_TYPE: + me->u16[index->constant.u8 ^ 1] = part->constant.u16; + break; + case INT32_TYPE: + me->u32[index->constant.u8] = part->constant.u32; + break; + } +} +void Value::Swizzle(uint32_t mask, TypeName type) { + if (type == INT32_TYPE || type == FLOAT32_TYPE) { + vec128_t result = vec128b(0); + for (uint32_t i = 0; i < 4; ++i) { + result.u32[i] = constant.v128.u32[(mask >> (i * 2)) & 0b11]; + } + constant.v128 = result; + } else { + assert_unhandled_case(type); + } +} void Value::Select(Value* other, Value* ctrl) { - // TODO - assert_always(); + if (ctrl->type == VEC128_TYPE) { + constant.v128.low = (constant.v128.low & ~ctrl->constant.v128.low) | + (other->constant.v128.low & ctrl->constant.v128.low); + constant.v128.high = (constant.v128.high & ~ctrl->constant.v128.high) | + (other->constant.v128.high & ctrl->constant.v128.high); + + } else { + if (ctrl->constant.u8) { + switch (other->type) { + case INT8_TYPE: + constant.u8 = other->constant.u8; + break; + case INT16_TYPE: + constant.u16 = other->constant.u16; + break; + case INT32_TYPE: + case FLOAT32_TYPE: + constant.u32 = other->constant.u32; + break; + case INT64_TYPE: + case FLOAT64_TYPE: + constant.u64 = other->constant.u64; + break; + } + } + } } void Value::Splat(Value* other) { @@ -1532,7 +1634,15 @@ void Value::ByteSwap() { break; } } - +void Value::DenormalFlush() { + for (int i = 0; i < 4; ++i) { + uint32_t current_element = constant.v128.u32[i]; + if ((current_element & 0x7f800000) == 0) { + current_element = current_element & 0x80000000; + } + constant.v128.u32[i] = current_element; + } +} void Value::CountLeadingZeros(const Value* other) { switch (other->type) { case INT8_TYPE: diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index f0312d424..1d8963b64 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -104,6 +104,9 @@ struct ValueMask { class Value { public: + /* + todo : this should be intrusive and be part of Instr instead. + */ typedef struct Use_s { Instr* instr; Use_s* prev; @@ -128,17 +131,16 @@ class Value { TypeName type; uint32_t flags; - RegAssignment reg; - ConstantValue constant; Instr* def; Use* use_head; // NOTE: for performance reasons this is not maintained during construction. Instr* last_use; - Value* local_slot; - - // TODO(benvanik): remove to shrink size. 
- void* tag; + RegAssignment reg; + union { + Value* local_slot; + ConstantValue constant; + }; Use* AddUse(Arena* arena, Instr* instr); void RemoveUse(Use* use); @@ -209,7 +211,20 @@ class Value { flags = other->flags; constant.v128 = other->constant.v128; } + bool HasLocalSlot() const { + return !(flags & VALUE_IS_CONSTANT) && local_slot; + } + void SetLocalSlot(Value* lslot) { + assert(!(flags & VALUE_IS_CONSTANT)); + local_slot = lslot; + } + Value* GetLocalSlot() { + return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot; + } + const Value* GetLocalSlot() const { + return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot; + } inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); } bool IsConstantTrue() const { if (type == VEC128_TYPE) { @@ -555,7 +570,10 @@ class Value { void Shr(Value* other); void Sha(Value* other); void RotateLeft(Value* other); + void Insert(Value* index, Value* part, TypeName type); void Extract(Value* vec, Value* index); + void Permute(Value* src1, Value* src2, TypeName type); + void Swizzle(uint32_t mask, TypeName type); void Select(Value* other, Value* ctrl); void Splat(Value* other); void VectorCompareEQ(Value* other, TypeName type); @@ -575,6 +593,8 @@ class Value { void VectorAverage(Value* other, TypeName type, bool is_unsigned, bool saturate); void ByteSwap(); + void DenormalFlush(); + void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 3ca5bc40f..37ee10396 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -279,14 +279,21 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); + Value* shrs = f.LoadVectorShr(eb); + Value* zerovec = f.LoadZeroVec128(); + // v = (old & ~mask) | ((new >> eb) & mask) - Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(), - f.LoadVR(vd), INT8_TYPE); + Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE); Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); + /* + these permutes need to be looked at closer. keep in mind Permute is meant to + emulate vmx's shuffles and does not generate particularly good code. The logic + here looks as if it might make more sense as a comparison ( +*/ // mask = FFFF... >> eb - Value* mask = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(), - f.Not(f.LoadZeroVec128()), INT8_TYPE); - Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)); + Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE); + + Value* v = f.Select(mask, old_value, new_value); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); return 0; @@ -321,14 +328,14 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, ea = CalculateEA_0(f, ra, rb); eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); ea = f.And(ea, f.LoadConstantUint64(~0xFull)); + Value* shrs = f.LoadVectorShr(eb); + Value* zerovec = f.LoadZeroVec128(); // v = (old & ~mask) | ((new << eb) & mask) - Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadVR(vd), - f.LoadZeroVec128(), INT8_TYPE); + Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE); Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); // mask = ~FFFF... 
>> eb - Value* mask = f.Permute(f.LoadVectorShr(eb), f.Not(f.LoadZeroVec128()), - f.LoadZeroVec128(), INT8_TYPE); - Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)); + Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE); + Value* v = f.Select(mask, old_value, new_value); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); f.MarkLabel(skip_label); @@ -815,8 +822,16 @@ int InstrEmit_vlogefp128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vmaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) { + /* + chrispy: testing on POWER8 revealed that altivec vmaddfp unconditionally + flushes denormal inputs to 0, regardless of NJM setting + */ + Value* a = f.VectorDenormFlush(f.LoadVR(va)); + Value* b = f.VectorDenormFlush(f.LoadVR(vb)); + Value* c = f.VectorDenormFlush(f.LoadVR(vc)); // (VD) <- ((VA) * (VC)) + (VB) - Value* v = f.MulAdd(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb)); + Value* v = f.MulAdd(a, c, b); + // todo: do denormal results also unconditionally become 0? f.StoreVR(vd, v); return 0; } @@ -832,9 +847,14 @@ int InstrEmit_vmaddfp128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vmaddcfp128(PPCHIRBuilder& f, const InstrData& i) { + /* + see vmaddfp about these denormflushes + */ + Value* a = f.VectorDenormFlush(f.LoadVR(VX128_VA128)); + Value* b = f.VectorDenormFlush(f.LoadVR(VX128_VB128)); + Value* d = f.VectorDenormFlush(f.LoadVR(VX128_VD128)); // (VD) <- ((VA) * (VD)) + (VB) - Value* v = f.MulAdd(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VD128), - f.LoadVR(VX128_VB128)); + Value* v = f.MulAdd(a, d, b); f.StoreVR(VX128_VD128, v); return 0; } @@ -1085,7 +1105,8 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) { // Dot product XYZ. // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128)); - v = f.Splat(v, VEC128_TYPE); + //chrispy: denormal outputs for Dot product are unconditionally made 0 + v = f.VectorDenormFlush(v); f.StoreVR(VX128_VD128, v); return 0; } @@ -1094,7 +1115,7 @@ int InstrEmit_vmsum4fp128(PPCHIRBuilder& f, const InstrData& i) { // Dot product XYZW. // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w) Value* v = f.DotProduct4(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128)); - v = f.Splat(v, VEC128_TYPE); + v = f.VectorDenormFlush(v); f.StoreVR(VX128_VD128, v); return 0; } @@ -1151,7 +1172,19 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, // (VD) <- -(((VA) * (VC)) - (VB)) // NOTE: only one rounding should take place, but that's hard... // This really needs VFNMSUB132PS/VFNMSUB213PS/VFNMSUB231PS but that's AVX. - Value* v = f.Neg(f.MulSub(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb))); + // NOTE2: we could make vnmsub a new opcode, and then do it in double + // precision, rounding after the neg + + /* + chrispy: this is untested, but i believe this has the same DAZ behavior for + inputs as vmadd + */ + + Value* a = f.VectorDenormFlush(f.LoadVR(va)); + Value* b = f.VectorDenormFlush(f.LoadVR(vb)); + Value* c = f.VectorDenormFlush(f.LoadVR(vc)); + + Value* v = f.Neg(f.MulSub(a, c, b)); f.StoreVR(vd, v); return 0; }
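
The new OPCODE_VECTOR_DENORMFLUSH (called from vmaddfp, vmaddcfp128, vnmsubfp and the vmsum results above, though HIRBuilder::VectorDenormFlush currently returns its input unchanged before ever emitting the opcode) flushes denormal single-precision lanes to signed zero, matching the VMX behavior described in the comments. Value::DenormalFlush and the x64 VECTOR_DENORMFLUSH sequence implement the same per-lane rule; a minimal scalar sketch (DenormFlushLane is an illustrative name, not part of the change):

#include <cstdint>

// Per-lane rule behind OPCODE_VECTOR_DENORMFLUSH / Value::DenormalFlush:
// a biased exponent field of zero (zero or denormal input) keeps only the
// sign bit, producing a signed zero; any other encoding passes through.
uint32_t DenormFlushLane(uint32_t bits) {
  return (bits & 0x7F800000u) == 0 ? (bits & 0x80000000u) : bits;
}

The x64 sequence reaches the same result without branches: it isolates the exponent bits with XMMSingleDenormalMask, compares them against zero to build a keep/flush mask, ANDs the source with that mask, and ORs the original sign bits back in so flushed lanes stay signed zeros.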
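
The LOAD_VECTOR_SHL/LOAD_VECTOR_SHR changes drop the lvsl_table/lvsr_table memory lookups: the shift amount (already pre-masked to 0..15 in ppc_emit_altivec) is splatted across all byte lanes with vpshufb and then added to XMMLVSLTableBase or subtracted from XMMLVSRTableBase. A scalar sketch of the per-byte result those sequences aim to produce (function names here are illustrative only):

#include <cstdint>

// lvsl: dest[i] = sh + i, with XMMLVSLTableBase = {0, 1, ..., 15}.
void LvslModel(uint8_t dest[16], uint8_t sh) {  // sh assumed pre-masked to 0..15
  for (int i = 0; i < 16; ++i) {
    dest[i] = static_cast<uint8_t>(sh + i);
  }
}

// lvsr: dest[i] = (16 + i) - sh, with XMMLVSRTableBase = {16, 17, ..., 31}.
void LvsrModel(uint8_t dest[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) {
    dest[i] = static_cast<uint8_t>((16 + i) - sh);
  }
}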
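
The rewritten DOT_PRODUCT_3/DOT_PRODUCT_4 sequences widen both operands to double with vcvtps2pd, multiply and accumulate in double precision, round back to single exactly once with vcvtsd2ss, and use the MXCSR overflow flag (cleared up front, spilled to GUEST_SCRATCH64, re-read afterwards) to decide whether to return QNaN. A rough scalar model of that behavior, assuming single-precision overflow is the only condition the flag check is meant to catch (DotProductModel is an illustrative name, and the hardware flag semantics may differ slightly at the edges):

#include <cmath>
#include <limits>

// Scalar model of the new dot-product sequences: products of floats are
// exact in double, the sums round only to double precision (mirroring the
// vaddsd chain), and the result is rounded to float once at the end.
// Overflow to single-precision infinity becomes QNaN, as the emitted code
// does when it finds the MXCSR overflow flag set.
float DotProductModel(const float* a, const float* b, int n /* 3 or 4 */) {
  double acc = 0.0;
  for (int i = 0; i < n; ++i) {
    acc += static_cast<double>(a[i]) * static_cast<double>(b[i]);
  }
  float rounded = static_cast<float>(acc);
  if (std::isinf(rounded) && !std::isinf(acc)) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return rounded;  // the emitter then broadcasts this across all four lanes
}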
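
In stvlx/stvrx the explicit f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)) merge is replaced with f.Select(mask, old_value, new_value); for vec128 constants the Value::Select added in this change folds that as a plain bitwise blend of the two 64-bit halves, so the two forms compute the same value. A one-line sketch of the merge on a 64-bit half (SelectMerge is an illustrative name):

#include <cstdint>

// Bitwise select used by the new stvlx/stvrx lowering: bits where the mask
// is 0 come from 'older', bits where it is 1 come from 'newer'.
uint64_t SelectMerge(uint64_t mask, uint64_t older, uint64_t newer) {
  return (older & ~mask) | (newer & mask);
}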