Merge pull request #50 from chrisps/canary_experimental

Ton of cpu changes
Radosław Gliński 2022-07-16 19:39:45 +02:00 committed by GitHub
commit 23ca3725c4
17 changed files with 574 additions and 231 deletions

View File

@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
EmitSaveNonvolatileRegs();
mov(rax, rcx);
mov(rsi, rdx); // context
mov(rcx, r8); // return address
mov(rsi, rdx); // context
mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase
mov(rcx, r8); // return address
call(rax);
vzeroupper();
EmitLoadNonvolatileRegs();
code_offsets.epilog = getSize();
@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
// chrispy: added this for proper vmsum impl, avx2 bitshifts
vzeroupper();
// Save off volatile registers.
EmitSaveVolatileRegs();

View File

@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
bool is_zennish = cpu_.displayFamily >= 0x17;
if (is_zennish) {
feature_flags_ |= kX64FastJrcx;
if (is_zennish) {
feature_flags_ |= kX64FastJrcx;
if (cpu_.displayFamily > 0x17) {
feature_flags_ |= kX64FastLoop;
if (cpu_.displayFamily > 0x17) {
feature_flags_ |= kX64FastLoop;
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
feature_flags_ |= kX64FastLoop;
} // todo: figure out at which model zen+ became zen2; this is just the model
// for my cpu, which is ripper90
}
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
feature_flags_ |= kX64FastLoop;
} // todo: figure out at which model zen+ became zen2; this is just the model
// for my cpu, which is ripper90
}
}
}
@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
}
// Load membase.
mov(GetMembaseReg(),
/*
* chrispy: removed this, as long as we load it in HostToGuestThunk we can
count on no other code modifying it. mov(GetMembaseReg(),
qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
*/
// Body.
auto block = builder->first_block();
while (block) {
@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// NOTE: If you encounter this after adding a new instruction, do a full
// rebuild!
assert_always();
XELOGE("Unable to process HIR opcode {}", instr->opcode->name);
XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode));
break;
}
instr = new_tail;
@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
/*
* chrispy: removed this, it serves no purpose
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
*/
code_offsets.epilog = getSize();
add(rsp, (uint32_t)stack_size);
@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
if (cvars::emit_source_annotations) {
nop(5);
}
assert_zero(code_offsets.prolog);
@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
void X64Emitter::ReloadContext() {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
}
void X64Emitter::ReloadMembase() {
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
}
#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__)
#define mh_concat2_m(x, y) __NH_CONCAT(x, y)
#define DECLNOP(n, ...) \
static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}
DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
static constexpr const unsigned char* const g_noptable[] = {
&nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
&nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};
static constexpr unsigned LENGTHOF_NOPTABLE =
sizeof(g_noptable) / sizeof(g_noptable[0]);
// Len Assembly Byte Sequence
// ============================================================================
@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE =
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
while (length != 0) {
unsigned patchsize = length % LENGTHOF_NOPTABLE;
// patch_memory(locptr, size, (char*)g_noptable[patchsize]);
for (unsigned i = 0; i < patchsize; ++i) {
db(g_noptable[patchsize][i]);
}
//locptr += patchsize;
length -= patchsize;
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}
@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = {
0x80, 0x80, 0x80, 0x80),
/*XMMShortsToBytes*/
v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80)
};
0x80, 0x80, 0x80),
/*XMMLVSLTableBase*/
vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/*XMMLVSRTableBase*/
vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
/* XMMSingleDenormalMask */
vec128i(0x7f800000),
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// 1111...
vpcmpeqb(dest, dest);
} else {
for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i] == v) {
vmovapd(dest, GetXmmConstPtr((XmmConst)i));

View File

@ -118,7 +118,12 @@ enum XmmConst {
XMM2To32,
XMMFloatInf,
XMMIntsToBytes,
XMMShortsToBytes
XMMShortsToBytes,
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
public:
@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
void ReloadContext();
void ReloadMembase();
void nop(size_t length = 1);

View File

@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I);
struct VECTOR_DENORMFLUSH
: Sequence<VECTOR_DENORMFLUSH,
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123
e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
e.vandps(e.xmm0, i.src1, e.xmm2); // P0123
e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs
// if it does not equal zero, we stay
}
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH);
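
For reference, here is a minimal scalar sketch of the flush-to-signed-zero behavior the sequence above encodes (it mirrors the Value::DenormalFlush constant-folding helper added further down in value.cc; the function name is illustrative only):

#include <cstdint>

// If the exponent field (0x7f800000) is all zeros, the value is a denormal
// (or zero); keep only the sign bit so it becomes a signed zero.
static uint32_t FlushDenormalBits(uint32_t bits) {
  if ((bits & 0x7f800000u) == 0) {
    bits &= 0x80000000u;
  }
  return bits;
}
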
// ============================================================================
// OPCODE_LOAD_VECTOR_SHL
// ============================================================================
@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsl_table));
e.mov(e.rax, (uintptr_t)&lvsl_table[sh]);
e.vmovaps(i.dest, e.ptr[e.rax]);
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase));
} else {
// this is probably extremely rare
e.LoadConstantXmm(i.dest, lvsl_table[sh]);
}
} else {
// TODO(benvanik): find a cheaper way of doing this.
e.movzx(e.rdx, i.src1);
e.and_(e.dx, 0xF);
e.shl(e.dx, 4);
e.mov(e.rax, (uintptr_t)lvsl_table);
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);
// chrispy: removed mask, ppc_emit_altivec already pre-ands it.
e.vmovd(e.xmm0, i.src1.reg().cvt32());
// broadcast byte
// don't use broadcastb with avx2, it's slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase));
}
}
};
@ -195,15 +220,23 @@ struct LOAD_VECTOR_SHR_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsr_table));
e.mov(e.rax, (uintptr_t)&lvsr_table[sh]);
e.vmovaps(i.dest, e.ptr[e.rax]);
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase));
} else {
e.LoadConstantXmm(i.dest, lvsr_table[sh]);
}
} else {
// TODO(benvanik): find a cheaper way of doing this.
e.movzx(e.rdx, i.src1);
e.and_(e.dx, 0xF);
e.shl(e.dx, 4);
e.mov(e.rax, (uintptr_t)lvsr_table);
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);
// chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed
// lookup as well, compute from LVSR base instead
e.vmovd(e.xmm0, i.src1.reg().cvt32());
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase));
// broadcast byte
// don't use broadcastb with avx2, it's slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
e.vpsubb(i.dest, e.xmm1, e.xmm0);
}
}
};
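
As a sanity check on the table-base trick used in both sequences above (a sketch assuming the usual AltiVec definitions of lvsl/lvsr; the helper names are illustrative): the lvsl row for shift amount sh is the 0..15 base plus sh in every byte, and the lvsr row is the 16..31 base minus sh, which is exactly what the byte broadcast plus vpaddb/vpsubb against XMMLVSLTableBase/XMMLVSRTableBase computes.

#include <cstdint>

// lvsl_table[sh][i] == sh + i       -> XMMLVSLTableBase (0..15) + broadcast(sh)
static uint8_t LvslEntry(uint8_t sh, uint8_t i) { return uint8_t(sh + i); }

// lvsr_table[sh][i] == 16 - sh + i  -> XMMLVSRTableBase (16..31) - broadcast(sh)
static uint8_t LvsrEntry(uint8_t sh, uint8_t i) { return uint8_t(16 + i - sh); }
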
@ -728,7 +761,7 @@ struct VECTOR_SHL_V128
}
}
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic).
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
@ -1793,6 +1826,14 @@ struct PERMUTE_I32
}
}
};
//todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
input.u8[i] &= 0x1F;
}
return input;
}
struct PERMUTE_V128
: Sequence<PERMUTE_V128,
I<OPCODE_PERMUTE, V128Op, V128Op, V128Op, V128Op>> {
@ -1855,7 +1896,8 @@ struct PERMUTE_V128
} else {
e.vpshufb(src3_shuf, i.src3, e.xmm2);
}
// Build a mask with values in src2 having 0 and values in src3 having 1.
// Build a mask with values in src2 having 0 and values in src3
// having 1.
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
}

View File

@ -35,11 +35,14 @@
#include "xenia/cpu/backend/x64/x64_emitter.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
// needed for stmxcsr
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"
DEFINE_bool(use_fast_dot_product, false,
"Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr"
"Experimental optimization, much shorter sequence on dot products, "
"treating inf as overflow instead of using mcxsr"
"four insn dotprod",
"CPU");
namespace xe {
@ -1996,8 +1999,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
assert_true(!i.instr->flags);
EmitAssociativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
// e.vrcpps(e.xmm0, src2);
//e.vmulps(dest, src1, e.xmm0);
// e.vrcpps(e.xmm0, src2);
// e.vmulps(dest, src1, e.xmm0);
e.vdivps(dest, src1, src2);
});
}
@ -2607,68 +2610,84 @@ struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);
struct DOT_PRODUCT_V128 {
static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
if (cvars::use_fast_dot_product) {
e.vdpps(dest, src1, src2, imm);
e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
} else {
// TODO(benvanik): apparently this is very slow
// - find alternative?
Xbyak::Label end;
e.inLocalLabel();
// Grab space to put MXCSR.
// TODO(gibbed): stick this in TLS or
// something?
e.sub(e.rsp, 8);
// Grab MXCSR and mask off the overflow flag,
// because it's sticky.
e.vstmxcsr(e.dword[e.rsp]);
e.mov(e.eax, e.dword[e.rsp]);
e.and_(e.eax, uint32_t(~8));
e.mov(e.dword[e.rsp], e.eax);
e.vldmxcsr(e.dword[e.rsp]);
// Hey we can do the dot product now.
e.vdpps(dest, src1, src2, imm);
// Load MXCSR...
e.vstmxcsr(e.dword[e.rsp]);
// ..free our temporary space and get MXCSR at
// the same time
e.pop(e.rax);
// Did we overflow?
e.test(e.al, 8);
e.jz(end);
// Infinity? HA! Give NAN.
e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));
e.L(end);
e.outLocalLabel();
}
}
};
// ============================================================================
// OPCODE_DOT_PRODUCT_3
// ============================================================================
struct DOT_PRODUCT_3_V128
: Sequence<DOT_PRODUCT_3_V128,
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
I<OPCODE_DOT_PRODUCT_3, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001);
});
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
// this is going to hurt a bit...
/*
this implementation is accurate: it matches the results of xb360 vmsum3
(except that vmsum3 itself is often off by 1 bit), but it's extremely slow.
it is a long, unbroken chain of dependencies, and the three uses of mxcsr
all cost about 15-20 cycles at the very least on amd zen processors. on
older amd the figures agner has are pretty horrible, and it looks like it's
just as bad on modern intel cpus until just recently. perhaps a better way
of detecting overflow would be to just compare with inf. todo: test whether
a cmp with inf can replace the mxcsr check.
*/
e.vstmxcsr(mxcsr_storage);
e.mov(e.eax, 8);
auto src1v = e.xmm0;
auto src2v = e.xmm1;
if (i.src1.is_constant) {
src1v = e.xmm0;
e.LoadConstantXmm(src1v, i.src1.constant());
} else {
src1v = i.src1.reg();
}
if (i.src2.is_constant) {
src2v = e.xmm1;
e.LoadConstantXmm(src2v, i.src2.constant());
} else {
src2v = i.src2.reg();
}
e.not_(e.eax);
// todo: maybe the top element should be cleared by the InstrEmit_ function,
// so that in the future this could be optimized away if the top is known to
// be zero. Right now I'm not sure that happens often, though, so it's
// currently not worth it. Also, maybe pre-and if constant.
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go
e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);
/*
ymm0 = src1 as doubles, ele 3 cleared
ymm1 = src2 as doubles, ele 3 cleared
*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3
e.vaddsd(e.xmm3, e.xmm3, e.xmm2);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
e.vcvtsd2ss(e.xmm1, e.xmm2);
// this is awful
e.vstmxcsr(mxcsr_storage);
e.test(mxcsr_storage, e.eax);
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan);
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
e.vbroadcastss(i.dest, e.xmm1);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
e.L(done);
}
};
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);
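
A rough scalar model of what the new dot-product-3 sequence computes (a hedged sketch, not the emitter's code path; the check below stands in for the MXCSR overflow test): multiply and accumulate the three lane products in double precision, round once to single precision, and report overflow of that final conversion as QNaN splatted across all lanes.

#include <cmath>
#include <limits>

static void Dot3Reference(const float a[4], const float b[4], float out[4]) {
  // Accumulate in double precision; float inputs cannot overflow a double here.
  double sum = (double)a[0] * b[0] + (double)a[1] * b[1] + (double)a[2] * b[2];
  float result = (float)sum;
  // If the double fit but the float did not, the conversion overflowed --
  // the case the generated code detects via the MXCSR overflow flag.
  if (std::isinf(result) && !std::isinf(sum)) {
    result = std::numeric_limits<float>::quiet_NaN();
  }
  for (int i = 0; i < 4; ++i) {
    out[i] = result;  // matches the vbroadcastss at the end of the sequence
  }
}
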
@ -2678,13 +2697,81 @@ EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);
// ============================================================================
struct DOT_PRODUCT_4_V128
: Sequence<DOT_PRODUCT_4_V128,
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
I<OPCODE_DOT_PRODUCT_4, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001);
});
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
e.vstmxcsr(mxcsr_storage);
e.mov(e.eax, 8);
auto src1v = e.xmm3;
auto src2v = e.xmm2;
if (i.src1.is_constant) {
src1v = e.xmm3;
e.LoadConstantXmm(src1v, i.src1.constant());
} else {
src1v = i.src1.reg();
}
if (i.src2.is_constant) {
src2v = e.xmm2;
e.LoadConstantXmm(src2v, i.src2.constant());
} else {
src2v = i.src2.reg();
}
e.not_(e.eax);
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage);
e.vcvtps2pd(e.ymm0, src1v);
e.vcvtps2pd(e.ymm1, src2v);
/*
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go
e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);
e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
e.vextractf128(e.xmm4, e.ymm5, 1);
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
e.vcvtsd2ss(e.xmm1, e.xmm2);
*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
e.vcvtsd2ss(e.xmm1, e.xmm2);
e.vstmxcsr(mxcsr_storage);
e.test(mxcsr_storage, e.eax);
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.vbroadcastss(i.dest, e.xmm1);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
e.L(done);
// e.DebugBreak();
}
};
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128);
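
The MXCSR pattern shared by both dot-product sequences, written with host intrinsics purely for illustration (the emitter spills MXCSR to the GUEST_SCRATCH64 stack slot rather than calling these): clear the sticky overflow flag (OE, bit 3 = 0x8), run the arithmetic, then re-read MXCSR and test that bit.

#include <xmmintrin.h>  // _mm_getcsr / _mm_setcsr

// Run `op` and report whether any SSE operation inside it set the sticky
// MXCSR overflow flag. Sketch only; real code must also keep the compiler
// from reordering the arithmetic around the MXCSR accesses.
template <typename F>
static bool RanWithSseOverflow(F&& op) {
  _mm_setcsr(_mm_getcsr() & ~0x8u);   // clear OE so old overflows don't leak in
  op();                               // e.g. the converts/multiplies/adds above
  return (_mm_getcsr() & 0x8u) != 0;  // OE set => something overflowed
}
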
@ -2759,7 +2846,6 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);
@ -3419,7 +3505,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {
return true;
}
}
XELOGE("No sequence match for variant {}", i->opcode->name);
XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode));
return false;
}

View File

@ -122,7 +122,8 @@ class StackLayout {
*
*/
static const size_t GUEST_STACK_SIZE = 104;
static const size_t GUEST_CTX_HOME = 80;
// was GUEST_CTX_HOME; it can't be removed because that would throw stack
// alignment off. Instead, it can be used as a temporary in sequences.
static const size_t GUEST_SCRATCH64 = 80;
static const size_t GUEST_RET_ADDR = 88;
static const size_t GUEST_CALL_RET_ADDR = 96;
};

View File

@ -312,13 +312,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
} else if (i->src2.value->IsConstant() &&
i->src3.value->IsConstant()) {
// TODO: Select
// v->set_from(i->src2.value);
// v->Select(i->src3.value, i->src1.value);
// i->Remove();
v->set_from(i->src2.value);
v->Select(i->src3.value, i->src1.value);
i->Remove();
result = true;
}
} else {
// TODO: vec128 select
if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
v->set_from(i->src2.value);
v->Select(i->src3.value, i->src1.value);
i->Remove();
result = true;
}
}
}
break;
@ -744,8 +749,35 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
// TODO(benvanik): INSERT/EXTRACT
// TODO(benvanik): PERMUTE/SWIZZLE
case OPCODE_PERMUTE: {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant() &&
(i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
v->set_from(i->src1.value);
v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant()) {
v->set_from(i->src1.value);
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
case OPCODE_SWIZZLE:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
case OPCODE_EXTRACT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_zero(v->type);
@ -867,24 +899,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
}
break;
case OPCODE_DOT_PRODUCT_3:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->DotProduct3(i->src2.value);
i->Remove();
result = true;
}
break;
case OPCODE_DOT_PRODUCT_4:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->DotProduct4(i->src2.value);
i->Remove();
result = true;
}
break;
case OPCODE_VECTOR_AVERAGE:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
@ -896,7 +910,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_VECTOR_DENORMFLUSH:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->DenormalFlush();
i->Remove();
result = true;
}
break;
default:
// Ignored.
break;

View File

@ -132,10 +132,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
while (outgoing_ordinal != -1) {
Value* src_value = value_map[outgoing_ordinal];
assert_not_null(src_value);
if (!src_value->local_slot) {
src_value->local_slot = builder->AllocLocal(src_value->type);
if (!src_value->HasLocalSlot()) {
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
}
builder->StoreLocal(src_value->local_slot, src_value);
builder->StoreLocal(src_value->GetLocalSlot(), src_value);
// If we are in the block the value was defined in:
if (src_value->def->block == block) {
@ -168,10 +168,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
while (incoming_ordinal != -1) {
Value* src_value = value_map[incoming_ordinal];
assert_not_null(src_value);
if (!src_value->local_slot) {
src_value->local_slot = builder->AllocLocal(src_value->type);
if (!src_value->HasLocalSlot()) {
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
}
Value* local_value = builder->LoadLocal(src_value->local_slot);
Value* local_value = builder->LoadLocal(src_value->GetLocalSlot());
builder->last_instr()->MoveBefore(block->instr_head);
// Swap uses of original value with the local value.

View File

@ -365,7 +365,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
auto new_head_use = next_use;
// Allocate local.
if (spill_value->local_slot) {
if (spill_value->HasLocalSlot()) {
// Value is already assigned a slot. Since we allocate in order and this is
// all SSA we know the stored value will be exactly what we want. Yay,
// we can prevent the redundant store!
@ -373,10 +373,10 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
// use the spilled value and prevent the need for more locals.
} else {
// Allocate a local slot.
spill_value->local_slot = builder->AllocLocal(spill_value->type);
spill_value->SetLocalSlot(builder->AllocLocal(spill_value->type));
// Add store.
builder->StoreLocal(spill_value->local_slot, spill_value);
builder->StoreLocal(spill_value->GetLocalSlot(), spill_value);
auto spill_store = builder->last_instr();
auto spill_store_use = spill_store->src2_use;
assert_null(spill_store_use->prev);
@ -417,7 +417,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
// use is after the instruction requesting the spill we know we haven't
// done allocation for that code yet and can let that be handled
// automatically when we get to it.
auto new_value = builder->LoadLocal(spill_value->local_slot);
auto new_value = builder->LoadLocal(spill_value->GetLocalSlot());
auto spill_load = builder->last_instr();
spill_load->MoveBefore(next_use->instr);
// Note: implicit first use added.
@ -429,7 +429,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
// Set the local slot of the new value to our existing one. This way we will
// reuse that same memory if needed.
new_value->local_slot = spill_value->local_slot;
new_value->SetLocalSlot(spill_value->GetLocalSlot());
// Rename all future uses of the SSA value to the new value as loaded
// from the local.

View File

@ -260,9 +260,9 @@ void HIRBuilder::Dump(StringBuffer* str) {
str->Append(" = ");
}
if (i->flags) {
str->AppendFormat("{}.{}", info->name, i->flags);
str->AppendFormat("{}.{}", GetOpcodeName(info), i->flags);
} else {
str->Append(info->name);
str->Append(GetOpcodeName(info));
}
if (src1_type) {
str->Append(' ');
@ -712,7 +712,6 @@ Value* HIRBuilder::AllocValue(TypeName type) {
value->use_head = NULL;
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg.set = NULL;
value->reg.index = -1;
return value;
@ -723,12 +722,11 @@ Value* HIRBuilder::CloneValue(Value* source) {
value->ordinal = next_value_ordinal_++;
value->type = source->type;
value->flags = source->flags;
value->local_slot = NULL;
value->constant.v128 = source->constant.v128;
value->def = NULL;
value->use_head = NULL;
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg.set = NULL;
value->reg.index = -1;
return value;
@ -1493,7 +1491,16 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
return VectorCompareXX(OPCODE_VECTOR_COMPARE_UGE_info, value1, value2,
part_type);
}
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
return value1;
ASSERT_VECTOR_TYPE(value1);
Instr* i =
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->src2.value = nullptr;
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::Add(Value* value1, Value* value2,
uint32_t arithmetic_flags) {
ASSERT_TYPES_EQUAL(value1, value2);
@ -1713,13 +1720,13 @@ Value* HIRBuilder::Log2(Value* value) {
return i->dest;
}
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value1);
ASSERT_VECTOR_TYPE(value2);
ASSERT_TYPES_EQUAL(value1, value2);
Instr* i =
AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(FLOAT32_TYPE));
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->set_src2(value2);
i->src3.value = NULL;
@ -1731,8 +1738,7 @@ Value* HIRBuilder::DotProduct4(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value2);
ASSERT_TYPES_EQUAL(value1, value2);
Instr* i =
AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(FLOAT32_TYPE));
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->set_src2(value2);
i->src3.value = NULL;

View File

@ -199,6 +199,7 @@ class HIRBuilder {
Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorDenormFlush(Value* value1);
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,

View File

@ -15,14 +15,23 @@ namespace hir {
#define DEFINE_OPCODE(num, name, sig, flags) \
const OpcodeInfo num##_info = { \
num, \
flags, \
sig, \
name, \
num, \
};
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE
const char* GetOpcodeName(Opcode num) {
switch (num) {
#define DEFINE_OPCODE(num, name, sig, flags) \
case num: \
return name;
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE
}
return "invalid opcode";
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -280,7 +280,8 @@ enum Opcode {
OPCODE_ATOMIC_EXCHANGE,
OPCODE_ATOMIC_COMPARE_EXCHANGE,
OPCODE_SET_ROUNDING_MODE,
__OPCODE_MAX_VALUE, // Keep at end.
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
__OPCODE_MAX_VALUE, // Keep at end.
};
enum OpcodeFlags {
@ -352,17 +353,42 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
OpcodeSignatureType& src1,
OpcodeSignatureType& src2,
OpcodeSignatureType& src3) {
dest = GET_OPCODE_SIG_TYPE_DEST(sig);
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
}
constexpr uint32_t GetNumOperandsForSig(uint32_t sig) {
sig >>= 3;
uint32_t result = 0;
while (sig) {
if (sig & 0x7) {
++result;
}
sig >>= 3;
}
return result;
}
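
A small worked example of the operand counting, assuming the 3-bit packing implied by the GET_OPCODE_SIG_TYPE_* accessors above (dest in bits 0-2, src1 in 3-5, src2 in 6-8, src3 in 9-11); the kV value below is illustrative, not the real OPCODE_SIG_TYPE_V constant:

// Any nonzero 3-bit operand type works for the demonstration.
constexpr uint32_t kV = 4;
// dest = V, src1 = V, src2 = V, src3 empty: a "V = V, V" style signature.
constexpr uint32_t kSigVVV = kV | (kV << 3) | (kV << 6);
// The dest slot is shifted away first, so only src1 and src2 are counted.
static_assert(GetNumOperandsForSig(kSigVVV) == 2, "two source operands");
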
typedef struct {
Opcode num;
uint32_t flags;
uint32_t signature;
const char* name;
Opcode num;
} OpcodeInfo;
#define DEFINE_OPCODE(num, name, sig, flags) extern const OpcodeInfo num##_info;
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE
const char* GetOpcodeName(Opcode num);
static inline const char* GetOpcodeName(const OpcodeInfo* info) {
return GetOpcodeName(info->num);
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -673,3 +673,10 @@ DEFINE_OPCODE(
"set_rounding_mode",
OPCODE_SIG_X_V,
0)
DEFINE_OPCODE(
OPCODE_VECTOR_DENORMFLUSH,
"vector_denormflush",
OPCODE_SIG_V_V,
0
)

View File

@ -864,10 +864,112 @@ void Value::Extract(Value* vec, Value* index) {
break;
}
}
void Value::Permute(Value* src1, Value* src2, TypeName type) {
if (type == INT8_TYPE) {
uint8_t table[32];
for (uint32_t i = 0; i < 16; ++i) {
table[i] = src1->constant.v128.u8[i];
table[i + 16] = src2->constant.v128.u8[i];
}
for (uint32_t i = 0; i < 16; ++i) {
constant.v128.u8[i] = table[(constant.v128.u8[i] ^ 3) & 0x1f];
}
} else if (type == INT16_TYPE) {
vec128_t perm = (constant.v128 & vec128s(0xF)) ^ vec128s(0x1);
vec128_t perm_ctrl = vec128b(0);
for (int i = 0; i < 8; i++) {
perm_ctrl.i16[i] = perm.i16[i] > 7 ? -1 : 0;
auto v = uint8_t(perm.u16[i]);
perm.u8[i * 2] = v * 2;
perm.u8[i * 2 + 1] = v * 2 + 1;
}
auto lod = [](const vec128_t& v) {
return _mm_loadu_si128((const __m128i*)&v);
};
auto sto = [](vec128_t& v, __m128i x) {
return _mm_storeu_si128((__m128i*)&v, x);
};
__m128i xmm1 = lod(src1->constant.v128);
__m128i xmm2 = lod(src2->constant.v128);
xmm1 = _mm_shuffle_epi8(xmm1, lod(perm));
xmm2 = _mm_shuffle_epi8(xmm2, lod(perm));
uint8_t mask = 0;
for (int i = 0; i < 8; i++) {
if (perm_ctrl.i16[i] == 0) {
mask |= 1 << (7 - i);
}
}
vec128_t unp_mask = vec128b(0);
for (int i = 0; i < 8; i++) {
if (mask & (1 << i)) {
unp_mask.u16[i] = 0xFFFF;
}
}
sto(constant.v128, _mm_blendv_epi8(xmm1, xmm2, lod(unp_mask)));
} else {
assert_unhandled_case(type);
}
}
void Value::Insert(Value* index, Value* part, TypeName type) {
vec128_t* me = &constant.v128;
switch (type) {
case INT8_TYPE:
me->u8[index->constant.u8 ^ 3] = part->constant.u8;
break;
case INT16_TYPE:
me->u16[index->constant.u8 ^ 1] = part->constant.u16;
break;
case INT32_TYPE:
me->u32[index->constant.u8] = part->constant.u32;
break;
}
}
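
The ^ 3 byte-index flip used in Permute and Insert (and the ^ 1 flip for halfwords) assumes xenia's vec128_t layout, where guest element numbering is big-endian within each 32-bit lane while each lane is stored in host little-endian byte order; a quick hedged check of that mapping (helper name illustrative):

#include <cstdint>

// Guest (AltiVec) byte index -> index into vec128_t::u8 under the assumed
// layout: same 32-bit lane, byte order inside the lane reversed.
static constexpr uint32_t GuestByteToHostByte(uint32_t guest_index) {
  return (guest_index & ~3u) | (3u - (guest_index & 3u));  // == guest_index ^ 3
}
static_assert(GuestByteToHostByte(0) == 3, "guest byte 0 is lane 0's high byte");
static_assert(GuestByteToHostByte(5) == 6, "same as 5 ^ 3");
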
void Value::Swizzle(uint32_t mask, TypeName type) {
if (type == INT32_TYPE || type == FLOAT32_TYPE) {
vec128_t result = vec128b(0);
for (uint32_t i = 0; i < 4; ++i) {
result.u32[i] = constant.v128.u32[(mask >> (i * 2)) & 0b11];
}
constant.v128 = result;
} else {
assert_unhandled_case(type);
}
}
void Value::Select(Value* other, Value* ctrl) {
// TODO
assert_always();
if (ctrl->type == VEC128_TYPE) {
constant.v128.low = (constant.v128.low & ~ctrl->constant.v128.low) |
(other->constant.v128.low & ctrl->constant.v128.low);
constant.v128.high = (constant.v128.high & ~ctrl->constant.v128.high) |
(other->constant.v128.high & ctrl->constant.v128.high);
} else {
if (ctrl->constant.u8) {
switch (other->type) {
case INT8_TYPE:
constant.u8 = other->constant.u8;
break;
case INT16_TYPE:
constant.u16 = other->constant.u16;
break;
case INT32_TYPE:
case FLOAT32_TYPE:
constant.u32 = other->constant.u32;
break;
case INT64_TYPE:
case FLOAT64_TYPE:
constant.u64 = other->constant.u64;
break;
}
}
}
}
void Value::Splat(Value* other) {
@ -1532,7 +1634,15 @@ void Value::ByteSwap() {
break;
}
}
void Value::DenormalFlush() {
for (int i = 0; i < 4; ++i) {
uint32_t current_element = constant.v128.u32[i];
if ((current_element & 0x7f800000) == 0) {
current_element = current_element & 0x80000000;
}
constant.v128.u32[i] = current_element;
}
}
void Value::CountLeadingZeros(const Value* other) {
switch (other->type) {
case INT8_TYPE:

View File

@ -104,6 +104,9 @@ struct ValueMask {
class Value {
public:
/*
todo: this should be intrusive and be part of Instr instead.
*/
typedef struct Use_s {
Instr* instr;
Use_s* prev;
@ -128,17 +131,16 @@ class Value {
TypeName type;
uint32_t flags;
RegAssignment reg;
ConstantValue constant;
Instr* def;
Use* use_head;
// NOTE: for performance reasons this is not maintained during construction.
Instr* last_use;
Value* local_slot;
// TODO(benvanik): remove to shrink size.
void* tag;
RegAssignment reg;
union {
Value* local_slot;
ConstantValue constant;
};
Use* AddUse(Arena* arena, Instr* instr);
void RemoveUse(Use* use);
@ -209,7 +211,20 @@ class Value {
flags = other->flags;
constant.v128 = other->constant.v128;
}
bool HasLocalSlot() const {
return !(flags & VALUE_IS_CONSTANT) && local_slot;
}
void SetLocalSlot(Value* lslot) {
assert(!(flags & VALUE_IS_CONSTANT));
local_slot = lslot;
}
Value* GetLocalSlot() {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
const Value* GetLocalSlot() const {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
bool IsConstantTrue() const {
if (type == VEC128_TYPE) {
@ -555,7 +570,10 @@ class Value {
void Shr(Value* other);
void Sha(Value* other);
void RotateLeft(Value* other);
void Insert(Value* index, Value* part, TypeName type);
void Extract(Value* vec, Value* index);
void Permute(Value* src1, Value* src2, TypeName type);
void Swizzle(uint32_t mask, TypeName type);
void Select(Value* other, Value* ctrl);
void Splat(Value* other);
void VectorCompareEQ(Value* other, TypeName type);
@ -575,6 +593,8 @@ class Value {
void VectorAverage(Value* other, TypeName type, bool is_unsigned,
bool saturate);
void ByteSwap();
void DenormalFlush();
void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();

View File

@ -279,14 +279,21 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// ea &= ~0xF
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new >> eb) & mask)
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
f.LoadVR(vd), INT8_TYPE);
Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
/*
these permutes need to be looked at closer. keep in mind Permute is meant to
emulate vmx's shuffles and does not generate particularly good code. The
logic here looks as if it might make more sense as a comparison.
*/
// mask = FFFF... >> eb
Value* mask = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
f.Not(f.LoadZeroVec128()), INT8_TYPE);
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
return 0;
@ -321,14 +328,14 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
ea = CalculateEA_0(f, ra, rb);
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new << eb) & mask)
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadVR(vd),
f.LoadZeroVec128(), INT8_TYPE);
Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
// mask = ~FFFF... >> eb
Value* mask = f.Permute(f.LoadVectorShr(eb), f.Not(f.LoadZeroVec128()),
f.LoadZeroVec128(), INT8_TYPE);
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
f.MarkLabel(skip_label);
@ -815,8 +822,16 @@ int InstrEmit_vlogefp128(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_vmaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
uint32_t vc) {
/*
chrispy: testing on POWER8 revealed that altivec vmaddfp unconditionally
flushes denormal inputs to 0, regardless of NJM setting
*/
Value* a = f.VectorDenormFlush(f.LoadVR(va));
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
// (VD) <- ((VA) * (VC)) + (VB)
Value* v = f.MulAdd(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb));
Value* v = f.MulAdd(a, c, b);
// todo: do denormal results also unconditionally become 0?
f.StoreVR(vd, v);
return 0;
}
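
A minimal per-lane sketch of the guest-visible behavior being modeled here (illustrative helper, not the emitter path; the real flushing goes through OPCODE_VECTOR_DENORMFLUSH before the MulAdd): denormal inputs are replaced with zero of the same sign before the multiply-add.

#include <cmath>

static float VmaddfpLane(float a, float c, float b) {
  auto daz = [](float f) {
    // POWER8-observed behavior: denormal inputs act as (signed) zero.
    return std::fpclassify(f) == FP_SUBNORMAL ? std::copysign(0.0f, f) : f;
  };
  return daz(a) * daz(c) + daz(b);
}
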
@ -832,9 +847,14 @@ int InstrEmit_vmaddfp128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_vmaddcfp128(PPCHIRBuilder& f, const InstrData& i) {
/*
see vmaddfp about these denormflushes
*/
Value* a = f.VectorDenormFlush(f.LoadVR(VX128_VA128));
Value* b = f.VectorDenormFlush(f.LoadVR(VX128_VB128));
Value* d = f.VectorDenormFlush(f.LoadVR(VX128_VD128));
// (VD) <- ((VA) * (VD)) + (VB)
Value* v = f.MulAdd(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VD128),
f.LoadVR(VX128_VB128));
Value* v = f.MulAdd(a, d, b);
f.StoreVR(VX128_VD128, v);
return 0;
}
@ -1085,7 +1105,8 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZ.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
v = f.Splat(v, VEC128_TYPE);
//chrispy: denormal outputs for Dot product are unconditionally made 0
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;
}
@ -1094,7 +1115,7 @@ int InstrEmit_vmsum4fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZW.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w)
Value* v = f.DotProduct4(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
v = f.Splat(v, VEC128_TYPE);
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;
}
@ -1151,7 +1172,19 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
// (VD) <- -(((VA) * (VC)) - (VB))
// NOTE: only one rounding should take place, but that's hard...
// This really needs VFNMSUB132PS/VFNMSUB213PS/VFNMSUB231PS but that's AVX.
Value* v = f.Neg(f.MulSub(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb)));
// NOTE2: we could make vnmsub a new opcode, and then do it in double
// precision, rounding after the neg
/*
chrispy: this is untested, but i believe this has the same DAZ behavior for
inputs as vmadd
*/
Value* a = f.VectorDenormFlush(f.LoadVR(va));
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}