remove useless tag field from hir::Value
pack local_slot and constant in hir::Value

- Instead of loading membase at the start of every function, just load it in HostToGuestThunk.
- vzeroupper in GuestToHostThunk before calling the host function, and in HostToGuestThunk after calling the function, to prevent AVX dirty-state slowdowns. In the future, check whether the CPU implements AVX as 128x2 and skip it if so (https://john-h-k.github.io/VexTransitionPenalties.html).
- Remove the useless save/restore of the ctx pointer; nothing modifies it, and it prevents CPUs from doing cross-function memory renaming (https://www.agner.org/forum/viewtopic.php?t=41). Could not remove the space on the stack because of alignment issues; instead it was turned into GUEST_SCRATCH64, a temporary that sequences may use.
- Reorder OpcodeInfo so that name is at offset 0; remove name and add a GetOpcodeName function (name is only used for debug code; we are separating frequently accessed data from rarely accessed data).
- Add a VECTOR_DENORMFLUSH opcode for handling output to DOT_PRODUCT and other opcodes that implicitly force denormal inputs/outputs to zero; will eventually be used for implementing NJM. (A short sketch of the per-lane rule follows below.)
- Rewrite the sequences for LOAD_VECTOR_SHL/SHR. The mask with 0xf in it was pointless, as all InstrEmit_ functions that create the load-shift instructions already do that in HIR. The tables are now only used for nonzero constant inputs, which are probably pretty rare. Instead of doing a shift and lookup, a base value is kept in the constant table for both, and the input is added to or subtracted from it.
- Reuse the result of LoadVectorShl/Shr in InstrEmit_stvlx_ and InstrEmit_stvrx_. We were previously calculating it twice, which was contributing to the final sequences' fatness. Use OPCODE_SELECT instead of the or/andnot/and sequence that was being used for merging.
- Add the proper unconditional denormal input flushing behavior to vmaddfp; add it also to vnmsubfp (making the assumption it has the same behavior).
- Remove constant propagation for DOT_PRODUCT_3/4. DOT_PRODUCT_3/4 now returns a vector with all four elements set to the result (what we were doing before, truncating to float32 and then splatting, didn't make any sense).
- Add much more correct versions of DOT_PRODUCT_3/4, matching the Xbox 360's to 1 bit. Still needs work to be a perfect emulation.
- Add constant folding for OPCODE_SELECT, OPCODE_INSERT, OPCODE_PERMUTE, OPCODE_SWIZZLE.
- Remove constant folding for DOT_PRODUCT.
- Remove the multibyte nop code I committed earlier; it doesn't help us much because nops are only used for debug stuff, and it's ugly and wouldn't survive in a PR to main.
- Check for AVX512VBMI; use vpermb to shuffle if supported.
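A minimal scalar sketch of the per-lane rule that OPCODE_VECTOR_DENORMFLUSH implements (it mirrors Value::DenormalFlush in the diff below; DenormFlushLane is a made-up name, and this illustrates the semantics, not the emitter sequence):

#include <cstdint>

static uint32_t DenormFlushLane(uint32_t lane_bits) {
  // exponent bits all zero means denormal or zero: flush to zero but keep the
  // sign bit, since zeros must keep their signs
  if ((lane_bits & 0x7F800000u) == 0) {
    lane_bits &= 0x80000000u;
  }
  return lane_bits;
}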
parent 5f11c5d3b4
commit 6a612b4d34
@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
|
|||
EmitSaveNonvolatileRegs();
|
||||
|
||||
mov(rax, rcx);
|
||||
mov(rsi, rdx); // context
|
||||
mov(rcx, r8); // return address
|
||||
mov(rsi, rdx); // context
|
||||
mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase
|
||||
mov(rcx, r8); // return address
|
||||
call(rax);
|
||||
|
||||
vzeroupper();
|
||||
EmitLoadNonvolatileRegs();
|
||||
|
||||
code_offsets.epilog = getSize();
|
||||
|
@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
|
|||
|
||||
code_offsets.prolog_stack_alloc = getSize();
|
||||
code_offsets.body = getSize();
|
||||
|
||||
// chrispy: added this for proper vmsum impl, avx2 bitshifts
|
||||
vzeroupper();
|
||||
// Save off volatile registers.
|
||||
EmitSaveVolatileRegs();
|
||||
|
||||
|
|
|
@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
|
||||
|
||||
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
|
@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
|
||||
if (is_zennish) {
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
if (is_zennish) {
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
|
||||
if (cpu_.displayFamily > 0x17) {
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (cpu_.displayFamily > 0x17) {
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
|
||||
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
} // todo: figure out at which model zen+ became zen2; this is just the model
  // for my cpu, which is ripper90
|
||||
|
||||
}
|
||||
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
} // todo: figure out at which model zen+ became zen2; this is just the model
  // for my cpu, which is ripper90
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
code_offsets.prolog_stack_alloc = getSize();
|
||||
code_offsets.body = getSize();
|
||||
|
||||
/*
|
||||
* chrispy: removed this, it serves no purpose
|
||||
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
|
||||
*/
|
||||
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
|
||||
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
|
||||
|
||||
|
@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
}
|
||||
|
||||
// Load membase.
|
||||
mov(GetMembaseReg(),
|
||||
/*
|
||||
* chrispy: removed this, as long as we load it in HostToGuestThunk we can
|
||||
count on no other code modifying it. mov(GetMembaseReg(),
|
||||
qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
|
||||
|
||||
*/
|
||||
// Body.
|
||||
auto block = builder->first_block();
|
||||
while (block) {
|
||||
|
@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
// NOTE: If you encounter this after adding a new instruction, do a full
|
||||
// rebuild!
|
||||
assert_always();
|
||||
XELOGE("Unable to process HIR opcode {}", instr->opcode->name);
|
||||
XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode));
|
||||
break;
|
||||
}
|
||||
instr = new_tail;
|
||||
|
@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
L(epilog_label);
|
||||
epilog_label_ = nullptr;
|
||||
EmitTraceUserCallReturn();
|
||||
/*
|
||||
* chrispy: removed this, it serves no purpose
|
||||
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
|
||||
|
||||
*/
|
||||
code_offsets.epilog = getSize();
|
||||
|
||||
add(rsp, (uint32_t)stack_size);
|
||||
|
@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
|
||||
if (cvars::emit_source_annotations) {
|
||||
nop(5);
|
||||
|
||||
}
|
||||
|
||||
assert_zero(code_offsets.prolog);
|
||||
|
@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
|
|||
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
|
||||
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
|
||||
|
||||
void X64Emitter::ReloadContext() {
|
||||
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
|
||||
}
|
||||
|
||||
void X64Emitter::ReloadMembase() {
|
||||
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
|
||||
}
|
||||
#define __NH_CONCAT(x, y) x##y
|
||||
#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__)
|
||||
|
||||
#define mh_concat2_m(x, y) __NH_CONCAT(x, y)
|
||||
|
||||
#define DECLNOP(n, ...) \
|
||||
static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}
|
||||
|
||||
DECLNOP(1, 0x90);
|
||||
DECLNOP(2, 0x66, 0x90);
|
||||
DECLNOP(3, 0x0F, 0x1F, 0x00);
|
||||
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
|
||||
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
|
||||
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
|
||||
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
|
||||
DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
|
||||
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
|
||||
|
||||
static constexpr const unsigned char* const g_noptable[] = {
|
||||
&nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
|
||||
&nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};
|
||||
|
||||
static constexpr unsigned LENGTHOF_NOPTABLE =
|
||||
sizeof(g_noptable) / sizeof(g_noptable[0]);
|
||||
|
||||
// Len Assembly Byte Sequence
|
||||
// ============================================================================
|
||||
|
@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE =
|
|||
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
|
||||
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
|
||||
void X64Emitter::nop(size_t length) {
|
||||
while (length != 0) {
|
||||
unsigned patchsize = length % LENGTHOF_NOPTABLE;
|
||||
|
||||
// patch_memory(locptr, size, (char*)g_noptable[patchsize]);
|
||||
|
||||
for (unsigned i = 0; i < patchsize; ++i) {
|
||||
db(g_noptable[patchsize][i]);
|
||||
}
|
||||
|
||||
//locptr += patchsize;
|
||||
length -= patchsize;
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
db(0x90);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = {
|
|||
0x80, 0x80, 0x80, 0x80),
|
||||
/*XMMShortsToBytes*/
|
||||
v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80)
|
||||
};
|
||||
0x80, 0x80, 0x80),
|
||||
/*XMMLVSLTableBase*/
|
||||
vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
|
||||
/*XMMLVSRTableBase*/
|
||||
vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
|
||||
/* XMMSingleDenormalMask */
|
||||
vec128i(0x7f800000),
|
||||
/* XMMThreeFloatMask */
|
||||
vec128i(~0U, ~0U, ~0U, 0U),
|
||||
/*XMMXenosF16ExtRangeStart*/
|
||||
vec128f(65504)};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
for (auto& vec : xmm_consts) {
|
||||
|
@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
|
|||
// 1111...
|
||||
vpcmpeqb(dest, dest);
|
||||
} else {
|
||||
|
||||
for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
|
||||
if (xmm_consts[i] == v) {
|
||||
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
|
||||
|
|
|
@ -118,7 +118,12 @@ enum XmmConst {
|
|||
XMM2To32,
|
||||
XMMFloatInf,
|
||||
XMMIntsToBytes,
|
||||
XMMShortsToBytes
|
||||
XMMShortsToBytes,
|
||||
XMMLVSLTableBase,
|
||||
XMMLVSRTableBase,
|
||||
XMMSingleDenormalMask,
|
||||
XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
|
||||
XMMXenosF16ExtRangeStart
|
||||
};
|
||||
|
||||
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
|
||||
|
@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags {
|
|||
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
|
||||
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
|
||||
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
|
||||
kX64EmitAVX512VBMI = 1 << 14
|
||||
};
|
||||
class ResolvableGuestCall {
|
||||
public:
|
||||
|
@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
Xbyak::Reg64 GetContextReg();
|
||||
Xbyak::Reg64 GetMembaseReg();
|
||||
void ReloadContext();
|
||||
|
||||
void ReloadMembase();
|
||||
|
||||
void nop(size_t length = 1);
|
||||
|
|
|
@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I);
|
||||
|
||||
struct VECTOR_DENORMFLUSH
|
||||
: Sequence<VECTOR_DENORMFLUSH,
|
||||
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123
|
||||
|
||||
e.vandps(e.xmm0, i.src1,
|
||||
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
|
||||
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
|
||||
e.vandps(e.xmm1, i.src1,
|
||||
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
|
||||
// must keep their signs
|
||||
e.vandps(e.xmm0, i.src1, e.xmm2); // P0123
|
||||
e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs
|
||||
|
||||
// if it does not equal zero, we stay
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH);
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_LOAD_VECTOR_SHL
|
||||
// ============================================================================
|
||||
|
@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8
|
|||
if (i.src1.is_constant) {
|
||||
auto sh = i.src1.constant();
|
||||
assert_true(sh < xe::countof(lvsl_table));
|
||||
e.mov(e.rax, (uintptr_t)&lvsl_table[sh]);
|
||||
e.vmovaps(i.dest, e.ptr[e.rax]);
|
||||
if (sh == 0) {
|
||||
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase));
|
||||
} else {
|
||||
// this is probably extremely rare
|
||||
e.LoadConstantXmm(i.dest, lvsl_table[sh]);
|
||||
}
|
||||
} else {
|
||||
// TODO(benvanik): find a cheaper way of doing this.
|
||||
e.movzx(e.rdx, i.src1);
|
||||
e.and_(e.dx, 0xF);
|
||||
e.shl(e.dx, 4);
|
||||
e.mov(e.rax, (uintptr_t)lvsl_table);
|
||||
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);
|
||||
// chrispy: removed mask, ppc_emit_altivec already pre-ands it.
|
||||
e.vmovd(e.xmm0, i.src1.reg().cvt32());
|
||||
// broadcast byte
|
||||
// don't use broadcastb with AVX2; it's slower than shuf
|
||||
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
|
||||
e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -195,15 +220,23 @@ struct LOAD_VECTOR_SHR_I8
|
|||
if (i.src1.is_constant) {
|
||||
auto sh = i.src1.constant();
|
||||
assert_true(sh < xe::countof(lvsr_table));
|
||||
e.mov(e.rax, (uintptr_t)&lvsr_table[sh]);
|
||||
e.vmovaps(i.dest, e.ptr[e.rax]);
|
||||
if (sh == 0) {
|
||||
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase));
|
||||
} else {
|
||||
e.LoadConstantXmm(i.dest, lvsr_table[sh]);
|
||||
}
|
||||
} else {
|
||||
// TODO(benvanik): find a cheaper way of doing this.
|
||||
e.movzx(e.rdx, i.src1);
|
||||
e.and_(e.dx, 0xF);
|
||||
e.shl(e.dx, 4);
|
||||
e.mov(e.rax, (uintptr_t)lvsr_table);
|
||||
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);
|
||||
|
||||
// chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed
|
||||
// lookup as well, compute from LVSR base instead
|
||||
e.vmovd(e.xmm0, i.src1.reg().cvt32());
|
||||
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase));
|
||||
// broadcast byte
|
||||
// don't use broadcastb with AVX2; it's slower than shuf
|
||||
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
|
||||
|
||||
e.vpsubb(i.dest, e.xmm1, e.xmm0);
|
||||
}
|
||||
}
|
||||
};
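A scalar restatement of why the old per-shift lookup tables reduce to one base vector plus an add or subtract of the broadcast shift amount (an illustration of the two sequences above, not emitter code; BuildShiftVectors is a made-up helper):

#include <cstdint>

// lvsl(sh)[i] == sh + i        -> XMMLVSLTableBase (bytes 0..15)  plus sh  (vpaddb)
// lvsr(sh)[i] == (16 - sh) + i -> XMMLVSRTableBase (bytes 16..31) minus sh (vpsubb)
static void BuildShiftVectors(uint8_t sh, uint8_t shl_out[16], uint8_t shr_out[16]) {
  for (int i = 0; i < 16; ++i) {
    shl_out[i] = static_cast<uint8_t>(i + sh);
    shr_out[i] = static_cast<uint8_t>(16 + i - sh);
  }
}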
|
||||
|
@ -728,7 +761,7 @@ struct VECTOR_SHL_V128
|
|||
}
|
||||
}
|
||||
|
||||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
|
||||
|
@ -1793,6 +1826,14 @@ struct PERMUTE_I32
|
|||
}
|
||||
}
|
||||
};
|
||||
//todo: use this on const src1
|
||||
static vec128_t FixupConstantShuf8(vec128_t input) {
|
||||
for (uint32_t i = 0; i < 16; ++i) {
|
||||
input.u8[i] ^= 0x03;
|
||||
input.u8[i] &= 0x1F;
|
||||
}
|
||||
return input;
|
||||
}
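A note on the two constants in FixupConstantShuf8 (this is an inference from the matching "^ 3" in Value::Insert for INT8 and "^ 1" for INT16, not documented behavior): vec128 constants appear to store each 32-bit lane with its bytes reversed, so a guest byte index maps to host index i ^ 3, and the low five bits of a vperm control byte select one of the 32 bytes of the two concatenated sources. The helpers below are hypothetical names for illustration.

#include <cstdint>

static uint8_t GuestToHostByteIndex(uint8_t guest_index) {
  return guest_index ^ 0x03;  // swap byte order within each 32-bit lane
}

static uint8_t PermuteSelector(uint8_t control_byte) {
  return control_byte & 0x1F;  // 5-bit index over src1:src2 (32 bytes)
}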
|
||||
struct PERMUTE_V128
|
||||
: Sequence<PERMUTE_V128,
|
||||
I<OPCODE_PERMUTE, V128Op, V128Op, V128Op, V128Op>> {
|
||||
|
@ -1855,7 +1896,8 @@ struct PERMUTE_V128
|
|||
} else {
|
||||
e.vpshufb(src3_shuf, i.src3, e.xmm2);
|
||||
}
|
||||
// Build a mask with values in src2 having 0 and values in src3 having 1.
|
||||
// Build a mask with values in src2 having 0 and values in src3
|
||||
// having 1.
|
||||
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
|
||||
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
|
||||
}
|
||||
|
|
|
@ -35,11 +35,14 @@
|
|||
#include "xenia/cpu/backend/x64/x64_emitter.h"
|
||||
#include "xenia/cpu/backend/x64/x64_op.h"
|
||||
#include "xenia/cpu/backend/x64/x64_tracers.h"
|
||||
// needed for stmxcsr
|
||||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
#include "xenia/cpu/hir/hir_builder.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
|
||||
DEFINE_bool(use_fast_dot_product, false,
|
||||
"Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr"
|
||||
"Experimental optimization, much shorter sequence on dot products, "
|
||||
"treating inf as overflow instead of using mcxsr"
|
||||
"four insn dotprod",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
|
@ -1996,8 +1999,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
|
|||
assert_true(!i.instr->flags);
|
||||
EmitAssociativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
// e.vrcpps(e.xmm0, src2);
|
||||
//e.vmulps(dest, src1, e.xmm0);
|
||||
// e.vrcpps(e.xmm0, src2);
|
||||
// e.vmulps(dest, src1, e.xmm0);
|
||||
e.vdivps(dest, src1, src2);
|
||||
});
|
||||
}
|
||||
|
@ -2607,68 +2610,84 @@ struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);
|
||||
|
||||
struct DOT_PRODUCT_V128 {
|
||||
static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
|
||||
if (cvars::use_fast_dot_product) {
|
||||
e.vdpps(dest, src1, src2, imm);
|
||||
e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
|
||||
e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
|
||||
e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
|
||||
|
||||
} else {
|
||||
// TODO(benvanik): apparently this is very slow
|
||||
// - find alternative?
|
||||
Xbyak::Label end;
|
||||
e.inLocalLabel();
|
||||
|
||||
// Grab space to put MXCSR.
|
||||
// TODO(gibbed): stick this in TLS or
|
||||
// something?
|
||||
e.sub(e.rsp, 8);
|
||||
|
||||
// Grab MXCSR and mask off the overflow flag,
|
||||
// because it's sticky.
|
||||
e.vstmxcsr(e.dword[e.rsp]);
|
||||
e.mov(e.eax, e.dword[e.rsp]);
|
||||
e.and_(e.eax, uint32_t(~8));
|
||||
e.mov(e.dword[e.rsp], e.eax);
|
||||
e.vldmxcsr(e.dword[e.rsp]);
|
||||
|
||||
// Hey we can do the dot product now.
|
||||
e.vdpps(dest, src1, src2, imm);
|
||||
|
||||
// Load MXCSR...
|
||||
e.vstmxcsr(e.dword[e.rsp]);
|
||||
|
||||
// ..free our temporary space and get MXCSR at
|
||||
// the same time
|
||||
e.pop(e.rax);
|
||||
|
||||
// Did we overflow?
|
||||
e.test(e.al, 8);
|
||||
e.jz(end);
|
||||
|
||||
// Infinity? HA! Give NAN.
|
||||
e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));
|
||||
|
||||
e.L(end);
|
||||
e.outLocalLabel();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_DOT_PRODUCT_3
|
||||
// ============================================================================
|
||||
struct DOT_PRODUCT_3_V128
|
||||
: Sequence<DOT_PRODUCT_3_V128,
|
||||
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
|
||||
I<OPCODE_DOT_PRODUCT_3, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||
EmitCommutativeBinaryXmmOp(
|
||||
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001);
|
||||
});
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
|
||||
// this is going to hurt a bit...
|
||||
/*
|
||||
this implementation is accurate, it matches the results of xb360 vmsum3
|
||||
except that vmsum3 is often off by 1 bit, but its extremely slow. it is a
|
||||
long, unbroken chain of dependencies, and the three uses of mxcsr all cost
|
||||
about 15-20 cycles at the very least on amd zen processors. on older amd the
|
||||
figures agner has are pretty horrible. it looks like its just as bad on
|
||||
modern intel cpus also up until just recently. perhaps a better way of
|
||||
detecting overflow would be to just compare with inf. todo: test whether cmp
|
||||
with inf can replace
|
||||
*/
|
||||
e.vstmxcsr(mxcsr_storage);
|
||||
|
||||
e.mov(e.eax, 8);
|
||||
|
||||
auto src1v = e.xmm0;
|
||||
auto src2v = e.xmm1;
|
||||
if (i.src1.is_constant) {
|
||||
src1v = e.xmm0;
|
||||
e.LoadConstantXmm(src1v, i.src1.constant());
|
||||
} else {
|
||||
src1v = i.src1.reg();
|
||||
}
|
||||
if (i.src2.is_constant) {
|
||||
src2v = e.xmm1;
|
||||
e.LoadConstantXmm(src2v, i.src2.constant());
|
||||
} else {
|
||||
src2v = i.src2.reg();
|
||||
}
|
||||
e.not_(e.eax);
|
||||
// todo: maybe the top element should be cleared by the InstrEmit_ function
|
||||
// so that in the future this could be optimized away if the top is known to
|
||||
// be zero. Right now im not sure that happens often though and its
|
||||
// currently not worth it also, maybe pre-and if constant
|
||||
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
|
||||
e.and_(mxcsr_storage, e.eax);
|
||||
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
|
||||
// go
|
||||
|
||||
e.vcvtps2pd(e.ymm0, e.xmm3);
|
||||
e.vcvtps2pd(e.ymm1, e.xmm2);
|
||||
/*
|
||||
ymm0 = src1 as doubles, ele 3 cleared
|
||||
ymm1 = src2 as doubles, ele 3 cleared
|
||||
*/
|
||||
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
|
||||
e.vextractf128(e.xmm2, e.ymm3, 1);
|
||||
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3
|
||||
e.vaddsd(e.xmm3, e.xmm3, e.xmm2);
|
||||
e.not_(e.eax);
|
||||
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
|
||||
e.vcvtsd2ss(e.xmm1, e.xmm2);
|
||||
|
||||
// this is awful
|
||||
e.vstmxcsr(mxcsr_storage);
|
||||
e.test(mxcsr_storage, e.eax);
|
||||
Xbyak::Label ret_qnan;
|
||||
Xbyak::Label done;
|
||||
e.jnz(ret_qnan);
|
||||
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
|
||||
e.vbroadcastss(i.dest, e.xmm1);
|
||||
e.jmp(done);
|
||||
e.L(ret_qnan);
|
||||
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
|
||||
e.L(done);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);
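The comment above spells out the accuracy/latency trade-off of this sequence. As a plain reading of what it computes (NaN and infinity inputs aside), here is a scalar sketch; DotProduct3Ref is a hypothetical name, and this is an illustration rather than a drop-in replacement for the emitter code:

#include <cmath>
#include <limits>

static float DotProduct3Ref(const float a[4], const float b[4]) {
  // widen to double and multiply; the fourth lane is masked to zero beforehand
  double p0 = static_cast<double>(a[0]) * static_cast<double>(b[0]);
  double p1 = static_cast<double>(a[1]) * static_cast<double>(b[1]);
  double p2 = static_cast<double>(a[2]) * static_cast<double>(b[2]);
  double sum = (p0 + p2) + p1;        // same order as the vaddsd chain
  float r = static_cast<float>(sum);  // one rounding, like vcvtsd2ss
  if (std::isinf(r) && !std::isinf(sum)) {
    // the emitter detects this case via the MXCSR overflow flag
    r = std::numeric_limits<float>::quiet_NaN();
  }
  return r;  // the emitter then broadcasts r to all four lanes of dest
}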
|
||||
|
@ -2678,13 +2697,81 @@ EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);
|
|||
// ============================================================================
|
||||
struct DOT_PRODUCT_4_V128
|
||||
: Sequence<DOT_PRODUCT_4_V128,
|
||||
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
|
||||
I<OPCODE_DOT_PRODUCT_4, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||
EmitCommutativeBinaryXmmOp(
|
||||
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001);
|
||||
});
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
|
||||
e.vstmxcsr(mxcsr_storage);
|
||||
|
||||
e.mov(e.eax, 8);
|
||||
|
||||
auto src1v = e.xmm3;
|
||||
auto src2v = e.xmm2;
|
||||
if (i.src1.is_constant) {
|
||||
src1v = e.xmm3;
|
||||
e.LoadConstantXmm(src1v, i.src1.constant());
|
||||
} else {
|
||||
src1v = i.src1.reg();
|
||||
}
|
||||
if (i.src2.is_constant) {
|
||||
src2v = e.xmm2;
|
||||
e.LoadConstantXmm(src2v, i.src2.constant());
|
||||
} else {
|
||||
src2v = i.src2.reg();
|
||||
}
|
||||
e.not_(e.eax);
|
||||
|
||||
e.and_(mxcsr_storage, e.eax);
|
||||
e.vldmxcsr(mxcsr_storage);
|
||||
|
||||
e.vcvtps2pd(e.ymm0, src1v);
|
||||
e.vcvtps2pd(e.ymm1, src2v);
|
||||
/*
|
||||
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
|
||||
e.and_(mxcsr_storage, e.eax);
|
||||
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
|
||||
// go
|
||||
|
||||
e.vcvtps2pd(e.ymm0, e.xmm3);
|
||||
e.vcvtps2pd(e.ymm1, e.xmm2);
|
||||
|
||||
|
||||
e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
|
||||
e.vextractf128(e.xmm4, e.ymm5, 1);
|
||||
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
|
||||
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
|
||||
e.not_(e.eax);
|
||||
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
|
||||
e.vcvtsd2ss(e.xmm1, e.xmm2);
|
||||
|
||||
*/
|
||||
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
|
||||
e.vextractf128(e.xmm2, e.ymm3, 1);
|
||||
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
|
||||
|
||||
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3);
|
||||
e.not_(e.eax);
|
||||
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
|
||||
e.vcvtsd2ss(e.xmm1, e.xmm2);
|
||||
|
||||
e.vstmxcsr(mxcsr_storage);
|
||||
|
||||
e.test(mxcsr_storage, e.eax);
|
||||
|
||||
Xbyak::Label ret_qnan;
|
||||
Xbyak::Label done;
|
||||
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
|
||||
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
|
||||
e.vbroadcastss(i.dest, e.xmm1);
|
||||
e.jmp(done);
|
||||
e.L(ret_qnan);
|
||||
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
|
||||
e.L(done);
|
||||
// e.DebugBreak();
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128);
|
||||
|
@ -2759,7 +2846,6 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
|
|||
};
|
||||
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vpand(dest, src1, src2);
|
||||
|
@ -3419,7 +3505,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
XELOGE("No sequence match for variant {}", i->opcode->name);
|
||||
XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -122,7 +122,8 @@ class StackLayout {
|
|||
*
|
||||
*/
|
||||
static const size_t GUEST_STACK_SIZE = 104;
|
||||
static const size_t GUEST_CTX_HOME = 80;
|
||||
// was GUEST_CTX_HOME; can't be removed because that would throw stack alignment off. Instead it can be used as a temporary by sequences.
|
||||
static const size_t GUEST_SCRATCH64 = 80;
|
||||
static const size_t GUEST_RET_ADDR = 88;
|
||||
static const size_t GUEST_CALL_RET_ADDR = 96;
|
||||
};
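A small illustration of why the old GUEST_CTX_HOME slot could not simply be deleted, assuming the usual x64 convention that rsp is 16-byte aligned before a call (so it sits at 16n+8 on entry, after the pushed return address): subtracting GUEST_STACK_SIZE must bring the frame back to a 16-byte boundary.

#include <cstddef>

static constexpr size_t kReturnAddressSize = 8;
static constexpr size_t kGuestStackSize = 104;  // StackLayout::GUEST_STACK_SIZE
// 104 + 8 is a multiple of 16, so the body stays aligned; shrinking the frame
// to 96 would leave rsp at 16n+8 and break aligned spills and calls.
static_assert((kGuestStackSize + kReturnAddressSize) % 16 == 0,
              "frame size must restore 16-byte alignment");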
|
||||
|
|
|
@ -312,13 +312,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
} else if (i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant()) {
|
||||
// TODO: Select
|
||||
// v->set_from(i->src2.value);
|
||||
// v->Select(i->src3.value, i->src1.value);
|
||||
// i->Remove();
|
||||
v->set_from(i->src2.value);
|
||||
v->Select(i->src3.value, i->src1.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
} else {
|
||||
// TODO: vec128 select
|
||||
if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src2.value);
|
||||
v->Select(i->src3.value, i->src1.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -744,8 +749,35 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
// TODO(benvanik): INSERT/EXTRACT
|
||||
// TODO(benvanik): PERMUTE/SWIZZLE
|
||||
|
||||
case OPCODE_PERMUTE: {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant() &&
|
||||
(i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OPCODE_INSERT:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
case OPCODE_SWIZZLE:
|
||||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
case OPCODE_EXTRACT:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_zero(v->type);
|
||||
|
@ -867,24 +899,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
|
||||
case OPCODE_DOT_PRODUCT_3:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DotProduct3(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case OPCODE_DOT_PRODUCT_4:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DotProduct4(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case OPCODE_VECTOR_AVERAGE:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
|
@ -896,7 +910,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case OPCODE_VECTOR_DENORMFLUSH:
|
||||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DenormalFlush();
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Ignored.
|
||||
break;
|
||||
|
|
|
@ -132,10 +132,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
|
|||
while (outgoing_ordinal != -1) {
|
||||
Value* src_value = value_map[outgoing_ordinal];
|
||||
assert_not_null(src_value);
|
||||
if (!src_value->local_slot) {
|
||||
src_value->local_slot = builder->AllocLocal(src_value->type);
|
||||
if (!src_value->HasLocalSlot()) {
|
||||
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
|
||||
}
|
||||
builder->StoreLocal(src_value->local_slot, src_value);
|
||||
builder->StoreLocal(src_value->GetLocalSlot(), src_value);
|
||||
|
||||
// If we are in the block the value was defined in:
|
||||
if (src_value->def->block == block) {
|
||||
|
@ -168,10 +168,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
|
|||
while (incoming_ordinal != -1) {
|
||||
Value* src_value = value_map[incoming_ordinal];
|
||||
assert_not_null(src_value);
|
||||
if (!src_value->local_slot) {
|
||||
src_value->local_slot = builder->AllocLocal(src_value->type);
|
||||
if (!src_value->HasLocalSlot()) {
|
||||
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
|
||||
}
|
||||
Value* local_value = builder->LoadLocal(src_value->local_slot);
|
||||
Value* local_value = builder->LoadLocal(src_value->GetLocalSlot());
|
||||
builder->last_instr()->MoveBefore(block->instr_head);
|
||||
|
||||
// Swap uses of original value with the local value.
|
||||
|
|
|
@ -365,7 +365,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
|
|||
auto new_head_use = next_use;
|
||||
|
||||
// Allocate local.
|
||||
if (spill_value->local_slot) {
|
||||
if (spill_value->HasLocalSlot()) {
|
||||
// Value is already assigned a slot. Since we allocate in order and this is
|
||||
// all SSA we know the stored value will be exactly what we want. Yay,
|
||||
// we can prevent the redundant store!
|
||||
|
@ -373,10 +373,10 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
|
|||
// use the spilled value and prevent the need for more locals.
|
||||
} else {
|
||||
// Allocate a local slot.
|
||||
spill_value->local_slot = builder->AllocLocal(spill_value->type);
|
||||
spill_value->SetLocalSlot(builder->AllocLocal(spill_value->type));
|
||||
|
||||
// Add store.
|
||||
builder->StoreLocal(spill_value->local_slot, spill_value);
|
||||
builder->StoreLocal(spill_value->GetLocalSlot(), spill_value);
|
||||
auto spill_store = builder->last_instr();
|
||||
auto spill_store_use = spill_store->src2_use;
|
||||
assert_null(spill_store_use->prev);
|
||||
|
@ -417,7 +417,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
|
|||
// use is after the instruction requesting the spill we know we haven't
|
||||
// done allocation for that code yet and can let that be handled
|
||||
// automatically when we get to it.
|
||||
auto new_value = builder->LoadLocal(spill_value->local_slot);
|
||||
auto new_value = builder->LoadLocal(spill_value->GetLocalSlot());
|
||||
auto spill_load = builder->last_instr();
|
||||
spill_load->MoveBefore(next_use->instr);
|
||||
// Note: implicit first use added.
|
||||
|
@ -429,7 +429,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
|
|||
|
||||
// Set the local slot of the new value to our existing one. This way we will
|
||||
// reuse that same memory if needed.
|
||||
new_value->local_slot = spill_value->local_slot;
|
||||
new_value->SetLocalSlot(spill_value->GetLocalSlot());
|
||||
|
||||
// Rename all future uses of the SSA value to the new value as loaded
|
||||
// from the local.
|
||||
|
|
|
@ -260,9 +260,9 @@ void HIRBuilder::Dump(StringBuffer* str) {
|
|||
str->Append(" = ");
|
||||
}
|
||||
if (i->flags) {
|
||||
str->AppendFormat("{}.{}", info->name, i->flags);
|
||||
str->AppendFormat("{}.{}", GetOpcodeName(info), i->flags);
|
||||
} else {
|
||||
str->Append(info->name);
|
||||
str->Append(GetOpcodeName(info));
|
||||
}
|
||||
if (src1_type) {
|
||||
str->Append(' ');
|
||||
|
@ -712,7 +712,6 @@ Value* HIRBuilder::AllocValue(TypeName type) {
|
|||
value->use_head = NULL;
|
||||
value->last_use = NULL;
|
||||
value->local_slot = NULL;
|
||||
value->tag = NULL;
|
||||
value->reg.set = NULL;
|
||||
value->reg.index = -1;
|
||||
return value;
|
||||
|
@ -723,12 +722,11 @@ Value* HIRBuilder::CloneValue(Value* source) {
|
|||
value->ordinal = next_value_ordinal_++;
|
||||
value->type = source->type;
|
||||
value->flags = source->flags;
|
||||
value->local_slot = NULL;
|
||||
value->constant.v128 = source->constant.v128;
|
||||
value->def = NULL;
|
||||
value->use_head = NULL;
|
||||
value->last_use = NULL;
|
||||
value->local_slot = NULL;
|
||||
value->tag = NULL;
|
||||
value->reg.set = NULL;
|
||||
value->reg.index = -1;
|
||||
return value;
|
||||
|
@ -1493,7 +1491,16 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
|
|||
return VectorCompareXX(OPCODE_VECTOR_COMPARE_UGE_info, value1, value2,
|
||||
part_type);
|
||||
}
|
||||
|
||||
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
|
||||
return value1;  // note: this early return currently bypasses the denormal flush below
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
|
||||
i->set_src1(value1);
|
||||
i->src2.value = nullptr;
|
||||
i->src3.value = nullptr;
|
||||
return i->dest;
|
||||
}
|
||||
Value* HIRBuilder::Add(Value* value1, Value* value2,
|
||||
uint32_t arithmetic_flags) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
@ -1713,13 +1720,13 @@ Value* HIRBuilder::Log2(Value* value) {
|
|||
return i->dest;
|
||||
}
|
||||
|
||||
|
||||
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
ASSERT_VECTOR_TYPE(value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(FLOAT32_TYPE));
|
||||
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(VEC128_TYPE));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->src3.value = NULL;
|
||||
|
@ -1731,8 +1738,7 @@ Value* HIRBuilder::DotProduct4(Value* value1, Value* value2) {
|
|||
ASSERT_VECTOR_TYPE(value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(FLOAT32_TYPE));
|
||||
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(VEC128_TYPE));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->src3.value = NULL;
|
||||
|
|
|
@ -199,6 +199,7 @@ class HIRBuilder {
|
|||
Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorDenormFlush(Value* value1);
|
||||
|
||||
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
|
||||
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
|
||||
|
|
|
@ -15,14 +15,23 @@ namespace hir {
|
|||
|
||||
#define DEFINE_OPCODE(num, name, sig, flags) \
|
||||
const OpcodeInfo num##_info = { \
|
||||
num, \
|
||||
flags, \
|
||||
sig, \
|
||||
name, \
|
||||
num, \
|
||||
};
|
||||
#include "xenia/cpu/hir/opcodes.inl"
|
||||
#undef DEFINE_OPCODE
|
||||
|
||||
const char* GetOpcodeName(Opcode num) {
|
||||
switch (num) {
|
||||
#define DEFINE_OPCODE(num, name, sig, flags) \
|
||||
case num: \
|
||||
return name;
|
||||
#include "xenia/cpu/hir/opcodes.inl"
|
||||
#undef DEFINE_OPCODE
|
||||
}
|
||||
return "invalid opcode";
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
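A short usage sketch of the new lookup, matching the call sites updated elsewhere in this commit (DumpOpcode is a hypothetical helper; only GetOpcodeName and OpcodeInfo come from the header): debug paths fetch the name on demand instead of reading a name pointer out of the hot OpcodeInfo struct.

#include <cstdio>
#include "xenia/cpu/hir/opcodes.h"

static void DumpOpcode(const xe::cpu::hir::OpcodeInfo* info) {
  // resolves through the switch-based GetOpcodeName(Opcode) overload
  std::printf("%s\n", xe::cpu::hir::GetOpcodeName(info));
}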
|
||||
|
|
|
@ -280,7 +280,8 @@ enum Opcode {
|
|||
OPCODE_ATOMIC_EXCHANGE,
|
||||
OPCODE_ATOMIC_COMPARE_EXCHANGE,
|
||||
OPCODE_SET_ROUNDING_MODE,
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
enum OpcodeFlags {
|
||||
|
@ -352,17 +353,42 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
|
|||
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
|
||||
}
|
||||
|
||||
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
|
||||
OpcodeSignatureType& src1,
|
||||
OpcodeSignatureType& src2,
|
||||
OpcodeSignatureType& src3) {
|
||||
dest = GET_OPCODE_SIG_TYPE_DEST(sig);
|
||||
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
|
||||
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
|
||||
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
|
||||
}
|
||||
|
||||
constexpr uint32_t GetNumOperandsForSig(uint32_t sig) {
|
||||
sig >>= 3;
|
||||
|
||||
uint32_t result = 0;
|
||||
while (sig) {
|
||||
if (sig & 0x7) {
|
||||
++result;
|
||||
}
|
||||
sig >>= 3;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
typedef struct {
|
||||
Opcode num;
|
||||
uint32_t flags;
|
||||
uint32_t signature;
|
||||
const char* name;
|
||||
Opcode num;
|
||||
} OpcodeInfo;
|
||||
|
||||
#define DEFINE_OPCODE(num, name, sig, flags) extern const OpcodeInfo num##_info;
|
||||
#include "xenia/cpu/hir/opcodes.inl"
|
||||
#undef DEFINE_OPCODE
|
||||
|
||||
const char* GetOpcodeName(Opcode num);
|
||||
static inline const char* GetOpcodeName(const OpcodeInfo* info) {
|
||||
return GetOpcodeName(info->num);
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -673,3 +673,10 @@ DEFINE_OPCODE(
|
|||
"set_rounding_mode",
|
||||
OPCODE_SIG_X_V,
|
||||
0)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_VECTOR_DENORMFLUSH,
|
||||
"vector_denormflush",
|
||||
OPCODE_SIG_V_V,
|
||||
0
|
||||
)
|
|
@ -864,10 +864,112 @@ void Value::Extract(Value* vec, Value* index) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
void Value::Permute(Value* src1, Value* src2, TypeName type) {
|
||||
if (type == INT8_TYPE) {
|
||||
uint8_t table[32];
|
||||
|
||||
for (uint32_t i = 0; i < 16; ++i) {
|
||||
table[i] = src1->constant.v128.u8[i];
|
||||
table[i + 16] = src2->constant.v128.u8[i];
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < 16; ++i) {
|
||||
constant.v128.u8[i] = table[(constant.v128.u8[i] ^ 3) & 0x1f];
|
||||
}
|
||||
} else if (type == INT16_TYPE) {
|
||||
vec128_t perm = (constant.v128 & vec128s(0xF)) ^ vec128s(0x1);
|
||||
vec128_t perm_ctrl = vec128b(0);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
perm_ctrl.i16[i] = perm.i16[i] > 7 ? -1 : 0;
|
||||
|
||||
auto v = uint8_t(perm.u16[i]);
|
||||
perm.u8[i * 2] = v * 2;
|
||||
perm.u8[i * 2 + 1] = v * 2 + 1;
|
||||
}
|
||||
auto lod = [](const vec128_t& v) {
|
||||
return _mm_loadu_si128((const __m128i*)&v);
|
||||
};
|
||||
auto sto = [](vec128_t& v, __m128i x) {
|
||||
return _mm_storeu_si128((__m128i*)&v, x);
|
||||
};
|
||||
|
||||
__m128i xmm1 = lod(src1->constant.v128);
|
||||
__m128i xmm2 = lod(src2->constant.v128);
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, lod(perm));
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, lod(perm));
|
||||
uint8_t mask = 0;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (perm_ctrl.i16[i] == 0) {
|
||||
mask |= 1 << (7 - i);
|
||||
}
|
||||
}
|
||||
|
||||
vec128_t unp_mask = vec128b(0);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (mask & (1 << i)) {
|
||||
unp_mask.u16[i] = 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
||||
sto(constant.v128, _mm_blendv_epi8(xmm1, xmm2, lod(unp_mask)));
|
||||
|
||||
} else {
|
||||
assert_unhandled_case(type);
|
||||
}
|
||||
}
|
||||
void Value::Insert(Value* index, Value* part, TypeName type) {
|
||||
vec128_t* me = &constant.v128;
|
||||
|
||||
switch (type) {
|
||||
case INT8_TYPE:
|
||||
me->u8[index->constant.u8 ^ 3] = part->constant.u8;
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
me->u16[index->constant.u8 ^ 1] = part->constant.u16;
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
me->u32[index->constant.u8] = part->constant.u32;
|
||||
break;
|
||||
}
|
||||
}
|
||||
void Value::Swizzle(uint32_t mask, TypeName type) {
|
||||
if (type == INT32_TYPE || type == FLOAT32_TYPE) {
|
||||
vec128_t result = vec128b(0);
|
||||
for (uint32_t i = 0; i < 4; ++i) {
|
||||
result.u32[i] = constant.v128.u32[(mask >> (i * 2)) & 0b11];
|
||||
}
|
||||
constant.v128 = result;
|
||||
} else {
|
||||
assert_unhandled_case(type);
|
||||
}
|
||||
}
|
||||
void Value::Select(Value* other, Value* ctrl) {
|
||||
// TODO
|
||||
assert_always();
|
||||
if (ctrl->type == VEC128_TYPE) {
|
||||
constant.v128.low = (constant.v128.low & ~ctrl->constant.v128.low) |
|
||||
(other->constant.v128.low & ctrl->constant.v128.low);
|
||||
constant.v128.high = (constant.v128.high & ~ctrl->constant.v128.high) |
|
||||
(other->constant.v128.high & ctrl->constant.v128.high);
|
||||
|
||||
} else {
|
||||
if (ctrl->constant.u8) {
|
||||
switch (other->type) {
|
||||
case INT8_TYPE:
|
||||
constant.u8 = other->constant.u8;
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
constant.u16 = other->constant.u16;
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
case FLOAT32_TYPE:
|
||||
constant.u32 = other->constant.u32;
|
||||
break;
|
||||
case INT64_TYPE:
|
||||
case FLOAT64_TYPE:
|
||||
constant.u64 = other->constant.u64;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Value::Splat(Value* other) {
|
||||
|
@ -1532,7 +1634,15 @@ void Value::ByteSwap() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Value::DenormalFlush() {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
uint32_t current_element = constant.v128.u32[i];
|
||||
if ((current_element & 0x7f800000) == 0) {
|
||||
current_element = current_element & 0x80000000;
|
||||
}
|
||||
constant.v128.u32[i] = current_element;
|
||||
}
|
||||
}
|
||||
void Value::CountLeadingZeros(const Value* other) {
|
||||
switch (other->type) {
|
||||
case INT8_TYPE:
|
||||
|
|
|
@ -104,6 +104,9 @@ struct ValueMask {
|
|||
|
||||
class Value {
|
||||
public:
|
||||
/*
|
||||
todo : this should be intrusive and be part of Instr instead.
|
||||
*/
|
||||
typedef struct Use_s {
|
||||
Instr* instr;
|
||||
Use_s* prev;
|
||||
|
@ -128,17 +131,16 @@ class Value {
|
|||
TypeName type;
|
||||
|
||||
uint32_t flags;
|
||||
RegAssignment reg;
|
||||
ConstantValue constant;
|
||||
|
||||
Instr* def;
|
||||
Use* use_head;
|
||||
// NOTE: for performance reasons this is not maintained during construction.
|
||||
Instr* last_use;
|
||||
Value* local_slot;
|
||||
|
||||
// TODO(benvanik): remove to shrink size.
|
||||
void* tag;
|
||||
RegAssignment reg;
|
||||
union {
|
||||
Value* local_slot;
|
||||
ConstantValue constant;
|
||||
};
|
||||
|
||||
Use* AddUse(Arena* arena, Instr* instr);
|
||||
void RemoveUse(Use* use);
|
||||
|
@ -209,7 +211,20 @@ class Value {
|
|||
flags = other->flags;
|
||||
constant.v128 = other->constant.v128;
|
||||
}
|
||||
bool HasLocalSlot() const {
|
||||
return !(flags & VALUE_IS_CONSTANT) && local_slot;
|
||||
}
|
||||
void SetLocalSlot(Value* lslot) {
|
||||
assert(!(flags & VALUE_IS_CONSTANT));
|
||||
local_slot = lslot;
|
||||
}
|
||||
|
||||
Value* GetLocalSlot() {
|
||||
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
|
||||
}
|
||||
const Value* GetLocalSlot() const {
|
||||
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
|
||||
}
|
||||
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
|
||||
bool IsConstantTrue() const {
|
||||
if (type == VEC128_TYPE) {
|
||||
|
@ -555,7 +570,10 @@ class Value {
|
|||
void Shr(Value* other);
|
||||
void Sha(Value* other);
|
||||
void RotateLeft(Value* other);
|
||||
void Insert(Value* index, Value* part, TypeName type);
|
||||
void Extract(Value* vec, Value* index);
|
||||
void Permute(Value* src1, Value* src2, TypeName type);
|
||||
void Swizzle(uint32_t mask, TypeName type);
|
||||
void Select(Value* other, Value* ctrl);
|
||||
void Splat(Value* other);
|
||||
void VectorCompareEQ(Value* other, TypeName type);
|
||||
|
@ -575,6 +593,8 @@ class Value {
|
|||
void VectorAverage(Value* other, TypeName type, bool is_unsigned,
|
||||
bool saturate);
|
||||
void ByteSwap();
|
||||
void DenormalFlush();
|
||||
|
||||
void CountLeadingZeros(const Value* other);
|
||||
bool Compare(Opcode opcode, Value* other);
|
||||
hir::Instr* GetDefSkipAssigns();
|
||||
|
|
|
@ -279,14 +279,21 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
|
|||
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
|
||||
// ea &= ~0xF
|
||||
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
|
||||
Value* shrs = f.LoadVectorShr(eb);
|
||||
Value* zerovec = f.LoadZeroVec128();
|
||||
|
||||
// v = (old & ~mask) | ((new >> eb) & mask)
|
||||
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
|
||||
f.LoadVR(vd), INT8_TYPE);
|
||||
Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
|
||||
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
|
||||
/*
|
||||
these permutes need to be looked at closer. keep in mind Permute is meant to
|
||||
emulate vmx's shuffles and does not generate particularly good code. The logic
|
||||
here looks as if it might make more sense as a comparison (
|
||||
*/
|
||||
// mask = FFFF... >> eb
|
||||
Value* mask = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
|
||||
f.Not(f.LoadZeroVec128()), INT8_TYPE);
|
||||
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
|
||||
Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
|
||||
|
||||
Value* v = f.Select(mask, old_value, new_value);
|
||||
// ea &= ~0xF (handled above)
|
||||
f.Store(ea, f.ByteSwap(v));
|
||||
return 0;
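For reference, the identity that lets OPCODE_SELECT replace the old or/andnot/and merge (see the "(old & ~mask) | ((new >> eb) & mask)" comment above and Value::Select's vec128 path earlier in this diff); a minimal sketch with made-up values:

#include <cassert>
#include <cstdint>

static uint64_t SelectBits(uint64_t mask, uint64_t old_v, uint64_t new_v) {
  // bytes where mask is all-ones come from new_v, the rest from old_v
  return (old_v & ~mask) | (new_v & mask);
}

static void SelectExample() {
  uint64_t mask = 0x00000000FFFFFFFFull;  // low four bytes take the new value
  assert(SelectBits(mask, 0x1111111111111111ull, 0x2222222222222222ull) ==
         0x1111111122222222ull);
}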
|
||||
|
@ -321,14 +328,14 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
|
|||
ea = CalculateEA_0(f, ra, rb);
|
||||
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
|
||||
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
|
||||
Value* shrs = f.LoadVectorShr(eb);
|
||||
Value* zerovec = f.LoadZeroVec128();
|
||||
// v = (old & ~mask) | ((new << eb) & mask)
|
||||
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadVR(vd),
|
||||
f.LoadZeroVec128(), INT8_TYPE);
|
||||
Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
|
||||
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
|
||||
// mask = ~FFFF... >> eb
|
||||
Value* mask = f.Permute(f.LoadVectorShr(eb), f.Not(f.LoadZeroVec128()),
|
||||
f.LoadZeroVec128(), INT8_TYPE);
|
||||
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
|
||||
Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
|
||||
Value* v = f.Select(mask, old_value, new_value);
|
||||
// ea &= ~0xF (handled above)
|
||||
f.Store(ea, f.ByteSwap(v));
|
||||
f.MarkLabel(skip_label);
|
||||
|
@ -815,8 +822,16 @@ int InstrEmit_vlogefp128(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_vmaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
|
||||
uint32_t vc) {
|
||||
/*
|
||||
chrispy: testing on POWER8 revealed that altivec vmaddfp unconditionally
|
||||
flushes denormal inputs to 0, regardless of NJM setting
|
||||
*/
|
||||
Value* a = f.VectorDenormFlush(f.LoadVR(va));
|
||||
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
|
||||
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
|
||||
// (VD) <- ((VA) * (VC)) + (VB)
|
||||
Value* v = f.MulAdd(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb));
|
||||
Value* v = f.MulAdd(a, c, b);
|
||||
// todo: do denormal results also unconditionally become 0?
|
||||
f.StoreVR(vd, v);
|
||||
return 0;
|
||||
}
|
||||
|
@ -832,9 +847,14 @@ int InstrEmit_vmaddfp128(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_vmaddcfp128(PPCHIRBuilder& f, const InstrData& i) {
|
||||
/*
|
||||
see vmaddfp about these denormflushes
|
||||
*/
|
||||
Value* a = f.VectorDenormFlush(f.LoadVR(VX128_VA128));
|
||||
Value* b = f.VectorDenormFlush(f.LoadVR(VX128_VB128));
|
||||
Value* d = f.VectorDenormFlush(f.LoadVR(VX128_VD128));
|
||||
// (VD) <- ((VA) * (VD)) + (VB)
|
||||
Value* v = f.MulAdd(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VD128),
|
||||
f.LoadVR(VX128_VB128));
|
||||
Value* v = f.MulAdd(a, d, b);
|
||||
f.StoreVR(VX128_VD128, v);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1085,7 +1105,8 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// Dot product XYZ.
|
||||
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
|
||||
Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
|
||||
v = f.Splat(v, VEC128_TYPE);
|
||||
//chrispy: denormal outputs for Dot product are unconditionally made 0
|
||||
v = f.VectorDenormFlush(v);
|
||||
f.StoreVR(VX128_VD128, v);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1094,7 +1115,7 @@ int InstrEmit_vmsum4fp128(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// Dot product XYZW.
|
||||
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w)
|
||||
Value* v = f.DotProduct4(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
|
||||
v = f.Splat(v, VEC128_TYPE);
|
||||
v = f.VectorDenormFlush(v);
|
||||
f.StoreVR(VX128_VD128, v);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1151,7 +1172,19 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
|
|||
// (VD) <- -(((VA) * (VC)) - (VB))
|
||||
// NOTE: only one rounding should take place, but that's hard...
|
||||
// This really needs VFNMSUB132PS/VFNMSUB213PS/VFNMSUB231PS but that's AVX.
|
||||
Value* v = f.Neg(f.MulSub(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb)));
|
||||
// NOTE2: we could make vnmsub a new opcode, and then do it in double
|
||||
// precision, rounding after the neg
|
||||
|
||||
/*
|
||||
chrispy: this is untested, but i believe this has the same DAZ behavior for
|
||||
inputs as vmadd
|
||||
*/
|
||||
|
||||
Value* a = f.VectorDenormFlush(f.LoadVR(va));
|
||||
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
|
||||
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
|
||||
|
||||
Value* v = f.Neg(f.MulSub(a, c, b));
|
||||
f.StoreVR(vd, v);
|
||||
return 0;
|
||||
}
|
||||
|
|