diff --git a/src/xenia/base/debug_visualizers.natvis b/src/xenia/base/debug_visualizers.natvis index 17e481c08..b5077dfc6 100644 --- a/src/xenia/base/debug_visualizers.natvis +++ b/src/xenia/base/debug_visualizers.natvis @@ -2,6 +2,31 @@ + + + {(((value & 0xFF00000000000000) >> 56) | + ((value & 0x00FF000000000000) >> 40) | + ((value & 0x0000FF0000000000) >> 24) | + ((value & 0x000000FF00000000) >> 8 ) | + ((value & 0x00000000FF000000) << 8 ) | + ((value & 0x0000000000FF0000) << 24) | + ((value & 0x000000000000FF00) << 40) | + ((value & 0x00000000000000FF) << 56))} + + + + + {(((value & 0xFF00000000000000) >> 56) | + ((value & 0x00FF000000000000) >> 40) | + ((value & 0x0000FF0000000000) >> 24) | + ((value & 0x000000FF00000000) >> 8 ) | + ((value & 0x00000000FF000000) << 8 ) | + ((value & 0x0000000000FF0000) << 24) | + ((value & 0x000000000000FF00) << 40) | + ((value & 0x00000000000000FF) << 56))} + + + {(((value & 0xFF000000) >> 24) | diff --git a/src/xenia/base/vec128.h b/src/xenia/base/vec128.h index 0d5e985eb..139227cc5 100644 --- a/src/xenia/base/vec128.h +++ b/src/xenia/base/vec128.h @@ -105,12 +105,54 @@ typedef struct alignas(16) vec128_s { }; }; + vec128_s() = default; + vec128_s(const vec128_s& other) { + high = other.high; + low = other.low; + } + + vec128_s& operator=(const vec128_s& b) { + high = b.high; + low = b.low; + return *this; + } + bool operator==(const vec128_s& b) const { return low == b.low && high == b.high; } bool operator!=(const vec128_s& b) const { return low != b.low || high != b.high; } + vec128_s operator^(const vec128_s& b) const { + vec128_s a = *this; + a.high ^= b.high; + a.low ^= b.low; + return a; + }; + vec128_s& operator^=(const vec128_s& b) { + *this = *this ^ b; + return *this; + }; + vec128_s operator&(const vec128_s& b) const { + vec128_s a = *this; + a.high &= b.high; + a.low &= b.low; + return a; + }; + vec128_s& operator&=(const vec128_s& b) { + *this = *this & b; + return *this; + }; + vec128_s operator|(const vec128_s& b) const { + vec128_s a = *this; + a.high |= b.high; + a.low |= b.low; + return a; + }; + vec128_s& operator|=(const vec128_s& b) { + *this = *this | b; + return *this; + }; } vec128_t; static inline vec128_t vec128i(uint32_t src) { diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.h b/src/xenia/cpu/backend/x64/x64_code_cache.h index 5795f85d7..8fef0273e 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache.h +++ b/src/xenia/cpu/backend/x64/x64_code_cache.h @@ -70,7 +70,7 @@ class X64CodeCache : public CodeCache { // This is picked to be high enough to cover whatever we can reasonably // expect. If we hit issues with this it probably means some corner case // in analysis triggering. - static const size_t kMaximumFunctionCount = 30000; + static const size_t kMaximumFunctionCount = 50000; struct UnwindReservation { size_t data_size = 0; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 2aa290952..60afde294 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -572,6 +572,7 @@ struct Sequence { e.LoadConstantXmm(e.xmm0, i.src1.constant()); fn(e, i.dest, e.xmm0, i.src2); } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); e.LoadConstantXmm(e.xmm0, i.src2.constant()); fn(e, i.dest, i.src1, e.xmm0); } else { @@ -2715,26 +2716,46 @@ struct SELECT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. - // xmm0 = src1 != 0 ? 1111... : 0000.... 
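// Reference sketch (not part of the patch): the new natvis DisplayString entries above
// render big-endian 64-bit values by swapping bytes with masks and shifts. A host-side
// C++ equivalent of that exact expression, useful for sanity-checking it, might look
// like this (ByteSwap64 is an illustrative name, not an API from the tree):
#include <cstdint>

constexpr uint64_t ByteSwap64(uint64_t value) {
  return ((value & 0xFF00000000000000ull) >> 56) |
         ((value & 0x00FF000000000000ull) >> 40) |
         ((value & 0x0000FF0000000000ull) >> 24) |
         ((value & 0x000000FF00000000ull) >> 8) |
         ((value & 0x00000000FF000000ull) << 8) |
         ((value & 0x0000000000FF0000ull) << 24) |
         ((value & 0x000000000000FF00ull) << 40) |
         ((value & 0x00000000000000FFull) << 56);
}

static_assert(ByteSwap64(0x0123456789ABCDEFull) == 0xEFCDAB8967452301ull,
              "byte order is reversed");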
+ // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vxorps(e.xmm0, e.xmm0); - e.vcmpneqss(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpcmpeqd(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; struct SELECT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // xmm0 = src1 != 0 ? 1111... : 0000.... + // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); - e.vxorpd(e.xmm0, e.xmm0); - e.vcmpneqsd(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpxor(e.xmm0, e.xmm0); + e.vpcmpeqq(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; @@ -2742,14 +2763,24 @@ struct SELECT_V128_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. - // xmm0 = src1 != 0 ? 1111... : 0000.... + // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vpbroadcastd(e.xmm1, e.xmm1); e.vxorps(e.xmm0, e.xmm0); - e.vcmpneqps(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpcmpeqd(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; @@ -2757,26 +2788,24 @@ struct SELECT_V128_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): could be made shorter when consts involved. + Xmm src1 = i.src1.is_constant ? e.xmm1 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src1.constant()); + } + + Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; if (i.src2.is_constant) { - if (i.src2.value->IsConstantZero()) { - e.vpxor(e.xmm1, e.xmm1); - } else { - assert_always(); - } - } else { - e.vpandn(e.xmm1, i.src1, i.src2); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); } + e.vpandn(e.xmm0, src1, src2); + + Xmm src3 = i.src3.is_constant ? 
i.dest : i.src3; if (i.src3.is_constant) { - if (i.src3.value->IsConstantZero()) { - e.vpxor(i.dest, i.dest); - } else { - assert_always(); - } - } else { - e.vpand(i.dest, i.src1, i.src3); + e.LoadConstantXmm(i.dest, i.src3.constant()); } - e.vpor(i.dest, e.xmm1); + e.vpand(i.dest, src1, src3); + + e.vpor(i.dest, i.dest, e.xmm0); } }; EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, @@ -2926,14 +2955,20 @@ struct COMPARE_EQ_I64 struct COMPARE_EQ_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomiss(i.src1, i.src2); + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomiss(src1, src2); + }); e.sete(i.dest); } }; struct COMPARE_EQ_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomisd(i.src1, i.src2); + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomisd(src1, src2); + }); e.sete(i.dest); } }; @@ -3210,6 +3245,9 @@ struct VECTOR_COMPARE_UGT_V128 case FLOAT32_TYPE: sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); break; + default: + assert_always(); + break; } if (i.src1.is_constant) { // TODO(benvanik): make this constant. @@ -3418,43 +3456,9 @@ EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, // ============================================================================ struct VECTOR_ADD : Sequence> { - static __m128i EmulateVectorAddUnsignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t a[4]; - alignas(16) uint32_t b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 4; ++i) { - uint64_t v = (uint64_t)a[i] + (uint64_t)b[i]; - if (v > 0xFFFFFFFF) { - a[i] = 0xFFFFFFFF; - } else { - a[i] = (uint32_t)v; - } - } - return _mm_load_si128(reinterpret_cast<__m128i*>(a)); - } - static __m128i EmulateVectorAddSignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t a[4]; - alignas(16) int32_t b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 4; ++i) { - int64_t v = (int64_t)a[i] + (int64_t)b[i]; - if (v > 0x7FFFFFFF) { - a[i] = 0x7FFFFFFF; - } else if (v < -0x80000000ll) { - a[i] = 0x80000000; - } else { - a[i] = (uint32_t)v; - } - } - return _mm_load_si128(reinterpret_cast<__m128i*>(a)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { + Xmm src1, Xmm src2) { const TypeName part_type = static_cast(i.instr->flags & 0xFF); const uint32_t arithmetic_flags = i.instr->flags >> 8; bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); @@ -3487,71 +3491,56 @@ struct VECTOR_ADD case INT32_TYPE: if (saturate) { if (is_unsigned) { - // TODO(benvanik): broken with UINT32MAX+1 - //// We reuse all these temps... - // assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != - // e.xmm2); - // assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != - // e.xmm2); - //// Clamp to 0xFFFFFFFF. - //// Wish there was a vpaddusd... 
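// Reference sketch (not part of the patch): the reworked SELECT_F32/SELECT_F64/
// SELECT_V128_I8 sequences above build an all-ones mask where src1 == 0
// (vpcmpeqd/vpcmpeqq against zero) and blend with pandn/pand/por; the V128-by-V128
// form uses src1 directly as a per-bit mask. A scalar model of one 32-bit lane of the
// scalar-conditioned forms (SelectLane is an illustrative name):
#include <cstdint>

uint32_t SelectLane(uint32_t src1, uint32_t src2, uint32_t src3) {
  // Mask is all ones when the condition is zero, matching vpcmpeqd against zero.
  uint32_t mask = (src1 == 0) ? 0xFFFFFFFFu : 0u;
  // vpandn keeps src2 where the condition was nonzero, vpand keeps src3 where it was
  // zero, vpor merges: dest = src1 != 0 ? src2 : src3.
  return (~mask & src2) | (mask & src3);
}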
- //// | A | B | C | D | - //// | B | D | - // e.vpsllq(e.xmm0, src1, 32); - // e.vpsllq(e.xmm1, src2, 32); - // e.vpsrlq(e.xmm0, 32); - // e.vpsrlq(e.xmm1, 32); - // e.vpaddq(e.xmm0, e.xmm1); - // e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - // e.vpsllq(e.xmm0, 32); - // e.vpsrlq(e.xmm0, 32); - //// | A | C | - // e.vpsrlq(e.xmm1, src1, 32); - // e.vpsrlq(e.xmm2, src2, 32); - // e.vpaddq(e.xmm1, e.xmm2); - // e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - // e.vpsllq(e.xmm1, 32); - //// xmm0 = mask for with saturated dwords == 111... - // e.vpor(e.xmm0, e.xmm1); - // e.vpaddd(dest, src1, src2); - //// dest.f[n] = xmm1.f[n] ? xmm1.f[n] : dest.f[n]; - // e.vblendvps(dest, dest, e.xmm1, e.xmm1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAddUnsignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + // xmm0 is the only temp register that can be used by src1/src2. + e.vpaddd(e.xmm1, src1, src2); + + // If result is smaller than either of the inputs, we've + // overflowed (only need to check one input) + // if (src1 > res) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); + e.vpor(dest, e.xmm1, e.xmm0); } else { - // https://software.intel.com/en-us/forums/topic/285219 - // TODO(benvanik): this is broken with INTMAX+1. - // We reuse all these temps... - // assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != - // e.xmm2); - // assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != - // e.xmm2); - // e.vpaddd(e.xmm0, src1, src2); // res - // e.vpand(e.xmm1, src1, src2); // sign_and - // e.vpandn(e.xmm2, e.xmm0, e.xmm1); // min_sat_mask - // e.vblendvps(dest, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS), - // e.xmm2); - // e.vpor(e.xmm1, src1, src2); // sign_or - // e.vpandn(e.xmm1, e.xmm0); // max_sat_mask - // e.vblendvps(dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + // Preserve the sources. + if (dest == src1) { + e.vmovdqa(e.xmm2, src1); + src1 = e.xmm2; } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAddSignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + if (dest == src2) { + e.vmovdqa(e.xmm1, src2); + src2 = e.xmm1; + } + + // xmm0 is the only temp register that can be used by src1/src2. + e.vpaddd(dest, src1, src2); + + // Overflow results if two inputs are the same sign and the result + // isn't the same sign. + // if ((s32b)(~(src1 ^ src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm1, src1, src2); + + // Move src1 to xmm0 in-case it was the same register as the dest. + // This kills src2 if it's a constant. + if (src1 != e.xmm0) { + e.vmovdqa(e.xmm0, src1); + src1 = e.xmm0; + } + + e.vpxor(e.xmm2, src1, dest); + e.vpandn(e.xmm1, e.xmm1, e.xmm2); + + // High bit of xmm1 is now set if overflowed. 
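// Reference sketch (not part of the patch): a scalar model of the unsigned
// saturating-add path emitted above (vpaddd, then a sign-biased vpcmpgtd to detect
// wraparound, then vpor to force saturated lanes to all ones). AddUnsignedSat32 is an
// illustrative name.
#include <cstdint>

uint32_t AddUnsignedSat32(uint32_t a, uint32_t b) {
  uint32_t res = a + b;                              // vpaddd, wraps on overflow
  // If the wrapped result is smaller than an input, the add overflowed.
  uint32_t overflow = (a > res) ? 0xFFFFFFFFu : 0u;  // sign-biased vpcmpgtd mask
  return res | overflow;                             // vpor: clamp to UINT32_MAX
}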
+ + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMSignMaskI32), e.xmm2); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm2); } } else { e.vpaddd(dest, src1, src2); @@ -3630,22 +3619,9 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, // ============================================================================ struct VECTOR_SUB : Sequence> { - static __m128i EmulateVectorSubSignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t src1v[4]; - alignas(16) int32_t src2v[4]; - alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = int64_t(src1v[i]) - int64_t(src2v[i]); - value[i] = t < INT_MIN ? INT_MIN : (t > INT_MAX ? INT_MAX : int32_t(t)); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { + Xmm src1, Xmm src2) { const TypeName part_type = static_cast(i.instr->flags & 0xFF); const uint32_t arithmetic_flags = i.instr->flags >> 8; bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); @@ -3678,13 +3654,57 @@ struct VECTOR_SUB case INT32_TYPE: if (saturate) { if (is_unsigned) { - assert_always(); + // xmm0 is the only temp register that can be used by src1/src2. + e.vpsubd(e.xmm1, src1, src2); + + // If result is greater than either of the inputs, we've + // underflowed (only need to check one input) + // if (res > src1) then underflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); + e.vpandn(dest, e.xmm0, e.xmm1); } else { - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorSubSignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + // Preserve the sources. + if (dest == src1) { + e.vmovdqa(e.xmm2, src1); + src1 = e.xmm2; + } + if (dest == src2) { + e.vmovdqa(e.xmm1, src2); + src2 = e.xmm1; + } + + // xmm0 is the only temp register that can be used by src1/src2. + e.vpsubd(dest, src1, src2); + + // We can only overflow if the signs of the operands are opposite. + // If signs are opposite and result sign isn't the same as src1's + // sign, we've overflowed. + // if ((s32b)((src1 ^ src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm1, src1, src2); + + // Move src1 to xmm0 in-case it's the same register as the dest. + // This kills src2 if it's a constant. + if (src1 != e.xmm0) { + e.vmovdqa(e.xmm0, src1); + src1 = e.xmm0; + } + + e.vpxor(e.xmm2, src1, dest); + e.vpand(e.xmm1, e.xmm1, e.xmm2); + + // High bit of xmm1 is now set if overflowed. 
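// Reference sketch (not part of the patch): a scalar model of the signed
// saturating-add path above. Overflow occurs only when both operands share a sign and
// the result's sign differs; overflowed lanes are then clamped to INT_MIN or INT_MAX
// based on src1's sign, mirroring the two vblendvps steps. AddSignedSat32 is an
// illustrative name.
#include <cstdint>
#include <limits>

int32_t AddSignedSat32(int32_t a, int32_t b) {
  uint32_t ua = uint32_t(a), ub = uint32_t(b);
  uint32_t res = ua + ub;                                  // vpaddd
  bool overflowed = ((~(ua ^ ub) & (ua ^ res)) >> 31) != 0;
  if (!overflowed) {
    return int32_t(res);
  }
  return a < 0 ? std::numeric_limits<int32_t>::min()       // negative overflow
               : std::numeric_limits<int32_t>::max();      // positive overflow
}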
+ + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMSignMaskI32), e.xmm2); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm2); } } else { e.vpsubd(dest, src1, src2); @@ -4361,68 +4381,113 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, // ============================================================================ // d = 1 * 2 + 3 // $0 = $1x$0 + $2 -// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. -// dest could be src2 or src3 - need to ensure it's not before overwriting dest -// perhaps use other 132/213/etc -// Forms: +// Forms of vfmadd/vfmsub: // - 132 -> $1 = $1 * $3 + $2 // - 213 -> $1 = $2 * $1 + $3 // - 231 -> $1 = $2 * $3 + $1 struct MUL_ADD_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + float mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddss(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213ss(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213ss(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231ss(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, i.src1); - e.vfmadd213ss(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213ss(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213ss(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmadd213ss(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovss(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; struct MUL_ADD_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. 
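// Reference sketch (not part of the patch): a scalar model of the signed
// saturating-subtract path above. For subtraction, overflow is only possible when the
// operands have opposite signs, and it shows up as the result disagreeing with src1's
// sign. SubSignedSat32 is an illustrative name.
#include <cstdint>
#include <limits>

int32_t SubSignedSat32(int32_t a, int32_t b) {
  uint32_t ua = uint32_t(a), ub = uint32_t(b);
  uint32_t res = ua - ub;                                  // vpsubd
  bool overflowed = (((ua ^ ub) & (ua ^ res)) >> 31) != 0;
  if (!overflowed) {
    return int32_t(res);
  }
  return a < 0 ? std::numeric_limits<int32_t>::min()
               : std::numeric_limits<int32_t>::max();
}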
+ if (i.src1.is_constant && i.src2.is_constant) { + double mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddsd(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213sd(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213sd(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231sd(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, i.src1); - e.vfmadd213sd(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213sd(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213sd(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmadd213sd(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovsd(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -4430,37 +4495,58 @@ struct MUL_ADD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + vec128_t mul; + for (int n = 0; n < 4; n++) { + mul.f32[n] = i.src1.constant().f32[n] * i.src2.constant().f32[n]; + } + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddps(i.dest, e.xmm0, i.src3); + return; + } + // TODO(benvanik): the vfmadd sequence produces slightly different results // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. if (false && e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213ps(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213ps(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231ps(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovdqa(i.dest, i.src1); - e.vfmadd213ps(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213ps(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213ps(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ps(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmadd213ps(i.dest, src2, i.src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - src3 = e.xmm0; + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
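// Reference sketch (not part of the patch): what the three vfmadd/vfmsub encodings
// used above compute, per the 132/213/231 forms listed at the top of this section.
// All of them evaluate a multiply-add; the suffix only fixes which operand slot is
// multiplied with which, which is why the emitter picks the form whose first slot is
// already the destination register. Function names are illustrative.
float MulAdd132(float op1, float op2, float op3) { return op1 * op3 + op2; }
float MulAdd213(float op1, float op2, float op3) { return op2 * op1 + op3; }
float MulAdd231(float op1, float op2, float op3) { return op2 * op3 + op1; }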
src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm0, i.src3); - src3 = e.xmm0; + e.vmovdqa(e.xmm1, i.src3); + src3 = e.xmm1; } } - e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -4481,58 +4567,106 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); struct MUL_SUB_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + float mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubss(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmsub213ss(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmsub213ss(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmsub231ss(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, i.src1); - e.vfmsub213ss(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213ss(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213ss(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmsub213ss(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovss(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; struct MUL_SUB_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. 
+ if (i.src1.is_constant && i.src2.is_constant) { + double mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubsd(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmsub213sd(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmsub213sd(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmsub231sd(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, i.src1); - e.vfmsub213sd(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213sd(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213sd(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmsub213sd(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovsd(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -4540,50 +4674,56 @@ struct MUL_SUB_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + vec128_t mul; + for (int n = 0; n < 4; n++) { + mul.f32[n] = i.src1.constant().f32[n] * i.src2.constant().f32[n]; + } + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubps(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src2, e.xmm0); + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213ps(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213ps(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ps(i.dest, src1, src2); } else { - e.vfmsub213ps(i.dest, i.src2, i.src3); + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmsub213ps(i.dest, src2, i.src3); } - } else if (i.dest == i.src2) { - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src1, e.xmm0); - } else { - e.vfmsub213ps(i.dest, i.src1, i.src3); - } - } else if (i.dest == i.src3) { - e.vfmsub231ps(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything. 
- e.vmovdqa(i.dest, i.src1); - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src2, e.xmm0); - } else { - e.vfmsub213ps(i.dest, i.src2, i.src3); - } - } + }); } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - src3 = e.xmm0; + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm0, i.src3); - src3 = e.xmm0; + e.vmovdqa(e.xmm1, i.src3); + src3 = e.xmm1; } } - e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -5274,7 +5414,28 @@ struct VECTOR_SHL_V128 return; } } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + // See if the shift is equal first for a shortcut. + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsllw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsllw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5284,6 +5445,8 @@ struct VECTOR_SHL_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { alignas(16) uint32_t value[4]; @@ -5296,28 +5459,32 @@ struct VECTOR_SHL_V128 return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F); + return; + } + } + if (e.IsFeatureEnabled(kX64EmitAVX2)) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpslld. - e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F); - } else { - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsllvd(i.dest, i.src1, e.xmm0); + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, i.src1, e.xmm0); } else { // Fully variable shift. 
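// Reference sketch (not part of the patch): the intent of the vpshuflw/vpshufd +
// vptest shortcut added above for the non-AVX2 vector shifts. When every lane of the
// shift-amount vector is identical, one hardware shift by that count handles the
// whole vector; otherwise the code falls back to the per-lane emulation helper.
// ShlU32x4 is an illustrative name.
#include <cstdint>

void ShlU32x4(uint32_t dst[4], const uint32_t src[4], const uint32_t amt[4]) {
  bool all_same = amt[0] == amt[1] && amt[0] == amt[2] && amt[0] == amt[3];
  if (all_same) {
    uint32_t count = amt[0] & 0x1F;                    // single vpslld covers all lanes
    for (int n = 0; n < 4; ++n) dst[n] = src[n] << count;
  } else {
    for (int n = 0; n < 4; ++n) dst[n] = src[n] << (amt[n] & 0x1F);  // emulated path
  }
}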
// src shift mask may have values >31, and x86 sets to zero when @@ -5326,7 +5493,26 @@ struct VECTOR_SHL_V128 e.vpsllvd(i.dest, i.src1, e.xmm0); } } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5336,6 +5522,8 @@ struct VECTOR_SHL_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } } }; @@ -5410,7 +5598,28 @@ struct VECTOR_SHR_V128 return; } } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrlw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrlw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5420,6 +5629,8 @@ struct VECTOR_SHR_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { alignas(16) uint32_t value[4]; @@ -5442,7 +5653,7 @@ struct VECTOR_SHR_V128 } } if (all_same) { - // Every count is the same, so we can use vpslld. + // Every count is the same, so we can use vpsrld. e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F); return; } else { @@ -5457,28 +5668,47 @@ struct VECTOR_SHR_V128 return; } } - } else { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsrlvd(i.dest, i.src1, e.xmm0); - return; - } } - // We've reached here if we don't have AVX2 and it's a variable shift. - // TODO(benvanik): native version. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, i.src1, e.xmm0); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrld. 
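// Reference sketch (not part of the patch): why the AVX2 paths above mask the shift
// counts with XMMShiftMaskPS before vpsllvd/vpsrlvd. The hardware zeroes a lane whose
// count exceeds 31, while the guest is expected to take the count modulo 32 (an
// assumption based on the comments above). Function names are illustrative.
#include <cstdint>

uint32_t ShlLaneMasked(uint32_t v, uint32_t count) {
  return v << (count & 0x1F);             // masked count: modulo-32 semantics
}

uint32_t ShlLaneAvx2Raw(uint32_t v, uint32_t count) {
  return count > 31 ? 0u : (v << count);  // unmasked vpsllvd behavior
}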
+ e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrld(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); - e.vmovaps(i.dest, e.xmm0); } }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); @@ -5498,6 +5728,20 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { alignas(16) int16_t value[8]; alignas(16) int16_t shamt[8]; @@ -5508,6 +5752,58 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsraw. + e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsraw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsraw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { alignas(16) int32_t value[4]; alignas(16) int32_t shamt[4]; @@ -5518,55 +5814,79 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrad. 
+ e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + } else { + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + } + e.vpsravd(i.dest, i.src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrad(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } + static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { case INT8_TYPE: - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); - e.vmovaps(i.dest, e.xmm0); + EmitInt8(e, i); break; case INT16_TYPE: - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); - e.vmovaps(i.dest, e.xmm0); + EmitInt16(e, i); break; case INT32_TYPE: - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); - } else { - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - } - e.vpsravd(i.dest, i.src1, e.xmm0); - } else { - // TODO(benvanik): native version. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); - e.vmovaps(i.dest, e.xmm0); - } + EmitInt32(e, i); break; default: assert_always(); @@ -5677,14 +5997,24 @@ struct VECTOR_ROTATE_LEFT_V128 case INT8_TYPE: // TODO(benvanik): native version (with shift magic). e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); e.vmovaps(i.dest, e.xmm0); break; case INT16_TYPE: // TODO(benvanik): native version (with shift magic). 
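// Reference sketch (not part of the patch): scalar semantics of a per-lane rotate as
// the EmulateVectorRotateLeftI8/I16/I32 helpers invoked above are expected to
// implement for one 32-bit lane (assumption: the count is taken modulo the lane
// width). RotateLeft32 is an illustrative name.
#include <cstdint>

uint32_t RotateLeft32(uint32_t v, uint32_t count) {
  count &= 31;
  if (count == 0) {
    return v;  // avoid an undefined shift by 32 in the fallback expression
  }
  return (v << count) | (v >> (32 - count));
}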
e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); e.vmovaps(i.dest, e.xmm0); break; @@ -5706,7 +6036,12 @@ struct VECTOR_ROTATE_LEFT_V128 } else { // TODO(benvanik): non-AVX2 native version. e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); e.vmovaps(i.dest, e.xmm0); } @@ -6264,6 +6599,8 @@ struct PERMUTE_V128 static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find out how to do this with only one temp register! // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. if (i.src3.value->IsConstantZero()) { // Permuting with src2/zero, so just shuffle/mask. if (i.src2.value->IsConstantZero()) { @@ -6324,43 +6661,42 @@ struct PERMUTE_V128 } } - static __m128i EmulateByInt16(void*, __m128i control, __m128i src1, - __m128i src2) { - alignas(16) uint16_t c[8]; - alignas(16) uint16_t a[8]; - alignas(16) uint16_t b[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(c), control); - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 8; ++i) { - uint16_t si = (c[i] & 0xF) ^ 0x1; - c[i] = si >= 8 ? b[si - 8] : a[si]; - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): replace with proper version. + // src1 is an array of indices corresponding to positions within src2 and + // src3. assert_true(i.src1.is_constant); - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.lea(e.r8, e.StashXmm(0, e.xmm0)); - } else { - e.lea(e.r8, e.StashXmm(0, i.src1)); + vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; } + e.LoadConstantXmm(e.xmm0, perm); + if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.LoadConstantXmm(e.xmm1, i.src2.constant()); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.vmovdqa(e.xmm1, i.src2); } if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.lea(e.r10, e.StashXmm(2, e.xmm0)); + e.LoadConstantXmm(e.xmm2, i.src3.constant()); } else { - e.lea(e.r10, e.StashXmm(2, i.src3)); + e.vmovdqa(e.xmm2, i.src3); } - e.CallNativeSafe(reinterpret_cast(EmulateByInt16)); - e.vmovaps(i.dest, e.xmm0); + + e.vpshufb(e.xmm1, e.xmm1, e.xmm0); + e.vpshufb(e.xmm2, e.xmm2, e.xmm0); + + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); } static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { @@ -6646,7 +6982,12 @@ struct PACK : Sequence> { if (IsPackOutSaturate(flags)) { // signed -> unsigned + saturate // PACKUSWB / SaturateSignedWordToUnsignedByte - e.vpackuswb(i.dest, i.src1, i.src2); + Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + } + + e.vpackuswb(i.dest, i.src1, src2); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } else { // signed -> unsigned @@ -6665,19 +7006,6 @@ struct PACK : Sequence> { } } } - static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t a[4]; - alignas(16) uint32_t b[4]; - alignas(16) uint16_t c[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 4; ++i) { - c[i] = uint16_t(std::min(65535u, a[i])); - c[i + 4] = uint16_t(std::min(65535u, b[i])); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, uint32_t flags) { // TODO(benvanik): handle src2 (or src1) being constant zero @@ -6685,18 +7013,28 @@ struct PACK : Sequence> { if (IsPackOutUnsigned(flags)) { if (IsPackOutSaturate(flags)) { // unsigned -> unsigned + saturate - Xmm src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulatePack16_IN_32_UN_UN_SAT)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + // Construct a saturation mask + e.mov(e.eax, ~0xFFFFu); + e.vmovd(e.xmm0, e.eax); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + + e.vandps(e.xmm1, e.xmm0, i.src1); // src1 & 0xFFFF0000 + e.vpcmpeqd(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMZero)); + e.vpxor(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMFFFF)); + e.vpor(e.xmm1, e.xmm1, i.src1); // Saturate src1 + e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + + e.vandps(e.xmm0, e.xmm0, i.src2); // src2 & 0xFFFF0000 + e.vpcmpeqd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + e.vpxor(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFFFF)); + e.vpor(i.dest, e.xmm0, i.src2); // Saturate src2 + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + + e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); } else { 
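// Reference sketch (not part of the patch): the scalar semantics that the new
// vpshufb/vpblendw sequence above encodes, mirroring the removed EmulateByInt16
// helper: each 16-bit control entry selects a word from src2 (indices 0-7) or src3
// (indices 8-15), with the low index bit flipped to account for word order.
// PermuteWords is an illustrative name.
#include <cstdint>

void PermuteWords(uint16_t dst[8], const uint16_t control[8],
                  const uint16_t src2[8], const uint16_t src3[8]) {
  for (int n = 0; n < 8; ++n) {
    uint16_t si = (control[n] & 0xF) ^ 0x1;
    dst[n] = si >= 8 ? src3[si - 8] : src2[si];
  }
}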
// unsigned -> unsigned e.vmovaps(e.xmm0, i.src1); @@ -6771,6 +7109,9 @@ struct UNPACK : Sequence> { case PACK_TYPE_FLOAT16_2: EmitFLOAT16_2(e, i); break; + case PACK_TYPE_FLOAT16_3: + EmitFLOAT16_3(e, i); + break; case PACK_TYPE_FLOAT16_4: EmitFLOAT16_4(e, i); break; @@ -6814,7 +7155,7 @@ struct UNPACK : Sequence> { _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); for (int i = 0; i < 2; i++) { - b[i] = half_float::detail::half2float(a[7 - i]); + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); } // Constants, or something @@ -6862,23 +7203,34 @@ struct UNPACK : Sequence> { e.vmovaps(i.dest, e.xmm0); } } + // FIXME: This has not been verified on a real 360, but from context the + // return values are used in floating point math. + static __m128 EmulateFLOAT16_3(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 3; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(5 + i)]); + } + + // FIXME: Correct? + b[3] = 1.0f; + + return _mm_load_ps(b); + } + static void EmitFLOAT16_3(X64Emitter& e, const EmitArgType& i) { + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_3)); + e.vmovaps(i.dest, e.xmm0); + } static __m128 EmulateFLOAT16_4(void*, __m128i src1) { alignas(16) uint16_t a[8]; alignas(16) float b[4]; _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - // The floats come in swapped for some reason. Swap them back. - for (int i = 0; i < 2; i++) { - uint16_t& n1 = a[7 - (i * 2)]; - uint16_t& n2 = a[6 - (i * 2)]; - - uint16_t tmp = n1; - n1 = n2; - n2 = tmp; - } - for (int i = 0; i < 4; i++) { - b[3 - i] = half_float::detail::half2float(a[7 - i]); + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); } return _mm_load_ps(b); @@ -7086,6 +7438,38 @@ EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, ATOMIC_EXCHANGE_I64); +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.eax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.dword[e.rdx + e.rcx], i.src3); + e.sete(i.dest); + + e.ReloadECX(); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.rax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.qword[e.rdx + e.rcx], i.src3); + e.sete(i.dest); + + e.ReloadECX(); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + void RegisterSequences() { Register_OPCODE_COMMENT(); Register_OPCODE_NOP(); @@ -7201,6 +7585,7 @@ void RegisterSequences() { Register_OPCODE_PACK(); Register_OPCODE_UNPACK(); Register_OPCODE_ATOMIC_EXCHANGE(); + Register_OPCODE_ATOMIC_COMPARE_EXCHANGE(); } bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index c8a5ef632..7be733142 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -161,6 +161,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { 
i->Remove(); } break; + case OPCODE_ROUND: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->Round(RoundMode(i->flags)); + i->Remove(); + } + break; case OPCODE_ZERO_EXTEND: if (i->src1.value->IsConstant()) { TypeName target_type = v->type; @@ -188,6 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case OPCODE_LOAD: if (i->src1.value->IsConstant()) { + assert_false(i->flags & LOAD_STORE_BYTE_SWAP); auto memory = processor_->memory(); auto address = i->src1.value->constant.i32; auto mmio_range = @@ -253,12 +261,23 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case OPCODE_SELECT: if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantTrue()) { - v->set_from(i->src2.value); + if (i->src1.value->type != VEC128_TYPE) { + if (i->src1.value->IsConstantTrue()) { + v->set_from(i->src2.value); + i->Remove(); + } else if (i->src1.value->IsConstantFalse()) { + v->set_from(i->src3.value); + i->Remove(); + } else if (i->src2.value->IsConstant() && + i->src3.value->IsConstant()) { + // TODO: Select + // v->set_from(i->src2.value); + // v->Select(i->src3.value, i->src1.value); + // i->Remove(); + } } else { - v->set_from(i->src3.value); + // TODO: vec128 select } - i->Remove(); } break; case OPCODE_IS_TRUE: @@ -355,7 +374,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { break; case OPCODE_DID_SATURATE: - assert_true(!i->src1.value->IsConstant()); + // assert_true(!i->src1.value->IsConstant()); break; case OPCODE_ADD: @@ -413,8 +432,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Remove(); } break; - // case OPCODE_MUL_ADD: - // case OPCODE_MUL_SUB + case OPCODE_MUL_ADD: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + // Multiply part is constant. + if (i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value); + i->Remove(); + } + } + break; + case OPCODE_MUL_SUB: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + // Multiply part is constant. + if (i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value); + i->Remove(); + } + } + break; + case OPCODE_MAX: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->Max(i->src2.value); + i->Remove(); + } + break; case OPCODE_NEG: if (i->src1.value->IsConstant()) { v->set_from(i->src1.value); @@ -484,7 +528,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Remove(); } break; - // TODO(benvanik): VECTOR_SHL case OPCODE_SHR: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); @@ -515,13 +558,80 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } break; // TODO(benvanik): INSERT/EXTRACT - // TODO(benvanik): SPLAT/PERMUTE/SWIZZLE - case OPCODE_SPLAT: - if (i->src1.value->IsConstant()) { - // Quite a few of these, from building vec128s. 
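// Reference sketch (not part of the patch): the scalar effect of the new
// OPCODE_MUL_ADD / OPCODE_MUL_SUB constant folding above when all three operands are
// known, assuming float operands. Function names are illustrative of what
// Value::MulAdd / Value::MulSub are expected to compute.
float FoldMulAdd(float src1, float src2, float src3) {
  return src1 * src2 + src3;
}
float FoldMulSub(float src1, float src2, float src3) {
  return src1 * src2 - src3;
}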
+ // TODO(benvanik): PERMUTE/SWIZZLE + case OPCODE_EXTRACT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_zero(v->type); + v->Extract(i->src1.value, i->src2.value); + i->Remove(); + } + break; + case OPCODE_SPLAT: + if (i->src1.value->IsConstant()) { + v->set_zero(v->type); + v->Splat(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_COMPARE_EQ: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_COMPARE_SGT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_CONVERT_F2I: + if (i->src1.value->IsConstant()) { + v->set_zero(VEC128_TYPE); + v->VectorConvertF2I(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_CONVERT_I2F: + if (i->src1.value->IsConstant()) { + v->set_zero(VEC128_TYPE); + v->VectorConvertI2F(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_SHL: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorShl(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_SHR: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorShr(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_ROTATE_LEFT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorRol(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_SUB: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + uint32_t arith_flags = i->flags >> 8; + v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF), + !!(arith_flags & ARITHMETIC_UNSIGNED), + !!(arith_flags & ARITHMETIC_SATURATE)); + i->Remove(); } break; - default: // Ignored. break; diff --git a/src/xenia/cpu/cpu_flags.cc b/src/xenia/cpu/cpu_flags.cc index 0d1748c1a..6c37e7edc 100644 --- a/src/xenia/cpu/cpu_flags.cc +++ b/src/xenia/cpu/cpu_flags.cc @@ -28,6 +28,10 @@ DEFINE_bool(trace_function_references, false, DEFINE_bool(trace_function_data, false, "Generate tracing for function result data."); +DEFINE_bool( + disable_global_lock, false, + "Disables global lock usage in guest code. 
Does not affect host code."); + DEFINE_bool(validate_hir, false, "Perform validation checks on the HIR during compilation."); diff --git a/src/xenia/cpu/cpu_flags.h b/src/xenia/cpu/cpu_flags.h index 578429b74..17b88ff08 100644 --- a/src/xenia/cpu/cpu_flags.h +++ b/src/xenia/cpu/cpu_flags.h @@ -23,6 +23,8 @@ DECLARE_bool(trace_function_coverage); DECLARE_bool(trace_function_references); DECLARE_bool(trace_function_data); +DECLARE_bool(disable_global_lock); + DECLARE_bool(validate_hir); DECLARE_uint64(break_on_instruction); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index a27f0b86a..db278cd81 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -2074,6 +2074,17 @@ Value* HIRBuilder::AtomicExchange(Value* address, Value* new_value) { return i->dest; } +Value* HIRBuilder::AtomicCompareExchange(Value* address, Value* old_value, + Value* new_value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_ATOMIC_COMPARE_EXCHANGE_info, 0, + AllocValue(INT8_TYPE)); + i->set_src1(address); + i->set_src2(old_value); + i->set_src3(new_value); + return i->dest; +} + } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index d6e0e8ccc..44a528f53 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -236,6 +236,8 @@ class HIRBuilder { Value* Unpack(Value* value, uint32_t pack_flags = 0); Value* AtomicExchange(Value* address, Value* new_value); + Value* AtomicCompareExchange(Value* address, Value* old_value, + Value* new_value); Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 84bf2b320..8e440c73e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -76,13 +76,14 @@ enum PackType : uint16_t { // Special types: PACK_TYPE_D3DCOLOR = 0, PACK_TYPE_FLOAT16_2 = 1, - PACK_TYPE_FLOAT16_4 = 2, - PACK_TYPE_SHORT_2 = 3, - PACK_TYPE_UINT_2101010 = 4, + PACK_TYPE_FLOAT16_3 = 2, // FIXME: Not verified, but looks correct. + PACK_TYPE_FLOAT16_4 = 3, + PACK_TYPE_SHORT_2 = 4, + PACK_TYPE_UINT_2101010 = 5, // Types which use the bitmasks below for configuration: - PACK_TYPE_8_IN_16 = 5, - PACK_TYPE_16_IN_32 = 6, + PACK_TYPE_8_IN_16 = 6, + PACK_TYPE_16_IN_32 = 7, PACK_TYPE_MODE = 0x000F, // just to get the mode @@ -220,6 +221,7 @@ enum Opcode { OPCODE_PACK, OPCODE_UNPACK, OPCODE_ATOMIC_EXCHANGE, + OPCODE_ATOMIC_COMPARE_EXCHANGE, __OPCODE_MAX_VALUE, // Keep at end. 
}; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index c5deb7ff8..a2968e238 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -631,3 +631,9 @@ DEFINE_OPCODE( "atomic_exchange", OPCODE_SIG_V_V_V, OPCODE_FLAG_VOLATILE) + +DEFINE_OPCODE( + OPCODE_ATOMIC_COMPARE_EXCHANGE, + "atomic_compare_exchange", + OPCODE_SIG_V_V_V_V, + OPCODE_FLAG_VOLATILE) diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 03ef79a2a..4d30de853 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -46,13 +46,13 @@ uint32_t Value::AsUint32() { assert_true(IsConstant()); switch (type) { case INT8_TYPE: - return constant.i8; + return constant.u8; case INT16_TYPE: - return constant.i16; + return constant.u16; case INT32_TYPE: - return constant.i32; + return constant.u32; case INT64_TYPE: - return (uint32_t)constant.i64; + return (uint32_t)constant.u64; default: assert_unhandled_case(type); return 0; @@ -63,13 +63,13 @@ uint64_t Value::AsUint64() { assert_true(IsConstant()); switch (type) { case INT8_TYPE: - return constant.i8; + return constant.u8; case INT16_TYPE: - return constant.i16; + return constant.u16; case INT32_TYPE: - return constant.i32; + return constant.u32; case INT64_TYPE: - return constant.i64; + return constant.u64; default: assert_unhandled_case(type); return 0; @@ -85,15 +85,15 @@ void Value::ZeroExtend(TypeName target_type) { switch (type) { case INT8_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFF; + constant.u64 = constant.u8; return; case INT16_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFFFF; + constant.u64 = constant.u16; return; case INT32_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFFFFFFFF; + constant.u64 = constant.u32; return; default: assert_unhandled_case(type); @@ -210,12 +210,30 @@ void Value::Convert(TypeName target_type, RoundMode round_mode) { assert_unhandled_case(target_type); return; } + case INT64_TYPE: + switch (target_type) { + case FLOAT64_TYPE: + type = target_type; + constant.f64 = (double)constant.i64; + return; + default: + assert_unhandled_case(target_type); + return; + } case FLOAT64_TYPE: switch (target_type) { case FLOAT32_TYPE: type = target_type; constant.f32 = (float)constant.f64; return; + case INT32_TYPE: + type = target_type; + constant.i32 = (int32_t)constant.f64; + return; + case INT64_TYPE: + type = target_type; + constant.i64 = (int64_t)constant.f64; + return; default: assert_unhandled_case(target_type); return; @@ -227,8 +245,28 @@ void Value::Convert(TypeName target_type, RoundMode round_mode) { } void Value::Round(RoundMode round_mode) { - // TODO(benvanik): big matrix. 
-  assert_always();
+  switch (type) {
+    case FLOAT32_TYPE:
+      switch (round_mode) {
+        case ROUND_TO_NEAREST:
+          constant.f32 = std::round(constant.f32);
+          return;
+      }
+      return;
+    case FLOAT64_TYPE:
+      return;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        switch (round_mode) {
+          case ROUND_TO_NEAREST:
+            constant.v128.f32[i] = std::round(constant.v128.f32[i]);
+            break;
+        }
+      }
+      return;
+    default:
+      assert_unhandled_case(type);
+  }
 }
 
 bool Value::Add(Value* other) {
@@ -325,6 +363,11 @@ void Value::Mul(Value* other) {
     case FLOAT64_TYPE:
       constant.f64 *= other->constant.f64;
       break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] *= other->constant.v128.f32[i];
+      }
+      break;
     default:
       assert_unhandled_case(type);
       break;
   }
@@ -406,6 +449,32 @@ void Value::Div(Value* other, bool is_unsigned) {
     case FLOAT64_TYPE:
       constant.f64 /= other->constant.f64;
       break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] /= other->constant.v128.f32[i];
+      }
+      break;
+    default:
+      assert_unhandled_case(type);
+      break;
+  }
+}
+
+void Value::Max(Value* other) {
+  assert_true(type == other->type);
+  switch (type) {
+    case FLOAT32_TYPE:
+      constant.f32 = std::max(constant.f32, other->constant.f32);
+      break;
+    case FLOAT64_TYPE:
+      constant.f64 = std::max(constant.f64, other->constant.f64);
+      break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] =
+            std::max(constant.v128.f32[i], other->constant.v128.f32[i]);
+      }
+      break;
     default:
       assert_unhandled_case(type);
       break;
@@ -413,13 +482,49 @@
 }
 
 void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
-  // TODO(benvanik): big matrix.
-  assert_always();
+  switch (dest->type) {
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        dest->constant.v128.f32[i] =
+            (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) +
+            value3->constant.v128.f32[i];
+      }
+      break;
+    case FLOAT32_TYPE:
+      dest->constant.f32 =
+          (value1->constant.f32 * value2->constant.f32) + value3->constant.f32;
+      break;
+    case FLOAT64_TYPE:
+      dest->constant.f64 =
+          (value1->constant.f64 * value2->constant.f64) + value3->constant.f64;
+      break;
+    default:
+      assert_unhandled_case(dest->type);
+      break;
+  }
 }
 
 void Value::MulSub(Value* dest, Value* value1, Value* value2, Value* value3) {
-  // TODO(benvanik): big matrix.
- assert_always(); + switch (dest->type) { + case VEC128_TYPE: + for (int i = 0; i < 4; i++) { + dest->constant.v128.f32[i] = + (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) - + value3->constant.v128.f32[i]; + } + break; + case FLOAT32_TYPE: + dest->constant.f32 = + (value1->constant.f32 * value2->constant.f32) - value3->constant.f32; + break; + case FLOAT64_TYPE: + dest->constant.f64 = + (value1->constant.f64 * value2->constant.f64) - value3->constant.f64; + break; + default: + assert_unhandled_case(dest->type); + break; + } } void Value::Neg() { @@ -527,6 +632,9 @@ void Value::And(Value* other) { case INT64_TYPE: constant.i64 &= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 &= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -548,6 +656,9 @@ void Value::Or(Value* other) { case INT64_TYPE: constant.i64 |= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 |= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -569,6 +680,9 @@ void Value::Xor(Value* other) { case INT64_TYPE: constant.i64 ^= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 ^= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -603,16 +717,16 @@ void Value::Shl(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 <<= other->constant.i8; + constant.u8 <<= other->constant.u8; break; case INT16_TYPE: - constant.i16 <<= other->constant.i8; + constant.u16 <<= other->constant.u8; break; case INT32_TYPE: - constant.i32 <<= other->constant.i8; + constant.u32 <<= other->constant.u8; break; case INT64_TYPE: - constant.i64 <<= other->constant.i8; + constant.u64 <<= other->constant.u8; break; default: assert_unhandled_case(type); @@ -624,16 +738,16 @@ void Value::Shr(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 = (uint8_t)constant.i8 >> other->constant.i8; + constant.u8 = constant.u8 >> other->constant.u8; break; case INT16_TYPE: - constant.i16 = (uint16_t)constant.i16 >> other->constant.i8; + constant.u16 = constant.u16 >> other->constant.u8; break; case INT32_TYPE: - constant.i32 = (uint32_t)constant.i32 >> other->constant.i8; + constant.u32 = constant.u32 >> other->constant.u8; break; case INT64_TYPE: - constant.i64 = (uint64_t)constant.i64 >> other->constant.i8; + constant.u64 = constant.u64 >> other->constant.u8; break; default: assert_unhandled_case(type); @@ -645,16 +759,16 @@ void Value::Sha(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 = constant.i8 >> other->constant.i8; + constant.i8 = constant.i8 >> other->constant.u8; break; case INT16_TYPE: - constant.i16 = constant.i16 >> other->constant.i8; + constant.i16 = constant.i16 >> other->constant.u8; break; case INT32_TYPE: - constant.i32 = constant.i32 >> other->constant.i8; + constant.i32 = constant.i32 >> other->constant.u8; break; case INT64_TYPE: - constant.i64 = constant.i64 >> other->constant.i8; + constant.i64 = constant.i64 >> other->constant.u8; break; default: assert_unhandled_case(type); @@ -662,6 +776,246 @@ void Value::Sha(Value* other) { } } +void Value::Extract(Value* vec, Value* index) { + assert_true(vec->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + constant.u8 = vec->constant.v128.u8[index->constant.u8]; + break; + case INT16_TYPE: + constant.u16 = vec->constant.v128.u16[index->constant.u16]; + break; + case INT32_TYPE: + constant.u32 = 
vec->constant.v128.u32[index->constant.u32]; + break; + case INT64_TYPE: + constant.u64 = vec->constant.v128.u64[index->constant.u64]; + break; + } +} + +void Value::Select(Value* other, Value* ctrl) { + // TODO + assert_always(); +} + +void Value::Splat(Value* other) { + assert_true(type == VEC128_TYPE); + switch (other->type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.i8[i] = other->constant.i8; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.i16[i] = other->constant.i16; + } + break; + case INT32_TYPE: + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.i32[i] = other->constant.i32; + } + break; + case INT64_TYPE: + case FLOAT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.i64[i] = other->constant.i64; + } + break; + default: + assert_unhandled_case(other->type); + break; + } +} + +void Value::VectorCompareEQ(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = + constant.v128.u8[i] == other->constant.v128.u8[i] ? -1 : 0; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = + constant.v128.u16[i] == other->constant.v128.u16[i] ? -1 : 0; + } + break; + case INT32_TYPE: + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.u32[i] == other->constant.v128.u32[i] ? -1 : 0; + } + break; + case INT64_TYPE: + case FLOAT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.u64[i] = + constant.v128.u64[i] == other->constant.v128.u64[i] ? -1 : 0; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorCompareSGT(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = + constant.v128.i8[i] > other->constant.v128.i8[i] ? -1 : 0; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = + constant.v128.i16[i] > other->constant.v128.i16[i] ? -1 : 0; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.i32[i] > other->constant.v128.i32[i] ? -1 : 0; + } + break; + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.f32[i] > other->constant.v128.f32[i] ? -1 : 0; + } + break; + case INT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.u64[i] = + constant.v128.i64[i] > other->constant.v128.i64[i] ? 
-1 : 0; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorConvertI2F(Value* other) { + assert_true(type == VEC128_TYPE); + for (int i = 0; i < 4; i++) { + constant.v128.f32[i] = (float)other->constant.v128.i32[i]; + } +} + +void Value::VectorConvertF2I(Value* other) { + assert_true(type == VEC128_TYPE); + for (int i = 0; i < 4; i++) { + constant.v128.i32[i] = (int32_t)other->constant.v128.f32[i]; + } +} + +void Value::VectorShl(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] <<= other->constant.v128.u8[i] & 0x7; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] <<= other->constant.v128.u16[i] & 0xF; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] <<= other->constant.v128.u32[i] & 0x1F; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorShr(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] >>= other->constant.v128.u8[i] & 0x7; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] >>= other->constant.v128.u16[i] & 0xF; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] >>= other->constant.v128.u32[i] & 0x1F; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorRol(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = xe::rotate_left(constant.v128.u8[i], + other->constant.v128.i8[i] & 0x7); + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = xe::rotate_left( + constant.v128.u16[i], other->constant.v128.u16[i] & 0xF); + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = xe::rotate_left( + constant.v128.u32[i], other->constant.v128.u32[i] & 0x1F); + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorSub(Value* other, TypeName type, bool is_unsigned, + bool saturate) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + if (is_unsigned) { + if (saturate) { + assert_always(); + } else { + constant.v128.u32[i] -= other->constant.v128.u32[i]; + } + } else { + if (saturate) { + assert_always(); + } else { + constant.v128.i32[i] -= other->constant.v128.i32[i]; + } + } + } + } +} + void Value::ByteSwap() { switch (type) { case INT8_TYPE: diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index c078983bb..d797f27d7 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -77,9 +77,13 @@ class Value { } Use; typedef union { int8_t i8; + uint8_t u8; int16_t i16; + uint16_t u16; int32_t i32; + uint32_t u32; int64_t i64; + uint64_t u64; float f32; double f64; vec128_t v128; @@ -190,6 +194,8 @@ class Value { return !!constant.f32; case FLOAT64_TYPE: return !!constant.f64; + case VEC128_TYPE: + return constant.v128.low || constant.v128.high; default: assert_unhandled_case(type); return false; @@ -199,9 +205,6 @@ class Value { } } bool IsConstantFalse() const { - if (type == VEC128_TYPE) { - 
assert_always(); - } if (flags & VALUE_IS_CONSTANT) { switch (type) { case INT8_TYPE: @@ -216,6 +219,8 @@ class Value { return !constant.f32; case FLOAT64_TYPE: return !constant.f64; + case VEC128_TYPE: + return !(constant.v128.low || constant.v128.high); default: assert_unhandled_case(type); return false; @@ -475,6 +480,7 @@ class Value { void Mul(Value* other); void MulHi(Value* other, bool is_unsigned); void Div(Value* other, bool is_unsigned); + void Max(Value* other); static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3); static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3); void Neg(); @@ -488,6 +494,17 @@ class Value { void Shl(Value* other); void Shr(Value* other); void Sha(Value* other); + void Extract(Value* vec, Value* index); + void Select(Value* other, Value* ctrl); + void Splat(Value* other); + void VectorCompareEQ(Value* other, TypeName type); + void VectorCompareSGT(Value* other, TypeName type); + void VectorConvertI2F(Value* other); + void VectorConvertF2I(Value* other); + void VectorShl(Value* other, TypeName type); + void VectorShr(Value* other, TypeName type); + void VectorRol(Value* other, TypeName type); + void VectorSub(Value* other, TypeName type, bool is_unsigned, bool saturate); void ByteSwap(); void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index b37f4cda4..9c96daa6b 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -423,8 +423,8 @@ typedef struct PPCContext_s { uint8_t* physical_membase; - // Keep the struct padded out to 64b total. - uint8_t _padding[8]; + // Value of last reserved load + uint64_t reserved_val; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index c0c067d31..f2fc1330f 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -2149,6 +2149,9 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) { case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_2); break; + case 4: + v = f.Unpack(v, PACK_TYPE_FLOAT16_3); + break; case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_4); break; diff --git a/src/xenia/cpu/ppc/ppc_emit_control.cc b/src/xenia/cpu/ppc/ppc_emit_control.cc index a44644193..0bd9cbd2e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_control.cc +++ b/src/xenia/cpu/ppc/ppc_emit_control.cc @@ -10,6 +10,7 @@ #include "xenia/cpu/ppc/ppc_emit-private.h" #include "xenia/base/assert.h" +#include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/ppc/ppc_frontend.h" #include "xenia/cpu/ppc/ppc_hir_builder.h" @@ -725,10 +726,14 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) { f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE)); if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). - f.CallExtern(f.builtins()->enter_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->enter_global_lock); + } } else { // Otherwise we are restoring interrupts (probably). 
- f.CallExtern(f.builtins()->leave_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->leave_global_lock); + } } return 0; } else { @@ -746,10 +751,14 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) { f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE)); if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). - f.CallExtern(f.builtins()->enter_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->enter_global_lock); + } } else { // Otherwise we are restoring interrupts (probably). - f.CallExtern(f.builtins()->leave_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->leave_global_lock); + } } return 0; } else { diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 8749deb9a..e9294048e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -658,6 +658,7 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE)); + f.StoreReserved(rt); f.StoreGPR(i.X.RT, rt); return 0; } @@ -682,6 +683,7 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); + f.StoreReserved(rt); f.StoreGPR(i.X.RT, rt); return 0; } @@ -700,11 +702,15 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) { // NOTE: we assume we are within a global lock. // As we have been exclusively executing this entire time, we assume that no // one else could have possibly touched the memory and must always succeed. + // We use atomic compare exchange here to support reserved load/store without + // being under the global lock (flag disable_global_lock - see mtmsr/mtmsrd). + // This will always succeed if under the global lock, however. Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT)); - f.Store(ea, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + Value* res = f.ByteSwap(f.LoadReserved()); + Value* v = f.AtomicCompareExchange(ea, res, rt); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); @@ -729,11 +735,15 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) { // NOTE: we assume we are within a global lock. // As we have been exclusively executing this entire time, we assume that no // one else could have possibly touched the memory and must always succeed. + // We use atomic compare exchange here to support reserved load/store without + // being under the global lock (flag disable_global_lock - see mtmsr/mtmsrd). + // This will always succeed if under the global lock, however. 
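
For context on the comment above: the reserved load records the value observed by lwarx/ldarx, and the conditional store succeeds only if memory still holds that value, which maps directly onto a host compare-exchange. A rough host-side sketch of that shape, using std::atomic and hypothetical helper names rather than Xenia's HIR builders (byte swapping of guest memory is ignored here):

#include <atomic>
#include <cstdint>

// Host-side model of the lwarx/stwcx. pair as emitted above: the "reservation"
// is just the value seen by the reserved load, and the conditional store is a
// single compare-exchange against it. Names here are illustrative only.
struct ReservationModel {
  uint32_t reserved_value = 0;

  uint32_t ReservedLoad(const std::atomic<uint32_t>& mem) {
    reserved_value = mem.load(std::memory_order_acquire);
    return reserved_value;
  }

  // Returns true (cr0.eq = 1) if the store-conditional succeeded.
  bool ConditionalStore(std::atomic<uint32_t>& mem, uint32_t new_value) {
    uint32_t expected = reserved_value;
    return mem.compare_exchange_strong(expected, new_value,
                                       std::memory_order_acq_rel);
  }
};
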
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); - f.Store(ea, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE)); + Value* v = f.AtomicCompareExchange(ea, res, rt); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index bce5d9d16..e18118d7b 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -511,6 +511,15 @@ void PPCHIRBuilder::StoreVR(uint32_t reg, Value* value) { trace_reg.value = value; } +void PPCHIRBuilder::StoreReserved(Value* val) { + assert_true(val->type == INT64_TYPE); + StoreContext(offsetof(PPCContext, reserved_val), val); +} + +Value* PPCHIRBuilder::LoadReserved() { + return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE); +} + } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.h b/src/xenia/cpu/ppc/ppc_hir_builder.h index ca9830799..8b88ae35d 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.h +++ b/src/xenia/cpu/ppc/ppc_hir_builder.h @@ -78,6 +78,9 @@ class PPCHIRBuilder : public hir::HIRBuilder { Value* LoadVR(uint32_t reg); void StoreVR(uint32_t reg, Value* value); + void StoreReserved(Value* val); + Value* LoadReserved(); + private: void AnnotateLabel(uint32_t address, Label* label); diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index ea54bd4fc..4d875a626 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -286,9 +286,6 @@ bool XexModule::Load(const std::string& name, const std::string& path, } // Setup memory protection. - // TODO: This introduces a load of constants into the JIT, and Xenia isn't - // quite set-up to handle constants yet... - /* auto sec_header = xex_security_info(); auto heap = memory()->LookupHeap(sec_header->load_address); auto page_size = heap->page_size(); @@ -311,7 +308,6 @@ bool XexModule::Load(const std::string& name, const std::string& path, page += desc.size; } - */ return true; } diff --git a/src/xenia/gpu/glsl_shader_translator.cc b/src/xenia/gpu/glsl_shader_translator.cc index 397dd3a63..3a891316d 100644 --- a/src/xenia/gpu/glsl_shader_translator.cc +++ b/src/xenia/gpu/glsl_shader_translator.cc @@ -535,6 +535,14 @@ void GlslShaderTranslator::ProcessVertexFetchInstruction( EmitSource("// "); instr.Disassemble(&source_); + if (instr.operands[0].storage_index != 0) { + // Unimplemented for now. + EmitUnimplementedTranslationError(); + EmitSourceDepth("pv.xyzw = vec4(0.0, 0.0, 0.0, 0.0);\n"); + EmitStoreVectorResult(instr.result); + return; + } + if (instr.is_predicated) { EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!'); Indent(); diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index e9db2fc31..2d7b935bb 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -1251,22 +1251,20 @@ pointer_result_t InterlockedPushEntrySList( assert_not_null(plist_ptr); assert_not_null(entry); - // Hold a global lock during this method. Once in the lock we assume we have - // exclusive access to the structure. 
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  new_hdr.depth = old_hdr.depth + 1;
-  new_hdr.sequence = old_hdr.sequence + 1;
+  uint32_t old_head = 0;
+  do {
+    old_hdr = *plist_ptr;
+    new_hdr.depth = old_hdr.depth + 1;
+    new_hdr.sequence = old_hdr.sequence + 1;
 
-  uint32_t old_head = old_hdr.next.next;
-  entry->next = old_hdr.next.next;
-  new_hdr.next.next = entry.guest_address();
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+    old_head = old_hdr.next.next;
+    entry->next = old_hdr.next.next;
+    new_hdr.next.next = entry.guest_address();
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return old_head;
 }
@@ -1276,28 +1274,24 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPushEntrySList,
 
 pointer_result_t InterlockedPopEntrySList(pointer_t<X_SLIST_HEADER> plist_ptr) {
   assert_not_null(plist_ptr);
 
-  // Hold a global lock during this method. Once in the lock we assume we have
-  // exclusive access to the structure.
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   uint32_t popped = 0;
-
-  alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
+  alignas(8) X_SLIST_HEADER old_hdr = {0};
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
-      old_hdr.next.next);
-  if (!old_hdr.next.next) {
-    return 0;
-  }
-  popped = old_hdr.next.next;
+  do {
+    old_hdr = *plist_ptr;
+    auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
+        old_hdr.next.next);
+    if (!old_hdr.next.next) {
+      return 0;
+    }
+    popped = old_hdr.next.next;
 
-  new_hdr.depth = old_hdr.depth - 1;
-  new_hdr.next.next = next->next;
-  new_hdr.sequence = old_hdr.sequence;
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+    new_hdr.depth = old_hdr.depth - 1;
+    new_hdr.next.next = next->next;
+    new_hdr.sequence = old_hdr.sequence;
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return popped;
 }
@@ -1307,20 +1301,18 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPopEntrySList,
 
 pointer_result_t InterlockedFlushSList(pointer_t<X_SLIST_HEADER> plist_ptr) {
   assert_not_null(plist_ptr);
 
-  // Hold a global lock during this method. Once in the lock we assume we have
-  // exclusive access to the structure.
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  uint32_t first = old_hdr.next.next;
-  new_hdr.next.next = 0;
-  new_hdr.depth = 0;
-  new_hdr.sequence = 0;
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+  uint32_t first = 0;
+  do {
+    old_hdr = *plist_ptr;
+    first = old_hdr.next.next;
+    new_hdr.next.next = 0;
+    new_hdr.depth = 0;
+    new_hdr.sequence = 0;
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return first;
 }
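
The SLIST rework above follows the usual lock-free header-swap shape: snapshot the 64-bit header, build the replacement, and retry the compare-exchange until it publishes. Below is a standalone sketch of that retry loop under a simplified, assumed header layout; it is illustrative C++ only, not Xenia's X_SLIST_HEADER or its atomic_cas helper.

#include <atomic>
#include <cstdint>
#include <cstring>

// Simplified 32-bit-guest-style list header packed into 64 bits so the whole
// header can be swapped with a single CAS (illustrative layout only).
struct SListHeader {
  uint32_t next;  // guest address of first entry (0 == empty)
  uint16_t depth;
  uint16_t sequence;
};
static_assert(sizeof(SListHeader) == 8, "header must fit in one CAS");

inline uint64_t Pack(const SListHeader& h) {
  uint64_t bits;
  std::memcpy(&bits, &h, sizeof(bits));
  return bits;
}

inline SListHeader Unpack(uint64_t bits) {
  SListHeader h;
  std::memcpy(&h, &bits, sizeof(h));
  return h;
}

// Pushes one entry and returns the previous head, mirroring the retry loop
// used by the push/pop/flush exports above.
inline uint32_t PushEntry(std::atomic<uint64_t>& header_bits,
                          uint32_t entry_address,
                          uint32_t* entry_next_field) {
  uint64_t old_bits = header_bits.load(std::memory_order_acquire);
  for (;;) {
    SListHeader old_hdr = Unpack(old_bits);
    SListHeader new_hdr = old_hdr;
    new_hdr.depth = static_cast<uint16_t>(old_hdr.depth + 1);
    new_hdr.sequence = static_cast<uint16_t>(old_hdr.sequence + 1);
    *entry_next_field = old_hdr.next;  // link the new entry to the old head
    new_hdr.next = entry_address;
    if (header_bits.compare_exchange_weak(old_bits, Pack(new_hdr),
                                          std::memory_order_acq_rel)) {
      return old_hdr.next;
    }
    // On failure old_bits now holds the refreshed header; retry.
  }
}
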