From b121f45c7f4bb3b3366703ab09b460370cfc43e5 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 10:46:27 -0600 Subject: [PATCH 01/31] [JIT] Assert that other is constant in set_from. --- src/xenia/cpu/hir/value.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index ff41edf3b..dcc95ca8c 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -170,6 +170,7 @@ class Value { constant.v128 = value; } void set_from(const Value* other) { + assert_true(other->IsConstant()); type = other->type; flags = other->flags; constant.v128 = other->constant.v128; From 5fbcb8991ed6a63b1ac32a65828729032cfee712 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 11:59:29 -0600 Subject: [PATCH 02/31] [JIT] Run the SimplificationPass/ConstantPropagationPass until there are no changes. --- src/xenia/cpu/compiler/compiler_passes.h | 2 + .../compiler/passes/conditional_group_pass.cc | 85 +++++++++++++++++ .../compiler/passes/conditional_group_pass.h | 45 +++++++++ .../passes/conditional_group_subpass.cc | 26 ++++++ .../passes/conditional_group_subpass.h | 47 ++++++++++ .../passes/constant_propagation_pass.cc | 92 ++++++++++++++++++- .../passes/constant_propagation_pass.h | 6 +- .../compiler/passes/simplification_pass.cc | 46 +++++++--- .../cpu/compiler/passes/simplification_pass.h | 16 ++-- src/xenia/cpu/ppc/ppc_translator.cc | 19 ++-- 10 files changed, 348 insertions(+), 36 deletions(-) create mode 100644 src/xenia/cpu/compiler/passes/conditional_group_pass.cc create mode 100644 src/xenia/cpu/compiler/passes/conditional_group_pass.h create mode 100644 src/xenia/cpu/compiler/passes/conditional_group_subpass.cc create mode 100644 src/xenia/cpu/compiler/passes/conditional_group_subpass.h diff --git a/src/xenia/cpu/compiler/compiler_passes.h b/src/xenia/cpu/compiler/compiler_passes.h index 6b81d1fb5..fc58ec710 100644 --- a/src/xenia/cpu/compiler/compiler_passes.h +++ 
b/src/xenia/cpu/compiler/compiler_passes.h @@ -10,6 +10,8 @@ #ifndef XENIA_CPU_COMPILER_COMPILER_PASSES_H_ #define XENIA_CPU_COMPILER_COMPILER_PASSES_H_ +#include "xenia/cpu/compiler/passes/conditional_group_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" #include "xenia/cpu/compiler/passes/constant_propagation_pass.h" #include "xenia/cpu/compiler/passes/context_promotion_pass.h" #include "xenia/cpu/compiler/passes/control_flow_analysis_pass.h" diff --git a/src/xenia/cpu/compiler/passes/conditional_group_pass.cc b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc new file mode 100644 index 000000000..ef84991e8 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc @@ -0,0 +1,85 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/compiler/passes/conditional_group_pass.h" + +#include + +#include "xenia/base/profiling.h" +#include "xenia/cpu/compiler/compiler.h" +#include "xenia/cpu/ppc/ppc_context.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +// TODO(benvanik): remove when enums redefined. 
+using namespace xe::cpu::hir; + +using xe::cpu::hir::Block; +using xe::cpu::hir::HIRBuilder; +using xe::cpu::hir::Instr; +using xe::cpu::hir::Value; + +ConditionalGroupPass::ConditionalGroupPass() : CompilerPass() {} + +ConditionalGroupPass::~ConditionalGroupPass() {} + +bool ConditionalGroupPass::Initialize(Compiler* compiler) { + if (!CompilerPass::Initialize(compiler)) { + return false; + } + + for (size_t i = 0; i < passes_.size(); ++i) { + auto& pass = passes_[i]; + if (!pass->Initialize(compiler)) { + return false; + } + } + + return true; +} + +bool ConditionalGroupPass::Run(HIRBuilder* builder) { + bool dirty; + int loops = 0; + do { + assert_true(loops < 20); // arbitrary number + dirty = false; + for (size_t i = 0; i < passes_.size(); ++i) { + scratch_arena()->Reset(); + auto& pass = passes_[i]; + auto subpass = dynamic_cast(pass.get()); + if (!subpass) { + if (!pass->Run(builder)) { + return false; + } + } else { + bool result = false; + if (!subpass->Run(builder, result)) { + return false; + } + dirty |= result; + } + } + loops++; + } while (dirty); + return true; +} + +void ConditionalGroupPass::AddPass(std::unique_ptr pass) { + passes_.push_back(std::move(pass)); +} + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/compiler/passes/conditional_group_pass.h b/src/xenia/cpu/compiler/passes/conditional_group_pass.h new file mode 100644 index 000000000..7421fe1b5 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_pass.h @@ -0,0 +1,45 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ +#define XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ + +#include +#include + +#include "xenia/base/platform.h" +#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +class ConditionalGroupPass : public CompilerPass { + public: + ConditionalGroupPass(); + virtual ~ConditionalGroupPass() override; + + bool Initialize(Compiler* compiler) override; + + bool Run(hir::HIRBuilder* builder) override; + + void AddPass(std::unique_ptr pass); + + private: + std::vector> passes_; +}; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_PASS_H_ diff --git a/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc b/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc new file mode 100644 index 000000000..39780e2f5 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_subpass.cc @@ -0,0 +1,26 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" + +#include "xenia/cpu/compiler/compiler.h" + +namespace xe { +namespace cpu { +namespace compiler { +namespace passes { + +ConditionalGroupSubpass::ConditionalGroupSubpass() : CompilerPass() {} + +ConditionalGroupSubpass::~ConditionalGroupSubpass() = default; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/compiler/passes/conditional_group_subpass.h b/src/xenia/cpu/compiler/passes/conditional_group_subpass.h new file mode 100644 index 000000000..f62c50ed3 --- /dev/null +++ b/src/xenia/cpu/compiler/passes/conditional_group_subpass.h @@ -0,0 +1,47 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2013 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ +#define XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ + +#include "xenia/base/arena.h" +#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/hir/hir_builder.h" + +namespace xe { +namespace cpu { +class Processor; +} // namespace cpu +} // namespace xe + +namespace xe { +namespace cpu { +namespace compiler { +class Compiler; +namespace passes { + +class ConditionalGroupSubpass : public CompilerPass { + public: + ConditionalGroupSubpass(); + virtual ~ConditionalGroupSubpass(); + + bool Run(hir::HIRBuilder* builder) override { + bool dummy; + return Run(builder, dummy); + } + + virtual bool Run(hir::HIRBuilder* builder, bool& result) = 0; +}; + +} // namespace passes +} // namespace compiler +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_COMPILER_PASSES_CONDITIONAL_GROUP_SUBPASS_H_ diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 3db8e99d6..3a399cefd 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -31,11 +31,12 @@ using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::TypeName; using xe::cpu::hir::Value; -ConstantPropagationPass::ConstantPropagationPass() : CompilerPass() {} +ConstantPropagationPass::ConstantPropagationPass() + : ConditionalGroupSubpass() {} ConstantPropagationPass::~ConstantPropagationPass() {} -bool ConstantPropagationPass::Run(HIRBuilder* builder) { +bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { // Once ContextPromotion has run there will likely be a whole slew of // constants that can be pushed through the function. 
// Example: @@ -63,6 +64,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { // v1 = 19 // v2 = 0 + result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; @@ -76,6 +78,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -86,6 +89,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -98,6 +102,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; case OPCODE_CALL_INDIRECT: @@ -109,6 +114,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } i->Replace(&OPCODE_CALL_info, i->flags); i->src1.symbol = function; + result = true; } break; case OPCODE_CALL_INDIRECT_TRUE: @@ -120,6 +126,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -132,6 +139,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; case OPCODE_BRANCH_FALSE: @@ -143,6 +151,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } else { i->Remove(); } + result = true; } break; @@ -152,6 +161,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Cast(target_type); i->Remove(); + result = true; } break; case OPCODE_CONVERT: @@ -160,6 +170,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Convert(target_type, RoundMode(i->flags)); i->Remove(); + result = true; } break; case OPCODE_ROUND: @@ -167,6 +178,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Round(RoundMode(i->flags)); i->Remove(); + result = true; } break; case OPCODE_ZERO_EXTEND: @@ -175,6 +187,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->ZeroExtend(target_type); i->Remove(); + result = true; } break; case 
OPCODE_SIGN_EXTEND: @@ -183,6 +196,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->SignExtend(target_type); i->Remove(); + result = true; } break; case OPCODE_TRUNCATE: @@ -191,6 +205,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Truncate(target_type); i->Remove(); + result = true; } break; @@ -210,6 +225,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_LOAD_MMIO_info, 0); i->src1.offset = reinterpret_cast(mmio_range); i->src2.offset = address; + result = true; } else { auto heap = memory->LookupHeap(address); uint32_t protect; @@ -222,18 +238,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case INT8_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT16_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT32_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case INT64_TYPE: v->set_constant(xe::load(host_addr)); i->Remove(); + result = true; break; case VEC128_TYPE: vec128_t val; @@ -241,6 +261,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { val.high = xe::load(host_addr + 8); v->set_constant(val); i->Remove(); + result = true; break; default: assert_unhandled_case(v->type); @@ -270,6 +291,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->src1.offset = reinterpret_cast(mmio_range); i->src2.offset = address; i->set_src3(value); + result = true; } } break; @@ -281,10 +303,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { auto src2 = i->src2.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src2); + result = true; } else if (i->src1.value->IsConstantFalse()) { auto src3 = i->src3.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src3); + result = true; } else if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { // TODO: Select @@ -305,6 +329,7 @@ bool 
ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; case OPCODE_IS_FALSE: @@ -315,6 +340,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; case OPCODE_IS_NAN: @@ -329,6 +355,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_constant(uint8_t(0)); } i->Remove(); + result = true; } break; @@ -338,6 +365,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantEQ(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_NE: @@ -345,6 +373,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantNE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SLT: @@ -352,6 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSLT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SLE: @@ -359,6 +389,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSLE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SGT: @@ -366,6 +397,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSGT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_SGE: @@ -373,6 +405,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantSGE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_ULT: @@ -380,6 +413,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = 
i->src1.value->IsConstantULT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_ULE: @@ -387,6 +421,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantULE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_UGT: @@ -394,6 +429,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantUGT(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; case OPCODE_COMPARE_UGE: @@ -401,6 +437,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { bool value = i->src1.value->IsConstantUGE(i->src2.value); i->dest->set_constant(uint8_t(value)); i->Remove(); + result = true; } break; @@ -413,6 +450,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Add(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_ADD_CARRY: @@ -433,6 +471,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->set_src1(ca); } } + result = true; } break; case OPCODE_SUB: @@ -440,6 +479,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sub(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_MUL: @@ -447,6 +487,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Mul(i->src2.value); i->Remove(); + result = true; } else if (i->src1.value->IsConstant() || i->src2.value->IsConstant()) { // Reorder the sources to make things simpler. 
@@ -460,12 +501,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { if (s2->type != VEC128_TYPE && s2->IsConstantOne()) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(s1); + result = true; } else if (s2->type == VEC128_TYPE) { auto& c = s2->constant; if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f && c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(s1); + result = true; } } } @@ -475,6 +518,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0); i->Remove(); + result = true; } break; case OPCODE_DIV: @@ -482,6 +526,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0); i->Remove(); + result = true; } else if (i->src2.value->IsConstant()) { // Division by one = no-op. Value* src1 = i->src1.value; @@ -489,12 +534,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->src2.value->IsConstantOne()) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } else if (i->src2.value->type == VEC128_TYPE) { auto& c = i->src2.value->constant; if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f && c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } } } @@ -505,6 +552,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value); i->Remove(); + result = true; } else { // Multiply part is constant. 
Value* mul = builder->AllocValue(); @@ -515,6 +563,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_ADD_info, 0); i->set_src1(mul); i->set_src2(add); + + result = true; } } break; @@ -525,6 +575,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value); i->Remove(); + result = true; } else { // Multiply part is constant. Value* mul = builder->AllocValue(); @@ -535,6 +586,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Replace(&OPCODE_SUB_info, 0); i->set_src1(mul); i->set_src2(add); + + result = true; } } break; @@ -543,6 +596,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Max(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_NEG: @@ -550,6 +604,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Neg(); i->Remove(); + result = true; } break; case OPCODE_ABS: @@ -557,6 +612,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Abs(); i->Remove(); + result = true; } break; case OPCODE_SQRT: @@ -564,6 +620,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sqrt(); i->Remove(); + result = true; } break; case OPCODE_RSQRT: @@ -571,6 +628,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->RSqrt(); i->Remove(); + result = true; } break; case OPCODE_RECIP: @@ -578,6 +636,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Recip(); i->Remove(); + result = true; } break; case OPCODE_AND: @@ -585,6 +644,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->And(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_OR: @@ -592,6 +652,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); 
v->Or(i->src2.value); i->Remove(); + result = true; } break; case OPCODE_XOR: @@ -599,11 +660,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Xor(i->src2.value); i->Remove(); + result = true; } else if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant() && i->src1.value == i->src2.value) { v->set_zero(v->type); i->Remove(); + result = true; } break; case OPCODE_NOT: @@ -611,6 +674,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Not(); i->Remove(); + result = true; } break; case OPCODE_SHL: @@ -618,10 +682,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Shl(i->src2.value); i->Remove(); + result = true; } else if (i->src2.value->IsConstantZero()) { auto src1 = i->src1.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } break; case OPCODE_SHR: @@ -629,10 +695,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Shr(i->src2.value); i->Remove(); + result = true; } else if (i->src2.value->IsConstantZero()) { auto src1 = i->src1.value; i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(src1); + result = true; } break; case OPCODE_SHA: @@ -640,6 +708,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->Sha(i->src2.value); i->Remove(); + result = true; } break; // TODO(benvanik): ROTATE_LEFT @@ -648,6 +717,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->ByteSwap(); i->Remove(); + result = true; } break; case OPCODE_CNTLZ: @@ -655,6 +725,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->CountLeadingZeros(i->src1.value); i->Remove(); + result = true; } break; // TODO(benvanik): INSERT/EXTRACT @@ -664,6 +735,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->Extract(i->src1.value, i->src2.value); i->Remove(); + 
result = true; } break; case OPCODE_SPLAT: @@ -671,6 +743,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_zero(v->type); v->Splat(i->src1.value); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_EQ: @@ -678,6 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_SGT: @@ -685,6 +759,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_SGE: @@ -692,6 +767,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_UGT: @@ -699,6 +775,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_COMPARE_UGE: @@ -706,6 +783,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_CONVERT_F2I: @@ -714,6 +792,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->VectorConvertF2I(i->src1.value, !!(i->flags & ARITHMETIC_UNSIGNED)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_CONVERT_I2F: @@ -722,6 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->VectorConvertI2F(i->src1.value, !!(i->flags & ARITHMETIC_UNSIGNED)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SHL: @@ -729,6 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorShl(i->src2.value, 
hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SHR: @@ -736,6 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorShr(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_ROTATE_LEFT: @@ -743,6 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->VectorRol(i->src2.value, hir::TypeName(i->flags)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_ADD: @@ -753,6 +836,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; case OPCODE_VECTOR_SUB: @@ -763,6 +847,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; @@ -771,6 +856,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->DotProduct3(i->src2.value); i->Remove(); + result = true; } break; @@ -779,6 +865,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { v->set_from(i->src1.value); v->DotProduct4(i->src2.value); i->Remove(); + result = true; } break; @@ -790,6 +877,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { !!(arith_flags & ARITHMETIC_UNSIGNED), !!(arith_flags & ARITHMETIC_SATURATE)); i->Remove(); + result = true; } break; diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.h b/src/xenia/cpu/compiler/passes/constant_propagation_pass.h index 021bdc981..08bd25b4a 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.h +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.h @@ -10,19 +10,19 @@ #ifndef XENIA_CPU_COMPILER_PASSES_CONSTANT_PROPAGATION_PASS_H_ #define XENIA_CPU_COMPILER_PASSES_CONSTANT_PROPAGATION_PASS_H_ -#include "xenia/cpu/compiler/compiler_pass.h" +#include 
"xenia/cpu/compiler/passes/conditional_group_subpass.h" namespace xe { namespace cpu { namespace compiler { namespace passes { -class ConstantPropagationPass : public CompilerPass { +class ConstantPropagationPass : public ConditionalGroupSubpass { public: ConstantPropagationPass(); ~ConstantPropagationPass() override; - bool Run(hir::HIRBuilder* builder) override; + bool Run(hir::HIRBuilder* builder, bool& result) override; private: }; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 3278ab7c6..3569887a4 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -23,17 +23,18 @@ using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; -SimplificationPass::SimplificationPass() : CompilerPass() {} +SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} SimplificationPass::~SimplificationPass() {} -bool SimplificationPass::Run(HIRBuilder* builder) { - EliminateConversions(builder); - SimplifyAssignments(builder); +bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { + result = false; + result |= EliminateConversions(builder); + result |= SimplifyAssignments(builder); return true; } -void SimplificationPass::EliminateConversions(HIRBuilder* builder) { +bool SimplificationPass::EliminateConversions(HIRBuilder* builder) { // First, we check for truncates/extensions that can be skipped. // This generates some assignments which then the second step will clean up. 
// Both zero/sign extends can be skipped: @@ -43,6 +44,7 @@ void SimplificationPass::EliminateConversions(HIRBuilder* builder) { // v1.i64 = zero/sign_extend v0.i32 (may be dead code removed later) // v2.i32 = v0.i32 + bool result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; @@ -51,20 +53,21 @@ void SimplificationPass::EliminateConversions(HIRBuilder* builder) { // back to definition). if (i->opcode == &OPCODE_TRUNCATE_info) { // Matches zero/sign_extend + truncate. - CheckTruncate(i); + result |= CheckTruncate(i); } else if (i->opcode == &OPCODE_BYTE_SWAP_info) { // Matches byte swap + byte swap. // This is pretty rare within the same basic block, but is in the // memcpy hot path and (probably) worth it. Maybe. - CheckByteSwap(i); + result |= CheckByteSwap(i); } i = i->next; } block = block->next; } + return result; } -void SimplificationPass::CheckTruncate(Instr* i) { +bool SimplificationPass::CheckTruncate(Instr* i) { // Walk backward up src's chain looking for an extend. We may have // assigns, so skip those. auto src = i->src1.value; @@ -80,6 +83,7 @@ void SimplificationPass::CheckTruncate(Instr* i) { // Types match, use original by turning this into an assign. i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } else if (def->opcode == &OPCODE_ZERO_EXTEND_info) { // Value comes from a zero extend. @@ -87,12 +91,14 @@ void SimplificationPass::CheckTruncate(Instr* i) { // Types match, use original by turning this into an assign. i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } } + return false; } -void SimplificationPass::CheckByteSwap(Instr* i) { +bool SimplificationPass::CheckByteSwap(Instr* i) { // Walk backward up src's chain looking for a byte swap. We may have // assigns, so skip those. auto src = i->src1.value; @@ -107,11 +113,13 @@ void SimplificationPass::CheckByteSwap(Instr* i) { // Types match, use original by turning this into an assign. 
i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(def->src1.value); + return true; } } + return false; } -void SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { +bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // Run over the instructions and rename assigned variables: // v1 = v0 // v2 = v1 @@ -129,27 +137,35 @@ void SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // of that instr. Because we may have chains, we do this recursively until // we find a non-assign def. + bool result = false; auto block = builder->first_block(); while (block) { auto i = block->instr_head; while (i) { uint32_t signature = i->opcode->signature; if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { - i->set_src1(CheckValue(i->src1.value)); + bool modified = false; + i->set_src1(CheckValue(i->src1.value, modified)); + result |= modified; } if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { - i->set_src2(CheckValue(i->src2.value)); + bool modified = false; + i->set_src2(CheckValue(i->src2.value, modified)); + result |= modified; } if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { - i->set_src3(CheckValue(i->src3.value)); + bool modified = false; + i->set_src3(CheckValue(i->src3.value, modified)); + result |= modified; } i = i->next; } block = block->next; } + return result; } -Value* SimplificationPass::CheckValue(Value* value) { +Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { // Value comes from an assignment - recursively find if it comes from @@ -162,8 +178,10 @@ Value* SimplificationPass::CheckValue(Value* value) { } replacement = def->src1.value; } + result = true; return replacement; } + result = false; return value; } diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 70275f8b4..2ba6efad7 100644 --- 
a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -10,27 +10,27 @@ #ifndef XENIA_CPU_COMPILER_PASSES_SIMPLIFICATION_PASS_H_ #define XENIA_CPU_COMPILER_PASSES_SIMPLIFICATION_PASS_H_ -#include "xenia/cpu/compiler/compiler_pass.h" +#include "xenia/cpu/compiler/passes/conditional_group_subpass.h" namespace xe { namespace cpu { namespace compiler { namespace passes { -class SimplificationPass : public CompilerPass { +class SimplificationPass : public ConditionalGroupSubpass { public: SimplificationPass(); ~SimplificationPass() override; - bool Run(hir::HIRBuilder* builder) override; + bool Run(hir::HIRBuilder* builder, bool& result) override; private: - void EliminateConversions(hir::HIRBuilder* builder); - void CheckTruncate(hir::Instr* i); - void CheckByteSwap(hir::Instr* i); + bool EliminateConversions(hir::HIRBuilder* builder); + bool CheckTruncate(hir::Instr* i); + bool CheckByteSwap(hir::Instr* i); - void SimplifyAssignments(hir::HIRBuilder* builder); - hir::Value* CheckValue(hir::Value* value); + bool SimplifyAssignments(hir::HIRBuilder* builder); + hir::Value* CheckValue(hir::Value* value, bool& result); }; } // namespace passes diff --git a/src/xenia/cpu/ppc/ppc_translator.cc b/src/xenia/cpu/ppc/ppc_translator.cc index ec1768163..d408f75b1 100644 --- a/src/xenia/cpu/ppc/ppc_translator.cc +++ b/src/xenia/cpu/ppc/ppc_translator.cc @@ -53,15 +53,16 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) { if (validate) compiler_->AddPass(std::make_unique()); compiler_->AddPass(std::make_unique()); if (validate) compiler_->AddPass(std::make_unique()); - // TODO(gibbed): loop until these passes stop making changes? 
- for (int i = 0; i < 5; ++i) { - compiler_->AddPass(std::make_unique()); - if (validate) - compiler_->AddPass(std::make_unique()); - compiler_->AddPass(std::make_unique()); - if (validate) - compiler_->AddPass(std::make_unique()); - } + + // Grouped simplification + constant propagation. + // Loops until no changes are made. + auto sap = std::make_unique(); + sap->AddPass(std::make_unique()); + if (validate) sap->AddPass(std::make_unique()); + sap->AddPass(std::make_unique()); + if (validate) sap->AddPass(std::make_unique()); + compiler_->AddPass(std::move(sap)); + if (backend->machine_info()->supports_extended_load_store) { // Backend supports the advanced LOAD/STORE instructions. // These will save us a lot of HIR opcodes. From 4c53bc5c9a41a6a90168ecffe39b90f072275836 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 12:02:49 -0600 Subject: [PATCH 03/31] [x64] Fix Travis whining. --- src/xenia/cpu/backend/x64/x64_sequences.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5c2118fc7..48f5a86d1 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -5695,8 +5695,7 @@ struct VECTOR_SHL_V128 if (i.src1.is_constant) { src1 = e.xmm2; e.LoadConstantXmm(src1, i.src1.constant()); - } - else { + } else { src1 = i.src1; } @@ -5891,8 +5890,7 @@ struct VECTOR_SHR_V128 if (i.src1.is_constant) { src1 = e.xmm2; e.LoadConstantXmm(src1, i.src1.constant()); - } - else { + } else { src1 = i.src1; } From 8ed283747ca1dcc633ac42259fadd6b570be5a9d Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 12:21:00 -0600 Subject: [PATCH 04/31] Update lurker blurb. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6cdc19d9e..7c45a27ac 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Xenia is an experimental emulator for the Xbox 360. 
For more information see the [main xenia website](https://xenia.jp/). Come chat with us about **emulator-related topics** on [Discord](https://discord.gg/Q9mxZf9). -For developer chat join `#dev` but stay on topic. Lurking is fine. +For developer chat join `#dev` but stay on topic. Lurking is not only fine, but encouraged! Please check the [frequently asked questions](https://xenia.jp/faq/) page before asking questions. We've got jobs/lives/etc, so don't expect instant answers. From 13558863d01f944c660b99599630d86b6e7f023a Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 12:21:21 -0600 Subject: [PATCH 05/31] Add Patreon link. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 7c45a27ac..61377f5d1 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ Xenia - Xbox 360 Emulator Research Project Xenia is an experimental emulator for the Xbox 360. For more information see the [main xenia website](https://xenia.jp/). +**Interested in supporting the core contributors? +[Xenia Project on Patreon](https://www.patreon.com/xenia_project).** + Come chat with us about **emulator-related topics** on [Discord](https://discord.gg/Q9mxZf9). For developer chat join `#dev` but stay on topic. Lurking is not only fine, but encouraged! Please check the [frequently asked questions](https://xenia.jp/faq/) page before From fa023615f6842f7c7d65abb434674819a1ef3bb2 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 12:54:14 -0600 Subject: [PATCH 06/31] Add icon license. 
--- assets/icon/LICENSE | 428 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 428 insertions(+) create mode 100644 assets/icon/LICENSE diff --git a/assets/icon/LICENSE b/assets/icon/LICENSE new file mode 100644 index 000000000..fe8dbc50f --- /dev/null +++ b/assets/icon/LICENSE @@ -0,0 +1,428 @@ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. 
Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. 
Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. 
The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. 
For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. 
Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. 
indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. 
if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. 
The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. 
To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
+ From 021af14a6e6d309dee0f0450c404c034cbe30697 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 14:06:52 -0600 Subject: [PATCH 07/31] Fix 16x16 transparency in icon.ico. --- assets/icon/icon.ico | Bin 101581 -> 104957 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/assets/icon/icon.ico b/assets/icon/icon.ico index 3f0fd9aebd95a25d363f6ff442087378dd6b1f6a..26cd5ad3387f13c9cd731e92f1198b60f2820a19 100644 GIT binary patch delta 3410 zcmb_eYfzL`82&a1rp6hau^gwgtgL7zs~`O%Q?UTi6{H zSP&4o3M~oY51eS!N&>R2Z3+w*?#EsLxUmOFFyobR0XdEU!; z&wEa9b>#J(ktNA7Z(?c7~c&w@{i|BN=n|xnvUm#iswh~-@pGUwOV~CAt9lm zr>E!1#>U2F4Gj&i^!N8K!kj-gHuij9U*F?F$bCz|^q$RTYpt%XUO6x@@K{+{**iL& z?$`YMe1q5Pwd(cy!K$jNH<_ZQrsfTnEiEm5H%Jk&5=KO#MWm#plv*s7jvYI8yqK1j z<^%K~(_>tOwK8_ZU@-KjrluCRwY5D9*>)Z)c&2=Q)!^XZqj7O@Jy5g@EBCdww!VPx z?_DleBO78g8h?n1i6L%0&qlFt7$0b9X<5P;tJS)jzu9PJjW%I6>h$T;3yvN=x)98t z`~Ch`v$M0;z||p-$79dQ$ytra{i!f3J?1X*T3%kB6S7Bm+}zx}lsSfmh8|%)aNo;X z=1Ul<91ced;(UQU@9pha74BkSnw>|7kGob?N^Xf8*eV z!BV}fc@$1_&aFefg`P@qo#7J;Zn?Tgui`?6xPOCV5!m&ShJLTq(+Svo9POtf9dTAG z+WF}0$<+qe592shrpEF*?_EcCcXza0YPoPnaY}KKQ_$1xIUTdIvets*9%hrTzKErK z=Y3I1w$c=(~9$l^K3Gk~JF{JY)m4=|qa`p#VVkrGI!{(jp) zbS#RZF115dNZXYIi{?$@vLC)COe(jYN2(jU3|aE zaxRyh2EaSgV4`(t&dFG5{HddOkJ3-ymfGpcHWO7#F3SGUMd$ryy7Y;e-07~#;2W?C z2M!$_*lZ)NKRxI>5Sa~`;jz#^aY-uU94Ou6b5M7g9RN*RpMYuSO4IUR=1p&`r zXfRU^WKo7>B9>uqd0uHhmfOf~w}mRAJUB26PL|<__HD6YJ0vG*wOZnkg*M*H8@=Oi oR?2dCg2;`xE1cZ4=cF*8N}ewlDY74H8)pWw|uz`j!%L z#3r#%=E1EXqcj75rL{zEyeo05qTQC;NfoQq4SVc&j+9;7$E<=^y%AT!PYSoA4LrdP zo}h_0B67;=z&m_ed^j`?RSMPpf@+3?(Xz_Q2}Urmz3j^dxM4drsVhq`m}o`^vzO~Zk;C$^={_*DoV-SM%uw$A7b zZe&8yRO-32r0EwnX<786mgsLyugAv+rC60RP@=1gi_3k%Y*1mR5_Wj{N@*$F)hBjq z)Jpwy)E)As*h;x6LCXv+dnn|@y13Qkd{XAjK~@E6-dJJE?y^~Rl|I$c@XD379oT2a zCR?_=-}fgZ$?p&fh4G0+{>9!0_?UAuui zf)|@=*UC>7k7kyXSZWNe!in#an9$p}an6|G(|&?Lh(Bg_ptS~}W$Ie#J8XbP+mE>e z4Z4b{FBv&GIk!Va)Hrpo!){%R8Esd&l=Dgzez@1H=aRk=KjFFE+T2O`a-XYa4o$%w 
z;)H`ZT)zx9f&DZEUFI(aM;L4lJGpMy%&0=sVOOkwLtnTHr(;Tj+)1piZ`+ZY`K9-f0@z$)CGw6N%Sm327S{}iVnq@wD;=19@ zj$Uk-3l4`48Y)=`MYKI#4;+pDIVib$LhqTJ&KX(f-HyI#J28WM_Uv(-_14@vX45JQ z*VN-L_4%}jLl-I|#R~Klq`4c{uQxd!=3z-jodRYR9x4aDL!F74sc}7)@*G)Nk z&F2GwZC6!?YinzH@v^j5zG4$5xdnejlA=hJMP+3bO{l*J_GwHm3sSpWwf9Ed3-1(0 z+lylwxymZW7D~Kit0Ko9o_@uCp}Dg{{|Xh38}IGg7`b72^Kl+;ByRg3EZ^1E38L{xyqf%Cv{RJ zQQ{k6G=6Pv^P%8i*L;VvlaS_JX&-5=!-{X3*?5bhqM~%C%c$l>D)HLW)(5J^D^wF= z?zKqC2n&IfJCJC2N>BhHuM!5S+dMuw1qW@Z91jh1{0@tD53#}O|tFYp<_ zK*=Dou)5BWV3@R+E@G8ffeydBFi`0qXOwffjy=F4g}f S`oDHRbWcSAI}4M9+4T>FYIaNj From 7226c9e047b186721ec740e229f78de46910d4c5 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 15:32:14 -0600 Subject: [PATCH 08/31] Update mspack, add to premake, and fix missing license. Fixes #1252. --- premake5.lua | 1 + src/xenia/cpu/xex_module.cc | 3 +- third_party/mspack.lua | 33 + third_party/mspack/COPYING.LIB | 504 ++++++++ third_party/mspack/config.h | 114 ++ third_party/mspack/lzx.h | 166 ++- third_party/mspack/lzxd.c | 1033 ++++++++-------- third_party/mspack/mspack.h | 2100 +++++++++++++++++++++++++++++++- third_party/mspack/readbits.h | 207 ++++ third_party/mspack/readhuff.h | 172 +++ third_party/mspack/system.c | 242 ++++ third_party/mspack/system.h | 113 ++ 12 files changed, 4093 insertions(+), 595 deletions(-) create mode 100644 third_party/mspack.lua create mode 100644 third_party/mspack/COPYING.LIB create mode 100644 third_party/mspack/config.h create mode 100644 third_party/mspack/readbits.h create mode 100644 third_party/mspack/readhuff.h create mode 100644 third_party/mspack/system.c create mode 100644 third_party/mspack/system.h diff --git a/premake5.lua b/premake5.lua index 30d025979..b8b1a07b5 100644 --- a/premake5.lua +++ b/premake5.lua @@ -228,6 +228,7 @@ solution("xenia") include("third_party/glslang-spirv.lua") include("third_party/imgui.lua") include("third_party/libav.lua") + include("third_party/mspack.lua") include("third_party/snappy.lua") 
include("third_party/spirv-tools.lua") include("third_party/volk.lua") diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 8569e7a9a..8b6b5e173 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -25,7 +25,6 @@ #include "third_party/crypto/rijndael-alg-fst.c" #include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/mspack/lzx.h" -#include "third_party/mspack/lzxd.c" #include "third_party/mspack/mspack.h" #include "third_party/pe/pe_image.h" @@ -120,7 +119,7 @@ int lzx_decompress(const void* lzx_data, size_t lzx_len, void* dest, mspack_memory_file* lzxdst = mspack_memory_open(sys, dest, dest_len); lzxd_stream* lzxd = lzxd_init(sys, (struct mspack_file*)lzxsrc, (struct mspack_file*)lzxdst, - window_bits, 0, 0x8000, (off_t)dest_len); + window_bits, 0, 0x8000, (off_t)dest_len, 0); if (lzxd) { if (window_data) { diff --git a/third_party/mspack.lua b/third_party/mspack.lua new file mode 100644 index 000000000..85b6bc08f --- /dev/null +++ b/third_party/mspack.lua @@ -0,0 +1,33 @@ +group("third_party") +project("mspack") + uuid("0881692A-75A1-4E7B-87D8-BB9108CEDEA4") + kind("StaticLib") + language("C") + + defines({ + "_LIB", + "HAVE_CONFIG_H", + }) + removedefines({ + "_UNICODE", + "UNICODE", + }) + includedirs({ + "mspack", + }) + files({ + "mspack/lzx.h", + "mspack/lzxd.c", + "mspack/mspack.h", + "mspack/readbits.h", + "mspack/readhuff.h", + "mspack/system.c", + "mspack/system.h", + }) + + filter("platforms:Windows") + defines({ + }) + filter("platforms:Linux") + defines({ + }) diff --git a/third_party/mspack/COPYING.LIB b/third_party/mspack/COPYING.LIB new file mode 100644 index 000000000..b1e3f5a26 --- /dev/null +++ b/third_party/mspack/COPYING.LIB @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. 
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. 
If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. 
The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. 
+ + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) 
+ +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/third_party/mspack/config.h b/third_party/mspack/config.h new file mode 100644 index 000000000..c4d21f9f7 --- /dev/null +++ b/third_party/mspack/config.h @@ -0,0 +1,114 @@ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Turn debugging mode on? */ +#undef DEBUG + +/* Define to 1 if you have the header file. */ +#undef HAVE_DLFCN_H + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +#undef HAVE_FSEEKO + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the `mkdir' function. */ +#undef HAVE_MKDIR + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the `towlower' function. */ +#undef HAVE_TOWLOWER + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if you have the `_mkdir' function. */ +#undef HAVE__MKDIR + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#undef LT_OBJDIR + +/* Define if mkdir takes only one argument. */ +#undef MKDIR_TAKES_ONE_ARG + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* The size of `off_t', as computed by sizeof. */ +#undef SIZEOF_OFF_T + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Version number of package */ +#undef VERSION + +/* Enable large inode numbers on Mac OS X 10.5. */ +#ifndef _DARWIN_USE_64_BIT_INODE +# define _DARWIN_USE_64_BIT_INODE 1 +#endif + +/* Number of bits in a file offset, on hosts where this is settable. 
*/ +#undef _FILE_OFFSET_BITS + +/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +#undef _LARGEFILE_SOURCE + +/* Define for large files, on AIX-style hosts. */ +#undef _LARGE_FILES + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif + +/* Define to `int' if does not define. */ +#undef mode_t + +/* Define to `long int' if does not define. */ +#undef off_t + +/* Define to `unsigned int' if does not define. */ +#undef size_t diff --git a/third_party/mspack/lzx.h b/third_party/mspack/lzx.h index e9eda0fbb..a6152f622 100644 --- a/third_party/mspack/lzx.h +++ b/third_party/mspack/lzx.h @@ -1,5 +1,5 @@ /* This file is part of libmspack. - * (C) 2003-2004 Stuart Caie. + * (C) 2003-2013 Stuart Caie. * * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted * by Microsoft Corporation. 
@@ -13,6 +13,10 @@ #ifndef MSPACK_LZX_H #define MSPACK_LZX_H 1 +#ifdef __cplusplus +extern "C" { +#endif + /* LZX compression / decompression definitions */ /* some constants defined by the LZX specification */ @@ -31,7 +35,7 @@ /* LZX huffman defines: tweak tablebits as desired */ #define LZX_PRETREE_MAXSYMBOLS (LZX_PRETREE_NUM_ELEMENTS) #define LZX_PRETREE_TABLEBITS (6) -#define LZX_MAINTREE_MAXSYMBOLS (LZX_NUM_CHARS + 50*8) +#define LZX_MAINTREE_MAXSYMBOLS (LZX_NUM_CHARS + 290*8) #define LZX_MAINTREE_TABLEBITS (12) #define LZX_LENGTH_MAXSYMBOLS (LZX_NUM_SECONDARY_LENGTHS+1) #define LZX_LENGTH_TABLEBITS (12) @@ -51,6 +55,8 @@ struct lzxd_stream { unsigned char *window; /* decoding window */ unsigned int window_size; /* window size */ + unsigned int ref_data_size; /* LZX DELTA reference data size */ + unsigned int num_offsets; /* number of match_offset entries in table */ unsigned int window_posn; /* decompression offset within window */ unsigned int frame_posn; /* current frame offset within in window */ unsigned int frame; /* the number of 32kb frames processed */ @@ -66,8 +72,8 @@ struct lzxd_stream { unsigned char intel_started; /* has intel E8 decoding started? */ unsigned char block_type; /* type of the current block */ unsigned char header_read; /* have we started decoding at all yet? */ - unsigned char posn_slots; /* how many posn slots in stream? */ unsigned char input_end; /* have we reached the end of input? */ + unsigned char is_delta; /* does stream follow LZX DELTA spec? 
*/ int error; @@ -83,85 +89,133 @@ struct lzxd_stream { /* huffman decoding tables */ unsigned short PRETREE_table [(1 << LZX_PRETREE_TABLEBITS) + - (LZX_PRETREE_MAXSYMBOLS * 2)]; + (LZX_PRETREE_MAXSYMBOLS * 2)]; unsigned short MAINTREE_table[(1 << LZX_MAINTREE_TABLEBITS) + - (LZX_MAINTREE_MAXSYMBOLS * 2)]; + (LZX_MAINTREE_MAXSYMBOLS * 2)]; unsigned short LENGTH_table [(1 << LZX_LENGTH_TABLEBITS) + - (LZX_LENGTH_MAXSYMBOLS * 2)]; + (LZX_LENGTH_MAXSYMBOLS * 2)]; unsigned short ALIGNED_table [(1 << LZX_ALIGNED_TABLEBITS) + - (LZX_ALIGNED_MAXSYMBOLS * 2)]; + (LZX_ALIGNED_MAXSYMBOLS * 2)]; + unsigned char LENGTH_empty; /* this is used purely for doing the intel E8 transform */ unsigned char e8_buf[LZX_FRAME_SIZE]; }; -/* allocates LZX decompression state for decoding the given stream. +/** + * Allocates and initialises LZX decompression state for decoding an LZX + * stream. * - * - returns NULL if window_bits is outwith the range 15 to 21 (inclusive). + * This routine uses system->alloc() to allocate memory. If memory + * allocation fails, or the parameters to this function are invalid, + * NULL is returned. * - * - uses system->alloc() to allocate memory - * - * - returns NULL if not enough memory - * - * - window_bits is the size of the LZX window, from 32Kb (15) to 2Mb (21). - * - * - reset_interval is how often the bitstream is reset, measured in - * multiples of 32Kb bytes output. For CAB LZX streams, this is always 0 - * (does not occur). - * - * - input_buffer_size is how many bytes to use as an input bitstream buffer - * - * - output_length is the length in bytes of the entirely decompressed - * output stream, if known in advance. It is used to correctly perform - * the Intel E8 transformation, which must stop 6 bytes before the very - * end of the decompressed stream. It is not otherwise used or adhered - * to. If the full decompressed length is known in advance, set it here. 
- * If it is NOT known, use the value 0, and call lzxd_set_output_length() - * once it is known. If never set, 4 of the final 6 bytes of the output - * stream may be incorrect. + * @param system an mspack_system structure used to read from + * the input stream and write to the output + * stream, also to allocate and free memory. + * @param input an input stream with the LZX data. + * @param output an output stream to write the decoded data to. + * @param window_bits the size of the decoding window, which must be + * between 15 and 21 inclusive for regular LZX + * data, or between 17 and 25 inclusive for + * LZX DELTA data. + * @param reset_interval the interval at which the LZX bitstream is + * reset, in multiples of LZX frames (32678 + * bytes), e.g. a value of 2 indicates the input + * stream resets after every 65536 output bytes. + * A value of 0 indicates that the bitstream never + * resets, such as in CAB LZX streams. + * @param input_buffer_size the number of bytes to use as an input + * bitstream buffer. + * @param output_length the length in bytes of the entirely + * decompressed output stream, if known in + * advance. It is used to correctly perform the + * Intel E8 transformation, which must stop 6 + * bytes before the very end of the + * decompressed stream. It is not otherwise used + * or adhered to. If the full decompressed + * length is known in advance, set it here. + * If it is NOT known, use the value 0, and call + * lzxd_set_output_length() once it is + * known. If never set, 4 of the final 6 bytes + * of the output stream may be incorrect. + * @param is_delta should be zero for all regular LZX data, + * non-zero for LZX DELTA encoded data. + * @return a pointer to an initialised lzxd_stream structure, or NULL if + * there was not enough memory or parameters to the function were wrong. 
*/ extern struct lzxd_stream *lzxd_init(struct mspack_system *system, - struct mspack_file *input, - struct mspack_file *output, - int window_bits, - int reset_interval, - int input_buffer_size, - off_t output_length); + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length, + char is_delta); /* see description of output_length in lzxd_init() */ extern void lzxd_set_output_length(struct lzxd_stream *lzx, - off_t output_length); + off_t output_length); -/* decompresses, or decompresses more of, an LZX stream. +/** + * Reads LZX DELTA reference data into the window and allows + * lzxd_decompress() to reference it. * - * - out_bytes of data will be decompressed and the function will return - * with an MSPACK_ERR_OK return code. + * Call this before the first call to lzxd_decompress(). + + * @param lzx the LZX stream to apply this reference data to + * @param system an mspack_system implementation to use with the + * input param. Only read() will be called. + * @param input an input file handle to read reference data using + * system->read(). + * @param length the length of the reference data. Cannot be longer + * than the LZX window size. + * @return an error code, or MSPACK_ERR_OK if successful + */ +extern int lzxd_set_reference_data(struct lzxd_stream *lzx, + struct mspack_system *system, + struct mspack_file *input, + unsigned int length); + +/** + * Decompresses entire or partial LZX streams. * - * - decompressing will stop as soon as out_bytes is reached. if the true - * amount of bytes decoded spills over that amount, they will be kept for - * a later invocation of lzxd_decompress(). + * The number of bytes of data that should be decompressed is given as the + * out_bytes parameter. If more bytes are decoded than are needed, they + * will be kept over for a later invocation. 
* - * - the output bytes will be passed to the system->write() function given in - * lzxd_init(), using the output file handle given in lzxd_init(). More - * than one call may be made to system->write(). + * The output bytes will be passed to the system->write() function given in + * lzxd_init(), using the output file handle given in lzxd_init(). More than + * one call may be made to system->write(). + + * Input bytes will be read in as necessary using the system->read() + * function given in lzxd_init(), using the input file handle given in + * lzxd_init(). This will continue until system->read() returns 0 bytes, + * or an error. Errors will be passed out of the function as + * MSPACK_ERR_READ errors. Input streams should convey an "end of input + * stream" by refusing to supply all the bytes that LZX asks for when they + * reach the end of the stream, rather than return an error code. * - * - LZX will read input bytes as necessary using the system->read() function - * given in lzxd_init(), using the input file handle given in lzxd_init(). - * This will continue until system->read() returns 0 bytes, or an error. - * input streams should convey an "end of input stream" by refusing to - * supply all the bytes that LZX asks for when they reach the end of the - * stream, rather than return an error code. + * If any error code other than MSPACK_ERR_OK is returned, the stream + * should be considered unusable and lzxd_decompress() should not be + * called again on this stream. * - * - if an error code other than MSPACK_ERR_OK is returned, the stream should - * be considered unusable and lzxd_decompress() should not be called again - * on this stream. + * @param lzx LZX decompression state, as allocated by lzxd_init(). + * @param out_bytes the number of bytes of data to decompress. 
+ * @return an error code, or MSPACK_ERR_OK if successful */ extern int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes); -/* frees all state associated with an LZX data stream +/** + * Frees all state associated with an LZX data stream. This will call + * system->free() using the system pointer given in lzxd_init(). * - * - calls system->free() using the system pointer given in lzxd_init() + * @param lzx LZX decompression state to free. */ void lzxd_free(struct lzxd_stream *lzx); +#ifdef __cplusplus +} +#endif + #endif diff --git a/third_party/mspack/lzxd.c b/third_party/mspack/lzxd.c index 2fdf23e80..6cc33df08 100644 --- a/third_party/mspack/lzxd.c +++ b/third_party/mspack/lzxd.c @@ -1,5 +1,5 @@ /* This file is part of libmspack. - * (C) 2003-2004 Stuart Caie. + * (C) 2003-2013 Stuart Caie. * * The LZX method was created by Jonathan Forbes and Tomi Poutanen, adapted * by Microsoft Corporation. @@ -12,11 +12,11 @@ /* LZX decompression implementation */ -#include "mspack.h" -#include "lzx.h" +#include +#include -/* Microsoft's LZX document and their implementation of the - * com.ms.util.cab Java package do not concur. +/* Microsoft's LZX document (in cab-sdk.exe) and their implementation + * of the com.ms.util.cab Java package do not concur. * * In the LZX document, there is a table showing the correlation between * window size and the number of position slots. It states that the 1MB @@ -58,240 +58,85 @@ * least one element. However, many CAB files contain blocks where the * length tree is completely empty (because there are no matches), and * this is expected to succeed. + * + * The errors in LZX documentation appear have been corrected in the + * new documentation for the LZX DELTA format. + * + * http://msdn.microsoft.com/en-us/library/cc483133.aspx + * + * However, this is a different format, an extension of regular LZX. + * I have noticed the following differences, there may be more: + * + * The maximum window size has increased from 2MB to 32MB. 
This also + * increases the maximum number of position slots, etc. + * + * If the match length is 257 (the maximum possible), this signals + * a further length decoding step, that allows for matches up to + * 33024 bytes long. + * + * The format now allows for "reference data", supplied by the caller. + * If match offsets go further back than the number of bytes + * decompressed so far, that is them accessing the reference data. */ - -/* LZX decompressor input macros - * - * STORE_BITS stores bitstream state in lzxd_stream structure - * RESTORE_BITS restores bitstream state from lzxd_stream structure - * READ_BITS(var,n) takes N bits from the buffer and puts them in var - * ENSURE_BITS(n) ensures there are at least N bits in the bit buffer. - * PEEK_BITS(n) extracts without removing N bits from the bit buffer - * REMOVE_BITS(n) removes N bits from the bit buffer - * - * These bit access routines work by using the area beyond the MSB and the - * LSB as a free source of zeroes when shifting. This avoids having to - * mask any bits. So we have to know the bit width of the bit buffer - * variable. - * - * The bit buffer datatype should be at least 32 bits wide: it must be - * possible to ENSURE_BITS(16), so it must be possible to add 16 new bits - * to the bit buffer when the bit buffer already has 1 to 15 bits left. 
- */ - -#include -#ifndef CHAR_BIT -# define CHAR_BIT (8) -#endif -#define BITBUF_WIDTH (sizeof(bit_buffer) * CHAR_BIT) - -#ifdef LZXDEBUG -# include -# define D(x) do { printf("%s:%d (%s) ",__FILE__, __LINE__, __FUNCTION__); \ - printf x ; fputc('\n', stdout); fflush(stdout);} while (0); -#else -# define D(x) -#endif - -#define STORE_BITS do { \ - lzx->i_ptr = i_ptr; \ - lzx->i_end = i_end; \ - lzx->bit_buffer = bit_buffer; \ - lzx->bits_left = bits_left; \ +/* import bit-reading macros and code */ +#define BITS_TYPE struct lzxd_stream +#define BITS_VAR lzx +#define BITS_ORDER_MSB +#define READ_BYTES do { \ + unsigned char b0, b1; \ + READ_IF_NEEDED; b0 = *i_ptr++; \ + READ_IF_NEEDED; b1 = *i_ptr++; \ + INJECT_BITS((b1 << 8) | b0, 16); \ } while (0) +#include -#define RESTORE_BITS do { \ - i_ptr = lzx->i_ptr; \ - i_end = lzx->i_end; \ - bit_buffer = lzx->bit_buffer; \ - bits_left = lzx->bits_left; \ -} while (0) - -#define ENSURE_BITS(nbits) \ - while (bits_left < (nbits)) { \ - if (i_ptr >= i_end) { \ - if (lzxd_read_input(lzx)) return lzx->error; \ - i_ptr = lzx->i_ptr; \ - i_end = lzx->i_end; \ - } \ - bit_buffer |= ((i_ptr[1] << 8) | i_ptr[0]) \ - << (BITBUF_WIDTH - 16 - bits_left); \ - bits_left += 16; \ - i_ptr += 2; \ - } - -#define PEEK_BITS(nbits) (bit_buffer >> (BITBUF_WIDTH - (nbits))) - -#define REMOVE_BITS(nbits) ((bit_buffer <<= (nbits)), (bits_left -= (nbits))) - -#define READ_BITS(val, nbits) do { \ - ENSURE_BITS(nbits); \ - (val) = PEEK_BITS(nbits); \ - REMOVE_BITS(nbits); \ -} while (0) - -static int lzxd_read_input(struct lzxd_stream *lzx) { - int read = lzx->sys->read(lzx->input, &lzx->inbuf[0], (int)lzx->inbuf_size); - if (read < 0) return lzx->error = MSPACK_ERR_READ; - - /* huff decode's ENSURE_BYTES(16) might overrun the input stream, even - * if those bits aren't used, so fake 2 more bytes */ - if (read == 0) { - if (lzx->input_end) { - D(("out of input bytes")) - return lzx->error = MSPACK_ERR_READ; - } - else { - read = 2; - 
lzx->inbuf[0] = lzx->inbuf[1] = 0; - lzx->input_end = 1; - } - } - - lzx->i_ptr = &lzx->inbuf[0]; - lzx->i_end = &lzx->inbuf[read]; - - return MSPACK_ERR_OK; -} - -/* Huffman decoding macros */ - -/* READ_HUFFSYM(tablename, var) decodes one huffman symbol from the - * bitstream using the stated table and puts it in var. - */ -#define READ_HUFFSYM(tbl, var) do { \ - /* huffman symbols can be up to 16 bits long */ \ - ENSURE_BITS(16); \ - /* immediate table lookup of [tablebits] bits of the code */ \ - sym = lzx->tbl##_table[PEEK_BITS(LZX_##tbl##_TABLEBITS)]; \ - /* is the symbol is longer than [tablebits] bits? (i=node index) */ \ - if (sym >= LZX_##tbl##_MAXSYMBOLS) { \ - /* decode remaining bits by tree traversal */ \ - i = 1 << (BITBUF_WIDTH - LZX_##tbl##_TABLEBITS); \ - do { \ - /* one less bit. error if we run out of bits before decode */ \ - i >>= 1; \ - if (i == 0) { \ - D(("out of bits in huffman decode")) \ - return lzx->error = MSPACK_ERR_DECRUNCH; \ - } \ - /* double node index and add 0 (left branch) or 1 (right) */ \ - sym <<= 1; sym |= (bit_buffer & i) ? 
1 : 0; \ - /* hop to next node index / decoded symbol */ \ - sym = lzx->tbl##_table[sym]; \ - /* while we are still in node indicies, not decoded symbols */ \ - } while (sym >= LZX_##tbl##_MAXSYMBOLS); \ - } \ - /* result */ \ - (var) = sym; \ - /* look up the code length of that symbol and discard those bits */ \ - i = lzx->tbl##_len[sym]; \ - REMOVE_BITS(i); \ -} while (0) +/* import huffman-reading macros and code */ +#define TABLEBITS(tbl) LZX_##tbl##_TABLEBITS +#define MAXSYMBOLS(tbl) LZX_##tbl##_MAXSYMBOLS +#define HUFF_TABLE(tbl,idx) lzx->tbl##_table[idx] +#define HUFF_LEN(tbl,idx) lzx->tbl##_len[idx] +#define HUFF_ERROR return lzx->error = MSPACK_ERR_DECRUNCH +#include /* BUILD_TABLE(tbl) builds a huffman lookup table from code lengths */ #define BUILD_TABLE(tbl) \ - if (make_decode_table(LZX_##tbl##_MAXSYMBOLS, LZX_##tbl##_TABLEBITS, \ - &lzx->tbl##_len[0], &lzx->tbl##_table[0])) \ - { \ - D(("failed to build %s table", #tbl)) \ - return lzx->error = MSPACK_ERR_DECRUNCH; \ - } - -/* make_decode_table(nsyms, nbits, length[], table[]) - * - * This function was coded by David Tritscher. It builds a fast huffman - * decoding table from a canonical huffman code lengths table. - * - * nsyms = total number of symbols in this huffman tree. - * nbits = any symbols with a code length of nbits or less can be decoded - * in one lookup of the table. - * length = A table to get code lengths from [0 to syms-1] - * table = The table to fill up with decoded symbols and pointers. 
- * - * Returns 0 for OK or 1 for error - */ - -static int make_decode_table(unsigned int nsyms, unsigned int nbits, - unsigned char *length, unsigned short *table) -{ - unsigned short sym; - unsigned int leaf, fill; - unsigned char bit_num; - unsigned int pos = 0; /* the current position in the decode table */ - unsigned int table_mask = 1 << nbits; - unsigned int bit_mask = table_mask >> 1; /* don't do 0 length codes */ - unsigned int next_symbol = bit_mask; /* base of allocation for long codes */ - - /* fill entries for codes short enough for a direct mapping */ - for (bit_num = 1; bit_num <= nbits; bit_num++) { - for (sym = 0; sym < nsyms; sym++) { - if (length[sym] != bit_num) continue; - leaf = pos; - if((pos += bit_mask) > table_mask) return 1; /* table overrun */ - /* fill all possible lookups of this symbol with the symbol itself */ - for (fill = bit_mask; fill-- > 0;) table[leaf++] = sym; + if (make_decode_table(MAXSYMBOLS(tbl), TABLEBITS(tbl), \ + &HUFF_LEN(tbl,0), &HUFF_TABLE(tbl,0))) \ + { \ + D(("failed to build %s table", #tbl)) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ } - bit_mask >>= 1; - } - - /* full table already? 
*/ - if (pos == table_mask) return 0; - - /* clear the remainder of the table */ - for (sym = pos; sym < table_mask; sym++) table[sym] = 0xFFFF; - - /* allow codes to be up to nbits+16 long, instead of nbits */ - pos <<= 16; - table_mask <<= 16; - bit_mask = 1 << 15; - - for (bit_num = nbits+1; bit_num <= 16; bit_num++) { - for (sym = 0; sym < nsyms; sym++) { - if (length[sym] != bit_num) continue; - - leaf = pos >> 16; - for (fill = 0; fill < bit_num - nbits; fill++) { - /* if this path hasn't been taken yet, 'allocate' two entries */ - if (table[leaf] == 0xFFFF) { - table[(next_symbol << 1)] = 0xFFFF; - table[(next_symbol << 1) + 1] = 0xFFFF; - table[leaf] = next_symbol++; - } - /* follow the path and select either left or right for next bit */ - leaf = table[leaf] << 1; - if ((pos >> (15-fill)) & 1) leaf++; - } - table[leaf] = sym; - - if ((pos += bit_mask) > table_mask) return 1; /* table overflow */ - } - bit_mask >>= 1; - } - - /* full table? */ - if (pos == table_mask) return 0; - - /* either erroneous table, or all elements are 0 - let's find out. */ - for (sym = 0; sym < nsyms; sym++) if (length[sym]) return 1; - return 0; -} +#define BUILD_TABLE_MAYBE_EMPTY(tbl) do { \ + lzx->tbl##_empty = 0; \ + if (make_decode_table(MAXSYMBOLS(tbl), TABLEBITS(tbl), \ + &HUFF_LEN(tbl,0), &HUFF_TABLE(tbl,0))) \ + { \ + for (i = 0; i < MAXSYMBOLS(tbl); i++) { \ + if (HUFF_LEN(tbl, i) > 0) { \ + D(("failed to build %s table", #tbl)) \ + return lzx->error = MSPACK_ERR_DECRUNCH; \ + } \ + } \ + /* empty tree - allow it, but don't decode symbols with it */ \ + lzx->tbl##_empty = 1; \ + } \ +} while (0) /* READ_LENGTHS(tablename, first, last) reads in code lengths for symbols * first to last in the given table. The code lengths are stored in their * own special LZX way. 
*/ -#define READ_LENGTHS(tbl, first, last) do { \ - STORE_BITS; \ - if (lzxd_read_lens(lzx, &lzx->tbl##_len[0], (first), \ - (unsigned int)(last))) return lzx->error; \ - RESTORE_BITS; \ +#define READ_LENGTHS(tbl, first, last) do { \ + STORE_BITS; \ + if (lzxd_read_lens(lzx, &HUFF_LEN(tbl, 0), (first), \ + (unsigned int)(last))) return lzx->error; \ + RESTORE_BITS; \ } while (0) static int lzxd_read_lens(struct lzxd_stream *lzx, unsigned char *lens, - unsigned int first, unsigned int last) + unsigned int first, unsigned int last) { /* bit buffer and huffman symbol decode variables */ unsigned int bit_buffer; @@ -348,27 +193,71 @@ static int lzxd_read_lens(struct lzxd_stream *lzx, unsigned char *lens, * a small 'position slot' number and a small offset from that slot are * encoded instead of one large offset. * + * The number of slots is decided by how many are needed to encode the + * largest offset for a given window size. This is easy when the gap between + * slots is less than 128Kb, it's a linear relationship. But when extra_bits + * reaches its limit of 17 (because LZX can only ensure reading 17 bits of + * data at a time), we can only jump 128Kb at a time and have to start + * using more and more position slots as each window size doubles. + * * position_base[] is an index to the position slot bases * * extra_bits[] states how many bits of offset-from-base data is needed. + * + * They are calculated as follows: + * extra_bits[i] = 0 where i < 4 + * extra_bits[i] = floor(i/2)-1 where i >= 4 && i < 36 + * extra_bits[i] = 17 where i >= 36 + * position_base[0] = 0 + * position_base[i] = position_base[i-1] + (1 << extra_bits[i-1]) */ -static unsigned int position_base[51]; -static unsigned char extra_bits[51]; - -static void lzxd_static_init() { - int i, j; - - for (i = 0, j = 0; i < 51; i += 2) { - extra_bits[i] = j; /* 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7... */ - extra_bits[i+1] = j; - if ((i != 0) && (j < 17)) j++; /* 0,0,1,2,3,4...15,16,17,17,17,17... 
*/ - } - - for (i = 0, j = 0; i < 51; i++) { - position_base[i] = j; /* 0,1,2,3,4,6,8,12,16,24,32,... */ - j += 1 << extra_bits[i]; /* 1,1,1,1,2,2,4,4,8,8,16,16,32,32,... */ - } -} +static const unsigned int position_slots[11] = { + 30, 32, 34, 36, 38, 42, 50, 66, 98, 162, 290 +}; +static const unsigned char extra_bits[36] = { + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16 +}; +static const unsigned int position_base[290] = { + 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, + 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, + 49152, 65536, 98304, 131072, 196608, 262144, 393216, 524288, 655360, + 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, + 1835008, 1966080, 2097152, 2228224, 2359296, 2490368, 2621440, 2752512, + 2883584, 3014656, 3145728, 3276800, 3407872, 3538944, 3670016, 3801088, + 3932160, 4063232, 4194304, 4325376, 4456448, 4587520, 4718592, 4849664, + 4980736, 5111808, 5242880, 5373952, 5505024, 5636096, 5767168, 5898240, + 6029312, 6160384, 6291456, 6422528, 6553600, 6684672, 6815744, 6946816, + 7077888, 7208960, 7340032, 7471104, 7602176, 7733248, 7864320, 7995392, + 8126464, 8257536, 8388608, 8519680, 8650752, 8781824, 8912896, 9043968, + 9175040, 9306112, 9437184, 9568256, 9699328, 9830400, 9961472, 10092544, + 10223616, 10354688, 10485760, 10616832, 10747904, 10878976, 11010048, + 11141120, 11272192, 11403264, 11534336, 11665408, 11796480, 11927552, + 12058624, 12189696, 12320768, 12451840, 12582912, 12713984, 12845056, + 12976128, 13107200, 13238272, 13369344, 13500416, 13631488, 13762560, + 13893632, 14024704, 14155776, 14286848, 14417920, 14548992, 14680064, + 14811136, 14942208, 15073280, 15204352, 15335424, 15466496, 15597568, + 15728640, 15859712, 15990784, 16121856, 16252928, 16384000, 16515072, + 16646144, 16777216, 16908288, 17039360, 17170432, 17301504, 17432576, + 17563648, 17694720, 
17825792, 17956864, 18087936, 18219008, 18350080, + 18481152, 18612224, 18743296, 18874368, 19005440, 19136512, 19267584, + 19398656, 19529728, 19660800, 19791872, 19922944, 20054016, 20185088, + 20316160, 20447232, 20578304, 20709376, 20840448, 20971520, 21102592, + 21233664, 21364736, 21495808, 21626880, 21757952, 21889024, 22020096, + 22151168, 22282240, 22413312, 22544384, 22675456, 22806528, 22937600, + 23068672, 23199744, 23330816, 23461888, 23592960, 23724032, 23855104, + 23986176, 24117248, 24248320, 24379392, 24510464, 24641536, 24772608, + 24903680, 25034752, 25165824, 25296896, 25427968, 25559040, 25690112, + 25821184, 25952256, 26083328, 26214400, 26345472, 26476544, 26607616, + 26738688, 26869760, 27000832, 27131904, 27262976, 27394048, 27525120, + 27656192, 27787264, 27918336, 28049408, 28180480, 28311552, 28442624, + 28573696, 28704768, 28835840, 28966912, 29097984, 29229056, 29360128, + 29491200, 29622272, 29753344, 29884416, 30015488, 30146560, 30277632, + 30408704, 30539776, 30670848, 30801920, 30932992, 31064064, 31195136, + 31326208, 31457280, 31588352, 31719424, 31850496, 31981568, 32112640, + 32243712, 32374784, 32505856, 32636928, 32768000, 32899072, 33030144, + 33161216, 33292288, 33423360 +}; static void lzxd_reset_state(struct lzxd_stream *lzx) { int i; @@ -388,35 +277,46 @@ static void lzxd_reset_state(struct lzxd_stream *lzx) { /*-------- main LZX code --------*/ struct lzxd_stream *lzxd_init(struct mspack_system *system, - struct mspack_file *input, - struct mspack_file *output, - int window_bits, - int reset_interval, - int input_buffer_size, - off_t output_length) + struct mspack_file *input, + struct mspack_file *output, + int window_bits, + int reset_interval, + int input_buffer_size, + off_t output_length, + char is_delta) { unsigned int window_size = 1 << window_bits; struct lzxd_stream *lzx; if (!system) return NULL; - /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */ - if (window_bits < 15 || window_bits > 21) 
return NULL; + /* LZX DELTA window sizes are between 2^17 (128KiB) and 2^25 (32MiB), + * regular LZX windows are between 2^15 (32KiB) and 2^21 (2MiB) + */ + if (is_delta) { + if (window_bits < 17 || window_bits > 25) return NULL; + } + else { + if (window_bits < 15 || window_bits > 21) return NULL; + } + if (reset_interval < 0 || output_length < 0) { + D(("reset interval or output length < 0")) + return NULL; + } + + /* round up input buffer size to multiple of two */ input_buffer_size = (input_buffer_size + 1) & -2; - if (!input_buffer_size) return NULL; - - /* initialise static data */ - lzxd_static_init(); + if (input_buffer_size < 2) return NULL; /* allocate decompression state */ - if (!(lzx = (struct lzxd_stream *)system->alloc(system, sizeof(struct lzxd_stream)))) { + if (!(lzx = (struct lzxd_stream *) system->alloc(system, sizeof(struct lzxd_stream)))) { return NULL; } /* allocate decompression window and input buffer */ - lzx->window = (unsigned char *)system->alloc(system, (size_t) window_size); - lzx->inbuf = (unsigned char *)system->alloc(system, (size_t) input_buffer_size); + lzx->window = (unsigned char *) system->alloc(system, (size_t) window_size); + lzx->inbuf = (unsigned char *) system->alloc(system, (size_t) input_buffer_size); if (!lzx->window || !lzx->inbuf) { system->free(lzx->window); system->free(lzx->inbuf); @@ -433,43 +333,73 @@ struct lzxd_stream *lzxd_init(struct mspack_system *system, lzx->inbuf_size = input_buffer_size; lzx->window_size = 1 << window_bits; + lzx->ref_data_size = 0; lzx->window_posn = 0; lzx->frame_posn = 0; lzx->frame = 0; lzx->reset_interval = reset_interval; lzx->intel_filesize = 0; lzx->intel_curpos = 0; - - /* window bits: 15 16 17 18 19 20 21 - * position slots: 30 32 34 36 38 42 50 */ - lzx->posn_slots = ((window_bits == 21) ? 50 : - ((window_bits == 20) ? 
42 : (window_bits << 1))); lzx->intel_started = 0; - lzx->input_end = 0; + lzx->error = MSPACK_ERR_OK; + lzx->num_offsets = position_slots[window_bits - 15] << 3; + lzx->is_delta = is_delta; - lzx->error = MSPACK_ERR_OK; - - lzx->i_ptr = lzx->i_end = &lzx->inbuf[0]; lzx->o_ptr = lzx->o_end = &lzx->e8_buf[0]; - lzx->bit_buffer = lzx->bits_left = 0; - lzxd_reset_state(lzx); + INIT_BITS; return lzx; } +int lzxd_set_reference_data(struct lzxd_stream *lzx, + struct mspack_system *system, + struct mspack_file *input, + unsigned int length) +{ + if (!lzx) return MSPACK_ERR_ARGS; + + if (!lzx->is_delta) { + D(("only LZX DELTA streams support reference data")) + return MSPACK_ERR_ARGS; + } + if (lzx->offset) { + D(("too late to set reference data after decoding starts")) + return MSPACK_ERR_ARGS; + } + if (length > lzx->window_size) { + D(("reference length (%u) is longer than the window", length)) + return MSPACK_ERR_ARGS; + } + if (length > 0 && (!system || !input)) { + D(("length > 0 but no system or input")) + return MSPACK_ERR_ARGS; + } + + lzx->ref_data_size = length; + if (length > 0) { + /* copy reference data */ + unsigned char *pos = &lzx->window[lzx->window_size - length]; + int bytes = system->read(input, pos, length); + /* length can't be more than 2^25, so no signedness problem */ + if (bytes < (int)length) return MSPACK_ERR_READ; + } + lzx->ref_data_size = length; + return MSPACK_ERR_OK; +} + void lzxd_set_output_length(struct lzxd_stream *lzx, off_t out_bytes) { - if (lzx) lzx->length = out_bytes; + if (lzx && out_bytes > 0) lzx->length = out_bytes; } int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { - /* bitstream reading and huffman variables */ + /* bitstream and huffman reading variables */ unsigned int bit_buffer; int bits_left, i=0; - unsigned short sym; unsigned char *i_ptr, *i_end; + unsigned short sym; int match_length, length_footer, extra, verbatim_bits, bytes_todo; - int this_run, main_element, aligned_bits, j; + int this_run, 
main_element, aligned_bits, j, warned = 0; unsigned char *window, *runsrc, *rundest, buf[12]; unsigned int frame_size=0, end_frame, match_offset, window_posn; unsigned int R0, R1, R2; @@ -505,12 +435,25 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* have we reached the reset interval? (if there is one?) */ if (lzx->reset_interval && ((lzx->frame % lzx->reset_interval) == 0)) { if (lzx->block_remaining) { - D(("%d bytes remaining at reset interval", lzx->block_remaining)) - return lzx->error = MSPACK_ERR_DECRUNCH; + /* this is a file format error, we can make a best effort to extract what we can */ + D(("%d bytes remaining at reset interval", lzx->block_remaining)) + if (!warned) { + lzx->sys->message(NULL, "WARNING; invalid reset interval detected during LZX decompression"); + warned++; + } } /* re-read the intel header and reset the huffman lengths */ lzxd_reset_state(lzx); + R0 = lzx->R0; + R1 = lzx->R1; + R2 = lzx->R2; + } + + /* LZX DELTA format has chunk_size, not present in LZX format */ + if (lzx->is_delta) { + ENSURE_BITS(16); + REMOVE_BITS(16); } /* read header if necessary */ @@ -527,7 +470,7 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { * has been filled in. 
*/ frame_size = LZX_FRAME_SIZE; if (lzx->length && (lzx->length - lzx->offset) < (off_t)frame_size) { - frame_size = (unsigned int)(lzx->length - lzx->offset); + frame_size = lzx->length - lzx->offset; } /* decode until one more frame is available */ @@ -535,70 +478,61 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { while (bytes_todo > 0) { /* initialise new block, if one is needed */ if (lzx->block_remaining == 0) { - /* realign if previous block was an odd-sized UNCOMPRESSED block */ - if ((lzx->block_type == LZX_BLOCKTYPE_UNCOMPRESSED) && - (lzx->block_length & 1)) - { - if (i_ptr == i_end) { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - i_ptr++; - } + /* realign if previous block was an odd-sized UNCOMPRESSED block */ + if ((lzx->block_type == LZX_BLOCKTYPE_UNCOMPRESSED) && + (lzx->block_length & 1)) + { + READ_IF_NEEDED; + i_ptr++; + } - /* read block type (3 bits) and block length (24 bits) */ - READ_BITS(lzx->block_type, 3); - READ_BITS(i, 16); READ_BITS(j, 8); - lzx->block_remaining = lzx->block_length = (i << 8) | j; - /*D(("new block t%d len %u", lzx->block_type, lzx->block_length))*/ + /* read block type (3 bits) and block length (24 bits) */ + READ_BITS(lzx->block_type, 3); + READ_BITS(i, 16); READ_BITS(j, 8); + lzx->block_remaining = lzx->block_length = (i << 8) | j; + /*D(("new block t%d len %u", lzx->block_type, lzx->block_length))*/ - /* read individual block headers */ - switch (lzx->block_type) { - case LZX_BLOCKTYPE_ALIGNED: - /* read lengths of and build aligned huffman decoding tree */ - for (i = 0; i < 8; i++) { READ_BITS(j, 3); lzx->ALIGNED_len[i] = j; } - BUILD_TABLE(ALIGNED); - /* no break -- rest of aligned header is same as verbatim */ - case LZX_BLOCKTYPE_VERBATIM: - /* read lengths of and build main huffman decoding tree */ - READ_LENGTHS(MAINTREE, 0, 256); - READ_LENGTHS(MAINTREE, 256, LZX_NUM_CHARS + (lzx->posn_slots << 3)); - BUILD_TABLE(MAINTREE); - /* if the 
literal 0xE8 is anywhere in the block... */ - if (lzx->MAINTREE_len[0xE8] != 0) lzx->intel_started = 1; - /* read lengths of and build lengths huffman decoding tree */ - READ_LENGTHS(LENGTH, 0, LZX_NUM_SECONDARY_LENGTHS); - BUILD_TABLE(LENGTH); - break; + /* read individual block headers */ + switch (lzx->block_type) { + case LZX_BLOCKTYPE_ALIGNED: + /* read lengths of and build aligned huffman decoding tree */ + for (i = 0; i < 8; i++) { READ_BITS(j, 3); lzx->ALIGNED_len[i] = j; } + BUILD_TABLE(ALIGNED); + /* rest of aligned header is same as verbatim */ /*@fallthrough@*/ + case LZX_BLOCKTYPE_VERBATIM: + /* read lengths of and build main huffman decoding tree */ + READ_LENGTHS(MAINTREE, 0, 256); + READ_LENGTHS(MAINTREE, 256, LZX_NUM_CHARS + lzx->num_offsets); + BUILD_TABLE(MAINTREE); + /* if the literal 0xE8 is anywhere in the block... */ + if (lzx->MAINTREE_len[0xE8] != 0) lzx->intel_started = 1; + /* read lengths of and build lengths huffman decoding tree */ + READ_LENGTHS(LENGTH, 0, LZX_NUM_SECONDARY_LENGTHS); + BUILD_TABLE_MAYBE_EMPTY(LENGTH); + break; - case LZX_BLOCKTYPE_UNCOMPRESSED: - /* because we can't assume otherwise */ - lzx->intel_started = 1; + case LZX_BLOCKTYPE_UNCOMPRESSED: + /* because we can't assume otherwise */ + lzx->intel_started = 1; - /* read 1-16 (not 0-15) bits to align to bytes */ - ENSURE_BITS(16); - if (bits_left > 16) i_ptr -= 2; - bits_left = 0; bit_buffer = 0; + /* read 1-16 (not 0-15) bits to align to bytes */ + if (bits_left == 0) ENSURE_BITS(16); + bits_left = 0; bit_buffer = 0; - /* read 12 bytes of stored R0 / R1 / R2 values */ - for (rundest = &buf[0], i = 0; i < 12; i++) { - if (i_ptr == i_end) { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - *rundest++ = *i_ptr++; - } - R0 = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); - R1 = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24); - R2 = buf[8] | (buf[9] << 8) | (buf[10] << 16) | (buf[11] << 24); - break; + 
/* read 12 bytes of stored R0 / R1 / R2 values */ + for (rundest = &buf[0], i = 0; i < 12; i++) { + READ_IF_NEEDED; + *rundest++ = *i_ptr++; + } + R0 = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); + R1 = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24); + R2 = buf[8] | (buf[9] << 8) | (buf[10] << 16) | (buf[11] << 24); + break; - default: - D(("bad block type")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } + default: + D(("bad block type")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } } /* decode more of the block: @@ -613,202 +547,270 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* decode at least this_run bytes */ switch (lzx->block_type) { case LZX_BLOCKTYPE_VERBATIM: - while (this_run > 0) { - READ_HUFFSYM(MAINTREE, main_element); - if (main_element < LZX_NUM_CHARS) { - /* literal: 0 to LZX_NUM_CHARS-1 */ - window[window_posn++] = main_element; - this_run--; - } - else { - /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ - main_element -= LZX_NUM_CHARS; + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; - /* get match length */ - match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; - if (match_length == LZX_NUM_PRIMARY_LENGTHS) { - READ_HUFFSYM(LENGTH, length_footer); - match_length += length_footer; - } - match_length += LZX_MIN_MATCH; - - /* get match offset */ - switch ((match_offset = (main_element >> 3))) { - case 0: match_offset = R0; break; - case 1: match_offset = R1; R1=R0; R0 = match_offset; break; - case 2: match_offset = R2; R2=R0; R0 = match_offset; break; - case 3: match_offset = 1; R2=R1; R1=R0; R0 = match_offset; break; - default: - extra = extra_bits[match_offset]; - READ_BITS(verbatim_bits, extra); - match_offset = 
position_base[match_offset] - 2 + verbatim_bits; - R2 = R1; R1 = R0; R0 = match_offset; - } + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + if (lzx->LENGTH_empty) { + D(("LENGTH symbol needed but tree is empty")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; - if ((window_posn + match_length) > lzx->window_size) { - D(("match ran over window wrap")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - - /* copy match */ - rundest = &window[window_posn]; - i = match_length; - /* does match offset wrap the window? */ - if (match_offset > window_posn) { - /* j = length from match offset to end of window */ - j = match_offset - window_posn; - if (j > (int) lzx->window_size) { - D(("match offset beyond window boundaries")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - runsrc = &window[lzx->window_size - j]; - if (j < i) { - /* if match goes over the window edge, do two copy runs */ - i -= j; while (j-- > 0) *rundest++ = *runsrc++; - runsrc = window; - } - while (i-- > 0) *rundest++ = *runsrc++; - } - else { - runsrc = rundest - match_offset; - while (i-- > 0) *rundest++ = *runsrc++; - } + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: match_offset = R0; break; + case 1: match_offset = R1; R1=R0; R0 = match_offset; break; + case 2: match_offset = R2; R2=R0; R0 = match_offset; break; + case 3: match_offset = 1; R2=R1; R1=R0; R0 = match_offset; break; + default: + extra = (match_offset >= 36) ? 
17 : extra_bits[match_offset]; + READ_BITS(verbatim_bits, extra); + match_offset = position_base[match_offset] - 2 + verbatim_bits; + R2 = R1; R1 = R0; R0 = match_offset; + } - this_run -= match_length; - window_posn += match_length; - } - } /* while (this_run > 0) */ - break; + /* LZX DELTA uses max match length to signal even longer match */ + if (match_length == LZX_MAX_MATCH && lzx->is_delta) { + int extra_len = 0; + ENSURE_BITS(3); /* 4 entry huffman tree */ + if (PEEK_BITS(1) == 0) { + REMOVE_BITS(1); /* '0' -> 8 extra length bits */ + READ_BITS(extra_len, 8); + } + else if (PEEK_BITS(2) == 2) { + REMOVE_BITS(2); /* '10' -> 10 extra length bits + 0x100 */ + READ_BITS(extra_len, 10); + extra_len += 0x100; + } + else if (PEEK_BITS(3) == 6) { + REMOVE_BITS(3); /* '110' -> 12 extra length bits + 0x500 */ + READ_BITS(extra_len, 12); + extra_len += 0x500; + } + else { + REMOVE_BITS(3); /* '111' -> 15 extra length bits */ + READ_BITS(extra_len, 15); + } + match_length += extra_len; + } + + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? 
*/ + if (match_offset > window_posn) { + if ((off_t)match_offset > lzx->offset && + (match_offset - window_posn) > lzx->ref_data_size) + { + D(("match offset beyond LZX stream")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; case LZX_BLOCKTYPE_ALIGNED: - while (this_run > 0) { - READ_HUFFSYM(MAINTREE, main_element); - if (main_element < LZX_NUM_CHARS) { - /* literal: 0 to LZX_NUM_CHARS-1 */ - window[window_posn++] = main_element; - this_run--; - } - else { - /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ - main_element -= LZX_NUM_CHARS; + while (this_run > 0) { + READ_HUFFSYM(MAINTREE, main_element); + if (main_element < LZX_NUM_CHARS) { + /* literal: 0 to LZX_NUM_CHARS-1 */ + window[window_posn++] = main_element; + this_run--; + } + else { + /* match: LZX_NUM_CHARS + ((slot<<3) | length_header (3 bits)) */ + main_element -= LZX_NUM_CHARS; - /* get match length */ - match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; - if (match_length == LZX_NUM_PRIMARY_LENGTHS) { - READ_HUFFSYM(LENGTH, length_footer); - match_length += length_footer; - } - match_length += LZX_MIN_MATCH; + /* get match length */ + match_length = main_element & LZX_NUM_PRIMARY_LENGTHS; + if (match_length == LZX_NUM_PRIMARY_LENGTHS) { + if (lzx->LENGTH_empty) { + D(("LENGTH symbol needed but tree is empty")) + return lzx->error = 
MSPACK_ERR_DECRUNCH; + } + READ_HUFFSYM(LENGTH, length_footer); + match_length += length_footer; + } + match_length += LZX_MIN_MATCH; - /* get match offset */ - switch ((match_offset = (main_element >> 3))) { - case 0: match_offset = R0; break; - case 1: match_offset = R1; R1 = R0; R0 = match_offset; break; - case 2: match_offset = R2; R2 = R0; R0 = match_offset; break; - default: - extra = extra_bits[match_offset]; - match_offset = position_base[match_offset] - 2; - if (extra > 3) { - /* verbatim and aligned bits */ - extra -= 3; - READ_BITS(verbatim_bits, extra); - match_offset += (verbatim_bits << 3); - READ_HUFFSYM(ALIGNED, aligned_bits); - match_offset += aligned_bits; - } - else if (extra == 3) { - /* aligned bits only */ - READ_HUFFSYM(ALIGNED, aligned_bits); - match_offset += aligned_bits; - } - else if (extra > 0) { /* extra==1, extra==2 */ - /* verbatim bits only */ - READ_BITS(verbatim_bits, extra); - match_offset += verbatim_bits; - } - else /* extra == 0 */ { - /* ??? not defined in LZX specification! */ - match_offset = 1; - } - /* update repeated offset LRU queue */ - R2 = R1; R1 = R0; R0 = match_offset; - } + /* get match offset */ + switch ((match_offset = (main_element >> 3))) { + case 0: match_offset = R0; break; + case 1: match_offset = R1; R1 = R0; R0 = match_offset; break; + case 2: match_offset = R2; R2 = R0; R0 = match_offset; break; + default: + extra = (match_offset >= 36) ? 
17 : extra_bits[match_offset]; + match_offset = position_base[match_offset] - 2; + if (extra > 3) { + /* verbatim and aligned bits */ + extra -= 3; + READ_BITS(verbatim_bits, extra); + match_offset += (verbatim_bits << 3); + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra == 3) { + /* aligned bits only */ + READ_HUFFSYM(ALIGNED, aligned_bits); + match_offset += aligned_bits; + } + else if (extra > 0) { /* extra==1, extra==2 */ + /* verbatim bits only */ + READ_BITS(verbatim_bits, extra); + match_offset += verbatim_bits; + } + else /* extra == 0 */ { + /* ??? not defined in LZX specification! */ + match_offset = 1; + } + /* update repeated offset LRU queue */ + R2 = R1; R1 = R0; R0 = match_offset; + } - if ((window_posn + match_length) > lzx->window_size) { - D(("match ran over window wrap")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } + /* LZX DELTA uses max match length to signal even longer match */ + if (match_length == LZX_MAX_MATCH && lzx->is_delta) { + int extra_len = 0; + ENSURE_BITS(3); /* 4 entry huffman tree */ + if (PEEK_BITS(1) == 0) { + REMOVE_BITS(1); /* '0' -> 8 extra length bits */ + READ_BITS(extra_len, 8); + } + else if (PEEK_BITS(2) == 2) { + REMOVE_BITS(2); /* '10' -> 10 extra length bits + 0x100 */ + READ_BITS(extra_len, 10); + extra_len += 0x100; + } + else if (PEEK_BITS(3) == 6) { + REMOVE_BITS(3); /* '110' -> 12 extra length bits + 0x500 */ + READ_BITS(extra_len, 12); + extra_len += 0x500; + } + else { + REMOVE_BITS(3); /* '111' -> 15 extra length bits */ + READ_BITS(extra_len, 15); + } + match_length += extra_len; + } - /* copy match */ - rundest = &window[window_posn]; - i = match_length; - /* does match offset wrap the window? 
*/ - if (match_offset > window_posn) { - /* j = length from match offset to end of window */ - j = match_offset - window_posn; - if (j > (int) lzx->window_size) { - D(("match offset beyond window boundaries")) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - runsrc = &window[lzx->window_size - j]; - if (j < i) { - /* if match goes over the window edge, do two copy runs */ - i -= j; while (j-- > 0) *rundest++ = *runsrc++; - runsrc = window; - } - while (i-- > 0) *rundest++ = *runsrc++; - } - else { - runsrc = rundest - match_offset; - while (i-- > 0) *rundest++ = *runsrc++; - } + if ((window_posn + match_length) > lzx->window_size) { + D(("match ran over window wrap")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } - this_run -= match_length; - window_posn += match_length; - } - } /* while (this_run > 0) */ - break; + /* copy match */ + rundest = &window[window_posn]; + i = match_length; + /* does match offset wrap the window? */ + if (match_offset > window_posn) { + if ((off_t)match_offset > lzx->offset && + (match_offset - window_posn) > lzx->ref_data_size) + { + D(("match offset beyond LZX stream")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + /* j = length from match offset to end of window */ + j = match_offset - window_posn; + if (j > (int) lzx->window_size) { + D(("match offset beyond window boundaries")) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + runsrc = &window[lzx->window_size - j]; + if (j < i) { + /* if match goes over the window edge, do two copy runs */ + i -= j; while (j-- > 0) *rundest++ = *runsrc++; + runsrc = window; + } + while (i-- > 0) *rundest++ = *runsrc++; + } + else { + runsrc = rundest - match_offset; + while (i-- > 0) *rundest++ = *runsrc++; + } + + this_run -= match_length; + window_posn += match_length; + } + } /* while (this_run > 0) */ + break; case LZX_BLOCKTYPE_UNCOMPRESSED: - /* as this_run is limited not to wrap a frame, this also means it - * won't wrap the window (as the window is a multiple of 32k) */ - rundest = 
&window[window_posn]; - window_posn += this_run; - while (this_run > 0) { - if ((i = (int)(i_end - i_ptr))) { - if (i > this_run) i = this_run; - lzx->sys->copy(i_ptr, rundest, (size_t) i); - rundest += i; - i_ptr += i; - this_run -= i; - } - else { - if (lzxd_read_input(lzx)) return lzx->error; - i_ptr = lzx->i_ptr; - i_end = lzx->i_end; - } - } - break; + /* as this_run is limited not to wrap a frame, this also means it + * won't wrap the window (as the window is a multiple of 32k) */ + rundest = &window[window_posn]; + window_posn += this_run; + while (this_run > 0) { + if ((i = (int)(i_end - i_ptr)) == 0) { + READ_IF_NEEDED; + } + else { + if (i > this_run) i = this_run; + lzx->sys->copy(i_ptr, rundest, (size_t) i); + rundest += i; + i_ptr += i; + this_run -= i; + } + } + break; default: - return lzx->error = MSPACK_ERR_DECRUNCH; /* might as well */ + return lzx->error = MSPACK_ERR_DECRUNCH; /* might as well */ } /* did the final match overrun our desired this_run length? */ if (this_run < 0) { - if ((unsigned int)(-this_run) > lzx->block_remaining) { - D(("overrun went past end of block by %d (%d remaining)", - -this_run, lzx->block_remaining )) - return lzx->error = MSPACK_ERR_DECRUNCH; - } - lzx->block_remaining -= -this_run; + if ((unsigned int)(-this_run) > lzx->block_remaining) { + D(("overrun went past end of block by %d (%d remaining)", + -this_run, lzx->block_remaining )) + return lzx->error = MSPACK_ERR_DECRUNCH; + } + lzx->block_remaining -= -this_run; } } /* while (bytes_todo > 0) */ /* streams don't extend over frame boundaries */ if ((window_posn - lzx->frame_posn) != frame_size) { D(("decode beyond output frame limits! 
%d != %d", - window_posn - lzx->frame_posn, frame_size)) + window_posn - lzx->frame_posn, frame_size)) return lzx->error = MSPACK_ERR_DECRUNCH; } @@ -818,13 +820,14 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { /* check that we've used all of the previous frame first */ if (lzx->o_ptr != lzx->o_end) { - D(("%d avail bytes, new %d frame", lzx->o_end-lzx->o_ptr, frame_size)) + D(("%ld avail bytes, new %d frame", + (long)(lzx->o_end - lzx->o_ptr), frame_size)) return lzx->error = MSPACK_ERR_DECRUNCH; } /* does this intel block _really_ need decoding? */ if (lzx->intel_started && lzx->intel_filesize && - (lzx->frame <= 32768) && (frame_size > 10)) + (lzx->frame <= 32768) && (frame_size > 10)) { unsigned char *data = &lzx->e8_buf[0]; unsigned char *dataend = &lzx->e8_buf[frame_size - 10]; @@ -837,17 +840,17 @@ int lzxd_decompress(struct lzxd_stream *lzx, off_t out_bytes) { lzx->sys->copy(&lzx->window[lzx->frame_posn], data, frame_size); while (data < dataend) { - if (*data++ != 0xE8) { curpos++; continue; } - abs_off = data[0] | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); - if ((abs_off >= -curpos) && (abs_off < filesize)) { - rel_off = (abs_off >= 0) ? abs_off - curpos : abs_off + filesize; - data[0] = (unsigned char) rel_off; - data[1] = (unsigned char) (rel_off >> 8); - data[2] = (unsigned char) (rel_off >> 16); - data[3] = (unsigned char) (rel_off >> 24); - } - data += 4; - curpos += 5; + if (*data++ != 0xE8) { curpos++; continue; } + abs_off = data[0] | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); + if ((abs_off >= -curpos) && (abs_off < filesize)) { + rel_off = (abs_off >= 0) ? 
abs_off - curpos : abs_off + filesize; + data[0] = (unsigned char) rel_off; + data[1] = (unsigned char) (rel_off >> 8); + data[2] = (unsigned char) (rel_off >> 16); + data[3] = (unsigned char) (rel_off >> 24); + } + data += 4; + curpos += 5; } lzx->intel_curpos += frame_size; } diff --git a/third_party/mspack/mspack.h b/third_party/mspack/mspack.h index 0d2584dee..f9161f983 100644 --- a/third_party/mspack/mspack.h +++ b/third_party/mspack/mspack.h @@ -1,5 +1,5 @@ /* libmspack -- a library for working with Microsoft compression formats. - * (C) 2003-2004 Stuart Caie + * (C) 2003-2016 Stuart Caie * * libmspack is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License (LGPL) version 2.1 @@ -21,6 +21,79 @@ * libmspack is a library which provides compressors and decompressors, * archivers and dearchivers for Microsoft compression formats. * + * \section formats Formats supported + * + * The following file formats are supported: + * - SZDD files, which use LZSS compression + * - KWAJ files, which use LZSS, LZSS+Huffman or deflate compression + * - .HLP (MS Help) files, which use LZSS compression + * - .CAB (MS Cabinet) files, which use deflate, LZX or Quantum compression + * - .CHM (HTML Help) files, which use LZX compression + * - .LIT (MS EBook) files, which use LZX compression and DES encryption + * - .LZX (Exchange Offline Addressbook) files, which use LZX compression + * + * To determine the capabilities of the library, and the binary + * compatibility version of any particular compressor or decompressor, use + * the mspack_version() function. The UNIX library interface version is + * defined as the highest-versioned library component. + * + * \section starting Getting started + * + * The macro MSPACK_SYS_SELFTEST() should be used to ensure the library can + * be used. In particular, it checks if the caller is using 32-bit file I/O + * when the library is compiled for 64-bit file I/O and vice versa. 
+ * + * If compiled normally, the library includes basic file I/O and memory + * management functionality using the standard C library. This can be + * customised and replaced entirely by creating a mspack_system structure. + * + * A compressor or decompressor for the required format must be + * instantiated before it can be used. Each construction function takes + * one parameter, which is either a pointer to a custom mspack_system + * structure, or NULL to use the default. The instantiation returned, if + * not NULL, contains function pointers (methods) to work with the given + * file format. + * + * For compression: + * - mspack_create_cab_compressor() creates a mscab_compressor + * - mspack_create_chm_compressor() creates a mschm_compressor + * - mspack_create_lit_compressor() creates a mslit_compressor + * - mspack_create_hlp_compressor() creates a mshlp_compressor + * - mspack_create_szdd_compressor() creates a msszdd_compressor + * - mspack_create_kwaj_compressor() creates a mskwaj_compressor + * - mspack_create_oab_compressor() creates a msoab_compressor + * + * For decompression: + * - mspack_create_cab_decompressor() creates a mscab_decompressor + * - mspack_create_chm_decompressor() creates a mschm_decompressor + * - mspack_create_lit_decompressor() creates a mslit_decompressor + * - mspack_create_hlp_decompressor() creates a mshlp_decompressor + * - mspack_create_szdd_decompressor() creates a msszdd_decompressor + * - mspack_create_kwaj_decompressor() creates a mskwaj_decompressor + * - mspack_create_oab_decompressor() creates a msoab_decompressor + * + * Once finished working with a format, each kind of + * compressor/decompressor has its own specific destructor: + * - mspack_destroy_cab_compressor() + * - mspack_destroy_cab_decompressor() + * - mspack_destroy_chm_compressor() + * - mspack_destroy_chm_decompressor() + * - mspack_destroy_lit_compressor() + * - mspack_destroy_lit_decompressor() + * - mspack_destroy_hlp_compressor() + * - 
mspack_destroy_hlp_decompressor() + * - mspack_destroy_szdd_compressor() + * - mspack_destroy_szdd_decompressor() + * - mspack_destroy_kwaj_compressor() + * - mspack_destroy_kwaj_decompressor() + * - mspack_destroy_oab_compressor() + * - mspack_destroy_oab_decompressor() + * + * Destroying a compressor or decompressor does not destroy any objects, + * structures or handles that have been created using that compressor or + * decompressor. Ensure that everything created or opened is destroyed or + * closed before compressor/decompressor is itself destroyed. + * * \section errors Error codes * * All compressors and decompressors use the same set of error codes. Most @@ -45,6 +118,41 @@ * - #MSPACK_ERR_CHECKSUM indicates that a data checksum has failed. * - #MSPACK_ERR_CRUNCH indicates an error occured during compression. * - #MSPACK_ERR_DECRUNCH indicates an error occured during decompression. + * + * \section threading Multi-threading + * + * libmspack methods are reentrant and multithreading-safe when each + * thread has its own compressor or decompressor. + + * You should not call multiple methods simultaneously on a single + * compressor or decompressor instance. + * + * If this may happen, you can either use one compressor or + * decompressor per thread, or you can use your preferred lock, + * semaphore or mutex library to ensure no more than one method on a + * compressor/decompressor is called simultaneously. libmspack will + * not do this locking for you. 
+ * + * Example of incorrect behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 1 calls open() + * - thread 1 calls extract() for one file + * - thread 2 simultaneously calls extract() for another file + * + * Correct behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 2 calls mspack_create_cab_decompressor() + * - thread 1 calls its own open() / extract() + * - thread 2 simultaneously calls its own open() / extract() + * + * Also correct behaviour: + * - thread 1 calls mspack_create_cab_decompressor() + * - thread 1 locks a mutex for with the decompressor before + * calling any methods on it, and unlocks the mutex after each + * method returns. + * - thread 1 can share the results of open() with thread 2, and both + * can call extract(), provided they both guard against simultaneous + * use of extract(), and any other methods, with the mutex */ #ifndef LIB_MSPACK_H @@ -57,6 +165,102 @@ extern "C" { #include #include +/** + * System self-test function, to ensure both library and calling program + * can use one another. + * + * A result of MSPACK_ERR_OK means the library and caller are + * compatible. Any other result indicates that the library and caller are + * not compatible and should not be used. In particular, a value of + * MSPACK_ERR_SEEK means the library and caller use different off_t + * datatypes. + * + * It should be used like so: + * + * @code + * int selftest_result; + * MSPACK_SYS_SELFTEST(selftest_result); + * if (selftest_result != MSPACK_ERR_OK) { + * fprintf(stderr, "incompatible with this build of libmspack\n"); + * exit(0); + * } + * @endcode + * + * @param result an int variable to store the result of the self-test + */ +#define MSPACK_SYS_SELFTEST(result) do { \ + (result) = mspack_sys_selftest_internal(sizeof(off_t)); \ +} while (0) + +/** Part of the MSPACK_SYS_SELFTEST() macro, must not be used directly. 
*/ +extern int mspack_sys_selftest_internal(int); + +/** + * Enquire about the binary compatibility version of a specific interface in + * the library. Currently, the following interfaces are defined: + * + * - #MSPACK_VER_LIBRARY: the overall library + * - #MSPACK_VER_SYSTEM: the mspack_system interface + * - #MSPACK_VER_MSCABD: the mscab_decompressor interface + * - #MSPACK_VER_MSCABC: the mscab_compressor interface + * - #MSPACK_VER_MSCHMD: the mschm_decompressor interface + * - #MSPACK_VER_MSCHMC: the mschm_compressor interface + * - #MSPACK_VER_MSLITD: the mslit_decompressor interface + * - #MSPACK_VER_MSLITC: the mslit_compressor interface + * - #MSPACK_VER_MSHLPD: the mshlp_decompressor interface + * - #MSPACK_VER_MSHLPC: the mshlp_compressor interface + * - #MSPACK_VER_MSSZDDD: the msszdd_decompressor interface + * - #MSPACK_VER_MSSZDDC: the msszdd_compressor interface + * - #MSPACK_VER_MSKWAJD: the mskwaj_decompressor interface + * - #MSPACK_VER_MSKWAJC: the mskwaj_compressor interface + * - #MSPACK_VER_MSOABD: the msoab_decompressor interface + * - #MSPACK_VER_MSOABC: the msoab_compressor interface + * + * The result of the function should be interpreted as follows: + * - -1: this interface is completely unknown to the library + * - 0: this interface is known, but non-functioning + * - 1: this interface has all basic functionality + * - 2, 3, ...: this interface has additional functionality, clearly marked + * in the documentation as "version 2", "version 3" and so on. 
+ * + * @param entity the interface to request current version of + * @return the version of the requested interface + */ +extern int mspack_version(int entity); + +/** Pass to mspack_version() to get the overall library version */ +#define MSPACK_VER_LIBRARY (0) +/** Pass to mspack_version() to get the mspack_system version */ +#define MSPACK_VER_SYSTEM (1) +/** Pass to mspack_version() to get the mscab_decompressor version */ +#define MSPACK_VER_MSCABD (2) +/** Pass to mspack_version() to get the mscab_compressor version */ +#define MSPACK_VER_MSCABC (3) +/** Pass to mspack_version() to get the mschm_decompressor version */ +#define MSPACK_VER_MSCHMD (4) +/** Pass to mspack_version() to get the mschm_compressor version */ +#define MSPACK_VER_MSCHMC (5) +/** Pass to mspack_version() to get the mslit_decompressor version */ +#define MSPACK_VER_MSLITD (6) +/** Pass to mspack_version() to get the mslit_compressor version */ +#define MSPACK_VER_MSLITC (7) +/** Pass to mspack_version() to get the mshlp_decompressor version */ +#define MSPACK_VER_MSHLPD (8) +/** Pass to mspack_version() to get the mshlp_compressor version */ +#define MSPACK_VER_MSHLPC (9) +/** Pass to mspack_version() to get the msszdd_decompressor version */ +#define MSPACK_VER_MSSZDDD (10) +/** Pass to mspack_version() to get the msszdd_compressor version */ +#define MSPACK_VER_MSSZDDC (11) +/** Pass to mspack_version() to get the mskwaj_decompressor version */ +#define MSPACK_VER_MSKWAJD (12) +/** Pass to mspack_version() to get the mskwaj_compressor version */ +#define MSPACK_VER_MSKWAJC (13) +/** Pass to mspack_version() to get the msoab_decompressor version */ +#define MSPACK_VER_MSOABD (14) +/** Pass to mspack_version() to get the msoab_compressor version */ +#define MSPACK_VER_MSOABC (15) + /* --- file I/O abstraction ------------------------------------------------ */ /** @@ -82,7 +286,7 @@ struct mspack_system { /** * Opens a file for reading, writing, appending or updating. 
* - * @param this a self-referential pointer to the mspack_system + * @param self a self-referential pointer to the mspack_system * structure whose open() method is being called. If * this pointer is required by close(), read(), write(), * seek() or tell(), it should be stored in the result @@ -99,12 +303,13 @@ struct mspack_system { * @return a pointer to a mspack_file structure. This structure officially * contains no members, its true contents are up to the * mspack_system implementor. It should contain whatever is needed - * for other mspack_system methods to operate. + * for other mspack_system methods to operate. Returning the NULL + * pointer indicates an error condition. * @see close(), read(), write(), seek(), tell(), message() */ - struct mspack_file * (*open)(struct mspack_system *sys, - char *filename, - int mode); + struct mspack_file * (*open)(struct mspack_system *self, + const char *filename, + int mode); /** * Closes a previously opened file. If any memory was allocated for this @@ -123,12 +328,14 @@ struct mspack_system { * @param bytes the number of bytes to read from the file. * @return the number of bytes successfully read (this can be less than * the number requested), zero to mark the end of file, or less - * than zero to indicate an error. + * than zero to indicate an error. The library does not "retry" + * reads and assumes short reads are due to EOF, so you should + * avoid returning short reads because of transient errors. * @see open(), write() */ int (*read)(struct mspack_file *file, - void *buffer, - int bytes); + void *buffer, + int bytes); /** * Writes a given number of bytes to an open file. @@ -144,8 +351,8 @@ struct mspack_system { * @see open(), read() */ int (*write)(struct mspack_file *file, - void *buffer, - int bytes); + void *buffer, + int bytes); /** * Seeks to a specific file offset within an open file. 
@@ -171,8 +378,8 @@ struct mspack_system { * @see open(), tell() */ int (*seek)(struct mspack_file *file, - off_t offset, - int mode); + off_t offset, + int mode); /** * Returns the current file position (in bytes) of the given file. @@ -198,26 +405,26 @@ struct mspack_system { * @see open() */ void (*message)(struct mspack_file *file, - char *format, - ...); + const char *format, + ...); /** * Allocates memory. * - * @param sys a self-referential pointer to the mspack_system + * @param self a self-referential pointer to the mspack_system * structure whose alloc() method is being called. * @param bytes the number of bytes to allocate * @result a pointer to the requested number of bytes, or NULL if * not enough memory is available * @see free() */ - void * (*alloc)(struct mspack_system *sys, - size_t bytes); + void * (*alloc)(struct mspack_system *self, + size_t bytes); /** * Frees memory. * - * @param ptr the memory to be freed. + * @param ptr the memory to be freed. NULL is accepted and ignored. * @see alloc() */ void (*free)(void *ptr); @@ -235,8 +442,8 @@ struct mspack_system { * @param bytes the size of the memory region, in bytes */ void (*copy)(void *src, - void *dest, - size_t bytes); + void *dest, + size_t bytes); /** * A null pointer to mark the end of mspack_system. It must equal NULL. @@ -299,8 +506,1857 @@ struct mspack_file { /** Error code: error during decompression */ #define MSPACK_ERR_DECRUNCH (11) -#ifdef __cplusplus +/* --- functions available in library -------------------------------------- */ + +/** Creates a new CAB compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_compressor or NULL + */ +extern struct mscab_compressor * + mspack_create_cab_compressor(struct mspack_system *sys); + +/** Creates a new CAB decompressor. 
+ * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mscab_decompressor or NULL + */ +extern struct mscab_decompressor * + mspack_create_cab_decompressor(struct mspack_system *sys); + +/** Destroys an existing CAB compressor. + * @param self the #mscab_compressor to destroy + */ +extern void mspack_destroy_cab_compressor(struct mscab_compressor *self); + +/** Destroys an existing CAB decompressor. + * @param self the #mscab_decompressor to destroy + */ +extern void mspack_destroy_cab_decompressor(struct mscab_decompressor *self); + + +/** Creates a new CHM compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_compressor or NULL + */ +extern struct mschm_compressor * + mspack_create_chm_compressor(struct mspack_system *sys); + +/** Creates a new CHM decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mschm_decompressor or NULL + */ +extern struct mschm_decompressor * + mspack_create_chm_decompressor(struct mspack_system *sys); + +/** Destroys an existing CHM compressor. + * @param self the #mschm_compressor to destroy + */ +extern void mspack_destroy_chm_compressor(struct mschm_compressor *self); + +/** Destroys an existing CHM decompressor. + * @param self the #mschm_decompressor to destroy + */ +extern void mspack_destroy_chm_decompressor(struct mschm_decompressor *self); + + +/** Creates a new LIT compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_compressor or NULL + */ +extern struct mslit_compressor * + mspack_create_lit_compressor(struct mspack_system *sys); + +/** Creates a new LIT decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mslit_decompressor or NULL + */ +extern struct mslit_decompressor * + mspack_create_lit_decompressor(struct mspack_system *sys); + +/** Destroys an existing LIT compressor. 
+ * @param self the #mslit_compressor to destroy + */ +extern void mspack_destroy_lit_compressor(struct mslit_compressor *self); + +/** Destroys an existing LIT decompressor. + * @param self the #mslit_decompressor to destroy + */ +extern void mspack_destroy_lit_decompressor(struct mslit_decompressor *self); + + +/** Creates a new HLP compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_compressor or NULL + */ +extern struct mshlp_compressor * + mspack_create_hlp_compressor(struct mspack_system *sys); + +/** Creates a new HLP decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mshlp_decompressor or NULL + */ +extern struct mshlp_decompressor * + mspack_create_hlp_decompressor(struct mspack_system *sys); + +/** Destroys an existing hlp compressor. + * @param self the #mshlp_compressor to destroy + */ +extern void mspack_destroy_hlp_compressor(struct mshlp_compressor *self); + +/** Destroys an existing hlp decompressor. + * @param self the #mshlp_decompressor to destroy + */ +extern void mspack_destroy_hlp_decompressor(struct mshlp_decompressor *self); + + +/** Creates a new SZDD compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_compressor or NULL + */ +extern struct msszdd_compressor * + mspack_create_szdd_compressor(struct mspack_system *sys); + +/** Creates a new SZDD decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msszdd_decompressor or NULL + */ +extern struct msszdd_decompressor * + mspack_create_szdd_decompressor(struct mspack_system *sys); + +/** Destroys an existing SZDD compressor. + * @param self the #msszdd_compressor to destroy + */ +extern void mspack_destroy_szdd_compressor(struct msszdd_compressor *self); + +/** Destroys an existing SZDD decompressor. 
+ * @param self the #msszdd_decompressor to destroy + */ +extern void mspack_destroy_szdd_decompressor(struct msszdd_decompressor *self); + + +/** Creates a new KWAJ compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_compressor or NULL + */ +extern struct mskwaj_compressor * + mspack_create_kwaj_compressor(struct mspack_system *sys); + +/** Creates a new KWAJ decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #mskwaj_decompressor or NULL + */ +extern struct mskwaj_decompressor * + mspack_create_kwaj_decompressor(struct mspack_system *sys); + +/** Destroys an existing KWAJ compressor. + * @param self the #mskwaj_compressor to destroy + */ +extern void mspack_destroy_kwaj_compressor(struct mskwaj_compressor *self); + +/** Destroys an existing KWAJ decompressor. + * @param self the #mskwaj_decompressor to destroy + */ +extern void mspack_destroy_kwaj_decompressor(struct mskwaj_decompressor *self); + + +/** Creates a new OAB compressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msoab_compressor or NULL + */ +extern struct msoab_compressor * + mspack_create_oab_compressor(struct mspack_system *sys); + +/** Creates a new OAB decompressor. + * @param sys a custom mspack_system structure, or NULL to use the default + * @return a #msoab_decompressor or NULL + */ +extern struct msoab_decompressor * + mspack_create_oab_decompressor(struct mspack_system *sys); + +/** Destroys an existing OAB compressor. + * @param self the #msoab_compressor to destroy + */ +extern void mspack_destroy_oab_compressor(struct msoab_compressor *self); + +/** Destroys an existing OAB decompressor. 
+ * @param self the #msoab_decompressor to destroy + */ +extern void mspack_destroy_oab_decompressor(struct msoab_decompressor *self); + + +/* --- support for .CAB (MS Cabinet) file format --------------------------- */ + +/** + * A structure which represents a single cabinet file. + * + * All fields are READ ONLY. + * + * If this cabinet is part of a merged cabinet set, the #files and #folders + * fields are common to all cabinets in the set, and will be identical. + * + * @see mscab_decompressor::open(), mscab_decompressor::close(), + * mscab_decompressor::search() + */ +struct mscabd_cabinet { + /** + * The next cabinet in a chained list, if this cabinet was opened with + * mscab_decompressor::search(). May be NULL to mark the end of the + * list. + */ + struct mscabd_cabinet *next; + + /** + * The filename of the cabinet. More correctly, the filename of the + * physical file that the cabinet resides in. This is given by the + * library user and may be in any format. + */ + const char *filename; + + /** The file offset of cabinet within the physical file it resides in. */ + off_t base_offset; + + /** The length of the cabinet file in bytes. */ + unsigned int length; + + /** The previous cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *prevcab; + + /** The next cabinet in a cabinet set, or NULL. */ + struct mscabd_cabinet *nextcab; + + /** The filename of the previous cabinet in a cabinet set, or NULL. */ + char *prevname; + + /** The filename of the next cabinet in a cabinet set, or NULL. */ + char *nextname; + + /** The name of the disk containing the previous cabinet in a cabinet + * set, or NULL. + */ + char *previnfo; + + /** The name of the disk containing the next cabinet in a cabinet set, + * or NULL. + */ + char *nextinfo; + + /** A list of all files in the cabinet or cabinet set. */ + struct mscabd_file *files; + + /** A list of all folders in the cabinet or cabinet set. 
*/ + struct mscabd_folder *folders; + + /** + * The set ID of the cabinet. All cabinets in the same set should have + * the same set ID. + */ + unsigned short set_id; + + /** + * The index number of the cabinet within the set. Numbering should + * start from 0 for the first cabinet in the set, and increment by 1 for + * each following cabinet. + */ + unsigned short set_index; + + /** + * The number of bytes reserved in the header area of the cabinet. + * + * If this is non-zero and flags has MSCAB_HDR_RESV set, this data can + * be read by the calling application. It is of the given length, + * located at offset (base_offset + MSCAB_HDR_RESV_OFFSET) in the + * cabinet file. + * + * @see flags + */ + unsigned short header_resv; + + /** + * Header flags. + * + * - MSCAB_HDR_PREVCAB indicates the cabinet is part of a cabinet set, and + * has a predecessor cabinet. + * - MSCAB_HDR_NEXTCAB indicates the cabinet is part of a cabinet set, and + * has a successor cabinet. + * - MSCAB_HDR_RESV indicates the cabinet has reserved header space. + * + * @see prevname, previnfo, nextname, nextinfo, header_resv + */ + int flags; }; + +/** Offset from start of cabinet to the reserved header data (if present). */ +#define MSCAB_HDR_RESV_OFFSET (0x28) + +/** Cabinet header flag: cabinet has a predecessor */ +#define MSCAB_HDR_PREVCAB (0x01) +/** Cabinet header flag: cabinet has a successor */ +#define MSCAB_HDR_NEXTCAB (0x02) +/** Cabinet header flag: cabinet has reserved header space */ +#define MSCAB_HDR_RESV (0x04) + +/** + * A structure which represents a single folder in a cabinet or cabinet set. + * + * All fields are READ ONLY. + * + * A folder is a single compressed stream of data. When uncompressed, it + * holds the data of one or more files. A folder may be split across more + * than one cabinet. + */ +struct mscabd_folder { + /** + * A pointer to the next folder in this cabinet or cabinet set, or NULL + * if this is the final folder. 
+ */ + struct mscabd_folder *next; + + /** + * The compression format used by this folder. + * + * The macro MSCABD_COMP_METHOD() should be used on this field to get + * the algorithm used. The macro MSCABD_COMP_LEVEL() should be used to get + * the "compression level". + * + * @see MSCABD_COMP_METHOD(), MSCABD_COMP_LEVEL() + */ + int comp_type; + + /** + * The total number of data blocks used by this folder. This includes + * data blocks present in other files, if this folder spans more than + * one cabinet. + */ + unsigned int num_blocks; +}; + +/** + * Returns the compression method used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return one of #MSCAB_COMP_NONE, #MSCAB_COMP_MSZIP, #MSCAB_COMP_QUANTUM + * or #MSCAB_COMP_LZX + */ +#define MSCABD_COMP_METHOD(comp_type) ((comp_type) & 0x0F) +/** + * Returns the compression level used by a folder. + * + * @param comp_type a mscabd_folder::comp_type value + * @return the compression level. This is only defined by LZX and Quantum + * compression + */ +#define MSCABD_COMP_LEVEL(comp_type) (((comp_type) >> 8) & 0x1F) + +/** Compression mode: no compression. */ +#define MSCAB_COMP_NONE (0) +/** Compression mode: MSZIP (deflate) compression. */ +#define MSCAB_COMP_MSZIP (1) +/** Compression mode: Quantum compression */ +#define MSCAB_COMP_QUANTUM (2) +/** Compression mode: LZX compression */ +#define MSCAB_COMP_LZX (3) + +/** + * A structure which represents a single file in a cabinet or cabinet set. + * + * All fields are READ ONLY. + */ +struct mscabd_file { + /** + * The next file in the cabinet or cabinet set, or NULL if this is the + * final file. + */ + struct mscabd_file *next; + + /** + * The filename of the file. + * + * A null terminated string of up to 255 bytes in length, it may be in + * either ISO-8859-1 or UTF8 format, depending on the file attributes. + * + * @see attribs + */ + char *filename; + + /** The uncompressed length of the file, in bytes. 
 */
+ unsigned int length;
+
+ /**
+ * File attributes.
+ *
+ * The following attributes are defined:
+ * - #MSCAB_ATTRIB_RDONLY indicates the file is write protected.
+ * - #MSCAB_ATTRIB_HIDDEN indicates the file is hidden.
+ * - #MSCAB_ATTRIB_SYSTEM indicates the file is an operating system file.
+ * - #MSCAB_ATTRIB_ARCH indicates the file is "archived".
+ * - #MSCAB_ATTRIB_EXEC indicates the file is an executable program.
+ * - #MSCAB_ATTRIB_UTF_NAME indicates the filename is in UTF8 format rather
+ * than ISO-8859-1.
+ */
+ int attribs;
+
+ /** File's last modified time, hour field. */
+ char time_h;
+ /** File's last modified time, minute field. */
+ char time_m;
+ /** File's last modified time, second field. */
+ char time_s;
+
+ /** File's last modified date, day field. */
+ char date_d;
+ /** File's last modified date, month field. */
+ char date_m;
+ /** File's last modified date, year field. */
+ int date_y;
+
+ /** A pointer to the folder that contains this file. */
+ struct mscabd_folder *folder;
+
+ /** The uncompressed offset of this file in its folder. */
+ unsigned int offset;
+};
+
+/** mscabd_file::attribs attribute: file is read-only. */
+#define MSCAB_ATTRIB_RDONLY (0x01)
+/** mscabd_file::attribs attribute: file is hidden. */
+#define MSCAB_ATTRIB_HIDDEN (0x02)
+/** mscabd_file::attribs attribute: file is an operating system file. */
+#define MSCAB_ATTRIB_SYSTEM (0x04)
+/** mscabd_file::attribs attribute: file is "archived". */
+#define MSCAB_ATTRIB_ARCH (0x20)
+/** mscabd_file::attribs attribute: file is an executable program. */
+#define MSCAB_ATTRIB_EXEC (0x40)
+/** mscabd_file::attribs attribute: filename is UTF8, not ISO-8859-1. */
+#define MSCAB_ATTRIB_UTF_NAME (0x80)
+
+/** mscab_decompressor::set_param() parameter: search buffer size. */
+#define MSCABD_PARAM_SEARCHBUF (0)
+/** mscab_decompressor::set_param() parameter: repair MS-ZIP streams?
 */
+#define MSCABD_PARAM_FIXMSZIP (1)
+/** mscab_decompressor::set_param() parameter: size of decompression buffer */
+#define MSCABD_PARAM_DECOMPBUF (2)
+/** mscab_decompressor::set_param() parameter: salvage data from bad cabinets?
+ * If enabled, open() will skip files with bad folder indices or filenames
+ * rather than reject the whole cabinet, and extract() will limit rather than
+ * reject files with invalid offsets and lengths, and bad data block checksums
+ * will be ignored. Available only in CAB decoder version 2 and above.
+ */
+#define MSCABD_PARAM_SALVAGE (3)
+
+/** TODO */
+struct mscab_compressor {
+ int dummy;
+};
+
+/**
+ * A decompressor for .CAB (Microsoft Cabinet) files
+ *
+ * All fields are READ ONLY.
+ *
+ * @see mspack_create_cab_decompressor(), mspack_destroy_cab_decompressor()
+ */
+struct mscab_decompressor {
+ /**
+ * Opens a cabinet file and reads its contents.
+ *
+ * If the file opened is a valid cabinet file, all headers will be read
+ * and a mscabd_cabinet structure will be returned, with a full list of
+ * folders and files.
+ *
+ * In the case of an error occurring, NULL is returned and the error code
+ * is available from last_error().
+ *
+ * The filename pointer should be considered "in use" until close() is
+ * called on the cabinet.
+ *
+ * @param self a self-referential pointer to the mscab_decompressor
+ * instance being called
+ * @param filename the filename of the cabinet file. This is passed
+ * directly to mspack_system::open().
+ * @return a pointer to a mscabd_cabinet structure, or NULL on failure
+ * @see close(), search(), last_error()
+ */
+ struct mscabd_cabinet * (*open) (struct mscab_decompressor *self,
+ const char *filename);
+
+ /**
+ * Closes a previously opened cabinet or cabinet set.
+ *
+ * This closes a cabinet, all cabinets associated with it via the
+ * mscabd_cabinet::next, mscabd_cabinet::prevcab and
+ * mscabd_cabinet::nextcab pointers, and all folders and files.
All
+ * memory used by these entities is freed.
+ *
+ * The cabinet pointer is now invalid and cannot be used again. All
+ * mscabd_folder and mscabd_file pointers from that cabinet or cabinet
+ * set are also now invalid, and cannot be used again.
+ *
+ * If the cabinet pointer given was created using search(), it MUST be
+ * the cabinet pointer returned by search() and not one of the later
+ * cabinet pointers further along the mscabd_cabinet::next chain.
+ *
+ * If extra cabinets have been added using append() or prepend(), these
+ * will all be freed, even if the cabinet pointer given is not the first
+ * cabinet in the set. Do NOT close() more than one cabinet in the set.
+ *
+ * The mscabd_cabinet::filename is not freed by the library, as it is
+ * not allocated by the library. The caller should free this itself if
+ * necessary, before it is lost forever.
+ *
+ * @param self a self-referential pointer to the mscab_decompressor
+ * instance being called
+ * @param cab the cabinet to close
+ * @see open(), search(), append(), prepend()
+ */
+ void (*close)(struct mscab_decompressor *self,
+ struct mscabd_cabinet *cab);
+
+ /**
+ * Searches a regular file for embedded cabinets.
+ *
+ * This opens a normal file with the given filename and will search the
+ * entire file for embedded cabinet files
+ *
+ * If any cabinets are found, the equivalent of open() is called on each
+ * potential cabinet file at the offset it was found. All successfully
+ * open()ed cabinets are kept in a list.
+ *
+ * The first cabinet found will be returned directly as the result of
+ * this method. Any further cabinets found will be chained in a list
+ * using the mscabd_cabinet::next field.
+ *
+ * In the case of an error occurring anywhere other than the simulated
+ * open(), NULL is returned and the error code is available from
+ * last_error().
+ *
+ * If no error occurs, but no cabinets can be found in the file, NULL is
+ * returned and last_error() returns MSPACK_ERR_OK.
+ *
+ * The filename pointer should be considered in use until close() is
+ * called on the cabinet.
+ *
+ * close() should only be called on the result of search(), not on any
+ * subsequent cabinets in the mscabd_cabinet::next chain.
+ *
+ * @param self a self-referential pointer to the mscab_decompressor
+ * instance being called
+ * @param filename the filename of the file to search for cabinets. This
+ * is passed directly to mspack_system::open().
+ * @return a pointer to a mscabd_cabinet structure, or NULL
+ * @see close(), open(), last_error()
+ */
+ struct mscabd_cabinet * (*search) (struct mscab_decompressor *self,
+ const char *filename);
+
+ /**
+ * Appends one mscabd_cabinet to another, forming or extending a cabinet
+ * set.
+ *
+ * This will attempt to append one cabinet to another such that
+ * (cab->nextcab == nextcab) && (nextcab->prevcab == cab) and
+ * any folders split between the two cabinets are merged.
+ *
+ * The cabinets MUST be part of a cabinet set -- a cabinet set is a
+ * cabinet that spans more than one physical cabinet file on disk -- and
+ * must be appropriately matched.
+ *
+ * It can be determined if a cabinet has further parts to load by
+ * examining the mscabd_cabinet::flags field:
+ *
+ * - if (flags & MSCAB_HDR_PREVCAB) is non-zero, there is a
+ * predecessor cabinet to open() and prepend(). Its MS-DOS
+ * case-insensitive filename is mscabd_cabinet::prevname
+ * - if (flags & MSCAB_HDR_NEXTCAB) is non-zero, there is a
+ * successor cabinet to open() and append(). Its MS-DOS case-insensitive
+ * filename is mscabd_cabinet::nextname
+ *
+ * If the cabinets do not match, an error code will be returned. Neither
+ * cabinet has been altered, and both should be closed separately.
+ *
+ * Files and folders in a cabinet set are a single entity. All cabinets
+ * in a set use the same file list, which is updated as cabinets in the
+ * set are added.
All pointers to mscabd_folder and mscabd_file + * structures in either cabinet must be discarded and re-obtained after + * merging. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be appended to, + * predecessor of nextcab + * @param nextcab the cabinet which will be appended, + * successor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see prepend(), open(), close() + */ + int (*append) (struct mscab_decompressor *self, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *nextcab); + + /** + * Prepends one mscabd_cabinet to another, forming or extending a + * cabinet set. + * + * This will attempt to prepend one cabinet to another, such that + * (cab->prevcab == prevcab) && (prevcab->nextcab == cab). In + * all other respects, it is identical to append(). See append() for the + * full documentation. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param cab the cabinet which will be prepended to, + * successor of prevcab + * @param prevcab the cabinet which will be prepended, + * predecessor of cab + * @return an error code, or MSPACK_ERR_OK if successful + * @see append(), open(), close() + */ + int (*prepend) (struct mscab_decompressor *self, + struct mscabd_cabinet *cab, + struct mscabd_cabinet *prevcab); + + /** + * Extracts a file from a cabinet or cabinet set. + * + * This extracts a compressed file in a cabinet and writes it to the given + * filename. + * + * The MS-DOS filename of the file, mscabd_file::filename, is NOT USED + * by extract(). 
The caller must examine this MS-DOS filename, copy and + * change it as necessary, create directories as necessary, and provide + * the correct filename as a parameter, which will be passed unchanged + * to the decompressor's mspack_system::open() + * + * If the file belongs to a split folder in a multi-part cabinet set, + * and not enough parts of the cabinet set have been loaded and appended + * or prepended, an error will be returned immediately. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mscab_decompressor *self, + struct mscabd_file *file, + const char *filename); + + /** + * Sets a CAB decompression engine parameter. + * + * The following parameters are defined: + * - #MSCABD_PARAM_SEARCHBUF: How many bytes should be allocated as a + * buffer when using search()? The minimum value is 4. The default + * value is 32768. + * - #MSCABD_PARAM_FIXMSZIP: If non-zero, extract() will ignore bad + * checksums and recover from decompression errors in MS-ZIP + * compressed folders. The default value is 0 (don't recover). + * - #MSCABD_PARAM_DECOMPBUF: How many bytes should be used as an input + * bit buffer by decompressors? The minimum value is 4. The default + * value is 4096. + * + * @param self a self-referential pointer to the mscab_decompressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see search(), extract() + */ + int (*set_param)(struct mscab_decompressor *self, + int param, + int value); + + /** + * Returns the error code set by the most recently called method. 
+ *
+ * This is useful for open() and search(), which do not return an error
+ * code directly.
+ *
+ * @param self a self-referential pointer to the mscab_decompressor
+ * instance being called
+ * @return the most recent error code
+ * @see open(), search()
+ */
+ int (*last_error)(struct mscab_decompressor *self);
+};
+
+/* --- support for .CHM (HTMLHelp) file format ----------------------------- */
+
+/**
+ * A structure which represents a file to be placed in a CHM helpfile.
+ *
+ * A contiguous array of these structures should be passed to
+ * mschm_compressor::generate(). The array list is terminated with an
+ * entry whose mschmc_file::section field is set to #MSCHMC_ENDLIST, the
+ * other fields in this entry are ignored.
+ */
+struct mschmc_file {
+ /** One of #MSCHMC_ENDLIST, #MSCHMC_UNCOMP or #MSCHMC_MSCOMP. */
+ int section;
+
+ /** The filename of the source file that will be added to the CHM. This
+ * is passed directly to mspack_system::open(). */
+ const char *filename;
+
+ /** The full path and filename of the file within the CHM helpfile, a
+ * UTF-8 encoded null-terminated string. */
+ char *chm_filename;
+
+ /** The length of the file, in bytes. This will be adhered to strictly
+ * and a read error will be issued if this many bytes cannot be read
+ * from the real file at CHM generation time. */
+ off_t length;
+};
+
+/**
+ * A structure which represents a section of a CHM helpfile.
+ *
+ * All fields are READ ONLY.
+ *
+ * Not used directly, but used as a generic base type for
+ * mschmd_sec_uncompressed and mschmd_sec_mscompressed.
+ */
+struct mschmd_section {
+ /** A pointer to the CHM helpfile that contains this section. */
+ struct mschmd_header *chm;
+
+ /**
+ * The section ID. Either 0 for the uncompressed section
+ * mschmd_sec_uncompressed, or 1 for the LZX compressed section
+ * mschmd_sec_mscompressed. No other section IDs are known.
+ */ + unsigned int id; +}; + +/** + * A structure which represents the uncompressed section of a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_sec_uncompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** The file offset of where this section begins in the CHM helpfile. */ + off_t offset; +}; + +/** + * A structure which represents the LZX compressed section of a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_sec_mscompressed { + /** Generic section data. */ + struct mschmd_section base; + + /** A pointer to the meta-file which represents all LZX compressed data. */ + struct mschmd_file *content; + + /** A pointer to the file which contains the LZX control data. */ + struct mschmd_file *control; + + /** A pointer to the file which contains the LZX reset table. */ + struct mschmd_file *rtable; + + /** A pointer to the file which contains the LZX span information. + * Available only in CHM decoder version 2 and above. + */ + struct mschmd_file *spaninfo; +}; + +/** + * A structure which represents a CHM helpfile. + * + * All fields are READ ONLY. + */ +struct mschmd_header { + /** The version of the CHM file format used in this file. */ + unsigned int version; + + /** + * The "timestamp" of the CHM helpfile. + * + * It is the lower 32 bits of a 64-bit value representing the number of + * centiseconds since 1601-01-01 00:00:00 UTC, plus 42. It is not useful + * as a timestamp, but it is useful as a semi-unique ID. + */ + unsigned int timestamp; + + /** + * The default Language and Country ID (LCID) of the user who ran the + * HTMLHelp Compiler. This is not the language of the CHM file itself. + */ + unsigned int language; + + /** + * The filename of the CHM helpfile. This is given by the library user + * and may be in any format. + */ + const char *filename; + + /** The length of the CHM helpfile, in bytes. */ + off_t length; + + /** A list of all non-system files in the CHM helpfile. 
*/ + struct mschmd_file *files; + + /** + * A list of all system files in the CHM helpfile. + * + * System files are files which begin with "::". They are meta-files + * generated by the CHM creation process. + */ + struct mschmd_file *sysfiles; + + /** The section 0 (uncompressed) data in this CHM helpfile. */ + struct mschmd_sec_uncompressed sec0; + + /** The section 1 (MSCompressed) data in this CHM helpfile. */ + struct mschmd_sec_mscompressed sec1; + + /** The file offset of the first PMGL/PMGI directory chunk. */ + off_t dir_offset; + + /** The number of PMGL/PMGI directory chunks in this CHM helpfile. */ + unsigned int num_chunks; + + /** The size of each PMGL/PMGI chunk, in bytes. */ + unsigned int chunk_size; + + /** The "density" of the quick-reference section in PMGL/PMGI chunks. */ + unsigned int density; + + /** The depth of the index tree. + * + * - if 1, there are no PMGI chunks, only PMGL chunks. + * - if 2, there is 1 PMGI chunk. All chunk indices point to PMGL chunks. + * - if 3, the root PMGI chunk points to secondary PMGI chunks, which in + * turn point to PMGL chunks. + * - and so on... + */ + unsigned int depth; + + /** + * The number of the root PMGI chunk. + * + * If there is no index in the CHM helpfile, this will be 0xFFFFFFFF. + */ + unsigned int index_root; + + /** + * The number of the first PMGL chunk. Usually zero. + * Available only in CHM decoder version 2 and above. + */ + unsigned int first_pmgl; + + /** + * The number of the last PMGL chunk. Usually num_chunks-1. + * Available only in CHM decoder version 2 and above. + */ + unsigned int last_pmgl; + + /** + * A cache of loaded chunks, filled in by mschm_decoder::fast_find(). + * Available only in CHM decoder version 2 and above. + */ + unsigned char **chunk_cache; +}; + +/** + * A structure which represents a file stored in a CHM helpfile. + * + * All fields are READ ONLY. 
+ */ +struct mschmd_file { + /** + * A pointer to the next file in the list, or NULL if this is the final + * file. + */ + struct mschmd_file *next; + + /** + * A pointer to the section that this file is located in. Indirectly, + * it also points to the CHM helpfile the file is located in. + */ + struct mschmd_section *section; + + /** The offset within the section data that this file is located at. */ + off_t offset; + + /** The length of this file, in bytes */ + off_t length; + + /** The filename of this file -- a null terminated string in UTF-8. */ + char *filename; +}; + +/** mschmc_file::section value: end of CHM file list */ +#define MSCHMC_ENDLIST (0) +/** mschmc_file::section value: this file is in the Uncompressed section */ +#define MSCHMC_UNCOMP (1) +/** mschmc_file::section value: this file is in the MSCompressed section */ +#define MSCHMC_MSCOMP (2) + +/** mschm_compressor::set_param() parameter: "timestamp" header */ +#define MSCHMC_PARAM_TIMESTAMP (0) +/** mschm_compressor::set_param() parameter: "language" header */ +#define MSCHMC_PARAM_LANGUAGE (1) +/** mschm_compressor::set_param() parameter: LZX window size */ +#define MSCHMC_PARAM_LZXWINDOW (2) +/** mschm_compressor::set_param() parameter: intra-chunk quickref density */ +#define MSCHMC_PARAM_DENSITY (3) +/** mschm_compressor::set_param() parameter: whether to create indices */ +#define MSCHMC_PARAM_INDEX (4) + +/** + * A compressor for .CHM (Microsoft HTMLHelp) files. + * + * All fields are READ ONLY. + * + * @see mspack_create_chm_compressor(), mspack_destroy_chm_compressor() + */ +struct mschm_compressor { + /** + * Generates a CHM help file. + * + * The help file will contain up to two sections, an Uncompressed + * section and potentially an MSCompressed (LZX compressed) + * section. + * + * While the contents listing of a CHM file is always in lexical order, + * the file list passed in will be taken as the correct order for files + * within the sections. 
It is in your interest to place similar files + * together for better compression. + * + * There are two modes of generation, to use a temporary file or not to + * use one. See use_temporary_file() for the behaviour of generate() in + * these two different modes. + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @param file_list an array of mschmc_file structures, terminated + * with an entry whose mschmc_file::section field is + * #MSCHMC_ENDLIST. The order of the list is + * preserved within each section. The length of any + * mschmc_file::chm_filename string cannot exceed + * roughly 4096 bytes. Each source file must be able + * to supply as many bytes as given in the + * mschmc_file::length field. + * @param output_file the file to write the generated CHM helpfile to. + * This is passed directly to mspack_system::open() + * @return an error code, or MSPACK_ERR_OK if successful + * @see use_temporary_file() set_param() + */ + int (*generate)(struct mschm_compressor *self, + struct mschmc_file file_list[], + const char *output_file); + + /** + * Specifies whether a temporary file is used during CHM generation. + * + * The CHM file format includes data about the compressed section (such + * as its overall size) that is stored in the output CHM file prior to + * the compressed section itself. This unavoidably requires that the + * compressed section has to be generated, before these details can be + * set. There are several ways this can be handled. Firstly, the + * compressed section could be generated entirely in memory before + * writing any of the output CHM file. This approach is not used in + * libmspack, as the compressed section can exceed the addressable + * memory space on most architectures. 
+ * + * libmspack has two options, either to write these unknowable sections + * with blank data, generate the compressed section, then re-open the + * output file for update once the compressed section has been + * completed, or to write the compressed section to a temporary file, + * then write the entire output file at once, performing a simple + * file-to-file copy for the compressed section. + * + * The simple solution of buffering the entire compressed section in + * memory can still be used, if desired. As the temporary file's + * filename is passed directly to mspack_system::open(), it is possible + * for a custom mspack_system implementation to hold this file in memory, + * without writing to a disk. + * + * If a temporary file is set, generate() performs the following + * sequence of events: the temporary file is opened for writing, the + * compression algorithm writes to the temporary file, the temporary + * file is closed. Then the output file is opened for writing and the + * temporary file is re-opened for reading. The output file is written + * and the temporary file is read from. Both files are then closed. The + * temporary file itself is not deleted. If that is desired, the + * temporary file should be deleted after the completion of generate(), + * if it exists. + * + * If a temporary file is set not to be used, generate() performs the + * following sequence of events: the output file is opened for writing, + * then it is written and closed. The output file is then re-opened for + * update, the appropriate sections are seek()ed to and re-written, then + * the output file is closed. + * + * @param self a self-referential pointer to the + * mschm_compressor instance being called + * @param use_temp_file non-zero if the temporary file should be used, + * zero if the temporary file should not be used. + * @param temp_file a file to temporarily write compressed data to, + * before opening it for reading and copying the + * contents to the output file. 
This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + * @see generate() + */ + int (*use_temporary_file)(struct mschm_compressor *self, + int use_temp_file, + const char *temp_file); + /** + * Sets a CHM compression engine parameter. + * + * The following parameters are defined: + + * - #MSCHMC_PARAM_TIMESTAMP: Sets the "timestamp" of the CHM file + * generated. This is not a timestamp, see mschmd_header::timestamp + * for a description. If this timestamp is 0, generate() will use its + * own algorithm for making a unique ID, based on the lengths and + * names of files in the CHM itself. Defaults to 0, any value between + * 0 and (2^32)-1 is valid. + * - #MSCHMC_PARAM_LANGUAGE: Sets the "language" of the CHM file + * generated. This is not the language used in the CHM file, but the + * language setting of the user who ran the HTMLHelp compiler. It + * defaults to 0x0409. The valid range is between 0x0000 and 0x7F7F. + * - #MSCHMC_PARAM_LZXWINDOW: Sets the size of the LZX history window, + * which is also the interval at which the compressed data stream can be + * randomly accessed. The value is not a size in bytes, but a power of + * two. The default value is 16 (which makes the window 2^16 bytes, or + * 64 kilobytes), the valid range is from 15 (32 kilobytes) to 21 (2 + * megabytes). + * - #MSCHMC_PARAM_DENSITY: Sets the "density" of quick reference + * entries stored at the end of directory listing chunk. Each chunk is + * 4096 bytes in size, and contains as many file entries as there is + * room for. At the other end of the chunk, a list of "quick reference" + * pointers is included. The offset of every 'N'th file entry is given a + * quick reference, where N = (2^density) + 1. The default density is + * 2. The smallest density is 0 (N=2), the maximum is 10 (N=1025). 
As + * each file entry requires at least 5 bytes, the maximum number of + * entries in a single chunk is roughly 800, so the maximum value 10 + * can be used to indicate there are no quickrefs at all. + * - #MSCHMC_PARAM_INDEX: Sets whether or not to include quick lookup + * index chunk(s), in addition to normal directory listing chunks. A + * value of zero means no index chunks will be created, a non-zero value + * means index chunks will be created. The default is zero, "don't + * create an index". + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see generate() + */ + int (*set_param)(struct mschm_compressor *self, + int param, + unsigned int value); + + /** + * Returns the error code set by the most recently called method. + * + * @param self a self-referential pointer to the mschm_compressor + * instance being called + * @return the most recent error code + * @see set_param(), generate() + */ + int (*last_error)(struct mschm_compressor *self); +}; + +/** + * A decompressor for .CHM (Microsoft HTMLHelp) files + * + * All fields are READ ONLY. + * + * @see mspack_create_chm_decompressor(), mspack_destroy_chm_decompressor() + */ +struct mschm_decompressor { + /** + * Opens a CHM helpfile and reads its contents. + * + * If the file opened is a valid CHM helpfile, all headers will be read + * and a mschmd_header structure will be returned, with a full list of + * files. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile. 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see close() + */ + struct mschmd_header *(*open)(struct mschm_decompressor *self, + const char *filename); + + /** + * Closes a previously opened CHM helpfile. + * + * This closes a CHM helpfile, frees the mschmd_header and all + * mschmd_file structures associated with it (if any). This works on + * both helpfiles opened with open() and helpfiles opened with + * fast_open(). + * + * The CHM header pointer is now invalid and cannot be used again. All + * mschmd_file pointers referencing that CHM are also now invalid, and + * cannot be used again. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to close + * @see open(), fast_open() + */ + void (*close)(struct mschm_decompressor *self, + struct mschmd_header *chm); + + /** + * Extracts a file from a CHM helpfile. + * + * This extracts a file from a CHM helpfile and writes it to the given + * filename. The filename of the file, mscabd_file::filename, is not + * used by extract(), but can be used by the caller as a guide for + * constructing an appropriate filename. + * + * This method works both with files found in the mschmd_header::files + * and mschmd_header::sysfiles list and mschmd_file structures generated + * on the fly by fast_find(). 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param file the file to be decompressed + * @param filename the filename of the file being written to + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mschm_decompressor *self, + struct mschmd_file *file, + const char *filename); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() and fast_open(), which do not return an + * error code directly. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @return the most recent error code + * @see open(), extract() + */ + int (*last_error)(struct mschm_decompressor *self); + + /** + * Opens a CHM helpfile quickly. + * + * If the file opened is a valid CHM helpfile, only essential headers + * will be read. A mschmd_header structure will be still be returned, as + * with open(), but the mschmd_header::files field will be NULL. No + * files details will be automatically read. The fast_find() method + * must be used to obtain file details. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the CHM helpfile. + * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param filename the filename of the CHM helpfile. This is passed + * directly to mspack_system::open(). + * @return a pointer to a mschmd_header structure, or NULL on failure + * @see open(), close(), fast_find(), extract() + */ + struct mschmd_header *(*fast_open)(struct mschm_decompressor *self, + const char *filename); + + /** + * Finds file details quickly. 
+ * + * Instead of reading all CHM helpfile headers and building a list of + * files, fast_open() and fast_find() are intended for finding file + * details only when they are needed. The CHM file format includes an + * on-disk file index to allow this. + * + * Given a case-sensitive filename, fast_find() will search the on-disk + * index for that file. + * + * If the file was found, the caller-provided mschmd_file structure will + * be filled out like so: + * - section: the correct value for the found file + * - offset: the correct value for the found file + * - length: the correct value for the found file + * - all other structure elements: NULL or 0 + * + * If the file was not found, MSPACK_ERR_OK will still be returned as the + * result, but the caller-provided structure will be filled out like so: + * - section: NULL + * - offset: 0 + * - length: 0 + * - all other structure elements: NULL or 0 + * + * This method is intended to be used in conjunction with CHM helpfiles + * opened with fast_open(), but it also works with helpfiles opened + * using the regular open(). 
+ * + * @param self a self-referential pointer to the mschm_decompressor + * instance being called + * @param chm the CHM helpfile to search for the file + * @param filename the filename of the file to search for + * @param f_ptr a pointer to a caller-provded mschmd_file structure + * @param f_size sizeof(struct mschmd_file) + * @return an error code, or MSPACK_ERR_OK if successful + * @see open(), close(), fast_find(), extract() + */ + int (*fast_find)(struct mschm_decompressor *self, + struct mschmd_header *chm, + const char *filename, + struct mschmd_file *f_ptr, + int f_size); +}; + +/* --- support for .LIT (EBook) file format -------------------------------- */ + +/** TODO */ +struct mslit_compressor { + int dummy; +}; + +/** TODO */ +struct mslit_decompressor { + int dummy; +}; + + +/* --- support for .HLP (MS Help) file format ------------------------------ */ + +/** TODO */ +struct mshlp_compressor { + int dummy; +}; + +/** TODO */ +struct mshlp_decompressor { + int dummy; +}; + + +/* --- support for SZDD file format ---------------------------------------- */ + +/** msszdd_compressor::set_param() parameter: the missing character */ +#define MSSZDDC_PARAM_MISSINGCHAR (0) + +/** msszddd_header::format value - a regular SZDD file */ +#define MSSZDD_FMT_NORMAL (0) + +/** msszddd_header::format value - a special QBasic SZDD file */ +#define MSSZDD_FMT_QBASIC (1) + +/** + * A structure which represents an SZDD compressed file. + * + * All fields are READ ONLY. + */ +struct msszddd_header { + /** The file format; either #MSSZDD_FMT_NORMAL or #MSSZDD_FMT_QBASIC */ + int format; + + /** The amount of data in the SZDD file once uncompressed. */ + off_t length; + + /** + * The last character in the filename, traditionally replaced with an + * underscore to show the file is compressed. The null character is used + * to show that this character has not been stored (e.g. because the + * filename is not known). 
Generally, only characters that may appear in + * an MS-DOS filename (except ".") are valid. + */ + char missing_char; +}; + +/** + * A compressor for the SZDD file format. + * + * All fields are READ ONLY. + * + * @see mspack_create_szdd_compressor(), mspack_destroy_szdd_compressor() + */ +struct msszdd_compressor { + /** + * Reads an input file and creates a compressed output file in the + * SZDD compressed file format. The SZDD compression format is quick + * but gives poor compression. It is possible for the compressed output + * file to be larger than the input file. + * + * Conventionally, SZDD compressed files have the final character in + * their filename replaced with an underscore, to show they are + * compressed. The missing character is stored in the compressed file + * itself. This is due to the restricted filename conventions of MS-DOS, + * most operating systems, such as UNIX, simply append another file + * extension to the existing filename. As mspack does not deal with + * filenames, this is left up to you. If you wish to set the missing + * character stored in the file header, use set_param() with the + * #MSSZDDC_PARAM_MISSINGCHAR parameter. + * + * "Stream" compression (where the length of the input data is not + * known) is not possible. The length of the input data is stored in the + * header of the SZDD file and must therefore be known before any data + * is compressed. Due to technical limitations of the file format, the + * maximum size of uncompressed file that will be accepted is 2147483647 + * bytes. + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @param input the name of the file to compressed. This is passed + * passed directly to mspack_system::open() + * @param output the name of the file to write compressed data to. + * This is passed directly to mspack_system::open(). 
+ * @param length the length of the uncompressed file, or -1 to indicate + * that this should be determined automatically by using + * mspack_system::seek() on the input file. + * @return an error code, or MSPACK_ERR_OK if successful + * @see set_param() + */ + int (*compress)(struct msszdd_compressor *self, + const char *input, + const char *output, + off_t length); + + /** + * Sets an SZDD compression engine parameter. + * + * The following parameters are defined: + + * - #MSSZDDC_PARAM_CHARACTER: the "missing character", the last character + * in the uncompressed file's filename, which is traditionally replaced + * with an underscore to show the file is compressed. Traditionally, + * this can only be a character that is a valid part of an MS-DOS, + * filename, but libmspack permits any character between 0x00 and 0xFF + * to be stored. 0x00 is the default, and it represents "no character + * stored". + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. + * @see compress() + */ + int (*set_param)(struct msszdd_compressor *self, + int param, + unsigned int value); + + /** + * Returns the error code set by the most recently called method. + * + * @param self a self-referential pointer to the msszdd_compressor + * instance being called + * @return the most recent error code + * @see compress() + */ + int (*last_error)(struct mschm_decompressor *self); +}; + +/** + * A decompressor for SZDD compressed files. + * + * All fields are READ ONLY. + * + * @see mspack_create_szdd_decompressor(), mspack_destroy_szdd_decompressor() + */ +struct msszdd_decompressor { + /** + * Opens a SZDD file and reads the header. 
+ * + * If the file opened is a valid SZDD file, all headers will be read and + * a msszddd_header structure will be returned. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the SZDD file. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param filename the filename of the SZDD compressed file. This is + * passed directly to mspack_system::open(). + * @return a pointer to a msszddd_header structure, or NULL on failure + * @see close() + */ + struct msszddd_header *(*open)(struct msszdd_decompressor *self, + const char *filename); + + /** + * Closes a previously opened SZDD file. + * + * This closes a SZDD file and frees the msszddd_header associated with + * it. + * + * The SZDD header pointer is now invalid and cannot be used again. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param szdd the SZDD file to close + * @see open() + */ + void (*close)(struct msszdd_decompressor *self, + struct msszddd_header *szdd); + + /** + * Extracts the compressed data from a SZDD file. + * + * This decompresses the compressed SZDD data stream and writes it to + * an output file. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param szdd the SZDD file to extract data from + * @param filename the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct msszdd_decompressor *self, + struct msszddd_header *szdd, + const char *filename); + + /** + * Decompresses an SZDD file to an output file in one step. 
+ * + * This opens an SZDD file as input, reads the header, then decompresses + * the compressed data immediately to an output file, finally closing + * both the input and output file. It is more convenient to use than + * open() then extract() then close(), if you do not need to know the + * SZDD output size or missing character. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @param input the filename of the input SZDD file. This is passed + * directly to mspack_system::open(). + * @param output the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress)(struct msszdd_decompressor *self, + const char *input, + const char *output); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() which does not return an + * error code directly. + * + * @param self a self-referential pointer to the msszdd_decompressor + * instance being called + * @return the most recent error code + * @see open(), extract(), decompress() + */ + int (*last_error)(struct msszdd_decompressor *self); +}; + +/* --- support for KWAJ file format ---------------------------------------- */ + +/** mskwaj_compressor::set_param() parameter: compression type */ +#define MSKWAJC_PARAM_COMP_TYPE (0) + +/** mskwaj_compressor::set_param() parameter: include the length of the + * uncompressed file in the header? + */ +#define MSKWAJC_PARAM_INCLUDE_LENGTH (1) + +/** KWAJ compression type: no compression. */ +#define MSKWAJ_COMP_NONE (0) +/** KWAJ compression type: no compression, 0xFF XOR "encryption". 
*/ +#define MSKWAJ_COMP_XOR (1) +/** KWAJ compression type: LZSS (same method as SZDD) */ +#define MSKWAJ_COMP_SZDD (2) +/** KWAJ compression type: LZ+Huffman compression */ +#define MSKWAJ_COMP_LZH (3) +/** KWAJ compression type: MSZIP */ +#define MSKWAJ_COMP_MSZIP (4) + +/** KWAJ optional header flag: decompressed file length is included */ +#define MSKWAJ_HDR_HASLENGTH (0x01) + +/** KWAJ optional header flag: unknown 2-byte structure is included */ +#define MSKWAJ_HDR_HASUNKNOWN1 (0x02) + +/** KWAJ optional header flag: unknown multi-sized structure is included */ +#define MSKWAJ_HDR_HASUNKNOWN2 (0x04) + +/** KWAJ optional header flag: file name (no extension) is included */ +#define MSKWAJ_HDR_HASFILENAME (0x08) + +/** KWAJ optional header flag: file extension is included */ +#define MSKWAJ_HDR_HASFILEEXT (0x10) + +/** KWAJ optional header flag: extra text is included */ +#define MSKWAJ_HDR_HASEXTRATEXT (0x20) + +/** + * A structure which represents an KWAJ compressed file. + * + * All fields are READ ONLY. + */ +struct mskwajd_header { + /** The compression type; should be one of #MSKWAJ_COMP_NONE, + * #MSKWAJ_COMP_XOR, #MSKWAJ_COMP_SZDD or #MSKWAJ_COMP_LZH + */ + unsigned short comp_type; + + /** The offset in the file where the compressed data stream begins */ + off_t data_offset; + + /** Flags indicating which optional headers were included. */ + int headers; + + /** The amount of uncompressed data in the file, or 0 if not present. */ + off_t length; + + /** output filename, or NULL if not present */ + char *filename; + + /** extra uncompressed data (usually text) in the header. + * This data can contain nulls so use extra_length to get the size. + */ + char *extra; + + /** length of extra uncompressed data in the header */ + unsigned short extra_length; +}; + +/** + * A compressor for the KWAJ file format. + * + * All fields are READ ONLY. 
+ * + * @see mspack_create_kwaj_compressor(), mspack_destroy_kwaj_compressor() + */ +struct mskwaj_compressor { + /** + * Reads an input file and creates a compressed output file in the + * KWAJ compressed file format. The KWAJ compression format is quick + * but gives poor compression. It is possible for the compressed output + * file to be larger than the input file. + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param input the name of the file to compressed. This is passed + * passed directly to mspack_system::open() + * @param output the name of the file to write compressed data to. + * This is passed directly to mspack_system::open(). + * @param length the length of the uncompressed file, or -1 to indicate + * that this should be determined automatically by using + * mspack_system::seek() on the input file. + * @return an error code, or MSPACK_ERR_OK if successful + * @see set_param() + */ + int (*compress)(struct mskwaj_compressor *self, + const char *input, + const char *output, + off_t length); + + /** + * Sets an KWAJ compression engine parameter. + * + * The following parameters are defined: + * + * - #MSKWAJC_PARAM_COMP_TYPE: the compression method to use. Must + * be one of #MSKWAJC_COMP_NONE, #MSKWAJC_COMP_XOR, #MSKWAJ_COMP_SZDD + * or #MSKWAJ_COMP_LZH. The default is #MSKWAJ_COMP_LZH. + * + * - #MSKWAJC_PARAM_INCLUDE_LENGTH: a boolean; should the compressed + * output file should include the uncompressed length of the input + * file in the header? This adds 4 bytes to the size of the output + * file. A value of zero says "no", non-zero says "yes". The default + * is "no". + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param param the parameter to set + * @param value the value to set the parameter to + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if there + * is a problem with either parameter or value. 
+ * @see generate() + */ + int (*set_param)(struct mskwaj_compressor *self, + int param, + unsigned int value); + + + /** + * Sets the original filename of the file before compression, + * which will be stored in the header of the output file. + * + * The filename should be a null-terminated string, it must be an + * MS-DOS "8.3" type filename (up to 8 bytes for the filename, then + * optionally a "." and up to 3 bytes for a filename extension). + * + * If NULL is passed as the filename, no filename is included in the + * header. This is the default. + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param filename the original filename to use + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS if the + * filename is too long + */ + int (*set_filename)(struct mskwaj_compressor *self, + const char *filename); + + /** + * Sets arbitrary data that will be stored in the header of the + * output file, uncompressed. It can be up to roughly 64 kilobytes, + * as the overall size of the header must not exceed 65535 bytes. + * The data can contain null bytes if desired. + * + * If NULL is passed as the data pointer, or zero is passed as the + * length, no extra data is included in the header. This is the + * default. + * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @param data a pointer to the data to be stored in the header + * @param bytes the length of the data in bytes + * @return MSPACK_ERR_OK if all is OK, or MSPACK_ERR_ARGS extra data + * is too long + */ + int (*set_extra_data)(struct mskwaj_compressor *self, + void *data, + size_t bytes); + + /** + * Returns the error code set by the most recently called method. 
+ * + * @param self a self-referential pointer to the mskwaj_compressor + * instance being called + * @return the most recent error code + * @see compress() + */ + int (*last_error)(struct mschm_decompressor *self); +}; + +/** + * A decompressor for KWAJ compressed files. + * + * All fields are READ ONLY. + * + * @see mspack_create_kwaj_decompressor(), mspack_destroy_kwaj_decompressor() + */ +struct mskwaj_decompressor { + /** + * Opens a KWAJ file and reads the header. + * + * If the file opened is a valid KWAJ file, all headers will be read and + * a mskwajd_header structure will be returned. + * + * In the case of an error occuring, NULL is returned and the error code + * is available from last_error(). + * + * The filename pointer should be considered "in use" until close() is + * called on the KWAJ file. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param filename the filename of the KWAJ compressed file. This is + * passed directly to mspack_system::open(). + * @return a pointer to a mskwajd_header structure, or NULL on failure + * @see close() + */ + struct mskwajd_header *(*open)(struct mskwaj_decompressor *self, + const char *filename); + + /** + * Closes a previously opened KWAJ file. + * + * This closes a KWAJ file and frees the mskwajd_header associated + * with it. The KWAJ header pointer is now invalid and cannot be + * used again. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param kwaj the KWAJ file to close + * @see open() + */ + void (*close)(struct mskwaj_decompressor *self, + struct mskwajd_header *kwaj); + + /** + * Extracts the compressed data from a KWAJ file. + * + * This decompresses the compressed KWAJ data stream and writes it to + * an output file. 
+ * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param kwaj the KWAJ file to extract data from + * @param filename the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*extract)(struct mskwaj_decompressor *self, + struct mskwajd_header *kwaj, + const char *filename); + + /** + * Decompresses an KWAJ file to an output file in one step. + * + * This opens an KWAJ file as input, reads the header, then decompresses + * the compressed data immediately to an output file, finally closing + * both the input and output file. It is more convenient to use than + * open() then extract() then close(), if you do not need to know the + * KWAJ output size or output filename. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @param input the filename of the input KWAJ file. This is passed + * directly to mspack_system::open(). + * @param output the filename to write the decompressed data to. This + * is passed directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress)(struct mskwaj_decompressor *self, + const char *input, + const char *output); + + /** + * Returns the error code set by the most recently called method. + * + * This is useful for open() which does not return an + * error code directly. + * + * @param self a self-referential pointer to the mskwaj_decompressor + * instance being called + * @return the most recent error code + * @see open(), search() + */ + int (*last_error)(struct mskwaj_decompressor *self); +}; + +/* --- support for .LZX (Offline Address Book) file format ----------------- */ + +/** + * A compressor for the Offline Address Book (OAB) format. + * + * All fields are READ ONLY. 
+ * + * @see mspack_create_oab_compressor(), mspack_destroy_oab_compressor() + */ +struct msoab_compressor { + /** + * Compress a full OAB file. + * + * The input file will be read and the compressed contents written to the + * output file. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*compress) (struct msoab_compressor *self, + const char *input, + const char *output); + + /** + * Generate a compressed incremental OAB patch file. + * + * The two uncompressed files "input" and "base" will be read, and an + * incremental patch to generate "input" from "base" will be written to + * the output file. + * + * @param self a self-referential pointer to the msoab_compressor + * instance being called + * @param input the filename of the input file containing the new + * version of its contents. This is passed directly + * to mspack_system::open(). + * @param base the filename of the original base file containing + * the old version of its contents, against which the + * incremental patch shall generated. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*compress_incremental) (struct msoab_compressor *self, + const char *input, + const char *base, + const char *output); +}; + +/** + * A decompressor for .LZX (Offline Address Book) files + * + * All fields are READ ONLY. + * + * @see mspack_create_oab_decompressor(), mspack_destroy_oab_decompressor() + */ +struct msoab_decompressor { + /** + * Decompresses a full Offline Address Book file. 
+ * + * If the input file is a valid compressed Offline Address Book file, + * it will be read and the decompressed contents will be written to + * the output file. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). + * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress) (struct msoab_decompressor *self, + const char *input, + const char *output); + + /** + * Decompresses an Offline Address Book with an incremental patch file. + * + * This requires both a full UNCOMPRESSED Offline Address Book file to + * act as the "base", and a compressed incremental patch file as input. + * If the input file is valid, it will be decompressed with reference to + * the base file, and the decompressed contents will be written to the + * output file. + * + * There is no way to tell what the right base file is for the given + * incremental patch, but if you get it wrong, this will usually result + * in incorrect data being decompressed, which will then fail a checksum + * test. + * + * @param self a self-referential pointer to the msoab_decompressor + * instance being called + * @param input the filename of the input file. This is passed + * directly to mspack_system::open(). + * @param base the filename of the base file to which the + * incremental patch shall be applied. This is passed + * directly to mspack_system::open(). + * @param output the filename of the output file. This is passed + * directly to mspack_system::open(). 
+ * @return an error code, or MSPACK_ERR_OK if successful + */ + int (*decompress_incremental) (struct msoab_decompressor *self, + const char *input, + const char *base, + const char *output); +}; + +#ifdef __cplusplus +} #endif #endif diff --git a/third_party/mspack/readbits.h b/third_party/mspack/readbits.h new file mode 100644 index 000000000..9b237a369 --- /dev/null +++ b/third_party/mspack/readbits.h @@ -0,0 +1,207 @@ +/* This file is part of libmspack. + * (C) 2003-2010 Stuart Caie. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_READBITS_H +#define MSPACK_READBITS_H 1 + +/* this header defines macros that read data streams by + * the individual bits + * + * INIT_BITS initialises bitstream state in state structure + * STORE_BITS stores bitstream state in state structure + * RESTORE_BITS restores bitstream state from state structure + * ENSURE_BITS(n) ensure there are at least N bits in the bit buffer + * READ_BITS(var,n) takes N bits from the buffer and puts them in var + * PEEK_BITS(n) extracts without removing N bits from the bit buffer + * REMOVE_BITS(n) removes N bits from the bit buffer + * + * READ_BITS simply calls ENSURE_BITS, PEEK_BITS and REMOVE_BITS, + * which means it's limited to reading the number of bits you can + * ensure at any one time. It also fails if asked to read zero bits. + * If you need to read zero bits, or more bits than can be ensured in + * one go, use READ_MANY_BITS instead. 
+ * + * These macros have variable names baked into them, so to use them + * you have to define some macros: + * - BITS_TYPE: the type name of your state structure + * - BITS_VAR: the variable that points to your state structure + * - define BITS_ORDER_MSB if bits are read from the MSB, or + * define BITS_ORDER_LSB if bits are read from the LSB + * - READ_BYTES: some code that reads more data into the bit buffer, + * it should use READ_IF_NEEDED (calls read_input if the byte buffer + * is empty), then INJECT_BITS(data,n) to put data from the byte + * buffer into the bit buffer. + * + * You also need to define some variables and structure members: + * - unsigned char *i_ptr; // current position in the byte buffer + * - unsigned char *i_end; // end of the byte buffer + * - unsigned int bit_buffer; // the bit buffer itself + * - unsigned int bits_left; // number of bits remaining + * + * If you use read_input() and READ_IF_NEEDED, they also expect these + * structure members: + * - struct mspack_system *sys; // to access sys->read() + * - unsigned int error; // to record/return read errors + * - unsigned char input_end; // to mark reaching the EOF + * - unsigned char *inbuf; // the input byte buffer + * - unsigned int inbuf_size; // the size of the input byte buffer + * + * Your READ_BYTES implementation should read data from *i_ptr and + * put them in the bit buffer. READ_IF_NEEDED will call read_input() + * if i_ptr reaches i_end, and will fill up inbuf and set i_ptr to + * the start of inbuf and i_end to the end of inbuf. + * + * If you're reading in MSB order, the routines work by using the area + * beyond the MSB and the LSB of the bit buffer as a free source of + * zeroes when shifting. This avoids having to mask any bits. So we + * have to know the bit width of the bit buffer variable. We use + * and CHAR_BIT to find the size of the bit buffer in bits. + * + * If you are reading in LSB order, bits need to be masked. 
Normally + * this is done by computing the mask: N bits are masked by the value + * (1< +#endif +#ifndef CHAR_BIT +# define CHAR_BIT (8) +#endif +#define BITBUF_WIDTH (sizeof(bit_buffer) * CHAR_BIT) + +#define INIT_BITS do { \ + BITS_VAR->i_ptr = &BITS_VAR->inbuf[0]; \ + BITS_VAR->i_end = &BITS_VAR->inbuf[0]; \ + BITS_VAR->bit_buffer = 0; \ + BITS_VAR->bits_left = 0; \ + BITS_VAR->input_end = 0; \ +} while (0) + +#define STORE_BITS do { \ + BITS_VAR->i_ptr = i_ptr; \ + BITS_VAR->i_end = i_end; \ + BITS_VAR->bit_buffer = bit_buffer; \ + BITS_VAR->bits_left = bits_left; \ +} while (0) + +#define RESTORE_BITS do { \ + i_ptr = BITS_VAR->i_ptr; \ + i_end = BITS_VAR->i_end; \ + bit_buffer = BITS_VAR->bit_buffer; \ + bits_left = BITS_VAR->bits_left; \ +} while (0) + +#define ENSURE_BITS(nbits) do { \ + while (bits_left < (nbits)) READ_BYTES; \ +} while (0) + +#define READ_BITS(val, nbits) do { \ + ENSURE_BITS(nbits); \ + (val) = PEEK_BITS(nbits); \ + REMOVE_BITS(nbits); \ +} while (0) + +#define READ_MANY_BITS(val, bits) do { \ + unsigned char needed = (bits), bitrun; \ + (val) = 0; \ + while (needed > 0) { \ + if (bits_left <= (BITBUF_WIDTH - 16)) READ_BYTES; \ + bitrun = (bits_left < needed) ? 
bits_left : needed; \ + (val) = ((val) << bitrun) | PEEK_BITS(bitrun); \ + REMOVE_BITS(bitrun); \ + needed -= bitrun; \ + } \ +} while (0) + +#ifdef BITS_ORDER_MSB +# define PEEK_BITS(nbits) (bit_buffer >> (BITBUF_WIDTH - (nbits))) +# define REMOVE_BITS(nbits) ((bit_buffer <<= (nbits)), (bits_left -= (nbits))) +# define INJECT_BITS(bitdata,nbits) ((bit_buffer |= \ + (bitdata) << (BITBUF_WIDTH - (nbits) - bits_left)), (bits_left += (nbits))) +#else /* BITS_ORDER_LSB */ +# define PEEK_BITS(nbits) (bit_buffer & ((1 << (nbits))-1)) +# define REMOVE_BITS(nbits) ((bit_buffer >>= (nbits)), (bits_left -= (nbits))) +# define INJECT_BITS(bitdata,nbits) ((bit_buffer |= \ + (bitdata) << bits_left), (bits_left += (nbits))) +#endif + +#ifdef BITS_LSB_TABLE +/* lsb_bit_mask[n] = (1 << n) - 1 */ +static const unsigned short lsb_bit_mask[17] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; +# define PEEK_BITS_T(nbits) (bit_buffer & lsb_bit_mask[(nbits)]) +# define READ_BITS_T(val, nbits) do { \ + ENSURE_BITS(nbits); \ + (val) = PEEK_BITS_T(nbits); \ + REMOVE_BITS(nbits); \ +} while (0) +#endif + +#ifndef BITS_NO_READ_INPUT +# define READ_IF_NEEDED do { \ + if (i_ptr >= i_end) { \ + if (read_input(BITS_VAR)) \ + return BITS_VAR->error; \ + i_ptr = BITS_VAR->i_ptr; \ + i_end = BITS_VAR->i_end; \ + } \ +} while (0) + +static int read_input(BITS_TYPE *p) { + int read = p->sys->read(p->input, &p->inbuf[0], (int)p->inbuf_size); + if (read < 0) return p->error = MSPACK_ERR_READ; + + /* we might overrun the input stream by asking for bits we don't use, + * so fake 2 more bytes at the end of input */ + if (read == 0) { + if (p->input_end) { + D(("out of input bytes")) + return p->error = MSPACK_ERR_READ; + } + else { + read = 2; + p->inbuf[0] = p->inbuf[1] = 0; + p->input_end = 1; + } + } + + /* update i_ptr and i_end */ + p->i_ptr = &p->inbuf[0]; + p->i_end = &p->inbuf[read]; + return 
MSPACK_ERR_OK; +} +#endif +#endif diff --git a/third_party/mspack/readhuff.h b/third_party/mspack/readhuff.h new file mode 100644 index 000000000..4d9422578 --- /dev/null +++ b/third_party/mspack/readhuff.h @@ -0,0 +1,172 @@ +/* This file is part of libmspack. + * (C) 2003-2014 Stuart Caie. + * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_READHUFF_H +#define MSPACK_READHUFF_H 1 + +/* This implements a fast Huffman tree decoding system. */ + +#if !(defined(BITS_ORDER_MSB) || defined(BITS_ORDER_LSB)) +# error "readhuff.h is used in conjunction with readbits.h, include that first" +#endif +#if !(defined(TABLEBITS) && defined(MAXSYMBOLS)) +# error "define TABLEBITS(tbl) and MAXSYMBOLS(tbl) before using readhuff.h" +#endif +#if !(defined(HUFF_TABLE) && defined(HUFF_LEN)) +# error "define HUFF_TABLE(tbl) and HUFF_LEN(tbl) before using readhuff.h" +#endif +#ifndef HUFF_ERROR +# error "define HUFF_ERROR before using readhuff.h" +#endif +#ifndef HUFF_MAXBITS +# define HUFF_MAXBITS 16 +#endif + +/* Decodes the next huffman symbol from the input bitstream into var. + * Do not use this macro on a table unless build_decode_table() succeeded. 
+ */ +#define READ_HUFFSYM(tbl, var) do { \ + ENSURE_BITS(HUFF_MAXBITS); \ + sym = HUFF_TABLE(tbl, PEEK_BITS(TABLEBITS(tbl))); \ + if (sym >= MAXSYMBOLS(tbl)) HUFF_TRAVERSE(tbl); \ + (var) = sym; \ + i = HUFF_LEN(tbl, sym); \ + REMOVE_BITS(i); \ +} while (0) + +#ifdef BITS_ORDER_LSB +# define HUFF_TRAVERSE(tbl) do { \ + i = TABLEBITS(tbl) - 1; \ + do { \ + if (i++ > HUFF_MAXBITS) HUFF_ERROR; \ + sym = HUFF_TABLE(tbl, \ + (sym << 1) | ((bit_buffer >> i) & 1)); \ + } while (sym >= MAXSYMBOLS(tbl)); \ +} while (0) +#else +#define HUFF_TRAVERSE(tbl) do { \ + i = 1 << (BITBUF_WIDTH - TABLEBITS(tbl)); \ + do { \ + if ((i >>= 1) == 0) HUFF_ERROR; \ + sym = HUFF_TABLE(tbl, \ + (sym << 1) | ((bit_buffer & i) ? 1 : 0)); \ + } while (sym >= MAXSYMBOLS(tbl)); \ +} while (0) +#endif + +/* make_decode_table(nsyms, nbits, length[], table[]) + * + * This function was originally coded by David Tritscher. + * It builds a fast huffman decoding table from + * a canonical huffman code lengths table. + * + * nsyms = total number of symbols in this huffman tree. + * nbits = any symbols with a code length of nbits or less can be decoded + * in one lookup of the table. + * length = A table to get code lengths from [0 to nsyms-1] + * table = The table to fill up with decoded symbols and pointers. 
+ * Should be ((1<> 1; /* don't do 0 length codes */ + + /* fill entries for codes short enough for a direct mapping */ + for (bit_num = 1; bit_num <= nbits; bit_num++) { + for (sym = 0; sym < nsyms; sym++) { + if (length[sym] != bit_num) continue; +#ifdef BITS_ORDER_MSB + leaf = pos; +#else + /* reverse the significant bits */ + fill = length[sym]; reverse = pos >> (nbits - fill); leaf = 0; + do {leaf <<= 1; leaf |= reverse & 1; reverse >>= 1;} while (--fill); +#endif + + if((pos += bit_mask) > table_mask) return 1; /* table overrun */ + + /* fill all possible lookups of this symbol with the symbol itself */ +#ifdef BITS_ORDER_MSB + for (fill = bit_mask; fill-- > 0;) table[leaf++] = sym; +#else + fill = bit_mask; next_symbol = 1 << bit_num; + do { table[leaf] = sym; leaf += next_symbol; } while (--fill); +#endif + } + bit_mask >>= 1; + } + + /* exit with success if table is now complete */ + if (pos == table_mask) return 0; + + /* mark all remaining table entries as unused */ + for (sym = pos; sym < table_mask; sym++) { +#ifdef BITS_ORDER_MSB + table[sym] = 0xFFFF; +#else + reverse = sym; leaf = 0; fill = nbits; + do { leaf <<= 1; leaf |= reverse & 1; reverse >>= 1; } while (--fill); + table[leaf] = 0xFFFF; +#endif + } + + /* next_symbol = base of allocation for long codes */ + next_symbol = ((table_mask >> 1) < nsyms) ? nsyms : (table_mask >> 1); + + /* give ourselves room for codes to grow by up to 16 more bits. 
+ * codes now start at bit nbits+16 and end at (nbits+16-codelength) */ + pos <<= 16; + table_mask <<= 16; + bit_mask = 1 << 15; + + for (bit_num = nbits+1; bit_num <= HUFF_MAXBITS; bit_num++) { + for (sym = 0; sym < nsyms; sym++) { + if (length[sym] != bit_num) continue; + if (pos >= table_mask) return 1; /* table overflow */ + +#ifdef BITS_ORDER_MSB + leaf = pos >> 16; +#else + /* leaf = the first nbits of the code, reversed */ + reverse = pos >> 16; leaf = 0; fill = nbits; + do {leaf <<= 1; leaf |= reverse & 1; reverse >>= 1;} while (--fill); +#endif + for (fill = 0; fill < (bit_num - nbits); fill++) { + /* if this path hasn't been taken yet, 'allocate' two entries */ + if (table[leaf] == 0xFFFF) { + table[(next_symbol << 1) ] = 0xFFFF; + table[(next_symbol << 1) + 1 ] = 0xFFFF; + table[leaf] = next_symbol++; + } + + /* follow the path and select either left or right for next bit */ + leaf = table[leaf] << 1; + if ((pos >> (15-fill)) & 1) leaf++; + } + table[leaf] = sym; + pos += bit_mask; + } + bit_mask >>= 1; + } + + /* full table? */ + return (pos == table_mask) ? 0 : 1; +} +#endif diff --git a/third_party/mspack/system.c b/third_party/mspack/system.c new file mode 100644 index 000000000..16aa8806d --- /dev/null +++ b/third_party/mspack/system.c @@ -0,0 +1,242 @@ +/* This file is part of libmspack. + * (C) 2003-2004 Stuart Caie. 
+ * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#if !LARGEFILE_SUPPORT +const char *largefile_msg = "library not compiled to support large files."; +#endif + + +int mspack_version(int entity) { + switch (entity) { + /* CHM decoder version 1 -> 2 changes: + * - added mschmd_sec_mscompressed::spaninfo + * - added mschmd_header::first_pmgl + * - added mschmd_header::last_pmgl + * - added mschmd_header::chunk_cache; + */ + case MSPACK_VER_MSCHMD: + /* CAB decoder version 1 -> 2 changes: + * - added MSCABD_PARAM_SALVAGE + */ + case MSPACK_VER_MSCABD: + return 2; + case MSPACK_VER_LIBRARY: + case MSPACK_VER_SYSTEM: + case MSPACK_VER_MSSZDDD: + case MSPACK_VER_MSKWAJD: + case MSPACK_VER_MSOABD: + return 1; + case MSPACK_VER_MSCABC: + case MSPACK_VER_MSCHMC: + case MSPACK_VER_MSLITD: + case MSPACK_VER_MSLITC: + case MSPACK_VER_MSHLPD: + case MSPACK_VER_MSHLPC: + case MSPACK_VER_MSSZDDC: + case MSPACK_VER_MSKWAJC: + case MSPACK_VER_MSOABC: + return 0; + } + return -1; +} + +int mspack_sys_selftest_internal(int offt_size) { + return (sizeof(off_t) == offt_size) ? 
MSPACK_ERR_OK : MSPACK_ERR_SEEK; +} + +/* validates a system structure */ +int mspack_valid_system(struct mspack_system *sys) { + return (sys != NULL) && (sys->open != NULL) && (sys->close != NULL) && + (sys->read != NULL) && (sys->write != NULL) && (sys->seek != NULL) && + (sys->tell != NULL) && (sys->message != NULL) && (sys->alloc != NULL) && + (sys->free != NULL) && (sys->copy != NULL) && (sys->null_ptr == NULL); +} + +/* returns the length of a file opened for reading */ +int mspack_sys_filelen(struct mspack_system *system, + struct mspack_file *file, off_t *length) +{ + off_t current; + + if (!system || !file || !length) return MSPACK_ERR_OPEN; + + /* get current offset */ + current = system->tell(file); + + /* seek to end of file */ + if (system->seek(file, (off_t) 0, MSPACK_SYS_SEEK_END)) { + return MSPACK_ERR_SEEK; + } + + /* get offset of end of file */ + *length = system->tell(file); + + /* seek back to original offset */ + if (system->seek(file, current, MSPACK_SYS_SEEK_START)) { + return MSPACK_ERR_SEEK; + } + + return MSPACK_ERR_OK; +} + + + +/* definition of mspack_default_system -- if the library is compiled with + * MSPACK_NO_DEFAULT_SYSTEM, no default system will be provided. Otherwise, + * an appropriate default system (e.g. 
the standard C library, or some native + * API calls) + */ + +#ifdef MSPACK_NO_DEFAULT_SYSTEM +struct mspack_system *mspack_default_system = NULL; +#else + +/* implementation of mspack_default_system for standard C library */ + +#include +#include +#include +#include + +struct mspack_file_p { + FILE *fh; + const char *name; +}; + +static struct mspack_file *msp_open(struct mspack_system *self, + const char *filename, int mode) +{ + struct mspack_file_p *fh; + const char *fmode; + + switch (mode) { + case MSPACK_SYS_OPEN_READ: fmode = "rb"; break; + case MSPACK_SYS_OPEN_WRITE: fmode = "wb"; break; + case MSPACK_SYS_OPEN_UPDATE: fmode = "r+b"; break; + case MSPACK_SYS_OPEN_APPEND: fmode = "ab"; break; + default: return NULL; + } + + if ((fh = (struct mspack_file_p *) malloc(sizeof(struct mspack_file_p)))) { + fh->name = filename; + if ((fh->fh = fopen(filename, fmode))) return (struct mspack_file *) fh; + free(fh); + } + return NULL; +} + +static void msp_close(struct mspack_file *file) { + struct mspack_file_p *self = (struct mspack_file_p *) file; + if (self) { + fclose(self->fh); + free(self); + } +} + +static int msp_read(struct mspack_file *file, void *buffer, int bytes) { + struct mspack_file_p *self = (struct mspack_file_p *) file; + if (self && buffer && bytes >= 0) { + size_t count = fread(buffer, 1, (size_t) bytes, self->fh); + if (!ferror(self->fh)) return (int) count; + } + return -1; +} + +static int msp_write(struct mspack_file *file, void *buffer, int bytes) { + struct mspack_file_p *self = (struct mspack_file_p *) file; + if (self && buffer && bytes >= 0) { + size_t count = fwrite(buffer, 1, (size_t) bytes, self->fh); + if (!ferror(self->fh)) return (int) count; + } + return -1; +} + +static int msp_seek(struct mspack_file *file, off_t offset, int mode) { + struct mspack_file_p *self = (struct mspack_file_p *) file; + if (self) { + switch (mode) { + case MSPACK_SYS_SEEK_START: mode = SEEK_SET; break; + case MSPACK_SYS_SEEK_CUR: mode = SEEK_CUR; break; 
+ case MSPACK_SYS_SEEK_END: mode = SEEK_END; break; + default: return -1; + } +#if HAVE_FSEEKO + return fseeko(self->fh, offset, mode); +#else + return fseek(self->fh, offset, mode); +#endif + } + return -1; +} + +static off_t msp_tell(struct mspack_file *file) { + struct mspack_file_p *self = (struct mspack_file_p *) file; +#if HAVE_FSEEKO + return (self) ? (off_t) ftello(self->fh) : 0; +#else + return (self) ? (off_t) ftell(self->fh) : 0; +#endif +} + +static void msp_msg(struct mspack_file *file, const char *format, ...) { + va_list ap; + if (file) fprintf(stderr, "%s: ", ((struct mspack_file_p *) file)->name); + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + fputc((int) '\n', stderr); + fflush(stderr); +} + +static void *msp_alloc(struct mspack_system *self, size_t bytes) { +#if DEBUG + /* make uninitialised data obvious */ + char *buf = malloc(bytes + 8); + if (buf) memset(buf, 0xDC, bytes); + *((size_t *)buf) = bytes; + return &buf[8]; +#else + return malloc(bytes); +#endif +} + +static void msp_free(void *buffer) { +#if DEBUG + char *buf = buffer; + size_t bytes; + if (buf) { + buf -= 8; + bytes = *((size_t *)buf); + /* make freed data obvious */ + memset(buf, 0xED, bytes); + free(buf); + } +#else + free(buffer); +#endif +} + +static void msp_copy(void *src, void *dest, size_t bytes) { + memcpy(dest, src, bytes); +} + +static struct mspack_system msp_system = { + &msp_open, &msp_close, &msp_read, &msp_write, &msp_seek, + &msp_tell, &msp_msg, &msp_alloc, &msp_free, &msp_copy, NULL +}; + +struct mspack_system *mspack_default_system = &msp_system; + +#endif diff --git a/third_party/mspack/system.h b/third_party/mspack/system.h new file mode 100644 index 000000000..826e89f3e --- /dev/null +++ b/third_party/mspack/system.h @@ -0,0 +1,113 @@ +/* This file is part of libmspack. + * (C) 2003-2018 Stuart Caie. 
+ * + * libmspack is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License (LGPL) version 2.1 + * + * For further details, see the file COPYING.LIB distributed with libmspack + */ + +#ifndef MSPACK_SYSTEM_H +#define MSPACK_SYSTEM_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + +/* ensure config.h is read before mspack.h */ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +/* assume exists */ +#include + +/* fix for problem with GCC 4 and glibc (thanks to Ville Skytta) + * http://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=150429 + */ +#ifdef read +# undef read +#endif + +/* Old GCCs don't have __func__, but __FUNCTION__: + * http://gcc.gnu.org/onlinedocs/gcc/Function-Names.html + */ +#if __STDC_VERSION__ < 199901L +# if __GNUC__ >= 2 +# define __func__ __FUNCTION__ +# else +# define __func__ "" +# endif +#endif + +#if DEBUG +# include +# define D(x) do { printf("%s:%d (%s) ",__FILE__, __LINE__, __func__); \ + printf x ; fputc('\n', stdout); fflush(stdout);} while (0); +#else +# define D(x) +#endif + +/* CAB supports searching through files over 4GB in size, and the CHM file + * format actively uses 64-bit offsets. These can only be fully supported + * if the system the code runs on supports large files. If not, the library + * will work as normal using only 32-bit arithmetic, but if an offset + * greater than 2GB is detected, an error message indicating the library + * can't support the file should be printed. 
+ */ +#if HAVE_INTTYPES_H +# include +#else +# define PRId64 "lld" +# define PRIu64 "llu" +# define PRId32 "ld" +# define PRIu32 "lu" +#endif + +#include +#if ((defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS >= 64) || \ + (defined(FILESIZEBITS) && FILESIZEBITS >= 64) || \ + defined(_LARGEFILE_SOURCE) || defined(_LARGEFILE64_SOURCE) || \ + SIZEOF_OFF_T >= 8) +# define LARGEFILE_SUPPORT 1 +# define LD PRId64 +# define LU PRIu64 +#else +extern const char *largefile_msg; +# define LD PRId32 +# define LU PRIu32 +#endif + +/* endian-neutral reading of little-endian data */ +#define __egi32(a,n) ( ((((unsigned char *) a)[n+3]) << 24) | \ + ((((unsigned char *) a)[n+2]) << 16) | \ + ((((unsigned char *) a)[n+1]) << 8) | \ + ((((unsigned char *) a)[n+0]))) +#define EndGetI64(a) ((((unsigned long long int) __egi32(a,4)) << 32) | \ + ((unsigned int) __egi32(a,0))) +#define EndGetI32(a) __egi32(a,0) +#define EndGetI16(a) ((((a)[1])<<8)|((a)[0])) + +/* endian-neutral reading of big-endian data */ +#define EndGetM32(a) (((((unsigned char *) a)[0]) << 24) | \ + ((((unsigned char *) a)[1]) << 16) | \ + ((((unsigned char *) a)[2]) << 8) | \ + ((((unsigned char *) a)[3]))) +#define EndGetM16(a) ((((a)[0])<<8)|((a)[1])) + +extern struct mspack_system *mspack_default_system; + +/* returns the length of a file opened for reading */ +extern int mspack_sys_filelen(struct mspack_system *system, + struct mspack_file *file, off_t *length); + +/* validates a system structure */ +extern int mspack_valid_system(struct mspack_system *sys); + +#ifdef __cplusplus +} +#endif + +#endif From d68c27d78b05a30d927e4bfe580836309b0dc99e Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 15:57:21 -0600 Subject: [PATCH 09/31] [CI] Fix mspack linking. 
--- src/xenia/app/premake5.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 3b4b4b9c6..fd4f5b87b 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -15,6 +15,7 @@ project("xenia-app") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", From 5e9e226c944ef9ebd843683fc1948f04572afd4b Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 22 Oct 2018 17:39:00 +0100 Subject: [PATCH 10/31] [Kernel] Add XamContentCreateDeviceEnumerator export Also changed name field of DeviceInfo to be a wchar_t* of known length, makes it match the X360 DeviceInfo struct more closely and it's easier to make sure things reading it won't overflow anything. --- src/xenia/kernel/xam/xam_content.cc | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 5a3c718c0..8dba3c155 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -23,7 +23,7 @@ struct DeviceInfo { uint32_t device_type; uint64_t total_bytes; uint64_t free_bytes; - std::wstring name; + wchar_t name[28]; }; static const DeviceInfo dummy_device_info_ = { 0xF00D0000, @@ -57,7 +57,7 @@ dword_result_t XamContentGetDeviceName(dword_t device_id, return X_ERROR_DEVICE_NOT_CONNECTED; } - if (name_capacity < dummy_device_info_.name.size() + 1) { + if (name_capacity < wcslen(dummy_device_info_.name) + 1) { return X_ERROR_INSUFFICIENT_BUFFER; } @@ -174,6 +174,35 @@ dword_result_t XamContentCreateEnumerator(dword_t user_index, dword_t device_id, } DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented); +dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type, + dword_t content_flags, + dword_t max_count, + lpdword_t buffer_size_ptr, + lpdword_t handle_out) { + assert_not_null(handle_out); + + if (buffer_size_ptr) { + *buffer_size_ptr = 
sizeof(DeviceInfo) * max_count; + } + + auto e = new XStaticEnumerator(kernel_state(), max_count, sizeof(DeviceInfo)); + e->Initialize(); + + // Copy our dummy device into the enumerator + DeviceInfo* dev = (DeviceInfo*)e->AppendItem(); + if (dev) { + xe::store_and_swap(&dev->device_id, dummy_device_info_.device_id); + xe::store_and_swap(&dev->device_type, dummy_device_info_.device_type); + xe::store_and_swap(&dev->total_bytes, dummy_device_info_.total_bytes); + xe::store_and_swap(&dev->free_bytes, dummy_device_info_.free_bytes); + xe::copy_and_swap(dev->name, dummy_device_info_.name, 28); + } + + *handle_out = e->handle(); + return X_ERROR_SUCCESS; +} +DECLARE_XAM_EXPORT(XamContentCreateDeviceEnumerator, ExportTag::kImplemented); + dword_result_t XamContentCreateEx(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t flags, lpdword_t disposition_ptr, From c5056c644f500efae8b66b189894a1d02dacf9ab Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 22 Oct 2018 17:41:45 +0100 Subject: [PATCH 11/31] [Kernel] Add XamNotifyCreateListenerInternal export (1 unknown param) Just changed the existing XamNotifyCreateListener import to ...CreateListenerInternal, and made a new XamNotifyCreateListener that just calls the internal version. --- src/xenia/kernel/xam/xam_notify.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam/xam_notify.cc b/src/xenia/kernel/xam/xam_notify.cc index 1a7337ab8..e3765af25 100644 --- a/src/xenia/kernel/xam/xam_notify.cc +++ b/src/xenia/kernel/xam/xam_notify.cc @@ -18,7 +18,8 @@ namespace xe { namespace kernel { namespace xam { -dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { +dword_result_t XamNotifyCreateListenerInternal(qword_t mask, dword_t unk, + dword_t one) { // r4=1 may indicate user process? 
auto listener = @@ -30,6 +31,12 @@ dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { return handle; } +DECLARE_XAM_EXPORT2(XamNotifyCreateListenerInternal, kNone, kImplemented, + kSketchy); + +dword_result_t XamNotifyCreateListener(qword_t mask, dword_t one) { + return XamNotifyCreateListenerInternal(mask, 0, one); +} DECLARE_XAM_EXPORT1(XamNotifyCreateListener, kNone, kImplemented); // https://github.com/CodeAsm/ffplay360/blob/master/Common/AtgSignIn.cpp From 8efbe3286718d05da4579485f8b2067cc4dfbbb1 Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 22 Oct 2018 17:51:56 +0100 Subject: [PATCH 12/31] [Kernel] Add XamBuild*ResourceLocator & XamFormat*String exports Resource locators are just strings that say where to find a resource, they can either point to a file path or to an XEX resource. Normally the Gamercard/SharedSystem exports use XamBuildResourceLocator to make a locator for the resources inside xam.xex. But since our xam won't have those resources I've made it point them to a local file instead (by passing 0 as the module param for XamBuildResourceLocator). The resources will have to be extracted to the XEX's local path for it to find them though (and XEX will need to have media:\ symlinked too) The XamFormat*String exports are really windows-only right now, but they'll at least empty the buffer for other platforms. They'll also always output MM/DD/YYYY no matter what locale the system is, on 360 this is localized properly of course. I've renamed XamBuildSharedResourceLocator_ to XamBuildLegacySystemResourceLocator too since that seems to be the proper name for it. In earlier Xams this just pointed to ...SharedSystemResourceLocator (as does this code), but later versions seem to have their own function. 
--- src/xenia/kernel/xam/xam_info.cc | 133 +++++++++++++++++++++++++++++ src/xenia/kernel/xam/xam_table.inc | 2 +- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 38fdc04d0..431b5a622 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -17,6 +17,10 @@ #include "xenia/kernel/xthread.h" #include "xenia/xbox.h" +#if XE_PLATFORM_WIN32 +#include "xenia/base/platform_win.h" +#endif + namespace xe { namespace kernel { namespace xam { @@ -24,6 +28,135 @@ namespace xam { constexpr uint32_t X_LANGUAGE_ENGLISH = 1; constexpr uint32_t X_LANGUAGE_JAPANESE = 2; +void XamFormatDateString(dword_t unk, qword_t filetime, lpwstring_t buffer, + dword_t buffer_length) { +// TODO: implement this for other platforms +#if XE_PLATFORM_WIN32 + FILETIME t; + t.dwHighDateTime = filetime >> 32; + t.dwLowDateTime = (uint32_t)filetime; + + SYSTEMTIME st; + SYSTEMTIME stLocal; + + FileTimeToSystemTime(&t, &st); + SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); + + wchar_t buf[256]; + std::memset(buf, 0, 256 * 2); + size_t size = 256 > buffer_length ? buffer_length : 256; + // TODO: format this depending on users locale? 
+ swprintf_s(buf, size, L"%02d/%02d/%d", stLocal.wMonth, stLocal.wDay, + stLocal.wYear); +#endif + std::memset(buffer, 0, buffer_length * 2); +#if XE_PLATFORM_WIN32 + xe::store_and_swap(buffer, buf); +#endif +} +DECLARE_XAM_EXPORT(XamFormatDateString, ExportTag::kImplemented); + +void XamFormatTimeString(dword_t unk, qword_t filetime, lpwstring_t buffer, + dword_t buffer_length) { +// TODO: implement this for other platforms +#if XE_PLATFORM_WIN32 + FILETIME t; + t.dwHighDateTime = filetime >> 32; + t.dwLowDateTime = (uint32_t)filetime; + + SYSTEMTIME st; + SYSTEMTIME stLocal; + + FileTimeToSystemTime(&t, &st); + SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); + + wchar_t buf[256]; + std::memset(buf, 0, 256 * 2); + size_t size = 256 > buffer_length ? buffer_length : 256; + swprintf_s(buf, size, L"%02d:%02d", stLocal.wHour, stLocal.wMinute); +#endif + std::memset(buffer, 0, buffer_length * 2); +#if XE_PLATFORM_WIN32 + xe::store_and_swap(buffer, buf); +#endif +} +DECLARE_XAM_EXPORT(XamFormatTimeString, ExportTag::kImplemented); + +dword_result_t keXamBuildResourceLocator(uint64_t module, + const wchar_t* container, + const wchar_t* resource, + lpwstring_t buffer, + uint32_t buffer_length) { + wchar_t buf[256]; + size_t size = 256 > buffer_length ? 
buffer_length : 256; + + if (!module) { + swprintf_s(buf, size, L"file://media:/%s.xzp#%s", container, resource); + XELOGD( + "XamBuildResourceLocator(%ws) returning locator to local file %ws.xzp", + container, container); + } else { + swprintf_s(buf, size, L"section://%X,%s#%s", (uint32_t)module, container, + resource); + } + + memset(buffer, 0, buffer_length * 2); + xe::store_and_swap(buffer, buf); + + return 0; +} + +dword_result_t XamBuildResourceLocator(qword_t module, lpwstring_t container, + lpwstring_t resource, lpwstring_t buffer, + dword_t buffer_length) { + return keXamBuildResourceLocator(module, container.value().c_str(), + resource.value().c_str(), buffer, + buffer_length); +} +DECLARE_XAM_EXPORT(XamBuildResourceLocator, ExportTag::kImplemented); + +dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, + lpwstring_t buffer, + dword_t buffer_length) { + // On an actual xbox these funcs would return a locator to xam.xex resources, + // but for Xenia we can return a locator to the resources as local files. (big + // thanks to MS for letting XamBuildResourceLocator return local file + // locators!) + + // If you're running an app that'll need them, make sure to extract xam.xex + // resources with xextool ("xextool -d . xam.xex") and add a .xzp extension. 
+ + return keXamBuildResourceLocator(0, L"gamercrd", filename.value().c_str(), + buffer, buffer_length); +} +DECLARE_XAM_EXPORT(XamBuildGamercardResourceLocator, ExportTag::kImplemented); + +dword_result_t XamBuildSharedSystemResourceLocator(lpwstring_t filename, + lpwstring_t buffer, + dword_t buffer_length) { + // see notes inside XamBuildGamercardResourceLocator above + return keXamBuildResourceLocator(0, L"shrdres", filename.value().c_str(), + buffer, buffer_length); +} +DECLARE_XAM_EXPORT(XamBuildSharedSystemResourceLocator, + ExportTag::kImplemented); + +dword_result_t XamBuildLegacySystemResourceLocator(lpwstring_t filename, + lpwstring_t buffer, + dword_t buffer_len) { + return XamBuildSharedSystemResourceLocator(filename, buffer, buffer_len); +} +DECLARE_XAM_EXPORT(XamBuildLegacySystemResourceLocator, + ExportTag::kImplemented); + +dword_result_t XamBuildXamResourceLocator(lpwstring_t filename, + lpwstring_t buffer, + dword_t buffer_length) { + return keXamBuildResourceLocator(0, L"xam", filename.value().c_str(), buffer, + buffer_length); +} +DECLARE_XAM_EXPORT(XamBuildXamResourceLocator, ExportTag::kImplemented); + dword_result_t XamGetSystemVersion() { // eh, just picking one. 
If we go too low we may break new games, but // this value seems to be used for conditionally loading symbols and if diff --git a/src/xenia/kernel/xam/xam_table.inc b/src/xenia/kernel/xam/xam_table.inc index cec253c69..0bc4f02e6 100644 --- a/src/xenia/kernel/xam/xam_table.inc +++ b/src/xenia/kernel/xam/xam_table.inc @@ -588,7 +588,7 @@ XE_EXPORT(xam, 0x00000318, XamVoiceGetMicArrayStatus, XE_EXPORT(xam, 0x00000319, XamVoiceSetAudioCaptureRoutine, kFunction), XE_EXPORT(xam, 0x0000031A, XamVoiceGetDirectionalData, kFunction), XE_EXPORT(xam, 0x0000031B, XamBuildResourceLocator, kFunction), -XE_EXPORT(xam, 0x0000031C, XamBuildSharedSystemResourceLocator_, kFunction), +XE_EXPORT(xam, 0x0000031C, XamBuildLegacySystemResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031D, XamBuildGamercardResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031E, XamBuildDynamicResourceLocator, kFunction), XE_EXPORT(xam, 0x0000031F, XamBuildXamResourceLocator, kFunction), From 65e8872dc15edea5408b16abed08c0930f1a7aee Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 22 Oct 2018 17:54:20 +0100 Subject: [PATCH 13/31] [Kernel] Add _vsnwprintf export --- src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc index 2067f29e6..aa1bbf245 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_strings.cc @@ -1009,6 +1009,46 @@ SHIM_CALL _vsnprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { SHIM_SET_RETURN_32(count); } +// https://msdn.microsoft.com/en-us/library/1kt27hek.aspx +SHIM_CALL _vsnwprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { + uint32_t buffer_ptr = SHIM_GET_ARG_32(0); + int32_t buffer_count = SHIM_GET_ARG_32(1); + uint32_t format_ptr = SHIM_GET_ARG_32(2); + uint32_t arg_ptr = SHIM_GET_ARG_32(3); + + XELOGD("_vsnwprintf(%08X, %i, %08X, %08X)", buffer_ptr, 
buffer_count, + format_ptr, arg_ptr); + + if (buffer_ptr == 0 || buffer_count <= 0 || format_ptr == 0) { + SHIM_SET_RETURN_32(-1); + return; + } + + auto buffer = (uint16_t*)SHIM_MEM_ADDR(buffer_ptr); + auto format = (const uint16_t*)SHIM_MEM_ADDR(format_ptr); + + ArrayArgList args(ppc_context, arg_ptr); + WideStringFormatData data(format); + + int32_t count = format_core(ppc_context, data, args, true); + if (count < 0) { + // Error. + if (buffer_count > 0) { + buffer[0] = '\0'; // write a null, just to be safe + } + } else if (count <= buffer_count) { + // Fit within the buffer. + xe::copy_and_swap(buffer, (uint16_t*)data.wstr().c_str(), count); + if (count < buffer_count) { + buffer[count] = '\0'; + } + } else { + // Overflowed buffer. We still return the count we would have written. + xe::copy_and_swap(buffer, (uint16_t*)data.wstr().c_str(), buffer_count); + } + SHIM_SET_RETURN_32(count); +} + // https://msdn.microsoft.com/en-us/library/28d5ce15.aspx SHIM_CALL vsprintf_shim(PPCContext* ppc_context, KernelState* kernel_state) { uint32_t buffer_ptr = SHIM_GET_ARG_32(0); @@ -1100,6 +1140,7 @@ void RegisterStringExports(xe::cpu::ExportResolver* export_resolver, SHIM_SET_MAPPING("xboxkrnl.exe", vsprintf, state); SHIM_SET_MAPPING("xboxkrnl.exe", _vscwprintf, state); SHIM_SET_MAPPING("xboxkrnl.exe", vswprintf, state); + SHIM_SET_MAPPING("xboxkrnl.exe", _vsnwprintf, state); } } // namespace xboxkrnl From 958882a3ea8766798882e4675ef11cc6dcfb7ff7 Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 22 Oct 2018 17:55:30 +0100 Subject: [PATCH 14/31] [Kernel] Add ExConsoleGameRegion export variable (set to 0xFFFFFFFF) --- src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc index 706c58ca8..f38d27d35 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_module.cc @@ -159,6 +159,14 @@ 
XboxkrnlModule::XboxkrnlModule(Emulator* emulator, KernelState* kernel_state) xe::store_and_swap(lpXboxHardwareInfo + 4, 0x06); // cpu count // Remaining 11b are zeroes? + // ExConsoleGameRegion, probably same values as keyvault region uses? + // Just return all 0xFF, should satisfy anything that checks it + uint32_t pExConsoleGameRegion = memory_->SystemHeapAlloc(4); + auto lpExConsoleGameRegion = memory_->TranslateVirtual(pExConsoleGameRegion); + export_resolver_->SetVariableMapping( + "xboxkrnl.exe", ordinals::ExConsoleGameRegion, pExConsoleGameRegion); + xe::store(lpExConsoleGameRegion, 0xFFFFFFFF); + // XexExecutableModuleHandle (?**) // Games try to dereference this to get a pointer to some module struct. // So far it seems like it's just in loader code, and only used to look up From 8c6e0b86f927bdda5802b084f44d492544fa8c93 Mon Sep 17 00:00:00 2001 From: emoose Date: Wed, 24 Oct 2018 00:32:49 +0100 Subject: [PATCH 15/31] [Kernel] swprintf_s -> swprintf, change how Format*String/Build*ResourceLocator exports copy text New way of copying the text should make it less likely for any buffer overflows to occur. 
--- src/xenia/kernel/xam/xam_info.cc | 58 ++++++++++++++------------------ 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 431b5a622..3ff1539a0 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -28,8 +28,10 @@ namespace xam { constexpr uint32_t X_LANGUAGE_ENGLISH = 1; constexpr uint32_t X_LANGUAGE_JAPANESE = 2; -void XamFormatDateString(dword_t unk, qword_t filetime, lpwstring_t buffer, +void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t buffer, dword_t buffer_length) { + std::memset(buffer, 0, buffer_length * 2); + // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 FILETIME t; @@ -43,21 +45,19 @@ void XamFormatDateString(dword_t unk, qword_t filetime, lpwstring_t buffer, SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); wchar_t buf[256]; - std::memset(buf, 0, 256 * 2); - size_t size = 256 > buffer_length ? buffer_length : 256; // TODO: format this depending on users locale? 
- swprintf_s(buf, size, L"%02d/%02d/%d", stLocal.wMonth, stLocal.wDay, - stLocal.wYear); -#endif - std::memset(buffer, 0, buffer_length * 2); -#if XE_PLATFORM_WIN32 - xe::store_and_swap(buffer, buf); + swprintf(buf, 256, L"%02d/%02d/%d", stLocal.wMonth, stLocal.wDay, + stLocal.wYear); + + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); #endif } DECLARE_XAM_EXPORT(XamFormatDateString, ExportTag::kImplemented); -void XamFormatTimeString(dword_t unk, qword_t filetime, lpwstring_t buffer, +void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t buffer, dword_t buffer_length) { + std::memset(buffer, 0, buffer_length * 2); + // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 FILETIME t; @@ -71,13 +71,9 @@ void XamFormatTimeString(dword_t unk, qword_t filetime, lpwstring_t buffer, SystemTimeToTzSpecificLocalTime(NULL, &st, &stLocal); wchar_t buf[256]; - std::memset(buf, 0, 256 * 2); - size_t size = 256 > buffer_length ? buffer_length : 256; - swprintf_s(buf, size, L"%02d:%02d", stLocal.wHour, stLocal.wMinute); -#endif - std::memset(buffer, 0, buffer_length * 2); -#if XE_PLATFORM_WIN32 - xe::store_and_swap(buffer, buf); + swprintf(buf, 256, L"%02d:%02d", stLocal.wHour, stLocal.wMinute); + + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); #endif } DECLARE_XAM_EXPORT(XamFormatTimeString, ExportTag::kImplemented); @@ -85,29 +81,26 @@ DECLARE_XAM_EXPORT(XamFormatTimeString, ExportTag::kImplemented); dword_result_t keXamBuildResourceLocator(uint64_t module, const wchar_t* container, const wchar_t* resource, - lpwstring_t buffer, + lpvoid_t buffer, uint32_t buffer_length) { wchar_t buf[256]; - size_t size = 256 > buffer_length ? 
buffer_length : 256; if (!module) { - swprintf_s(buf, size, L"file://media:/%s.xzp#%s", container, resource); + swprintf(buf, 256, L"file://media:/%s.xzp#%s", container, resource); XELOGD( "XamBuildResourceLocator(%ws) returning locator to local file %ws.xzp", container, container); } else { - swprintf_s(buf, size, L"section://%X,%s#%s", (uint32_t)module, container, - resource); + swprintf(buf, 256, L"section://%X,%s#%s", (uint32_t)module, container, + resource); } - memset(buffer, 0, buffer_length * 2); - xe::store_and_swap(buffer, buf); - + xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); return 0; } dword_result_t XamBuildResourceLocator(qword_t module, lpwstring_t container, - lpwstring_t resource, lpwstring_t buffer, + lpwstring_t resource, lpvoid_t buffer, dword_t buffer_length) { return keXamBuildResourceLocator(module, container.value().c_str(), resource.value().c_str(), buffer, @@ -116,7 +109,7 @@ dword_result_t XamBuildResourceLocator(qword_t module, lpwstring_t container, DECLARE_XAM_EXPORT(XamBuildResourceLocator, ExportTag::kImplemented); dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, - lpwstring_t buffer, + lpvoid_t buffer, dword_t buffer_length) { // On an actual xbox these funcs would return a locator to xam.xex resources, // but for Xenia we can return a locator to the resources as local files. 
(big @@ -132,7 +125,7 @@ dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, DECLARE_XAM_EXPORT(XamBuildGamercardResourceLocator, ExportTag::kImplemented); dword_result_t XamBuildSharedSystemResourceLocator(lpwstring_t filename, - lpwstring_t buffer, + lpvoid_t buffer, dword_t buffer_length) { // see notes inside XamBuildGamercardResourceLocator above return keXamBuildResourceLocator(0, L"shrdres", filename.value().c_str(), @@ -142,15 +135,14 @@ DECLARE_XAM_EXPORT(XamBuildSharedSystemResourceLocator, ExportTag::kImplemented); dword_result_t XamBuildLegacySystemResourceLocator(lpwstring_t filename, - lpwstring_t buffer, - dword_t buffer_len) { - return XamBuildSharedSystemResourceLocator(filename, buffer, buffer_len); + lpvoid_t buffer, + dword_t buffer_length) { + return XamBuildSharedSystemResourceLocator(filename, buffer, buffer_length); } DECLARE_XAM_EXPORT(XamBuildLegacySystemResourceLocator, ExportTag::kImplemented); -dword_result_t XamBuildXamResourceLocator(lpwstring_t filename, - lpwstring_t buffer, +dword_result_t XamBuildXamResourceLocator(lpwstring_t filename, lpvoid_t buffer, dword_t buffer_length) { return keXamBuildResourceLocator(0, L"xam", filename.value().c_str(), buffer, buffer_length); From 933588717ef53347326be821696dd0199840b1ed Mon Sep 17 00:00:00 2001 From: emoose Date: Mon, 12 Nov 2018 02:20:34 +0000 Subject: [PATCH 16/31] [Kernel] Add XamGetOnlineSchema export, returns dummy data --- src/xenia/kernel/xam/xam_info.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 3ff1539a0..7590d96f9 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -28,6 +28,29 @@ namespace xam { constexpr uint32_t X_LANGUAGE_ENGLISH = 1; constexpr uint32_t X_LANGUAGE_JAPANESE = 2; +dword_result_t XamGetOnlineSchema() { + static uint32_t schema_guest = 0; + static uint32_t schema_ptr_guest = 0; + + if (!schema_guest) { 
+ // create a dummy schema, 8 bytes of 0 seems to work fine + // (with another 8 bytes for schema ptr/schema size) + schema_guest = kernel_state()->memory()->SystemHeapAlloc(16); + schema_ptr_guest = schema_guest + 8; + + auto schema = kernel_state()->memory()->TranslateVirtual(schema_guest); + memset(schema, 0, 16); + + // store schema ptr + size + xe::store_and_swap(schema + 0x8, schema_guest); + xe::store_and_swap(schema + 0xC, 0x8); + } + + // return pointer to the schema ptr/schema size struct + return schema_ptr_guest; +} +DECLARE_XAM_EXPORT(XamGetOnlineSchema, ExportTag::kImplemented); + void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t buffer, dword_t buffer_length) { std::memset(buffer, 0, buffer_length * 2); From 213e6881736c856a07e7aedacb684cbf720e7ca6 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 17:27:35 -0600 Subject: [PATCH 17/31] [Kernel] Fix some stuff from PR merge. --- src/xenia/kernel/xam/xam_content.cc | 2 +- src/xenia/kernel/xam/xam_info.cc | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 8dba3c155..570e29947 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -201,7 +201,7 @@ dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type, *handle_out = e->handle(); return X_ERROR_SUCCESS; } -DECLARE_XAM_EXPORT(XamContentCreateDeviceEnumerator, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamContentCreateDeviceEnumerator, kNone, kImplemented); dword_result_t XamContentCreateEx(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t flags, diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 7590d96f9..997bbe33f 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -49,7 +49,7 @@ dword_result_t XamGetOnlineSchema() { // return pointer to the schema 
ptr/schema size struct return schema_ptr_guest; } -DECLARE_XAM_EXPORT(XamGetOnlineSchema, ExportTag::kImplemented); +DECLARE_XAM_EXPORT2(XamGetOnlineSchema, kNone, kImplemented, kSketchy); void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t buffer, dword_t buffer_length) { @@ -73,9 +73,11 @@ void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t buffer, stLocal.wYear); xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); +#else + assert_always(); #endif } -DECLARE_XAM_EXPORT(XamFormatDateString, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamFormatDateString, kNone, kImplemented); void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t buffer, dword_t buffer_length) { @@ -97,9 +99,11 @@ void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t buffer, swprintf(buf, 256, L"%02d:%02d", stLocal.wHour, stLocal.wMinute); xe::copy_and_swap((wchar_t*)buffer.host_address(), buf, buffer_length); +#else + assert_always(); #endif } -DECLARE_XAM_EXPORT(XamFormatTimeString, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamFormatTimeString, kNone, kImplemented); dword_result_t keXamBuildResourceLocator(uint64_t module, const wchar_t* container, @@ -129,7 +133,7 @@ dword_result_t XamBuildResourceLocator(qword_t module, lpwstring_t container, resource.value().c_str(), buffer, buffer_length); } -DECLARE_XAM_EXPORT(XamBuildResourceLocator, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamBuildResourceLocator, kNone, kImplemented); dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, lpvoid_t buffer, @@ -145,7 +149,7 @@ dword_result_t XamBuildGamercardResourceLocator(lpwstring_t filename, return keXamBuildResourceLocator(0, L"gamercrd", filename.value().c_str(), buffer, buffer_length); } -DECLARE_XAM_EXPORT(XamBuildGamercardResourceLocator, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamBuildGamercardResourceLocator, kNone, kImplemented); dword_result_t XamBuildSharedSystemResourceLocator(lpwstring_t 
filename, lpvoid_t buffer, @@ -154,23 +158,21 @@ dword_result_t XamBuildSharedSystemResourceLocator(lpwstring_t filename, return keXamBuildResourceLocator(0, L"shrdres", filename.value().c_str(), buffer, buffer_length); } -DECLARE_XAM_EXPORT(XamBuildSharedSystemResourceLocator, - ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamBuildSharedSystemResourceLocator, kNone, kImplemented); dword_result_t XamBuildLegacySystemResourceLocator(lpwstring_t filename, lpvoid_t buffer, dword_t buffer_length) { return XamBuildSharedSystemResourceLocator(filename, buffer, buffer_length); } -DECLARE_XAM_EXPORT(XamBuildLegacySystemResourceLocator, - ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamBuildLegacySystemResourceLocator, kNone, kImplemented); dword_result_t XamBuildXamResourceLocator(lpwstring_t filename, lpvoid_t buffer, dword_t buffer_length) { return keXamBuildResourceLocator(0, L"xam", filename.value().c_str(), buffer, buffer_length); } -DECLARE_XAM_EXPORT(XamBuildXamResourceLocator, ExportTag::kImplemented); +DECLARE_XAM_EXPORT1(XamBuildXamResourceLocator, kNone, kImplemented); dword_result_t XamGetSystemVersion() { // eh, just picking one. If we go too low we may break new games, but From d87ae268954e716ae005679072b2c8af5ee1b9d0 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 23 Nov 2018 17:58:38 -0600 Subject: [PATCH 18/31] [CI] Fix mspack linking even more. 
--- src/xenia/cpu/ppc/testing/premake5.lua | 1 + src/xenia/cpu/premake5.lua | 1 + src/xenia/gpu/vulkan/premake5.lua | 2 ++ 3 files changed, 4 insertions(+) diff --git a/src/xenia/cpu/ppc/testing/premake5.lua b/src/xenia/cpu/ppc/testing/premake5.lua index 78874ecc4..d2d5549cd 100644 --- a/src/xenia/cpu/ppc/testing/premake5.lua +++ b/src/xenia/cpu/ppc/testing/premake5.lua @@ -13,6 +13,7 @@ project("xenia-cpu-ppc-tests") "xenia-base", "gflags", "capstone", -- cpu-backend-x64 + "mspack", }) files({ "ppc_testing_main.cc", diff --git a/src/xenia/cpu/premake5.lua b/src/xenia/cpu/premake5.lua index 08fd41c0d..96a41f6e0 100644 --- a/src/xenia/cpu/premake5.lua +++ b/src/xenia/cpu/premake5.lua @@ -8,6 +8,7 @@ project("xenia-cpu") language("C++") links({ "xenia-base", + "mspack", }) includedirs({ project_root.."/third_party/llvm/include", diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index b399914a8..f8e9903fe 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -38,6 +38,7 @@ project("xenia-gpu-vulkan-trace-viewer") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", @@ -110,6 +111,7 @@ project("xenia-gpu-vulkan-trace-dump") "imgui", "libavcodec", "libavutil", + "mspack", "snappy", "spirv-tools", "volk", From 696c3cd439ad1110b4022f68039be779c8fcfc7f Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 17 Nov 2018 15:04:34 -0600 Subject: [PATCH 19/31] [x64] Remove most of RegisterSequences --- src/xenia/cpu/backend/x64/x64_sequences.cc | 133 ++------------------- 1 file changed, 8 insertions(+), 125 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 48f5a86d1..b31bc1b39 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -655,16 +655,19 @@ struct Sequence { }; template -void Register() { +static bool Register() { sequence_table.insert({T::head_key(), T::Select}); + return true; } template -void Register() { - Register(); - Register(); +static bool Register() { + bool b = true; + b = b && Register(); // Call the above function + b = b && Register(); // Call ourself again (recursively) + return b; } #define EMITTER_OPCODE_TABLE(name, ...) \ - void Register_##name() { Register<__VA_ARGS__>(); } + static bool Registered_##name = Register<__VA_ARGS__>(); // ============================================================================ // OPCODE_COMMENT @@ -7812,126 +7815,6 @@ struct SET_ROUNDING_MODE_I32 EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); void RegisterSequences() { - Register_OPCODE_COMMENT(); - Register_OPCODE_NOP(); - Register_OPCODE_SOURCE_OFFSET(); - Register_OPCODE_DEBUG_BREAK(); - Register_OPCODE_DEBUG_BREAK_TRUE(); - Register_OPCODE_TRAP(); - Register_OPCODE_TRAP_TRUE(); - Register_OPCODE_CALL(); - Register_OPCODE_CALL_TRUE(); - Register_OPCODE_CALL_INDIRECT(); - Register_OPCODE_CALL_INDIRECT_TRUE(); - Register_OPCODE_CALL_EXTERN(); - Register_OPCODE_RETURN(); - Register_OPCODE_RETURN_TRUE(); - Register_OPCODE_SET_RETURN_ADDRESS(); - Register_OPCODE_BRANCH(); - Register_OPCODE_BRANCH_TRUE(); - Register_OPCODE_BRANCH_FALSE(); - Register_OPCODE_ASSIGN(); - Register_OPCODE_CAST(); - Register_OPCODE_ZERO_EXTEND(); - Register_OPCODE_SIGN_EXTEND(); - Register_OPCODE_TRUNCATE(); - 
Register_OPCODE_CONVERT(); - Register_OPCODE_ROUND(); - Register_OPCODE_VECTOR_CONVERT_I2F(); - Register_OPCODE_VECTOR_CONVERT_F2I(); - Register_OPCODE_LOAD_VECTOR_SHL(); - Register_OPCODE_LOAD_VECTOR_SHR(); - Register_OPCODE_LOAD_CLOCK(); - Register_OPCODE_LOAD_LOCAL(); - Register_OPCODE_STORE_LOCAL(); - Register_OPCODE_LOAD_CONTEXT(); - Register_OPCODE_STORE_CONTEXT(); - Register_OPCODE_CONTEXT_BARRIER(); - Register_OPCODE_LOAD_MMIO(); - Register_OPCODE_STORE_MMIO(); - Register_OPCODE_LOAD_OFFSET(); - Register_OPCODE_STORE_OFFSET(); - Register_OPCODE_LOAD(); - Register_OPCODE_STORE(); - Register_OPCODE_MEMSET(); - Register_OPCODE_PREFETCH(); - Register_OPCODE_MEMORY_BARRIER(); - Register_OPCODE_MAX(); - Register_OPCODE_VECTOR_MAX(); - Register_OPCODE_MIN(); - Register_OPCODE_VECTOR_MIN(); - Register_OPCODE_SELECT(); - Register_OPCODE_IS_TRUE(); - Register_OPCODE_IS_FALSE(); - Register_OPCODE_IS_NAN(); - Register_OPCODE_COMPARE_EQ(); - Register_OPCODE_COMPARE_NE(); - Register_OPCODE_COMPARE_SLT(); - Register_OPCODE_COMPARE_SLE(); - Register_OPCODE_COMPARE_SGT(); - Register_OPCODE_COMPARE_SGE(); - Register_OPCODE_COMPARE_ULT(); - Register_OPCODE_COMPARE_ULE(); - Register_OPCODE_COMPARE_UGT(); - Register_OPCODE_COMPARE_UGE(); - Register_OPCODE_COMPARE_SLT_FLT(); - Register_OPCODE_COMPARE_SLE_FLT(); - Register_OPCODE_COMPARE_SGT_FLT(); - Register_OPCODE_COMPARE_SGE_FLT(); - Register_OPCODE_COMPARE_ULT_FLT(); - Register_OPCODE_COMPARE_ULE_FLT(); - Register_OPCODE_COMPARE_UGT_FLT(); - Register_OPCODE_COMPARE_UGE_FLT(); - Register_OPCODE_DID_SATURATE(); - Register_OPCODE_VECTOR_COMPARE_EQ(); - Register_OPCODE_VECTOR_COMPARE_SGT(); - Register_OPCODE_VECTOR_COMPARE_SGE(); - Register_OPCODE_VECTOR_COMPARE_UGT(); - Register_OPCODE_VECTOR_COMPARE_UGE(); - Register_OPCODE_ADD(); - Register_OPCODE_ADD_CARRY(); - Register_OPCODE_VECTOR_ADD(); - Register_OPCODE_SUB(); - Register_OPCODE_VECTOR_SUB(); - Register_OPCODE_MUL(); - Register_OPCODE_MUL_HI(); - Register_OPCODE_DIV(); - 
Register_OPCODE_MUL_ADD(); - Register_OPCODE_MUL_SUB(); - Register_OPCODE_NEG(); - Register_OPCODE_ABS(); - Register_OPCODE_SQRT(); - Register_OPCODE_RSQRT(); - Register_OPCODE_RECIP(); - Register_OPCODE_POW2(); - Register_OPCODE_LOG2(); - Register_OPCODE_DOT_PRODUCT_3(); - Register_OPCODE_DOT_PRODUCT_4(); - Register_OPCODE_AND(); - Register_OPCODE_OR(); - Register_OPCODE_XOR(); - Register_OPCODE_NOT(); - Register_OPCODE_SHL(); - Register_OPCODE_SHR(); - Register_OPCODE_SHA(); - Register_OPCODE_VECTOR_SHL(); - Register_OPCODE_VECTOR_SHR(); - Register_OPCODE_VECTOR_SHA(); - Register_OPCODE_ROTATE_LEFT(); - Register_OPCODE_VECTOR_ROTATE_LEFT(); - Register_OPCODE_VECTOR_AVERAGE(); - Register_OPCODE_BYTE_SWAP(); - Register_OPCODE_CNTLZ(); - Register_OPCODE_INSERT(); - Register_OPCODE_EXTRACT(); - Register_OPCODE_SPLAT(); - Register_OPCODE_PERMUTE(); - Register_OPCODE_SWIZZLE(); - Register_OPCODE_PACK(); - Register_OPCODE_UNPACK(); - Register_OPCODE_ATOMIC_EXCHANGE(); - Register_OPCODE_ATOMIC_COMPARE_EXCHANGE(); - Register_OPCODE_SET_ROUNDING_MODE(); } bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { From 6861cce492db7d86791fdae32b6ac569da4a15c7 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 17 Nov 2018 15:32:11 -0600 Subject: [PATCH 20/31] [x64] Factor out a lot of the opcode handling code --- src/xenia/cpu/backend/x64/x64_op.h | 629 +++++++++++++++++++++ src/xenia/cpu/backend/x64/x64_sequences.cc | 614 +------------------- src/xenia/cpu/backend/x64/x64_sequences.h | 21 + 3 files changed, 651 insertions(+), 613 deletions(-) create mode 100644 src/xenia/cpu/backend/x64/x64_op.h diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h new file mode 100644 index 000000000..f71338304 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -0,0 +1,629 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ +#ifndef XENIA_CPU_BACKEND_X64_X64_OP_H_ +#define XENIA_CPU_BACKEND_X64_X64_OP_H_ + +#include "xenia/cpu/backend/x64/x64_emitter.h" + +#include "xenia/cpu/hir/instr.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; +using namespace Xbyak; + +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
+#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + uint32_t value; + + operator uint32_t() const { return value; } + + InstrKey() : value(0) {} + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = + GET_OPCODE_SIG_TYPE_DEST(sig) ? 
OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; + + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.offset; } +}; + +struct SymbolOp : Op { + Function* value; + + protected: + template + friend struct Op; + template + friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; + + protected: + template + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.label; } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + assert_true(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { return reg(); } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); 
+ } else if (!is_constant && !b.is_constant) { + return reg_.getIdx() == b.reg_.getIdx(); + } else { + return false; + } + } + bool IsEqual(const Xbyak::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.getIdx() == b.getIdx(); + } else { + return false; + } + } + bool operator==(const T& b) const { return IsEqual(b); } + bool operator!=(const T& b) const { return !IsEqual(b); } + bool operator==(const Xbyak::Reg& b) const { return IsEqual(b); } + bool operator!=(const Xbyak::Reg& b) const { return !IsEqual(b); } + void Load(const Instr::Op& op) { + value = op.value; + is_constant = value->IsConstant(); + if (!is_constant) { + X64Emitter::SetupReg(value, reg_); + } + } + + protected: + REG_TYPE reg_; +}; + +struct I8Op : ValueOp { + typedef ValueOp BASE; + const int8_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i8; + } +}; +struct I16Op : ValueOp { + typedef ValueOp BASE; + const int16_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i16; + } +}; +struct I32Op : ValueOp { + typedef ValueOp BASE; + const int32_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i32; + } +}; +struct I64Op : ValueOp { + typedef ValueOp BASE; + const int64_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = BASE::value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. 
+ return true; + } + return false; + } +}; +struct F32Op : ValueOp { + typedef ValueOp BASE; + const float constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f32; + } +}; +struct F64Op : ValueOp { + typedef ValueOp BASE; + const double constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f64; + } +}; +struct V128Op : ValueOp { + typedef ValueOp BASE; + const vec128_t& constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.v128; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; + + protected: + bool LoadDest(const Instr* i) { + Instr::Op op; + op.value = i->dest; + dest.Load(op); + return true; + } +}; +template <> +struct DestField { + protected: + bool LoadDest(const Instr* i) { return true; } +}; + +template +struct I; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type 
= DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +static const T GetTempReg(X64Emitter& e); +template <> +const Reg8 GetTempReg(X64Emitter& e) { + return e.al; +} +template <> +const Reg16 GetTempReg(X64Emitter& e) { + return e.ax; +} +template <> +const Reg32 GetTempReg(X64Emitter& e) { + return e.eax; +} +template <> +const Reg64 GetTempReg(X64Emitter& e) { + return e.rax; +} + +template +struct Sequence { + typedef T EmitArgType; + + static constexpr uint32_t head_key() { return T::key; } + + static bool Select(X64Emitter& e, const Instr* i) { + T args; + if (!args.Load(i)) { + return false; + } + SEQ::Emit(e, args); + return true; + } + + template + static void EmitUnaryOp(X64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.mov(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + 
static void EmitCommutativeBinaryOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + if (i.src2.is_constant) { + // Both constants. + if (i.src1.ConstantFitsIn32Reg()) { + e.mov(i.dest, i.src2.constant()); + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else if (i.src2.ConstantFitsIn32Reg()) { + e.mov(i.dest, i.src1.constant()); + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + e.mov(i.dest, i.src1.constant()); + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + // src1 constant. + if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, 
i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitCommutativeCompareOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + 
reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp(X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), + true); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), + false); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_X64_X64_OP_H_ diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index b31bc1b39..4fea97523 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -33,6 +33,7 @@ #include "xenia/base/logging.h" #include "xenia/base/threading.h" #include "xenia/cpu/backend/x64/x64_emitter.h" +#include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" @@ -56,619 
+57,6 @@ using xe::cpu::hir::Instr; typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); std::unordered_map sequence_table; -// Selects the right byte/word/etc from a vector. We need to flip logical -// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) -#define VEC128_B(n) ((n) ^ 0x3) -#define VEC128_W(n) ((n) ^ 0x1) -#define VEC128_D(n) (n) -#define VEC128_F(n) (n) - -enum KeyType { - KEY_TYPE_X = OPCODE_SIG_TYPE_X, - KEY_TYPE_L = OPCODE_SIG_TYPE_L, - KEY_TYPE_O = OPCODE_SIG_TYPE_O, - KEY_TYPE_S = OPCODE_SIG_TYPE_S, - KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, - KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, - KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, - KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, - KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, - KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, - KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, -}; - -#pragma pack(push, 1) -union InstrKey { - struct { - uint32_t opcode : 8; - uint32_t dest : 5; - uint32_t src1 : 5; - uint32_t src2 : 5; - uint32_t src3 : 5; - uint32_t reserved : 4; - }; - uint32_t value; - - operator uint32_t() const { return value; } - - InstrKey() : value(0) {} - InstrKey(uint32_t v) : value(v) {} - InstrKey(const Instr* i) : value(0) { - opcode = i->opcode->num; - uint32_t sig = i->opcode->signature; - dest = - GET_OPCODE_SIG_TYPE_DEST(sig) ? 
OPCODE_SIG_TYPE_V + i->dest->type : 0; - src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); - if (src1 == OPCODE_SIG_TYPE_V) { - src1 += i->src1.value->type; - } - src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); - if (src2 == OPCODE_SIG_TYPE_V) { - src2 += i->src2.value->type; - } - src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); - if (src3 == OPCODE_SIG_TYPE_V) { - src3 += i->src3.value->type; - } - } - - template - struct Construct { - static const uint32_t value = - (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); - }; -}; -#pragma pack(pop) -static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); - -template -struct CombinedStruct; -template <> -struct CombinedStruct<> {}; -template -struct CombinedStruct : T, CombinedStruct {}; - -struct OpBase {}; - -template -struct Op : OpBase { - static const KeyType key_type = KEY_TYPE; -}; - -struct VoidOp : Op { - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) {} -}; - -struct OffsetOp : Op { - uint64_t value; - - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) { this->value = op.offset; } -}; - -struct SymbolOp : Op { - Function* value; - - protected: - template - friend struct Op; - template - friend struct I; - bool Load(const Instr::Op& op) { - this->value = op.symbol; - return true; - } -}; - -struct LabelOp : Op { - hir::Label* value; - - protected: - template - friend struct Op; - template - friend struct I; - void Load(const Instr::Op& op) { this->value = op.label; } -}; - -template -struct ValueOp : Op, KEY_TYPE> { - typedef REG_TYPE reg_type; - const Value* value; - bool is_constant; - virtual bool ConstantFitsIn32Reg() const { return true; } - const REG_TYPE& reg() const { - assert_true(!is_constant); - return reg_; - } - operator const REG_TYPE&() const { return reg(); } - bool IsEqual(const T& b) const { - if (is_constant && b.is_constant) { - return reinterpret_cast(this)->constant() == b.constant(); 
- } else if (!is_constant && !b.is_constant) { - return reg_.getIdx() == b.reg_.getIdx(); - } else { - return false; - } - } - bool IsEqual(const Xbyak::Reg& b) const { - if (is_constant) { - return false; - } else if (!is_constant) { - return reg_.getIdx() == b.getIdx(); - } else { - return false; - } - } - bool operator==(const T& b) const { return IsEqual(b); } - bool operator!=(const T& b) const { return !IsEqual(b); } - bool operator==(const Xbyak::Reg& b) const { return IsEqual(b); } - bool operator!=(const Xbyak::Reg& b) const { return !IsEqual(b); } - void Load(const Instr::Op& op) { - value = op.value; - is_constant = value->IsConstant(); - if (!is_constant) { - X64Emitter::SetupReg(value, reg_); - } - } - - protected: - REG_TYPE reg_; -}; - -struct I8Op : ValueOp { - typedef ValueOp BASE; - const int8_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i8; - } -}; -struct I16Op : ValueOp { - typedef ValueOp BASE; - const int16_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i16; - } -}; -struct I32Op : ValueOp { - typedef ValueOp BASE; - const int32_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i32; - } -}; -struct I64Op : ValueOp { - typedef ValueOp BASE; - const int64_t constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.i64; - } - bool ConstantFitsIn32Reg() const override { - int64_t v = BASE::value->constant.i64; - if ((v & ~0x7FFFFFFF) == 0) { - // Fits under 31 bits, so just load using normal mov. - return true; - } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { - // Negative number that fits in 32bits. 
- return true; - } - return false; - } -}; -struct F32Op : ValueOp { - typedef ValueOp BASE; - const float constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.f32; - } -}; -struct F64Op : ValueOp { - typedef ValueOp BASE; - const double constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.f64; - } -}; -struct V128Op : ValueOp { - typedef ValueOp BASE; - const vec128_t& constant() const { - assert_true(BASE::is_constant); - return BASE::value->constant.v128; - } -}; - -template -struct DestField; -template -struct DestField { - DEST dest; - - protected: - bool LoadDest(const Instr* i) { - Instr::Op op; - op.value = i->dest; - dest.Load(op); - return true; - } -}; -template <> -struct DestField { - protected: - bool LoadDest(const Instr* i) { return true; } -}; - -template -struct I; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - const Instr* instr; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - const Instr* instr; - SRC1 src1; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type 
= DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - static const KeyType src2_type = SRC2::key_type; - const Instr* instr; - SRC1 src1; - SRC2 src2; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - src2.Load(i->src2); - return true; - } - return false; - } -}; -template -struct I : DestField { - typedef DestField BASE; - static const hir::Opcode opcode = OPCODE; - static const uint32_t key = - InstrKey::Construct::value; - static const KeyType dest_type = DEST::key_type; - static const KeyType src1_type = SRC1::key_type; - static const KeyType src2_type = SRC2::key_type; - static const KeyType src3_type = SRC3::key_type; - const Instr* instr; - SRC1 src1; - SRC2 src2; - SRC3 src3; - - protected: - template - friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { - instr = i; - src1.Load(i->src1); - src2.Load(i->src2); - src3.Load(i->src3); - return true; - } - return false; - } -}; - -template -const T GetTempReg(X64Emitter& e); -template <> -const Reg8 GetTempReg(X64Emitter& e) { - return e.al; -} -template <> -const Reg16 GetTempReg(X64Emitter& e) { - return e.ax; -} -template <> -const Reg32 GetTempReg(X64Emitter& e) { - return e.eax; -} -template <> -const Reg64 GetTempReg(X64Emitter& e) { - return e.rax; -} - -template -struct Sequence { - typedef T EmitArgType; - - static constexpr uint32_t head_key() { return T::key; } - - static bool Select(X64Emitter& e, const Instr* i) { - T args; - if (!args.Load(i)) { - return false; - } - SEQ::Emit(e, args); - return true; - } - - template - static void EmitUnaryOp(X64Emitter& e, const EmitArgType& i, - const REG_FN& reg_fn) { - if (i.src1.is_constant) { - e.mov(i.dest, i.src1.constant()); - reg_fn(e, i.dest); - } else { - if (i.dest != i.src1) { - e.mov(i.dest, i.src1); - } - reg_fn(e, i.dest); - } - } - - template - static 
void EmitCommutativeBinaryOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - if (i.src2.is_constant) { - // Both constants. - if (i.src1.ConstantFitsIn32Reg()) { - e.mov(i.dest, i.src2.constant()); - reg_const_fn(e, i.dest, static_cast(i.src1.constant())); - } else if (i.src2.ConstantFitsIn32Reg()) { - e.mov(i.dest, i.src1.constant()); - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - e.mov(i.dest, i.src1.constant()); - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - // src1 constant. - if (i.dest == i.src2) { - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src1.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2); - } - } - } else if (i.src2.is_constant) { - if (i.dest == i.src1) { - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src2.constant()); - reg_reg_fn(e, i.dest, i.src1); - } - } else { - if (i.dest == i.src1) { - reg_reg_fn(e, i.dest, i.src2); - } else if (i.dest == i.src2) { - reg_reg_fn(e, i.dest, i.src1); - } else { - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, i.src2); - } - } - } - template - static void EmitAssociativeBinaryOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.dest == i.src2) { - auto temp = GetTempReg(e); - e.mov(temp, i.src2); - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, temp); - } else { - e.mov(i.dest, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2); - } 
- } else if (i.src2.is_constant) { - if (i.dest == i.src1) { - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } else { - e.mov(i.dest, i.src1); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, temp); - } - } - } else { - if (i.dest == i.src1) { - reg_reg_fn(e, i.dest, i.src2); - } else if (i.dest == i.src2) { - auto temp = GetTempReg(e); - e.mov(temp, i.src2); - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, temp); - } else { - e.mov(i.dest, i.src1); - reg_reg_fn(e, i.dest, i.src2); - } - } - } - - template - static void EmitCommutativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, - const FN& fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - fn(e, i.dest, e.xmm0, i.src2); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - fn(e, i.dest, i.src1, e.xmm0); - } else { - fn(e, i.dest, i.src1, i.src2); - } - } - - template - static void EmitAssociativeBinaryXmmOp(X64Emitter& e, const EmitArgType& i, - const FN& fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - fn(e, i.dest, e.xmm0, i.src2); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - fn(e, i.dest, i.src1, e.xmm0); - } else { - fn(e, i.dest, i.src1, i.src2); - } - } - - template - static void EmitCommutativeCompareOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, 
i.src2, static_cast(i.src1.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.src2, temp); - } - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.src1, static_cast(i.src2.constant())); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.src1, temp); - } - } else { - reg_reg_fn(e, i.src1, i.src2); - } - } - template - static void EmitAssociativeCompareOp(X64Emitter& e, const EmitArgType& i, - const REG_REG_FN& reg_reg_fn, - const REG_CONST_FN& reg_const_fn) { - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - if (i.src1.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), - true); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - reg_reg_fn(e, i.dest, i.src2, temp, true); - } - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - if (i.src2.ConstantFitsIn32Reg()) { - reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), - false); - } else { - auto temp = GetTempReg(e); - e.mov(temp, i.src2.constant()); - reg_reg_fn(e, i.dest, i.src1, temp, false); - } - } else { - reg_reg_fn(e, i.dest, i.src1, i.src2, false); - } - } -}; - -template -static bool Register() { - sequence_table.insert({T::head_key(), T::Select}); - return true; -} -template -static bool Register() { - bool b = true; - b = b && Register(); // Call the above function - b = b && Register(); // Call ourself again (recursively) - return b; -} -#define EMITTER_OPCODE_TABLE(name, ...) 
\ - static bool Registered_##name = Register<__VA_ARGS__>(); - // ============================================================================ // OPCODE_COMMENT // ============================================================================ diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index a0103fca5..edb483022 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -12,6 +12,8 @@ #include "xenia/cpu/hir/instr.h" +#include + namespace xe { namespace cpu { namespace backend { @@ -19,6 +21,25 @@ namespace x64 { class X64Emitter; +typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); +extern std::unordered_map sequence_table; + +template +bool Register() { + sequence_table.insert({T::head_key(), T::Select}); + return true; +} + +template +static bool Register() { + bool b = true; + b = b && Register(); // Call the above function + b = b && Register(); // Call ourself again (recursively) + return b; +} +#define EMITTER_OPCODE_TABLE(name, ...) \ + const auto X64_INSTR_##name = Register<__VA_ARGS__>(); + void RegisterSequences(); bool SelectSequence(X64Emitter* e, const hir::Instr* i, const hir::Instr** new_tail); From c3180097330de60f2d98a258a580c03fba6125ac Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sun, 18 Nov 2018 19:23:39 -0600 Subject: [PATCH 21/31] [x64] Factor out vector handling code --- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 2623 +++++++++++++++++++ src/xenia/cpu/backend/x64/x64_sequences.cc | 2598 +----------------- src/xenia/cpu/backend/x64/x64_sequences.h | 3 + 3 files changed, 2627 insertions(+), 2597 deletions(-) create mode 100644 src/xenia/cpu/backend/x64/x64_seq_vector.cc diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc new file mode 100644 index 000000000..9e8bf19de --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -0,0 +1,2623 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" + +// For OPCODE_PACK/OPCODE_UNPACK +#include "third_party/half/include/half.hpp" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +void RegisterVector() {} + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +struct VECTOR_CONVERT_I2F + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // xmm0 = mask of positive values + e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); + + // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] + e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); + + // xmm1 = [0, INT_MAX] + e.vcvtdq2ps(i.dest, e.xmm1); + + // scale values back above [INT_MIN, UINT_MAX] + e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + e.vaddps(i.dest, i.dest, e.xmm0); + } else { + e.vcvtdq2ps(i.dest, i.src1); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// ============================================================================ +struct VECTOR_CONVERT_F2I + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // clamp to min 0 + e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); + + // xmm1 = mask of values >= (unsigned)INT_MIN + e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + + // scale any values >= (unsigned)INT_MIN back to [0, ...] 
+ e.vsubps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + e.vblendvps(e.xmm0, e.xmm0, e.xmm2, e.xmm1); + + // xmm0 = [0, INT_MAX] + // this may still contain values > INT_MAX (if src has vals > UINT_MAX) + e.vcvttps2dq(i.dest, e.xmm0); + + // xmm0 = mask of values that need saturation + e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); + + // scale values back above [INT_MIN, UINT_MAX] + e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntMin)); + e.vpaddd(i.dest, i.dest, e.xmm1); + + // saturate values > UINT_MAX + e.vpor(i.dest, i.dest, e.xmm0); + } else { + // xmm2 = NaN mask + e.vcmpunordps(e.xmm2, i.src1, i.src1); + + // convert packed floats to packed dwords + e.vcvttps2dq(e.xmm0, i.src1); + + // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0 + e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin)); + e.vpandn(e.xmm1, i.src1, e.xmm1); + + // saturate positive values + e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1); + + // mask NaNs + e.vpandn(i.dest, e.xmm2, i.dest); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), +}; +struct LOAD_VECTOR_SHL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsl_table)); + e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { + // TODO(benvanik): find a cheaper way of doing this. + e.movzx(e.rdx, i.src1); + e.and_(e.dx, 0xF); + e.shl(e.dx, 4); + e.mov(e.rax, (uintptr_t)lvsl_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), +}; +struct LOAD_VECTOR_SHR_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsr_table)); + e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { + // TODO(benvanik): find a cheaper way of doing this. + e.movzx(e.rdx, i.src1); + e.and_(e.dx, 0xF); + e.shl(e.dx, 4); + e.mov(e.rax, (uintptr_t)lvsr_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); + +// ============================================================================ +// OPCODE_VECTOR_MAX +// ============================================================================ +struct VECTOR_MAX + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.vpmaxub(dest, src1, src2); + break; + case INT16_TYPE: + e.vpmaxuw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpmaxud(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.vpmaxsb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpmaxsw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpmaxsd(dest, src1, src2); + break; + default: + 
assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); + +// ============================================================================ +// OPCODE_VECTOR_MIN +// ============================================================================ +struct VECTOR_MIN + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.vpminub(dest, src1, src2); + break; + case INT16_TYPE: + e.vpminuw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpminud(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.vpminsb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpminsw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpminsd(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +struct VECTOR_COMPARE_EQ_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpeqw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpeqd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpeqps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, VECTOR_COMPARE_EQ_V128); + +// 
============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +struct VECTOR_COMPARE_SGT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpgtw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpgtd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpgtps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGE +// ============================================================================ +struct VECTOR_COMPARE_SGE_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(e.xmm0, src1, src2); + e.vpcmpgtb(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT16_TYPE: + e.vpcmpeqw(e.xmm0, src1, src2); + e.vpcmpgtw(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT32_TYPE: + e.vpcmpeqd(e.xmm0, src1, src2); + e.vpcmpgtd(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case FLOAT32_TYPE: + e.vcmpgeps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGT +// ============================================================================ +struct VECTOR_COMPARE_UGT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Address sign_addr 
= e.ptr[e.rax]; // dummy + switch (i.instr->flags) { + case INT8_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); + break; + case INT16_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); + break; + case INT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); + break; + case FLOAT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); + break; + default: + assert_always(); + break; + } + if (i.src1.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpxor(e.xmm0, sign_addr); + } else { + e.vpxor(e.xmm0, i.src1, sign_addr); + } + if (i.src2.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm1, i.src2.constant()); + e.vpxor(e.xmm1, sign_addr); + } else { + e.vpxor(e.xmm1, i.src2, sign_addr); + } + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); + break; + case INT16_TYPE: + e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); + break; + case INT32_TYPE: + e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); + break; + case FLOAT32_TYPE: + e.vcmpgtps(i.dest, e.xmm0, e.xmm1); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGE +// ============================================================================ +struct VECTOR_COMPARE_UGE_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy + switch (i.instr->flags) { + case INT8_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); + break; + case INT16_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); + break; + case INT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); + break; + case FLOAT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); + break; + } + if (i.src1.is_constant) { + // TODO(benvanik): make this constant. 
+ e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpxor(e.xmm0, sign_addr); + } else { + e.vpxor(e.xmm0, i.src1, sign_addr); + } + if (i.src2.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm1, i.src2.constant()); + e.vpxor(e.xmm1, sign_addr); + } else { + e.vpxor(e.xmm1, i.src2, sign_addr); + } + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case INT16_TYPE: + e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case INT32_TYPE: + e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1); + e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); + e.vpor(i.dest, e.xmm2); + break; + case FLOAT32_TYPE: + e.vcmpgeps(i.dest, e.xmm0, e.xmm1); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_ADD +// ============================================================================ +struct VECTOR_ADD + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddusb(dest, src1, src2); + } else { + e.vpaddsb(dest, src1, src2); + } + } else { + e.vpaddb(dest, src1, src2); + } + break; + case INT16_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddusw(dest, src1, src2); + } else { + e.vpaddsw(dest, src1, src2); + } + } else { + e.vpaddw(dest, src1, src2); 
+ } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + // xmm0 is the only temp register that can be used by + // src1/src2. + e.vpaddd(e.xmm1, src1, src2); + + // If result is smaller than either of the inputs, we've + // overflowed (only need to check one input) + // if (src1 > res) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); + e.vpor(dest, e.xmm1, e.xmm0); + } else { + e.vpaddd(e.xmm1, src1, src2); + + // Overflow results if two inputs are the same sign and the + // result isn't the same sign. if ((s32b)(~(src1 ^ src2) & + // (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, src2); + e.vpxor(e.xmm3, src1, e.xmm1); + e.vpandn(e.xmm2, e.xmm2, e.xmm3); + + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm3, src1, e.xmm2); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), + e.xmm3); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm3, src1, e.xmm2); + e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), + e.xmm3); + } + } else { + e.vpaddd(dest, src1, src2); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.vaddps(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); + +// ============================================================================ +// OPCODE_VECTOR_SUB +// ============================================================================ +struct VECTOR_SUB + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { + const TypeName part_type = + static_cast(i.instr->flags & 
0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpsubusb(dest, src1, src2); + } else { + e.vpsubsb(dest, src1, src2); + } + } else { + e.vpsubb(dest, src1, src2); + } + break; + case INT16_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpsubusw(dest, src1, src2); + } else { + e.vpsubsw(dest, src1, src2); + } + } else { + e.vpsubw(dest, src1, src2); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + // xmm0 is the only temp register that can be used by + // src1/src2. + e.vpsubd(e.xmm1, src1, src2); + + // If result is greater than either of the inputs, we've + // underflowed (only need to check one input) + // if (res > src1) then underflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); + e.vpandn(dest, e.xmm0, e.xmm1); + } else { + e.vpsubd(e.xmm1, src1, src2); + + // We can only overflow if the signs of the operands are + // opposite. If signs are opposite and result sign isn't the + // same as src1's sign, we've overflowed. 
if ((s32b)((src1 ^ + // src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, src2); + e.vpxor(e.xmm3, src1, e.xmm1); + e.vpand(e.xmm2, e.xmm2, e.xmm3); + + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm3, src1, e.xmm2); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), + e.xmm3); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm3, src1, e.xmm2); + e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), + e.xmm3); + } + } else { + e.vpsubd(dest, src1, src2); + } + break; + case FLOAT32_TYPE: + e.vsubps(dest, src1, src2); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +struct VECTOR_SHL_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t value[16]; + alignas(16) uint8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] << (shamt[i] & 0x7); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) { + alignas(16) uint16_t value[8]; + alignas(16) uint16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] << (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsllw(i.dest, src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + // See if the shift is equal first for a shortcut. + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsllw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsllw(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). 
+ e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { + alignas(16) uint32_t value[4]; + alignas(16) uint32_t shamt[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 4; ++i) { + value[i] = value[i] << (shamt[i] & 0x1F); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, src1, shamt.u8[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, src1, e.xmm0); + } else { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(i.dest, src1, e.xmm0); + } + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. 
+ // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +struct VECTOR_SHR_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t value[16]; + alignas(16) uint8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] >> (shamt[i] & 0x7); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) { + alignas(16) uint16_t value[8]; + alignas(16) uint16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] >> (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsrlw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrlw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrlw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). 
+ e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { + alignas(16) uint32_t value[4]; + alignas(16) uint32_t shamt[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 4; ++i) { + value[i] = value[i] >> (shamt[i] & 0x1F); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm2; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrld. + e.vpsrld(i.dest, src1, shamt.u8[0] & 0x1F); + return; + } else { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsrlvd(i.dest, src1, e.xmm0); + return; + } + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. 
+ // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrld. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrld(i.dest, src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +struct VECTOR_SHA_V128 + : Sequence> { + static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) { + alignas(16) int8_t value[16]; + alignas(16) int8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] >> (shamt[i] & 0x7); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); + e.vmovaps(i.dest, e.xmm0); + } + + static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { + alignas(16) int16_t value[8]; + alignas(16) int16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] >> (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsraw. + e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsraw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsraw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). 
+ e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + + static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { + alignas(16) int32_t value[4]; + alignas(16) int32_t shamt[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 4; ++i) { + value[i] = value[i] >> (shamt[i] & 0x1F); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrad. + e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + } else { + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + } + e.vpsravd(i.dest, i.src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vpxor(e.xmm1, e.xmm0, i.src2); + e.vptest(e.xmm1, e.xmm1); + e.jnz(emu); + + // Equal. Shift using vpsrad. 
+ e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrad(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } + + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); + +// ============================================================================ +// OPCODE_VECTOR_ROTATE_LEFT +// ============================================================================ +// TODO(benvanik): AVX512 has a native variable rotate (rolv). 
+struct VECTOR_ROTATE_LEFT_V128 + : Sequence> { + static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t value[16]; + alignas(16) uint8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & 0x7); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) { + alignas(16) uint16_t value[8]; + alignas(16) uint16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) { + alignas(16) uint32_t value[4]; + alignas(16) uint32_t shamt[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 4; ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & 0x1F); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + // TODO(benvanik): native version (with shift magic). + e.lea(e.r8, e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); + e.vmovaps(i.dest, e.xmm0); + break; + case INT16_TYPE: + // TODO(benvanik): native version (with shift magic). 
+ e.lea(e.r8, e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); + e.vmovaps(i.dest, e.xmm0); + break; + case INT32_TYPE: { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + Xmm temp = i.dest; + if (i.dest == i.src1 || i.dest == i.src2) { + temp = e.xmm2; + } + // Shift left (to get high bits): + e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(e.xmm1, i.src1, e.xmm0); + // Shift right (to get low bits): + e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); + e.vpsubd(temp, e.xmm0); + e.vpsrlvd(i.dest, i.src1, temp); + // Merge: + e.vpor(i.dest, e.xmm1); + } else { + // TODO(benvanik): non-AVX2 native version. + e.lea(e.r8, e.StashXmm(0, i.src1)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); + e.vmovaps(i.dest, e.xmm0); + } + break; + } + default: + assert_always(); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); + +// ============================================================================ +// OPCODE_VECTOR_AVERAGE +// ============================================================================ +struct VECTOR_AVERAGE + : Sequence> { + static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1, + __m128i src2) { + alignas(16) uint32_t src1v[4]; + alignas(16) uint32_t src2v[4]; + alignas(16) uint32_t value[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); + for (size_t i = 0; i < 4; ++i) { + auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1; + value[i] = uint32_t(t); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); 
+ } + static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1, + __m128i src2) { + alignas(16) int32_t src1v[4]; + alignas(16) int32_t src2v[4]; + alignas(16) int32_t value[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); + for (size_t i = 0; i < 4; ++i) { + auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1; + value[i] = int32_t(t); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp( + e, i, + [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + switch (part_type) { + case INT8_TYPE: + if (is_unsigned) { + e.vpavgb(dest, src1, src2); + } else { + assert_always(); + } + break; + case INT16_TYPE: + if (is_unsigned) { + e.vpavgw(dest, src1, src2); + } else { + assert_always(); + } + break; + case INT32_TYPE: + // No 32bit averages in AVX. 
+ if (is_unsigned) { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAverageUnsignedI32)); + e.vmovaps(i.dest, e.xmm0); + } else { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAverageSignedI32)); + e.vmovaps(i.dest, e.xmm0); + } + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); + +// ============================================================================ +// OPCODE_INSERT +// ============================================================================ +struct INSERT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrb(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x3); + } +}; +struct INSERT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrw(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x1); + } +}; +struct INSERT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.vpinsrd(i.dest, i.src3, i.src2.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); + +// ============================================================================ +// OPCODE_EXTRACT +// ============================================================================ +// TODO(benvanik): sequence extract/splat: +// v0.i32 = extract v0.v128, 0 +// v0.v128 = splat v0.i32 +// This can be a single broadcast. 
+struct EXTRACT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); + } else { + e.mov(e.eax, 0x00000003); + e.xor_(e.al, i.src2); + e.and_(e.al, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and_(i.dest, uint8_t(0xFF)); + } + } +}; +struct EXTRACT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); + } else { + e.mov(e.al, i.src2); + e.xor_(e.al, 0x01); + e.shl(e.al, 1); + e.mov(e.ah, e.al); + e.add(e.ah, 1); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and_(i.dest.reg().cvt32(), 0xFFFFu); + } + } +}; +struct EXTRACT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + static const vec128_t extract_table_32[4] = { + vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + }; + if (i.src2.is_constant) { + // TODO(gibbed): add support to constant propagation pass for + // OPCODE_EXTRACT. 
+ Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm0; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + if (i.src2.constant() == 0) { + e.vmovd(i.dest, src1); + } else { + e.vpextrd(i.dest, src1, VEC128_D(i.src2.constant())); + } + } else { + // TODO(benvanik): try out hlide's version: + // e.mov(e.eax, 3); + // e.and_(e.al, i.src2); // eax = [(i&3), 0, 0, 0] + // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] + // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, + // ((i&3)*4)+0] + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest.reg().cvt32(), e.xmm0); + // Get the desired word in xmm0, then extract that. + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm1; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.xor_(e.rax, e.rax); + e.mov(e.al, i.src2); + e.and_(e.al, 0x03); + e.shl(e.al, 4); + e.mov(e.rdx, reinterpret_cast(extract_table_32)); + e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); + e.vpshufb(e.xmm0, src1, e.xmm0); + e.vpextrd(i.dest, e.xmm0, 0); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); + +// ============================================================================ +// OPCODE_SPLAT +// ============================================================================ +// Copy a value into all elements of a vector +struct SPLAT_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. 
+ e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastb(i.dest, e.xmm0); + } else { + e.vpunpcklbw(e.xmm0, e.xmm0); + e.vpunpcklwd(e.xmm0, e.xmm0); + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastw(i.dest, e.xmm0); + } else { + e.vpunpcklwd(e.xmm0, e.xmm0); // unpack low word data + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + } else { + e.vmovd(e.xmm0, i.src1); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastd(i.dest, e.xmm0); + } else { + e.vpshufd(i.dest, e.xmm0, 0); + } + } +}; +struct SPLAT_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. 
+ e.mov(e.eax, i.src1.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.vbroadcastss(i.dest, e.xmm0); + } else { + e.vbroadcastss(i.dest, i.src1); + } + } else { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.value->constant.i32); + e.vmovd(i.dest, e.eax); + e.vshufps(i.dest, i.dest, i.dest, 0); + } else { + e.vshufps(i.dest, i.src1, i.src1, 0); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); + +// ============================================================================ +// OPCODE_PERMUTE +// ============================================================================ +struct PERMUTE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.instr->flags == INT32_TYPE); + // Permute words between src2 and src3. + // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. + if (i.src1.is_constant) { + uint32_t control = i.src1.constant(); + // Shuffle things into the right places in dest & xmm0, + // then we blend them together. + uint32_t src_control = + (((control >> 24) & 0x3) << 6) | (((control >> 16) & 0x3) << 4) | + (((control >> 8) & 0x3) << 2) | (((control >> 0) & 0x3) << 0); + + uint32_t blend_control = 0; + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Blender for vpblendd + blend_control = + (((control >> 26) & 0x1) << 3) | (((control >> 18) & 0x1) << 2) | + (((control >> 10) & 0x1) << 1) | (((control >> 2) & 0x1) << 0); + } else { + // Blender for vpblendw + blend_control = + (((control >> 26) & 0x1) << 6) | (((control >> 18) & 0x1) << 4) | + (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 0); + blend_control |= blend_control << 1; + } + + // TODO(benvanik): if src2/src3 are constants, shuffle now! 
+ Xmm src2; + if (i.src2.is_constant) { + src2 = e.xmm1; + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + Xmm src3; + if (i.src3.is_constant) { + src3 = e.xmm2; + e.LoadConstantXmm(src3, i.src3.constant()); + } else { + src3 = i.src3; + } + if (i.dest != src3) { + e.vpshufd(i.dest, src2, src_control); + e.vpshufd(e.xmm0, src3, src_control); + } else { + e.vmovaps(e.xmm0, src3); + e.vpshufd(i.dest, src2, src_control); + e.vpshufd(e.xmm0, e.xmm0, src_control); + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 $2 + } else { + e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 $2 + } + } else { + // Permute by non-constant. + assert_always(); + } + } +}; +struct PERMUTE_V128 + : Sequence> { + static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find out how to do this with only one temp register! + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + // Permuting with src2/zero, so just shuffle/mask. + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. + e.vpxor(i.dest, i.dest); + } else { + // Control mask needs to be shuffled. + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vxorps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSwapWordMask)); + } else { + e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); + } + e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask)); + if (i.src2.is_constant) { + e.LoadConstantXmm(i.dest, i.src2.constant()); + e.vpshufb(i.dest, i.dest, e.xmm0); + } else { + e.vpshufb(i.dest, i.src2, e.xmm0); + } + // Build a mask with values in src2 having 0 and values in src3 having + // 1. + e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpandn(i.dest, e.xmm0, i.dest); + } + } else { + // General permute. 
+ // Control mask needs to be shuffled. + // TODO(benvanik): do constants here instead of in generated code. + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src1.constant()); + e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask)); + } else { + e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); + } + e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask)); + Xmm src2_shuf = e.xmm0; + if (i.src2.value->IsConstantZero()) { + e.vpxor(src2_shuf, src2_shuf); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(src2_shuf, i.src2.constant()); + e.vpshufb(src2_shuf, src2_shuf, e.xmm2); + } else { + e.vpshufb(src2_shuf, i.src2, e.xmm2); + } + Xmm src3_shuf = e.xmm1; + if (i.src3.value->IsConstantZero()) { + e.vpxor(src3_shuf, src3_shuf); + } else if (i.src3.is_constant) { + e.LoadConstantXmm(src3_shuf, i.src3.constant()); + e.vpshufb(src3_shuf, src3_shuf, e.xmm2); + } else { + e.vpshufb(src3_shuf, i.src3, e.xmm2); + } + // Build a mask with values in src2 having 0 and values in src3 having 1. + e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); + } + } + + static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { + // src1 is an array of indices corresponding to positions within src2 and + // src3. + assert_true(i.src1.is_constant); + vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; + } + e.LoadConstantXmm(e.xmm0, perm); + + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src2.constant()); + } else { + e.vmovdqa(e.xmm1, i.src2); + } + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } else { + e.vmovdqa(e.xmm2, i.src3); + } + + e.vpshufb(e.xmm1, e.xmm1, e.xmm0); + e.vpshufb(e.xmm2, e.xmm2, e.xmm0); + + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); + } + + static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { + assert_always(); + } + + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitByInt8(e, i); + break; + case INT16_TYPE: + EmitByInt16(e, i); + break; + case INT32_TYPE: + EmitByInt32(e, i); + break; + default: + assert_unhandled_case(i.instr->flags); + return; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); + +// ============================================================================ +// OPCODE_SWIZZLE +// ============================================================================ +struct SWIZZLE + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto element_type = i.instr->flags; + if (element_type == INT8_TYPE) { + assert_always(); + } else if (element_type == INT16_TYPE) { + assert_always(); + } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { + uint8_t swizzle_mask = static_cast(i.src2.value); + Xmm src1; + if (i.src1.is_constant) { + src1 = e.xmm0; + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + e.vpshufd(i.dest, src1, swizzle_mask); + } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { + assert_always(); + } else { + assert_always(); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, 
SWIZZLE); + +// ============================================================================ +// OPCODE_PACK +// ============================================================================ +struct PACK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] + // are valid. + e.vminps(i.dest, src, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); + e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333)); + // Extract bytes. 
+ // RGBA (XYZW) -> ARGB (WXYZ) + // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); + } + static __m128i EmulateFLOAT16_2(void*, __m128 src1) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + _mm_store_ps(a, src1); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return _mm_load_si128(reinterpret_cast<__m128i*>(b)); + } + static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.x | src1.y), 0, 0, 0] + + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // 0|0|0|0|W|Z|Y|X + e.vcvtps2ph(i.dest, src, 0b00000011); + // Shuffle to X|Y|0|0|0|0|0|0 + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.r8, e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); + } + } + static __m128i EmulateFLOAT16_4(void*, __m128 src1) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + _mm_store_ps(a, src1); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 4; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return _mm_load_si128(reinterpret_cast<__m128i*>(b)); + } + static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] + + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } 
else { + src = i.src1; + } + // 0|0|0|0|W|Z|Y|X + e.vcvtps2ph(i.dest, src, 0b00000011); + // Shuffle to X|Y|Z|W|0|0|0|0 + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.r8, e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.vmovaps(i.dest, e.xmm0); + } + } + static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); + } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); + } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + // XYZ are 10 bits, signed and saturated. + // W is 2 bits, unsigned and saturated. + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Saturate. + e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked)); + // Remove the unneeded bits of the floats. 
+ e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components up. + e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate all the components into bits 10-19. + e.vpslld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate all the components into bits 20-39 + // (so alpha will be in 30-31). + e.vpslld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Leave only the needed components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + } + // Combine the components. + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1)); + e.vorps(i.dest, e.xmm0); + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2)); + e.vorps(i.dest, e.xmm0); + } + static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, + __m128i src2) { + alignas(16) uint16_t a[8]; + alignas(16) uint16_t b[8]; + alignas(16) uint8_t c[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 8; ++i) { + c[i] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), a[i]))); + c[i + 8] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), b[i]))); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } + static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t a[16]; + alignas(16) uint8_t b[16]; + alignas(16) uint8_t c[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 8; ++i) { + c[i] = a[i * 2]; + c[i + 8] = b[i * 2]; + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } + static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // 
unsigned -> unsigned + saturate + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulatePack8_IN_16_UN_UN_SAT)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // unsigned -> unsigned + e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + // PACKUSWB / SaturateSignedWordToUnsignedByte + Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(src2, i.src2.constant()); + } + + e.vpackuswb(i.dest, i.src1, src2); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + // PACKSSWB / SaturateSignedWordToSignedByte + e.vpacksswb(i.dest, i.src1, i.src2); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + } else { + // signed -> signed + assert_always(); + } + } + } + } + // Pack 2 32-bit vectors into a 16-bit vector. 
+ static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, + uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + // Construct a saturation max value + e.mov(e.eax, 0xFFFFu); + e.vmovd(e.xmm0, e.eax); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + + if (!i.src1.is_constant) { + e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1 + e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src1.constant().u64[0] == 0 && + i.src1.constant().u64[1] == 0); + e.vpxor(e.xmm1, e.xmm1); + } + + if (!i.src2.is_constant) { + e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2 + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src2.constant().u64[0] == 0 && + i.src2.constant().u64[1] == 0); + e.vpxor(i.dest, i.dest); + } + + e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); + } else { + // unsigned -> unsigned + e.vmovaps(e.xmm0, i.src1); + e.vpshuflw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufhw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufd(e.xmm0, e.xmm0, 0b00001000); + + e.vmovaps(i.dest, i.src2); + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + + e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + // PACKUSDW + // TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0]; + // DEST[15:0] <- (DEST[31:0] > FFFFH) ? 
FFFFH : TMP[15:0]; + e.vpackusdw(i.dest, i.src1, i.src2); + e.vpshuflw(i.dest, i.dest, 0b10110001); + e.vpshufhw(i.dest, i.dest, 0b10110001); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + // PACKSSDW / SaturateSignedDwordToSignedWord + Xmm src2; + if (!i.src2.is_constant) { + src2 = i.src2; + } else { + assert_false(i.src1 == e.xmm0); + src2 = e.xmm0; + e.LoadConstantXmm(src2, i.src2.constant()); + } + e.vpackssdw(i.dest, i.src1, src2); + e.vpshuflw(i.dest, i.dest, 0b10110001); + e.vpshufhw(i.dest, i.dest, 0b10110001); + } else { + // signed -> signed + assert_always(); + } + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); + +// ============================================================================ +// OPCODE_UNPACK +// ============================================================================ +struct UNPACK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { + // ARGB (WXYZ) -> RGBA (XYZW) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); + return; + } + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // src = ZZYYXXWW + // Unpack to 
000000ZZ,000000YY,000000XX,000000WW + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); + // Add 1.0f to each. + e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); + // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. + } + static __m128 EmulateFLOAT16_2(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 2; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); + } + + // Constants, or something + b[2] = 0.f; + b[3] = 1.f; + + return _mm_load_ps(b); + } + static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + // TODO(benvanik): + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) + // Unpacking half floats: + // http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + // Packing half floats: https://gist.github.com/rygorous/2156668 + // Load source, move from tight pack of X16Y16.... to X16...Y16... + // Also zero out the high end. + // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
+ + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // sx = src.iw >> 16; + // sy = src.iw & 0xFFFF; + // dest = { XMConvertHalfToFloat(sx), + // XMConvertHalfToFloat(sy), + // 0.0, + // 1.0 }; + // Shuffle to 0|0|0|0|0|0|Y|X + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); + e.vcvtph2ps(i.dest, i.dest); + e.vpshufd(i.dest, i.dest, 0b10100100); + e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.r8, e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); + } + } + static __m128 EmulateFLOAT16_4(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 4; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + } + + return _mm_load_ps(b); + } + static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { + // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] + Xmm src; + if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle to 0|0|0|0|W|Z|Y|X + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); + e.vcvtph2ps(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + e.lea(e.r8, e.StashXmm(0, src)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.vmovaps(i.dest, e.xmm0); + } + } + static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) + 
// src is (xx,xx,xx,VALUE) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); + // If negative, make smaller than 3 - sign extend before adding. + e.vpslld(i.dest, 16); + e.vpsrad(i.dest, 16); + // Add 3,3,0,1. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 3.0 + (VB.y>>16)*2^-22 + // (VD.w) = 3.0 + (VB.y)*2^-22 + // src is (xx,xx,VALUE,VALUE) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); + // If negative, make smaller than 3 - sign extend before adding. + e.vpslld(i.dest, 16); + e.vpsrad(i.dest, 16); + // Add 3,3,3,3. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + // Return quiet NaNs in case of negative overflow. 
+ e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331)); + return; + } + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Splat W. + e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3)); + // Keep only the needed components. + // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components down. + e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate green in 0-9 and alpha in 20-21. + e.vpsrld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate blue in 0-9 and alpha in 0-1. + e.vpsrld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Remove higher duplicate components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + } + // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. + // W is unsigned. + e.vpslld(i.dest, 22); + e.vpsrad(i.dest, 22); + // Add 3,3,3,1. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, + e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); + // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. + // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB. 
+ } + static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); + e.vpunpckhbw(i.dest, i.dest, i.dest); + e.vpsraw(i.dest, 8); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); + e.vpunpcklbw(i.dest, i.dest, i.dest); + e.vpsraw(i.dest, 8); + } + } + } + } + static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, + uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + Xmm src; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpunpckhwd(i.dest, src, src); + e.vpsrad(i.dest, 16); + } + } + } else { + // Unpack to HI. 
+ if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.vpunpcklwd(i.dest, src, src); + e.vpsrad(i.dest, 16); + } + } + } + e.vpshufd(i.dest, i.dest, 0xB1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 4fea97523..8f8050876 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -38,9 +38,6 @@ #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" -// For OPCODE_PACK/OPCODE_UNPACK -#include "third_party/half/include/half.hpp" - namespace xe { namespace cpu { namespace backend { @@ -964,169 +961,6 @@ struct ROUND_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128); -// ============================================================================ -// OPCODE_VECTOR_CONVERT_I2F -// ============================================================================ -struct VECTOR_CONVERT_I2F - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // flags = ARITHMETIC_UNSIGNED - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // xmm0 = mask of positive values - e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); - - // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] - e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); - - // xmm1 = [0, INT_MAX] - e.vcvtdq2ps(i.dest, e.xmm1); - - // scale values back above [INT_MIN, UINT_MAX] - e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vaddps(i.dest, i.dest, e.xmm0); - } 
else { - e.vcvtdq2ps(i.dest, i.src1); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); - -// ============================================================================ -// OPCODE_VECTOR_CONVERT_F2I -// ============================================================================ -struct VECTOR_CONVERT_F2I - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // clamp to min 0 - e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); - - // xmm1 = mask of values >= (unsigned)INT_MIN - e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - - // scale any values >= (unsigned)INT_MIN back to [0, ...] - e.vsubps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vblendvps(e.xmm0, e.xmm0, e.xmm2, e.xmm1); - - // xmm0 = [0, INT_MAX] - // this may still contain values > INT_MAX (if src has vals > UINT_MAX) - e.vcvttps2dq(i.dest, e.xmm0); - - // xmm0 = mask of values that need saturation - e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); - - // scale values back above [INT_MIN, UINT_MAX] - e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntMin)); - e.vpaddd(i.dest, i.dest, e.xmm1); - - // saturate values > UINT_MAX - e.vpor(i.dest, i.dest, e.xmm0); - } else { - // xmm2 = NaN mask - e.vcmpunordps(e.xmm2, i.src1, i.src1); - - // convert packed floats to packed dwords - e.vcvttps2dq(e.xmm0, i.src1); - - // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0 - e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin)); - e.vpandn(e.xmm1, i.src1, e.xmm1); - - // saturate positive values - e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1); - - // mask NaNs - e.vpandn(i.dest, e.xmm2, i.dest); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); - -// ============================================================================ -// OPCODE_LOAD_VECTOR_SHL -// 
============================================================================ -static const vec128_t lvsl_table[16] = { - vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), -}; -struct LOAD_VECTOR_SHL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - auto sh = i.src1.constant(); - assert_true(sh < xe::countof(lvsl_table)); - e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); - } else { - // TODO(benvanik): find a cheaper way of doing this. 
- e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsl_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); - -// ============================================================================ -// OPCODE_LOAD_VECTOR_SHR -// ============================================================================ -static const vec128_t lvsr_table[16] = { - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), -}; -struct LOAD_VECTOR_SHR_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - auto sh = i.src1.constant(); - assert_true(sh < xe::countof(lvsr_table)); - e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); - } else { - // TODO(benvanik): find a cheaper way of doing this. 
- e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsr_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); - // ============================================================================ // OPCODE_LOAD_CLOCK // ============================================================================ @@ -2112,51 +1946,6 @@ struct MAX_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128); -// ============================================================================ -// OPCODE_VECTOR_MAX -// ============================================================================ -struct VECTOR_MAX - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - uint32_t part_type = i.instr->flags >> 8; - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - switch (part_type) { - case INT8_TYPE: - e.vpmaxub(dest, src1, src2); - break; - case INT16_TYPE: - e.vpmaxuw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpmaxud(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } else { - switch (part_type) { - case INT8_TYPE: - e.vpmaxsb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpmaxsw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpmaxsd(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); - // ============================================================================ // OPCODE_MIN // ============================================================================ @@ -2247,51 +2036,6 @@ struct MIN_V128 : Sequence> { EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32, MIN_F64, MIN_V128); -// ============================================================================ -// 
OPCODE_VECTOR_MIN -// ============================================================================ -struct VECTOR_MIN - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - uint32_t part_type = i.instr->flags >> 8; - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - switch (part_type) { - case INT8_TYPE: - e.vpminub(dest, src1, src2); - break; - case INT16_TYPE: - e.vpminuw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpminud(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } else { - switch (part_type) { - case INT8_TYPE: - e.vpminsb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpminsw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpminsd(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); - // ============================================================================ // OPCODE_SELECT // ============================================================================ @@ -2808,213 +2552,6 @@ struct DID_SATURATE }; EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, DID_SATURATE); -// ============================================================================ -// OPCODE_VECTOR_COMPARE_EQ -// ============================================================================ -struct VECTOR_COMPARE_EQ_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpcmpeqw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpcmpeqd(dest, src1, src2); - break; - case FLOAT32_TYPE: - e.vcmpeqps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, 
VECTOR_COMPARE_EQ_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_SGT -// ============================================================================ -struct VECTOR_COMPARE_SGT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAssociativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(dest, src1, src2); - break; - case INT16_TYPE: - e.vpcmpgtw(dest, src1, src2); - break; - case INT32_TYPE: - e.vpcmpgtd(dest, src1, src2); - break; - case FLOAT32_TYPE: - e.vcmpgtps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_SGE -// ============================================================================ -struct VECTOR_COMPARE_SGE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAssociativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm0, src1, src2); - e.vpcmpgtb(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT16_TYPE: - e.vpcmpeqw(e.xmm0, src1, src2); - e.vpcmpgtw(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT32_TYPE: - e.vpcmpeqd(e.xmm0, src1, src2); - e.vpcmpgtd(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case FLOAT32_TYPE: - e.vcmpgeps(dest, src1, src2); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_UGT -// ============================================================================ -struct VECTOR_COMPARE_UGT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& 
i) { - Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy - switch (i.instr->flags) { - case INT8_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); - break; - case INT16_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); - break; - case INT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); - break; - case FLOAT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); - break; - default: - assert_always(); - break; - } - if (i.src1.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vpxor(e.xmm0, sign_addr); - } else { - e.vpxor(e.xmm0, i.src1, sign_addr); - } - if (i.src2.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - e.vpxor(e.xmm1, sign_addr); - } else { - e.vpxor(e.xmm1, i.src2, sign_addr); - } - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); - break; - case INT16_TYPE: - e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); - break; - case INT32_TYPE: - e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); - break; - case FLOAT32_TYPE: - e.vcmpgtps(i.dest, e.xmm0, e.xmm1); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); - -// ============================================================================ -// OPCODE_VECTOR_COMPARE_UGE -// ============================================================================ -struct VECTOR_COMPARE_UGE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy - switch (i.instr->flags) { - case INT8_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); - break; - case INT16_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); - break; - case INT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); - break; - case FLOAT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); - break; - } - if (i.src1.is_constant) { - // TODO(benvanik): make this constant. 
- e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vpxor(e.xmm0, sign_addr); - } else { - e.vpxor(e.xmm0, i.src1, sign_addr); - } - if (i.src2.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - e.vpxor(e.xmm1, sign_addr); - } else { - e.vpxor(e.xmm1, i.src2, sign_addr); - } - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case INT16_TYPE: - e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case INT32_TYPE: - e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1); - e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); - e.vpor(i.dest, e.xmm2); - break; - case FLOAT32_TYPE: - e.vcmpgeps(i.dest, e.xmm0, e.xmm1); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); - // ============================================================================ // OPCODE_ADD // ============================================================================ @@ -3137,98 +2674,6 @@ struct ADD_CARRY_I64 EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, ADD_CARRY_I32, ADD_CARRY_I64); -// ============================================================================ -// OPCODE_VECTOR_ADD -// ============================================================================ -struct VECTOR_ADD - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); - switch (part_type) { - case INT8_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpaddusb(dest, src1, src2); 
- } else { - e.vpaddsb(dest, src1, src2); - } - } else { - e.vpaddb(dest, src1, src2); - } - break; - case INT16_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpaddusw(dest, src1, src2); - } else { - e.vpaddsw(dest, src1, src2); - } - } else { - e.vpaddw(dest, src1, src2); - } - break; - case INT32_TYPE: - if (saturate) { - if (is_unsigned) { - // xmm0 is the only temp register that can be used by - // src1/src2. - e.vpaddd(e.xmm1, src1, src2); - - // If result is smaller than either of the inputs, we've - // overflowed (only need to check one input) - // if (src1 > res) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); - e.vpor(dest, e.xmm1, e.xmm0); - } else { - e.vpaddd(e.xmm1, src1, src2); - - // Overflow results if two inputs are the same sign and the - // result isn't the same sign. 
if ((s32b)(~(src1 ^ src2) & - // (src1 ^ res)) < 0) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, src2); - e.vpxor(e.xmm3, src1, e.xmm1); - e.vpandn(e.xmm2, e.xmm2, e.xmm3); - - // Set any negative overflowed elements of src1 to INT_MIN - e.vpand(e.xmm3, src1, e.xmm2); - e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), - e.xmm3); - - // Set any positive overflowed elements of src1 to INT_MAX - e.vpandn(e.xmm3, src1, e.xmm2); - e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), - e.xmm3); - } - } else { - e.vpaddd(dest, src1, src2); - } - break; - case FLOAT32_TYPE: - assert_false(is_unsigned); - assert_false(saturate); - e.vaddps(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); - // ============================================================================ // OPCODE_SUB // ============================================================================ @@ -3294,97 +2739,6 @@ struct SUB_V128 : Sequence> { EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, SUB_F64, SUB_V128); -// ============================================================================ -// OPCODE_VECTOR_SUB -// ============================================================================ -struct VECTOR_SUB - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, const Xmm& dest, Xmm src1, Xmm src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); - switch (part_type) { - case INT8_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpsubusb(dest, src1, src2); - } else { - e.vpsubsb(dest, src1, 
src2); - } - } else { - e.vpsubb(dest, src1, src2); - } - break; - case INT16_TYPE: - if (saturate) { - // TODO(benvanik): trace DID_SATURATE - if (is_unsigned) { - e.vpsubusw(dest, src1, src2); - } else { - e.vpsubsw(dest, src1, src2); - } - } else { - e.vpsubw(dest, src1, src2); - } - break; - case INT32_TYPE: - if (saturate) { - if (is_unsigned) { - // xmm0 is the only temp register that can be used by - // src1/src2. - e.vpsubd(e.xmm1, src1, src2); - - // If result is greater than either of the inputs, we've - // underflowed (only need to check one input) - // if (res > src1) then underflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); - e.vpandn(dest, e.xmm0, e.xmm1); - } else { - e.vpsubd(e.xmm1, src1, src2); - - // We can only overflow if the signs of the operands are - // opposite. If signs are opposite and result sign isn't the - // same as src1's sign, we've overflowed. 
if ((s32b)((src1 ^ - // src2) & (src1 ^ res)) < 0) then overflowed - // https://locklessinc.com/articles/sat_arithmetic/ - e.vpxor(e.xmm2, src1, src2); - e.vpxor(e.xmm3, src1, e.xmm1); - e.vpand(e.xmm2, e.xmm2, e.xmm3); - - // Set any negative overflowed elements of src1 to INT_MIN - e.vpand(e.xmm3, src1, e.xmm2); - e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32), - e.xmm3); - - // Set any positive overflowed elements of src1 to INT_MAX - e.vpandn(e.xmm3, src1, e.xmm2); - e.vblendvps(dest, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS), - e.xmm3); - } - } else { - e.vpsubd(dest, src1, src2); - } - break; - case FLOAT32_TYPE: - e.vsubps(dest, src1, src2); - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); - // ============================================================================ // OPCODE_MUL // ============================================================================ @@ -4959,588 +4313,6 @@ struct SHA_I64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_SHA, SHA_I8, SHA_I16, SHA_I32, SHA_I64); -// ============================================================================ -// OPCODE_VECTOR_SHL -// ============================================================================ -struct VECTOR_SHL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } - static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] << (shamt[i] & 0x7); - } - return 
_mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI8)); - e.vmovaps(i.dest, e.xmm0); - } - static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] << (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsllw. - e.vpsllw(i.dest, src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - // See if the shift is equal first for a shortcut. - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsllw. 
- e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsllw(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] << (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpslld. - e.vpslld(i.dest, src1, shamt.u8[0] & 0x1F); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsllvd(i.dest, src1, e.xmm0); - } else { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. 
- e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsllvd(i.dest, src1, e.xmm0); - } - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrad. - e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - - e.vpslld(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); - -// ============================================================================ -// OPCODE_VECTOR_SHR -// ============================================================================ -struct VECTOR_SHR_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } - static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void 
EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI8)); - e.vmovaps(i.dest, e.xmm0); - } - static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsllw. - e.vpsrlw(i.dest, i.src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrlw. - e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrlw(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). 
- e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm2; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsrld. - e.vpsrld(i.dest, src1, shamt.u8[0] & 0x1F); - return; - } else { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsrlvd(i.dest, src1, e.xmm0); - return; - } - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsrlvd(i.dest, src1, e.xmm0); - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. 
- // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrld. - e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrld(i.dest, src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version. - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); - -// ============================================================================ -// OPCODE_VECTOR_SHA -// ============================================================================ -struct VECTOR_SHA_V128 - : Sequence> { - static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) { - alignas(16) int8_t value[16]; - alignas(16) int8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). 
- if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); - e.vmovaps(i.dest, e.xmm0); - } - - static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { - alignas(16) int16_t value[8]; - alignas(16) int16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.u16[n] != shamt.u16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsraw. - e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); - return; - } - } - - // Shift 8 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshuflw(e.xmm0, i.src2, 0b00000000); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsraw. - e.mov(e.rax, 0xF); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsraw(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version (with shift magic). 
- e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - - static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { - alignas(16) int32_t value[4]; - alignas(16) int32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpsrad. - e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); - } else { - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - } - e.vpsravd(i.dest, i.src1, e.xmm0); - } else { - // Shift 4 words in src1 by amount specified in src2. - Xbyak::Label emu, end; - - // See if the shift is equal first for a shortcut. - // Only bother with this check if shift amt isn't constant. - if (!i.src2.is_constant) { - e.vpshufd(e.xmm0, i.src2, 0b00000000); - e.vpxor(e.xmm1, e.xmm0, i.src2); - e.vptest(e.xmm1, e.xmm1); - e.jnz(emu); - - // Equal. Shift using vpsrad. 
- e.mov(e.rax, 0x1F); - e.vmovq(e.xmm1, e.rax); - e.vpand(e.xmm0, e.xmm0, e.xmm1); - e.vpsrad(i.dest, i.src1, e.xmm0); - e.jmp(end); - } - - // TODO(benvanik): native version. - e.L(emu); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); - e.vmovaps(i.dest, e.xmm0); - - e.L(end); - } - } - - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); - // ============================================================================ // OPCODE_ROTATE_LEFT // ============================================================================ @@ -5599,196 +4371,6 @@ struct ROTATE_LEFT_I64 EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT, ROTATE_LEFT_I8, ROTATE_LEFT_I16, ROTATE_LEFT_I32, ROTATE_LEFT_I64); -// ============================================================================ -// OPCODE_VECTOR_ROTATE_LEFT -// ============================================================================ -// TODO(benvanik): AVX512 has a native variable rotate (rolv). 
-struct VECTOR_ROTATE_LEFT_V128 - : Sequence> { - static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - // TODO(benvanik): native version (with shift magic). - e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); - e.vmovaps(i.dest, e.xmm0); - break; - case INT16_TYPE: - // TODO(benvanik): native version (with shift magic). 
- e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); - e.vmovaps(i.dest, e.xmm0); - break; - case INT32_TYPE: { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - Xmm temp = i.dest; - if (i.dest == i.src1 || i.dest == i.src2) { - temp = e.xmm2; - } - // Shift left (to get high bits): - e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsllvd(e.xmm1, i.src1, e.xmm0); - // Shift right (to get low bits): - e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); - e.vpsubd(temp, e.xmm0); - e.vpsrlvd(i.dest, i.src1, temp); - // Merge: - e.vpor(i.dest, e.xmm1); - } else { - // TODO(benvanik): non-AVX2 native version. - e.lea(e.r8, e.StashXmm(0, i.src1)); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); - e.vmovaps(i.dest, e.xmm0); - } - break; - } - default: - assert_always(); - break; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); - -// ============================================================================ -// OPCODE_VECTOR_AVERAGE -// ============================================================================ -struct VECTOR_AVERAGE - : Sequence> { - static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t src1v[4]; - alignas(16) uint32_t src2v[4]; - alignas(16) uint32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1; - value[i] = uint32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); 
- } - static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t src1v[4]; - alignas(16) int32_t src2v[4]; - alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1; - value[i] = int32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; - bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); - switch (part_type) { - case INT8_TYPE: - if (is_unsigned) { - e.vpavgb(dest, src1, src2); - } else { - assert_always(); - } - break; - case INT16_TYPE: - if (is_unsigned) { - e.vpavgw(dest, src1, src2); - } else { - assert_always(); - } - break; - case INT32_TYPE: - // No 32bit averages in AVX. 
- if (is_unsigned) { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageUnsignedI32)); - e.vmovaps(i.dest, e.xmm0); - } else { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageSignedI32)); - e.vmovaps(i.dest, e.xmm0); - } - break; - default: - assert_unhandled_case(part_type); - break; - } - }); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); - // ============================================================================ // OPCODE_BYTE_SWAP // ============================================================================ @@ -5914,1185 +4496,6 @@ struct CNTLZ_I64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); -// ============================================================================ -// OPCODE_INSERT -// ============================================================================ -struct INSERT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrb(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x3); - } -}; -struct INSERT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrw(i.dest, i.src3.reg().cvt32(), i.src2.constant() ^ 0x1); - } -}; -struct INSERT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - e.vpinsrd(i.dest, i.src3, i.src2.constant()); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); - -// 
============================================================================ -// OPCODE_EXTRACT -// ============================================================================ -// TODO(benvanik): sequence extract/splat: -// v0.i32 = extract v0.v128, 0 -// v0.v128 = splat v0.i32 -// This can be a single broadcast. -struct EXTRACT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); - } else { - e.mov(e.eax, 0x00000003); - e.xor_(e.al, i.src2); - e.and_(e.al, 0x1F); - e.vmovd(e.xmm0, e.eax); - e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vmovd(i.dest.reg().cvt32(), e.xmm0); - e.and_(i.dest, uint8_t(0xFF)); - } - } -}; -struct EXTRACT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); - } else { - e.mov(e.al, i.src2); - e.xor_(e.al, 0x01); - e.shl(e.al, 1); - e.mov(e.ah, e.al); - e.add(e.ah, 1); - e.vmovd(e.xmm0, e.eax); - e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vmovd(i.dest.reg().cvt32(), e.xmm0); - e.and_(i.dest.reg().cvt32(), 0xFFFFu); - } - } -}; -struct EXTRACT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - static const vec128_t extract_table_32[4] = { - vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - }; - if (i.src2.is_constant) { - // TODO(gibbed): add support to constant propagation pass for - // OPCODE_EXTRACT. 
- Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm0; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - if (i.src2.constant() == 0) { - e.vmovd(i.dest, src1); - } else { - e.vpextrd(i.dest, src1, VEC128_D(i.src2.constant())); - } - } else { - // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); - // e.and_(e.al, i.src2); // eax = [(i&3), 0, 0, 0] - // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] - // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, - // ((i&3)*4)+0] - // e.vmovd(e.xmm0, e.eax); - // e.vpshufb(e.xmm0, i.src1, e.xmm0); - // e.vmovd(i.dest.reg().cvt32(), e.xmm0); - // Get the desired word in xmm0, then extract that. - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm1; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1.reg(); - } - - e.xor_(e.rax, e.rax); - e.mov(e.al, i.src2); - e.and_(e.al, 0x03); - e.shl(e.al, 4); - e.mov(e.rdx, reinterpret_cast(extract_table_32)); - e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); - e.vpshufb(e.xmm0, src1, e.xmm0); - e.vpextrd(i.dest, e.xmm0, 0); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); - -// ============================================================================ -// OPCODE_SPLAT -// ============================================================================ -// Copy a value into all elements of a vector -struct SPLAT_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. 
- e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1.reg().cvt32()); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastb(i.dest, e.xmm0); - } else { - e.vpunpcklbw(e.xmm0, e.xmm0); - e.vpunpcklwd(e.xmm0, e.xmm0); - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1.reg().cvt32()); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastw(i.dest, e.xmm0); - } else { - e.vpunpcklwd(e.xmm0, e.xmm0); // unpack low word data - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i.src1.constant()); - e.vmovd(e.xmm0, e.eax); - } else { - e.vmovd(e.xmm0, i.src1); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpbroadcastd(i.dest, e.xmm0); - } else { - e.vpshufd(i.dest, e.xmm0, 0); - } - } -}; -struct SPLAT_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - if (i.src1.is_constant) { - // TODO(benvanik): faster constant splats. 
- e.mov(e.eax, i.src1.value->constant.i32); - e.vmovd(e.xmm0, e.eax); - e.vbroadcastss(i.dest, e.xmm0); - } else { - e.vbroadcastss(i.dest, i.src1); - } - } else { - if (i.src1.is_constant) { - e.mov(e.eax, i.src1.value->constant.i32); - e.vmovd(i.dest, e.eax); - e.vshufps(i.dest, i.dest, i.dest, 0); - } else { - e.vshufps(i.dest, i.src1, i.src1, 0); - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); - -// ============================================================================ -// OPCODE_PERMUTE -// ============================================================================ -struct PERMUTE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.instr->flags == INT32_TYPE); - // Permute words between src2 and src3. - // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. - if (i.src1.is_constant) { - uint32_t control = i.src1.constant(); - // Shuffle things into the right places in dest & xmm0, - // then we blend them together. - uint32_t src_control = - (((control >> 24) & 0x3) << 6) | (((control >> 16) & 0x3) << 4) | - (((control >> 8) & 0x3) << 2) | (((control >> 0) & 0x3) << 0); - - uint32_t blend_control = 0; - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Blender for vpblendd - blend_control = - (((control >> 26) & 0x1) << 3) | (((control >> 18) & 0x1) << 2) | - (((control >> 10) & 0x1) << 1) | (((control >> 2) & 0x1) << 0); - } else { - // Blender for vpblendw - blend_control = - (((control >> 26) & 0x1) << 6) | (((control >> 18) & 0x1) << 4) | - (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 0); - blend_control |= blend_control << 1; - } - - // TODO(benvanik): if src2/src3 are constants, shuffle now! 
- Xmm src2; - if (i.src2.is_constant) { - src2 = e.xmm1; - e.LoadConstantXmm(src2, i.src2.constant()); - } else { - src2 = i.src2; - } - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm2; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - src3 = i.src3; - } - if (i.dest != src3) { - e.vpshufd(i.dest, src2, src_control); - e.vpshufd(e.xmm0, src3, src_control); - } else { - e.vmovaps(e.xmm0, src3); - e.vpshufd(i.dest, src2, src_control); - e.vpshufd(e.xmm0, e.xmm0, src_control); - } - - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 $2 - } else { - e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 $2 - } - } else { - // Permute by non-constant. - assert_always(); - } - } -}; -struct PERMUTE_V128 - : Sequence> { - static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): find out how to do this with only one temp register! - // Permute bytes between src2 and src3. - // src1 is an array of indices corresponding to positions within src2 and - // src3. - if (i.src3.value->IsConstantZero()) { - // Permuting with src2/zero, so just shuffle/mask. - if (i.src2.value->IsConstantZero()) { - // src2 & src3 are zero, so result will always be zero. - e.vpxor(i.dest, i.dest); - } else { - // Control mask needs to be shuffled. - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vxorps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSwapWordMask)); - } else { - e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); - } - e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask)); - if (i.src2.is_constant) { - e.LoadConstantXmm(i.dest, i.src2.constant()); - e.vpshufb(i.dest, i.dest, e.xmm0); - } else { - e.vpshufb(i.dest, i.src2, e.xmm0); - } - // Build a mask with values in src2 having 0 and values in src3 having - // 1. - e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15)); - e.vpandn(i.dest, e.xmm0, i.dest); - } - } else { - // General permute. 
- // Control mask needs to be shuffled. - // TODO(benvanik): do constants here instead of in generated code. - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src1.constant()); - e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask)); - } else { - e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask)); - } - e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask)); - Xmm src2_shuf = e.xmm0; - if (i.src2.value->IsConstantZero()) { - e.vpxor(src2_shuf, src2_shuf); - } else if (i.src2.is_constant) { - e.LoadConstantXmm(src2_shuf, i.src2.constant()); - e.vpshufb(src2_shuf, src2_shuf, e.xmm2); - } else { - e.vpshufb(src2_shuf, i.src2, e.xmm2); - } - Xmm src3_shuf = e.xmm1; - if (i.src3.value->IsConstantZero()) { - e.vpxor(src3_shuf, src3_shuf); - } else if (i.src3.is_constant) { - e.LoadConstantXmm(src3_shuf, i.src3.constant()); - e.vpshufb(src3_shuf, src3_shuf, e.xmm2); - } else { - e.vpshufb(src3_shuf, i.src3, e.xmm2); - } - // Build a mask with values in src2 having 0 and values in src3 having 1. - e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); - e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); - } - } - - static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { - // src1 is an array of indices corresponding to positions within src2 and - // src3. - assert_true(i.src1.is_constant); - vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); - vec128_t perm_ctrl = vec128b(0); - for (int i = 0; i < 8; i++) { - perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; - - auto v = uint8_t(perm.u16[i]); - perm.u8[i * 2] = v * 2; - perm.u8[i * 2 + 1] = v * 2 + 1; - } - e.LoadConstantXmm(e.xmm0, perm); - - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - } else { - e.vmovdqa(e.xmm1, i.src2); - } - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src3.constant()); - } else { - e.vmovdqa(e.xmm2, i.src3); - } - - e.vpshufb(e.xmm1, e.xmm1, e.xmm0); - e.vpshufb(e.xmm2, e.xmm2, e.xmm0); - - uint8_t mask = 0; - for (int i = 0; i < 8; i++) { - if (perm_ctrl.i16[i] == 0) { - mask |= 1 << (7 - i); - } - } - e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); - } - - static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { - assert_always(); - } - - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitByInt8(e, i); - break; - case INT16_TYPE: - EmitByInt16(e, i); - break; - case INT32_TYPE: - EmitByInt32(e, i); - break; - default: - assert_unhandled_case(i.instr->flags); - return; - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); - -// ============================================================================ -// OPCODE_SWIZZLE -// ============================================================================ -struct SWIZZLE - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto element_type = i.instr->flags; - if (element_type == INT8_TYPE) { - assert_always(); - } else if (element_type == INT16_TYPE) { - assert_always(); - } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { - uint8_t swizzle_mask = static_cast(i.src2.value); - Xmm src1; - if (i.src1.is_constant) { - src1 = e.xmm0; - e.LoadConstantXmm(src1, i.src1.constant()); - } else { - src1 = i.src1; - } - e.vpshufd(i.dest, src1, swizzle_mask); - } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { - assert_always(); - } else { - assert_always(); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, 
SWIZZLE); - -// ============================================================================ -// OPCODE_PACK -// ============================================================================ -struct PACK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags & PACK_TYPE_MODE) { - case PACK_TYPE_D3DCOLOR: - EmitD3DCOLOR(e, i); - break; - case PACK_TYPE_FLOAT16_2: - EmitFLOAT16_2(e, i); - break; - case PACK_TYPE_FLOAT16_4: - EmitFLOAT16_4(e, i); - break; - case PACK_TYPE_SHORT_2: - EmitSHORT_2(e, i); - break; - case PACK_TYPE_SHORT_4: - EmitSHORT_4(e, i); - break; - case PACK_TYPE_UINT_2101010: - EmitUINT_2101010(e, i); - break; - case PACK_TYPE_8_IN_16: - Emit8_IN_16(e, i, i.instr->flags); - break; - case PACK_TYPE_16_IN_32: - Emit16_IN_32(e, i, i.instr->flags); - break; - default: - assert_unhandled_case(i.instr->flags); - break; - } - } - static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] - // are valid. - e.vminps(i.dest, src, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); - e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333)); - // Extract bytes. 
- // RGBA (XYZW) -> ARGB (WXYZ) - // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | - // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); - } - static __m128i EmulateFLOAT16_2(void*, __m128 src1) { - alignas(16) float a[4]; - alignas(16) uint16_t b[8]; - _mm_store_ps(a, src1); - std::memset(b, 0, sizeof(b)); - - for (int i = 0; i < 2; i++) { - b[7 - i] = half_float::detail::float2half(a[i]); - } - - return _mm_load_si128(reinterpret_cast<__m128i*>(b)); - } - static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // https://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // dest = [(src1.x | src1.y), 0, 0, 0] - - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to X|Y|0|0|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); - } - } - static __m128i EmulateFLOAT16_4(void*, __m128 src1) { - alignas(16) float a[4]; - alignas(16) uint16_t b[8]; - _mm_store_ps(a, src1); - std::memset(b, 0, sizeof(b)); - - for (int i = 0; i < 4; i++) { - b[7 - i] = half_float::detail::float2half(a[i]); - } - - return _mm_load_si128(reinterpret_cast<__m128i*>(b)); - } - static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] - - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - 
} else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to X|Y|Z|W|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); - } - } - static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); - // Pack. - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); - } - static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); - // Pack. - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); - } - static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt - // XYZ are 10 bits, signed and saturated. - // W is 2 bits, unsigned and saturated. - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Saturate. - e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked)); - // Remove the unneeded bits of the floats. 
- e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Shift the components up. - e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); - } else { - // Duplicate all the components into bits 10-19. - e.vpslld(e.xmm0, i.dest, 10); - e.vpor(i.dest, e.xmm0); - // Duplicate all the components into bits 20-39 - // (so alpha will be in 30-31). - e.vpslld(e.xmm0, i.dest, 20); - e.vpor(i.dest, e.xmm0); - // Leave only the needed components. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); - } - // Combine the components. - e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1)); - e.vorps(i.dest, e.xmm0); - e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2)); - e.vorps(i.dest, e.xmm0); - } - static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, - __m128i src2) { - alignas(16) uint16_t a[8]; - alignas(16) uint16_t b[8]; - alignas(16) uint8_t c[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 8; ++i) { - c[i] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), a[i]))); - c[i + 8] = uint8_t(std::max(uint16_t(0), std::min(uint16_t(255), b[i]))); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } - static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t a[16]; - alignas(16) uint8_t b[16]; - alignas(16) uint8_t c[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 8; ++i) { - c[i] = a[i * 2]; - c[i + 8] = b[i * 2]; - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } - static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { - // TODO(benvanik): handle src2 (or src1) being constant zero - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // 
unsigned -> unsigned + saturate - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulatePack8_IN_16_UN_UN_SAT)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // unsigned -> unsigned - e.lea(e.r9, e.StashXmm(1, i.src2)); - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } - } else { - if (IsPackOutSaturate(flags)) { - // unsigned -> signed + saturate - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } - } else { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // signed -> unsigned + saturate - // PACKUSWB / SaturateSignedWordToUnsignedByte - Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(src2, i.src2.constant()); - } - - e.vpackuswb(i.dest, i.src1, src2); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // signed -> unsigned - assert_always(); - } - } else { - if (IsPackOutSaturate(flags)) { - // signed -> signed + saturate - // PACKSSWB / SaturateSignedWordToSignedByte - e.vpacksswb(i.dest, i.src1, i.src2); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); - } else { - // signed -> signed - assert_always(); - } - } - } - } - // Pack 2 32-bit vectors into a 16-bit vector. 
- static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, - uint32_t flags) { - // TODO(benvanik): handle src2 (or src1) being constant zero - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // unsigned -> unsigned + saturate - // Construct a saturation max value - e.mov(e.eax, 0xFFFFu); - e.vmovd(e.xmm0, e.eax); - e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - - if (!i.src1.is_constant) { - e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1 - e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufd(e.xmm1, e.xmm1, 0b00001000); - } else { - // TODO(DrChat): Non-zero constants - assert_true(i.src1.constant().u64[0] == 0 && - i.src1.constant().u64[1] == 0); - e.vpxor(e.xmm1, e.xmm1); - } - - if (!i.src2.is_constant) { - e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2 - e.vpshuflw(i.dest, i.dest, 0b00100010); - e.vpshufhw(i.dest, i.dest, 0b00100010); - e.vpshufd(i.dest, i.dest, 0b10000000); - } else { - // TODO(DrChat): Non-zero constants - assert_true(i.src2.constant().u64[0] == 0 && - i.src2.constant().u64[1] == 0); - e.vpxor(i.dest, i.dest); - } - - e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); - } else { - // unsigned -> unsigned - e.vmovaps(e.xmm0, i.src1); - e.vpshuflw(e.xmm0, e.xmm0, 0b00100010); - e.vpshufhw(e.xmm0, e.xmm0, 0b00100010); - e.vpshufd(e.xmm0, e.xmm0, 0b00001000); - - e.vmovaps(i.dest, i.src2); - e.vpshuflw(i.dest, i.dest, 0b00100010); - e.vpshufhw(i.dest, i.dest, 0b00100010); - e.vpshufd(i.dest, i.dest, 0b10000000); - - e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111); - } - } else { - if (IsPackOutSaturate(flags)) { - // unsigned -> signed + saturate - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } - } else { - if (IsPackOutUnsigned(flags)) { - if (IsPackOutSaturate(flags)) { - // signed -> unsigned + saturate - // PACKUSDW - // TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0]; - // DEST[15:0] <- (DEST[31:0] > FFFFH) ? 
FFFFH : TMP[15:0]; - e.vpackusdw(i.dest, i.src1, i.src2); - e.vpshuflw(i.dest, i.dest, 0b10110001); - e.vpshufhw(i.dest, i.dest, 0b10110001); - } else { - // signed -> unsigned - assert_always(); - } - } else { - if (IsPackOutSaturate(flags)) { - // signed -> signed + saturate - // PACKSSDW / SaturateSignedDwordToSignedWord - Xmm src2; - if (!i.src2.is_constant) { - src2 = i.src2; - } else { - assert_false(i.src1 == e.xmm0); - src2 = e.xmm0; - e.LoadConstantXmm(src2, i.src2.constant()); - } - e.vpackssdw(i.dest, i.src1, src2); - e.vpshuflw(i.dest, i.dest, 0b10110001); - e.vpshufhw(i.dest, i.dest, 0b10110001); - } else { - // signed -> signed - assert_always(); - } - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); - -// ============================================================================ -// OPCODE_UNPACK -// ============================================================================ -struct UNPACK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags & PACK_TYPE_MODE) { - case PACK_TYPE_D3DCOLOR: - EmitD3DCOLOR(e, i); - break; - case PACK_TYPE_FLOAT16_2: - EmitFLOAT16_2(e, i); - break; - case PACK_TYPE_FLOAT16_4: - EmitFLOAT16_4(e, i); - break; - case PACK_TYPE_SHORT_2: - EmitSHORT_2(e, i); - break; - case PACK_TYPE_SHORT_4: - EmitSHORT_4(e, i); - break; - case PACK_TYPE_UINT_2101010: - EmitUINT_2101010(e, i); - break; - case PACK_TYPE_8_IN_16: - Emit8_IN_16(e, i, i.instr->flags); - break; - case PACK_TYPE_16_IN_32: - Emit16_IN_32(e, i, i.instr->flags); - break; - default: - assert_unhandled_case(i.instr->flags); - break; - } - } - static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { - // ARGB (WXYZ) -> RGBA (XYZW) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); - return; - } - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // src = ZZYYXXWW - // Unpack to 
000000ZZ,000000YY,000000XX,000000WW - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); - // Add 1.0f to each. - e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); - // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. - } - static __m128 EmulateFLOAT16_2(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 2; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); - } - - // Constants, or something - b[2] = 0.f; - b[3] = 1.f; - - return _mm_load_ps(b); - } - static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { - // 1 bit sign, 5 bit exponent, 10 bit mantissa - // D3D10 half float format - // TODO(benvanik): - // https://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) - // Unpacking half floats: - // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ - // Packing half floats: https://gist.github.com/rygorous/2156668 - // Load source, move from tight pack of X16Y16.... to X16...Y16... - // Also zero out the high end. - // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
- - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // sx = src.iw >> 16; - // sy = src.iw & 0xFFFF; - // dest = { XMConvertHalfToFloat(sx), - // XMConvertHalfToFloat(sy), - // 0.0, - // 1.0 }; - // Shuffle to 0|0|0|0|0|0|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); - e.vcvtph2ps(i.dest, i.dest); - e.vpshufd(i.dest, i.dest, 0b10100100); - e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); - } - } - static __m128 EmulateFLOAT16_4(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 4; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); - } - - return _mm_load_ps(b); - } - static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] - Xmm src; - if (e.IsFeatureEnabled(kX64EmitF16C)) { - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle to 0|0|0|0|W|Z|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); - e.vcvtph2ps(i.dest, i.dest); - } else { - if (i.src1.is_constant) { - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - e.lea(e.r8, e.StashXmm(0, src)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); - } - } - static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { - // (VD.x) = 3.0 + (VB.x>>16)*2^-22 - // (VD.y) = 3.0 + (VB.x)*2^-22 - // (VD.z) = 0.0 - // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) - 
// src is (xx,xx,xx,VALUE) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); - return; - } - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle bytes. - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); - // If negative, make smaller than 3 - sign extend before adding. - e.vpslld(i.dest, 16); - e.vpsrad(i.dest, 16); - // Add 3,3,0,1. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); - // Return quiet NaNs in case of negative overflow. - e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - } - static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { - // (VD.x) = 3.0 + (VB.x>>16)*2^-22 - // (VD.y) = 3.0 + (VB.x)*2^-22 - // (VD.z) = 3.0 + (VB.y>>16)*2^-22 - // (VD.w) = 3.0 + (VB.y)*2^-22 - // src is (xx,xx,VALUE,VALUE) - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); - return; - } - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Shuffle bytes. - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); - // If negative, make smaller than 3 - sign extend before adding. - e.vpslld(i.dest, 16); - e.vpsrad(i.dest, 16); - // Add 3,3,3,3. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); - // Return quiet NaNs in case of negative overflow. 
- e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - } - static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - Xmm src; - if (i.src1.is_constant) { - if (i.src1.value->IsConstantZero()) { - e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331)); - return; - } - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // Splat W. - e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3)); - // Keep only the needed components. - // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Shift the components down. - e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); - } else { - // Duplicate green in 0-9 and alpha in 20-21. - e.vpsrld(e.xmm0, i.dest, 10); - e.vpor(i.dest, e.xmm0); - // Duplicate blue in 0-9 and alpha in 0-1. - e.vpsrld(e.xmm0, i.dest, 20); - e.vpor(i.dest, e.xmm0); - // Remove higher duplicate components. - e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); - } - // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. - // W is unsigned. - e.vpslld(i.dest, 22); - e.vpsrad(i.dest, 22); - // Add 3,3,3,1. - e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331)); - // Return quiet NaNs in case of negative overflow. - e.vcmpeqps(e.xmm0, i.dest, - e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow)); - e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); - // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. - // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB. 
- } - static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { - assert_false(IsPackOutSaturate(flags)); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - if (IsPackToLo(flags)) { - // Unpack to LO. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); - e.vpunpckhbw(i.dest, i.dest, i.dest); - e.vpsraw(i.dest, 8); - } - } - } else { - // Unpack to HI. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMByteOrderMask)); - e.vpunpcklbw(i.dest, i.dest, i.dest); - e.vpsraw(i.dest, 8); - } - } - } - } - static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, - uint32_t flags) { - assert_false(IsPackOutSaturate(flags)); - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - if (IsPackToLo(flags)) { - // Unpack to LO. - if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpunpckhwd(i.dest, src, src); - e.vpsrad(i.dest, 16); - } - } - } else { - // Unpack to HI. 
- if (IsPackInUnsigned(flags)) { - if (IsPackOutUnsigned(flags)) { - // unsigned -> unsigned - assert_always(); - } else { - // unsigned -> signed - assert_always(); - } - } else { - if (IsPackOutUnsigned(flags)) { - // signed -> unsigned - assert_always(); - } else { - // signed -> signed - e.vpunpcklwd(i.dest, src, src); - e.vpsrad(i.dest, 16); - } - } - } - e.vpshufd(i.dest, i.dest, 0xB1); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); - // ============================================================================ // OPCODE_ATOMIC_EXCHANGE // ============================================================================ @@ -7203,6 +4606,7 @@ struct SET_ROUNDING_MODE_I32 EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); void RegisterSequences() { + RegisterVector(); } bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index edb483022..16408be09 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -40,6 +40,9 @@ static bool Register() { #define EMITTER_OPCODE_TABLE(name, ...) \ const auto X64_INSTR_##name = Register<__VA_ARGS__>(); +// Registration functions to force inclusion of several files +void RegisterVector(); + void RegisterSequences(); bool SelectSequence(X64Emitter* e, const hir::Instr* i, const hir::Instr** new_tail); From bb74114cab57863a4d8a95f4c7bd1430880d9a42 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 17 Nov 2018 16:06:04 -0600 Subject: [PATCH 22/31] [x64] Factor out control code --- src/xenia/cpu/backend/x64/x64_seq_control.cc | 553 +++++++++++++++++++ src/xenia/cpu/backend/x64/x64_sequences.cc | 527 +----------------- src/xenia/cpu/backend/x64/x64_sequences.h | 1 + 3 files changed, 555 insertions(+), 526 deletions(-) create mode 100644 src/xenia/cpu/backend/x64/x64_seq_control.cc diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc new file mode 100644 index 000000000..81d2d9ab6 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -0,0 +1,553 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +void RegisterControl() {} + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +struct DEBUG_BREAK : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +struct DEBUG_BREAK_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label 
skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +struct DEBUG_BREAK_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +struct TRAP : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Trap(i.instr->flags); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +struct TRAP_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct 
TRAP_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +struct TRAP_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, + TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, + TRAP_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +struct CALL : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src1.value->is_guest()); + e.Call(i.instr, static_cast(i.src1.value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +struct CALL_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } 
+}; +struct CALL_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +struct CALL_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, + CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, + CALL_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +struct CALL_INDIRECT + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// 
============================================================================ +struct CALL_INDIRECT_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +struct CALL_INDIRECT_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +struct CALL_EXTERN + : Sequence> { + static void 
Emit(X64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); + +// ============================================================================ +// OPCODE_RETURN +// ============================================================================ +struct RETURN : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. + if (i.instr->next || i.instr->block->next) { + e.jmp(e.epilog_label(), CodeGenerator::T_NEAR); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +struct RETURN_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +struct RETURN_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, 
RETURN_TRUE_I16, + RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, + RETURN_TRUE_F64); + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +struct SET_RETURN_ADDRESS + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); + +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +struct BRANCH : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.jmp(i.src1.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +struct BRANCH_TRUE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_TRUE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const 
EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, + BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +struct BRANCH_FALSE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +struct BRANCH_FALSE_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, + BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 8f8050876..d33bf3781 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -90,532 +90,6 @@ struct SOURCE_OFFSET 
}; EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET, SOURCE_OFFSET); -// ============================================================================ -// OPCODE_DEBUG_BREAK -// ============================================================================ -struct DEBUG_BREAK : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } -}; -EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); - -// ============================================================================ -// OPCODE_DEBUG_BREAK_TRUE -// ============================================================================ -struct DEBUG_BREAK_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -struct DEBUG_BREAK_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, - DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, - DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, - 
DEBUG_BREAK_TRUE_F64); - -// ============================================================================ -// OPCODE_TRAP -// ============================================================================ -struct TRAP : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.Trap(i.instr->flags); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); - -// ============================================================================ -// OPCODE_TRAP_TRUE -// ============================================================================ -struct TRAP_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -struct TRAP_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, - TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, - TRAP_TRUE_F64); - -// ============================================================================ -// 
OPCODE_CALL -// ============================================================================ -struct CALL : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src1.value->is_guest()); - e.Call(i.instr, static_cast(i.src1.value)); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); - -// ============================================================================ -// OPCODE_CALL_TRUE -// ============================================================================ -struct CALL_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -struct CALL_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - 
Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, - CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, - CALL_TRUE_F64); - -// ============================================================================ -// OPCODE_CALL_INDIRECT -// ============================================================================ -struct CALL_INDIRECT - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.CallIndirect(i.instr, i.src1); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); - -// ============================================================================ -// OPCODE_CALL_INDIRECT_TRUE -// ============================================================================ -struct CALL_INDIRECT_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - 
e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -struct CALL_INDIRECT_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, - CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, - CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, - CALL_INDIRECT_TRUE_F64); - -// ============================================================================ -// OPCODE_CALL_EXTERN -// ============================================================================ -struct CALL_EXTERN - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.CallExtern(i.instr, i.src1.value); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); - -// ============================================================================ -// OPCODE_RETURN -// ============================================================================ -struct RETURN : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // If this is the last instruction in the last block, just let us - // fall through. 
- if (i.instr->next || i.instr->block->next) { - e.jmp(e.epilog_label(), CodeGenerator::T_NEAR); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); - -// ============================================================================ -// OPCODE_RETURN_TRUE -// ============================================================================ -struct RETURN_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -struct RETURN_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, - RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, - RETURN_TRUE_F64); - -// ============================================================================ -// OPCODE_SET_RETURN_ADDRESS -// ============================================================================ -struct SET_RETURN_ADDRESS - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.SetReturnAddress(i.src1.constant()); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); 
- -// ============================================================================ -// OPCODE_BRANCH -// ============================================================================ -struct BRANCH : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.jmp(i.src1.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); - -// ============================================================================ -// OPCODE_BRANCH_TRUE -// ============================================================================ -struct BRANCH_TRUE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_TRUE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, - BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, - BRANCH_TRUE_F64); - -// ============================================================================ -// OPCODE_BRANCH_FALSE -// ============================================================================ -struct BRANCH_FALSE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const 
EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -struct BRANCH_FALSE_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, - BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, - BRANCH_FALSE_F64); - // ============================================================================ // OPCODE_ASSIGN // ============================================================================ @@ -4606,6 +4080,7 @@ struct SET_ROUNDING_MODE_I32 EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); void RegisterSequences() { + RegisterControl(); RegisterVector(); } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 16408be09..755887efa 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -41,6 +41,7 @@ static bool Register() { const auto X64_INSTR_##name = Register<__VA_ARGS__>(); // Registration functions to force inclusion of several files +void RegisterControl(); void RegisterVector(); void RegisterSequences(); From b2f9d54e7e7e340fc291785f0e7253364f36d474 Mon Sep 17 
00:00:00 2001 From: "Dr. Chat" Date: Sat, 17 Nov 2018 16:12:34 -0600 Subject: [PATCH 23/31] [x64] Factor out memory handling code --- src/xenia/cpu/backend/x64/x64_seq_memory.cc | 1053 +++++++++++++++++++ src/xenia/cpu/backend/x64/x64_sequences.cc | 1023 +----------------- src/xenia/cpu/backend/x64/x64_sequences.h | 1 + 3 files changed, 1055 insertions(+), 1022 deletions(-) create mode 100644 src/xenia/cpu/backend/x64/x64_seq_memory.cc diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc new file mode 100644 index 000000000..ba647e045 --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -0,0 +1,1053 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2018 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/x64/x64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/x64/x64_op.h" +#include "xenia/cpu/backend/x64/x64_tracers.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { + +void RegisterMemory() {} + +// Note: all types are always aligned in the context. 
+RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { + return e.GetContextReg() + offset.value; +} + +template +RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, + const T& offset) { + assert_true(offset.is_constant); + int32_t offset_const = static_cast(offset.constant()); + + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += offset_const; + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.GetMembaseReg() + e.rax + offset_const; + } +} + +// Note: most *should* be aligned, but needs to be checked! +template +RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. + uint32_t address = static_cast(guest.constant()); + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.GetMembaseReg() + e.rax; + } +} + +// ============================================================================ +// OPCODE_ATOMIC_EXCHANGE +// ============================================================================ +// Note that the address we use here is a real, host address! +// This is weird, and should be fixed. 
+template +void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) { + if (i.dest == i.src1) { + e.mov(e.rax, i.src1); + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[e.rax], i.dest); + } else { + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[i.src1.reg()], i.dest); + } +} +struct ATOMIC_EXCHANGE_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +struct ATOMIC_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, + ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, + ATOMIC_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.eax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.dword[e.GetMembaseReg() + e.rcx], i.src3); + e.sete(i.dest); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.rax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.qword[e.GetMembaseReg() + e.rcx], i.src3); + e.sete(i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + 
ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct LOAD_LOCAL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]); + // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.word[e.rsp + i.src1.constant()]); + // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]); + // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]); + // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]); + // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]); + // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]); + // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, LOAD_LOCAL_I8, LOAD_LOCAL_I16, + LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, + LOAD_LOCAL_F64, LOAD_LOCAL_V128); + +// 
============================================================================ +// OPCODE_STORE_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_LOCAL_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.word[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +struct STORE_LOCAL_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, + STORE_LOCAL_I32, STORE_LOCAL_I64, STORE_LOCAL_F32, + STORE_LOCAL_F64, STORE_LOCAL_V128); + +// 
============================================================================ +// OPCODE_LOAD_CONTEXT +// ============================================================================ +struct LOAD_CONTEXT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.byte[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI8)); + } + } +}; +struct LOAD_CONTEXT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.word[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.word[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI16)); + } + } +}; +struct LOAD_CONTEXT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +struct LOAD_CONTEXT_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI64)); + } + } +}; +struct LOAD_CONTEXT_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF32)); + } + } +}; +struct LOAD_CONTEXT_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovsd(i.dest, 
e.qword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF64)); + } + } +}; +struct LOAD_CONTEXT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovaps(i.dest, e.ptr[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, + LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, + LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); + +// ============================================================================ +// OPCODE_STORE_CONTEXT +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_CONTEXT_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.byte[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI8)); + } + } +}; +struct STORE_CONTEXT_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.word[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI16)); + } + } +}; +struct STORE_CONTEXT_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], 
i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +struct STORE_CONTEXT_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI64)); + } + } +}; +struct STORE_CONTEXT_F32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF32)); + } + } +}; +struct STORE_CONTEXT_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF64)); + } + } +}; +struct STORE_CONTEXT_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, + 
STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, + STORE_CONTEXT_F64, STORE_CONTEXT_V128); + +// ============================================================================ +// OPCODE_LOAD_MMIO +// ============================================================================ +// Note: all types are always aligned in the context. +struct LOAD_MMIO_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // uint64_t (context, addr) + auto mmio_range = reinterpret_cast(i.src1.value); + auto read_address = uint32_t(i.src2.value); + e.mov(e.r8, uint64_t(mmio_range->callback_context)); + e.mov(e.r9d, read_address); + e.CallNativeSafe(reinterpret_cast(mmio_range->read)); + e.bswap(e.eax); + e.mov(i.dest, e.eax); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.mov(e.edx, read_address); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); + +// ============================================================================ +// OPCODE_STORE_MMIO +// ============================================================================ +// Note: all types are always aligned on the stack. 
+struct STORE_MMIO_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // void (context, addr, value) + auto mmio_range = reinterpret_cast(i.src1.value); + auto write_address = uint32_t(i.src2.value); + e.mov(e.r8, uint64_t(mmio_range->callback_context)); + e.mov(e.r9d, write_address); + if (i.src3.is_constant) { + e.mov(e.r10d, xe::byte_swap(i.src3.constant())); + } else { + e.mov(e.r10d, i.src3); + e.bswap(e.r10d); + } + e.CallNativeSafe(reinterpret_cast(mmio_range->write)); + if (IsTracingData()) { + if (i.src3.is_constant) { + e.mov(e.r8d, i.src3.constant()); + } else { + e.mov(e.r8d, i.src3); + } + e.mov(e.edx, write_address); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); + +// ============================================================================ +// OPCODE_LOAD_OFFSET +// ============================================================================ +struct LOAD_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + e.mov(i.dest, e.byte[addr]); + } +}; + +struct LOAD_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.word[addr]); + } else { + e.mov(i.dest, e.word[addr]); + e.ror(i.dest, 8); + } + } else { + e.mov(i.dest, e.word[addr]); + } + } +}; + +struct LOAD_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, 
e.dword[addr]); + } + } +}; + +struct LOAD_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.qword[addr]); + } else { + e.mov(i.dest, e.qword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.qword[addr]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, + LOAD_OFFSET_I32, LOAD_OFFSET_I64); + +// ============================================================================ +// OPCODE_STORE_OFFSET +// ============================================================================ +struct STORE_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.src3.is_constant) { + e.mov(e.byte[addr], i.src3.constant()); + } else { + e.mov(e.byte[addr], i.src3); + } + } +}; + +struct STORE_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.word[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.mov(e.word[addr], i.src3.constant()); + } else { + e.mov(e.word[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + 
e.mov(e.dword[addr], i.src3.constant()); + } else { + e.mov(e.dword[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.qword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.MovMem64(addr, i.src3.constant()); + } else { + e.mov(e.qword[addr], i.src3); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, + STORE_OFFSET_I32, STORE_OFFSET_I64); + +// ============================================================================ +// OPCODE_LOAD +// ============================================================================ +struct LOAD_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.r8b, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); + } + } +}; +struct LOAD_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.word[addr]); + } else { + e.mov(i.dest, e.word[addr]); + e.ror(i.dest, 8); + } + } else { + e.mov(i.dest, e.word[addr]); + } + if (IsTracingData()) { + e.mov(e.r8w, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); + } + } +}; +struct LOAD_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if 
(e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.dword[addr]); + } + if (IsTracingData()) { + e.mov(e.r8d, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } + } +}; +struct LOAD_I64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.qword[addr]); + } else { + e.mov(i.dest, e.qword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.qword[addr]); + } + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); + } + } +}; +struct LOAD_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); + } + } +}; +struct LOAD_F64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); + } + } +}; +struct LOAD_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + // TODO(benvanik): we should try to stick to movaps if possible. 
+ e.vmovups(i.dest, e.ptr[addr]); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + // TODO(benvanik): find a way to do this without the memory load. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, + LOAD_F32, LOAD_F64, LOAD_V128); + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +struct STORE_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8b, e.byte[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); + } + } +}; +struct STORE_I16 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.word[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8w, e.word[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); + } + } +}; +struct STORE_I32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); 
+ if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8d, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } + } +}; +struct STORE_I64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.qword[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); + } + } +}; +struct STORE_F32 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); + } + } +}; +struct STORE_F64 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if 
(i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); + } + } +}; +struct STORE_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask)); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, + STORE_F32, STORE_F64, STORE_V128); + +// ============================================================================ +// OPCODE_PREFETCH +// ============================================================================ +struct PREFETCH + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): prefetch addr -> length. 
+ } +}; +EMITTER_OPCODE_TABLE(OPCODE_PREFETCH, PREFETCH); + +// ============================================================================ +// OPCODE_MEMORY_BARRIER +// ============================================================================ +struct MEMORY_BARRIER + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { e.mfence(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); + +// ============================================================================ +// OPCODE_MEMSET +// ============================================================================ +struct MEMSET_I64_I8_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + assert_true(i.src3.is_constant); + assert_true(i.src2.constant() == 0); + e.vpxor(e.xmm0, e.xmm0); + auto addr = ComputeMemoryAddress(e, i.src1); + switch (i.src3.constant()) { + case 32: + e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); + break; + case 128: + e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0); + e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0); + break; + default: + assert_unhandled_case(i.src3.constant()); + break; + } + if (IsTracingData()) { + addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r9, i.src3.constant()); + e.mov(e.r8, i.src2.constant()); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemset)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); + +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index d33bf3781..da6ff8891 100644 --- 
a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -450,343 +450,6 @@ struct LOAD_CLOCK : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK, LOAD_CLOCK); -// ============================================================================ -// OPCODE_LOAD_LOCAL -// ============================================================================ -// Note: all types are always aligned on the stack. -struct LOAD_LOCAL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]); - // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.word[e.rsp + i.src1.constant()]); - // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]); - // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]); - // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]); - // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]); - // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -struct LOAD_LOCAL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]); - // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, 
LOAD_LOCAL_I8, LOAD_LOCAL_I16, - LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, - LOAD_LOCAL_F64, LOAD_LOCAL_V128); - -// ============================================================================ -// OPCODE_STORE_LOCAL -// ============================================================================ -// Note: all types are always aligned on the stack. -struct STORE_LOCAL_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.word[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2); - } -}; -struct STORE_LOCAL_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); - e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, - STORE_LOCAL_I32, STORE_LOCAL_I64, 
STORE_LOCAL_F32, - STORE_LOCAL_F64, STORE_LOCAL_V128); - -// ============================================================================ -// OPCODE_LOAD_CONTEXT -// ============================================================================ -// Note: all types are always aligned in the context. -RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { - return e.GetContextReg() + offset.value; -} -struct LOAD_CONTEXT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.byte[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI8)); - } - } -}; -struct LOAD_CONTEXT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.word[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI16)); - } - } -}; -struct LOAD_CONTEXT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.dword[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI32)); - } - } -}; -struct LOAD_CONTEXT_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.mov(i.dest, e.qword[addr]); - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadI64)); - } - } -}; -struct LOAD_CONTEXT_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovss(i.dest, e.dword[addr]); - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); 
- e.CallNative(reinterpret_cast(TraceContextLoadF32)); - } - } -}; -struct LOAD_CONTEXT_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovsd(i.dest, e.qword[addr]); - if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadF64)); - } - } -}; -struct LOAD_CONTEXT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - e.vmovaps(i.dest, e.ptr[addr]); - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextLoadV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, - LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, - LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); - -// ============================================================================ -// OPCODE_STORE_CONTEXT -// ============================================================================ -// Note: all types are always aligned on the stack. 
-struct STORE_CONTEXT_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.byte[addr], i.src2.constant()); - } else { - e.mov(e.byte[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI8)); - } - } -}; -struct STORE_CONTEXT_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.word[addr], i.src2.constant()); - } else { - e.mov(e.word[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI16)); - } - } -}; -struct STORE_CONTEXT_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); - } else { - e.mov(e.dword[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI32)); - } - } -}; -struct STORE_CONTEXT_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.constant()); - } else { - e.mov(e.qword[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreI64)); - } - } -}; -struct STORE_CONTEXT_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.value->constant.i32); - } else { - e.vmovss(e.dword[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - 
e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreF32)); - } - } -}; -struct STORE_CONTEXT_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.value->constant.i64); - } else { - e.vmovsd(e.qword[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreF64)); - } - } -}; -struct STORE_CONTEXT_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeContextAddress(e, i.src1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - e.vmovaps(e.ptr[addr], i.src2); - } - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); - e.CallNative(reinterpret_cast(TraceContextStoreV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, - STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, - STORE_CONTEXT_F64, STORE_CONTEXT_V128); - // ============================================================================ // OPCODE_CONTEXT_BARRIER // ============================================================================ @@ -796,601 +459,6 @@ struct CONTEXT_BARRIER }; EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); -// ============================================================================ -// OPCODE_LOAD_MMIO -// ============================================================================ -// Note: all types are always aligned in the context. 
-struct LOAD_MMIO_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // uint64_t (context, addr) - auto mmio_range = reinterpret_cast(i.src1.value); - auto read_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, read_address); - e.CallNativeSafe(reinterpret_cast(mmio_range->read)); - e.bswap(e.eax); - e.mov(i.dest, e.eax); - if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.mov(e.edx, read_address); - e.CallNative(reinterpret_cast(TraceContextLoadI32)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); - -// ============================================================================ -// OPCODE_STORE_MMIO -// ============================================================================ -// Note: all types are always aligned on the stack. -struct STORE_MMIO_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // void (context, addr, value) - auto mmio_range = reinterpret_cast(i.src1.value); - auto write_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, write_address); - if (i.src3.is_constant) { - e.mov(e.r10d, xe::byte_swap(i.src3.constant())); - } else { - e.mov(e.r10d, i.src3); - e.bswap(e.r10d); - } - e.CallNativeSafe(reinterpret_cast(mmio_range->write)); - if (IsTracingData()) { - if (i.src3.is_constant) { - e.mov(e.r8d, i.src3.constant()); - } else { - e.mov(e.r8d, i.src3); - } - e.mov(e.edx, write_address); - e.CallNative(reinterpret_cast(TraceContextStoreI32)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); - -// ============================================================================ -// OPCODE_LOAD_OFFSET -// ============================================================================ -template -RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, - const T& offset) { - int32_t offset_const = static_cast(offset.constant()); - - if 
(guest.is_constant) { - uint32_t address = static_cast(guest.constant()); - address += static_cast(offset.constant()); - if (address < 0x80000000) { - return e.GetMembaseReg() + address; - } else { - e.mov(e.eax, address); - return e.GetMembaseReg() + e.rax; - } - } else { - // Clear the top 32 bits, as they are likely garbage. - // TODO(benvanik): find a way to avoid doing this. - e.mov(e.eax, guest.reg().cvt32()); - return e.GetMembaseReg() + e.rax + offset_const; - } -} - -struct LOAD_OFFSET_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - e.mov(i.dest, e.byte[addr]); - } -}; - -struct LOAD_OFFSET_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.word[addr]); - } else { - e.mov(i.dest, e.word[addr]); - e.ror(i.dest, 8); - } - } else { - e.mov(i.dest, e.word[addr]); - } - } -}; - -struct LOAD_OFFSET_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); - } else { - e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.dword[addr]); - } - } -}; - -struct LOAD_OFFSET_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.qword[addr]); - } else { - e.mov(i.dest, e.qword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.qword[addr]); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, 
LOAD_OFFSET_I16, - LOAD_OFFSET_I32, LOAD_OFFSET_I64); - -// ============================================================================ -// OPCODE_STORE_OFFSET -// ============================================================================ -struct STORE_OFFSET_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.src3.is_constant) { - e.mov(e.byte[addr], i.src3.constant()); - } else { - e.mov(e.byte[addr], i.src3); - } - } -}; - -struct STORE_OFFSET_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.word[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if (i.src3.is_constant) { - e.mov(e.word[addr], i.src3.constant()); - } else { - e.mov(e.word[addr], i.src3); - } - } - } -}; - -struct STORE_OFFSET_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.dword[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if (i.src3.is_constant) { - e.mov(e.dword[addr], i.src3.constant()); - } else { - e.mov(e.dword[addr], i.src3); - } - } - } -}; - -struct STORE_OFFSET_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.qword[addr], i.src3); - } else { - assert_always("not implemented"); - } - } else { - if 
(i.src3.is_constant) { - e.MovMem64(addr, i.src3.constant()); - } else { - e.mov(e.qword[addr], i.src3); - } - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, - STORE_OFFSET_I32, STORE_OFFSET_I64); - -// ============================================================================ -// OPCODE_LOAD -// ============================================================================ -// Note: most *should* be aligned, but needs to be checked! -template -RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { - if (guest.is_constant) { - // TODO(benvanik): figure out how to do this without a temp. - // Since the constant is often 0x8... if we tried to use that as a - // displacement it would be sign extended and mess things up. - uint32_t address = static_cast(guest.constant()); - if (address < 0x80000000) { - return e.GetMembaseReg() + address; - } else { - e.mov(e.eax, address); - return e.GetMembaseReg() + e.rax; - } - } else { - // Clear the top 32 bits, as they are likely garbage. - // TODO(benvanik): find a way to avoid doing this. 
- e.mov(e.eax, guest.reg().cvt32()); - return e.GetMembaseReg() + e.rax; - } -} -struct LOAD_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(i.dest, e.byte[addr]); - if (IsTracingData()) { - e.mov(e.r8b, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); - } - } -}; -struct LOAD_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.word[addr]); - } else { - e.mov(i.dest, e.word[addr]); - e.ror(i.dest, 8); - } - } else { - e.mov(i.dest, e.word[addr]); - } - if (IsTracingData()) { - e.mov(e.r8w, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); - } - } -}; -struct LOAD_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); - } else { - e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.r8d, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); - } - } -}; -struct LOAD_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.qword[addr]); - } else { - e.mov(i.dest, e.qword[addr]); - e.bswap(i.dest); - } - } else { - e.mov(i.dest, e.qword[addr]); - } - if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); - } - } -}; -struct 
LOAD_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.vmovss(i.dest, e.dword[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_always("not implemented yet"); - } - if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); - } - } -}; -struct LOAD_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.vmovsd(i.dest, e.qword[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_always("not implemented yet"); - } - if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); - } - } -}; -struct LOAD_V128 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - // TODO(benvanik): we should try to stick to movaps if possible. - e.vmovups(i.dest, e.ptr[addr]); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - // TODO(benvanik): find a way to do this without the memory load. - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); - } - if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, - LOAD_F32, LOAD_F64, LOAD_V128); - -// ============================================================================ -// OPCODE_STORE -// ============================================================================ -// Note: most *should* be aligned, but needs to be checked! 
-struct STORE_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.byte[addr], i.src2.constant()); - } else { - e.mov(e.byte[addr], i.src2); - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8b, e.byte[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); - } - } -}; -struct STORE_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.word[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.mov(e.word[addr], i.src2.constant()); - } else { - e.mov(e.word[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8w, e.word[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); - } - } -}; -struct STORE_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.dword[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); - } else { - e.mov(e.dword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8d, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); - } - } -}; -struct STORE_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & 
LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.qword[addr], i.src2); - } else { - assert_always("not implemented"); - } - } else { - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.constant()); - } else { - e.mov(e.qword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); - } - } -}; -struct STORE_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - assert_always("not yet implemented"); - } else { - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.value->constant.i32); - } else { - e.vmovss(e.dword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); - } - } -}; -struct STORE_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - assert_always("not yet implemented"); - } else { - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.value->constant.i64); - } else { - e.vmovsd(e.qword[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); - } - } -}; -struct STORE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - 
e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask)); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); - } else { - e.vmovaps(e.ptr[addr], i.src2); - } - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, - STORE_F32, STORE_F64, STORE_V128); - -// ============================================================================ -// OPCODE_PREFETCH -// ============================================================================ -struct PREFETCH - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): prefetch addr -> length. - } -}; -EMITTER_OPCODE_TABLE(OPCODE_PREFETCH, PREFETCH); - -// ============================================================================ -// OPCODE_MEMORY_BARRIER -// ============================================================================ -struct MEMORY_BARRIER - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { e.mfence(); } -}; -EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); - -// ============================================================================ -// OPCODE_MEMSET -// ============================================================================ -struct MEMSET_I64_I8_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.is_constant); - assert_true(i.src3.is_constant); - assert_true(i.src2.constant() == 0); - e.vpxor(e.xmm0, e.xmm0); - auto addr = ComputeMemoryAddress(e, i.src1); - switch (i.src3.constant()) { - case 32: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); - break; - case 128: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - 
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0); - break; - default: - assert_unhandled_case(i.src3.constant()); - break; - } - if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r9, i.src3.constant()); - e.mov(e.r8, i.src2.constant()); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemset)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); - // ============================================================================ // OPCODE_MAX // ============================================================================ @@ -3970,96 +3038,6 @@ struct CNTLZ_I64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); -// ============================================================================ -// OPCODE_ATOMIC_EXCHANGE -// ============================================================================ -// Note that the address we use here is a real, host address! -// This is weird, and should be fixed. 
-template -void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) { - if (i.dest == i.src1) { - e.mov(e.rax, i.src1); - if (i.dest != i.src2) { - if (i.src2.is_constant) { - e.mov(i.dest, i.src2.constant()); - } else { - e.mov(i.dest, i.src2); - } - } - e.lock(); - e.xchg(e.dword[e.rax], i.dest); - } else { - if (i.dest != i.src2) { - if (i.src2.is_constant) { - e.mov(i.dest, i.src2.constant()); - } else { - e.mov(i.dest, i.src2); - } - } - e.lock(); - e.xchg(e.dword[i.src1.reg()], i.dest); - } -} -struct ATOMIC_EXCHANGE_I8 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I16 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -struct ATOMIC_EXCHANGE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAtomicExchangeXX(e, i); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, - ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, - ATOMIC_EXCHANGE_I64); - -// ============================================================================ -// OPCODE_ATOMIC_COMPARE_EXCHANGE -// ============================================================================ -struct ATOMIC_COMPARE_EXCHANGE_I32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(e.eax, i.src2); - e.mov(e.ecx, i.src1.reg().cvt32()); - e.lock(); - e.cmpxchg(e.dword[e.GetMembaseReg() + e.rcx], i.src3); - e.sete(i.dest); - } -}; -struct ATOMIC_COMPARE_EXCHANGE_I64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.mov(e.rax, i.src2); - e.mov(e.ecx, i.src1.reg().cvt32()); - e.lock(); - e.cmpxchg(e.qword[e.GetMembaseReg() + e.rcx], i.src3); - e.sete(i.dest); - } -}; -EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, - 
ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); - // ============================================================================ // OPCODE_SET_ROUNDING_MODE // ============================================================================ @@ -4081,6 +3059,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); void RegisterSequences() { RegisterControl(); + RegisterMemory(); RegisterVector(); } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 755887efa..5815a3a92 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -42,6 +42,7 @@ static bool Register() { // Registration functions to force inclusion of several files void RegisterControl(); +void RegisterMemory(); void RegisterVector(); void RegisterSequences(); From 4571e8207a2de0143199cbfd2d3f4eed40136bb5 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 17 Nov 2018 16:06:45 -0600 Subject: [PATCH 24/31] [x64] Minor cleanups in emitter --- src/xenia/cpu/backend/x64/x64_emitter.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 0c6957acc..4e229f2a4 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -148,11 +148,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { for (auto it = locals.begin(); it != locals.end(); ++it) { auto slot = *it; size_t type_size = GetTypeSize(slot->type); + // Align to natural size. stack_offset = xe::align(stack_offset, type_size); slot->set_constant((uint32_t)stack_offset); stack_offset += type_size; } + // Ensure 16b alignment. stack_offset -= StackLayout::GUEST_STACK_SIZE; stack_offset = xe::align(stack_offset, static_cast(16)); @@ -160,7 +162,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { // Function prolog. 
// Must be 16b aligned. // Windows is very strict about the form of this and the epilog: - // https://msdn.microsoft.com/en-us/library/tawsa7cb.aspx + // https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2017 // IMPORTANT: any changes to the prolog must be kept in sync with // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! @@ -168,6 +170,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { assert_true((stack_size + 8) % 16 == 0); *out_stack_size = stack_size; stack_size_ = stack_size; + sub(rsp, (uint32_t)stack_size); mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); @@ -340,13 +343,14 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. extern "C" uint64_t ResolveFunction(void* raw_context, - uint32_t target_address) { + uint64_t target_address) { auto thread_state = *reinterpret_cast(raw_context); // TODO(benvanik): required? assert_not_zero(target_address); - auto fn = thread_state->processor()->ResolveFunction(target_address); + auto fn = + thread_state->processor()->ResolveFunction((uint32_t)target_address); assert_not_null(fn); auto x64_fn = static_cast(fn); uint64_t addr = reinterpret_cast(x64_fn->machine_code()); @@ -373,10 +377,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { // Old-style resolve. // Not too important because indirection table is almost always available. // TODO: Overwrite the call-site with a straight call. - mov(rax, reinterpret_cast(ResolveFunction)); - mov(rcx, GetContextReg()); - mov(rdx, function->address()); - call(rax); + CallNative(&ResolveFunction, function->address()); } // Actually jump/call to rax. From c451fda819547f73b8edd3f750ce384feb39dfc6 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 17 Nov 2018 21:21:26 -0600 Subject: [PATCH 25/31] [x64] Template-ize a few vector emulation functions --- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 286 ++++++++------------ 1 file changed, 108 insertions(+), 178 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 9e8bf19de..0ee776b0d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -670,6 +670,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); // ============================================================================ // OPCODE_VECTOR_SHL // ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + struct VECTOR_SHL_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -688,16 +705,7 @@ struct VECTOR_SHL_V128 break; } } - static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] << (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). 
if (i.src2.is_constant) { @@ -707,19 +715,10 @@ struct VECTOR_SHL_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI8)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); } - static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] << (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { Xmm src1; if (i.src1.is_constant) { @@ -773,22 +772,13 @@ struct VECTOR_SHL_V128 } else { e.lea(e.r9, e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); + e.lea(e.r8, e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); e.L(end); } - static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] << (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { Xmm src1; if (i.src1.is_constant) { @@ -860,7 +850,7 @@ struct VECTOR_SHL_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); e.L(end); @@ -872,6 +862,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); // 
============================================================================ // OPCODE_VECTOR_SHR // ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + struct VECTOR_SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -890,16 +897,7 @@ struct VECTOR_SHR_V128 break; } } - static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). 
if (i.src2.is_constant) { @@ -909,19 +907,10 @@ struct VECTOR_SHR_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI8)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); } - static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); @@ -968,21 +957,12 @@ struct VECTOR_SHR_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); e.L(end); } - static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { Xmm src1; if (i.src1.is_constant) { @@ -1054,7 +1034,7 @@ struct VECTOR_SHR_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); e.L(end); @@ -1068,15 +1048,21 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); // 
============================================================================ struct VECTOR_SHA_V128 : Sequence> { - static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) { - alignas(16) int8_t value[16]; - alignas(16) int8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = value[i] >> (shamt[i] & 0x7); + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void EmitInt8(X64Emitter& e, const EmitArgType& i) { @@ -1088,21 +1074,10 @@ struct VECTOR_SHA_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); } - static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { - alignas(16) int16_t value[8]; - alignas(16) int16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = value[i] >> (shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); @@ -1149,23 +1124,12 @@ struct VECTOR_SHA_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); e.L(end); } - static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { - alignas(16) 
int32_t value[4]; - alignas(16) int32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = value[i] >> (shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static void EmitInt32(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); @@ -1222,69 +1186,39 @@ struct VECTOR_SHA_V128 e.lea(e.r9, e.StashXmm(1, i.src2)); } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); e.L(end); } } - - static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - EmitInt8(e, i); - break; - case INT16_TYPE: - EmitInt16(e, i); - break; - case INT32_TYPE: - EmitInt32(e, i); - break; - default: - assert_always(); - break; - } - } }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); // ============================================================================ // OPCODE_VECTOR_ROTATE_LEFT // ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + // TODO(benvanik): AVX512 has a native variable rotate (rolv). 
struct VECTOR_ROTATE_LEFT_V128 : Sequence> { - static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) { - alignas(16) uint8_t value[16]; - alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 16; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x7); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) { - alignas(16) uint16_t value[8]; - alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 8; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0xF); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) { - alignas(16) uint32_t value[4]; - alignas(16) uint32_t shamt[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - for (size_t i = 0; i < 4; ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & 0x1F); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { case INT8_TYPE: @@ -1296,7 +1230,8 @@ struct VECTOR_ROTATE_LEFT_V128 } else { e.lea(e.r9, e.StashXmm(1, i.src2)); } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); e.vmovaps(i.dest, e.xmm0); break; case INT16_TYPE: @@ -1308,7 +1243,8 @@ struct VECTOR_ROTATE_LEFT_V128 } else { e.lea(e.r9, e.StashXmm(1, i.src2)); } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); e.vmovaps(i.dest, e.xmm0); break; case INT32_TYPE: { @@ -1335,7 
+1271,8 @@ struct VECTOR_ROTATE_LEFT_V128 } else { e.lea(e.r9, e.StashXmm(1, i.src2)); } - e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); e.vmovaps(i.dest, e.xmm0); } break; @@ -1351,35 +1288,28 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); // ============================================================================ // OPCODE_VECTOR_AVERAGE // ============================================================================ +template ::value, int> = 0> +static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) { + alignas(16) T src1v[16 / sizeof(T)]; + alignas(16) T src2v[16 / sizeof(T)]; + alignas(16) T value[16 / sizeof(T)]; + + // Load SSE registers into a C array. + _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2; + value[i] = T(t); + } + + // Store result and return it. 
+ return _mm_load_si128(reinterpret_cast<__m128i*>(value)); +} + struct VECTOR_AVERAGE : Sequence> { - static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t src1v[4]; - alignas(16) uint32_t src2v[4]; - alignas(16) uint32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1; - value[i] = uint32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } - static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t src1v[4]; - alignas(16) int32_t src2v[4]; - alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1; - value[i] = int32_t(t); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp( e, i, @@ -1414,7 +1344,7 @@ struct VECTOR_AVERAGE } e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageUnsignedI32)); + reinterpret_cast(EmulateVectorAverage)); e.vmovaps(i.dest, e.xmm0); } else { if (i.src2.is_constant) { @@ -1425,7 +1355,7 @@ struct VECTOR_AVERAGE } e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverageSignedI32)); + reinterpret_cast(EmulateVectorAverage)); e.vmovaps(i.dest, e.xmm0); } break; From b57bb74965c4cc61233417b4de7e6b238b57d06f Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sun, 18 Nov 2018 14:23:16 -0600 Subject: [PATCH 26/31] [x64] Change the parameters to host_to_guest_thunk Shuffle some code around in x64_backend.cc Add GetNativeParam to avoid hardcoding parameters --- src/xenia/cpu/backend/x64/x64_backend.cc | 181 +++++++++++--------- src/xenia/cpu/backend/x64/x64_emitter.cc | 54 +++--- src/xenia/cpu/backend/x64/x64_emitter.h | 2 + src/xenia/cpu/backend/x64/x64_seq_memory.cc | 138 +++++++-------- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 102 +++++------ src/xenia/cpu/backend/x64/x64_sequences.cc | 24 +-- 6 files changed, 268 insertions(+), 233 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index e45141eac..7f51db92b 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -42,6 +42,15 @@ class X64ThunkEmitter : public X64Emitter { HostToGuestThunk EmitHostToGuestThunk(); GuestToHostThunk EmitGuestToHostThunk(); ResolveFunctionThunk EmitResolveFunctionThunk(); + + private: + // The following four functions provide save/load functionality for registers. + // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been + // allocated on the stack. + void EmitSaveVolatileRegs(); + void EmitLoadVolatileRegs(); + void EmitSaveNonvolatileRegs(); + void EmitLoadNonvolatileRegs(); }; X64Backend::X64Backend() : Backend(), code_cache_(nullptr) { @@ -406,53 +415,15 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - // Preserve nonvolatile registers. 
- mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rbp); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r12); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r13); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r14); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r15); - - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm6); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm7); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm8); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm9); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm10); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm11); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[6])], xmm12); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[7])], xmm13); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[8])], xmm14); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[9])], xmm15); + // Save nonvolatile registers. 
+ EmitSaveNonvolatileRegs(); mov(rax, rcx); mov(rsi, rdx); // context mov(rcx, r8); // return address call(rax); - movaps(xmm6, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); - movaps(xmm7, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); - movaps(xmm8, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); - movaps(xmm9, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); - movaps(xmm10, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); - movaps(xmm11, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); - movaps(xmm12, qword[rsp + offsetof(StackLayout::Thunk, xmm[6])]); - movaps(xmm13, qword[rsp + offsetof(StackLayout::Thunk, xmm[7])]); - movaps(xmm14, qword[rsp + offsetof(StackLayout::Thunk, xmm[8])]); - movaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]); - - mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); - mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); - mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); - mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); - mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); - mov(r12, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); - mov(r13, qword[rsp + offsetof(StackLayout::Thunk, r[6])]); - mov(r14, qword[rsp + offsetof(StackLayout::Thunk, r[7])]); - mov(r15, qword[rsp + offsetof(StackLayout::Thunk, r[8])]); + EmitLoadNonvolatileRegs(); add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); @@ -465,56 +436,26 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { } GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { - // rcx = context - // rdx = target function - // r8 = arg0 - // r9 = arg1 - // r10 = arg2 + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 const size_t stack_size = StackLayout::THUNK_STACK_SIZE; // rsp + 0 = return address - mov(qword[rsp + 8 * 2], rdx); - mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); // Save off volatile registers. - // TODO(DrChat): Enable this when we actually need this. 
- // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rcx); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rdx); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], r8); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r9); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r10); - // mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r11); + // TODO(DrChat): Enable when necessary. + // EmitSaveVolatileRegs(); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4); - // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); - - mov(rax, rdx); - mov(rcx, rsi); // context - mov(rdx, r8); - mov(r8, r9); - mov(r9, r10); + mov(rax, rcx); // function + mov(rcx, GetContextReg()); // context call(rax); - // movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); - // movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); - // movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); - // movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); - // movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); - - // mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); - // mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); - // mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); - // mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); - // mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); - // mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); + // EmitLoadVolatileRegs(); add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); ret(); void* fn = Emplace(stack_size); @@ -527,7 +468,6 @@ extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address); ResolveFunctionThunk 
X64ThunkEmitter::EmitResolveFunctionThunk() { // ebx = target PPC address // rcx = context - uint32_t stack_size = 0x18; // rsp + 0 = return address @@ -549,6 +489,85 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { return (ResolveFunctionThunk)fn; } +void X64ThunkEmitter::EmitSaveVolatileRegs() { + // Save off volatile registers. + mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rcx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rdx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], r8); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r9); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r10); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r11); + + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm1); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm2); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm3); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm4); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm5); +} + +void X64ThunkEmitter::EmitLoadVolatileRegs() { + // Load volatile registers from our stack frame. + movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); + movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); + movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); + movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); + movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); + + mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); + mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); + mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); + mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); + mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); + mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); +} + +void X64ThunkEmitter::EmitSaveNonvolatileRegs() { + // Preserve nonvolatile registers. 
+ mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rbp); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r12); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r13); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r14); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r15); + + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm6); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm7); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm8); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm9); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm10); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm11); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[6])], xmm12); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[7])], xmm13); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[8])], xmm14); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[9])], xmm15); +} + +void X64ThunkEmitter::EmitLoadNonvolatileRegs() { + movaps(xmm6, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); + movaps(xmm7, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); + movaps(xmm8, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); + movaps(xmm9, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); + movaps(xmm10, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); + movaps(xmm11, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); + movaps(xmm12, qword[rsp + offsetof(StackLayout::Thunk, xmm[6])]); + movaps(xmm13, qword[rsp + offsetof(StackLayout::Thunk, xmm[7])]); + movaps(xmm14, qword[rsp + offsetof(StackLayout::Thunk, xmm[8])]); + movaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]); + + mov(rbx, qword[rsp + 
offsetof(StackLayout::Thunk, r[0])]); + mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); + mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); + mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); + mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); + mov(r12, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); + mov(r13, qword[rsp + offsetof(StackLayout::Thunk, r[6])]); + mov(r14, qword[rsp + offsetof(StackLayout::Thunk, r[7])]); + mov(r15, qword[rsp + offsetof(StackLayout::Thunk, r[8])]); +} + } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 4e229f2a4..3cf7f5813 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -224,6 +224,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { const Instr* new_tail = instr; if (!SelectSequence(this, instr, &new_tail)) { // No sequence found! + // NOTE: If you encounter this after adding a new instruction, do a full + // rebuild! 
assert_always(); XELOGE("Unable to process HIR opcode %s", instr->opcode->name); break; @@ -458,16 +460,15 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { auto builtin_function = static_cast(function); if (builtin_function->handler()) { undefined = false; - // rcx = context - // rdx = target host function - // r8 = arg0 - // r9 = arg1 - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(builtin_function->handler())); - mov(r8, reinterpret_cast(builtin_function->arg0())); - mov(r9, reinterpret_cast(builtin_function->arg1())); + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); + mov(rcx, reinterpret_cast(builtin_function->handler())); + mov(rdx, reinterpret_cast(builtin_function->arg0())); + mov(r8, reinterpret_cast(builtin_function->arg1())); call(rax); // rax = host return } @@ -475,13 +476,15 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { auto extern_function = static_cast(function); if (extern_function->extern_handler()) { undefined = false; - // rcx = context - // rdx = target host function - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(extern_function->extern_handler())); - mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]); + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 = arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); + mov(rcx, reinterpret_cast(extern_function->extern_handler())); + mov(rdx, + qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]); call(rax); // rax = host return } @@ -518,15 +521,13 @@ void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), } void X64Emitter::CallNativeSafe(void* fn) { - // rcx = context - // rdx = target function - // r8 = arg0 - // r9 = arg1 - // r10 = arg2 + // rcx = target function + // rdx = arg0 + // r8 = arg1 + // r9 
= arg2 auto thunk = backend()->guest_to_host_thunk(); mov(rax, reinterpret_cast(thunk)); - mov(rcx, GetContextReg()); - mov(rdx, reinterpret_cast(fn)); + mov(rcx, reinterpret_cast(fn)); call(rax); // rax = host return } @@ -536,6 +537,19 @@ void X64Emitter::SetReturnAddress(uint64_t value) { mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); } +Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) +{ + if (param == 0) + return rdx; + else if (param == 1) + return r8; + else if (param == 2) + return r9; + + assert_always(); + return r9; +} + // Important: If you change these, you must update the thunks in x64_backend.cc! Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; } Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; } diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 33ce2c0a2..3fb32d58d 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -187,6 +187,8 @@ class X64Emitter : public Xbyak::CodeGenerator { void CallNativeSafe(void* fn); void SetReturnAddress(uint64_t value); + Xbyak::Reg64 GetNativeParam(uint32_t param); + Xbyak::Reg64 GetContextReg(); Xbyak::Reg64 GetMembaseReg(); void ReloadContext(); diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index ba647e045..aec6218fc 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -285,8 +285,8 @@ struct LOAD_CONTEXT_I8 auto addr = ComputeContextAddress(e, i.src1); e.mov(i.dest, e.byte[addr]); if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(0), i.src1.value); + e.mov(e.GetNativeParam(1), e.byte[addr]); e.CallNative(reinterpret_cast(TraceContextLoadI8)); } } @@ -297,8 +297,8 @@ struct LOAD_CONTEXT_I16 auto addr = ComputeContextAddress(e, i.src1); e.mov(i.dest, e.word[addr]); if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - 
e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.word[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadI16)); } } @@ -309,8 +309,8 @@ struct LOAD_CONTEXT_I32 auto addr = ComputeContextAddress(e, i.src1); e.mov(i.dest, e.dword[addr]); if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadI32)); } } @@ -321,8 +321,8 @@ struct LOAD_CONTEXT_I64 auto addr = ComputeContextAddress(e, i.src1); e.mov(i.dest, e.qword[addr]); if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadI64)); } } @@ -333,8 +333,8 @@ struct LOAD_CONTEXT_F32 auto addr = ComputeContextAddress(e, i.src1); e.vmovss(i.dest, e.dword[addr]); if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadF32)); } } @@ -345,8 +345,8 @@ struct LOAD_CONTEXT_F64 auto addr = ComputeContextAddress(e, i.src1); e.vmovsd(i.dest, e.qword[addr]); if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadF64)); } } @@ -357,8 +357,8 @@ struct LOAD_CONTEXT_V128 auto addr = ComputeContextAddress(e, i.src1); e.vmovaps(i.dest, e.ptr[addr]); if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextLoadV128)); } } @@ -382,8 +382,8 @@ struct STORE_CONTEXT_I8 e.mov(e.byte[addr], i.src2); } 
if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.byte[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreI8)); } } @@ -399,8 +399,8 @@ struct STORE_CONTEXT_I16 e.mov(e.word[addr], i.src2); } if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.word[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreI16)); } } @@ -416,8 +416,8 @@ struct STORE_CONTEXT_I32 e.mov(e.dword[addr], i.src2); } if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreI32)); } } @@ -433,8 +433,8 @@ struct STORE_CONTEXT_I64 e.mov(e.qword[addr], i.src2); } if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreI64)); } } @@ -450,8 +450,8 @@ struct STORE_CONTEXT_F32 e.vmovss(e.dword[addr], i.src2); } if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreF32)); } } @@ -467,8 +467,8 @@ struct STORE_CONTEXT_F64 e.vmovsd(e.qword[addr], i.src2); } if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.mov(e.GetNativeParam(0), i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreF64)); } } @@ -485,8 +485,8 @@ struct STORE_CONTEXT_V128 e.vmovaps(e.ptr[addr], i.src2); } if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.mov(e.rdx, i.src1.value); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.mov(e.GetNativeParam(0), 
i.src1.value); e.CallNative(reinterpret_cast(TraceContextStoreV128)); } } @@ -505,13 +505,13 @@ struct LOAD_MMIO_I32 // uint64_t (context, addr) auto mmio_range = reinterpret_cast(i.src1.value); auto read_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, read_address); + e.mov(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.mov(e.GetNativeParam(1).cvt32(), read_address); e.CallNativeSafe(reinterpret_cast(mmio_range->read)); e.bswap(e.eax); e.mov(i.dest, e.eax); if (IsTracingData()) { - e.mov(e.r8, i.dest); + e.mov(e.GetNativeParam(0), i.dest); e.mov(e.edx, read_address); e.CallNative(reinterpret_cast(TraceContextLoadI32)); } @@ -530,20 +530,20 @@ struct STORE_MMIO_I32 // void (context, addr, value) auto mmio_range = reinterpret_cast(i.src1.value); auto write_address = uint32_t(i.src2.value); - e.mov(e.r8, uint64_t(mmio_range->callback_context)); - e.mov(e.r9d, write_address); + e.mov(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.mov(e.GetNativeParam(1).cvt32(), write_address); if (i.src3.is_constant) { - e.mov(e.r10d, xe::byte_swap(i.src3.constant())); + e.mov(e.GetNativeParam(2).cvt32(), xe::byte_swap(i.src3.constant())); } else { - e.mov(e.r10d, i.src3); - e.bswap(e.r10d); + e.mov(e.GetNativeParam(2).cvt32(), i.src3); + e.bswap(e.GetNativeParam(2).cvt32()); } e.CallNativeSafe(reinterpret_cast(mmio_range->write)); if (IsTracingData()) { if (i.src3.is_constant) { - e.mov(e.r8d, i.src3.constant()); + e.mov(e.GetNativeParam(0).cvt32(), i.src3.constant()); } else { - e.mov(e.r8d, i.src3); + e.mov(e.GetNativeParam(0).cvt32(), i.src3); } e.mov(e.edx, write_address); e.CallNative(reinterpret_cast(TraceContextStoreI32)); @@ -708,8 +708,8 @@ struct LOAD_I8 : Sequence> { auto addr = ComputeMemoryAddress(e, i.src1); e.mov(i.dest, e.byte[addr]); if (IsTracingData()) { - e.mov(e.r8b, i.dest); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt8(), i.dest); + 
e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); } } @@ -728,8 +728,8 @@ struct LOAD_I16 : Sequence> { e.mov(i.dest, e.word[addr]); } if (IsTracingData()) { - e.mov(e.r8w, i.dest); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt16(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); } } @@ -748,8 +748,8 @@ struct LOAD_I32 : Sequence> { e.mov(i.dest, e.dword[addr]); } if (IsTracingData()) { - e.mov(e.r8d, i.dest); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt32(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); } } @@ -768,8 +768,8 @@ struct LOAD_I64 : Sequence> { e.mov(i.dest, e.qword[addr]); } if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); } } @@ -782,8 +782,8 @@ struct LOAD_F32 : Sequence> { assert_always("not implemented yet"); } if (IsTracingData()) { - e.lea(e.r8, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); } } @@ -796,8 +796,8 @@ struct LOAD_F64 : Sequence> { assert_always("not implemented yet"); } if (IsTracingData()) { - e.lea(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.qword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); } } @@ -812,8 +812,8 @@ struct LOAD_V128 : Sequence> { e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); } if (IsTracingData()) { - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); } } @@ -835,8 +835,8 @@ struct 
STORE_I8 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8b, e.byte[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt8(), e.byte[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); } } @@ -860,8 +860,8 @@ struct STORE_I16 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8w, e.word[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt16(), e.word[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); } } @@ -885,8 +885,8 @@ struct STORE_I32 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8d, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); } } @@ -910,8 +910,8 @@ struct STORE_I64 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(1), e.qword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); } } @@ -931,8 +931,8 @@ struct STORE_F32 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); } } @@ -952,8 +952,8 @@ struct STORE_F64 : Sequence> { } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); } } @@ -976,8 +976,8 @@ struct STORE_V128 } if (IsTracingData()) { addr = ComputeMemoryAddress(e, 
i.src1); - e.lea(e.r8, e.ptr[addr]); - e.lea(e.rdx, e.ptr[addr]); + e.lea(e.GetNativeParam(1), e.ptr[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); } } @@ -1038,9 +1038,9 @@ struct MEMSET_I64_I8_I64 } if (IsTracingData()) { addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.r9, i.src3.constant()); - e.mov(e.r8, i.src2.constant()); - e.lea(e.rdx, e.ptr[addr]); + e.mov(e.GetNativeParam(2), i.src3.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); + e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemset)); } } diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 0ee776b0d..bfd3e76fe 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -710,11 +710,11 @@ struct VECTOR_SHL_V128 // TODO(benvanik): native version (with shift magic). if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); } @@ -768,11 +768,11 @@ struct VECTOR_SHL_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); @@ -845,11 +845,11 @@ struct VECTOR_SHL_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); 
- e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); @@ -902,11 +902,11 @@ struct VECTOR_SHR_V128 // TODO(benvanik): native version (with shift magic). if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); } @@ -952,11 +952,11 @@ struct VECTOR_SHR_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); @@ -1029,11 +1029,11 @@ struct VECTOR_SHR_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); @@ -1069,11 +1069,11 @@ struct VECTOR_SHA_V128 // TODO(benvanik): native version (with shift magic). 
if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); } @@ -1119,11 +1119,11 @@ struct VECTOR_SHA_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); @@ -1181,11 +1181,11 @@ struct VECTOR_SHA_V128 e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); e.vmovaps(i.dest, e.xmm0); @@ -1223,12 +1223,12 @@ struct VECTOR_ROTATE_LEFT_V128 switch (i.instr->flags) { case INT8_TYPE: // TODO(benvanik): native version (with shift magic). 
- e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } e.CallNativeSafe( reinterpret_cast(EmulateVectorRotateLeft)); @@ -1236,12 +1236,12 @@ struct VECTOR_ROTATE_LEFT_V128 break; case INT16_TYPE: // TODO(benvanik): native version (with shift magic). - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } e.CallNativeSafe( reinterpret_cast(EmulateVectorRotateLeft)); @@ -1264,12 +1264,12 @@ struct VECTOR_ROTATE_LEFT_V128 e.vpor(i.dest, e.xmm1); } else { // TODO(benvanik): non-AVX2 native version. 
- e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } e.CallNativeSafe( reinterpret_cast(EmulateVectorRotateLeft)); @@ -1338,22 +1338,22 @@ struct VECTOR_AVERAGE if (is_unsigned) { if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe( reinterpret_cast(EmulateVectorAverage)); e.vmovaps(i.dest, e.xmm0); } else { if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe( reinterpret_cast(EmulateVectorAverage)); e.vmovaps(i.dest, e.xmm0); @@ -1888,7 +1888,7 @@ struct PACK : Sequence> { } else { src = i.src1; } - e.lea(e.r8, e.StashXmm(0, src)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); e.vmovaps(i.dest, e.xmm0); } @@ -1928,7 +1928,7 @@ struct PACK : Sequence> { } else { src = i.src1; } - e.lea(e.r8, e.StashXmm(0, src)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); e.vmovaps(i.dest, e.xmm0); } @@ -2032,19 +2032,19 @@ struct PACK : Sequence> { // unsigned -> unsigned + saturate if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, 
i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, e.xmm0)); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe( reinterpret_cast(EmulatePack8_IN_16_UN_UN_SAT)); e.vmovaps(i.dest, e.xmm0); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } else { // unsigned -> unsigned - e.lea(e.r9, e.StashXmm(1, i.src2)); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); e.vmovaps(i.dest, e.xmm0); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); @@ -2296,7 +2296,7 @@ struct UNPACK : Sequence> { } else { src = i.src1; } - e.lea(e.r8, e.StashXmm(0, src)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); e.vmovaps(i.dest, e.xmm0); } @@ -2332,7 +2332,7 @@ struct UNPACK : Sequence> { } else { src = i.src1; } - e.lea(e.r8, e.StashXmm(0, src)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src)); e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); e.vmovaps(i.dest, e.xmm0); } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index da6ff8891..0fff6d458 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2360,7 +2360,7 @@ struct POW2_F32 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -2374,7 +2374,7 @@ struct POW2_F64 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, 
i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -2389,7 +2389,7 @@ struct POW2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -2411,7 +2411,7 @@ struct LOG2_F32 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -2425,7 +2425,7 @@ struct LOG2_F64 : Sequence> { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -2440,7 +2440,7 @@ struct LOG2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -2705,11 +2705,11 @@ struct SHL_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). 
if (i.src2.is_constant) { - e.mov(e.r9, i.src2.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); } else { - e.mov(e.r9, i.src2); + e.mov(e.GetNativeParam(1), i.src2); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShlV128)); e.vmovaps(i.dest, e.xmm0); } @@ -2782,11 +2782,11 @@ struct SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). if (i.src2.is_constant) { - e.mov(e.r9, i.src2.constant()); + e.mov(e.GetNativeParam(1), i.src2.constant()); } else { - e.mov(e.r9, i.src2); + e.mov(e.GetNativeParam(1), i.src2); } - e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); e.vmovaps(i.dest, e.xmm0); } From 384ec98a42d659e952b3642b692643926eb7c6fb Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 18 Nov 2018 15:22:56 -0600 Subject: [PATCH 27/31] [x64] Allow the JIT to use volatile registers --- src/xenia/cpu/backend/x64/x64_backend.cc | 64 +++++++++++++----------- src/xenia/cpu/backend/x64/x64_emitter.cc | 30 ++++------- src/xenia/cpu/backend/x64/x64_emitter.h | 10 ++-- 3 files changed, 49 insertions(+), 55 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 7f51db92b..beed2af8e 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -446,14 +446,13 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { sub(rsp, stack_size); // Save off volatile registers. - // TODO(DrChat): Enable when necessary. 
- // EmitSaveVolatileRegs(); + EmitSaveVolatileRegs(); mov(rax, rcx); // function mov(rcx, GetContextReg()); // context call(rax); - // EmitLoadVolatileRegs(); + EmitLoadVolatileRegs(); add(rsp, stack_size); ret(); @@ -468,21 +467,22 @@ extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address); ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { // ebx = target PPC address // rcx = context - uint32_t stack_size = 0x18; + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; // rsp + 0 = return address - mov(qword[rsp + 8 * 2], rdx); - mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); + // Save volatile registers + EmitSaveVolatileRegs(); + mov(rcx, rsi); // context mov(rdx, rbx); mov(rax, uint64_t(&ResolveFunction)); call(rax); + EmitLoadVolatileRegs(); + add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); jmp(rax); void* fn = Emplace(stack_size); @@ -491,34 +491,38 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { void X64ThunkEmitter::EmitSaveVolatileRegs() { // Save off volatile registers. 
- mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rcx); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rdx); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], r8); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r9); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r10); - mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r11); + // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rdx); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], r8); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], r9); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r10); + mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r11); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm1); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm2); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm3); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm4); - movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm5); + // movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm0); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4); + movaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5); } void X64ThunkEmitter::EmitLoadVolatileRegs() { // Load volatile registers from our stack frame. 
- movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); - movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); - movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); - movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); - movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); + // movaps(xmm0, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]); + movaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]); + movaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]); + movaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]); + movaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]); + movaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]); - mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); - mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); - mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); - mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); - mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); - mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); + // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]); + mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]); + mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]); + mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[3])]); + mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[4])]); + mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[5])]); + mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[6])]); } void X64ThunkEmitter::EmitSaveNonvolatileRegs() { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 3cf7f5813..7ffd7b582 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -56,12 +56,13 @@ static const size_t kStashOffset = 32; // static const size_t kStashOffsetHigh = 32 + 32; const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { - 
Xbyak::Operand::RBX, Xbyak::Operand::R12, Xbyak::Operand::R13, - Xbyak::Operand::R14, Xbyak::Operand::R15, + Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11, + Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, + Xbyak::Operand::R15, }; const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = { - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) @@ -494,30 +495,20 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { } } -void X64Emitter::CallNative(void* fn) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - call(rax); -} +void X64Emitter::CallNative(void* fn) { CallNativeSafe(fn); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context)) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - call(rax); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - call(rax); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), uint64_t arg0) { - mov(rax, reinterpret_cast(fn)); - mov(rcx, GetContextReg()); - mov(rdx, arg0); - call(rax); + mov(GetNativeParam(0), arg0); + CallNativeSafe(reinterpret_cast(fn)); } void X64Emitter::CallNativeSafe(void* fn) { @@ -537,8 +528,7 @@ void X64Emitter::SetReturnAddress(uint64_t value) { mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); } -Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) -{ +Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) { if (param == 0) return rdx; else if (param == 1) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 3fb32d58d..a35c2d2b0 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -139,13 
+139,13 @@ class X64Emitter : public Xbyak::CodeGenerator { std::vector* out_source_map); public: - // Reserved: rsp + // Reserved: rsp, rsi, rdi // Scratch: rax/rcx/rdx // xmm0-2 - // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) - // xmm6-xmm15 (save to get xmm3-xmm5) - static const int GPR_COUNT = 5; - static const int XMM_COUNT = 10; + // Available: rbx, r10-r15 + // xmm4-xmm15 (save to get xmm3) + static const int GPR_COUNT = 7; + static const int XMM_COUNT = 12; static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { auto idx = gpr_reg_map_[v->reg.index]; From df964015a4423856c8a8e609c97081e676133d0f Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 18 Nov 2018 18:24:35 -0600 Subject: [PATCH 28/31] [x64] Fix improper use of compare_exchange_strong when adjusting code commit mark --- src/xenia/cpu/backend/x64/x64_code_cache.cc | 28 ++++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.cc b/src/xenia/cpu/backend/x64/x64_code_cache.cc index b258f2658..e4a23248e 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache.cc +++ b/src/xenia/cpu/backend/x64/x64_code_cache.cc @@ -174,15 +174,17 @@ void* X64CodeCache::PlaceGuestCode(uint32_t guest_address, void* machine_code, // If we are going above the high water mark of committed memory, commit // some more. It's ok if multiple threads do this, as redundant commits // aren't harmful. 
- size_t old_commit_mark = generated_code_commit_mark_; - if (high_mark > old_commit_mark) { - size_t new_commit_mark = old_commit_mark + 16 * 1024 * 1024; + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16 * 1024 * 1024; xe::memory::AllocFixed(generated_code_base_, new_commit_mark, xe::memory::AllocationType::kCommit, xe::memory::PageAccess::kExecuteReadWrite); - generated_code_commit_mark_.compare_exchange_strong(old_commit_mark, - new_commit_mark); - } + } while (generated_code_commit_mark_.compare_exchange_weak( + old_commit_mark, new_commit_mark)); // Copy code. std::memcpy(code_address, machine_code, code_size); @@ -248,15 +250,17 @@ uint32_t X64CodeCache::PlaceData(const void* data, size_t length) { // If we are going above the high water mark of committed memory, commit some // more. It's ok if multiple threads do this, as redundant commits aren't // harmful. - size_t old_commit_mark = generated_code_commit_mark_; - if (high_mark > old_commit_mark) { - size_t new_commit_mark = old_commit_mark + 16 * 1024 * 1024; + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16 * 1024 * 1024; xe::memory::AllocFixed(generated_code_base_, new_commit_mark, xe::memory::AllocationType::kCommit, xe::memory::PageAccess::kExecuteReadWrite); - generated_code_commit_mark_.compare_exchange_strong(old_commit_mark, - new_commit_mark); - } + } while (generated_code_commit_mark_.compare_exchange_weak(old_commit_mark, + new_commit_mark)); // Copy code. std::memcpy(data_address, data, length); From b3d509eb0167697f4916f65f5b9a2cd0aed27dc4 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Fri, 23 Nov 2018 17:24:55 -0600 Subject: [PATCH 29/31] [x64] Swap to using anchor variables instead of dummy function calls --- src/xenia/cpu/backend/x64/x64_backend.cc | 2 -- src/xenia/cpu/backend/x64/x64_seq_control.cc | 2 +- src/xenia/cpu/backend/x64/x64_seq_memory.cc | 2 +- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 2 +- src/xenia/cpu/backend/x64/x64_sequences.cc | 14 +++++++++----- src/xenia/cpu/backend/x64/x64_sequences.h | 6 ------ 6 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index beed2af8e..4e6356959 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -82,8 +82,6 @@ bool X64Backend::Initialize(Processor* processor) { return false; } - RegisterSequences(); - // Need movbe to do advanced LOAD/STORE tricks. if (FLAGS_enable_haswell_instructions) { machine_info_.supports_extended_load_store = diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index 81d2d9ab6..80eeeebc7 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -19,7 +19,7 @@ namespace cpu { namespace backend { namespace x64 { -void RegisterControl() {} +volatile int anchor_control = 0; // ============================================================================ // OPCODE_DEBUG_BREAK diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index aec6218fc..7526d1fc8 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -20,7 +20,7 @@ namespace cpu { namespace backend { namespace x64 { -void RegisterMemory() {} +volatile int anchor_memory = 0; // Note: all types are always aligned in the context. 
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index bfd3e76fe..89d3bee14 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -22,7 +22,7 @@ namespace cpu { namespace backend { namespace x64 { -void RegisterVector() {} +volatile int anchor_vector = 0; // ============================================================================ // OPCODE_VECTOR_CONVERT_I2F diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 0fff6d458..7d18cb4d0 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -3057,11 +3057,15 @@ struct SET_ROUNDING_MODE_I32 }; EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); -void RegisterSequences() { - RegisterControl(); - RegisterMemory(); - RegisterVector(); -} +// Include anchors to other sequence sources so they get included in the build. +extern volatile int anchor_control; +static int anchor_control_dest = anchor_control; + +extern volatile int anchor_memory; +static int anchor_memory_dest = anchor_memory; + +extern volatile int anchor_vector; +static int anchor_vector_dest = anchor_vector; bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { const InstrKey key(i); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 5815a3a92..07b264ab2 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -40,12 +40,6 @@ static bool Register() { #define EMITTER_OPCODE_TABLE(name, ...) 
\ const auto X64_INSTR_##name = Register<__VA_ARGS__>(); -// Registration functions to force inclusion of several files -void RegisterControl(); -void RegisterMemory(); -void RegisterVector(); - -void RegisterSequences(); bool SelectSequence(X64Emitter* e, const hir::Instr* i, const hir::Instr** new_tail); From 207589e5a1e481ea14bf2951bb22c8316193dac7 Mon Sep 17 00:00:00 2001 From: gibbed Date: Sat, 24 Nov 2018 04:05:41 -0600 Subject: [PATCH 30/31] [CPU/Kernel] Correct parsing of XEX_HEADER_IMPORT_LIBRARIES. --- src/xenia/cpu/xex_module.cc | 46 +++++++++++++++++-------------- src/xenia/kernel/user_module.cc | 34 +++++++++++++---------- src/xenia/kernel/util/xex2_info.h | 10 ++++--- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 8b6b5e173..6f8537970 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -1119,23 +1119,23 @@ bool XexModule::LoadContinue() { processor_->backend()->CommitExecutableRange(low_address_, high_address_); // Add all imports (variables/functions). - xex2_opt_import_libraries* opt_import_header = nullptr; - GetOptHeader(XEX_HEADER_IMPORT_LIBRARIES, &opt_import_header); + xex2_opt_import_libraries* opt_import_libraries = nullptr; + GetOptHeader(XEX_HEADER_IMPORT_LIBRARIES, &opt_import_libraries); - if (opt_import_header) { + if (opt_import_libraries) { // FIXME: Don't know if 32 is the actual limit, but haven't seen more than // 2. 
const char* string_table[32]; std::memset(string_table, 0, sizeof(string_table)); - size_t max_string_table_index = 0; // Parse the string table - for (size_t i = 0; i < opt_import_header->string_table_size; - ++max_string_table_index) { - assert_true(max_string_table_index < xe::countof(string_table)); - const char* str = opt_import_header->string_table + i; + for (size_t i = 0, o = 0; i < opt_import_libraries->string_table.size && + o < opt_import_libraries->string_table.count; + ++o) { + assert_true(o < xe::countof(string_table)); + const char* str = &opt_import_libraries->string_table.data[i]; - string_table[max_string_table_index] = str; + string_table[o] = str; i += std::strlen(str) + 1; // Padding @@ -1144,15 +1144,19 @@ bool XexModule::LoadContinue() { } } - auto libraries_ptr = reinterpret_cast(opt_import_header) + - opt_import_header->string_table_size + 12; + auto library_data = reinterpret_cast(opt_import_libraries) + + opt_import_libraries->string_table.size + 12; uint32_t library_offset = 0; - uint32_t library_count = opt_import_header->library_count; - for (uint32_t i = 0; i < library_count; i++) { - auto library = reinterpret_cast(libraries_ptr + - library_offset); + while (library_offset < opt_import_libraries->size) { + auto library = + reinterpret_cast(library_data + library_offset); + if (!library->size) { + break; + } size_t library_name_index = library->name_index & 0xFF; - assert_true(library_name_index < max_string_table_index); + assert_true(library_name_index < + opt_import_libraries->string_table.count); + assert_not_null(string_table[library_name_index]); SetupLibraryImports(string_table[library_name_index], library); library_offset += library->size; } @@ -1312,10 +1316,12 @@ bool XexModule::SetupLibraryImports(const char* name, var_info->set_status(Symbol::Status::kDefined); } else if (record_type == 1) { // Thunk. 
- assert_true(library_info.imports.size() > 0); - auto& prev_import = library_info.imports[library_info.imports.size() - 1]; - assert_true(prev_import.ordinal == ordinal); - prev_import.thunk_address = record_addr; + if (library_info.imports.size() > 0) { + auto& prev_import = + library_info.imports[library_info.imports.size() - 1]; + assert_true(prev_import.ordinal == ordinal); + prev_import.thunk_address = record_addr; + } if (kernel_export) { import_name.AppendFormat("%s", kernel_export->name); diff --git a/src/xenia/kernel/user_module.cc b/src/xenia/kernel/user_module.cc index a79091d3c..df65650f2 100644 --- a/src/xenia/kernel/user_module.cc +++ b/src/xenia/kernel/user_module.cc @@ -486,29 +486,33 @@ void UserModule::Dump() { std::memset(string_table, 0, sizeof(string_table)); // Parse the string table - for (size_t l = 0, j = 0; l < opt_import_libraries->string_table_size; - j++) { - assert_true(j < xe::countof(string_table)); - const char* str = opt_import_libraries->string_table + l; + for (size_t j = 0, o = 0; j < opt_import_libraries->string_table.size && + o < opt_import_libraries->string_table.count; + o++) { + assert_true(o < xe::countof(string_table)); + const char* str = &opt_import_libraries->string_table.data[o]; - string_table[j] = str; - l += std::strlen(str) + 1; + string_table[o] = str; + j += std::strlen(str) + 1; // Padding - if ((l % 4) != 0) { - l += 4 - (l % 4); + if ((j % 4) != 0) { + j += 4 - (j % 4); } } - auto libraries = + auto library_data = reinterpret_cast(opt_import_libraries) + - opt_import_libraries->string_table_size + 12; + opt_import_libraries->string_table.size + 12; uint32_t library_offset = 0; - uint32_t library_count = opt_import_libraries->library_count; - for (uint32_t l = 0; l < library_count; l++) { + while (library_offset < opt_import_libraries->size) { auto library = reinterpret_cast( - libraries + library_offset); + library_data + library_offset); + if (!library->size) { + break; + } auto name = 
string_table[library->name_index & 0xFF]; + assert_not_null(name); sb.AppendFormat(" %s - %d imports\n", name, (uint16_t)library->count); @@ -786,11 +790,11 @@ void UserModule::Dump() { } if (kernel_export && kernel_export->type == cpu::Export::Type::kVariable) { - sb.AppendFormat(" V %.8X %.3X (%3d) %s %s\n", + sb.AppendFormat(" V %.8X %.3X (%4d) %s %s\n", info->value_address, info->ordinal, info->ordinal, implemented ? " " : "!!", name); } else if (info->thunk_address) { - sb.AppendFormat(" F %.8X %.8X %.3X (%3d) %s %s\n", + sb.AppendFormat(" F %.8X %.8X %.3X (%4d) %s %s\n", info->value_address, info->thunk_address, info->ordinal, info->ordinal, implemented ? " " : "!!", name); diff --git a/src/xenia/kernel/util/xex2_info.h b/src/xenia/kernel/util/xex2_info.h index 23aa62524..f91b7c30f 100644 --- a/src/xenia/kernel/util/xex2_info.h +++ b/src/xenia/kernel/util/xex2_info.h @@ -474,10 +474,12 @@ struct xex2_opt_execution_info { static_assert_size(xex2_opt_execution_info, 0x18); struct xex2_opt_import_libraries { - xe::be section_size; // 0x0 - xe::be string_table_size; // 0x4 - xe::be library_count; // 0x8 - char string_table[1]; // 0xC string_table_size bytes + xe::be size; // 0x0 + struct { + xe::be size; // 0x4 + xe::be count; // 0x8 + char data[1]; // 0xC string_table_size bytes + } string_table; }; struct xex2_import_library { From 24dd0267bfd7663ee3ec5d6edbceaf6e7ff8ac0c Mon Sep 17 00:00:00 2001 From: gibbed Date: Sat, 24 Nov 2018 04:07:56 -0600 Subject: [PATCH 31/31] [Kernel] Oops. 
--- src/xenia/kernel/user_module.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/kernel/user_module.cc b/src/xenia/kernel/user_module.cc index df65650f2..d36aa67ad 100644 --- a/src/xenia/kernel/user_module.cc +++ b/src/xenia/kernel/user_module.cc @@ -490,7 +490,7 @@ void UserModule::Dump() { o < opt_import_libraries->string_table.count; o++) { assert_true(o < xe::countof(string_table)); - const char* str = &opt_import_libraries->string_table.data[o]; + const char* str = &opt_import_libraries->string_table.data[j]; string_table[o] = str; j += std::strlen(str) + 1;