Trying out a new style of JIT pattern matching.

Ben Vanik 2014-05-26 20:28:21 -07:00
parent a001714fb0
commit 5a85263e5f
38 changed files with 6403 additions and 5160 deletions
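For orientation, a minimal sketch of the sequence style this commit introduces, written against the EMITTER/MATCH/EMITTER_OPCODE_TABLE macros and the I<>/I32<> matchers defined in the new x64_sequence.inl below. The real ADD sequences live in x64_sequences.cc, which is suppressed in this diff, so the body shown here is illustrative rather than the committed code:

// Illustrative only: match "i32 dest = ADD(i32, i32)" and lower it with the
// commutative-binary helper provided by SingleSequence.
EMITTER(ADD_I32, MATCH(I<OPCODE_ADD, I32<>, I32<>, I32<>>)) {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    EmitCommutativeBinaryOp(e, i,
        [](X64Emitter& e, const Reg32& dest, const Reg32& src) {
          e.add(dest, src);
        },
        [](X64Emitter& e, const Reg32& dest, int32_t constant) {
          e.add(dest, constant);
        });
  }
};
// Registers ADD_I32's Select function under its head InstrKey so that
// SelectSequence can find it when the emitter walks a block.
EMITTER_OPCODE_TABLE(OPCODE_ADD, ADD_I32);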

View File

@ -40,10 +40,10 @@ namespace ivm {
#define DPRINT
#define DFLUSH()
//#define IPRINT if (ics.thread_state->thread_id() == 1) printf
//#define IFLUSH() fflush(stdout)
//#define DPRINT if (ics.thread_state->thread_id() == 1) printf
//#define DFLUSH() fflush(stdout)
#define IPRINT if (ics.thread_state->thread_id() == 1) printf
#define IFLUSH() fflush(stdout)
#define DPRINT if (ics.thread_state->thread_id() == 1) printf
#define DFLUSH() fflush(stdout)
#if XE_CPU_BIGENDIAN
#define VECB16(v,n) (v.b16[n])
@ -1364,31 +1364,31 @@ int Translate_LOAD_CLOCK(TranslationContext& ctx, Instr* i) {
}
uint32_t IntCode_LOAD_LOCAL_I8(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_I16(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_I32(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_I64(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_F32(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_F64(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
uint32_t IntCode_LOAD_LOCAL_V128(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64));
ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32));
return IA_NEXT;
}
int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) {
@ -1405,31 +1405,31 @@ int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) {
}
uint32_t IntCode_STORE_LOCAL_I8(IntCodeState& ics, const IntCode* i) {
*((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i8;
*((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i8;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_I16(IntCodeState& ics, const IntCode* i) {
*((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i16;
*((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i16;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_I32(IntCodeState& ics, const IntCode* i) {
*((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i32;
*((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i32;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_I64(IntCodeState& ics, const IntCode* i) {
*((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i64;
*((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i64;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_F32(IntCodeState& ics, const IntCode* i) {
*((float*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f32;
*((float*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f32;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_F64(IntCodeState& ics, const IntCode* i) {
*((double*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f64;
*((double*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f64;
return IA_NEXT;
}
uint32_t IntCode_STORE_LOCAL_V128(IntCodeState& ics, const IntCode* i) {
*((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128;
*((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].v128;
return IA_NEXT;
}
int Translate_STORE_LOCAL(TranslationContext& ctx, Instr* i) {
@ -3715,17 +3715,17 @@ int Translate_CNTLZ(TranslationContext& ctx, Instr* i) {
uint32_t IntCode_EXTRACT_INT8_V128(IntCodeState& ics, const IntCode* i) {
const vec128_t& src1 = ics.rf[i->src1_reg].v128;
ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i64);
ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i8);
return IA_NEXT;
}
uint32_t IntCode_EXTRACT_INT16_V128(IntCodeState& ics, const IntCode* i) {
const vec128_t& src1 = ics.rf[i->src1_reg].v128;
ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i64);
ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i8);
return IA_NEXT;
}
uint32_t IntCode_EXTRACT_INT32_V128(IntCodeState& ics, const IntCode* i) {
const vec128_t& src1 = ics.rf[i->src1_reg].v128;
ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i64);
ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i8);
return IA_NEXT;
}
int Translate_EXTRACT(TranslationContext& ctx, Instr* i) {

File diff suppressed because it is too large

View File

@ -1,71 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <alloy/backend/x64/lowering/lowering_table.h>
#include <alloy/backend/x64/x64_emitter.h>
#include <alloy/backend/x64/lowering/lowering_sequences.h>
using namespace alloy;
using namespace alloy::backend::x64;
using namespace alloy::backend::x64::lowering;
LoweringTable::LoweringTable(X64Backend* backend) :
backend_(backend) {
xe_zero_struct(lookup_, sizeof(lookup_));
}
LoweringTable::~LoweringTable() {
for (size_t n = 0; n < XECOUNT(lookup_); n++) {
auto entry = lookup_[n];
while (entry) {
auto next = entry->next;
delete entry;
entry = next;
}
}
}
int LoweringTable::Initialize() {
RegisterSequences(this);
return 0;
}
void LoweringTable::AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn) {
auto existing_entry = lookup_[starting_opcode];
auto new_entry = new sequence_fn_entry_t();
new_entry->fn = fn;
new_entry->next = existing_entry;
lookup_[starting_opcode] = new_entry;
}
int LoweringTable::ProcessBlock(X64Emitter& e, hir::Block* block) {
// Process instructions.
auto instr = block->instr_head;
while (instr) {
bool processed = false;
auto entry = lookup_[instr->opcode->num];
while (entry) {
if ((*entry->fn)(e, instr)) {
processed = true;
break;
}
entry = entry->next;
}
if (!processed) {
// No sequence found!
XELOGE("Unable to process HIR opcode %s", instr->opcode->name);
return 1;
instr = e.Advance(instr);
}
}
return 0;
}

View File

@ -1,58 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_
#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_
#include <alloy/core.h>
#include <alloy/hir/hir_builder.h>
namespace alloy {
namespace backend {
namespace x64 {
class X64Backend;
class X64Emitter;
namespace lowering {
class LoweringTable {
public:
LoweringTable(X64Backend* backend);
~LoweringTable();
int Initialize();
int ProcessBlock(X64Emitter& e, hir::Block* block);
public:
typedef bool(*sequence_fn_t)(X64Emitter& e, hir::Instr*& instr);
void AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn);
private:
class sequence_fn_entry_t {
public:
sequence_fn_t fn;
sequence_fn_entry_t* next;
};
// NOTE: this class is shared by multiple threads and is not thread safe.
// Do not modify anything after init.
X64Backend* backend_;
sequence_fn_entry_t* lookup_[hir::__OPCODE_MAX_VALUE];
};
} // namespace lowering
} // namespace x64
} // namespace backend
} // namespace alloy
#endif // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_

File diff suppressed because it is too large

View File

@ -1,12 +0,0 @@
# Copyright 2013 Ben Vanik. All Rights Reserved.
{
'sources': [
'lowering_sequences.cc',
'lowering_sequences.h',
'lowering_table.cc',
'lowering_table.h',
'op_utils.inl',
'tracers.cc',
'tracers.h',
],
}

View File

@ -12,11 +12,12 @@
'x64_emitter.h',
'x64_function.cc',
'x64_function.h',
'x64_sequence.inl',
'x64_sequences.cc',
'x64_sequences.h',
'x64_thunk_emitter.cc',
'x64_thunk_emitter.h',
],
'includes': [
'lowering/sources.gypi',
'x64_tracers.cc',
'x64_tracers.h',
],
}

View File

@ -12,26 +12,23 @@
#include <alloy/backend/x64/tracing.h>
#include <alloy/backend/x64/x64_assembler.h>
#include <alloy/backend/x64/x64_code_cache.h>
#include <alloy/backend/x64/x64_sequences.h>
#include <alloy/backend/x64/x64_thunk_emitter.h>
#include <alloy/backend/x64/lowering/lowering_table.h>
#include <alloy/backend/x64/lowering/lowering_sequences.h>
using namespace alloy;
using namespace alloy::backend;
using namespace alloy::backend::x64;
using namespace alloy::backend::x64::lowering;
using namespace alloy::runtime;
X64Backend::X64Backend(Runtime* runtime) :
code_cache_(0), lowering_table_(0),
code_cache_(0),
Backend(runtime) {
}
X64Backend::~X64Backend() {
alloy::tracing::WriteEvent(EventType::Deinit({
}));
delete lowering_table_;
delete code_cache_;
}
@ -41,6 +38,8 @@ int X64Backend::Initialize() {
return result;
}
RegisterSequences();
machine_info_.register_sets[0] = {
0,
"gpr",
@ -68,9 +67,6 @@ int X64Backend::Initialize() {
delete thunk_emitter;
delete allocator;
lowering_table_ = new LoweringTable(this);
RegisterSequences(lowering_table_);
alloy::tracing::WriteEvent(EventType::Init({
}));

View File

@ -20,7 +20,6 @@ namespace backend {
namespace x64 {
class X64CodeCache;
namespace lowering { class LoweringTable; }
#define ALLOY_HAS_X64_BACKEND 1
@ -38,8 +37,6 @@ public:
HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; }
GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; }
lowering::LoweringTable* lowering_table() const { return lowering_table_; }
virtual int Initialize();
virtual Assembler* CreateAssembler();
@ -48,8 +45,6 @@ private:
X64CodeCache* code_cache_;
HostToGuestThunk host_to_guest_thunk_;
GuestToHostThunk guest_to_host_thunk_;
lowering::LoweringTable* lowering_table_;
};

View File

@ -11,10 +11,14 @@
#include <alloy/backend/x64/x64_backend.h>
#include <alloy/backend/x64/x64_code_cache.h>
#include <alloy/backend/x64/x64_function.h>
#include <alloy/backend/x64/x64_sequences.h>
#include <alloy/backend/x64/x64_thunk_emitter.h>
#include <alloy/backend/x64/lowering/lowering_table.h>
#include <alloy/hir/hir_builder.h>
#include <alloy/runtime/debug_info.h>
#include <alloy/runtime/runtime.h>
#include <alloy/runtime/symbol_info.h>
#include <alloy/runtime/thread_state.h>
using namespace alloy;
using namespace alloy::backend;
@ -31,6 +35,13 @@ namespace x64 {
static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024;
static const size_t STASH_OFFSET = 32;
// If we are running with tracing on we have to store the EFLAGS in the stack,
// otherwise our calls out to C to print will clear it before DID_CARRY/etc
// can get the value.
#define STORE_EFLAGS 1
} // namespace x64
} // namespace backend
} // namespace alloy
@ -145,12 +156,9 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) {
mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx);
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
// ReloadRDX:
mov(rdx, qword[rcx + 8]); // membase
}
auto lowering_table = backend_->lowering_table();
// Body.
auto block = builder->first_block();
while (block) {
@ -161,12 +169,17 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) {
label = label->next;
}
// Add instructions.
// The table will process sequences of instructions to (try to)
// generate optimal code.
current_instr_ = block->instr_head;
if (lowering_table->ProcessBlock(*this, block)) {
return 1;
// Process instructions.
const Instr* instr = block->instr_head;
while (instr) {
const Instr* new_tail = instr;
if (!SelectSequence(*this, instr, &new_tail)) {
// No sequence found!
XEASSERTALWAYS();
XELOGE("Unable to process HIR opcode %s", instr->opcode->name);
break;
}
instr = new_tail;
}
block = block->next;
@ -191,16 +204,320 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) {
return 0;
}
Instr* X64Emitter::Advance(Instr* i) {
auto next = i->next;
current_instr_ = next;
return next;
}
void X64Emitter::MarkSourceOffset(Instr* i) {
void X64Emitter::MarkSourceOffset(const Instr* i) {
auto entry = source_map_arena_.Alloc<SourceMapEntry>();
entry->source_offset = i->src1.offset;
entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal;
entry->code_offset = getSize();
source_map_count_++;
}
void X64Emitter::DebugBreak() {
// TODO(benvanik): notify debugger.
db(0xCC);
}
void X64Emitter::Trap() {
// TODO(benvanik): notify debugger.
db(0xCC);
}
void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
// TODO(benvanik): notify debugger.
db(0xCC);
XEASSERTALWAYS();
}
uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
// TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto symbol_info = reinterpret_cast<FunctionInfo*>(symbol_info_ptr);
Function* fn = NULL;
thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn);
XEASSERTNOTNULL(fn);
auto x64_fn = static_cast<X64Function*>(fn);
return reinterpret_cast<uint64_t>(x64_fn->machine_code());
}
void X64Emitter::Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info) {
auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
// Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (fn) {
mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
} else {
CallNative(ResolveFunctionSymbol, reinterpret_cast<uint64_t>(symbol_info));
}
// Actually jump/call to rax.
if (instr->flags & CALL_TAIL) {
// Pass the callers return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) {
// TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
// TODO(benvanik): required?
target_address &= 0xFFFFFFFF;
Function* fn = NULL;
thread_state->runtime()->ResolveFunction(target_address, &fn);
XEASSERTNOTNULL(fn);
auto x64_fn = static_cast<X64Function*>(fn);
return reinterpret_cast<uint64_t>(x64_fn->machine_code());
}
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
// Check if return.
if (instr->flags & CALL_POSSIBLE_RETURN) {
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
je("epilog", CodeGenerator::T_NEAR);
}
// Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (reg.getIdx() != rdx.getIdx()) {
mov(rdx, reg);
}
CallNative(ResolveFunctionAddress);
// Actually jump/call to rax.
if (instr->flags & CALL_TAIL) {
// Pass the callers return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {
auto symbol_info = reinterpret_cast<FunctionInfo*>(symbol_info_ptr);
XELOGW("undefined extern call to %.8X %s",
symbol_info->address(),
symbol_info->name());
return 0;
}
void X64Emitter::CallExtern(const hir::Instr* instr, const FunctionInfo* symbol_info) {
XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN);
if (!symbol_info->extern_handler()) {
CallNative(UndefinedCallExtern, reinterpret_cast<uint64_t>(symbol_info));
} else {
// rcx = context
// rdx = target host function
// r8 = arg0
// r9 = arg1
mov(rdx, reinterpret_cast<uint64_t>(symbol_info->extern_handler()));
mov(r8, reinterpret_cast<uint64_t>(symbol_info->extern_arg0()));
mov(r9, reinterpret_cast<uint64_t>(symbol_info->extern_arg1()));
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
call(rax);
ReloadECX();
ReloadEDX();
// rax = host return
}
}
void X64Emitter::CallNative(void* fn) {
mov(rax, reinterpret_cast<uint64_t>(fn));
call(rax);
ReloadECX();
ReloadEDX();
}
void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context)) {
mov(rax, reinterpret_cast<uint64_t>(fn));
call(rax);
ReloadECX();
ReloadEDX();
}
void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)) {
mov(rax, reinterpret_cast<uint64_t>(fn));
call(rax);
ReloadECX();
ReloadEDX();
}
void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0) {
mov(rdx, arg0);
mov(rax, reinterpret_cast<uint64_t>(fn));
call(rax);
ReloadECX();
ReloadEDX();
}
void X64Emitter::SetReturnAddress(uint64_t value) {
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], value);
}
void X64Emitter::ReloadECX() {
mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]);
}
void X64Emitter::ReloadEDX() {
mov(rdx, qword[rcx + 8]); // membase
}
void X64Emitter::LoadEflags() {
#if STORE_EFLAGS
mov(eax, dword[rsp + STASH_OFFSET]);
push(rax);
popf();
#else
// EFLAGS already present.
#endif // STORE_EFLAGS
}
void X64Emitter::StoreEflags() {
#if STORE_EFLAGS
pushf();
pop(qword[rsp + STASH_OFFSET]);
#else
// EFLAGS should have CA set?
// (so long as we don't fuck with it)
#endif // STORE_EFLAGS
}
bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
// Fits under 31 bits, so just load using normal mov.
return true;
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// Negative number that fits in 32bits.
return true;
}
return false;
}
void X64Emitter::MovMem64(const RegExp& addr, uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
// Fits under 31 bits, so just load using normal mov.
mov(qword[addr], v);
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// Negative number that fits in 32bits.
mov(qword[addr], v);
} else if (!(v >> 32)) {
// All high bits are zero. It'd be nice if we had a way to load a 32bit
// immediate without sign extending!
// TODO(benvanik): this is super common, find a better way.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], 0);
} else {
// 64bit number that needs double movs.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
}
}
Address X64Emitter::GetXmmConstPtr(XmmConst id) {
static const vec128_t xmm_consts[] = {
/* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f),
/* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f),
/* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
/* XMMMaskX16Y16 */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000),
/* XMMFlipX16Y16 */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000),
/* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f),
/* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f),
/* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f),
/* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
/* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u),
/* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
/* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15),
/* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f),
/* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu),
/* XMMOneMask */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu),
};
// TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to
// prevent this move.
// TODO(benvanik): move to predictable location in PPCContext? could then
// just do rcx relative addression with no rax overwriting.
mov(rax, (uint64_t)&xmm_consts[id]);
return ptr[rax];
}
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// http://www.agner.org/optimize/optimizing_assembly.pdf
// 13.4 Generating constants
if (!v.low && !v.high) {
// 0000...
vpxor(dest, dest);
} else if (v.low == ~0ull && v.high == ~0ull) {
// 1111...
vmovaps(dest, GetXmmConstPtr(XMMOneMask));
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
MovMem64(rsp + STASH_OFFSET, v.low);
MovMem64(rsp + STASH_OFFSET + 8, v.high);
vmovdqa(dest, ptr[rsp + STASH_OFFSET]);
}
}
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
union {
float f;
uint32_t i;
} x = { v };
if (!v) {
// 0
vpxor(dest, dest);
} else if (x.i == ~0UL) {
// 1111...
vmovaps(dest, GetXmmConstPtr(XMMOneMask));
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
mov(eax, x.i);
vmovd(dest, eax);
}
}
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
union {
double d;
uint64_t i;
} x = { v };
if (!v) {
// 0
vpxor(dest, dest);
} else if (x.i == ~0ULL) {
// 1111...
vmovaps(dest, GetXmmConstPtr(XMMOneMask));
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
mov(rax, x.i);
vmovq(dest, rax);
}
}
Address X64Emitter::StashXmm(const Xmm& r) {
auto addr = ptr[rsp + STASH_OFFSET];
vmovups(addr, r);
return addr;
}
Address X64Emitter::StashXmm(const vec128_t& v) {
auto addr = ptr[rsp + STASH_OFFSET];
LoadConstantXmm(xmm0, v);
vmovups(addr, xmm0);
return addr;
}

View File

@ -19,7 +19,9 @@
XEDECLARECLASS2(alloy, hir, HIRBuilder);
XEDECLARECLASS2(alloy, hir, Instr);
XEDECLARECLASS2(alloy, runtime, DebugInfo);
XEDECLARECLASS2(alloy, runtime, FunctionInfo);
XEDECLARECLASS2(alloy, runtime, Runtime);
XEDECLARECLASS2(alloy, runtime, SymbolInfo);
namespace alloy {
namespace backend {
@ -33,6 +35,25 @@ enum RegisterFlags {
REG_ABCD = (1 << 1),
};
enum XmmConst {
XMMZero = 0,
XMMOne = 1,
XMMNegativeOne = 2,
XMMMaskX16Y16 = 3,
XMMFlipX16Y16 = 4,
XMMFixX16Y16 = 5,
XMMNormalizeX16Y16 = 6,
XMM3301 = 7,
XMMSignMaskPS = 8,
XMMSignMaskPD = 9,
XMMByteSwapMask = 10,
XMMPermuteControl15 = 11,
XMMUnpackD3DCOLOR = 12,
XMMOneOver255 = 13,
XMMShiftMaskPS = 14,
XMMOneMask = 15,
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
class XbyakAllocator : public Xbyak::Allocator {
public:
@ -54,79 +75,68 @@ public:
void*& out_code_address, size_t& out_code_size);
public:
template<typename V0>
void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags) {
SetupReg(v0, r0);
}
template<typename V0, typename V1>
void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags,
hir::Value* v1, V1& r1, uint32_t r1_flags) {
SetupReg(v0, r0);
SetupReg(v1, r1);
}
template<typename V0, typename V1, typename V2>
void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags,
hir::Value* v1, V1& r1, uint32_t r1_flags,
hir::Value* v2, V2& r2, uint32_t r2_flags) {
SetupReg(v0, r0);
SetupReg(v1, r1);
SetupReg(v2, r2);
}
template<typename V0, typename V1, typename V2, typename V3>
void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags,
hir::Value* v1, V1& r1, uint32_t r1_flags,
hir::Value* v2, V2& r2, uint32_t r2_flags,
hir::Value* v3, V3& r3, uint32_t r3_flags) {
SetupReg(v0, r0);
SetupReg(v1, r1);
SetupReg(v2, r2);
SetupReg(v3, r3);
}
template<typename V0>
void EndOp(V0& r0) {
}
template<typename V0, typename V1>
void EndOp(V0& r0, V1& r1) {
}
template<typename V0, typename V1, typename V2>
void EndOp(V0& r0, V1& r1, V2& r2) {
}
template<typename V0, typename V1, typename V2, typename V3>
void EndOp(V0& r0, V1& r1, V2& r2, V3& r3) {
}
// Reserved: rsp
// Scratch: rax/rcx/rdx
// xmm0-1
// xmm0-2 (could be only xmm0 with some trickery)
// Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?)
// xmm6-xmm15 (save to get xmm2-xmm5)
// xmm6-xmm15 (save to get xmm3-xmm5)
static const int GPR_COUNT = 5;
static const int XMM_COUNT = 10;
static void SetupReg(hir::Value* v, Xbyak::Reg8& r) {
static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) {
auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg8(idx);
}
static void SetupReg(hir::Value* v, Xbyak::Reg16& r) {
static void SetupReg(const hir::Value* v, Xbyak::Reg16& r) {
auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg16(idx);
}
static void SetupReg(hir::Value* v, Xbyak::Reg32& r) {
static void SetupReg(const hir::Value* v, Xbyak::Reg32& r) {
auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg32(idx);
}
static void SetupReg(hir::Value* v, Xbyak::Reg64& r) {
static void SetupReg(const hir::Value* v, Xbyak::Reg64& r) {
auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg64(idx);
}
static void SetupReg(hir::Value* v, Xbyak::Xmm& r) {
static void SetupReg(const hir::Value* v, Xbyak::Xmm& r) {
auto idx = xmm_reg_map_[v->reg.index];
r = Xbyak::Xmm(idx);
}
hir::Instr* Advance(hir::Instr* i);
void MarkSourceOffset(const hir::Instr* i);
void MarkSourceOffset(hir::Instr* i);
void DebugBreak();
void Trap();
void UnimplementedInstr(const hir::Instr* i);
void UnimplementedExtern(const hir::Instr* i);
void Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info);
void CallIndirect(const hir::Instr* instr, const Xbyak::Reg64& reg);
void CallExtern(const hir::Instr* instr, const runtime::FunctionInfo* symbol_info);
void CallNative(void* fn);
void CallNative(uint64_t(*fn)(void* raw_context));
void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0));
void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0);
void SetReturnAddress(uint64_t value);
void ReloadECX();
void ReloadEDX();
// TODO(benvanik): Label for epilog (don't use strings).
void LoadEflags();
void StoreEflags();
// Moves a 64bit immediate into memory.
bool ConstantFitsIn32Reg(uint64_t v);
void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
Xbyak::Address GetXmmConstPtr(XmmConst id);
void LoadConstantXmm(Xbyak::Xmm dest, float v);
void LoadConstantXmm(Xbyak::Xmm dest, double v);
void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
Xbyak::Address StashXmm(const Xbyak::Xmm& r);
Xbyak::Address StashXmm(const vec128_t& v);
size_t stack_size() const { return stack_size_; }

View File

@ -0,0 +1,714 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
namespace {
enum KeyType {
KEY_TYPE_X = OPCODE_SIG_TYPE_X,
KEY_TYPE_L = OPCODE_SIG_TYPE_L,
KEY_TYPE_O = OPCODE_SIG_TYPE_O,
KEY_TYPE_S = OPCODE_SIG_TYPE_S,
KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE,
KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE,
KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE,
KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE,
KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE,
KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE,
KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE,
};
#pragma pack(push, 1)
union InstrKey {
struct {
uint32_t opcode : 8;
uint32_t dest : 5;
uint32_t src1 : 5;
uint32_t src2 : 5;
uint32_t src3 : 5;
uint32_t reserved : 4;
};
uint32_t value;
operator uint32_t() const {
return value;
}
InstrKey() : value(0) {}
InstrKey(uint32_t v) : value(v) {}
InstrKey(const Instr* i) : value(0) {
opcode = i->opcode->num;
uint32_t sig = i->opcode->signature;
dest = GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0;
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
if (src1 == OPCODE_SIG_TYPE_V) {
src1 += i->src1.value->type;
}
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
if (src2 == OPCODE_SIG_TYPE_V) {
src2 += i->src2.value->type;
}
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
if (src3 == OPCODE_SIG_TYPE_V) {
src3 += i->src3.value->type;
}
}
template <Opcode OPCODE,
KeyType DEST = KEY_TYPE_X,
KeyType SRC1 = KEY_TYPE_X,
KeyType SRC2 = KEY_TYPE_X,
KeyType SRC3 = KEY_TYPE_X>
struct Construct {
static const uint32_t value =
(OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23);
};
};
#pragma pack(pop)
static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes");
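// Illustration (not part of the commit): Construct<> packs each operand-type
// key into the same bit positions the bitfield above uses
// (opcode | dest << 8 | src1 << 13 | src2 << 18 | src3 << 23), so a
// compile-time head key can be compared directly against InstrKey(i).value at
// runtime. A hypothetical head for a v128 LOAD from an i64 address could be
// written as:
//   static const uint32_t example_key =
//       InstrKey::Construct<OPCODE_LOAD, KEY_TYPE_V_V128, KEY_TYPE_V_I64>::value;
// Unused slots default to KEY_TYPE_X (0), matching what InstrKey(i) yields for
// operands that are absent from the instruction's signature.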
template <typename... Ts>
struct CombinedStruct;
template <>
struct CombinedStruct<> {};
template <typename T, typename... Ts>
struct CombinedStruct<T, Ts...> : T, CombinedStruct<Ts...> {};
struct OpBase {};
template <typename T, KeyType KEY_TYPE>
struct Op : OpBase {
static const KeyType key_type = KEY_TYPE;
};
struct VoidOp : Op<VoidOp, KEY_TYPE_X> {
protected:
template <typename T, KeyType KEY_TYPE> friend struct Op;
template <hir::Opcode OPCODE, typename... Ts> friend struct I;
void Load(const Instr::Op& op) {}
};
struct OffsetOp : Op<OffsetOp, KEY_TYPE_O> {
uint64_t value;
protected:
template <typename T, KeyType KEY_TYPE> friend struct Op;
template <hir::Opcode OPCODE, typename... Ts> friend struct I;
void Load(const Instr::Op& op) {
this->value = op.offset;
}
};
struct SymbolOp : Op<SymbolOp, KEY_TYPE_S> {
FunctionInfo* value;
protected:
template <typename T, KeyType KEY_TYPE> friend struct Op;
template <hir::Opcode OPCODE, typename... Ts> friend struct I;
bool Load(const Instr::Op& op) {
this->value = op.symbol_info;
return true;
}
};
struct LabelOp : Op<LabelOp, KEY_TYPE_L> {
hir::Label* value;
protected:
template <typename T, KeyType KEY_TYPE> friend struct Op;
template <hir::Opcode OPCODE, typename... Ts> friend struct I;
void Load(const Instr::Op& op) {
this->value = op.label;
}
};
template <typename T, KeyType KEY_TYPE, typename REG_TYPE, typename CONST_TYPE, int TAG = -1>
struct ValueOp : Op<ValueOp<T, KEY_TYPE, REG_TYPE, CONST_TYPE, TAG>, KEY_TYPE> {
typedef REG_TYPE reg_type;
static const int tag = TAG;
const Value* value;
bool is_constant;
virtual bool ConstantFitsIn32Reg() const { return true; }
const REG_TYPE& reg() const {
XEASSERT(!is_constant);
return reg_;
}
operator const REG_TYPE&() const {
return reg();
}
bool IsEqual(const T& b) const {
if (is_constant && b.is_constant) {
return reinterpret_cast<const T*>(this)->constant() == b.constant();
} else if (!is_constant && !b.is_constant) {
return reg_.getIdx() == b.reg_.getIdx();
} else {
return false;
}
}
bool IsEqual(const Xbyak::Reg& b) const {
if (is_constant) {
return false;
} else if (!is_constant) {
return reg_.getIdx() == b.getIdx();
} else {
return false;
}
}
bool operator== (const T& b) const {
return IsEqual(b);
}
bool operator!= (const T& b) const {
return !IsEqual(b);
}
bool operator== (const Xbyak::Reg& b) const {
return IsEqual(b);
}
bool operator!= (const Xbyak::Reg& b) const {
return !IsEqual(b);
}
void Load(const Instr::Op& op) {
const Value* value = op.value;
this->value = value;
is_constant = value->IsConstant();
if (!is_constant) {
X64Emitter::SetupReg(value, reg_);
}
}
protected:
REG_TYPE reg_;
};
template <int TAG = -1>
struct I8 : ValueOp<I8<TAG>, KEY_TYPE_V_I8, Reg8, int8_t, TAG> {
const int8_t constant() const {
XEASSERT(is_constant);
return value->constant.i8;
}
};
template <int TAG = -1>
struct I16 : ValueOp<I16<TAG>, KEY_TYPE_V_I16, Reg16, int16_t, TAG> {
const int16_t constant() const {
XEASSERT(is_constant);
return value->constant.i16;
}
};
template <int TAG = -1>
struct I32 : ValueOp<I32<TAG>, KEY_TYPE_V_I32, Reg32, int32_t, TAG> {
const int32_t constant() const {
XEASSERT(is_constant);
return value->constant.i32;
}
};
template <int TAG = -1>
struct I64 : ValueOp<I64<TAG>, KEY_TYPE_V_I64, Reg64, int64_t, TAG> {
const int64_t constant() const {
XEASSERT(is_constant);
return value->constant.i64;
}
bool ConstantFitsIn32Reg() const override {
int64_t v = value->constant.i64;
if ((v & ~0x7FFFFFFF) == 0) {
// Fits under 31 bits, so just load using normal mov.
return true;
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// Negative number that fits in 32bits.
return true;
}
return false;
}
};
template <int TAG = -1>
struct F32 : ValueOp<F32<TAG>, KEY_TYPE_V_F32, Xmm, float, TAG> {
const float constant() const {
XEASSERT(is_constant);
return value->constant.f32;
}
};
template <int TAG = -1>
struct F64 : ValueOp<F64<TAG>, KEY_TYPE_V_F64, Xmm, double, TAG> {
const double constant() const {
XEASSERT(is_constant);
return value->constant.f64;
}
};
template <int TAG = -1>
struct V128 : ValueOp<V128<TAG>, KEY_TYPE_V_V128, Xmm, vec128_t, TAG> {
const vec128_t& constant() const {
XEASSERT(is_constant);
return value->constant.v128;
}
};
struct TagTable {
struct {
bool valid;
Instr::Op op;
} table[16];
template <typename T, typename std::enable_if<T::key_type == KEY_TYPE_X>::type* = nullptr>
bool CheckTag(const Instr::Op& op) {
return true;
}
template <typename T, typename std::enable_if<T::key_type == KEY_TYPE_L>::type* = nullptr>
bool CheckTag(const Instr::Op& op) {
return true;
}
template <typename T, typename std::enable_if<T::key_type == KEY_TYPE_O>::type* = nullptr>
bool CheckTag(const Instr::Op& op) {
return true;
}
template <typename T, typename std::enable_if<T::key_type == KEY_TYPE_S>::type* = nullptr>
bool CheckTag(const Instr::Op& op) {
return true;
}
template <typename T, typename std::enable_if<T::key_type >= KEY_TYPE_V_I8>::type* = nullptr>
bool CheckTag(const Instr::Op& op) {
const Value* value = op.value;
if (T::tag == -1) {
return true;
}
if (table[T::tag].valid &&
table[T::tag].op.value != value) {
return false;
}
table[T::tag].valid = true;
table[T::tag].op.value = (Value*)value;
return true;
}
};
template <typename DEST, typename... Tf>
struct DestField;
template <typename DEST>
struct DestField<DEST> {
DEST dest;
protected:
bool LoadDest(const Instr* i, TagTable& tag_table) {
Instr::Op op;
op.value = i->dest;
if (tag_table.CheckTag<DEST>(op)) {
dest.Load(op);
return true;
}
return false;
}
};
template <>
struct DestField<VoidOp> {
protected:
bool LoadDest(const Instr* i, TagTable& tag_table) {
return true;
}
};
template <hir::Opcode OPCODE, typename... Ts>
struct I;
template <hir::Opcode OPCODE, typename DEST>
struct I<OPCODE, DEST> : DestField<DEST> {
static const hir::Opcode opcode = OPCODE;
static const uint32_t key = InstrKey::Construct<OPCODE, DEST::key_type>::value;
static const KeyType dest_type = DEST::key_type;
const Instr* instr;
protected:
template <typename... Ti> friend struct SequenceFields;
bool Load(const Instr* i, TagTable& tag_table) {
if (InstrKey(i).value == key &&
LoadDest(i, tag_table)) {
instr = i;
return true;
}
return false;
}
};
template <hir::Opcode OPCODE, typename DEST, typename SRC1>
struct I<OPCODE, DEST, SRC1> : DestField<DEST> {
static const hir::Opcode opcode = OPCODE;
static const uint32_t key = InstrKey::Construct<OPCODE, DEST::key_type, SRC1::key_type>::value;
static const KeyType dest_type = DEST::key_type;
static const KeyType src1_type = SRC1::key_type;
const Instr* instr;
SRC1 src1;
protected:
template <typename... Ti> friend struct SequenceFields;
bool Load(const Instr* i, TagTable& tag_table) {
if (InstrKey(i).value == key &&
LoadDest(i, tag_table) &&
tag_table.CheckTag<SRC1>(i->src1)) {
instr = i;
src1.Load(i->src1);
return true;
}
return false;
}
};
template <hir::Opcode OPCODE, typename DEST, typename SRC1, typename SRC2>
struct I<OPCODE, DEST, SRC1, SRC2> : DestField<DEST> {
static const hir::Opcode opcode = OPCODE;
static const uint32_t key = InstrKey::Construct<OPCODE, DEST::key_type, SRC1::key_type, SRC2::key_type>::value;
static const KeyType dest_type = DEST::key_type;
static const KeyType src1_type = SRC1::key_type;
static const KeyType src2_type = SRC2::key_type;
const Instr* instr;
SRC1 src1;
SRC2 src2;
protected:
template <typename... Ti> friend struct SequenceFields;
bool Load(const Instr* i, TagTable& tag_table) {
if (InstrKey(i).value == key &&
LoadDest(i, tag_table) &&
tag_table.CheckTag<SRC1>(i->src1) &&
tag_table.CheckTag<SRC2>(i->src2)) {
instr = i;
src1.Load(i->src1);
src2.Load(i->src2);
return true;
}
return false;
}
};
template <hir::Opcode OPCODE, typename DEST, typename SRC1, typename SRC2, typename SRC3>
struct I<OPCODE, DEST, SRC1, SRC2, SRC3> : DestField<DEST> {
static const hir::Opcode opcode = OPCODE;
static const uint32_t key = InstrKey::Construct<OPCODE, DEST::key_type, SRC1::key_type, SRC2::key_type, SRC3::key_type>::value;
static const KeyType dest_type = DEST::key_type;
static const KeyType src1_type = SRC1::key_type;
static const KeyType src2_type = SRC2::key_type;
static const KeyType src3_type = SRC3::key_type;
const Instr* instr;
SRC1 src1;
SRC2 src2;
SRC3 src3;
protected:
template <typename... Ti> friend struct SequenceFields;
bool Load(const Instr* i, TagTable& tag_table) {
if (InstrKey(i).value == key &&
LoadDest(i, tag_table) &&
tag_table.CheckTag<SRC1>(i->src1) &&
tag_table.CheckTag<SRC2>(i->src2) &&
tag_table.CheckTag<SRC3>(i->src3)) {
instr = i;
src1.Load(i->src1);
src2.Load(i->src2);
src3.Load(i->src3);
return true;
}
return false;
}
};
template <typename... Ti>
struct SequenceFields;
template <typename I1>
struct SequenceFields<I1> {
I1 i1;
typedef typename I1 I1Type;
protected:
template <typename SEQ, typename... Ti> friend struct Sequence;
bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) {
if (i1.Load(i, tag_table)) {
*new_tail = i->next;
return true;
}
return false;
}
};
template <typename I1, typename I2>
struct SequenceFields<I1, I2> : SequenceFields<I1> {
I2 i2;
protected:
template <typename SEQ, typename... Ti> friend struct Sequence;
bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) {
if (SequenceFields<I1>::Check(i, tag_table, new_tail)) {
auto ni = i->next;
if (ni && i2.Load(ni, tag_table)) {
*new_tail = ni;
return i;
}
}
return false;
}
};
template <typename I1, typename I2, typename I3>
struct SequenceFields<I1, I2, I3> : SequenceFields<I1, I2> {
I3 i3;
protected:
template <typename SEQ, typename... Ti> friend struct Sequence;
bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) {
if (SequenceFields<I1, I2>::Check(i, tag_table, new_tail)) {
auto ni = i->next;
if (ni && i3.Load(ni, tag_table)) {
*new_tail = ni;
return i;
}
}
return false;
}
};
template <typename I1, typename I2, typename I3, typename I4>
struct SequenceFields<I1, I2, I3, I4> : SequenceFields<I1, I2, I3> {
I4 i4;
protected:
template <typename SEQ, typename... Ti> friend struct Sequence;
bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) {
if (SequenceFields<I1, I2, I3>::Check(i, tag_table, new_tail)) {
auto ni = i->next;
if (ni && i4.Load(ni, tag_table)) {
*new_tail = ni;
return i;
}
}
return false;
}
};
template <typename I1, typename I2, typename I3, typename I4, typename I5>
struct SequenceFields<I1, I2, I3, I4, I5> : SequenceFields<I1, I2, I3, I4> {
I5 i5;
protected:
template <typename SEQ, typename... Ti> friend struct Sequence;
bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) {
if (SequenceFields<I1, I2, I3, I4>::Check(i, tag_table, new_tail)) {
auto ni = i->next;
if (ni && i5.Load(ni, tag_table)) {
*new_tail = ni;
return i;
}
}
return false;
}
};
template <typename SEQ, typename... Ti>
struct Sequence {
struct EmitArgs : SequenceFields<Ti...> {};
static bool Select(X64Emitter& e, const Instr* i, const Instr** new_tail) {
EmitArgs args;
TagTable tag_table;
if (!args.Check(i, tag_table, new_tail)) {
return false;
}
SEQ::Emit(e, args);
return true;
}
};
template <typename T>
const T GetTempReg(X64Emitter& e);
template <>
const Reg8 GetTempReg<Reg8>(X64Emitter& e) {
return e.al;
}
template <>
const Reg16 GetTempReg<Reg16>(X64Emitter& e) {
return e.ax;
}
template <>
const Reg32 GetTempReg<Reg32>(X64Emitter& e) {
return e.eax;
}
template <>
const Reg64 GetTempReg<Reg64>(X64Emitter& e) {
return e.rax;
}
template <typename SEQ, typename T>
struct SingleSequence : public Sequence<SingleSequence<SEQ, T>, T> {
typedef T EmitArgType;
static const uint32_t head_key = T::key;
static void Emit(X64Emitter& e, const EmitArgs& _) {
SEQ::Emit(e, _.i1);
}
template <typename REG_FN>
static void EmitUnaryOp(
X64Emitter& e, const EmitArgType& i,
const REG_FN& reg_fn) {
if (i.src1.is_constant) {
e.mov(i.dest, i.src1.constant());
reg_fn(e, i.dest);
} else {
if (i.dest != i.src1) {
e.mov(i.dest, i.src1);
}
reg_fn(e, i.dest);
}
}
template <typename REG_REG_FN, typename REG_CONST_FN>
static void EmitCommutativeBinaryOp(
X64Emitter& e, const EmitArgType& i,
const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) {
if (i.src1.is_constant) {
XEASSERT(!i.src2.is_constant);
if (i.dest == i.src2) {
if (i.src1.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, static_cast<int32_t>(i.src1.constant()));
} else {
auto temp = GetTempReg<decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());
reg_reg_fn(e, i.dest, temp);
}
} else {
e.mov(i.dest, i.src1.constant());
reg_reg_fn(e, i.dest, i.src2);
}
} else if (i.src2.is_constant) {
if (i.dest == i.src1) {
if (i.src2.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, static_cast<int32_t>(i.src2.constant()));
} else {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2.constant());
reg_reg_fn(e, i.dest, temp);
}
} else {
e.mov(i.dest, i.src2.constant());
reg_reg_fn(e, i.dest, i.src1);
}
} else {
if (i.dest == i.src1) {
reg_reg_fn(e, i.dest, i.src2);
} else if (i.dest == i.src2) {
reg_reg_fn(e, i.dest, i.src1);
} else {
e.mov(i.dest, i.src1);
reg_reg_fn(e, i.dest, i.src2);
}
}
}
template <typename REG_REG_FN, typename REG_CONST_FN>
static void EmitAssociativeBinaryOp(
X64Emitter& e, const EmitArgType& i,
const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) {
if (i.src1.is_constant) {
XEASSERT(!i.src2.is_constant);
if (i.dest == i.src2) {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2);
e.mov(i.dest, i.src1.constant());
reg_reg_fn(e, i.dest, temp);
} else {
e.mov(i.dest, i.src1.constant());
reg_reg_fn(e, i.dest, i.src2);
}
} else if (i.src2.is_constant) {
if (i.dest == i.src1) {
if (i.src2.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, static_cast<int32_t>(i.src2.constant()));
} else {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2.constant());
reg_reg_fn(e, i.dest, temp);
}
} else {
e.mov(i.dest, i.src1);
if (i.src2.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, static_cast<int32_t>(i.src2.constant()));
} else {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2.constant());
reg_reg_fn(e, i.dest, temp);
}
}
} else {
if (i.dest == i.src1) {
reg_reg_fn(e, i.dest, i.src2);
} else if (i.dest == i.src2) {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2);
e.mov(i.dest, i.src1);
reg_reg_fn(e, i.dest, temp);
} else {
e.mov(i.dest, i.src1);
reg_reg_fn(e, i.dest, i.src2);
}
}
}
template <typename REG_REG_FN, typename REG_CONST_FN>
static void EmitCommutativeCompareOp(
X64Emitter& e, const EmitArgType& i,
const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) {
if (i.src1.is_constant) {
XEASSERT(!i.src2.is_constant);
if (i.src1.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.src2, static_cast<int32_t>(i.src1.constant()));
} else {
auto temp = GetTempReg<decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());
reg_reg_fn(e, i.src2, temp);
}
} else if (i.src2.is_constant) {
if (i.src2.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.src1, static_cast<int32_t>(i.src2.constant()));
} else {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2.constant());
reg_reg_fn(e, i.src1, temp);
}
} else {
reg_reg_fn(e, i.src1, i.src2);
}
}
template <typename REG_REG_FN, typename REG_CONST_FN>
static void EmitAssociativeCompareOp(
X64Emitter& e, const EmitArgType& i,
const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) {
if (i.src1.is_constant) {
XEASSERT(!i.src2.is_constant);
if (i.src1.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, i.src2, static_cast<int32_t>(i.src1.constant()), true);
} else {
auto temp = GetTempReg<decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());
reg_reg_fn(e, i.dest, i.src2, temp, true);
}
} else if (i.src2.is_constant) {
if (i.src2.ConstantFitsIn32Reg()) {
reg_const_fn(e, i.dest, i.src1, static_cast<int32_t>(i.src2.constant()), false);
} else {
auto temp = GetTempReg<decltype(i.src2)::reg_type>(e);
e.mov(temp, i.src2.constant());
reg_reg_fn(e, i.dest, i.src1, temp, false);
}
} else {
reg_reg_fn(e, i.dest, i.src1, i.src2, false);
}
}
};
static const int ANY = -1;
typedef int tag_t;
static const tag_t TAG0 = 0;
static const tag_t TAG1 = 1;
static const tag_t TAG2 = 2;
static const tag_t TAG3 = 3;
static const tag_t TAG4 = 4;
static const tag_t TAG5 = 5;
static const tag_t TAG6 = 6;
static const tag_t TAG7 = 7;
typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, const Instr**);
template <typename T>
void Register() {
sequence_table.insert({ T::head_key, T::Select });
}
template <typename T, typename Tn, typename... Ts>
void Register() {
Register<T>();
Register<Tn, Ts...>();
};
#define EMITTER_OPCODE_TABLE(name, ...) \
void Register_##name() { \
Register<__VA_ARGS__>(); \
}
#define MATCH(...) __VA_ARGS__
#define EMITTER(name, match) struct name : SingleSequence<name, match>
#define SEQUENCE(name, match) struct name : Sequence<name, match>
} // namespace

File diff suppressed because it is too large
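Since x64_sequences.cc is suppressed above, the dispatch table it defines is not visible in this excerpt. Based only on the SequenceSelectFn typedef and the Register<T>() helper in x64_sequence.inl (which inserts { T::head_key, T::Select } into a sequence_table), a plausible sketch of the dispatcher, offered as an assumption rather than the committed code:

// Assumes <unordered_map>. Several sequences can share a head key, so a
// multimap keyed by the head InstrKey is used here.
std::unordered_multimap<uint32_t, SequenceSelectFn> sequence_table;

bool SelectSequence(X64Emitter& e, const Instr* i, const Instr** new_tail) {
  const uint32_t key = InstrKey(i).value;
  auto range = sequence_table.equal_range(key);
  for (auto it = range.first; it != range.second; ++it) {
    if (it->second(e, i, new_tail)) {
      // A sequence matched; code was emitted and *new_tail points past the
      // consumed instructions.
      return true;
    }
  }
  return false;  // Caller logs "Unable to process HIR opcode ...".
}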

View File

@ -2,32 +2,32 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
#ifndef ALLOY_BACKEND_X64_X64_SEQUENCES_H_
#define ALLOY_BACKEND_X64_X64_SEQUENCES_H_
#include <alloy/core.h>
#include <alloy/hir/instr.h>
XEDECLARECLASS2(alloy, hir, Instr);
namespace alloy {
namespace backend {
namespace x64 {
namespace lowering {
class LoweringTable;
void RegisterSequences(LoweringTable* table);
class X64Emitter;
void RegisterSequences();
bool SelectSequence(X64Emitter& e, const hir::Instr* i, const hir::Instr** new_tail);
} // namespace lowering
} // namespace x64
} // namespace backend
} // namespace alloy
#endif // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
#endif // ALLOY_BACKEND_X64_X64_SEQUENCES_H_

View File

@ -7,7 +7,7 @@
******************************************************************************
*/
#include <alloy/backend/x64/lowering/tracers.h>
#include <alloy/backend/x64/x64_tracers.h>
#include <alloy/backend/x64/x64_emitter.h>
#include <alloy/runtime/runtime.h>
@ -15,19 +15,14 @@
using namespace alloy;
using namespace alloy::backend::x64;
using namespace alloy::backend::x64::lowering;
using namespace alloy::runtime;
namespace alloy {
namespace backend {
namespace x64 {
namespace lowering {
#define IFLUSH()
#define IPRINT
#define DFLUSH()
#define DPRINT
#define ITRACE 0
#define DTRACE 0
#define TARGET_THREAD 1
@ -36,6 +31,16 @@ namespace lowering {
#define DFLUSH() fflush(stdout)
#define DPRINT DFLUSH(); if (thread_state->thread_id() == TARGET_THREAD) printf
uint32_t GetTracingMode() {
uint32_t mode = 0;
#if ITRACE
mode |= TRACING_INSTR;
#endif // ITRACE
#if DTRACE
mode |= TRACING_DATA;
#endif // DTRACE
return mode;
}
void TraceString(void* raw_context, const char* str) {
auto thread_state = *((ThreadState**)raw_context);
@ -190,7 +195,6 @@ void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) {
}
} // namespace lowering
} // namespace x64
} // namespace backend
} // namespace alloy

View File

@ -7,8 +7,8 @@
******************************************************************************
*/
#ifndef ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_
#define ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_
#ifndef ALLOY_BACKEND_X64_X64_TRACERS_H_
#define ALLOY_BACKEND_X64_X64_TRACERS_H_
#include <alloy/core.h>
@ -33,7 +33,15 @@ namespace alloy {
namespace backend {
namespace x64 {
class X64Emitter;
namespace lowering {
enum TracingMode {
TRACING_INSTR = (1 << 1),
TRACING_DATA = (1 << 2),
};
uint32_t GetTracingMode();
inline bool IsTracingInstr() { return (GetTracingMode() & TRACING_INSTR) != 0; }
inline bool IsTracingData() { return (GetTracingMode() & TRACING_DATA) != 0; }
void TraceString(void* raw_context, const char* str);
@ -69,10 +77,9 @@ void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value);
} // namespace lowering
} // namespace x64
} // namespace backend
} // namespace alloy
#endif // ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_
#endif // ALLOY_BACKEND_X64_X64_TRACERS_H_

View File

@ -368,6 +368,13 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
i->Remove();
}
break;
case OPCODE_CNTLZ:
if (i->src1.value->IsConstant()) {
v->set_zero(v->type);
v->CountLeadingZeros(i->src1.value->constant);
i->Remove();
}
break;
// TODO(benvanik): INSERT/EXTRACT
// TODO(benvanik): SPLAT/PERMUTE/SWIZZLE
case OPCODE_SPLAT:

View File

@ -9,6 +9,8 @@
#include <alloy/compiler/passes/context_promotion_pass.h>
#include <gflags/gflags.h>
#include <alloy/compiler/compiler.h>
#include <alloy/runtime/runtime.h>
@ -20,6 +22,10 @@ using namespace alloy::hir;
using namespace alloy::runtime;
DEFINE_bool(store_all_context_values, false,
"Don't strip dead context stores to aid in debugging.");
ContextPromotionPass::ContextPromotionPass() :
context_values_size_(0), context_values_(0),
CompilerPass() {
@ -69,10 +75,12 @@ int ContextPromotionPass::Run(HIRBuilder* builder) {
}
// Remove all dead stores.
block = builder->first_block();
while (block) {
RemoveDeadStoresBlock(block);
block = block->next;
if (!FLAGS_store_all_context_values) {
block = builder->first_block();
while (block) {
RemoveDeadStoresBlock(block);
block = block->next;
}
}
return 0;

View File

@ -13,12 +13,6 @@
#include <alloy/compiler/compiler.h>
#include <alloy/runtime/runtime.h>
#pragma warning(push)
#pragma warning(disable : 4244)
#pragma warning(disable : 4267)
#include <llvm/ADT/BitVector.h>
#pragma warning(pop)
using namespace alloy;
using namespace alloy::backend;
using namespace alloy::compiler;

View File

@ -36,8 +36,6 @@ DataFlowAnalysisPass::~DataFlowAnalysisPass() {
}
int DataFlowAnalysisPass::Run(HIRBuilder* builder) {
auto arena = builder->arena();
// Linearize blocks so that we can detect cycles and propagate dependencies.
uint32_t block_count = LinearizeBlocks(builder);

View File

@ -9,6 +9,8 @@
#include <alloy/compiler/passes/register_allocation_pass.h>
#include <algorithm>
using namespace alloy;
using namespace alloy::backend;
using namespace alloy::compiler;
@ -16,180 +18,135 @@ using namespace alloy::compiler::passes;
using namespace alloy::hir;
struct RegisterAllocationPass::Interval {
uint32_t start_ordinal;
uint32_t end_ordinal;
Value* value;
RegisterFreeUntilSet* free_until_set;
// TODO(benvanik): reduce to offsets in arena?
struct Interval* next;
struct Interval* prev;
#define ASSERT_NO_CYCLES 0
void AddToList(Interval** list_head) {
auto list_next = *list_head;
this->next = list_next;
if (list_next) {
list_next->prev = this;
}
*list_head = this;
}
void InsertIntoList(Interval** list_head) {
auto it = *list_head;
while (it) {
if (it->start_ordinal > this->start_ordinal) {
// Went too far. Insert before this interval.
this->prev = it->prev;
this->next = it;
if (it->prev) {
it->prev->next = this;
} else {
*list_head = this;
}
it->prev = this;
return;
}
if (!it->next) {
// None found, add at tail.
it->next = this;
this->prev = it;
return;
}
it = it->next;
}
}
void RemoveFromList(Interval** list_head) {
if (this->next) {
this->next->prev = this->prev;
}
if (this->prev) {
this->prev->next = this->next;
} else {
*list_head = this->next;
}
this->next = this->prev = NULL;
}
};
struct RegisterAllocationPass::Intervals {
Interval* unhandled;
Interval* active;
Interval* handled;
};
RegisterAllocationPass::RegisterAllocationPass(
const MachineInfo* machine_info) :
machine_info_(machine_info),
CompilerPass() {
// Initialize register sets. The values of these will be
// cleared before use, so just the structure is required.
// Initialize register sets.
// TODO(benvanik): rewrite in a way that makes sense - this is terrible.
auto mi_sets = machine_info->register_sets;
xe_zero_struct(&free_until_sets_, sizeof(free_until_sets_));
xe_zero_struct(&usage_sets_, sizeof(usage_sets_));
uint32_t n = 0;
while (mi_sets[n].count) {
auto& mi_set = mi_sets[n];
auto free_until_set = new RegisterFreeUntilSet();
free_until_sets_.all_sets[n] = free_until_set;
free_until_set->count = mi_set.count;
free_until_set->set = &mi_set;
auto usage_set = new RegisterSetUsage();
usage_sets_.all_sets[n] = usage_set;
usage_set->count = mi_set.count;
usage_set->set = &mi_set;
if (mi_set.types & MachineInfo::RegisterSet::INT_TYPES) {
free_until_sets_.int_set = free_until_set;
usage_sets_.int_set = usage_set;
}
if (mi_set.types & MachineInfo::RegisterSet::FLOAT_TYPES) {
free_until_sets_.float_set = free_until_set;
usage_sets_.float_set = usage_set;
}
if (mi_set.types & MachineInfo::RegisterSet::VEC_TYPES) {
free_until_sets_.vec_set = free_until_set;
usage_sets_.vec_set = usage_set;
}
n++;
}
}
RegisterAllocationPass::~RegisterAllocationPass() {
for (size_t n = 0; n < XECOUNT(free_until_sets_.all_sets); n++) {
if (!free_until_sets_.all_sets[n]) {
for (size_t n = 0; n < XECOUNT(usage_sets_.all_sets); n++) {
if (!usage_sets_.all_sets[n]) {
break;
}
delete free_until_sets_.all_sets[n];
delete usage_sets_.all_sets[n];
}
}
int RegisterAllocationPass::Run(HIRBuilder* builder) {
// A (probably broken) implementation of a linear scan register allocator
// that operates directly on SSA form:
// http://www.christianwimmer.at/Publications/Wimmer10a/Wimmer10a.pdf
//
// Requirements:
// - SSA form (single definition for variables)
// - block should be in linear order:
// - dominators *should* come before (a->b->c)
// - loop block sequences *should not* have intervening non-loop blocks
// Simple per-block allocator that operates on SSA form.
// Registers do not move across blocks, though this could be
// optimized with some intra-block analysis (dominators/etc).
// Really, it'd just be nice to have someone who knew what they
// were doing lower SSA and do this right.
auto arena = scratch_arena();
// Renumber everything.
uint32_t block_ordinal = 0;
uint32_t instr_ordinal = 0;
auto block = builder->first_block();
while (block) {
// Sequential block ordinals.
block->ordinal = block_ordinal++;
// Reset all state.
PrepareBlockState();
// Renumber all instructions in the block. This is required so that
// we can sort the usage pointers below.
auto instr = block->instr_head;
while (instr) {
// Sequential global instruction ordinals.
instr->ordinal = instr_ordinal++;
instr = instr->next;
}
block = block->next;
}
// Compute all liveness ranges by walking forward through all
// blocks/instructions and checking the last use of each value. This lets
// us know the exact order in (block#,instr#) form, which is then used to
// setup the range.
// TODO(benvanik): ideally we would have a list of all values and not have
// to keep walking instructions over and over.
Interval* prev_interval = NULL;
Interval* head_interval = NULL;
block = builder->first_block();
while (block) {
auto instr = block->instr_head;
instr = block->instr_head;
while (instr) {
// Compute last-use for the dest value.
// Since we know all values of importance must be defined, we can avoid
// having to check every value and just look at dest.
const OpcodeInfo* info = instr->opcode;
if (GET_OPCODE_SIG_TYPE_DEST(info->signature) == OPCODE_SIG_TYPE_V) {
auto v = instr->dest;
if (!v->last_use) {
ComputeLastUse(v);
}
uint32_t signature = info->signature;
// Add interval.
auto interval = arena->Alloc<Interval>();
interval->start_ordinal = instr->ordinal;
interval->end_ordinal = v->last_use ?
v->last_use->ordinal : v->def->ordinal;
interval->value = v;
interval->next = NULL;
interval->prev = prev_interval;
if (prev_interval) {
prev_interval->next = interval;
} else {
head_interval = interval;
}
prev_interval = interval;
// Update the register use heaps.
AdvanceUses(instr);
// Grab register set to use.
// We do this now so it's only once per interval, and it makes it easy
// to only compare intervals that overlap their sets.
if (v->type <= INT64_TYPE) {
interval->free_until_set = free_until_sets_.int_set;
} else if (v->type <= FLOAT64_TYPE) {
interval->free_until_set = free_until_sets_.float_set;
// Check sources for retirement. If any are unused after this instruction
// we can eagerly evict them to speed up register allocation.
// Since X64 (and other platforms) can often take advantage of dest==src1
// register mappings we track retired src1 so that we can attempt to
// reuse it.
// NOTE: these checks require that the usage list be sorted!
bool has_preferred_reg = false;
RegAssignment preferred_reg = { 0 };
if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V &&
!instr->src1.value->IsConstant()) {
if (!instr->src1_use->next) {
// Pull off preferred register. We will try to reuse this for the
// dest.
has_preferred_reg = true;
preferred_reg = instr->src1.value->reg;
XEASSERTNOTNULL(preferred_reg.set);
}
}
if (GET_OPCODE_SIG_TYPE_DEST(signature) == OPCODE_SIG_TYPE_V) {
// Must not have been set already.
XEASSERTNULL(instr->dest->reg.set);
// Sort the usage list. We depend on this in future uses of this variable.
SortUsageList(instr->dest);
// If we have a preferred register, use that.
      // This way we can help along the stupid X86 two-operand instructions.
bool allocated;
if (has_preferred_reg) {
// Allocate with the given preferred register. If the register is in
// the wrong set it will not be reused.
allocated = TryAllocateRegister(instr->dest, preferred_reg);
} else {
interval->free_until_set = free_until_sets_.vec_set;
// Allocate a register. This will either reserve a free one or
// spill and reuse an active one.
allocated = TryAllocateRegister(instr->dest);
}
if (!allocated) {
// Failed to allocate register -- need to spill and try again.
// We spill only those registers we aren't using.
if (!SpillOneRegister(builder, instr->dest->type)) {
// Unable to spill anything - this shouldn't happen.
XELOGE("Unable to spill any registers");
XEASSERTALWAYS();
return 1;
}
// Demand allocation.
if (!TryAllocateRegister(instr->dest)) {
// Boned.
XELOGE("Register allocation failed");
XEASSERTALWAYS();
return 1;
}
}
}
@ -198,228 +155,266 @@ int RegisterAllocationPass::Run(HIRBuilder* builder) {
block = block->next;
}
// Now have a sorted list of intervals, minus their ending ordinals.
Intervals intervals;
intervals.unhandled = head_interval;
intervals.active = intervals.handled = NULL;
while (intervals.unhandled) {
// Get next unhandled interval.
auto current = intervals.unhandled;
intervals.unhandled = intervals.unhandled->next;
current->RemoveFromList(&intervals.unhandled);
// Check for intervals in active that are handled or inactive.
auto it = intervals.active;
while (it) {
auto next = it->next;
if (it->end_ordinal <= current->start_ordinal) {
// Move from active to handled.
it->RemoveFromList(&intervals.active);
it->AddToList(&intervals.handled);
}
it = next;
}
// Find a register for current.
if (!TryAllocateFreeReg(current, intervals)) {
// Failed, spill.
AllocateBlockedReg(builder, current, intervals);
}
    if (current->value->reg.index != -1) {
// Add current to active.
current->AddToList(&intervals.active);
}
}
return 0;
}
void RegisterAllocationPass::ComputeLastUse(Value* value) {
// TODO(benvanik): compute during construction?
// Note that this list isn't sorted (unfortunately), so we have to scan
// them all.
uint32_t max_ordinal = 0;
Value::Use* last_use = NULL;
auto use = value->use_head;
while (use) {
if (!last_use || use->instr->ordinal >= max_ordinal) {
last_use = use;
max_ordinal = use->instr->ordinal;
}
use = use->next;
}
value->last_use = last_use ? last_use->instr : NULL;
}
bool RegisterAllocationPass::TryAllocateFreeReg(
Interval* current, Intervals& intervals) {
// Reset all registers in the set to unused.
auto free_until_set = current->free_until_set;
for (uint32_t n = 0; n < free_until_set->count; n++) {
free_until_set->pos[n] = -1;
}
// Mark all active registers as used.
// TODO(benvanik): keep some kind of bitvector so that this is instant?
auto it = intervals.active;
while (it) {
if (it->free_until_set == free_until_set) {
free_until_set->pos[it->value->reg.index] = 0;
}
it = it->next;
}
uint32_t max_pos = 0;
for (uint32_t n = 0; n < free_until_set->count; n++) {
if (max_pos == -1) {
max_pos = n;
} else {
if (free_until_set->pos[n] > free_until_set->pos[max_pos]) {
max_pos = n;
void RegisterAllocationPass::DumpUsage(const char* name) {
#if 0
fprintf(stdout, "\n%s:\n", name);
for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) {
auto usage_set = usage_sets_.all_sets[i];
if (usage_set) {
fprintf(stdout, "set %s:\n", usage_set->set->name);
fprintf(stdout, " avail: %s\n", usage_set->availability.to_string().c_str());
fprintf(stdout, " upcoming uses:\n");
for (auto it = usage_set->upcoming_uses.begin();
it != usage_set->upcoming_uses.end(); ++it) {
fprintf(stdout, " v%d, used at %d\n",
it->value->ordinal,
it->use->instr->ordinal);
}
}
}
if (!free_until_set->pos[max_pos]) {
// No register available without spilling.
return false;
}
if (current->end_ordinal < free_until_set->pos[max_pos]) {
// Register available for the whole interval.
current->value->reg.set = free_until_set->set;
current->value->reg.index = max_pos;
} else {
// Register available for the first part of the interval.
// Split the interval at where it hits the next one.
//current->value->reg = max_pos;
//SplitRange(current, free_until_set->pos[max_pos]);
// TODO(benvanik): actually split -- for now we just spill.
return false;
}
return true;
fflush(stdout);
#endif
}
void RegisterAllocationPass::AllocateBlockedReg(
HIRBuilder* builder, Interval* current, Intervals& intervals) {
auto free_until_set = current->free_until_set;
// TODO(benvanik): smart heuristics.
// wimmer AllocateBlockedReg has some stuff for deciding whether to
// spill current or some other active interval - which we ignore.
// Pick a random interval. Maybe the first. Sure.
auto spill_interval = intervals.active;
Value* spill_value = NULL;
Instr* prev_use = NULL;
Instr* next_use = NULL;
while (spill_interval) {
if (spill_interval->free_until_set != free_until_set ||
spill_interval->start_ordinal == current->start_ordinal) {
// Only interested in ones of the same register set.
// We also ensure that ones at the same ordinal as us are ignored,
// which can happen with multiple local inserts/etc.
spill_interval = spill_interval->next;
continue;
void RegisterAllocationPass::PrepareBlockState() {
for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) {
auto usage_set = usage_sets_.all_sets[i];
if (usage_set) {
usage_set->availability.set();
usage_set->upcoming_uses.clear();
}
spill_value = spill_interval->value;
}
DumpUsage("PrepareBlockState");
}
// Find the uses right before/after current.
auto use = spill_value->use_head;
while (use) {
if (use->instr->ordinal != -1) {
if (use->instr->ordinal < current->start_ordinal) {
if (!prev_use || prev_use->ordinal < use->instr->ordinal) {
prev_use = use->instr;
}
} else if (use->instr->ordinal > current->start_ordinal) {
if (!next_use || next_use->ordinal > use->instr->ordinal) {
next_use = use->instr;
}
void RegisterAllocationPass::AdvanceUses(Instr* instr) {
for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) {
auto usage_set = usage_sets_.all_sets[i];
if (!usage_set) {
break;
}
auto& upcoming_uses = usage_set->upcoming_uses;
for (auto it = upcoming_uses.begin(); it != upcoming_uses.end();) {
if (!it->use) {
// No uses at all - we can remove right away.
// This comes up from instructions where the dest is never used,
// like the ATOMIC ops.
MarkRegAvailable(it->value->reg);
it = upcoming_uses.erase(it);
continue;
}
if (it->use->instr != instr) {
// Not yet at this instruction.
++it;
continue;
}
// The use is from this instruction.
if (!it->use->next) {
// Last use of the value. We can retire it now.
MarkRegAvailable(it->value->reg);
it = upcoming_uses.erase(it);
} else {
// Used again. Push back the next use.
        // Note that the value may be used multiple times by this instruction,
        // so eat those extra uses.
auto next_use = it->use->next;
while (next_use->next && next_use->instr == instr) {
next_use = next_use->next;
}
// Remove the iterator.
auto value = it->value;
it = upcoming_uses.erase(it);
upcoming_uses.emplace_back(value, next_use);
}
use = use->next;
}
if (!prev_use) {
prev_use = spill_value->def;
}
if (prev_use->next == next_use) {
// Uh, this interval is way too short.
spill_interval = spill_interval->next;
continue;
}
XEASSERT(prev_use->ordinal != -1);
XEASSERTNOTNULL(next_use);
break;
}
XEASSERT(spill_interval->free_until_set == free_until_set);
DumpUsage("AdvanceUses");
}
// Find the real last use -- paired ops may require sequences to stay
// intact. This is a bad design.
auto prev_def_tail = prev_use;
while (prev_def_tail &&
prev_def_tail->opcode->flags & OPCODE_FLAG_PAIRED_PREV) {
prev_def_tail = prev_def_tail->prev;
bool RegisterAllocationPass::IsRegInUse(const RegAssignment& reg) {
RegisterSetUsage* usage_set;
if (reg.set == usage_sets_.int_set->set) {
usage_set = usage_sets_.int_set;
} else if (reg.set == usage_sets_.float_set->set) {
usage_set = usage_sets_.float_set;
} else {
usage_set = usage_sets_.vec_set;
}
return !usage_set->availability.test(reg.index);
}
RegisterAllocationPass::RegisterSetUsage*
RegisterAllocationPass::MarkRegUsed(const RegAssignment& reg,
Value* value, Value::Use* use) {
auto usage_set = RegisterSetForValue(value);
usage_set->availability.set(reg.index, false);
usage_set->upcoming_uses.emplace_back(value, use);
DumpUsage("MarkRegUsed");
return usage_set;
}
RegisterAllocationPass::RegisterSetUsage*
RegisterAllocationPass::MarkRegAvailable(const hir::RegAssignment& reg) {
RegisterSetUsage* usage_set;
if (reg.set == usage_sets_.int_set->set) {
usage_set = usage_sets_.int_set;
} else if (reg.set == usage_sets_.float_set->set) {
usage_set = usage_sets_.float_set;
} else {
usage_set = usage_sets_.vec_set;
}
usage_set->availability.set(reg.index, true);
return usage_set;
}
bool RegisterAllocationPass::TryAllocateRegister(
Value* value, const RegAssignment& preferred_reg) {
// If the preferred register matches type and is available, use it.
auto usage_set = RegisterSetForValue(value);
if (usage_set->set == preferred_reg.set) {
// Check if available.
if (!IsRegInUse(preferred_reg)) {
// Mark as in-use and return. Best case.
MarkRegUsed(preferred_reg, value, value->use_head);
value->reg = preferred_reg;
return true;
}
}
Value* new_value;
uint32_t end_ordinal;
// Otherwise, fallback to allocating like normal.
return TryAllocateRegister(value);
}
bool RegisterAllocationPass::TryAllocateRegister(Value* value) {
// Get the set this register is in.
RegisterSetUsage* usage_set = RegisterSetForValue(value);
// Find the first free register, if any.
// We have to ensure it's a valid one (in our count).
unsigned long first_unused = 0;
bool all_used = _BitScanForward(&first_unused, usage_set->availability.to_ulong()) == 0;
if (!all_used && first_unused < usage_set->count) {
    // Available! Use it.
value->reg.set = usage_set->set;
value->reg.index = first_unused;
MarkRegUsed(value->reg, value, value->use_head);
return true;
}
// None available! Spill required.
return false;
}
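
For reference, a minimal portable sketch of the same first-free-register search (the commit itself uses the MSVC _BitScanForward intrinsic above). It assumes the same convention as the availability bitset, where a set bit means the register is free; the helper name is made up for illustration:

#include <bitset>
#include <cstdint>

// Hypothetical helper: index of the first free register in [0, count),
// or -1 if every register in the set is currently occupied.
static int FindFirstFreeRegister(const std::bitset<32>& availability,
                                 uint32_t count) {
  for (uint32_t i = 0; i < count; ++i) {
    if (availability.test(i)) {
      return static_cast<int>(i);  // bit set == register available
    }
  }
  return -1;  // nothing free; the caller has to spill
}
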
bool RegisterAllocationPass::SpillOneRegister(
HIRBuilder* builder, TypeName required_type) {
// Get the set that we will be picking from.
RegisterSetUsage* usage_set;
if (required_type <= INT64_TYPE) {
usage_set = usage_sets_.int_set;
} else if (required_type <= FLOAT64_TYPE) {
usage_set = usage_sets_.float_set;
} else {
usage_set = usage_sets_.vec_set;
}
DumpUsage("SpillOneRegister (pre)");
// Pick the one with the furthest next use.
XEASSERT(!usage_set->upcoming_uses.empty());
auto furthest_usage = std::max_element(
usage_set->upcoming_uses.begin(), usage_set->upcoming_uses.end(),
RegisterUsage::Comparer());
Value* spill_value = furthest_usage->value;
Value::Use* prev_use = furthest_usage->use->prev;
Value::Use* next_use = furthest_usage->use;
XEASSERTNOTNULL(next_use);
usage_set->upcoming_uses.erase(furthest_usage);
DumpUsage("SpillOneRegister (post)");
const auto reg = spill_value->reg;
// We know the spill_value use list is sorted, so we can cut it right now.
// This makes it easier down below.
auto new_head_use = next_use;
// Allocate local.
if (spill_value->local_slot) {
// Value is already assigned a slot, so load from that.
// We can then split the interval right after the previous use to
// before the next use.
// Update the last use of the spilled interval/value.
end_ordinal = spill_interval->end_ordinal;
spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal;
XEASSERT(end_ordinal != -1);
XEASSERT(spill_interval->end_ordinal != -1);
// Insert a load right before the next use.
new_value = builder->LoadLocal(spill_value->local_slot);
builder->last_instr()->MoveBefore(next_use);
// Update last use info.
new_value->last_use = spill_value->last_use;
spill_value->last_use = prev_use;
// Value is already assigned a slot. Since we allocate in order and this is
// all SSA we know the stored value will be exactly what we want. Yay,
// we can prevent the redundant store!
// In fact, we may even want to pin this spilled value so that we always
// use the spilled value and prevent the need for more locals.
} else {
// Allocate a local slot.
spill_value->local_slot = builder->AllocLocal(spill_value->type);
// Insert a spill right after the def.
// Add store.
builder->StoreLocal(spill_value->local_slot, spill_value);
auto spill_store = builder->last_instr();
spill_store->MoveBefore(prev_def_tail->next);
auto spill_store_use = spill_store->src2_use;
XEASSERTNULL(spill_store_use->prev);
if (prev_use && prev_use->instr->opcode->flags & OPCODE_FLAG_PAIRED_PREV) {
// Instruction is paired. This is bad. We will insert the spill after the
// paired instruction.
XEASSERTNOTNULL(prev_use->instr->next);
spill_store->MoveBefore(prev_use->instr->next);
// Update last use of spilled interval/value.
end_ordinal = spill_interval->end_ordinal;
spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal;
XEASSERT(end_ordinal != -1);
XEASSERT(spill_interval->end_ordinal != -1);
// Update last use.
spill_value->last_use = spill_store;
} else if (prev_use) {
// We insert the store immediately before the previous use.
// If we were smarter we could then re-run allocation and reuse the register
// once dropped.
spill_store->MoveBefore(prev_use->instr);
// Insert a load right before the next use.
new_value = builder->LoadLocal(spill_value->local_slot);
builder->last_instr()->MoveBefore(next_use);
// Update last use.
spill_value->last_use = prev_use->instr;
} else {
// This is the first use, so the only thing we have is the define.
// Move the store to right after that.
spill_store->MoveBefore(spill_value->def->next);
// Update last use info.
new_value->last_use = spill_value->last_use;
spill_value->last_use = spill_store;
// Update last use.
spill_value->last_use = spill_store;
}
}
// Reuse the same local slot. Hooray SSA.
#if ASSERT_NO_CYCLES
builder->AssertNoCycles();
spill_value->def->block->AssertNoCycles();
#endif // ASSERT_NO_CYCLES
// Add load.
// Inserted immediately before the next use. Since by definition the next
// use is after the instruction requesting the spill we know we haven't
// done allocation for that code yet and can let that be handled
// automatically when we get to it.
auto new_value = builder->LoadLocal(spill_value->local_slot);
auto spill_load = builder->last_instr();
spill_load->MoveBefore(next_use->instr);
// Note: implicit first use added.
#if ASSERT_NO_CYCLES
builder->AssertNoCycles();
spill_value->def->block->AssertNoCycles();
#endif // ASSERT_NO_CYCLES
// Set the local slot of the new value to our existing one. This way we will
// reuse that same memory if needed.
new_value->local_slot = spill_value->local_slot;
// Rename all future uses to that loaded value.
auto use = spill_value->use_head;
while (use) {
// TODO(benvanik): keep use list sorted so we don't have to do this.
if (use->instr->ordinal <= spill_interval->end_ordinal ||
use->instr->ordinal == -1) {
use = use->next;
continue;
}
auto next = use->next;
auto instr = use->instr;
// Rename all future uses of the SSA value to the new value as loaded
// from the local.
// We can quickly do this by walking the use list. Because the list is
// already sorted we know we are going to end up with a sorted list.
auto walk_use = new_head_use;
auto new_use_tail = walk_use;
while (walk_use) {
auto next_walk_use = walk_use->next;
auto instr = walk_use->instr;
uint32_t signature = instr->opcode->signature;
if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
if (instr->src1.value == spill_value) {
@ -436,36 +431,107 @@ void RegisterAllocationPass::AllocateBlockedReg(
instr->set_src3(new_value);
}
}
use = next;
walk_use = next_walk_use;
if (walk_use) {
new_use_tail = walk_use;
}
}
new_value->last_use = new_use_tail->instr;
// Create new interval.
auto arena = scratch_arena();
auto new_interval = arena->Alloc<Interval>();
new_interval->start_ordinal = new_value->def->ordinal;
new_interval->end_ordinal = end_ordinal;
new_interval->value = new_value;
new_interval->next = NULL;
new_interval->prev = NULL;
if (new_value->type <= INT64_TYPE) {
new_interval->free_until_set = free_until_sets_.int_set;
} else if (new_value->type <= FLOAT64_TYPE) {
new_interval->free_until_set = free_until_sets_.float_set;
} else {
new_interval->free_until_set = free_until_sets_.vec_set;
}
// Update tracking.
MarkRegAvailable(reg);
// Remove the old interval from the active list, as it's been spilled.
spill_interval->RemoveFromList(&intervals.active);
spill_interval->AddToList(&intervals.handled);
// Insert interval into the right place in the list.
// We know it's ahead of us.
new_interval->InsertIntoList(&intervals.unhandled);
// TODO(benvanik): use the register we just freed?
//current->value->reg.set = free_until_set->set;
//current->value->reg.index = spill_interval->value->reg.index;
bool allocated = TryAllocateFreeReg(current, intervals);
XEASSERTTRUE(allocated);
return true;
}
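
A schematic recap of the spill placement above, with made-up value and slot names (this is only an illustration of where the store and load land relative to the uses, not code from the commit):

// Before spilling v1:              After spilling v1 into slot s:
//   v1 = <def>                       v1 = <def>
//   ... earlier uses of v1 ...       ... earlier uses of v1 ...
//                                    store_local s, v1   ; near the last pre-spill
//                                                        ; use (or right after def)
//   <allocator needs a register>     <allocator needs a register>
//   ... later use of v1 ...          v2 = load_local s   ; immediately before the
//                                                        ; next use
//                                    ... later uses renamed from v1 to v2 ...
// v2 inherits the same local_slot, so a second spill reuses slot s.
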
RegisterAllocationPass::RegisterSetUsage*
RegisterAllocationPass::RegisterSetForValue(
const Value* value) {
if (value->type <= INT64_TYPE) {
return usage_sets_.int_set;
} else if (value->type <= FLOAT64_TYPE) {
return usage_sets_.float_set;
} else {
return usage_sets_.vec_set;
}
}
namespace {
int CompareValueUse(const Value::Use* a, const Value::Use* b) {
return a->instr->ordinal - b->instr->ordinal;
}
} // namespace
void RegisterAllocationPass::SortUsageList(Value* value) {
// Modified in-place linked list sort from:
// http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.c
if (!value->use_head) {
return;
}
Value::Use* head = value->use_head;
Value::Use* tail = nullptr;
int insize = 1;
while (true) {
auto p = head;
head = nullptr;
tail = nullptr;
// count number of merges we do in this pass
int nmerges = 0;
while (p) {
// there exists a merge to be done
nmerges++;
// step 'insize' places along from p
auto q = p;
int psize = 0;
for (int i = 0; i < insize; i++) {
psize++;
q = q->next;
if (!q) break;
}
// if q hasn't fallen off end, we have two lists to merge
int qsize = insize;
// now we have two lists; merge them
while (psize > 0 || (qsize > 0 && q)) {
// decide whether next element of merge comes from p or q
Value::Use* e = nullptr;
if (psize == 0) {
// p is empty; e must come from q
e = q; q = q->next; qsize--;
} else if (qsize == 0 || !q) {
// q is empty; e must come from p
e = p; p = p->next; psize--;
} else if (CompareValueUse(p, q) <= 0) {
// First element of p is lower (or same); e must come from p
e = p; p = p->next; psize--;
} else {
// First element of q is lower; e must come from q
e = q; q = q->next; qsize--;
}
// add the next element to the merged list
if (tail) {
tail->next = e;
} else {
head = e;
}
// Maintain reverse pointers in a doubly linked list.
e->prev = tail;
tail = e;
}
// now p has stepped 'insize' places along, and q has too
p = q;
}
if (tail) {
tail->next = nullptr;
}
// If we have done only one merge, we're finished
if (nmerges <= 1) {
// allow for nmerges==0, the empty list case
break;
}
// Otherwise repeat, merging lists twice the size
insize *= 2;
}
value->use_head = head;
value->last_use = tail->instr;
}
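
SortUsageList above is the bottom-up merge sort from the linked listsort.c page, adapted to the doubly linked use list. For readers who want to see the strategy in isolation, here is a self-contained sketch on a plain singly linked node type (hypothetical, not part of the commit):

struct SortNode {
  int ordinal;
  SortNode* next;
};

// Bottom-up merge sort: merge runs of length 1, 2, 4, ... until a pass
// performs at most one merge, at which point the list is fully sorted.
static SortNode* SortByOrdinal(SortNode* head) {
  if (!head) {
    return nullptr;
  }
  int insize = 1;
  while (true) {
    SortNode* p = head;
    SortNode* tail = nullptr;
    head = nullptr;
    int nmerges = 0;
    while (p) {
      nmerges++;
      // Step at most 'insize' nodes along from p to find the start of q.
      SortNode* q = p;
      int psize = 0;
      for (int i = 0; i < insize && q; ++i) {
        psize++;
        q = q->next;
      }
      int qsize = insize;
      // Merge the p-run and the q-run in ordinal order.
      while (psize > 0 || (qsize > 0 && q)) {
        SortNode* e;
        if (psize == 0) {
          e = q; q = q->next; qsize--;
        } else if (qsize == 0 || !q) {
          e = p; p = p->next; psize--;
        } else if (p->ordinal <= q->ordinal) {
          e = p; p = p->next; psize--;
        } else {
          e = q; q = q->next; qsize--;
        }
        if (tail) {
          tail->next = e;
        } else {
          head = e;
        }
        tail = e;
      }
      p = q;  // both runs consumed; continue after them
    }
    if (tail) {
      tail->next = nullptr;
    }
    if (nmerges <= 1) {
      return head;  // zero or one merge this pass: done
    }
    insize *= 2;
  }
}
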

View File

@ -10,6 +10,10 @@
#ifndef ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_
#define ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_
#include <algorithm>
#include <bitset>
#include <vector>
#include <alloy/backend/machine_info.h>
#include <alloy/compiler/compiler_pass.h>
@ -27,28 +31,53 @@ public:
virtual int Run(hir::HIRBuilder* builder);
private:
struct Interval;
struct Intervals;
void ComputeLastUse(hir::Value* value);
bool TryAllocateFreeReg(Interval* current, Intervals& intervals);
void AllocateBlockedReg(hir::HIRBuilder* builder,
Interval* current, Intervals& intervals);
// TODO(benvanik): rewrite all this set shit -- too much indirection, the
// complexity is not needed.
struct RegisterUsage {
hir::Value* value;
hir::Value::Use* use;
RegisterUsage() : value(nullptr), use(nullptr) {}
RegisterUsage(hir::Value* value_, hir::Value::Use* use_)
: value(value_), use(use_) {}
struct Comparer : std::binary_function<RegisterUsage, RegisterUsage, bool> {
bool operator()(const RegisterUsage& a, const RegisterUsage& b) const {
return a.use->instr->ordinal < b.use->instr->ordinal;
}
};
};
struct RegisterSetUsage {
const backend::MachineInfo::RegisterSet* set = nullptr;
uint32_t count = 0;
std::bitset<32> availability = 0;
// TODO(benvanik): another data type.
std::vector<RegisterUsage> upcoming_uses;
};
void DumpUsage(const char* name);
void PrepareBlockState();
void AdvanceUses(hir::Instr* instr);
bool IsRegInUse(const hir::RegAssignment& reg);
RegisterSetUsage* MarkRegUsed(const hir::RegAssignment& reg,
hir::Value* value, hir::Value::Use* use);
RegisterSetUsage* MarkRegAvailable(const hir::RegAssignment& reg);
bool TryAllocateRegister(hir::Value* value,
const hir::RegAssignment& preferred_reg);
bool TryAllocateRegister(hir::Value* value);
bool SpillOneRegister(hir::HIRBuilder* builder, hir::TypeName required_type);
RegisterSetUsage* RegisterSetForValue(const hir::Value* value);
void SortUsageList(hir::Value* value);
private:
const backend::MachineInfo* machine_info_;
struct RegisterFreeUntilSet {
uint32_t count;
uint32_t pos[32];
const backend::MachineInfo::RegisterSet* set;
};
struct RegisterFreeUntilSets {
RegisterFreeUntilSet* int_set;
RegisterFreeUntilSet* float_set;
RegisterFreeUntilSet* vec_set;
RegisterFreeUntilSet* all_sets[3];
};
RegisterFreeUntilSets free_until_sets_;
struct {
RegisterSetUsage* int_set = nullptr;
RegisterSetUsage* float_set = nullptr;
RegisterSetUsage* vec_set = nullptr;
RegisterSetUsage* all_sets[3];
} usage_sets_;
};

View File

@ -88,12 +88,12 @@ int ValidationPass::ValidateInstruction(Block* block, Instr* instr) {
}
int ValidationPass::ValidateValue(Block* block, Instr* instr, Value* value) {
if (value->def) {
/*auto def = value->def;
XEASSERT(def->block == block);
if (def->block != block) {
return 1;
}*/
}
//if (value->def) {
// auto def = value->def;
// XEASSERT(def->block == block);
// if (def->block != block) {
// return 1;
// }
//}
return 0;
}

View File

@ -44,6 +44,10 @@ typedef struct XECACHEALIGN vec128_s {
uint64_t high;
};
};
bool operator== (const vec128_s& b) const {
return low == b.low && high == b.high;
}
} vec128_t;
XEFORCEINLINE vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) {
vec128_t v;

View File

@ -643,20 +643,20 @@ XEEMITTER(cmpli, 0x28000000, D )(PPCHIRBuilder& f, InstrData& i) {
XEEMITTER(andx, 0x7C000038, X )(PPCHIRBuilder& f, InstrData& i) {
// RA <- (RS) & (RB)
Value* ra = f.And(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB));
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
XEEMITTER(andcx, 0x7C000078, X )(PPCHIRBuilder& f, InstrData& i) {
// RA <- (RS) & ¬(RB)
Value* ra = f.And(f.LoadGPR(i.X.RT), f.Not(f.LoadGPR(i.X.RB)));
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -665,8 +665,8 @@ XEEMITTER(andix, 0x70000000, D )(PPCHIRBuilder& f, InstrData& i) {
Value* ra = f.And(
f.LoadGPR(i.D.RT),
f.LoadConstant((uint64_t)i.D.DS));
f.UpdateCR(0, ra);
f.StoreGPR(i.D.RA, ra);
f.UpdateCR(0, ra);
return 0;
}
@ -675,8 +675,8 @@ XEEMITTER(andisx, 0x74000000, D )(PPCHIRBuilder& f, InstrData& i) {
Value* ra = f.And(
f.LoadGPR(i.D.RT),
f.LoadConstant((uint64_t(i.D.DS) << 16)));
f.UpdateCR(0, ra);
f.StoreGPR(i.D.RA, ra);
f.UpdateCR(0, ra);
return 0;
}
@ -688,10 +688,10 @@ XEEMITTER(cntlzdx, 0x7C000074, X )(PPCHIRBuilder& f, InstrData& i) {
// RA <- n
Value* v = f.CountLeadingZeros(f.LoadGPR(i.X.RT));
v = f.ZeroExtend(v, INT64_TYPE);
f.StoreGPR(i.X.RA, v);
if (i.X.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.X.RA, v);
return 0;
}
@ -704,10 +704,10 @@ XEEMITTER(cntlzwx, 0x7C000034, X )(PPCHIRBuilder& f, InstrData& i) {
Value* v = f.CountLeadingZeros(
f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
v = f.ZeroExtend(v, INT64_TYPE);
f.StoreGPR(i.X.RA, v);
if (i.X.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.X.RA, v);
return 0;
}
@ -715,10 +715,10 @@ XEEMITTER(eqvx, 0x7C000238, X )(PPCHIRBuilder& f, InstrData& i) {
// RA <- (RS) == (RB)
Value* ra = f.Xor(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB));
ra = f.Not(ra);
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -728,10 +728,10 @@ XEEMITTER(extsbx, 0x7C000774, X )(PPCHIRBuilder& f, InstrData& i) {
// RA[0:55] <- i56.s
Value* rt = f.LoadGPR(i.X.RT);
rt = f.SignExtend(f.Truncate(rt, INT8_TYPE), INT64_TYPE);
f.StoreGPR(i.X.RA, rt);
if (i.X.Rc) {
f.UpdateCR(0, rt);
}
f.StoreGPR(i.X.RA, rt);
return 0;
}
@ -741,10 +741,10 @@ XEEMITTER(extshx, 0x7C000734, X )(PPCHIRBuilder& f, InstrData& i) {
  // RA[0:47] <- i48.s

Value* rt = f.LoadGPR(i.X.RT);
rt = f.SignExtend(f.Truncate(rt, INT16_TYPE), INT64_TYPE);
f.StoreGPR(i.X.RA, rt);
if (i.X.Rc) {
f.UpdateCR(0, rt);
}
f.StoreGPR(i.X.RA, rt);
return 0;
}
@ -754,10 +754,10 @@ XEEMITTER(extswx, 0x7C0007B4, X )(PPCHIRBuilder& f, InstrData& i) {
// RA[0:31] <- i32.s
Value* rt = f.LoadGPR(i.X.RT);
rt = f.SignExtend(f.Truncate(rt, INT32_TYPE), INT64_TYPE);
f.StoreGPR(i.X.RA, rt);
if (i.X.Rc) {
f.UpdateCR(0, rt);
}
f.StoreGPR(i.X.RA, rt);
return 0;
}
@ -767,10 +767,10 @@ XEEMITTER(nandx, 0x7C0003B8, X )(PPCHIRBuilder& f, InstrData& i) {
f.LoadGPR(i.X.RT),
f.LoadGPR(i.X.RB));
ra = f.Not(ra);
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -780,10 +780,10 @@ XEEMITTER(norx, 0x7C0000F8, X )(PPCHIRBuilder& f, InstrData& i) {
f.LoadGPR(i.X.RT),
f.LoadGPR(i.X.RB));
ra = f.Not(ra);
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -803,10 +803,10 @@ XEEMITTER(orx, 0x7C000378, X )(PPCHIRBuilder& f, InstrData& i) {
f.LoadGPR(i.X.RT),
f.LoadGPR(i.X.RB));
}
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -815,10 +815,10 @@ XEEMITTER(orcx, 0x7C000338, X )(PPCHIRBuilder& f, InstrData& i) {
Value* ra = f.Or(
f.LoadGPR(i.X.RT),
f.Not(f.LoadGPR(i.X.RB)));
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -849,10 +849,10 @@ XEEMITTER(xorx, 0x7C000278, X )(PPCHIRBuilder& f, InstrData& i) {
Value* ra = f.Xor(
f.LoadGPR(i.X.RT),
f.LoadGPR(i.X.RB));
f.StoreGPR(i.X.RA, ra);
if (i.X.Rc) {
f.UpdateCR(0, ra);
}
f.StoreGPR(i.X.RA, ra);
return 0;
}
@ -895,10 +895,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) {
if (m != 0xFFFFFFFFFFFFFFFF) {
v = f.And(v, f.LoadConstant(m));
}
f.StoreGPR(i.MD.RA, v);
if (i.MD.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.MD.RA, v);
return 0;
} else if (i.MD.idx == 1) {
// XEEMITTER(rldicrx, 0x78000004, MD )
@ -922,10 +922,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) {
v = f.And(v, f.LoadConstant(m));
}
}
f.StoreGPR(i.MD.RA, v);
if (i.MD.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.MD.RA, v);
return 0;
} else if (i.MD.idx == 2) {
// XEEMITTER(rldicx, 0x78000008, MD )
@ -959,10 +959,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) {
f.And(v, f.LoadConstant(m)),
f.And(ra, f.LoadConstant(~m)));
}
f.StoreGPR(i.MD.RA, v);
if (i.MD.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.MD.RA, v);
return 0;
} else {
XEINSTRNOTIMPLEMENTED();
@ -987,10 +987,10 @@ XEEMITTER(rlwimix, 0x50000000, M )(PPCHIRBuilder& f, InstrData& i) {
}
v = f.ZeroExtend(v, INT64_TYPE);
v = f.Or(v, f.And(f.LoadGPR(i.M.RA), f.LoadConstant((~(uint64_t)m))));
f.StoreGPR(i.M.RA, v);
if (i.M.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.M.RA, v);
return 0;
}
@ -1014,10 +1014,10 @@ XEEMITTER(rlwinmx, 0x54000000, M )(PPCHIRBuilder& f, InstrData& i) {
v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32)));
}
v = f.ZeroExtend(v, INT64_TYPE);
f.StoreGPR(i.M.RA, v);
if (i.M.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.M.RA, v);
return 0;
}
@ -1036,10 +1036,10 @@ XEEMITTER(rlwnmx, 0x5C000000, M )(PPCHIRBuilder& f, InstrData& i) {
v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32)));
}
v = f.ZeroExtend(v, INT64_TYPE);
f.StoreGPR(i.M.RA, v);
if (i.M.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.M.RA, v);
return 0;
}
@ -1146,7 +1146,7 @@ XEEMITTER(sradx, 0x7C000634, X )(PPCHIRBuilder& f, InstrData& i) {
// CA is set to 1 if the low-order 32 bits of (RS) contain a negative number
// and any 1-bits are shifted out of position 63; otherwise CA is set to 0.
// We already have ca set to indicate the pos 63 bit, now just and in sign.
ca = f.And(ca, f.Shr(v, 63));
ca = f.And(ca, f.Truncate(f.Shr(v, 63), INT8_TYPE));
f.StoreCA(ca);
f.StoreGPR(i.X.RA, v);
@ -1174,15 +1174,15 @@ XEEMITTER(sradix, 0x7C000674, XS )(PPCHIRBuilder& f, InstrData& i) {
XEASSERT(sh);
uint64_t mask = XEMASK(64 - sh, 63);
Value* ca = f.And(
f.Shr(v, 63),
f.Truncate(f.Shr(v, 63), INT8_TYPE),
f.IsTrue(f.And(v, f.LoadConstant(mask))));
f.StoreCA(ca);
v = f.Sha(v, sh);
f.StoreGPR(i.XS.RA, v);
if (i.XS.Rc) {
f.UpdateCR(0, v);
}
f.StoreGPR(i.XS.RA, v);
return 0;
}
@ -1203,7 +1203,7 @@ XEEMITTER(srawx, 0x7C000630, X )(PPCHIRBuilder& f, InstrData& i) {
// is negative.
Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh));
Value* ca = f.And(
f.Shr(v, 31),
f.Truncate(f.Shr(v, 31), INT8_TYPE),
f.IsTrue(f.And(v, mask)));
f.StoreCA(ca);
v = f.Sha(v, sh),
@ -1235,8 +1235,8 @@ XEEMITTER(srawix, 0x7C000670, X )(PPCHIRBuilder& f, InstrData& i) {
// is negative.
uint32_t mask = (uint32_t)XEMASK(64 - i.X.RB, 63);
ca = f.And(
f.Shr(v, 31),
f.ZeroExtend(f.IsTrue(f.And(v, f.LoadConstant(mask))), INT32_TYPE));
f.Truncate(f.Shr(v, 31), INT8_TYPE),
f.IsTrue(f.And(v, f.LoadConstant(mask))));
v = f.Sha(v, (int8_t)i.X.RB),
v = f.SignExtend(v, INT64_TYPE);

View File

@ -240,18 +240,18 @@ void PPCHIRBuilder::UpdateCR(
void PPCHIRBuilder::UpdateCR(
uint32_t n, Value* lhs, Value* rhs, bool is_signed) {
Value* lt;
Value* gt;
if (is_signed) {
lt = CompareSLT(lhs, rhs);
gt = CompareSGT(lhs, rhs);
Value* lt = CompareSLT(lhs, rhs);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt);
Value* gt = CompareSGT(lhs, rhs);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt);
} else {
lt = CompareULT(lhs, rhs);
gt = CompareUGT(lhs, rhs);
Value* lt = CompareULT(lhs, rhs);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt);
Value* gt = CompareUGT(lhs, rhs);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt);
}
Value* eq = CompareEQ(lhs, rhs);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt);
StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 2, eq);
// Value* so = AllocValue(UINT8_TYPE);
@ -280,7 +280,7 @@ Value* PPCHIRBuilder::LoadCA() {
}
void PPCHIRBuilder::StoreCA(Value* value) {
value = Truncate(value, INT8_TYPE);
XEASSERT(value->type == INT8_TYPE);
StoreContext(offsetof(PPCContext, xer_ca), value);
}

39
src/alloy/hir/block.cc Normal file
View File

@ -0,0 +1,39 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <alloy/hir/block.h>
#include <alloy/hir/instr.h>
using namespace alloy;
using namespace alloy::hir;
void Block::AssertNoCycles() {
Instr* hare = instr_head;
Instr* tortoise = instr_head;
if (!hare) {
return;
}
while (hare = hare->next) {
if (hare == tortoise) {
// Cycle!
XEASSERTALWAYS();
}
hare = hare->next;
if (hare == tortoise) {
// Cycle!
XEASSERTALWAYS();
}
tortoise = tortoise->next;
if (!hare || !tortoise) {
return;
}
}
}
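
Block::AssertNoCycles above is Floyd's tortoise-and-hare check written with the hare advancing twice per loop body. A small generic version of the same idea (hypothetical node type with a next pointer, not from the commit) that returns a flag instead of asserting:

template <typename Node>
static bool HasCycle(const Node* head) {
  const Node* tortoise = head;
  const Node* hare = head;
  while (hare && hare->next) {
    hare = hare->next->next;    // hare: two steps per iteration
    tortoise = tortoise->next;  // tortoise: one step per iteration
    if (hare == tortoise) {
      return true;  // the pointers met, so the list loops back on itself
    }
  }
  return false;  // hare fell off the end: the list is acyclic
}
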

View File

@ -61,6 +61,8 @@ public:
Instr* instr_tail;
uint16_t ordinal;
void AssertNoCycles();
};

View File

@ -92,7 +92,7 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
case INT8_TYPE: str->Append("%X", value->constant.i8); break;
case INT16_TYPE: str->Append("%X", value->constant.i16); break;
case INT32_TYPE: str->Append("%X", value->constant.i32); break;
case INT64_TYPE: str->Append("%X", value->constant.i64); break;
case INT64_TYPE: str->Append("%llX", value->constant.i64); break;
case FLOAT32_TYPE: str->Append("%F", value->constant.f32); break;
case FLOAT64_TYPE: str->Append("%F", value->constant.f64); break;
case VEC128_TYPE: str->Append("(%F,%F,%F,%F)",
@ -252,6 +252,29 @@ void HIRBuilder::Dump(StringBuffer* str) {
}
}
void HIRBuilder::AssertNoCycles() {
Block* hare = block_head_;
Block* tortoise = block_head_;
if (!hare) {
return;
}
while (hare = hare->next) {
if (hare == tortoise) {
// Cycle!
XEASSERTALWAYS();
}
hare = hare->next;
if (hare == tortoise) {
// Cycle!
XEASSERTALWAYS();
}
tortoise = tortoise->next;
if (!hare || !tortoise) {
return;
}
}
}
Block* HIRBuilder::current_block() const {
return current_block_;
}
@ -1729,16 +1752,19 @@ Value* HIRBuilder::Extract(Value* value, Value* index,
TypeName target_type) {
// TODO(benvanik): could do some of this as constants.
Value* trunc_index = index->type != INT8_TYPE ?
Truncate(index, INT8_TYPE) : index;
Instr* i = AppendInstr(
OPCODE_EXTRACT_info, 0,
AllocValue(target_type));
i->set_src1(value);
i->set_src2(ZeroExtend(index, INT64_TYPE));
i->set_src2(trunc_index);
i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::Extract(Value* value, uint64_t index,
Value* HIRBuilder::Extract(Value* value, uint8_t index,
TypeName target_type) {
return Extract(value, LoadConstant(index), target_type);
}

View File

@ -35,6 +35,7 @@ public:
virtual int Finalize();
void Dump(StringBuffer* str);
void AssertNoCycles();
Arena* arena() const { return arena_; }
@ -196,7 +197,7 @@ public:
Value* Insert(Value* value, Value* index, Value* part);
Value* Insert(Value* value, uint64_t index, Value* part);
Value* Extract(Value* value, Value* index, TypeName target_type);
Value* Extract(Value* value, uint64_t index, TypeName target_type);
Value* Extract(Value* value, uint8_t index, TypeName target_type);
// i8->i16/i32/... (i8|i8 / i8|i8|i8|i8 / ...)
// i8/i16/i32 -> vec128
Value* Splat(Value* value, TypeName target_type);

View File

@ -48,19 +48,6 @@ void Instr::set_src3(Value* value) {
src3_use = value ? value->AddUse(block->arena, this) : NULL;
}
bool Instr::Match(SignatureType dest_req,
SignatureType src1_req,
SignatureType src2_req,
SignatureType src3_req) const {
#define TO_SIG_TYPE(v) \
(v ? (v->IsConstant() ? SignatureType((v->type + 1) | SIG_TYPE_C) : SignatureType(v->type + 1)) : SIG_TYPE_X)
return
((dest_req == SIG_TYPE_IGNORE) || (dest_req == TO_SIG_TYPE(dest))) &&
((src1_req == SIG_TYPE_IGNORE) || (src1_req == TO_SIG_TYPE(src1.value))) &&
((src2_req == SIG_TYPE_IGNORE) || (src2_req == TO_SIG_TYPE(src2.value))) &&
((src3_req == SIG_TYPE_IGNORE) || (src3_req == TO_SIG_TYPE(src3.value)));
}
void Instr::MoveBefore(Instr* other) {
if (next == other) {
return;

View File

@ -24,26 +24,6 @@ namespace hir {
class Block;
class Label;
enum SignatureType {
SIG_TYPE_X = 0,
SIG_TYPE_I8 = 1,
SIG_TYPE_I16 = 2,
SIG_TYPE_I32 = 3,
SIG_TYPE_I64 = 4,
SIG_TYPE_F32 = 5,
SIG_TYPE_F64 = 6,
SIG_TYPE_V128 = 7,
SIG_TYPE_C = (1 << 3),
SIG_TYPE_I8C = SIG_TYPE_C | SIG_TYPE_I8,
SIG_TYPE_I16C = SIG_TYPE_C | SIG_TYPE_I16,
SIG_TYPE_I32C = SIG_TYPE_C | SIG_TYPE_I32,
SIG_TYPE_I64C = SIG_TYPE_C | SIG_TYPE_I64,
SIG_TYPE_F32C = SIG_TYPE_C | SIG_TYPE_F32,
SIG_TYPE_F64C = SIG_TYPE_C | SIG_TYPE_F64,
SIG_TYPE_V128C = SIG_TYPE_C | SIG_TYPE_V128,
SIG_TYPE_IGNORE = 0xFF,
};
class Instr {
public:
Block* block;
@ -74,11 +54,6 @@ public:
void set_src2(Value* value);
void set_src3(Value* value);
bool Match(SignatureType dest = SIG_TYPE_X,
SignatureType src1 = SIG_TYPE_X,
SignatureType src2 = SIG_TYPE_X,
SignatureType src3 = SIG_TYPE_X) const;
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* opcode, uint16_t flags);
void Remove();

View File

@ -11,590 +11,590 @@
DEFINE_OPCODE(
OPCODE_COMMENT,
"comment",
OPCODE_SIG_X,
OPCODE_FLAG_IGNORE);
OPCODE_SIG_X_O,
OPCODE_FLAG_IGNORE)
DEFINE_OPCODE(
OPCODE_NOP,
"nop",
OPCODE_SIG_X,
OPCODE_FLAG_IGNORE);
OPCODE_FLAG_IGNORE)
DEFINE_OPCODE(
OPCODE_SOURCE_OFFSET,
"source_offset",
OPCODE_SIG_X_O,
OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE);
OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE)
DEFINE_OPCODE(
OPCODE_DEBUG_BREAK,
"debug_break",
OPCODE_SIG_X,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_DEBUG_BREAK_TRUE,
"debug_break_true",
OPCODE_SIG_X_V,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_TRAP,
"trap",
OPCODE_SIG_X,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_TRAP_TRUE,
"trap_true",
OPCODE_SIG_X_V,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_CALL,
"call",
OPCODE_SIG_X_S,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_CALL_TRUE,
"call_true",
OPCODE_SIG_X_V_S,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_CALL_INDIRECT,
"call_indirect",
OPCODE_SIG_X_V,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_CALL_INDIRECT_TRUE,
"call_indirect_true",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_CALL_EXTERN,
"call_extern",
OPCODE_SIG_X_S,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_RETURN,
"return",
OPCODE_SIG_X,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_RETURN_TRUE,
"return_true",
OPCODE_SIG_X_V,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_SET_RETURN_ADDRESS,
"set_return_address",
OPCODE_SIG_X_V,
0);
0)
DEFINE_OPCODE(
OPCODE_BRANCH,
"branch",
OPCODE_SIG_X_L,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_BRANCH_TRUE,
"branch_true",
OPCODE_SIG_X_V_L,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_BRANCH_FALSE,
"branch_false",
OPCODE_SIG_X_V_L,
OPCODE_FLAG_BRANCH);
OPCODE_FLAG_BRANCH)
DEFINE_OPCODE(
OPCODE_ASSIGN,
"assign",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_CAST,
"cast",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ZERO_EXTEND,
"zero_extend",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SIGN_EXTEND,
"sign_extend",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_TRUNCATE,
"truncate",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_CONVERT,
"convert",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ROUND,
"round",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_CONVERT_I2F,
"vector_convert_i2f",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_CONVERT_F2I,
"vector_convert_f2i",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD_VECTOR_SHL,
"load_vector_shl",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD_VECTOR_SHR,
"load_vector_shr",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD_CLOCK,
"load_clock",
OPCODE_SIG_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD_LOCAL,
"load_local",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_STORE_LOCAL,
"store_local",
OPCODE_SIG_X_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD_CONTEXT,
"load_context",
OPCODE_SIG_V_O,
0);
0)
DEFINE_OPCODE(
OPCODE_STORE_CONTEXT,
"store_context",
OPCODE_SIG_X_O_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOAD,
"load",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY);
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_STORE,
"store",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY);
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_PREFETCH,
"prefetch",
OPCODE_SIG_X_V_O,
0);
0)
DEFINE_OPCODE(
OPCODE_MAX,
"max",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_MIN,
"min",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SELECT,
"select",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_IS_TRUE,
"is_true",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_IS_FALSE,
"is_false",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_EQ,
"compare_eq",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_COMPARE_NE,
"compare_ne",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_COMPARE_SLT,
"compare_slt",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_SLE,
"compare_sle",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_SGT,
"compare_sgt",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_SGE,
"compare_sge",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_ULT,
"compare_ult",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_ULE,
"compare_ule",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_UGT,
"compare_ugt",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_UGE,
"compare_uge",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_DID_CARRY,
"did_carry",
OPCODE_SIG_V_V,
OPCODE_FLAG_PAIRED_PREV);
OPCODE_FLAG_PAIRED_PREV)
DEFINE_OPCODE(
OPCODE_DID_OVERFLOW,
"did_overflow",
OPCODE_SIG_V_V,
OPCODE_FLAG_PAIRED_PREV);
OPCODE_FLAG_PAIRED_PREV)
DEFINE_OPCODE(
OPCODE_DID_SATURATE,
"did_saturate",
OPCODE_SIG_V_V,
OPCODE_FLAG_PAIRED_PREV);
OPCODE_FLAG_PAIRED_PREV)
DEFINE_OPCODE(
OPCODE_VECTOR_COMPARE_EQ,
"vector_compare_eq",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_VECTOR_COMPARE_SGT,
"vector_compare_sgt",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_COMPARE_SGE,
"vector_compare_sge",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_COMPARE_UGT,
"vector_compare_ugt",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_COMPARE_UGE,
"vector_compare_uge",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ADD,
"add",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_ADD_CARRY,
"add_carry",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_ADD,
"vector_add",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_SUB,
"sub",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_MUL,
"mul",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_MUL_HI,
"mul_hi",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_DIV,
"div",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_MUL_ADD,
"mul_add",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_MUL_SUB,
"mul_sub",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ABS,
"abs",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SQRT,
"sqrt",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_RSQRT,
"rsqrt",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_POW2,
"pow2",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_LOG2,
"log2",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_DOT_PRODUCT_3,
"dot_product_3",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_DOT_PRODUCT_4,
"dot_product_4",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_AND,
"and",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_OR,
"or",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_XOR,
"xor",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_COMMUNATIVE);
OPCODE_FLAG_COMMUNATIVE)
DEFINE_OPCODE(
OPCODE_NOT,
"not",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SHL,
"shl",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_SHL,
"vector_shl",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SHR,
"shr",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_SHR,
"vector_shr",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SHA,
"sha",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_VECTOR_SHA,
"vector_sha",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ROTATE_LEFT,
"rotate_left",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_BYTE_SWAP,
"byte_swap",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_CNTLZ,
"cntlz",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_INSERT,
"insert",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_EXTRACT,
"extract",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SPLAT,
"splat",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_PERMUTE,
"permute",
OPCODE_SIG_V_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_SWIZZLE,
"swizzle",
OPCODE_SIG_V_V_O,
0);
0)
DEFINE_OPCODE(
OPCODE_PACK,
"pack",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_UNPACK,
"unpack",
OPCODE_SIG_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_COMPARE_EXCHANGE,
"compare_exchange",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_ATOMIC_EXCHANGE,
"atomic_exchange",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_VOLATILE);
OPCODE_FLAG_VOLATILE)
DEFINE_OPCODE(
OPCODE_ATOMIC_ADD,
"atomic_add",
OPCODE_SIG_V_V_V,
0);
0)
DEFINE_OPCODE(
OPCODE_ATOMIC_SUB,
"atomic_sub",
OPCODE_SIG_V_V_V,
0);
0)

View File

@ -1,6 +1,7 @@
# Copyright 2013 Ben Vanik. All Rights Reserved.
{
'sources': [
'block.cc',
'block.h',
'hir_builder.cc',
'hir_builder.h',

View File

@ -560,6 +560,26 @@ void Value::ByteSwap() {
}
}
void Value::CountLeadingZeros(const ConstantValue& src) {
switch (type) {
case INT8_TYPE:
constant.i8 = __lzcnt16(src.i8) - 8;
break;
case INT16_TYPE:
constant.i8 = __lzcnt16(src.i16);
break;
case INT32_TYPE:
constant.i8 = __lzcnt(src.i32);
break;
case INT64_TYPE:
constant.i8 = __lzcnt64(src.i64);
break;
default:
XEASSERTALWAYS();
break;
}
}
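
The constant folding above relies on MSVC's __lzcnt intrinsic family. As a portability note, a hedged sketch of the 8-bit case (an assumed helper, not what the commit uses) that masks the input so negative values count zeros within an 8-bit width:

#include <cstdint>

// Leading zeros of a value treated as exactly 8 bits wide; returns 8 for 0.
static uint8_t CountLeadingZeros8(int8_t value) {
  uint8_t v = static_cast<uint8_t>(value);  // mask away sign extension
  uint8_t count = 0;
  for (uint8_t mask = 0x80; mask && !(v & mask); mask >>= 1) {
    ++count;
  }
  return count;
}
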
bool Value::Compare(Opcode opcode, Value* other) {
// TODO(benvanik): big matrix.
XEASSERTALWAYS();

View File

@ -68,6 +68,10 @@ enum ValueFlags {
VALUE_IS_ALLOCATED = (1 << 2), // Used by backends. Do not set.
};
struct RegAssignment {
const backend::MachineInfo::RegisterSet* set;
int32_t index;
};
class Value {
public:
@ -91,10 +95,7 @@ public:
TypeName type;
uint32_t flags;
struct {
const backend::MachineInfo::RegisterSet* set;
int32_t index;
} reg;
RegAssignment reg;
ConstantValue constant;
Instr* def;
@ -392,6 +393,7 @@ public:
void Shr(Value* other);
void Sha(Value* other);
void ByteSwap();
void CountLeadingZeros(const ConstantValue& src);
bool Compare(Opcode opcode, Value* other);
};

2
third_party/xbyak vendored

@ -1 +1 @@
Subproject commit 702d6e6683c322f08a36ea059f6d6f8263b1bd0d
Subproject commit 2d599b3bd64a6d13c8b47a5f7410c67837bfff5d

View File

@ -24,6 +24,18 @@
'target_arch%': 'x64',
},
'conditions': [
['OS=="win"', {
'variables': {
'move_command%': 'move'
},
}, {
'variables': {
'move_command%': 'mv'
},
}]
],
'target_defaults': {
'include_dirs': [
'include/',
@ -255,6 +267,7 @@
'include_dirs': [
'.',
'src/',
'<(INTERMEDIATE_DIR)',
],
'includes': [