diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc
new file mode 100644
index 000000000..7d2d9d80c
--- /dev/null
+++ b/src/xenia/base/dma.cc
@@ -0,0 +1,415 @@
+#include "dma.h"
+
+template <size_t N, typename... Ts>
+static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
+  char buffer[1024];
+  sprintf_s(buffer, fmt, args...);
+  XELOGI("%s", buffer);
+}
+
+//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
+//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
+#define XEDMALOG(...) static_cast<void>(0)
+using xe::swcache::CacheLine;
+static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
+
+XE_FORCEINLINE
+static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to,
+                                 CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    xe::swcache::CacheLine line0, line1, line2, line3;
+
+    xe::swcache::ReadLine(&line0, src1 + i);
+    xe::swcache::ReadLine(&line1, src2 + i);
+    xe::swcache::ReadLine(&line2, src3 + i);
+    xe::swcache::ReadLine(&line3, src4 + i);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(dest1 + i, &line0);
+    xe::swcache::WriteLineNT(dest2 + i, &line1);
+
+    xe::swcache::WriteLineNT(dest3 + i, &line2);
+    xe::swcache::WriteLineNT(dest4 + i, &line3);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+
+namespace xe::dma {
+XE_FORCEINLINE
+static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr,
+                         CacheLine* XE_RESTRICT rdmapping,
+                         uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K =
+      16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384Streaming(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    xe::swcache::CacheLine line0, line1;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+
+    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
+  }
+
+  if (i < num_written_lines) {
+    xe::swcache::CacheLine line0;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+  }
+}
+
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length) {
+  return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping,
+                      written_length);
+}
+
+#define XEDMA_NUM_WORKERS 4
+class alignas(256) XeDMACGeneric : public XeDMAC {
+  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
+    std::atomic<uint64_t> free_job_slots_;
+    std::atomic<uint64_t> jobs_submitted_;
+    std::atomic<uint64_t> jobs_completed_;
+    std::atomic<uint32_t> num_workers_awoken_;
+    std::atomic<uint32_t> current_job_serial_;
+
+  } dma_volatile_;
+
+  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
+
+  volatile uint32_t jobserials_[64];
+
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+      std::unique_ptr<threading::Event> job_done_signals_[64];
+  // really dont like using unique pointer for this...
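  /* Illustrative note, not part of this patch: vastcpy above assumes both
     pointers are cache-line (64-byte) aligned and, per the comment in dma.h,
     a length divisible by the cache line size. A minimal hypothetical use:

       alignas(64) static uint8_t src[16384], dst[16384];
       xe::dma::vastcpy(dst, src, sizeof(dst));

     XeCopy16384Streaming walks four 4 KiB pages in lockstep (dest1..dest4),
     staging whole lines in registers via ReadLine and draining them with
     non-temporal WriteLineNT stores, so each 16 KiB block is copied on four
     independent read/write streams without displacing cached data. */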
+  std::unique_ptr<threading::Event> job_submitted_signal_;
+  std::unique_ptr<threading::Event> job_completed_signal_;
+
+  std::unique_ptr<threading::Thread> scheduler_thread_;
+  struct WorkSlice {
+    uint8_t* destination;
+    uint8_t* source;
+    size_t numbytes;
+  };
+  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
+  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
+  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
+
+  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
+  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
+
+  // chrispy: this is bad
+  static uint32_t find_free_hole_in_dword(uint64_t dw) {
+    XEDMALOG("Finding free hole in 0x%llX", dw);
+
+    for (uint32_t i = 0; i < 64; ++i) {
+      if (dw & (1ULL << i)) {
+        continue;
+      }
+
+      return i;
+    }
+    return ~0U;
+  }
+
+  uint32_t allocate_free_dma_slot() {
+    XEDMALOG("Allocating free slot");
+    uint32_t got_slot = 0;
+    uint64_t slots;
+    uint64_t allocated_slot;
+
+    do {
+      slots = dma_volatile_.free_job_slots_.load();
+
+      got_slot = find_free_hole_in_dword(slots);
+      if (!~got_slot) {
+        XEDMALOG("Didn't get a slot!");
+        return ~0U;
+      }
+      allocated_slot = slots | (1ULL << got_slot);
+
+    } while (
+        XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
+            slots, allocated_slot)));
+    XEDMALOG("Allocated slot %d", got_slot);
+    return got_slot;
+  }
+  // chrispy: on x86 this can just be interlockedbittestandreset...
+  void free_dma_slot(uint32_t slot) {
+    XEDMALOG("Freeing slot %d", slot);
+    uint64_t slots;
+
+    uint64_t deallocated_slot;
+
+    do {
+      slots = dma_volatile_.free_job_slots_.load();
+
+      deallocated_slot = slots & (~(1ULL << slot));
+
+    } while (
+        XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
+            slots, deallocated_slot)));
+  }
+
+  void DoDMAJob(uint32_t idx) {
+    XeDMAJob& job = jobs_[idx];
+    if (job.precall) {
+      job.precall(&job);
+    }
+    // memcpy(job.destination, job.source, job.size);
+
+    size_t job_size = job.size;
+
+    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
+
+    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
+
+    size_t rem = job_size - line_rounded;
+
+    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
+
+    XEDMALOG(
+        "Distributing %d bytes from %p to %p across %d workers, remainder is "
+        "%d",
+        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
+    if (num_per_worker < 2048) {
+      XEDMALOG("not distributing across workers, num_per_worker < 2048");
+      // not worth splitting up
+      memcpy(job.destination, job.source, job.size);
+      job.signal_on_done->Set();
+    } else {
+      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
+        worker_workslice_[i].destination =
+            (i * num_per_worker) + job.destination;
+        worker_workslice_[i].source = (i * num_per_worker) + job.source;
+
+        worker_workslice_[i].numbytes = num_per_worker;
+      }
+      if (rem) {
+        __movsb(job.destination + line_rounded, job.source + line_rounded,
+                rem);
+      }
+      // wake them up
+      worker_has_work_->Set();
+      XEDMALOG("Starting waitall for job");
+      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
+                         false);
+
+      XEDMALOG("Waitall for job completed!");
+      job.signal_on_done->Set();
+    }
+    if (job.postcall) {
+      job.postcall(&job);
+    }
+    ++dma_volatile_.jobs_completed_;
+  }
+
+  void WorkerIter(uint32_t worker_index) {
+    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
+    auto [dest, src, size] = worker_workslice_[worker_index];
+
+    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
+    worker_has_work_->Reset();
+    //}
+    xenia_assert(size < (1ULL << 32));
+    // memcpy(dest, src, size);
+    dma::vastcpy(dest, src,
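/* Worked numbers for the split in DoDMAJob above (illustrative): for
   job.size = 100000, job_num_lines = 1562, line_rounded = 99968, rem = 32
   and num_per_worker = 24992, so each of the four workers streams 24992
   bytes through vastcpy while the 32-byte tail is copied inline with
   __movsb before the workers are woken. */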
+                 static_cast<uint32_t>(size));
+  }
+  XE_NOINLINE
+  void WorkerMainLoop(uint32_t worker_index) {
+    do {
+      XEDMALOG("Worker iter for worker %d", worker_index);
+      WorkerIter(worker_index);
+
+      XEDMALOG("Worker %d is done\n", worker_index);
+      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
+                               worker_has_work_.get(), false);
+    } while (true);
+  }
+  void WorkerMain(uint32_t worker_index) {
+    XEDMALOG("Entered worker main loop, index %d", worker_index);
+    threading::Wait(worker_has_work_.get(), false);
+    XEDMALOG("First wait for worker %d completed, first job ever",
+             worker_index);
+    WorkerMainLoop(worker_index);
+  }
+
+  static void WorkerMainForwarder(void* ptr) {
+    // we aligned XeDma to 256 bytes and encode extra info in the low 8
+    uintptr_t uptr = (uintptr_t)ptr;
+
+    uint32_t worker_index = (uint8_t)uptr;
+
+    uptr &= ~0xFFULL;
+
+    char name_buffer[64];
+    sprintf_s(name_buffer, "dma_worker_%d", worker_index);
+
+    xe::threading::set_name(name_buffer);
+
+    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
+  }
+
+  void DMAMain() {
+    XEDMALOG("DmaMain");
+    do {
+      threading::Wait(job_submitted_signal_.get(), false);
+
+      auto slots = dma_volatile_.free_job_slots_.load();
+
+      for (uint32_t i = 0; i < 64; ++i) {
+        if (slots & (1ULL << i)) {
+          XEDMALOG("Got new job at index %d in DMAMain", i);
+          DoDMAJob(i);
+
+          free_dma_slot(i);
+
+          job_completed_signal_->Set();
+          // break;
+        }
+      }
+
+    } while (true);
+  }
+
+  static void DMAMainForwarder(void* ud) {
+    xe::threading::set_name("dma_main");
+    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
+  }
+
+ public:
+  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
+    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
+             job->size);
+    uint32_t slot;
+    do {
+      slot = allocate_free_dma_slot();
+      if (!~slot) {
+        XEDMALOG(
+            "Didn't get a free slot, waiting for a job to complete before "
+            "resuming.");
+        threading::Wait(job_completed_signal_.get(), false);
+
+      } else {
+        break;
+      }
+
+    } while (true);
+    jobs_[slot] = *job;
+
+    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
+    jobs_[slot].signal_on_done->Reset();
+    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
+
+    uint32_t new_serial = dma_volatile_.current_job_serial_++;
+
+    jobserials_[slot] = new_serial;
+
+    ++dma_volatile_.jobs_submitted_;
+    job_submitted_signal_->Set();
+    return (static_cast<uint64_t>(new_serial) << 32) |
+           static_cast<uint64_t>(slot);
+
+    // return job_done_signals_[slot].get();
+  }
+
+  bool AllJobsDone() {
+    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
+  }
+  virtual void WaitJobDone(DMACJobHandle handle) override {
+    uint32_t serial = static_cast<uint32_t>(handle >> 32);
+    uint32_t jobid = static_cast<uint32_t>(handle);
+    do {
+      if (jobserials_[jobid] != serial) {
+        return;  // done, our slot was reused
+      }
+
+      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
+                                     std::chrono::milliseconds{1});
+
+      if (waitres == threading::WaitResult::kTimeout) {
+        continue;
+      } else {
+        return;
+      }
+    } while (true);
+  }
+  virtual void WaitForIdle() override {
+    while (!AllJobsDone()) {
+      threading::MaybeYield();
+    }
+  }
+  XeDMACGeneric() {
+    XEDMALOG("Constructing xedma at addr %p", this);
+    dma_volatile_.free_job_slots_.store(0ULL);
+    dma_volatile_.jobs_submitted_.store(0ULL);
+    dma_volatile_.jobs_completed_.store(0ULL);
+    dma_volatile_.current_job_serial_.store(
+        1ULL);  // so that a jobhandle is never 0
+    std::memset(jobs_, 0, sizeof(jobs_));
+    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
+    job_completed_signal_ =
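/* Event choreography below (descriptive note, not part of the patch):
   job_submitted_signal_ and job_completed_signal_ are auto-reset, so each
   Set() releases exactly one waiter; worker_has_work_ is manual-reset so a
   single Set() in DoDMAJob releases all four workers (WorkerIter resets it);
   each worker_has_finished_ event is auto-reset and consumed by the WaitAll
   in DoDMAJob. */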
+        threading::Event::CreateAutoResetEvent(false);
+    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
+    threading::Thread::CreationParameters worker_params{};
+    worker_params.create_suspended = false;
+    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
+    worker_params.stack_size = 65536;  // dont need much stack at all
+
+    for (uint32_t i = 0; i < 64; ++i) {
+      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
+    }
+    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
+      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
+      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
+      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
+
+      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
+      xenia_assert(!(encoded & 0xFFULL));
+      xenia_assert(i < 256);
+
+      encoded |= i;
+
+      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
+        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
+      });
+    }
+    threading::Thread::CreationParameters scheduler_params{};
+    scheduler_params.create_suspended = false;
+    scheduler_params.initial_priority =
+        threading::ThreadPriority::kBelowNormal;
+    scheduler_params.stack_size = 65536;
+    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
+      XeDMACGeneric::DMAMainForwarder((void*)this);
+    });
+  }
+};
+XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
+}  // namespace xe::dma
diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h
new file mode 100644
index 000000000..e95639753
--- /dev/null
+++ b/src/xenia/base/dma.h
@@ -0,0 +1,46 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#ifndef XENIA_BASE_DMA_H_ +#define XENIA_BASE_DMA_H_ +#include "memory.h" +#include "threading.h" +namespace xe::dma { +struct XeDMAJob; +using DmaPrecall = void (*)(XeDMAJob* job); +using DmaPostcall = void (*)(XeDMAJob* job); +struct XeDMAJob { + threading::Event* signal_on_done; + uint8_t* destination; + uint8_t* source; + size_t size; + DmaPrecall precall; + DmaPostcall postcall; + void* userdata1; + void* userdata2; +}; +using DMACJobHandle = uint64_t; + +class XeDMAC { + public: + virtual ~XeDMAC() {} + virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0; + virtual void WaitJobDone(DMACJobHandle handle) = 0; + virtual void WaitForIdle() = 0; +}; + +XeDMAC* CreateDMAC(); +// must be divisible by cache line size +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length); + +} // namespace xe::dma + +#endif // XENIA_BASE_DMA_H_ diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 4cafc7178..6e323ede8 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } /* - - std::min/max float has handling for nans, where if either argument is nan the first argument is returned - minss/maxss are different, if either argument is nan the second operand to the instruction is returned - this is problematic because we have no assurances from the compiler on the argument ordering + std::min/max float has handling for nans, where if either argument is + nan the first argument is returned - so only use in places where nan handling is not needed + minss/maxss are different, if either argument is nan the second operand + to the instruction is returned this is problematic because we have no + assurances from the compiler on the argument ordering + + so only use in places where nan handling is not needed */ -static float xe_minf(float x, float y) { +XE_FORCEINLINE +static float ArchMin(float x, float y) { return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_maxf(float x, float y) { +XE_FORCEINLINE +static float ArchMax(float x, float y) { return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_rcpf(float den) { +XE_FORCEINLINE +static float ArchReciprocal(float den) { return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); } #else -static float xe_minf(float x, float y) { return std::min(x, y); } -static float xe_maxf(float x, float y) { return std::max(x, y); } -static float xe_rcpf(float den) { return 1.0f / den; } +static float ArchMin(float x, float y) { return std::min(x, y); } +static float ArchMax(float x, float y) { return std::max(x, y); } +static float ArchReciprocal(float den) { return 1.0f / den; } #endif +XE_FORCEINLINE +static float RefineReciprocal(float initial, float den) { + float t0 = initial * den; + float t1 = t0 * initial; + float rcp2 = initial + initial; + return rcp2 - t1; +} +XE_FORCEINLINE +static float ArchReciprocalRefined(float den) { + return RefineReciprocal(ArchReciprocal(den), den); +} // Similar to the C++ implementation of XMConvertFloatToHalf and // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the @@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) { } return T(result); } +namespace divisors { +union IDivExtraInfo { + uint32_t value_; + struct { + uint32_t shift_ : 31; + uint32_t add_ : 1; + } info; +}; +// returns magicnum multiplier 
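For orientation, PregenerateUint32Div below derives a Hacker's Delight style
magic pair (multiplier, shift, add flag) for an unsigned divisor. The no-add
decode path is easy to sanity-check in isolation; in this sketch,
apply_magic_div is a hypothetical stand-in for ApplyUint32Div with
info.add_ == 0, using the well-known pair for dividing by 5 (multiplier
0xCCCCCCCD, shift 2):

#include <cassert>
#include <cstdint>

static uint32_t apply_magic_div(uint32_t num, uint32_t mul, uint32_t shift) {
  // high 32 bits of the 64-bit product, then the encoded post-shift
  return static_cast<uint32_t>((static_cast<uint64_t>(num) * mul) >> 32) >>
         shift;
}

int main() {
  for (uint32_t n : {0u, 1u, 4u, 5u, 99u, 100u, 0xFFFFFFFFu}) {
    assert(apply_magic_div(n, 0xCCCCCCCDu, 2) == n / 5);  // exact for all n
  }
  return 0;
}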
+static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { + IDivExtraInfo extra; + uint32_t d = _denom; + int p; + uint32_t nc, delta, q1, r1, q2, r2; + struct { + unsigned M; + int a; + int s; + } magu; + magu.a = 0; + nc = -1 - ((uint32_t) - (int32_t)d) % d; + p = 31; + q1 = 0x80000000 / nc; + r1 = 0x80000000 - q1 * nc; + q2 = 0x7FFFFFFF / d; + r2 = 0x7FFFFFFF - q2 * d; + do { + p += 1; + if (r1 >= nc - r1) { + q1 = 2 * q1 + 1; + r1 = 2 * r1 - nc; + } else { + q1 = 2 * q1; + r1 = 2 * r1; + } + if (r2 + 1 >= d - r2) { + if (q2 >= 0x7FFFFFFF) { + magu.a = 1; + } + q2 = 2 * q2 + 1; + r2 = 2 * r2 + 1 - d; + + } else { + if (q2 >= 0x80000000U) { + magu.a = 1; + } + q2 = 2 * q2; + r2 = 2 * r2 + 1; + } + delta = d - 1 - r2; + } while (p < 64 && (q1 < delta || r1 == 0)); + + extra.info.add_ = magu.a; + extra.info.shift_ = p - 32; + out_extra = extra.value_; + return static_cast(q2 + 1); +} + +static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, + uint32_t extradata) { + IDivExtraInfo extra; + + extra.value_ = extradata; + + uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32; + if (extra.info.add_) { + uint32_t addend = result + num; + addend = ((addend < result ? 0x80000000 : 0) | addend); + result = addend; + } + return result >> extra.info.shift_; +} + +static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, + uint32_t extradata, uint32_t original) { + uint32_t dived = ApplyUint32Div(num, mul, extradata); + unsigned result = num - (dived * original); + + return result; +} + +struct MagicDiv { + uint32_t multiplier_; + uint32_t extradata_; + MagicDiv() : multiplier_(0), extradata_(0) {} + MagicDiv(uint32_t original) { + multiplier_ = PregenerateUint32Div(original, extradata_); + } + + uint32_t Apply(uint32_t numerator) const { + return ApplyUint32Div(numerator, multiplier_, extradata_); + } +}; +} // namespace divisors } // namespace xe #endif // XENIA_BASE_MATH_H_ diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 1afeedc14..b924c267c 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -672,25 +672,58 @@ static void Prefetch(const void* addr) { #define XE_MSVC_REORDER_BARRIER() static_cast(0) #endif #if XE_ARCH_AMD64 == 1 - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + struct { + __m256 low32; + __m256 high32; + }; + struct { + __m128i xmms[4]; + }; + float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - __m256i low = _mm256_loadu_si256((const __m256i*)source); - __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - _mm256_stream_si256((__m256i*)destination, low); - _mm256_stream_si256(&((__m256i*)destination)[1], high); +static void WriteLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_stream_ps(&destination->floats[0], low); + _mm256_stream_ps(&destination->floats[8], high); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - __m256i low = _mm256_stream_load_si256((const __m256i*)source); - __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - 
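/* Producer-side pattern implied by the swcache API above (a sketch assuming
   WriteFence is an sfence on AMD64; its counterparts shown nearby are
   lfence/mfence):

     alignas(64) xe::swcache::CacheLine staging;
     xe::swcache::ReadLine(&staging, src_line);     // ordinary cached load
     xe::swcache::WriteLineNT(dst_line, &staging);  // movnt, bypasses cache
     xe::swcache::WriteFence();                     // order NT stores before
                                                    // publishing dst_line

   Non-temporal stores are weakly ordered, so the fence is what makes the
   copied lines visible to a consumer in a defined order. */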
_mm256_storeu_si256((__m256i*)destination, low); - _mm256_storeu_si256(&((__m256i*)destination)[1], high); +static void ReadLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + + __m128i first = _mm_stream_load_si128(&source->xmms[0]); + __m128i second = _mm_stream_load_si128(&source->xmms[1]); + __m128i third = _mm_stream_load_si128(&source->xmms[2]); + __m128i fourth = _mm_stream_load_si128(&source->xmms[3]); + + destination->xmms[0] = first; + destination->xmms[1] = second; + destination->xmms[2] = third; + destination->xmms[3] = fourth; +} +XE_FORCEINLINE +static void ReadLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); +} +XE_FORCEINLINE +static void WriteLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); } XE_FORCEINLINE @@ -699,19 +732,29 @@ XE_FORCEINLINE static void ReadFence() { _mm_lfence(); } XE_FORCEINLINE static void ReadWriteFence() { _mm_mfence(); } + #else - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + uint8_t bvals[XE_HOST_CACHE_LINE_SIZE]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - memcpy(destination, source, 64); +static void WriteLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - memcpy(destination, source, 64); +static void ReadLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } +XE_FORCEINLINE +static void WriteLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} +XE_FORCEINLINE +static void ReadLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} + XE_FORCEINLINE static void WriteFence() {} XE_FORCEINLINE @@ -720,6 +763,47 @@ XE_FORCEINLINE static void ReadWriteFence() {} #endif } // namespace swcache + +template +static void smallcpy_const(void* destination, const void* source) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + __movsq((unsigned long long*)destination, (const unsigned long long*)source, + Size / 8); + } else if constexpr ((Size & 3) == 0) { + __movsd((unsigned long*)destination, (const unsigned long*)source, + Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __movsb((unsigned char*)destination, (const unsigned char*)source, Size); + } +#else + memcpy(destination, source, Size); +#endif +} +template +static void smallset_const(void* destination, unsigned char fill_value) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + unsigned long long fill = + 
static_cast(fill_value) * 0x0101010101010101ULL; + + __stosq((unsigned long long*)destination, fill, Size / 8); + } else if constexpr ((Size & 3) == 0) { + static constexpr unsigned long fill = + static_cast(fill_value) * 0x01010101U; + __stosd((unsigned long*)destination, fill, Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __stosb((unsigned char*)destination, fill_value, Size); + } +#else + memset(destination, fill_value, Size); +#endif +} + } // namespace xe #endif // XENIA_BASE_MEMORY_H_ diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index a4befb686..84d5893b5 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -59,16 +59,8 @@ class RingBuffer { // subtract instead void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } ring_size_t read_count() const { -// chrispy: these branches are unpredictable -#if 0 - if (read_offset_ == write_offset_) { - return 0; - } else if (read_offset_ < write_offset_) { - return write_offset_ - read_offset_; - } else { - return (capacity_ - read_offset_) + write_offset_; - } -#else + // chrispy: these branches are unpredictable + ring_size_t read_offs = read_offset_; ring_size_t write_offs = write_offset_; ring_size_t cap = capacity_; @@ -77,14 +69,6 @@ class RingBuffer { ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t comparison_value = read_offs <= write_offs; -#if 0 - size_t selector = - static_cast(-static_cast(comparison_value)); - offset_delta &= selector; - - wrap_read_count &= ~selector; - return offset_delta | wrap_read_count; -#else if (XE_LIKELY(read_offs <= write_offs)) { return offset_delta; // will be 0 if they are equal, semantically @@ -93,8 +77,6 @@ class RingBuffer { } else { return wrap_read_count; } -#endif -#endif } ring_size_t write_offset() const { return write_offset_; } @@ -116,9 +98,9 @@ class RingBuffer { void AdvanceWrite(size_t count); struct ReadRange { - const uint8_t* first; + const uint8_t* XE_RESTRICT first; - const uint8_t* second; + const uint8_t* XE_RESTRICT second; ring_size_t first_length; ring_size_t second_length; }; @@ -126,9 +108,11 @@ class RingBuffer { void EndRead(ReadRange read_range); /* - BeginRead, but if there is a second Range it will prefetch all lines of it + BeginRead, but if there is a second Range it will prefetch all lines of + it - this does not prefetch the first range, because software prefetching can do that faster than we can + this does not prefetch the first range, because software + prefetching can do that faster than we can */ template XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { @@ -138,7 +122,7 @@ class RingBuffer { ring_size_t numlines = xe::align(range.second_length, XE_HOST_CACHE_LINE_SIZE) / XE_HOST_CACHE_LINE_SIZE; - //chrispy: maybe unroll? + // chrispy: maybe unroll? 
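/* Worked example for the prefetch count above (illustrative): a wrapped
   second range of 200 bytes gives numlines = xe::align(200, 64) / 64 = 4,
   so the loop below issues four Prefetch calls, one per 64-byte host cache
   line of the wrapped region. */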
for (ring_size_t i = 0; i < numlines; ++i) { swcache::Prefetch(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); } @@ -187,7 +171,7 @@ class RingBuffer { } private: - uint8_t* buffer_ = nullptr; + uint8_t* XE_RESTRICT buffer_ = nullptr; ring_size_t capacity_ = 0; ring_size_t read_offset_ = 0; ring_size_t write_offset_ = 0; diff --git a/src/xenia/base/split_map.h b/src/xenia/base/split_map.h new file mode 100644 index 000000000..510c2ed70 --- /dev/null +++ b/src/xenia/base/split_map.h @@ -0,0 +1,87 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2019 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_BASE_SPLIT_MAP_H_ +#define XENIA_BASE_SPLIT_MAP_H_ +#include +#include +namespace xe { +/* + a map structure that is optimized for infrequent + reallocation/resizing/erasure and frequent searches by key implemented as 2 + std::vectors, one of the keys and one of the values +*/ +template +class split_map { + using key_vector = std::vector; + using value_vector = std::vector; + + key_vector keys_; + value_vector values_; + + public: + using my_type = split_map; + + uint32_t IndexForKey(const TKey& k) { + auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k); + return static_cast(lbound - keys_.begin()); + } + + uint32_t size() const { return static_cast(keys_.size()); } + key_vector& Keys() { return keys_; } + value_vector& Values() { return values_; } + void clear() { + keys_.clear(); + values_.clear(); + } + void resize(uint32_t new_size) { + keys_.resize(static_cast(new_size)); + values_.resize(static_cast(new_size)); + } + + void reserve(uint32_t new_size) { + keys_.reserve(static_cast(new_size)); + values_.reserve(static_cast(new_size)); + } + const TKey* KeyAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &keys_[index]; + } + } + const TValue* ValueAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &values_[index]; + } + } + + void InsertAt(TKey k, TValue v, uint32_t idx) { + uint32_t old_size = size(); + + bool needs_shiftup = idx != old_size; + + values_.insert(values_.begin() + idx, v); + keys_.insert(keys_.begin() + idx, k); + } + void EraseAt(uint32_t idx) { + uint32_t old_size = size(); + if (idx == old_size) { + return; // trying to erase nonexistent entry + } else { + values_.erase(values_.begin() + idx); + keys_.erase(keys_.begin() + idx); + } + } +}; +} // namespace xe + +#endif // XENIA_BASE_SPLIT_MAP_H_ \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index d1394d202..17abb72e7 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false, "code. The workaround may cause reduced CPU performance but is a " "more accurate emulation", "x64"); - +DEFINE_uint32(align_all_basic_blocks, 0, + "Aligns the start of all basic blocks to N bytes. Only specify a " + "power of 2, 16 is the recommended value. 
Results in larger " + "icache usage, but potentially faster loops", + "x64"); #if XE_X64_PROFILER_AVAILABLE == 1 DEFINE_bool(instrument_call_times, false, "Compute time taken for functions, for profiling guest code", @@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); - TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C); TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); @@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, return true; } +#pragma pack(push, 1) +struct RGCEmitted { + uint8_t ff_; + uint32_t rgcid_; +}; +#pragma pack(pop) +#if 0 +void X64Emitter::InjectCallAddresses(void* new_execute_address) { + for (auto&& callsite : call_sites_) { + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +} + +#else +void X64Emitter::InjectCallAddresses(void* new_execute_address) { +#if 0 + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + + std::map id_to_rgc{}; + + for (auto&& callsite : call_sites_) { + id_to_rgc[callsite.offset_] = &callsite; + } +#else + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + for (auto&& callsite : call_sites_) { + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +#endif +} +#endif void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, GuestFunction* function) { // To avoid changing xbyak, we do a switcharoo here. @@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, if (function) { code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, new_execute_address, new_write_address); - if (cvars::resolve_rel32_guest_calls) { - for (auto&& callsite : call_sites_) { -#pragma pack(push, 1) - struct RGCEmitted { - uint8_t ff_; - uint32_t rgcid_; - }; -#pragma pack(pop) - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { - hunter = reinterpret_cast( - reinterpret_cast(hunter) + 1); - } - hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; - hunter->rgcid_ = - static_cast(static_cast(callsite.destination_) - - reinterpret_cast(hunter + 1)); - } + if (cvars::resolve_rel32_guest_calls) { + InjectCallAddresses(new_execute_address); } } else { code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, @@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { label = label->next; } + if (cvars::align_all_basic_blocks) { + align(cvars::align_all_basic_blocks, true); + } // Process instructions. 
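/* Sketch of the call-site patching done by InjectCallAddresses above (not
   part of the patch): with resolve_rel32_guest_calls enabled, each guest
   call is emitted as a 5-byte placeholder, 0xFF followed by the 32-bit site
   id (RGCEmitted). Once the final executable address is known, the scan
   rewrites each placeholder in place:

     before:  FF <id32>
     after:   E8 <rel32>   (call)   or   E9 <rel32>   (jmp)

   where rel32 = callsite.destination_ - (uintptr_t)(hunter + 1), i.e. the
   displacement from the end of the patched instruction. */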
const Instr* instr = block->instr_head; while (instr) { @@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = { vec128i(0x7f800000), /* XMMThreeFloatMask */ vec128i(~0U, ~0U, ~0U, 0U), - /*XMMXenosF16ExtRangeStart*/ - vec128f(65504), - /*XMMVSRShlByteshuf*/ - v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), - // XMMVSRMask - vec128b(1), /* XMMF16UnpackLCPI2 */ @@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = { /*XMMXOPWordShiftMask*/ vec128s(15), /*XMMXOPDwordShiftMask*/ - vec128i(31) -}; + vec128i(31)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 01027cc0c..b20978ea3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -157,9 +157,6 @@ enum XmmConst { XMMLVSRTableBase, XMMSingleDenormalMask, XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 - XMMXenosF16ExtRangeStart, - XMMVSRShlByteshuf, - XMMVSRMask, XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 XMMF16PackLCPI0, @@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags { kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitBMI1 = 1 << 3, kX64EmitBMI2 = 1 << 4, - kX64EmitF16C = 1 << 5, + kX64EmitPrefetchW = 1 << 5, kX64EmitMovbe = 1 << 6, kX64EmitGFNI = 1 << 7, @@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags { // inc/dec) do not introduce false dependencies on EFLAGS // because the individual flags are treated as different vars by // the processor. (this applies to zen) - kX64EmitPrefetchW = 1 << 16, - kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx + kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx // instructions, and FX users need the boost - kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? - kX64EmitTBM = 1 << 19 + kX64EmitFMA4 = 1 << 17, // todo: also use on zen1? + kX64EmitTBM = 1 << 18, + // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family + // 17h/19h optimization manuals. 
allows us to save 1 byte on certain xmm + // instructions by using the legacy sse version if we recently cleared the + // high 128 bits of the }; class ResolvableGuestCall { public: @@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator { uint32_t debug_info_flags, FunctionDebugInfo* debug_info, void** out_code_address, size_t* out_code_size, std::vector* out_source_map); + void InjectCallAddresses(void* new_execute_addr); public: // Reserved: rsp, rsi, rdi diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h index b9257f179..78c459101 100644 --- a/src/xenia/cpu/backend/x64/x64_op.h +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -43,23 +43,23 @@ enum KeyType { KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, }; - +using InstrKeyValue = uint32_t; #pragma pack(push, 1) union InstrKey { - uint32_t value; + InstrKeyValue value; struct { - uint32_t opcode : 8; - uint32_t dest : 5; - uint32_t src1 : 5; - uint32_t src2 : 5; - uint32_t src3 : 5; - uint32_t reserved : 4; + InstrKeyValue opcode : 8; + InstrKeyValue dest : 5; + InstrKeyValue src1 : 5; + InstrKeyValue src2 : 5; + InstrKeyValue src3 : 5; + InstrKeyValue reserved : 4; }; - operator uint32_t() const { return value; } + operator InstrKeyValue() const { return value; } InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } - InstrKey(uint32_t v) : value(v) {} + InstrKey(InstrKeyValue v) : value(v) {} // this used to take about 1% cpu while precompiling // it kept reloading opcode, and also constantly repacking and unpacking the @@ -67,16 +67,16 @@ union InstrKey { InstrKey(const Instr* i) : value(0) { const OpcodeInfo* info = i->GetOpcodeInfo(); - uint32_t sig = info->signature; + InstrKeyValue sig = info->signature; OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); - uint32_t out_desttype = (uint32_t)dest_type; - uint32_t out_src1type = (uint32_t)src1_type; - uint32_t out_src2type = (uint32_t)src2_type; - uint32_t out_src3type = (uint32_t)src3_type; + InstrKeyValue out_desttype = (InstrKeyValue)dest_type; + InstrKeyValue out_src1type = (InstrKeyValue)src1_type; + InstrKeyValue out_src2type = (InstrKeyValue)src2_type; + InstrKeyValue out_src3type = (InstrKeyValue)src3_type; Value* destv = i->dest; // pre-deref, even if not value @@ -105,7 +105,7 @@ union InstrKey { template struct Construct { - static const uint32_t value = + static const InstrKeyValue value = (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); }; }; @@ -307,8 +307,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; return true; } @@ -329,8 +329,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); return true; @@ -355,8 +355,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -385,8 +385,8 @@ struct I : 
DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue ikey) { + if (ikey == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -422,9 +422,9 @@ struct Sequence { static constexpr uint32_t head_key() { return T::key; } - static bool Select(X64Emitter& e, const Instr* i) { + static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) { T args; - if (!args.Load(i)) { + if (!args.Load(i, ikey)) { return false; } SEQ::Emit(e, args); diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index dc5fa7d3d..54e7ac8a0 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) { if (valid) { auto name = i.src2.value->name; switch (opcode) { - case OPCODE_IS_TRUE: - e.jnz(name, e.T_NEAR); - break; - case OPCODE_IS_FALSE: - e.jz(name, e.T_NEAR); - break; case OPCODE_COMPARE_EQ: e.je(name, e.T_NEAR); break; @@ -299,26 +293,14 @@ struct CALL_TRUE_I64 struct CALL_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F32); } }; struct CALL_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, @@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F32); } }; struct CALL_INDIRECT_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, @@ -486,15 +460,13 @@ struct RETURN_TRUE_I64 struct RETURN_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F32); } }; struct RETURN_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, @@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64 struct BRANCH_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == 
i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + /* + chrispy: right now, im not confident that we are always clearing + the upper 96 bits of registers, making vptest extremely unsafe. many + ss/sd operations copy over the upper 96 from the source, and for abs we + negate ALL elements, making the top 64 bits contain 0x80000000 etc + */ + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; struct BRANCH_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, @@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); e.jz(i.src2.value->name, e.T_NEAR); } }; @@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); e.jz(i.src2.value->name, e.T_NEAR); } }; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 70865c30e..cc6c1cf32 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) { return false; } + if (IsTracingData()) { // incompatible with tracing + return false; + } uint32_t guestaddr = i->GuestAddressFor(); if (!guestaddr) { return false; @@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { return flags && flags->accessed_mmio; } +template +static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { + if (swap) { + value = xe::byte_swap(value); + } + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; + } else { + value = xe::byte_swap(value); /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... 
+ */ + gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); + } +} + +template +static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) { + T value; + + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + value = *reinterpret_cast(ctx->virtual_membase + guestaddr); + if (swap) { + value = xe::byte_swap(value); + } + } else { + /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... + */ + value = gaddr->read(nullptr, gaddr->callback_context, guestaddr); + } + return value; +} // ============================================================================ // OPCODE_LOAD_OFFSET // ============================================================================ @@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16 struct LOAD_OFFSET_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + if (i.src2.is_constant) { + e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant()); + } else { + e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); } } }; @@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, LOAD_OFFSET_I32, LOAD_OFFSET_I64); -template -static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { - if (swap) { - value = xe::byte_swap(value); - } - if (guestaddr >= 0xE0000000) { - guestaddr += 0x1000; - } - - auto ctx = reinterpret_cast(_ctx); - - auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); - if (!gaddr) { - *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; - } else { - value = xe::byte_swap(value); /* - was having issues, found by comparing the values used with exceptions - to these that we were reversed... 
- */ - gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); - } -} // ============================================================================ // OPCODE_STORE_OFFSET // ============================================================================ @@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence> { }; struct LOAD_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.GetNativeParam(1).cvt32(), i.dest); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } } } }; @@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence> { } else { e.mov(e.dword[addr], i.src2); } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } } } - if (IsTracingData()) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); - } } }; struct STORE_I64 : Sequence> { diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 46eb285cf..e5843f37d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -19,15 +19,15 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h" -DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); +DEFINE_bool(xop_rotates, false, "rotate via xop", "x64"); -DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); +DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64"); -DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); +DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64"); -DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); +DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64"); -DEFINE_bool(xop_compares, true, "compare via xop", "X64"); +DEFINE_bool(xop_compares, true, "compare via xop", "x64"); namespace xe { namespace cpu { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 48e340d42..bf5addd3d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -67,7 +67,7 @@ using namespace xe::cpu::hir; using 
xe::cpu::hir::Instr; -typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey); std::unordered_map sequence_table; // ============================================================================ @@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn, } return false; } -#define EMITTER_IS_TRUE(typ, tester) \ - struct IS_TRUE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setnz(i.dest); \ - } \ - } \ - } - -#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test) - -EMITTER_IS_TRUE_INT(I8); -EMITTER_IS_TRUE_INT(I16); -EMITTER_IS_TRUE_INT(I32); -EMITTER_IS_TRUE_INT(I64); -EMITTER_IS_TRUE(F32, vtestps); -EMITTER_IS_TRUE(F64, vtestpd); - -EMITTER_IS_TRUE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, - IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); - -#define EMITTER_IS_FALSE(typ, tester) \ - struct IS_FALSE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setz(i.dest); \ - } \ - } \ - } -#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test) -EMITTER_IS_FALSE_INT(I8); -EMITTER_IS_FALSE_INT(I16); -EMITTER_IS_FALSE_INT(I32); -EMITTER_IS_FALSE_INT(I64); -EMITTER_IS_FALSE(F32, vtestps); -EMITTER_IS_FALSE(F64, vtestpd); - -EMITTER_IS_FALSE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, - IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); // ============================================================================ // OPCODE_IS_NAN @@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { auto it = sequence_table.find(key); if (it != sequence_table.end()) { - if (it->second(*e, i)) { + if (it->second(*e, i, InstrKey(i))) { *new_tail = i->next; return true; } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 0ce6a7f57..05a5b42d4 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -25,7 +25,7 @@ namespace x64 { class X64Emitter; -typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey); extern std::unordered_map sequence_table; template diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 26f0fecb4..2d1654030 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } } break; - case OPCODE_IS_TRUE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantTrue()) { - v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; - case OPCODE_IS_FALSE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantFalse()) { - 
v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; case OPCODE_IS_NAN: if (i->src1.value->IsConstant()) { if (i->src1.value->type == FLOAT32_TYPE && @@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { if (should_skip_because_of_float) { break; - } + } v->set_from(i->src1.value); v->Max(i->src2.value); i->Remove(); diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index e94b570dc..a8c93c3b6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; Value* new_value = nullptr; - if (xorop == OPCODE_IS_FALSE) { - new_value = builder->IsTrue(xordef->src1.value); - - } else if (xorop == OPCODE_IS_TRUE) { - new_value = builder->IsFalse(xordef->src1.value); - } else if (xorop == OPCODE_COMPARE_EQ) { + if (xorop == OPCODE_COMPARE_EQ) { new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); } else if (xorop == OPCODE_COMPARE_NE) { @@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { return false; } bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) { - return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; + return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE; } inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation, @@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (!var_definition) { return false; } - // x == 0 -> !x - if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); - return true; - } - // x != 0 -> !!x - if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); - return true; - } if (cmpop == OPCODE_COMPARE_ULE && constant_unpacked == 0) { // less than or equal to zero = (== 0) = IS_FALSE - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_EQ_info; + return true; } // todo: OPCODE_COMPARE_NE too? 
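The rewrites in this pass lean on simple unsigned identities now that
OPCODE_IS_TRUE/OPCODE_IS_FALSE are gone; a standalone spot-check of the
equivalences used above and below (uint32 case):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t v : {0u, 1u, 2u, 0xFFFFFFFFu}) {
    assert((v <= 0u) == (v == 0u));  // COMPARE_ULE 0 -> COMPARE_EQ 0
    assert((v < 1u) == (v == 0u));   // COMPARE_ULT 1 -> COMPARE_EQ 0
    assert((v > 0u) == (v != 0u));   // COMPARE_UGT 0 -> COMPARE_NE 0
  }
  return 0;
}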
@@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } if (cmpop == OPCODE_COMPARE_ULT && constant_unpacked == 1) { // unsigned lt 1 means == 0 - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + + i->opcode = &OPCODE_COMPARE_EQ_info; + + // i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (cmpop == OPCODE_COMPARE_UGT && constant_unpacked == 0) { // unsigned gt 0 means != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; return true; } @@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 && constant_unpacked == 0) { // signbit can't be set, and checking if gt 0, so actually checking != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; + return true; } @@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, Value* constant_replacement = nullptr; if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) { - repl = &OPCODE_IS_TRUE_info; + repl = &OPCODE_COMPARE_NE_info; } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) { - repl = &OPCODE_IS_FALSE_info; + repl = &OPCODE_COMPARE_EQ_info; } else if (cmpop == OPCODE_COMPARE_UGT) { // impossible, cannot be greater than mask @@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (repl) { i->Replace(repl, 0); i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (constant_replacement) { @@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder) { - bool istrue = i->opcode == &OPCODE_IS_TRUE_info; - bool isfalse = i->opcode == &OPCODE_IS_FALSE_info; + bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; + bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; - Value* input = i->src1.value; + auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar(); + + if (!input_constant || input_constant->AsUint64() != 0) { + return false; + } + + // Value* input = i->src1.value; TypeName input_type = input->type; if (!IsScalarIntegralType(input_type)) { return false; } @@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i, i->set_src1(isfalsetest); } else { - i->Replace(&OPCODE_IS_FALSE_info, 0); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + i->Replace(&OPCODE_COMPARE_EQ_info, 0); i->set_src1(lz_input); + i->set_src2(builder->LoadZero(lz_input->type)); } return true; } @@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { while (i) { // vector types use the same opcodes as scalar ones for AND/OR/XOR!
we // don't handle these in our simplifications, so skip - if (i->dest && IsScalarIntegralType(i->dest->type)) { + if (i->AllScalarIntegral()) { Opcode iop = i->opcode->num; if (iop == OPCODE_OR) { @@ -1080,7 +1080,6 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { result |= CheckAdd(i, builder); } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); } else if (iop == OPCODE_SHR) { result |= CheckSHR(i, builder); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index fda6812b4..82e3021f9 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) { Value* HIRBuilder::Convert(Value* value, TypeName target_type, RoundMode round_mode) { - Instr* i = AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type)); i->set_src1(value); @@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type, Value* HIRBuilder::Round(Value* value, RoundMode round_mode) { ASSERT_FLOAT_OR_VECTOR_TYPE(value); - Instr* i = AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type)); i->set_src1(value); @@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) { i->set_src2(value); i->src3.value = NULL; } - +Value* HIRBuilder::LoadVectorLeft(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +Value* HIRBuilder::LoadVectorRight(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +void HIRBuilder::StoreVectorLeft(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVL_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} +void HIRBuilder::StoreVectorRight(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVR_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} void HIRBuilder::Memset(Value* address, Value* value, Value* length) { ASSERT_ADDRESS_TYPE(address); ASSERT_TYPES_EQUAL(address, length); @@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) { Value* HIRBuilder::Max(Value* value1, Value* value2) { ASSERT_TYPES_EQUAL(value1, value2); - if (IsScalarIntegralType( value1->type) && value1->IsConstant() && + if (IsScalarIntegralType(value1->type) && value1->IsConstant() && value2->IsConstant()) { return value1->Compare(OPCODE_COMPARE_SLT, value2) ? 
value2 : value1; } @@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) { i->set_src3(value2); return i->dest; } +static Value* OrLanes32(HIRBuilder& f, Value* value) { + hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE); + hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE); + hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE); + hir::Value* ored = f.Or(v1, v2); + hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE); + ored = f.Or(ored, v3); + + ored = f.Or(ored, v4); + return ored; +} Value* HIRBuilder::IsTrue(Value* value) { + assert_true(value); + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesn't happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareNE(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareNE(value, LoadZero(value->type)); } Value* HIRBuilder::IsFalse(Value* value) { + assert_true(value); + + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesn't happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareEQ(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareEQ(value, LoadZero(value->type)); } Value* HIRBuilder::IsNan(Value* value) { diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 62164e52d..4a200e0e5 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -166,6 +166,11 @@ class HIRBuilder { uint32_t store_flags = 0); Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); + + Value* LoadVectorLeft(Value* address); + Value* LoadVectorRight(Value* address); + void StoreVectorLeft(Value* address, Value* value); + void StoreVectorRight(Value* address, Value* value); void Store(Value* address, Value* value, uint32_t store_flags = 0); void Memset(Value* address, Value* value, Value* length); void CacheControl(Value* address, size_t cache_line_size, @@ -268,6 +273,7 @@ class HIRBuilder { Value* new_value); Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); + void SetNJM(Value* value); protected: diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 149103d43..0e4a7c2fb 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const { return 0; // eek.
} +bool Instr::AllScalarIntegral() { + bool result = true; + if (dest) { + if (!IsScalarIntegralType(dest->type)) { + return false; + } + } + VisitValueOperands([&result](Value* v, uint32_t idx) { + result = result && IsScalarIntegralType(v->type); + }); + return result; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 38afef241..a50219ceb 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr const hir::Instr* GetNonFakePrev() const; uint32_t GuestAddressFor() const; + + bool AllScalarIntegral(); // dest and all srcs are scalar integral }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index a51fda6d5..a59b1c72e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -210,10 +210,10 @@ enum Opcode { OPCODE_STORE, // chrispy: todo: implement, our current codegen for the unaligned loads is // very bad - OPCODE_LVLX, - OPCODE_LVRX, - OPCODE_STVLX, - OPCODE_STVRX, + OPCODE_LVL, + OPCODE_LVR, + OPCODE_STVL, + OPCODE_STVR, OPCODE_MEMSET, OPCODE_CACHE_CONTROL, OPCODE_MEMORY_BARRIER, @@ -222,8 +222,6 @@ enum Opcode { OPCODE_MIN, OPCODE_VECTOR_MIN, OPCODE_SELECT, - OPCODE_IS_TRUE, - OPCODE_IS_FALSE, OPCODE_IS_NAN, OPCODE_COMPARE_EQ, OPCODE_COMPARE_NE, diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 46a328903..783e9a439 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -303,17 +303,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, 0) -DEFINE_OPCODE( - OPCODE_IS_TRUE, - "is_true", - OPCODE_SIG_V_V, - 0) - -DEFINE_OPCODE( - OPCODE_IS_FALSE, - "is_false", - OPCODE_SIG_V_V, - 0) DEFINE_OPCODE( OPCODE_IS_NAN, @@ -706,4 +695,27 @@ DEFINE_OPCODE( OPCODE_SIG_X_V, 0 ) +DEFINE_OPCODE( + OPCODE_LVL, + "loadv_left", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_LVR, + "loadv_right", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_STVL, + "storev_left", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_STVR, + "storev_right", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 4aedfdd26..263d3675a 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) { // Testing for all 1's and all 0's. // if (Rc) CR6 = all_equal | 0 | none_equal | 0 // TODO(benvanik): efficient instruction? 
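The CR6 update below models the PowerPC record form of vector compares (vcmp*.): CR6 reports whether the 128-bit compare mask is all ones or all zeros. A scalar reference of that semantics, as a sketch (illustrative only, not code from the patch):

```cpp
#include <cstdint>

// PPC vcmp*. record form: CR6 reflects the compare mask as a whole.
// all_equal  <=> every byte of the 128-bit result is 0xFF
// none_equal <=> every byte of the 128-bit result is 0x00
struct CR6Result {
  bool all_equal;
  bool none_equal;
};
static CR6Result UpdateCR6Reference(const uint8_t mask[16]) {
  CR6Result r{true, true};
  for (int i = 0; i < 16; ++i) {
    r.all_equal = r.all_equal && mask[i] == 0xFF;
    r.none_equal = r.none_equal && mask[i] == 0x00;
  }
  return r;
}
```

On x64, both flags can come out of a single vptest of the mask, which is presumably what the "efficient instruction?" TODO is after.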
+ + // chrispy: nothing seems to write cr6_1, figure out if no documented + // instructions write anything other than 0 to it and remove these stores if + // so StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_all_equal), diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index db8c874f2..a670cc9d6 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -7,18 +7,17 @@ ****************************************************************************** */ +#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include #include #include #include - #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/draw_util.h" @@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() { bool draw_resolution_scale_not_clamped = TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, draw_resolution_scale_y); + if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( draw_resolution_scale_x, draw_resolution_scale_y, provider)) { draw_resolution_scale_not_clamped = false; @@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() { CommandProcessor::ShutdownContext(); } - +// todo: bit-pack the bools and use bitarith to reduce branches void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { CommandProcessor::WriteRegister(index, value); - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && - index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (frame_open_) { - uint32_t float_constant_index = - (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_pixel_.up_to_date = false; - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_vertex_.up_to_date = false; + bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date; + bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date; + bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date; + + if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && + index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { + cbuffer_binding_fetch_.up_to_date = false; + // texture cache is never nullptr + // if (texture_cache_ != nullptr) { + texture_cache_->TextureFetchConstantWritten( + (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + // } + } else { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd | + cbuf_binding_bool_loop_utd)) { + return; + } + + if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && + index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) { + return; + } + if (frame_open_) { + uint32_t float_constant_index = + (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; + if (float_constant_index >= 256) { + float_constant_index -= 256; + if 
(current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_pixel_.up_to_date = false; + } + } else { + if (current_float_constant_map_vertex_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_vertex_.up_to_date = false; + } } } - } - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && - index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { - cbuffer_binding_bool_loop_.up_to_date = false; - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - if (texture_cache_ != nullptr) { - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && + index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { + cbuffer_binding_bool_loop_.up_to_date = false; } } } @@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); draw_util::ViewportInfo viewport_info; - draw_util::GetHostViewportInfo( - regs, draw_resolution_scale_x, draw_resolution_scale_y, true, + draw_util::GetViewportInfoArgs gviargs{}; + + gviargs.Setup( + draw_resolution_scale_x, draw_resolution_scale_y, + texture_cache_->draw_resolution_scale_x_divisor(), + texture_cache_->draw_resolution_scale_y_divisor(), true, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, normalized_depth_control, host_render_targets_used && render_target_cache_->depth_float24_convert_in_pixel_shader(), - host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), - viewport_info); + host_render_targets_used, pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + if (gviargs == previous_viewport_info_args_) { + viewport_info = previous_viewport_info_; + } else { + draw_util::GetHostViewportInfo(&gviargs, viewport_info); + previous_viewport_info_args_ = gviargs; + previous_viewport_info_ = viewport_info; + } draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.offset[0] *= draw_resolution_scale_x; @@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() { shared_memory_->InitializeTraceCompleteDownloads(); } } +static void DmaPrefunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = job->size; + void* readback_mapping; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + + HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping); + xenia_assert(SUCCEEDED(mapres)); + + job->source = (uint8_t*)readback_mapping; +} + +static void DmaPostfunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_write_range = {}; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + readback_buffer->Unmap(0, &readback_write_range); +} bool D3D12CommandProcessor::IssueCopy() { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() { readback_buffer, 0, shared_memory_buffer, written_address, written_length); if (AwaitAllQueueOperationsCompletion()) { +#if 1 D3D12_RANGE readback_range; readback_range.Begin = 0; readback_range.End = written_length; void* readback_mapping; if (SUCCEEDED( readback_buffer->Map(0, &readback_range, &readback_mapping))) { 
- std::memcpy(memory_->TranslatePhysical(written_address), - readback_mapping, written_length); + // chrispy: this memcpy needs to be optimized as much as possible + + auto physaddr = memory_->TranslatePhysical(written_address); + dma::vastcpy(physaddr, (uint8_t*)readback_mapping, + written_length); + // XEDmaCpy(physaddr, readback_mapping, written_length); D3D12_RANGE readback_write_range = {}; readback_buffer->Unmap(0, &readback_write_range); } + +#else + dma::XeDMAJob job{}; + job.destination = memory_->TranslatePhysical(written_address); + job.size = written_length; + job.source = nullptr; + job.userdata1 = (void*)readback_buffer; + job.precall = DmaPrefunc; + job.postcall = DmaPostfunc; + + readback_available_ = GetDMAC()->PushDMAJob(&job); + +#endif } } } @@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings( if (bool_loop_constants == nullptr) { return false; } - std::memcpy(bool_loop_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, - kBoolLoopConstantsSize); + xe::smallcpy_const<kBoolLoopConstantsSize>( + bool_loop_constants, + &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32); + cbuffer_binding_bool_loop_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_bool_loop_constants); @@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings( if (fetch_constants == nullptr) { return false; } - std::memcpy(fetch_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, - kFetchConstantsSize); + xe::smallcpy_const<kFetchConstantsSize>( + fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32); + cbuffer_binding_fetch_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_fetch_constants); @@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } + #if 0 + if (readback_available_) { + GetDMAC()->WaitJobDone(readback_available_); + readback_available_ = 0; + } + #endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); @@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { readback_buffer_->Release(); } readback_buffer_ = buffer; + readback_buffer_size_ = size; } return readback_buffer_; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 2b43233b9..e64447a4e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -1,3 +1,4 @@ +/** /** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * @@ -19,6 +20,7 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/dma.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" @@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor { static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; ID3D12Resource* readback_buffer_ = nullptr; + dma::DMACJobHandle readback_available_ = 0; uint32_t readback_buffer_size_ = 0; std::atomic<bool> pix_capture_requested_ = false; @@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor { DxbcShaderTranslator::SystemConstants system_constants_;
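The xe::smallcpy_const<N> helper that replaces the two memcpys above is not shown in this patch; presumably it is a compile-time-sized copy, so the compiler can emit straight-line moves instead of a generic memcpy dispatch. A sketch of the presumed shape (an assumption, not xenia's actual implementation):

```cpp
#include <cstddef>
#include <cstring>

// Presumed shape of xe::smallcpy_const: N is a compile-time constant
// (kBoolLoopConstantsSize / kFetchConstantsSize at the call sites above),
// so the memcpy lowers to fully unrolled vector moves with no runtime
// size dispatch.
template <size_t N>
void smallcpy_const_sketch(void* dst, const void* src) {
  std::memcpy(dst, src, N);
}
```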
// Float constant usage masks of the last draw call. - uint64_t current_float_constant_map_vertex_[4]; - uint64_t current_float_constant_map_pixel_[4]; - + // chrispy: make sure accesses to these can't cross cacheline boundaries + struct alignas(XE_HOST_CACHE_LINE_SIZE) { + uint64_t current_float_constant_map_vertex_[4]; + uint64_t current_float_constant_map_pixel_[4]; + }; // Constant buffer bindings. struct ConstantBufferBinding { D3D12_GPU_VIRTUAL_ADDRESS address; @@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor { // Current primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology_; + + draw_util::GetViewportInfoArgs previous_viewport_info_args_; + draw_util::ViewportInfo previous_viewport_info_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 10017fff1..02d1f6750 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, return false; } -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +static float ViewportRecip2_0(float f) { + float f1 = ArchReciprocalRefined(f); + return f1 + f1; +} + +// chrispy: todo, the int/float divides and the nan-checked mins show up +// relatively high on uprof when I underclock to 1.7 GHz +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out) { - assert_not_zero(draw_resolution_scale_x); - assert_not_zero(draw_resolution_scale_y); + assert_not_zero(args->draw_resolution_scale_x); + assert_not_zero(args->draw_resolution_scale_y); // A vertex position goes the following path: // @@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs, // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for // D24FS8 as well. - auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); - auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); - auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); - auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); + auto pa_cl_clip_cntl = args->pa_cl_clip_cntl; + auto pa_cl_vte_cntl = args->pa_cl_vte_cntl; + auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl; + auto pa_su_vtx_cntl = args->pa_su_vtx_cntl; // Obtain the original viewport values in a normalized way. float scale_xy[] = { - pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 - : 1.0f, - pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1.0f, + + pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f, + pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f, }; - float scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; + float scale_z = + pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f; + float offset_base_xy[] = { - pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0.0f, - pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0.0f, + pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f, + pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f, }; - float offset_z = pa_cl_vte_cntl.vport_z_offset_ena ?
args->PA_CL_VPORT_ZOFFSET : 0.0f; // Calculate all the integer.0 or integer.5 offsetting exactly at full // precision, separately so it can be used in other integer calculations // without double rounding if needed. float offset_add_xy[2] = {}; if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { - auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); + auto pa_sc_window_offset = args->pa_sc_window_offset; offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); } @@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs, // The maximum value is at least the maximum host render target size anyway - // and a guest pixel is always treated as a whole with resolution scaling. - uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, - y_max / draw_resolution_scale_y}; + // chrispy: todo, these integer divides show up high on the profiler somehow + // (it was a very long session, too) + uint32_t xy_max_unscaled[] = { + args->draw_resolution_scale_x_divisor.Apply(args->x_max), + args->draw_resolution_scale_y_divisor.Apply(args->y_max)}; assert_not_zero(xy_max_unscaled[0]); assert_not_zero(xy_max_unscaled[1]); @@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs, std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); viewport_info_out.xy_extent[i] = extent_axis_unscaled * - (i ? draw_resolution_scale_y : draw_resolution_scale_x); + (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x); float extent_axis_unscaled_float = float(extent_axis_unscaled); - float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float; + + float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float); + ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + offset_add_xy[i]) * @@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // doing truncation for simplicity - since maxing with 0 is done anyway // (we only return viewports in the positive quarter-plane). uint32_t axis_resolution_scale = - i ? draw_resolution_scale_y : draw_resolution_scale_x; + i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x; float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float scale_axis = scale_xy[i]; float scale_axis_abs = std::abs(scale_xy[i]); @@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs, // space, a region previously outside -W...W should end up within it, so // the scale should be < 1. float axis_extent_rounded = float(axis_extent_int); - ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; + float inv_axis_extent_rounded = + ArchReciprocalRefined(axis_extent_rounded); + + ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded; // Move the origin of the snapped coordinates back to the original one. ndc_offset_axis = (float(offset_axis) - (float(axis_0_int) + axis_extent_rounded * 0.5f)) * - 2.0f / axis_extent_rounded; + 2.0f * inv_axis_extent_rounded; } else { // Empty viewport (everything outside the viewport scissor).
ndc_scale_axis = 1.0f; @@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs, ndc_scale[2] = 0.5f; ndc_offset[2] = 0.5f; } - if (pixel_shader_writes_depth) { + if (args->pixel_shader_writes_depth) { // Allow the pixel shader to write any depth value since // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel // shaders don't have access to the original Z in the viewport space @@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // Direct3D 12 doesn't allow reverse depth range - on some drivers it // works, on some drivers it doesn't, actually, but it was never // explicitly allowed by the specification. - if (!allow_reverse_z && z_min > z_max) { + if (!args->allow_reverse_z && z_min > z_max) { std::swap(z_min, z_max); ndc_scale[2] = -ndc_scale[2]; ndc_offset[2] = 1.0f - ndc_offset[2]; @@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs, } } - if (normalized_depth_control.z_enable && - regs.Get().depth_format == - xenos::DepthRenderTargetFormat::kD24FS8) { - if (convert_z_to_float24) { + if (args->normalized_depth_control.z_enable && + args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + if (args->convert_z_to_float24) { // Need to adjust the bounds that the resulting depth values will be // clamped to after the pixel shader. Preferring adding some error to // interpolated Z instead if conversion can't be done exactly, without @@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs, z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); } - if (full_float24_in_0_to_1) { + if (args->full_float24_in_0_to_1) { // Remap the full [0...2) float24 range to [0...1) support data round-trip // during render target ownership transfer of EDRAM tiles through depth // input without unrestricted depth range. 
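The float24 bound adjustment just above is a round-trip through the guest depth encoding, using the same xenos.h helpers the hunk calls; condensed into a wrapper (the wrapper name is mine, the calls are verbatim from the patch):

```cpp
// Snap a host float32 depth bound to a value exactly representable in the
// guest's 20e4 float24 format, so post-shader clamping matches what the
// Xenos would have stored (the boolean argument mirrors the call sites
// above).
static float SnapDepthBoundToFloat24(float z) {
  return xenos::Float20e4To32(xenos::Float32To20e4(z, true));
}
```

Applying it to both z_min and z_max keeps the clamp bounds consistent with values that survive a depth-buffer store/load cycle.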
@@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.z_min = z_min; viewport_info_out.z_max = z_max; - if (origin_bottom_left) { + if (args->origin_bottom_left) { ndc_scale[1] = -ndc_scale[1]; ndc_offset[1] = -ndc_offset[1]; } @@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.ndc_offset[i] = ndc_offset[i]; } } - void GetScissor(const RegisterFile& regs, Scissor& scissor_out, bool clamp_to_surface_pitch) { auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>(); @@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize); } - //fails in forza horizon 1 + // fails in forza horizon 1 assert_true(x0 < x1 && y0 < y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index a365b5436..2f27f6e01 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -277,18 +277,151 @@ struct ViewportInfo { float ndc_scale[3]; float ndc_offset[3]; }; +static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t), + "A size change in DepthRenderTargetFormat throws off the " + "GetViewportInfoArgs bitfield packing"); +struct GetViewportInfoArgs { + union alignas(64) { + struct { + // group 1 + uint32_t x_max; + uint32_t y_max; + union { + struct { + uint32_t origin_bottom_left : 1; + uint32_t allow_reverse_z : 1; + uint32_t convert_z_to_float24 : 1; + uint32_t full_float24_in_0_to_1 : 1; + uint32_t pixel_shader_writes_depth : 1; + xenos::DepthRenderTargetFormat depth_format : 1; + }; + uint32_t packed_portions; + }; + reg::RB_DEPTHCONTROL normalized_depth_control; + // group 2 + reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl; + reg::PA_CL_VTE_CNTL pa_cl_vte_cntl; + reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl; + reg::PA_SU_VTX_CNTL pa_su_vtx_cntl; + // group 3 + reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset; + float PA_CL_VPORT_XSCALE; + float PA_CL_VPORT_YSCALE; + float PA_CL_VPORT_ZSCALE; + + float PA_CL_VPORT_XOFFSET; + float PA_CL_VPORT_YOFFSET; + float PA_CL_VPORT_ZOFFSET; + uint32_t padding_set_to_0; + }; +#if XE_ARCH_AMD64 == 1 + struct { + __m128i first4; // x_max, y_max, packed_portions, + // normalized_depth_control + __m128i second4; // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl, + // pa_su_vtx_cntl + __m128i third4; // pa_sc_window_offset, PA_CL_VPORT_XSCALE, + // PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE + __m128i last4; // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET, + // PA_CL_VPORT_ZOFFSET, padding_set_to_0 + }; +#endif + }; + + // everything that follows here does not need to be compared + uint32_t draw_resolution_scale_x; + uint32_t draw_resolution_scale_y; + divisors::MagicDiv draw_resolution_scale_x_divisor; + divisors::MagicDiv draw_resolution_scale_y_divisor; + void Setup(uint32_t _draw_resolution_scale_x, + uint32_t _draw_resolution_scale_y, + divisors::MagicDiv _draw_resolution_scale_x_divisor, + divisors::MagicDiv _draw_resolution_scale_y_divisor, + bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max, + bool _allow_reverse_z, + reg::RB_DEPTHCONTROL _normalized_depth_control, + bool _convert_z_to_float24, bool _full_float24_in_0_to_1, + bool _pixel_shader_writes_depth) { + packed_portions = 0; + padding_set_to_0 = 0; // important to zero this: the AMD64 operator== + // compares every bit of the union, padding included + draw_resolution_scale_x = _draw_resolution_scale_x; + draw_resolution_scale_y = _draw_resolution_scale_y; + draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor; + draw_resolution_scale_y_divisor =
_draw_resolution_scale_y_divisor; + origin_bottom_left = _origin_bottom_left; + x_max = _x_max; + y_max = _y_max; + allow_reverse_z = _allow_reverse_z; + normalized_depth_control = _normalized_depth_control; + convert_z_to_float24 = _convert_z_to_float24; + full_float24_in_0_to_1 = _full_float24_in_0_to_1; + pixel_shader_writes_depth = _pixel_shader_writes_depth; + } + + void SetupRegisterValues(const RegisterFile& regs) { + pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); + pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); + pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); + pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); + PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; + PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; + PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; + PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; + PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; + pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); + depth_format = regs.Get<reg::RB_DEPTH_INFO>().depth_format; + } + XE_FORCEINLINE + bool operator==(const GetViewportInfoArgs& prev) { +#if XE_ARCH_AMD64 == 0 + bool result = true; + + auto accum_eq = [&result](auto x, auto y) { result &= (x == y); }; + +#define EQC(field) accum_eq(field, prev.field) + EQC(x_max); + EQC(y_max); + EQC(packed_portions); + EQC(normalized_depth_control.value); + EQC(pa_cl_clip_cntl.value); + EQC(pa_cl_vte_cntl.value); + + EQC(pa_su_sc_mode_cntl.value); + EQC(pa_su_vtx_cntl.value); + EQC(PA_CL_VPORT_XSCALE); + EQC(PA_CL_VPORT_YSCALE); + EQC(PA_CL_VPORT_ZSCALE); + EQC(PA_CL_VPORT_XOFFSET); + EQC(PA_CL_VPORT_YOFFSET); + EQC(PA_CL_VPORT_ZOFFSET); + EQC(pa_sc_window_offset.value); + +#undef EQC + return result; +#else + __m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4); + __m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4); + + __m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4); + __m128i unified1 = _mm_and_si128(mask1, mask2); + __m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4); + + __m128i unified2 = _mm_and_si128(unified1, mask3); + + __m128i unified3 = _mm_and_si128(unified2, mask4); + + return _mm_movemask_epi8(unified3) == 0xFFFF; + +#endif + } +}; + // Converts the guest viewport (or fakes one if drawing without a viewport) to // a viewport, plus values to multiply-add the returned position by, usable on // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // Direct3D clip space with 0...W Z rather than -W...W. -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out); struct Scissor { diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2695b22d9..54c8b3655 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done, } // Make sure the same render target isn't bound into two different slots // over time.
- for (uint32_t i = 1; are_accumulated_render_targets_valid_ && - i < 1 + xenos::kMaxColorRenderTargets; - ++i) { - const RenderTarget* render_target = - last_update_accumulated_render_targets_[i]; - if (!render_target) { - continue; - } - for (uint32_t j = 0; j < i; ++j) { - if (last_update_accumulated_render_targets_[j] == render_target) { - are_accumulated_render_targets_valid_ = false; - break; + // chrispy: this needs optimization! + if (are_accumulated_render_targets_valid_) { + for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) { + const RenderTarget* render_target = + last_update_accumulated_render_targets_[i]; + if (!render_target) { + continue; + } + for (uint32_t j = 0; j < i; ++j) { + if (last_update_accumulated_render_targets_[j] == render_target) { + are_accumulated_render_targets_valid_ = false; + goto exit_slot_check_loop; + } } } + exit_slot_check_loop:; } } if (!are_accumulated_render_targets_valid_) { diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc index 05f6e2090..b697ff73c 100644 --- a/src/xenia/gpu/texture_cache.cc +++ b/src/xenia/gpu/texture_cache.cc @@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file, : register_file_(register_file), shared_memory_(shared_memory), draw_resolution_scale_x_(draw_resolution_scale_x), - draw_resolution_scale_y_(draw_resolution_scale_y) { + draw_resolution_scale_y_(draw_resolution_scale_y), + draw_resolution_scale_x_divisor_(draw_resolution_scale_x), + draw_resolution_scale_y_divisor_(draw_resolution_scale_y) { assert_true(draw_resolution_scale_x >= 1); assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis); assert_true(draw_resolution_scale_y >= 1); @@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x)); uint32_t config_y = uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y)); + uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x); uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y); x_out = clamped_x; @@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() { } void TextureCache::Texture::WatchCallback( - [[maybe_unused]] const global_unique_lock_type& global_lock, - bool is_mip) { + [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) { if (is_mip) { assert_not_zero(GetGuestMipsSize()); mips_outdated_ = true; @@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback( } void TextureCache::WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu) { + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu) { Texture& texture = *static_cast<Texture*>(context); texture.WatchCallback(global_lock, argument != 0); texture.texture_cache().texture_became_outdated_.store( @@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk( } void TextureCache::ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) { + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu) { assert_true(IsDrawResolutionScaled()); if (invalidated_by_gpu) { // Resolves themselves do exactly the opposite of what this should do.
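On the "this needs optimization!" note in the render-target hunk above: with only 1 + xenos::kMaxColorRenderTargets slots, the quadratic scan is at most ten pointer compares, so the realistic cleanup is structural rather than asymptotic. One possible shape that drops the goto (a sketch, not code in the tree):

```cpp
// Returns true if any non-null render target appears in two slots.
// Mirrors the loop above: i starts at 1, j scans every earlier slot
// (including the depth slot at index 0).
static bool HasDuplicateRenderTargetBinding(const RenderTarget* const* rts,
                                            uint32_t count) {
  for (uint32_t i = 1; i < count; ++i) {
    if (!rts[i]) continue;
    for (uint32_t j = 0; j < i; ++j) {
      if (rts[j] == rts[i]) return true;
    }
  }
  return false;
}

// usage sketch:
// are_accumulated_render_targets_valid_ =
//     !HasDuplicateRenderTargetBinding(last_update_accumulated_render_targets_,
//                                      1 + xenos::kMaxColorRenderTargets);
```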
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index cb690a286..c028c1be4 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -19,6 +19,7 @@ #include "xenia/base/assert.h" #include "xenia/base/hash.h" +#include "xenia/base/math.h" #include "xenia/base/mutex.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shared_memory.h" @@ -70,6 +71,14 @@ class TextureCache { static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out); uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; } uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; } + + divisors::MagicDiv draw_resolution_scale_x_divisor() const { + return draw_resolution_scale_x_divisor_; + } + divisors::MagicDiv draw_resolution_scale_y_divisor() const { + return draw_resolution_scale_y_divisor_; + } + bool IsDrawResolutionScaled() const { return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1; } @@ -576,8 +585,8 @@ class TextureCache { // Shared memory callback for texture data invalidation. static void WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu); + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu); // Checks if there are any pages that contain scaled resolve data within the // range. @@ -588,14 +597,15 @@ class TextureCache { const global_unique_lock_type& global_lock, void* context, uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); void ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu); const RegisterFile& register_file_; SharedMemory& shared_memory_; uint32_t draw_resolution_scale_x_; uint32_t draw_resolution_scale_y_; - + divisors::MagicDiv draw_resolution_scale_x_divisor_; + divisors::MagicDiv draw_resolution_scale_y_divisor_; static const LoadShaderInfo load_shader_info_[kLoadShaderCount]; xe::global_critical_region global_critical_region_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 47d7506e6..9ac9f13e2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; + // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and // maxViewportDimensions must be greater than or equal to the size of the @@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. 
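A note on divisors::MagicDiv, which this diff threads from the TextureCache accessors above into GetViewportInfoArgs: the draw resolution scale is fixed at texture-cache construction, so the per-draw `x_max / draw_resolution_scale_x` divides can be replaced with a multiply-shift by precomputed constants. A generic sketch of the technique (my illustration; xenia's actual MagicDiv layout and rounding fixups may differ):

```cpp
#include <cstdint>

// Unsigned division by a fixed divisor d via one 64-bit multiply and a
// shift: multiplier = ceil(2^(32+s) / d) with s = ceil(log2(d)). Valid as
// long as x * multiplier fits in 64 bits, which holds for the small
// viewport extents divided here; a fully general version needs an extra
// rounding fixup.
struct MagicDivSketch {
  uint64_t multiplier;
  uint32_t shift;
  explicit MagicDivSketch(uint32_t d) : multiplier(0), shift(0) {
    while ((1ull << shift) < d) ++shift;
    multiplier = ((1ull << (32 + shift)) + d - 1) / d;
  }
  uint32_t Apply(uint32_t x) const {
    return static_cast<uint32_t>((x * multiplier) >> (32 + shift));
  }
};

// e.g. MagicDivSketch{3}.Apply(x) == x / 3 for viewport-sized x.
```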
- draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_limits.maxViewportDimensions[0], - device_limits.maxViewportDimensions[1], true, normalized_depth_control, - false, host_render_targets_used, - pixel_shader && pixel_shader->writes_depth(), viewport_info); + draw_util::GetViewportInfoArgs gviargs{}; + gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false, + device_limits.maxViewportDimensions[0], + + device_limits.maxViewportDimensions[1], true, + normalized_depth_control, false, host_render_targets_used, + pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + draw_util::GetHostViewportInfo(&gviargs, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal, diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 55a16df1d..df59b561c 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) { format == ColorRenderTargetFormat::k_32_32_FLOAT; } -inline uint32_t GetColorRenderTargetFormatComponentCount( +// 2 bits per entry, storing (component count - 1): 0 -> 1, 1 -> 2, 3 -> 4 +// (no format has exactly 3 components). Decode by shifting and adding 1. + +using ColorFormatComponentTable = uint32_t; + +static constexpr uint32_t GetComponentCountConst( ColorRenderTargetFormat format) { switch (format) { case ColorRenderTargetFormat::k_8_8_8_8: @@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount( case ColorRenderTargetFormat::k_16_16_16_16: case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: - return 4; + return 4 - 1; case ColorRenderTargetFormat::k_16_16: case ColorRenderTargetFormat::k_16_16_FLOAT: case ColorRenderTargetFormat::k_32_32_FLOAT: - return 2; + return 2 - 1; case ColorRenderTargetFormat::k_32_FLOAT: - return 1; + return 1 - 1; default: - assert_unhandled_case(format); return 0; } } +namespace detail { +static constexpr uint32_t encode_format_component_table() { + uint32_t result = 0; +#define ADDFORMAT(name) \ + result |= GetComponentCountConst(ColorRenderTargetFormat::name) \ + << (static_cast<uint32_t>(ColorRenderTargetFormat::name) * 2) + ADDFORMAT(k_8_8_8_8); + ADDFORMAT(k_8_8_8_8_GAMMA); + ADDFORMAT(k_2_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT); + + ADDFORMAT(k_16_16_16_16); + ADDFORMAT(k_16_16_16_16_FLOAT); + ADDFORMAT(k_2_10_10_10_AS_10_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16); + + ADDFORMAT(k_16_16); + ADDFORMAT(k_16_16_FLOAT); + ADDFORMAT(k_32_32_FLOAT); + ADDFORMAT(k_32_FLOAT); + return result; +} constexpr uint32_t color_format_component_table = encode_format_component_table(); } // namespace detail +constexpr uint32_t GetColorRenderTargetFormatComponentCount( ColorRenderTargetFormat format) { return ((detail::color_format_component_table >> (static_cast<uint32_t>(format) * 2)) & 0b11) + 1; } // Returns the version of the format with the same packing and meaning of values // stored in it, but without blending precision modifiers. constexpr ColorRenderTargetFormat GetStorageColorFormat(