From f31869092c39319533f85b39c96c2f6d7f8e730a Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sun, 28 Aug 2022 14:24:25 -0700
Subject: [PATCH] Fixed a bug with readback_resolve and readback_memexport that
 was responsible for a large portion of their overhead.

readback_memexport and readback_resolve are now usable in games, depending on
your hardware; in my case, games that were slideshows now run at around 20-30
FPS, and my hardware isn't the best for Xenia.

Add a split_map class for mapping keys to values in a way that optimizes for
frequent searches and infrequent insertions/removals.

Remove the jump-table implementation of
GetColorRenderTargetFormatComponentCount; it was appearing relatively high in
profiles. Instead, pack the component counts into a single 32-bit word that is
indexed by shifting (see the sketch below).

Add a cvar to align all basic blocks to a boundary.

Add MMIO-aware load paths.

Liberally apply XE_RESTRICT in ring-buffer-related code.

Removed the IS_TRUE and IS_FALSE opcodes; they were pointless duplicates of
COMPARE_EQ/COMPARE_NE, and I want to simplify our set of opcodes for future
backends.

More work on the LVSR/LVSL/STVR/STVL opcodes.

Optimized x64 translated code emission; the InstrKey is now computed only once.

Add code for pre-computing integer division magic numbers.

Optimized GetHostViewportInfo a little: moved its arguments into a class,
cache the result, and compare against it for future queries. This moved
GetHostViewportInfo far lower on the profile.

Add (currently not functional, and very racy) asynchronous memcpy code; will
improve it and actually use it in future commits.

Add a non-temporal memcpy function for huge page-aligned allocations, used for
copying to shared memory/readback.

Hoist the are_accumulated_render_targets_valid_ check in render_target_cache
out of the already-bound check loop.
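As a rough illustration of that packed-word lookup, here is a sketch; the field
width and the per-format counts are placeholders, not the actual table added to
xenos.h:

// Sketch: each color render-target format owns a small fixed-width field inside
// one 32-bit constant, so the count lookup is a shift and a mask instead of a
// jump table. Counts shown are placeholders.
static constexpr uint32_t kPackedComponentCounts =
    (4u << (0 * 4)) | (1u << (1 * 4)) | (2u << (2 * 4)) | (4u << (3 * 4));

static uint32_t GetComponentCountPacked(uint32_t format) {
  return (kPackedComponentCounts >> (format * 4)) & 0xF;
}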
Add stosb/movsb code for small constant memcpys/memsets that arent worth the overhead of memcpy/memset --- src/xenia/base/dma.cc | 415 ++++++++++++++++++ src/xenia/base/dma.h | 46 ++ src/xenia/base/math.h | 132 +++++- src/xenia/base/memory.h | 128 +++++- src/xenia/base/ring_buffer.h | 36 +- src/xenia/base/split_map.h | 87 ++++ src/xenia/cpu/backend/x64/x64_emitter.cc | 87 ++-- src/xenia/cpu/backend/x64/x64_emitter.h | 17 +- src/xenia/cpu/backend/x64/x64_op.h | 52 +-- src/xenia/cpu/backend/x64/x64_seq_control.cc | 84 ++-- src/xenia/cpu/backend/x64/x64_seq_memory.cc | 159 +++++-- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 10 +- src/xenia/cpu/backend/x64/x64_sequences.cc | 57 +-- src/xenia/cpu/backend/x64/x64_sequences.h | 2 +- .../passes/constant_propagation_pass.cc | 24 +- .../compiler/passes/simplification_pass.cc | 69 ++- src/xenia/cpu/hir/hir_builder.cc | 73 ++- src/xenia/cpu/hir/hir_builder.h | 6 + src/xenia/cpu/hir/instr.cc | 12 + src/xenia/cpu/hir/instr.h | 2 + src/xenia/cpu/hir/opcodes.h | 10 +- src/xenia/cpu/hir/opcodes.inl | 34 +- src/xenia/cpu/ppc/ppc_hir_builder.cc | 4 + .../gpu/d3d12/d3d12_command_processor.cc | 151 +++++-- src/xenia/gpu/d3d12/d3d12_command_processor.h | 14 +- src/xenia/gpu/draw_util.cc | 96 ++-- src/xenia/gpu/draw_util.h | 149 ++++++- src/xenia/gpu/render_target_cache.cc | 26 +- src/xenia/gpu/texture_cache.cc | 16 +- src/xenia/gpu/texture_cache.h | 20 +- .../gpu/vulkan/vulkan_command_processor.cc | 16 +- src/xenia/gpu/xenos.h | 49 ++- 32 files changed, 1576 insertions(+), 507 deletions(-) create mode 100644 src/xenia/base/dma.cc create mode 100644 src/xenia/base/dma.h create mode 100644 src/xenia/base/split_map.h diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc new file mode 100644 index 000000000..7d2d9d80c --- /dev/null +++ b/src/xenia/base/dma.cc @@ -0,0 +1,415 @@ +#include "dma.h" + +template +static void xedmaloghelper(const char (&fmt)[N], Ts... args) { + char buffer[1024]; + sprintf_s(buffer, fmt, args...); + XELOGI("%s", buffer); +} + +//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__) +//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__) +#define XEDMALOG(...) 
static_cast(0) +using xe::swcache::CacheLine; +static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); + +XE_FORCEINLINE +static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { + xe::swcache::CacheLine line0, line1, line2, line3; + + xe::swcache::ReadLine(&line0, src1 + i); + xe::swcache::ReadLine(&line1, src2 + i); + xe::swcache::ReadLine(&line2, src3 + i); + xe::swcache::ReadLine(&line3, src4 + i); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(dest1 + i, &line0); + xe::swcache::WriteLineNT(dest2 + i, &line1); + + xe::swcache::WriteLineNT(dest3 + i, &line2); + xe::swcache::WriteLineNT(dest4 + i, &line3); + } + XE_MSVC_REORDER_BARRIER(); +} + +namespace xe::dma { +XE_FORCEINLINE +static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384Streaming(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + xe::swcache::CacheLine line0, line1; + + xe::swcache::ReadLine(&line0, rdmapping + i); + + xe::swcache::ReadLine(&line1, rdmapping + i + 1); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(physaddr + i, &line0); + xe::swcache::WriteLineNT(physaddr + i + 1, &line1); + } + + if (i < num_written_lines) { + xe::swcache::CacheLine line0; + + xe::swcache::ReadLine(&line0, rdmapping + i); + xe::swcache::WriteLineNT(physaddr + i, &line0); + } +} + +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length) { + return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping, + written_length); +} + +#define XEDMA_NUM_WORKERS 4 +class alignas(256) XeDMACGeneric : public XeDMAC { + struct alignas(XE_HOST_CACHE_LINE_SIZE) { + std::atomic free_job_slots_; + std::atomic jobs_submitted_; + std::atomic jobs_completed_; + std::atomic num_workers_awoken_; + std::atomic current_job_serial_; + + } dma_volatile_; + + alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64]; + + volatile uint32_t jobserials_[64]; + + alignas(XE_HOST_CACHE_LINE_SIZE) + std::unique_ptr job_done_signals_[64]; + // really dont like using unique pointer for this... 
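The slot-management helpers below walk the 64-bit free_job_slots_ mask one bit
at a time; the author's own comments flag this as replaceable with a bit-scan
intrinsic. A hedged sketch of that alternative, assuming BMI1's _tzcnt_u64 from
<immintrin.h> is available:

// Sketch only, not part of the patch: locate the lowest clear bit of the
// occupancy mask with a single bit-scan instead of a 64-iteration loop.
// Returns ~0u when every slot is taken.
static uint32_t find_free_hole_via_tzcnt(uint64_t dw) {
  uint64_t free_bits = ~dw;  // free slots become set bits
  if (!free_bits) {
    return ~0u;
  }
  return static_cast<uint32_t>(_tzcnt_u64(free_bits));
}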
+ std::unique_ptr job_submitted_signal_; + std::unique_ptr job_completed_signal_; + + std::unique_ptr scheduler_thread_; + struct WorkSlice { + uint8_t* destination; + uint8_t* source; + size_t numbytes; + }; + std::unique_ptr workers_[XEDMA_NUM_WORKERS]; + std::unique_ptr worker_has_work_; //[XEDMA_NUM_WORKERS]; + std::unique_ptr worker_has_finished_[XEDMA_NUM_WORKERS]; + + threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS]; + WorkSlice worker_workslice_[XEDMA_NUM_WORKERS]; + + // chrispy: this is bad + static uint32_t find_free_hole_in_dword(uint64_t dw) { + XEDMALOG("Finding free hole in 0x%llX", dw); + + for (uint32_t i = 0; i < 64; ++i) { + if (dw & (1ULL << i)) { + continue; + } + + return i; + } + return ~0U; + } + + uint32_t allocate_free_dma_slot() { + XEDMALOG("Allocating free slot"); + uint32_t got_slot = 0; + uint64_t slots; + uint64_t allocated_slot; + + do { + slots = dma_volatile_.free_job_slots_.load(); + + got_slot = find_free_hole_in_dword(slots); + if (!~got_slot) { + XEDMALOG("Didn't get a slot!"); + return ~0U; + } + allocated_slot = slots | (1ULL << got_slot); + + } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong( + slots, allocated_slot))); + XEDMALOG("Allocated slot %d", got_slot); + return got_slot; + } + // chrispy: on x86 this can just be interlockedbittestandreset... + void free_dma_slot(uint32_t slot) { + XEDMALOG("Freeing slot %d", slot); + uint64_t slots; + + uint64_t deallocated_slot; + + do { + slots = dma_volatile_.free_job_slots_.load(); + + deallocated_slot = slots & (~(1ULL << slot)); + + } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong( + slots, deallocated_slot))); + } + + void DoDMAJob(uint32_t idx) { + XeDMAJob& job = jobs_[idx]; + if (job.precall) { + job.precall(&job); + } + // memcpy(job.destination, job.source, job.size); + + size_t job_size = job.size; + + size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE; + + size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE; + + size_t rem = job_size - line_rounded; + + size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS; + + XEDMALOG( + "Distributing %d bytes from %p to %p across %d workers, remainder is " + "%d", + line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem); + if (num_per_worker < 2048) { + XEDMALOG("not distributing across workers, num_per_worker < 8192"); + // not worth splitting up + memcpy(job.destination, job.source, job.size); + job.signal_on_done->Set(); + } else { + for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) { + worker_workslice_[i].destination = + (i * num_per_worker) + job.destination; + worker_workslice_[i].source = (i * num_per_worker) + job.source; + + worker_workslice_[i].numbytes = num_per_worker; + } + if (rem) { + __movsb(job.destination + line_rounded, job.source + line_rounded, rem); + } + // wake them up + worker_has_work_->Set(); + XEDMALOG("Starting waitall for job"); + threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS, + false); + + XEDMALOG("Waitall for job completed!"); + job.signal_on_done->Set(); + } + if (job.postcall) { + job.postcall(&job); + } + ++dma_volatile_.jobs_completed_; + } + + void WorkerIter(uint32_t worker_index) { + xenia_assert(worker_index < XEDMA_NUM_WORKERS); + auto [dest, src, size] = worker_workslice_[worker_index]; + + // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) { + worker_has_work_->Reset(); + //} + xenia_assert(size < (1ULL << 32)); + // memcpy(dest, src, size); + dma::vastcpy(dest, src, 
static_cast(size)); + } + XE_NOINLINE + void WorkerMainLoop(uint32_t worker_index) { + do { + XEDMALOG("Worker iter for worker %d", worker_index); + WorkerIter(worker_index); + + XEDMALOG("Worker %d is done\n", worker_index); + threading::SignalAndWait(worker_has_finished_[worker_index].get(), + worker_has_work_.get(), false); + } while (true); + } + void WorkerMain(uint32_t worker_index) { + XEDMALOG("Entered worker main loop, index %d", worker_index); + threading::Wait(worker_has_work_.get(), false); + XEDMALOG("First wait for worker %d completed, first job ever", + worker_index); + WorkerMainLoop(worker_index); + } + + static void WorkerMainForwarder(void* ptr) { + // we aligned XeDma to 256 bytes and encode extra info in the low 8 + uintptr_t uptr = (uintptr_t)ptr; + + uint32_t worker_index = (uint8_t)uptr; + + uptr &= ~0xFFULL; + + char name_buffer[64]; + sprintf_s(name_buffer, "dma_worker_%d", worker_index); + + xe::threading::set_name(name_buffer); + + reinterpret_cast(uptr)->WorkerMain(worker_index); + } + + void DMAMain() { + XEDMALOG("DmaMain"); + do { + threading::Wait(job_submitted_signal_.get(), false); + + auto slots = dma_volatile_.free_job_slots_.load(); + + for (uint32_t i = 0; i < 64; ++i) { + if (slots & (1ULL << i)) { + XEDMALOG("Got new job at index %d in DMAMain", i); + DoDMAJob(i); + + free_dma_slot(i); + + job_completed_signal_->Set(); + // break; + } + } + + } while (true); + } + + static void DMAMainForwarder(void* ud) { + xe::threading::set_name("dma_main"); + reinterpret_cast(ud)->DMAMain(); + } + + public: + virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override { + XEDMALOG("New job, %p to %p with size %d", job->source, job->destination, + job->size); + uint32_t slot; + do { + slot = allocate_free_dma_slot(); + if (!~slot) { + XEDMALOG( + "Didn't get a free slot, waiting for a job to complete before " + "resuming."); + threading::Wait(job_completed_signal_.get(), false); + + } else { + break; + } + + } while (true); + jobs_[slot] = *job; + + jobs_[slot].signal_on_done = job_done_signals_[slot].get(); + jobs_[slot].signal_on_done->Reset(); + XEDMALOG("Setting job submit signal, pushed into slot %d", slot); + + uint32_t new_serial = dma_volatile_.current_job_serial_++; + + jobserials_[slot] = new_serial; + + ++dma_volatile_.jobs_submitted_; + job_submitted_signal_->Set(); + return (static_cast(new_serial) << 32) | + static_cast(slot); + + // return job_done_signals_[slot].get(); + } + + bool AllJobsDone() { + return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_; + } + virtual void WaitJobDone(DMACJobHandle handle) override { + uint32_t serial = static_cast(handle >> 32); + uint32_t jobid = static_cast(handle); + do { + if (jobserials_[jobid] != serial) { + return; // done, our slot was reused + } + + auto waitres = threading::Wait(job_done_signals_[jobid].get(), false, + std::chrono::milliseconds{1}); + + if (waitres == threading::WaitResult::kTimeout) { + continue; + } else { + return; + } + } while (true); + } + virtual void WaitForIdle() override { + while (!AllJobsDone()) { + threading::MaybeYield(); + } + } + XeDMACGeneric() { + XEDMALOG("Constructing xedma at addr %p", this); + dma_volatile_.free_job_slots_.store(0ULL); + dma_volatile_.jobs_submitted_.store(0ULL); + dma_volatile_.jobs_completed_.store(0ULL); + dma_volatile_.current_job_serial_.store( + 1ULL); // so that a jobhandle is never 0 + std::memset(jobs_, 0, sizeof(jobs_)); + job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false); + job_completed_signal_ = 
threading::Event::CreateAutoResetEvent(false); + worker_has_work_ = threading::Event::CreateManualResetEvent(false); + threading::Thread::CreationParameters worker_params{}; + worker_params.create_suspended = false; + worker_params.initial_priority = threading::ThreadPriority::kBelowNormal; + worker_params.stack_size = 65536; // dont need much stack at all + + for (uint32_t i = 0; i < 64; ++i) { + job_done_signals_[i] = threading::Event::CreateManualResetEvent(false); + } + for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) { + // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false); + worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false); + worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get(); + + uintptr_t encoded = reinterpret_cast(this); + xenia_assert(!(encoded & 0xFFULL)); + xenia_assert(i < 256); + + encoded |= i; + + workers_[i] = threading::Thread::Create(worker_params, [encoded]() { + XeDMACGeneric::WorkerMainForwarder((void*)encoded); + }); + } + threading::Thread::CreationParameters scheduler_params{}; + scheduler_params.create_suspended = false; + scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal; + scheduler_params.stack_size = 65536; + scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() { + XeDMACGeneric::DMAMainForwarder((void*)this); + }); + } +}; +XeDMAC* CreateDMAC() { return new XeDMACGeneric(); } +} // namespace xe::dma diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h new file mode 100644 index 000000000..e95639753 --- /dev/null +++ b/src/xenia/base/dma.h @@ -0,0 +1,46 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_BASE_DMA_H_ +#define XENIA_BASE_DMA_H_ +#include "memory.h" +#include "threading.h" +namespace xe::dma { +struct XeDMAJob; +using DmaPrecall = void (*)(XeDMAJob* job); +using DmaPostcall = void (*)(XeDMAJob* job); +struct XeDMAJob { + threading::Event* signal_on_done; + uint8_t* destination; + uint8_t* source; + size_t size; + DmaPrecall precall; + DmaPostcall postcall; + void* userdata1; + void* userdata2; +}; +using DMACJobHandle = uint64_t; + +class XeDMAC { + public: + virtual ~XeDMAC() {} + virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0; + virtual void WaitJobDone(DMACJobHandle handle) = 0; + virtual void WaitForIdle() = 0; +}; + +XeDMAC* CreateDMAC(); +// must be divisible by cache line size +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length); + +} // namespace xe::dma + +#endif // XENIA_BASE_DMA_H_ diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 4cafc7178..6e323ede8 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } /* - - std::min/max float has handling for nans, where if either argument is nan the first argument is returned - minss/maxss are different, if either argument is nan the second operand to the instruction is returned - this is problematic because we have no assurances from the compiler on the argument ordering + std::min/max float has handling for nans, where if either argument is + nan the first argument is returned - so only use in places where nan handling is not needed + minss/maxss are different, if either argument is nan the second operand + to the instruction is returned this is problematic because we have no + assurances from the compiler on the argument ordering + + so only use in places where nan handling is not needed */ -static float xe_minf(float x, float y) { +XE_FORCEINLINE +static float ArchMin(float x, float y) { return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_maxf(float x, float y) { +XE_FORCEINLINE +static float ArchMax(float x, float y) { return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_rcpf(float den) { +XE_FORCEINLINE +static float ArchReciprocal(float den) { return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); } #else -static float xe_minf(float x, float y) { return std::min(x, y); } -static float xe_maxf(float x, float y) { return std::max(x, y); } -static float xe_rcpf(float den) { return 1.0f / den; } +static float ArchMin(float x, float y) { return std::min(x, y); } +static float ArchMax(float x, float y) { return std::max(x, y); } +static float ArchReciprocal(float den) { return 1.0f / den; } #endif +XE_FORCEINLINE +static float RefineReciprocal(float initial, float den) { + float t0 = initial * den; + float t1 = t0 * initial; + float rcp2 = initial + initial; + return rcp2 - t1; +} +XE_FORCEINLINE +static float ArchReciprocalRefined(float den) { + return RefineReciprocal(ArchReciprocal(den), den); +} // Similar to the C++ implementation of XMConvertFloatToHalf and // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the @@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) { } return T(result); } +namespace divisors { +union IDivExtraInfo { + uint32_t value_; + struct { + uint32_t shift_ : 31; + uint32_t add_ : 1; + } info; +}; +// returns magicnum multiplier 
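A minimal sketch of how the XeDMAC interface declared above might be driven; the
function and buffer names are invented for illustration, and the commit message
notes the asynchronous path is not actually used yet:

// Hypothetical caller of the asynchronous copy interface.
void ExampleAsyncCopy(uint8_t* dst, uint8_t* src, size_t length) {
  static xe::dma::XeDMAC* dmac = xe::dma::CreateDMAC();

  xe::dma::XeDMAJob job = {};
  job.destination = dst;
  job.source = src;
  job.size = length;
  // precall/postcall are optional hooks run around the copy; signal_on_done is
  // filled in by the DMAC itself when the job is queued.

  xe::dma::DMACJobHandle handle = dmac->PushDMAJob(&job);
  // ... overlap other work here ...
  dmac->WaitJobDone(handle);
}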
+static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { + IDivExtraInfo extra; + uint32_t d = _denom; + int p; + uint32_t nc, delta, q1, r1, q2, r2; + struct { + unsigned M; + int a; + int s; + } magu; + magu.a = 0; + nc = -1 - ((uint32_t) - (int32_t)d) % d; + p = 31; + q1 = 0x80000000 / nc; + r1 = 0x80000000 - q1 * nc; + q2 = 0x7FFFFFFF / d; + r2 = 0x7FFFFFFF - q2 * d; + do { + p += 1; + if (r1 >= nc - r1) { + q1 = 2 * q1 + 1; + r1 = 2 * r1 - nc; + } else { + q1 = 2 * q1; + r1 = 2 * r1; + } + if (r2 + 1 >= d - r2) { + if (q2 >= 0x7FFFFFFF) { + magu.a = 1; + } + q2 = 2 * q2 + 1; + r2 = 2 * r2 + 1 - d; + + } else { + if (q2 >= 0x80000000U) { + magu.a = 1; + } + q2 = 2 * q2; + r2 = 2 * r2 + 1; + } + delta = d - 1 - r2; + } while (p < 64 && (q1 < delta || r1 == 0)); + + extra.info.add_ = magu.a; + extra.info.shift_ = p - 32; + out_extra = extra.value_; + return static_cast(q2 + 1); +} + +static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, + uint32_t extradata) { + IDivExtraInfo extra; + + extra.value_ = extradata; + + uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32; + if (extra.info.add_) { + uint32_t addend = result + num; + addend = ((addend < result ? 0x80000000 : 0) | addend); + result = addend; + } + return result >> extra.info.shift_; +} + +static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, + uint32_t extradata, uint32_t original) { + uint32_t dived = ApplyUint32Div(num, mul, extradata); + unsigned result = num - (dived * original); + + return result; +} + +struct MagicDiv { + uint32_t multiplier_; + uint32_t extradata_; + MagicDiv() : multiplier_(0), extradata_(0) {} + MagicDiv(uint32_t original) { + multiplier_ = PregenerateUint32Div(original, extradata_); + } + + uint32_t Apply(uint32_t numerator) const { + return ApplyUint32Div(numerator, multiplier_, extradata_); + } +}; +} // namespace divisors } // namespace xe #endif // XENIA_BASE_MATH_H_ diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 1afeedc14..b924c267c 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -672,25 +672,58 @@ static void Prefetch(const void* addr) { #define XE_MSVC_REORDER_BARRIER() static_cast(0) #endif #if XE_ARCH_AMD64 == 1 - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + struct { + __m256 low32; + __m256 high32; + }; + struct { + __m128i xmms[4]; + }; + float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - __m256i low = _mm256_loadu_si256((const __m256i*)source); - __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - _mm256_stream_si256((__m256i*)destination, low); - _mm256_stream_si256(&((__m256i*)destination)[1], high); +static void WriteLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_stream_ps(&destination->floats[0], low); + _mm256_stream_ps(&destination->floats[8], high); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - __m256i low = _mm256_stream_load_si256((const __m256i*)source); - __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - 
_mm256_storeu_si256((__m256i*)destination, low); - _mm256_storeu_si256(&((__m256i*)destination)[1], high); +static void ReadLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + + __m128i first = _mm_stream_load_si128(&source->xmms[0]); + __m128i second = _mm_stream_load_si128(&source->xmms[1]); + __m128i third = _mm_stream_load_si128(&source->xmms[2]); + __m128i fourth = _mm_stream_load_si128(&source->xmms[3]); + + destination->xmms[0] = first; + destination->xmms[1] = second; + destination->xmms[2] = third; + destination->xmms[3] = fourth; +} +XE_FORCEINLINE +static void ReadLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); +} +XE_FORCEINLINE +static void WriteLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); } XE_FORCEINLINE @@ -699,19 +732,29 @@ XE_FORCEINLINE static void ReadFence() { _mm_lfence(); } XE_FORCEINLINE static void ReadWriteFence() { _mm_mfence(); } + #else - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + uint8_t bvals[XE_HOST_CACHE_LINE_SIZE]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - memcpy(destination, source, 64); +static void WriteLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - memcpy(destination, source, 64); +static void ReadLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } +XE_FORCEINLINE +static void WriteLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} +XE_FORCEINLINE +static void ReadLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} + XE_FORCEINLINE static void WriteFence() {} XE_FORCEINLINE @@ -720,6 +763,47 @@ XE_FORCEINLINE static void ReadWriteFence() {} #endif } // namespace swcache + +template +static void smallcpy_const(void* destination, const void* source) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + __movsq((unsigned long long*)destination, (const unsigned long long*)source, + Size / 8); + } else if constexpr ((Size & 3) == 0) { + __movsd((unsigned long*)destination, (const unsigned long*)source, + Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __movsb((unsigned char*)destination, (const unsigned char*)source, Size); + } +#else + memcpy(destination, source, Size); +#endif +} +template +static void smallset_const(void* destination, unsigned char fill_value) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + unsigned long long fill = + 
static_cast(fill_value) * 0x0101010101010101ULL; + + __stosq((unsigned long long*)destination, fill, Size / 8); + } else if constexpr ((Size & 3) == 0) { + static constexpr unsigned long fill = + static_cast(fill_value) * 0x01010101U; + __stosd((unsigned long*)destination, fill, Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __stosb((unsigned char*)destination, fill_value, Size); + } +#else + memset(destination, fill_value, Size); +#endif +} + } // namespace xe #endif // XENIA_BASE_MEMORY_H_ diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index a4befb686..84d5893b5 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -59,16 +59,8 @@ class RingBuffer { // subtract instead void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } ring_size_t read_count() const { -// chrispy: these branches are unpredictable -#if 0 - if (read_offset_ == write_offset_) { - return 0; - } else if (read_offset_ < write_offset_) { - return write_offset_ - read_offset_; - } else { - return (capacity_ - read_offset_) + write_offset_; - } -#else + // chrispy: these branches are unpredictable + ring_size_t read_offs = read_offset_; ring_size_t write_offs = write_offset_; ring_size_t cap = capacity_; @@ -77,14 +69,6 @@ class RingBuffer { ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t comparison_value = read_offs <= write_offs; -#if 0 - size_t selector = - static_cast(-static_cast(comparison_value)); - offset_delta &= selector; - - wrap_read_count &= ~selector; - return offset_delta | wrap_read_count; -#else if (XE_LIKELY(read_offs <= write_offs)) { return offset_delta; // will be 0 if they are equal, semantically @@ -93,8 +77,6 @@ class RingBuffer { } else { return wrap_read_count; } -#endif -#endif } ring_size_t write_offset() const { return write_offset_; } @@ -116,9 +98,9 @@ class RingBuffer { void AdvanceWrite(size_t count); struct ReadRange { - const uint8_t* first; + const uint8_t* XE_RESTRICT first; - const uint8_t* second; + const uint8_t* XE_RESTRICT second; ring_size_t first_length; ring_size_t second_length; }; @@ -126,9 +108,11 @@ class RingBuffer { void EndRead(ReadRange read_range); /* - BeginRead, but if there is a second Range it will prefetch all lines of it + BeginRead, but if there is a second Range it will prefetch all lines of + it - this does not prefetch the first range, because software prefetching can do that faster than we can + this does not prefetch the first range, because software + prefetching can do that faster than we can */ template XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { @@ -138,7 +122,7 @@ class RingBuffer { ring_size_t numlines = xe::align(range.second_length, XE_HOST_CACHE_LINE_SIZE) / XE_HOST_CACHE_LINE_SIZE; - //chrispy: maybe unroll? + // chrispy: maybe unroll? 
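A hypothetical use of the smallcpy_const/smallset_const helpers added to
memory.h above; the struct is invented for illustration, the only requirement
being a compile-time-constant size:

// Sketch: a 32-byte block whose size is divisible by 8 takes the
// __movsq/__stosq path on MSVC/x64 and falls back to memcpy/memset elsewhere.
struct ExampleBlock {
  uint32_t words[8];
};

void CopyExampleBlock(ExampleBlock* dst, const ExampleBlock* src) {
  xe::smallcpy_const<sizeof(ExampleBlock)>(dst, src);
}

void ClearExampleBlock(ExampleBlock* dst) {
  xe::smallset_const<sizeof(ExampleBlock)>(dst, 0);
}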
for (ring_size_t i = 0; i < numlines; ++i) { swcache::Prefetch(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); } @@ -187,7 +171,7 @@ class RingBuffer { } private: - uint8_t* buffer_ = nullptr; + uint8_t* XE_RESTRICT buffer_ = nullptr; ring_size_t capacity_ = 0; ring_size_t read_offset_ = 0; ring_size_t write_offset_ = 0; diff --git a/src/xenia/base/split_map.h b/src/xenia/base/split_map.h new file mode 100644 index 000000000..510c2ed70 --- /dev/null +++ b/src/xenia/base/split_map.h @@ -0,0 +1,87 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2019 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_BASE_SPLIT_MAP_H_ +#define XENIA_BASE_SPLIT_MAP_H_ +#include +#include +namespace xe { +/* + a map structure that is optimized for infrequent + reallocation/resizing/erasure and frequent searches by key implemented as 2 + std::vectors, one of the keys and one of the values +*/ +template +class split_map { + using key_vector = std::vector; + using value_vector = std::vector; + + key_vector keys_; + value_vector values_; + + public: + using my_type = split_map; + + uint32_t IndexForKey(const TKey& k) { + auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k); + return static_cast(lbound - keys_.begin()); + } + + uint32_t size() const { return static_cast(keys_.size()); } + key_vector& Keys() { return keys_; } + value_vector& Values() { return values_; } + void clear() { + keys_.clear(); + values_.clear(); + } + void resize(uint32_t new_size) { + keys_.resize(static_cast(new_size)); + values_.resize(static_cast(new_size)); + } + + void reserve(uint32_t new_size) { + keys_.reserve(static_cast(new_size)); + values_.reserve(static_cast(new_size)); + } + const TKey* KeyAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &keys_[index]; + } + } + const TValue* ValueAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &values_[index]; + } + } + + void InsertAt(TKey k, TValue v, uint32_t idx) { + uint32_t old_size = size(); + + bool needs_shiftup = idx != old_size; + + values_.insert(values_.begin() + idx, v); + keys_.insert(keys_.begin() + idx, k); + } + void EraseAt(uint32_t idx) { + uint32_t old_size = size(); + if (idx == old_size) { + return; // trying to erase nonexistent entry + } else { + values_.erase(values_.begin() + idx); + keys_.erase(keys_.begin() + idx); + } + } +}; +} // namespace xe + +#endif // XENIA_BASE_SPLIT_MAP_H_ \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index d1394d202..17abb72e7 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false, "code. The workaround may cause reduced CPU performance but is a " "more accurate emulation", "x64"); - +DEFINE_uint32(align_all_basic_blocks, 0, + "Aligns the start of all basic blocks to N bytes. Only specify a " + "power of 2, 16 is the recommended value. 
Results in larger " + "icache usage, but potentially faster loops", + "x64"); #if XE_X64_PROFILER_AVAILABLE == 1 DEFINE_bool(instrument_call_times, false, "Compute time taken for functions, for profiling guest code", @@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); - TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C); TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); @@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, return true; } +#pragma pack(push, 1) +struct RGCEmitted { + uint8_t ff_; + uint32_t rgcid_; +}; +#pragma pack(pop) +#if 0 +void X64Emitter::InjectCallAddresses(void* new_execute_address) { + for (auto&& callsite : call_sites_) { + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +} + +#else +void X64Emitter::InjectCallAddresses(void* new_execute_address) { +#if 0 + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + + std::map id_to_rgc{}; + + for (auto&& callsite : call_sites_) { + id_to_rgc[callsite.offset_] = &callsite; + } +#else + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + for (auto&& callsite : call_sites_) { + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +#endif +} +#endif void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, GuestFunction* function) { // To avoid changing xbyak, we do a switcharoo here. @@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, if (function) { code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, new_execute_address, new_write_address); - if (cvars::resolve_rel32_guest_calls) { - for (auto&& callsite : call_sites_) { -#pragma pack(push, 1) - struct RGCEmitted { - uint8_t ff_; - uint32_t rgcid_; - }; -#pragma pack(pop) - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { - hunter = reinterpret_cast( - reinterpret_cast(hunter) + 1); - } - hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; - hunter->rgcid_ = - static_cast(static_cast(callsite.destination_) - - reinterpret_cast(hunter + 1)); - } + if (cvars::resolve_rel32_guest_calls) { + InjectCallAddresses(new_execute_address); } } else { code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, @@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { label = label->next; } + if (cvars::align_all_basic_blocks) { + align(cvars::align_all_basic_blocks, true); + } // Process instructions. 
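The align() call just above consumes the new align_all_basic_blocks cvar before
each block label is emitted. Presumably it is set like any other Xenia cvar, for
example as a command-line override (path placeholder, value per the description
above):

xenia.exe --align_all_basic_blocks=16 <path-to-game>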
const Instr* instr = block->instr_head; while (instr) { @@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = { vec128i(0x7f800000), /* XMMThreeFloatMask */ vec128i(~0U, ~0U, ~0U, 0U), - /*XMMXenosF16ExtRangeStart*/ - vec128f(65504), - /*XMMVSRShlByteshuf*/ - v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), - // XMMVSRMask - vec128b(1), /* XMMF16UnpackLCPI2 */ @@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = { /*XMMXOPWordShiftMask*/ vec128s(15), /*XMMXOPDwordShiftMask*/ - vec128i(31) -}; + vec128i(31)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 01027cc0c..b20978ea3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -157,9 +157,6 @@ enum XmmConst { XMMLVSRTableBase, XMMSingleDenormalMask, XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 - XMMXenosF16ExtRangeStart, - XMMVSRShlByteshuf, - XMMVSRMask, XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 XMMF16PackLCPI0, @@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags { kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitBMI1 = 1 << 3, kX64EmitBMI2 = 1 << 4, - kX64EmitF16C = 1 << 5, + kX64EmitPrefetchW = 1 << 5, kX64EmitMovbe = 1 << 6, kX64EmitGFNI = 1 << 7, @@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags { // inc/dec) do not introduce false dependencies on EFLAGS // because the individual flags are treated as different vars by // the processor. (this applies to zen) - kX64EmitPrefetchW = 1 << 16, - kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx + kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx // instructions, and FX users need the boost - kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? - kX64EmitTBM = 1 << 19 + kX64EmitFMA4 = 1 << 17, // todo: also use on zen1? + kX64EmitTBM = 1 << 18, + // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family + // 17h/19h optimization manuals. 
allows us to save 1 byte on certain xmm + // instructions by using the legacy sse version if we recently cleared the + // high 128 bits of the }; class ResolvableGuestCall { public: @@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator { uint32_t debug_info_flags, FunctionDebugInfo* debug_info, void** out_code_address, size_t* out_code_size, std::vector* out_source_map); + void InjectCallAddresses(void* new_execute_addr); public: // Reserved: rsp, rsi, rdi diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h index b9257f179..78c459101 100644 --- a/src/xenia/cpu/backend/x64/x64_op.h +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -43,23 +43,23 @@ enum KeyType { KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, }; - +using InstrKeyValue = uint32_t; #pragma pack(push, 1) union InstrKey { - uint32_t value; + InstrKeyValue value; struct { - uint32_t opcode : 8; - uint32_t dest : 5; - uint32_t src1 : 5; - uint32_t src2 : 5; - uint32_t src3 : 5; - uint32_t reserved : 4; + InstrKeyValue opcode : 8; + InstrKeyValue dest : 5; + InstrKeyValue src1 : 5; + InstrKeyValue src2 : 5; + InstrKeyValue src3 : 5; + InstrKeyValue reserved : 4; }; - operator uint32_t() const { return value; } + operator InstrKeyValue() const { return value; } InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } - InstrKey(uint32_t v) : value(v) {} + InstrKey(InstrKeyValue v) : value(v) {} // this used to take about 1% cpu while precompiling // it kept reloading opcode, and also constantly repacking and unpacking the @@ -67,16 +67,16 @@ union InstrKey { InstrKey(const Instr* i) : value(0) { const OpcodeInfo* info = i->GetOpcodeInfo(); - uint32_t sig = info->signature; + InstrKeyValue sig = info->signature; OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); - uint32_t out_desttype = (uint32_t)dest_type; - uint32_t out_src1type = (uint32_t)src1_type; - uint32_t out_src2type = (uint32_t)src2_type; - uint32_t out_src3type = (uint32_t)src3_type; + InstrKeyValue out_desttype = (InstrKeyValue)dest_type; + InstrKeyValue out_src1type = (InstrKeyValue)src1_type; + InstrKeyValue out_src2type = (InstrKeyValue)src2_type; + InstrKeyValue out_src3type = (InstrKeyValue)src3_type; Value* destv = i->dest; // pre-deref, even if not value @@ -105,7 +105,7 @@ union InstrKey { template struct Construct { - static const uint32_t value = + static const InstrKeyValue value = (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); }; }; @@ -307,8 +307,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; return true; } @@ -329,8 +329,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); return true; @@ -355,8 +355,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -385,8 +385,8 @@ struct I : 
DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue ikey) { + if (ikey == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -422,9 +422,9 @@ struct Sequence { static constexpr uint32_t head_key() { return T::key; } - static bool Select(X64Emitter& e, const Instr* i) { + static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) { T args; - if (!args.Load(i)) { + if (!args.Load(i, ikey)) { return false; } SEQ::Emit(e, args); diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index dc5fa7d3d..54e7ac8a0 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) { if (valid) { auto name = i.src2.value->name; switch (opcode) { - case OPCODE_IS_TRUE: - e.jnz(name, e.T_NEAR); - break; - case OPCODE_IS_FALSE: - e.jz(name, e.T_NEAR); - break; case OPCODE_COMPARE_EQ: e.je(name, e.T_NEAR); break; @@ -299,26 +293,14 @@ struct CALL_TRUE_I64 struct CALL_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F32); } }; struct CALL_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, @@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F32); } }; struct CALL_INDIRECT_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, @@ -486,15 +460,13 @@ struct RETURN_TRUE_I64 struct RETURN_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F32); } }; struct RETURN_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, @@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64 struct BRANCH_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == 
i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + /* + chrispy: right now, im not confident that we are always clearing + the upper 96 bits of registers, making vptest extremely unsafe. many + ss/sd operations copy over the upper 96 from the source, and for abs we + negate ALL elements, making the top 64 bits contain 0x80000000 etc + */ + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; struct BRANCH_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, @@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); e.jz(i.src2.value->name, e.T_NEAR); } }; @@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); e.jz(i.src2.value->name, e.T_NEAR); } }; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 70865c30e..cc6c1cf32 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) { return false; } + if (IsTracingData()) { // incompatible with tracing + return false; + } uint32_t guestaddr = i->GuestAddressFor(); if (!guestaddr) { return false; @@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { return flags && flags->accessed_mmio; } +template +static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { + if (swap) { + value = xe::byte_swap(value); + } + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; + } else { + value = xe::byte_swap(value); /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... 
+ */ + gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); + } +} + +template +static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) { + T value; + + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + value = *reinterpret_cast(ctx->virtual_membase + guestaddr); + if (swap) { + value = xe::byte_swap(value); + } + } else { + /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... + */ + value = gaddr->read(nullptr, gaddr->callback_context, guestaddr); + } + return value; +} // ============================================================================ // OPCODE_LOAD_OFFSET // ============================================================================ @@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16 struct LOAD_OFFSET_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + if (i.src2.is_constant) { + e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant()); + } else { + e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); } } }; @@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, LOAD_OFFSET_I32, LOAD_OFFSET_I64); -template -static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { - if (swap) { - value = xe::byte_swap(value); - } - if (guestaddr >= 0xE0000000) { - guestaddr += 0x1000; - } - - auto ctx = reinterpret_cast(_ctx); - - auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); - if (!gaddr) { - *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; - } else { - value = xe::byte_swap(value); /* - was having issues, found by comparing the values used with exceptions - to these that we were reversed... 
- */ - gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); - } -} // ============================================================================ // OPCODE_STORE_OFFSET // ============================================================================ @@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence> { }; struct LOAD_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.GetNativeParam(1).cvt32(), i.dest); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } } } }; @@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence> { } else { e.mov(e.dword[addr], i.src2); } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } } } - if (IsTracingData()) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); - } } }; struct STORE_I64 : Sequence> { diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 46eb285cf..e5843f37d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -19,15 +19,15 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h" -DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); +DEFINE_bool(xop_rotates, false, "rotate via xop", "x64"); -DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); +DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64"); -DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); +DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64"); -DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); +DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64"); -DEFINE_bool(xop_compares, true, "compare via xop", "X64"); +DEFINE_bool(xop_compares, true, "compare via xop", "x64"); namespace xe { namespace cpu { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 48e340d42..bf5addd3d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -67,7 +67,7 @@ using namespace xe::cpu::hir; using 
xe::cpu::hir::Instr; -typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey); std::unordered_map sequence_table; // ============================================================================ @@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn, } return false; } -#define EMITTER_IS_TRUE(typ, tester) \ - struct IS_TRUE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setnz(i.dest); \ - } \ - } \ - } - -#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test) - -EMITTER_IS_TRUE_INT(I8); -EMITTER_IS_TRUE_INT(I16); -EMITTER_IS_TRUE_INT(I32); -EMITTER_IS_TRUE_INT(I64); -EMITTER_IS_TRUE(F32, vtestps); -EMITTER_IS_TRUE(F64, vtestpd); - -EMITTER_IS_TRUE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, - IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); - -#define EMITTER_IS_FALSE(typ, tester) \ - struct IS_FALSE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setz(i.dest); \ - } \ - } \ - } -#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test) -EMITTER_IS_FALSE_INT(I8); -EMITTER_IS_FALSE_INT(I16); -EMITTER_IS_FALSE_INT(I32); -EMITTER_IS_FALSE_INT(I64); -EMITTER_IS_FALSE(F32, vtestps); -EMITTER_IS_FALSE(F64, vtestpd); - -EMITTER_IS_FALSE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, - IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); // ============================================================================ // OPCODE_IS_NAN @@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { auto it = sequence_table.find(key); if (it != sequence_table.end()) { - if (it->second(*e, i)) { + if (it->second(*e, i, InstrKey(i))) { *new_tail = i->next; return true; } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 0ce6a7f57..05a5b42d4 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -25,7 +25,7 @@ namespace x64 { class X64Emitter; -typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey); extern std::unordered_map sequence_table; template diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 26f0fecb4..2d1654030 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } } break; - case OPCODE_IS_TRUE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantTrue()) { - v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; - case OPCODE_IS_FALSE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantFalse()) { - 
v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; case OPCODE_IS_NAN: if (i->src1.value->IsConstant()) { if (i->src1.value->type == FLOAT32_TYPE && @@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { if (should_skip_because_of_float) { break; - } + } v->set_from(i->src1.value); v->Max(i->src2.value); i->Remove(); diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index e94b570dc..a8c93c3b6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; Value* new_value = nullptr; - if (xorop == OPCODE_IS_FALSE) { - new_value = builder->IsTrue(xordef->src1.value); - - } else if (xorop == OPCODE_IS_TRUE) { - new_value = builder->IsFalse(xordef->src1.value); - } else if (xorop == OPCODE_COMPARE_EQ) { + if (xorop == OPCODE_COMPARE_EQ) { new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); } else if (xorop == OPCODE_COMPARE_NE) { @@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { return false; } bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) { - return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; + return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE; } inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation, @@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (!var_definition) { return false; } - // x == 0 -> !x - if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); - return true; - } - // x != 0 -> !!x - if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); - return true; - } if (cmpop == OPCODE_COMPARE_ULE && constant_unpacked == 0) { // less than or equal to zero = (== 0) = IS_FALSE - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_EQ_info; + return true; } // todo: OPCODE_COMPARE_NE too? 
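To make the constant-compare folding easier to follow: with IS_TRUE/IS_FALSE removed, this hunk and the next canonicalize unsigned compares against small constants into compares against zero. A minimal standalone sketch of those rules follows; the enum and function names are hypothetical stand-ins, not the pass's own types.

#include <cstdint>
#include <optional>

// Hypothetical stand-ins for the HIR compare opcodes involved.
enum class CmpOp { kEQ, kNE, kULT, kULE, kUGT, kSGT };

struct ZeroCompare {
  CmpOp op;  // resulting compare, always against a zero right-hand side
};

// Mirrors the rewrites: x <=u 0 and x <u 1 become x == 0; x >u 0 becomes
// x != 0; x >s 0 becomes x != 0 only when the sign bit is known to be clear.
static std::optional<ZeroCompare> CanonicalizeAgainstZero(
    CmpOp op, uint64_t rhs_constant, bool sign_bit_known_zero) {
  if (op == CmpOp::kULE && rhs_constant == 0) return ZeroCompare{CmpOp::kEQ};
  if (op == CmpOp::kULT && rhs_constant == 1) return ZeroCompare{CmpOp::kEQ};
  if (op == CmpOp::kUGT && rhs_constant == 0) return ZeroCompare{CmpOp::kNE};
  if (op == CmpOp::kSGT && rhs_constant == 0 && sign_bit_known_zero) {
    return ZeroCompare{CmpOp::kNE};
  }
  return std::nullopt;  // not a pattern this sketch rewrites
}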
@@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } if (cmpop == OPCODE_COMPARE_ULT && constant_unpacked == 1) { // unsigned lt 1 means == 0 - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + + i->opcode = &OPCODE_COMPARE_EQ_info; + + // i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (cmpop == OPCODE_COMPARE_UGT && constant_unpacked == 0) { // unsigned gt 0 means != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; return true; } @@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 && constant_unpacked == 0) { // signbit cant be set, and checking if gt 0, so actually checking != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; + return true; } @@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, Value* constant_replacement = nullptr; if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) { - repl = &OPCODE_IS_TRUE_info; + repl = &OPCODE_COMPARE_NE_info; } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) { - repl = &OPCODE_IS_FALSE_info; + repl = &OPCODE_COMPARE_EQ_info; } else if (cmpop == OPCODE_COMPARE_UGT) { // impossible, cannot be greater than mask @@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (repl) { i->Replace(repl, 0); i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (constant_replacement) { @@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder) { - bool istrue = i->opcode == &OPCODE_IS_TRUE_info; - bool isfalse = i->opcode == &OPCODE_IS_FALSE_info; + bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; + bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; - Value* input = i->src1.value; + auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar(); + + if (!input_constant || input_constant->AsUint64() != 0) { + return false; + } + + // Value* input = i->src1.value; TypeName input_type = input->type; if (!IsScalarIntegralType(input_type)) { return false; } @@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i, i->set_src1(isfalsetest); } else { - i->Replace(&OPCODE_IS_FALSE_info, 0); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + i->Replace(&OPCODE_COMPARE_EQ_info, 0); i->set_src1(lz_input); + i->set_src2(builder->LoadZero(lz_input->type)); } return true; } @@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { while (i) { // vector types use the same opcodes as scalar ones for AND/OR/XOR! 
we // don't handle these in our simplifications, so skip - if (i->dest && IsScalarIntegralType(i->dest->type)) { + if (i->AllScalarIntegral()) { Opcode iop = i->opcode->num; if (iop == OPCODE_OR) { @@ -1080,7 +1080,6 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { result |= CheckAdd(i, builder); } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); } else if (iop == OPCODE_SHR) { result |= CheckSHR(i, builder); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index fda6812b4..82e3021f9 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) { Value* HIRBuilder::Convert(Value* value, TypeName target_type, RoundMode round_mode) { - Instr* i = AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type)); i->set_src1(value); @@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type, Value* HIRBuilder::Round(Value* value, RoundMode round_mode) { ASSERT_FLOAT_OR_VECTOR_TYPE(value); - Instr* i = AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type)); i->set_src1(value); @@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) { i->set_src2(value); i->src3.value = NULL; } - +Value* HIRBuilder::LoadVectorLeft(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +Value* HIRBuilder::LoadVectorRight(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +void HIRBuilder::StoreVectorLeft(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVL_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} +void HIRBuilder::StoreVectorRight(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVR_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} void HIRBuilder::Memset(Value* address, Value* value, Value* length) { ASSERT_ADDRESS_TYPE(address); ASSERT_TYPES_EQUAL(address, length); @@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) { Value* HIRBuilder::Max(Value* value1, Value* value2) { ASSERT_TYPES_EQUAL(value1, value2); - if (IsScalarIntegralType( value1->type) && value1->IsConstant() && + if (IsScalarIntegralType(value1->type) && value1->IsConstant() && value2->IsConstant()) { return value1->Compare(OPCODE_COMPARE_SLT, value2) ? 
value2 : value1; } @@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) { i->set_src3(value2); return i->dest; } +static Value* OrLanes32(HIRBuilder& f, Value* value) { + hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE); + hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE); + hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE); + hir::Value* ored = f.Or(v1, v2); + hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE); + ored = f.Or(ored, v3); + + ored = f.Or(ored, v4); + return ored; +} Value* HIRBuilder::IsTrue(Value* value) { + assert_true(value); + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesnt happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareNE(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareNE(value, LoadZero(value->type)); } Value* HIRBuilder::IsFalse(Value* value) { + assert_true(value); + + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesnt happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareEQ(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareEQ(value, LoadZero(value->type)); } Value* HIRBuilder::IsNan(Value* value) { diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 62164e52d..4a200e0e5 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -166,6 +166,11 @@ class HIRBuilder { uint32_t store_flags = 0); Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); + + Value* LoadVectorLeft(Value* address); + Value* LoadVectorRight(Value* address); + void StoreVectorLeft(Value* address, Value* value); + void StoreVectorRight(Value* address, Value* value); void Store(Value* address, Value* value, uint32_t store_flags = 0); void Memset(Value* address, Value* value, Value* length); void CacheControl(Value* address, size_t cache_line_size, @@ -268,6 +273,7 @@ class HIRBuilder { Value* new_value); Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); + void SetNJM(Value* value); protected: diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 149103d43..0e4a7c2fb 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const { return 0; // eek. 
} +bool Instr::AllScalarIntegral() { + bool result = true; + if (dest) { + if (!IsScalarIntegralType(dest->type)) { + return false; + } + } + VisitValueOperands([&result](Value* v, uint32_t idx) { + result = result && IsScalarIntegralType(v->type); + }); + return result; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 38afef241..a50219ceb 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr const hir::Instr* GetNonFakePrev() const; uint32_t GuestAddressFor() const; + + bool AllScalarIntegral(); // dest and all srcs are scalar integral }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index a51fda6d5..a59b1c72e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -210,10 +210,10 @@ enum Opcode { OPCODE_STORE, // chrispy: todo: implement, our current codegen for the unaligned loads is // very bad - OPCODE_LVLX, - OPCODE_LVRX, - OPCODE_STVLX, - OPCODE_STVRX, + OPCODE_LVL, + OPCODE_LVR, + OPCODE_STVL, + OPCODE_STVR, OPCODE_MEMSET, OPCODE_CACHE_CONTROL, OPCODE_MEMORY_BARRIER, @@ -222,8 +222,6 @@ enum Opcode { OPCODE_MIN, OPCODE_VECTOR_MIN, OPCODE_SELECT, - OPCODE_IS_TRUE, - OPCODE_IS_FALSE, OPCODE_IS_NAN, OPCODE_COMPARE_EQ, OPCODE_COMPARE_NE, diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 46a328903..783e9a439 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -303,17 +303,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, 0) -DEFINE_OPCODE( - OPCODE_IS_TRUE, - "is_true", - OPCODE_SIG_V_V, - 0) - -DEFINE_OPCODE( - OPCODE_IS_FALSE, - "is_false", - OPCODE_SIG_V_V, - 0) DEFINE_OPCODE( OPCODE_IS_NAN, @@ -706,4 +695,27 @@ DEFINE_OPCODE( OPCODE_SIG_X_V, 0 ) +DEFINE_OPCODE( + OPCODE_LVL, + "loadv_left", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_LVR, + "loadv_right", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_STVL, + "storev_left", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_STVR, + "storev_right", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 4aedfdd26..263d3675a 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) { // Testing for all 1's and all 0's. // if (Rc) CR6 = all_equal | 0 | none_equal | 0 // TODO(benvanik): efficient instruction? 
+ + // chrispy: nothing seems to write cr6_1, figure out if no documented + // instructions write anything other than 0 to it and remove these stores if + // so StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_all_equal), diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index db8c874f2..a670cc9d6 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -7,18 +7,17 @@ ****************************************************************************** */ +#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include #include #include #include - #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/draw_util.h" @@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() { bool draw_resolution_scale_not_clamped = TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, draw_resolution_scale_y); + if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( draw_resolution_scale_x, draw_resolution_scale_y, provider)) { draw_resolution_scale_not_clamped = false; @@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() { CommandProcessor::ShutdownContext(); } - +// todo: bit-pack the bools and use bitarith to reduce branches void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { CommandProcessor::WriteRegister(index, value); - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && - index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (frame_open_) { - uint32_t float_constant_index = - (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_pixel_.up_to_date = false; - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_vertex_.up_to_date = false; + bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date; + bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date; + bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date; + + if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && + index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { + cbuffer_binding_fetch_.up_to_date = false; + // texture cache is never nullptr + // if (texture_cache_ != nullptr) { + texture_cache_->TextureFetchConstantWritten( + (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + // } + } else { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd | + cbuf_binding_bool_loop_utd)) { + return; + } + + if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && + index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) { + return; + } + if (frame_open_) { + uint32_t float_constant_index = + (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; + if (float_constant_index >= 256) { + float_constant_index -= 256; + if 
(current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_pixel_.up_to_date = false; + } + } else { + if (current_float_constant_map_vertex_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_vertex_.up_to_date = false; + } } } - } - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && - index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { - cbuffer_binding_bool_loop_.up_to_date = false; - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - if (texture_cache_ != nullptr) { - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && + index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { + cbuffer_binding_bool_loop_.up_to_date = false; } } } @@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); draw_util::ViewportInfo viewport_info; - draw_util::GetHostViewportInfo( - regs, draw_resolution_scale_x, draw_resolution_scale_y, true, + draw_util::GetViewportInfoArgs gviargs{}; + + gviargs.Setup( + draw_resolution_scale_x, draw_resolution_scale_y, + texture_cache_->draw_resolution_scale_x_divisor(), + texture_cache_->draw_resolution_scale_y_divisor(), true, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, normalized_depth_control, host_render_targets_used && render_target_cache_->depth_float24_convert_in_pixel_shader(), - host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), - viewport_info); + host_render_targets_used, pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + if (gviargs == previous_viewport_info_args_) { + viewport_info = previous_viewport_info_; + } else { + draw_util::GetHostViewportInfo(&gviargs, viewport_info); + previous_viewport_info_args_ = gviargs; + previous_viewport_info_ = viewport_info; + } draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.offset[0] *= draw_resolution_scale_x; @@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() { shared_memory_->InitializeTraceCompleteDownloads(); } } +static void DmaPrefunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = job->size; + void* readback_mapping; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + + HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping); + xenia_assert(SUCCEEDED(mapres)); + + job->source = (uint8_t*)readback_mapping; +} + +static void DmaPostfunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_write_range = {}; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + readback_buffer->Unmap(0, &readback_write_range); +} bool D3D12CommandProcessor::IssueCopy() { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() { readback_buffer, 0, shared_memory_buffer, written_address, written_length); if (AwaitAllQueueOperationsCompletion()) { +#if 1 D3D12_RANGE readback_range; readback_range.Begin = 0; readback_range.End = written_length; void* readback_mapping; if (SUCCEEDED( readback_buffer->Map(0, &readback_range, &readback_mapping))) { 
- std::memcpy(memory_->TranslatePhysical(written_address), - readback_mapping, written_length); + // chrispy: this memcpy needs to be optimized as much as possible + + auto physaddr = memory_->TranslatePhysical(written_address); + dma::vastcpy(physaddr, (uint8_t*)readback_mapping, + written_length); + // XEDmaCpy(physaddr, readback_mapping, written_length); D3D12_RANGE readback_write_range = {}; readback_buffer->Unmap(0, &readback_write_range); } + +#else + dma::XeDMAJob job{}; + job.destination = memory_->TranslatePhysical(written_address); + job.size = written_length; + job.source = nullptr; + job.userdata1 = (void*)readback_buffer; + job.precall = DmaPrefunc; + job.postcall = DmaPostfunc; + + readback_available_ = GetDMAC()->PushDMAJob(&job); + +#endif } } } @@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings( if (bool_loop_constants == nullptr) { return false; } - std::memcpy(bool_loop_constants, - ®s[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, - kBoolLoopConstantsSize); + xe::smallcpy_const( + bool_loop_constants, + ®s[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32); + cbuffer_binding_bool_loop_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_bool_loop_constants); @@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings( if (fetch_constants == nullptr) { return false; } - std::memcpy(fetch_constants, - ®s[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, - kFetchConstantsSize); + xe::smallcpy_const( + fetch_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32); + cbuffer_binding_fetch_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_fetch_constants); @@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } + #if 0 + if (readback_available_) { + GetDMAC()->WaitJobDone(readback_available_); + readback_available_ = 0; + } + #endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); @@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { readback_buffer_->Release(); } readback_buffer_ = buffer; + readback_buffer_size_ = size; } return readback_buffer_; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 2b43233b9..e64447a4e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -1,3 +1,4 @@ +/** /** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * @@ -19,6 +20,7 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/dma.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" @@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor { static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; ID3D12Resource* readback_buffer_ = nullptr; + dma::DMACJobHandle readback_available_ = 0; uint32_t readback_buffer_size_ = 0; std::atomic pix_capture_requested_ = false; @@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor { DxbcShaderTranslator::SystemConstants system_constants_; // Float constant usage masks of the last draw call. 
- uint64_t current_float_constant_map_vertex_[4]; - uint64_t current_float_constant_map_pixel_[4]; - + // chrispy: make sure accesses to these cant cross cacheline boundaries + struct alignas(XE_HOST_CACHE_LINE_SIZE) { + uint64_t current_float_constant_map_vertex_[4]; + uint64_t current_float_constant_map_pixel_[4]; + }; // Constant buffer bindings. struct ConstantBufferBinding { D3D12_GPU_VIRTUAL_ADDRESS address; @@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor { // Current primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology_; + + draw_util::GetViewportInfoArgs previous_viewport_info_args_; + draw_util::ViewportInfo previous_viewport_info_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 10017fff1..02d1f6750 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, return false; } -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +static float ViewportRecip2_0(float f) { + float f1 = ArchReciprocalRefined(f); + return f1 + f1; +} + +// chrispy: todo, the int/float divides and the nan-checked mins show up +// relatively high on uprof when i uc to 1.7ghz +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out) { - assert_not_zero(draw_resolution_scale_x); - assert_not_zero(draw_resolution_scale_y); + assert_not_zero(args->draw_resolution_scale_x); + assert_not_zero(args->draw_resolution_scale_y); // A vertex position goes the following path: // @@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs, // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for // D24FS8 as well. - auto pa_cl_clip_cntl = regs.Get(); - auto pa_cl_vte_cntl = regs.Get(); - auto pa_su_sc_mode_cntl = regs.Get(); - auto pa_su_vtx_cntl = regs.Get(); + auto pa_cl_clip_cntl = args->pa_cl_clip_cntl; + auto pa_cl_vte_cntl = args->pa_cl_vte_cntl; + auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl; + auto pa_su_vtx_cntl = args->pa_su_vtx_cntl; // Obtain the original viewport values in a normalized way. float scale_xy[] = { - pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 - : 1.0f, - pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1.0f, + + pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f, + pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f, }; - float scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; + float scale_z = + pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f; + float offset_base_xy[] = { - pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0.0f, - pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0.0f, + pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f, + pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f, }; - float offset_z = pa_cl_vte_cntl.vport_z_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 - : 0.0f; + float offset_z = + pa_cl_vte_cntl.vport_z_offset_ena ? 
args->PA_CL_VPORT_ZOFFSET : 0.0f; // Calculate all the integer.0 or integer.5 offsetting exactly at full // precision, separately so it can be used in other integer calculations // without double rounding if needed. float offset_add_xy[2] = {}; if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { - auto pa_sc_window_offset = regs.Get(); + auto pa_sc_window_offset = args->pa_sc_window_offset; offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); } @@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs, // The maximum value is at least the maximum host render target size anyway - // and a guest pixel is always treated as a whole with resolution scaling. - uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, - y_max / draw_resolution_scale_y}; + // chrispy: todo, these integer divides show up high on the profiler somehow + // (it was a very long session, too) + uint32_t xy_max_unscaled[] = { + args->draw_resolution_scale_x_divisor.Apply(args->x_max), + args->draw_resolution_scale_y_divisor.Apply(args->y_max)}; assert_not_zero(xy_max_unscaled[0]); assert_not_zero(xy_max_unscaled[1]); @@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs, std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); viewport_info_out.xy_extent[i] = extent_axis_unscaled * - (i ? draw_resolution_scale_y : draw_resolution_scale_x); + (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x); float extent_axis_unscaled_float = float(extent_axis_unscaled); - float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float; + + float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float); + ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + offset_add_xy[i]) * @@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // doing truncation for simplicity - since maxing with 0 is done anyway // (we only return viewports in the positive quarter-plane). uint32_t axis_resolution_scale = - i ? draw_resolution_scale_y : draw_resolution_scale_x; + i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x; float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float scale_axis = scale_xy[i]; float scale_axis_abs = std::abs(scale_xy[i]); @@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs, // space, a region previously outside -W...W should end up within it, so // the scale should be < 1. float axis_extent_rounded = float(axis_extent_int); - ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; + float inv_axis_extent_rounded = + ArchReciprocalRefined(axis_extent_rounded); + + ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded; // Move the origin of the snapped coordinates back to the original one. ndc_offset_axis = (float(offset_axis) - (float(axis_0_int) + axis_extent_rounded * 0.5f)) * - 2.0f / axis_extent_rounded; + 2.0f * inv_axis_extent_rounded; } else { // Empty viewport (everything outside the viewport scissor). 
ndc_scale_axis = 1.0f; @@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs, ndc_scale[2] = 0.5f; ndc_offset[2] = 0.5f; } - if (pixel_shader_writes_depth) { + if (args->pixel_shader_writes_depth) { // Allow the pixel shader to write any depth value since // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel // shaders don't have access to the original Z in the viewport space @@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // Direct3D 12 doesn't allow reverse depth range - on some drivers it // works, on some drivers it doesn't, actually, but it was never // explicitly allowed by the specification. - if (!allow_reverse_z && z_min > z_max) { + if (!args->allow_reverse_z && z_min > z_max) { std::swap(z_min, z_max); ndc_scale[2] = -ndc_scale[2]; ndc_offset[2] = 1.0f - ndc_offset[2]; @@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs, } } - if (normalized_depth_control.z_enable && - regs.Get().depth_format == - xenos::DepthRenderTargetFormat::kD24FS8) { - if (convert_z_to_float24) { + if (args->normalized_depth_control.z_enable && + args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + if (args->convert_z_to_float24) { // Need to adjust the bounds that the resulting depth values will be // clamped to after the pixel shader. Preferring adding some error to // interpolated Z instead if conversion can't be done exactly, without @@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs, z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); } - if (full_float24_in_0_to_1) { + if (args->full_float24_in_0_to_1) { // Remap the full [0...2) float24 range to [0...1) support data round-trip // during render target ownership transfer of EDRAM tiles through depth // input without unrestricted depth range. 
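ViewportRecip2_0 above computes 2/x through ArchReciprocalRefined rather than a full-precision divide. A minimal sketch of what such a refined reciprocal is assumed to look like on x64 (a hardware rcpss estimate plus one Newton-Raphson step); the actual helper may differ.

#include <immintrin.h>

// Approximate 1/x: rcpss gives roughly 12 bits of precision, and one
// Newton-Raphson iteration (r' = r * (2 - x * r)) roughly doubles it.
static float ReciprocalRefinedSketch(float x) {
  __m128 vx = _mm_set_ss(x);
  __m128 r = _mm_rcp_ss(vx);
  __m128 nr = _mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(vx, r)));
  return _mm_cvtss_f32(nr);
}

// Usage mirroring ViewportRecip2_0: 2.0f / extent becomes 2 * (1 / extent).
static float PixelsToNdcSketch(float extent) {
  float r = ReciprocalRefinedSketch(extent);
  return r + r;
}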
@@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.z_min = z_min; viewport_info_out.z_max = z_max; - if (origin_bottom_left) { + if (args->origin_bottom_left) { ndc_scale[1] = -ndc_scale[1]; ndc_offset[1] = -ndc_offset[1]; } @@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.ndc_offset[i] = ndc_offset[i]; } } - void GetScissor(const RegisterFile& regs, Scissor& scissor_out, bool clamp_to_surface_pitch) { auto pa_sc_window_scissor_tl = regs.Get(); @@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize); } - //fails in forza horizon 1 + // fails in forza horizon 1 assert_true(x0 < x1 && y0 < y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index a365b5436..2f27f6e01 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -277,18 +277,151 @@ struct ViewportInfo { float ndc_scale[3]; float ndc_offset[3]; }; +static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t), + "Change in depthrendertargetformat throws off " + "getviewportinfoargs by a bit"); +struct GetViewportInfoArgs { + union alignas(64) { + struct { + // group 1 + uint32_t x_max; + uint32_t y_max; + union { + struct { + uint32_t origin_bottom_left : 1; + uint32_t allow_reverse_z : 1; + uint32_t convert_z_to_float24 : 1; + uint32_t full_float24_in_0_to_1 : 1; + uint32_t pixel_shader_writes_depth : 1; + xenos::DepthRenderTargetFormat depth_format : 1; + }; + uint32_t packed_portions; + }; + reg::RB_DEPTHCONTROL normalized_depth_control; + // group 2 + reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl; + reg::PA_CL_VTE_CNTL pa_cl_vte_cntl; + reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl; + reg::PA_SU_VTX_CNTL pa_su_vtx_cntl; + // group 3 + reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset; + float PA_CL_VPORT_XSCALE; + float PA_CL_VPORT_YSCALE; + float PA_CL_VPORT_ZSCALE; + + float PA_CL_VPORT_XOFFSET; + float PA_CL_VPORT_YOFFSET; + float PA_CL_VPORT_ZOFFSET; + uint32_t padding_set_to_0; + }; +#if XE_ARCH_AMD64 == 1 + struct { + __m128i first4; // x_max, y_max, packed_portions, + // normalized_depth_control + __m128i second4; // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl, + // pa_su_vtx_cntl + __m128i third4; // pa_sc_window_offset, PA_CL_VPORT_XSCALE, + // PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE + __m128i last4; // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET, + // PA_CL_VPORT_ZOFFSET, padding_set_to_0 + }; +#endif + }; + + // everything that follows here does not need to be compared + uint32_t draw_resolution_scale_x; + uint32_t draw_resolution_scale_y; + divisors::MagicDiv draw_resolution_scale_x_divisor; + divisors::MagicDiv draw_resolution_scale_y_divisor; + void Setup(uint32_t _draw_resolution_scale_x, + uint32_t _draw_resolution_scale_y, + divisors::MagicDiv _draw_resolution_scale_x_divisor, + divisors::MagicDiv _draw_resolution_scale_y_divisor, + bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max, + bool _allow_reverse_z, + reg::RB_DEPTHCONTROL _normalized_depth_control, + bool _convert_z_to_float24, bool _full_float24_in_0_to_1, + bool _pixel_shader_writes_depth) { + packed_portions = 0; + padding_set_to_0 = 0; // important to zero this + draw_resolution_scale_x = _draw_resolution_scale_x; + draw_resolution_scale_y = _draw_resolution_scale_y; + draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor; + draw_resolution_scale_y_divisor = 
_draw_resolution_scale_y_divisor; + origin_bottom_left = _origin_bottom_left; + x_max = _x_max; + y_max = _y_max; + allow_reverse_z = _allow_reverse_z; + normalized_depth_control = _normalized_depth_control; + convert_z_to_float24 = _convert_z_to_float24; + full_float24_in_0_to_1 = _full_float24_in_0_to_1; + pixel_shader_writes_depth = _pixel_shader_writes_depth; + } + + void SetupRegisterValues(const RegisterFile& regs) { + pa_cl_clip_cntl = regs.Get(); + pa_cl_vte_cntl = regs.Get(); + pa_su_sc_mode_cntl = regs.Get(); + pa_su_vtx_cntl = regs.Get(); + PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; + PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; + PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; + PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; + PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; + pa_sc_window_offset = regs.Get(); + depth_format = regs.Get().depth_format; + } + XE_FORCEINLINE + bool operator==(const GetViewportInfoArgs& prev) { +#if XE_ARCH_AMD64 == 0 + bool result = true; + + auto accum_eq = [&result](auto x, auto y) { result &= (x == y); }; + +#define EQC(field) accum_eq(field, prev.field) + EQC(x_max); + EQC(y_max); + EQC(packed_portions); + EQC(normalized_depth_control.value); + EQC(pa_cl_clip_cntl.value); + EQC(pa_cl_vte_cntl.value); + + EQC(pa_su_sc_mode_cntl.value); + EQC(pa_su_vtx_cntl.value); + EQC(PA_CL_VPORT_XSCALE); + EQC(PA_CL_VPORT_YSCALE); + EQC(PA_CL_VPORT_ZSCALE); + EQC(PA_CL_VPORT_XOFFSET); + EQC(PA_CL_VPORT_YOFFSET); + EQC(PA_CL_VPORT_ZOFFSET); + EQC(pa_sc_window_offset.value); + +#undef EQC + return result; +#else + __m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4); + __m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4); + + __m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4); + __m128i unified1 = _mm_and_si128(mask1, mask2); + __m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4); + + __m128i unified2 = _mm_and_si128(unified1, mask3); + + __m128i unified3 = _mm_and_si128(unified2, mask4); + + return _mm_movemask_epi8(unified3) == 0xFFFF; + +#endif + } +}; + // Converts the guest viewport (or fakes one if drawing without a viewport) to // a viewport, plus values to multiply-add the returned position by, usable on // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // Direct3D clip space with 0...W Z rather than -W...W. -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out); struct Scissor { diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2695b22d9..54c8b3655 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done, } // Make sure the same render target isn't bound into two different slots // over time. 
- for (uint32_t i = 1; are_accumulated_render_targets_valid_ && - i < 1 + xenos::kMaxColorRenderTargets; - ++i) { - const RenderTarget* render_target = - last_update_accumulated_render_targets_[i]; - if (!render_target) { - continue; - } - for (uint32_t j = 0; j < i; ++j) { - if (last_update_accumulated_render_targets_[j] == render_target) { - are_accumulated_render_targets_valid_ = false; - break; + // chrispy: this needs optimization! + if (are_accumulated_render_targets_valid_) { + for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) { + const RenderTarget* render_target = + last_update_accumulated_render_targets_[i]; + if (!render_target) { + continue; + } + for (uint32_t j = 0; j < i; ++j) { + if (last_update_accumulated_render_targets_[j] == render_target) { + are_accumulated_render_targets_valid_ = false; + goto exit_slot_check_loop; + } } } + exit_slot_check_loop:; } } if (!are_accumulated_render_targets_valid_) { diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc index 05f6e2090..b697ff73c 100644 --- a/src/xenia/gpu/texture_cache.cc +++ b/src/xenia/gpu/texture_cache.cc @@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file, : register_file_(register_file), shared_memory_(shared_memory), draw_resolution_scale_x_(draw_resolution_scale_x), - draw_resolution_scale_y_(draw_resolution_scale_y) { + draw_resolution_scale_y_(draw_resolution_scale_y), + draw_resolution_scale_x_divisor_(draw_resolution_scale_x), + draw_resolution_scale_y_divisor_(draw_resolution_scale_y) { assert_true(draw_resolution_scale_x >= 1); assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis); assert_true(draw_resolution_scale_y >= 1); @@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x)); uint32_t config_y = uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y)); + uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x); uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y); x_out = clamped_x; @@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() { } void TextureCache::Texture::WatchCallback( - [[maybe_unused]] const global_unique_lock_type& global_lock, - bool is_mip) { + [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) { if (is_mip) { assert_not_zero(GetGuestMipsSize()); mips_outdated_ = true; @@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback( } void TextureCache::WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu) { + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu) { Texture& texture = *static_cast(context); texture.WatchCallback(global_lock, argument != 0); texture.texture_cache().texture_became_outdated_.store( @@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk( } void TextureCache::ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) { + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu) { assert_true(IsDrawResolutionScaled()); if (invalidated_by_gpu) { // Resolves themselves do exactly the opposite of what this should do. 
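The new draw_resolution_scale_*_divisor fields above let GetHostViewportInfo divide by the resolution scale with a precomputed multiply-and-shift instead of a runtime integer divide. A sketch of the magic-number scheme divisors::MagicDiv is assumed to implement (the type below is a hypothetical stand-in; it is only claimed correct for small divisors and dividends below 2^31, which covers render-target extents here):

#include <cassert>
#include <cstdint>

struct MagicDivSketch {
  uint64_t multiplier = 0;
  uint32_t shift = 0;

  explicit MagicDivSketch(uint32_t divisor) {
    assert(divisor >= 1 && divisor <= 0xFFFFu);  // tiny divisors (scale is 1..3)
    uint32_t ceil_log2 = 0;
    while ((1u << ceil_log2) < divisor) {
      ++ceil_log2;
    }
    shift = 32 + ceil_log2;
    // Round-up magic: ceil(2^shift / divisor). For dividends below 2^31 the
    // rounding error never reaches the quotient, so Apply matches n / divisor.
    multiplier = ((uint64_t(1) << shift) + divisor - 1) / divisor;
  }

  uint32_t Apply(uint32_t dividend) const {
    return uint32_t((uint64_t(dividend) * multiplier) >> shift);
  }
};

// e.g. MagicDivSketch(draw_resolution_scale_x).Apply(x_max) in place of
// x_max / draw_resolution_scale_x.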
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index cb690a286..c028c1be4 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -19,6 +19,7 @@ #include "xenia/base/assert.h" #include "xenia/base/hash.h" +#include "xenia/base/math.h" #include "xenia/base/mutex.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shared_memory.h" @@ -70,6 +71,14 @@ class TextureCache { static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out); uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; } uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; } + + divisors::MagicDiv draw_resolution_scale_x_divisor() const { + return draw_resolution_scale_x_divisor_; + } + divisors::MagicDiv draw_resolution_scale_y_divisor() const { + return draw_resolution_scale_y_divisor_; + } + bool IsDrawResolutionScaled() const { return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1; } @@ -576,8 +585,8 @@ class TextureCache { // Shared memory callback for texture data invalidation. static void WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu); + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu); // Checks if there are any pages that contain scaled resolve data within the // range. @@ -588,14 +597,15 @@ class TextureCache { const global_unique_lock_type& global_lock, void* context, uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); void ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu); const RegisterFile& register_file_; SharedMemory& shared_memory_; uint32_t draw_resolution_scale_x_; uint32_t draw_resolution_scale_y_; - + divisors::MagicDiv draw_resolution_scale_x_divisor_; + divisors::MagicDiv draw_resolution_scale_y_divisor_; static const LoadShaderInfo load_shader_info_[kLoadShaderCount]; xe::global_critical_region global_critical_region_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 47d7506e6..9ac9f13e2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; + // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and // maxViewportDimensions must be greater than or equal to the size of the @@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. 
- draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_limits.maxViewportDimensions[0], - device_limits.maxViewportDimensions[1], true, normalized_depth_control, - false, host_render_targets_used, - pixel_shader && pixel_shader->writes_depth(), viewport_info); + draw_util::GetViewportInfoArgs gviargs{}; + gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false, + device_limits.maxViewportDimensions[0], + + device_limits.maxViewportDimensions[1], true, + normalized_depth_control, false, host_render_targets_used, + pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + draw_util::GetHostViewportInfo(&gviargs, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal, diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 55a16df1d..df59b561c 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) { format == ColorRenderTargetFormat::k_32_32_FLOAT; } -inline uint32_t GetColorRenderTargetFormatComponentCount( +// if 0, 1 +// if 1, 2 +// if 3, 4 +// 2 bits per entry, shift and add 1 + +using ColorFormatComponentTable = uint32_t; + +static constexpr uint32_t GetComponentCountConst( ColorRenderTargetFormat format) { switch (format) { case ColorRenderTargetFormat::k_8_8_8_8: @@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount( case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: - return 4; + return 4 - 1; case ColorRenderTargetFormat::k_16_16: case ColorRenderTargetFormat::k_16_16_FLOAT: case ColorRenderTargetFormat::k_32_32_FLOAT: - return 2; + return 2 - 1; case ColorRenderTargetFormat::k_32_FLOAT: - return 1; + return 1 - 1; default: - assert_unhandled_case(format); return 0; } } +namespace detail { +static constexpr uint32_t encode_format_component_table() { + uint32_t result = 0; +#define ADDFORMAT(name) \ + result |= GetComponentCountConst(ColorRenderTargetFormat::name) \ + << (static_cast(ColorRenderTargetFormat::name) * 2) + ADDFORMAT(k_8_8_8_8); + ADDFORMAT(k_8_8_8_8_GAMMA); + ADDFORMAT(k_2_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT); + + ADDFORMAT(k_16_16_16_16); + ADDFORMAT(k_16_16_16_16_FLOAT); + ADDFORMAT(k_2_10_10_10_AS_10_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16); + + ADDFORMAT(k_16_16); + ADDFORMAT(k_16_16_FLOAT); + ADDFORMAT(k_32_32_FLOAT); + ADDFORMAT(k_32_FLOAT); + return result; +} +constexpr uint32_t color_format_component_table = + encode_format_component_table(); + +} // namespace detail +constexpr uint32_t GetColorRenderTargetFormatComponentCount( + ColorRenderTargetFormat format) { + return ((detail::color_format_component_table >> + (static_cast(format) * 2)) & + 0b11) + + 1; +} // Returns the version of the format with the same packing and meaning of values // stored in it, but without blending precision modifiers. constexpr ColorRenderTargetFormat GetStorageColorFormat(