Merge pull request #66 from chrisps/canary_experimental

Huge boost to readback_memexport/resolve performance by fixing an old bug; miscellaneous optimizations
Radosław Gliński 2022-08-29 00:55:24 +02:00 committed by GitHub
commit c1d3e35eb9
33 changed files with 1593 additions and 513 deletions

src/xenia/base/dma.cc (new file, +415 lines)

@ -0,0 +1,415 @@
#include "dma.h"
template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
char buffer[1024];
sprintf_s(buffer, fmt, args...);
XELOGI("%s", buffer);
}
//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
#define XEDMALOG(...) static_cast<void>(0)
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
XE_FORCEINLINE
static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
namespace xe::dma {
XE_FORCEINLINE
static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Streaming(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
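Note that vastcpy_impl above issues only non-temporal stores and no trailing store fence, so a caller that hands the destination buffer to another thread will typically want a WriteFence afterwards. A minimal usage sketch, assuming 64-byte-aligned buffers and a length that is a multiple of the cache line size (the requirement stated in dma.h); the allocation scheme and helper name are illustrative:
#include <malloc.h>  // _aligned_malloc / _aligned_free (MSVC)
#include <cstdint>
#include <cstring>
static void VastcpyExample() {
  constexpr size_t kSize = 1 << 16;  // multiple of XE_HOST_CACHE_LINE_SIZE
  auto* dst = static_cast<uint8_t*>(_aligned_malloc(kSize, 64));
  auto* src = static_cast<uint8_t*>(_aligned_malloc(kSize, 64));
  std::memset(src, 0xAB, kSize);
  xe::dma::vastcpy(dst, src, static_cast<uint32_t>(kSize));
  xe::swcache::WriteFence();  // make the streaming stores globally visible
  _aligned_free(src);
  _aligned_free(dst);
}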
#define XEDMA_NUM_WORKERS 4
class alignas(256) XeDMACGeneric : public XeDMAC {
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
std::atomic<uint64_t> free_job_slots_;
std::atomic<uint64_t> jobs_submitted_;
std::atomic<uint64_t> jobs_completed_;
std::atomic<uint32_t> num_workers_awoken_;
std::atomic<uint32_t> current_job_serial_;
} dma_volatile_;
alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
volatile uint32_t jobserials_[64];
alignas(XE_HOST_CACHE_LINE_SIZE)
std::unique_ptr<threading::Event> job_done_signals_[64];
// really dont like using unique pointer for this...
std::unique_ptr<threading::Event> job_submitted_signal_;
std::unique_ptr<threading::Event> job_completed_signal_;
std::unique_ptr<threading::Thread> scheduler_thread_;
struct WorkSlice {
uint8_t* destination;
uint8_t* source;
size_t numbytes;
};
std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
std::unique_ptr<threading::Event> worker_has_work_; //[XEDMA_NUM_WORKERS];
std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
// chrispy: this is bad
static uint32_t find_free_hole_in_dword(uint64_t dw) {
XEDMALOG("Finding free hole in 0x%llX", dw);
for (uint32_t i = 0; i < 64; ++i) {
if (dw & (1ULL << i)) {
continue;
}
return i;
}
return ~0U;
}
uint32_t allocate_free_dma_slot() {
XEDMALOG("Allocating free slot");
uint32_t got_slot = 0;
uint64_t slots;
uint64_t allocated_slot;
do {
slots = dma_volatile_.free_job_slots_.load();
got_slot = find_free_hole_in_dword(slots);
if (!~got_slot) {
XEDMALOG("Didn't get a slot!");
return ~0U;
}
allocated_slot = slots | (1ULL << got_slot);
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
slots, allocated_slot)));
XEDMALOG("Allocated slot %d", got_slot);
return got_slot;
}
// chrispy: on x86 this can just be interlockedbittestandreset...
void free_dma_slot(uint32_t slot) {
XEDMALOG("Freeing slot %d", slot);
uint64_t slots;
uint64_t deallocated_slot;
do {
slots = dma_volatile_.free_job_slots_.load();
deallocated_slot = slots & (~(1ULL << slot));
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
slots, deallocated_slot)));
}
void DoDMAJob(uint32_t idx) {
XeDMAJob& job = jobs_[idx];
if (job.precall) {
job.precall(&job);
}
// memcpy(job.destination, job.source, job.size);
size_t job_size = job.size;
size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
size_t rem = job_size - line_rounded;
size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
XEDMALOG(
"Distributing %d bytes from %p to %p across %d workers, remainder is "
"%d",
line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
if (num_per_worker < 2048) {
XEDMALOG("not distributing across workers, num_per_worker < 2048");
// not worth splitting up
memcpy(job.destination, job.source, job.size);
job.signal_on_done->Set();
} else {
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
worker_workslice_[i].destination =
(i * num_per_worker) + job.destination;
worker_workslice_[i].source = (i * num_per_worker) + job.source;
worker_workslice_[i].numbytes = num_per_worker;
}
if (rem) {
__movsb(job.destination + line_rounded, job.source + line_rounded, rem);
}
// wake them up
worker_has_work_->Set();
XEDMALOG("Starting waitall for job");
threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
false);
XEDMALOG("Waitall for job completed!");
job.signal_on_done->Set();
}
if (job.postcall) {
job.postcall(&job);
}
++dma_volatile_.jobs_completed_;
}
void WorkerIter(uint32_t worker_index) {
xenia_assert(worker_index < XEDMA_NUM_WORKERS);
auto [dest, src, size] = worker_workslice_[worker_index];
// if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
worker_has_work_->Reset();
//}
xenia_assert(size < (1ULL << 32));
// memcpy(dest, src, size);
dma::vastcpy(dest, src, static_cast<uint32_t>(size));
}
XE_NOINLINE
void WorkerMainLoop(uint32_t worker_index) {
do {
XEDMALOG("Worker iter for worker %d", worker_index);
WorkerIter(worker_index);
XEDMALOG("Worker %d is done\n", worker_index);
threading::SignalAndWait(worker_has_finished_[worker_index].get(),
worker_has_work_.get(), false);
} while (true);
}
void WorkerMain(uint32_t worker_index) {
XEDMALOG("Entered worker main loop, index %d", worker_index);
threading::Wait(worker_has_work_.get(), false);
XEDMALOG("First wait for worker %d completed, first job ever",
worker_index);
WorkerMainLoop(worker_index);
}
static void WorkerMainForwarder(void* ptr) {
// we aligned XeDma to 256 bytes and encode extra info in the low 8 bits
uintptr_t uptr = (uintptr_t)ptr;
uint32_t worker_index = (uint8_t)uptr;
uptr &= ~0xFFULL;
char name_buffer[64];
sprintf_s(name_buffer, "dma_worker_%d", worker_index);
xe::threading::set_name(name_buffer);
reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
}
void DMAMain() {
XEDMALOG("DmaMain");
do {
threading::Wait(job_submitted_signal_.get(), false);
auto slots = dma_volatile_.free_job_slots_.load();
for (uint32_t i = 0; i < 64; ++i) {
if (slots & (1ULL << i)) {
XEDMALOG("Got new job at index %d in DMAMain", i);
DoDMAJob(i);
free_dma_slot(i);
job_completed_signal_->Set();
// break;
}
}
} while (true);
}
static void DMAMainForwarder(void* ud) {
xe::threading::set_name("dma_main");
reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
}
public:
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
job->size);
uint32_t slot;
do {
slot = allocate_free_dma_slot();
if (!~slot) {
XEDMALOG(
"Didn't get a free slot, waiting for a job to complete before "
"resuming.");
threading::Wait(job_completed_signal_.get(), false);
} else {
break;
}
} while (true);
jobs_[slot] = *job;
jobs_[slot].signal_on_done = job_done_signals_[slot].get();
jobs_[slot].signal_on_done->Reset();
XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
uint32_t new_serial = dma_volatile_.current_job_serial_++;
jobserials_[slot] = new_serial;
++dma_volatile_.jobs_submitted_;
job_submitted_signal_->Set();
return (static_cast<uint64_t>(new_serial) << 32) |
static_cast<uint64_t>(slot);
// return job_done_signals_[slot].get();
}
bool AllJobsDone() {
return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
}
virtual void WaitJobDone(DMACJobHandle handle) override {
uint32_t serial = static_cast<uint32_t>(handle >> 32);
uint32_t jobid = static_cast<uint32_t>(handle);
do {
if (jobserials_[jobid] != serial) {
return; // done, our slot was reused
}
auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
std::chrono::milliseconds{1});
if (waitres == threading::WaitResult::kTimeout) {
continue;
} else {
return;
}
} while (true);
}
virtual void WaitForIdle() override {
while (!AllJobsDone()) {
threading::MaybeYield();
}
}
XeDMACGeneric() {
XEDMALOG("Constructing xedma at addr %p", this);
dma_volatile_.free_job_slots_.store(0ULL);
dma_volatile_.jobs_submitted_.store(0ULL);
dma_volatile_.jobs_completed_.store(0ULL);
dma_volatile_.current_job_serial_.store(
1ULL); // so that a jobhandle is never 0
std::memset(jobs_, 0, sizeof(jobs_));
job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
worker_has_work_ = threading::Event::CreateManualResetEvent(false);
threading::Thread::CreationParameters worker_params{};
worker_params.create_suspended = false;
worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
worker_params.stack_size = 65536; // dont need much stack at all
for (uint32_t i = 0; i < 64; ++i) {
job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
}
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
// worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
xenia_assert(!(encoded & 0xFFULL));
xenia_assert(i < 256);
encoded |= i;
workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
XeDMACGeneric::WorkerMainForwarder((void*)encoded);
});
}
threading::Thread::CreationParameters scheduler_params{};
scheduler_params.create_suspended = false;
scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
scheduler_params.stack_size = 65536;
scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
XeDMACGeneric::DMAMainForwarder((void*)this);
});
}
};
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
} // namespace xe::dma
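For reference, PushDMAJob above packs the job's serial number into the upper 32 bits of the returned DMACJobHandle and the slot index into the lower 32 bits; WaitJobDone uses the serial to notice that a slot has been recycled for a newer job. A standalone sketch of that encoding, with hypothetical helper names:
#include <cstdint>
// Illustrative helpers mirroring the handle layout used by XeDMACGeneric:
// high 32 bits = job serial, low 32 bits = slot index.
static uint64_t MakeJobHandle(uint32_t serial, uint32_t slot) {
  return (static_cast<uint64_t>(serial) << 32) | static_cast<uint64_t>(slot);
}
static uint32_t HandleSerial(uint64_t handle) {
  return static_cast<uint32_t>(handle >> 32);
}
static uint32_t HandleSlot(uint64_t handle) {
  return static_cast<uint32_t>(handle);
}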

src/xenia/base/dma.h (new file, +46 lines)

@ -0,0 +1,46 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_DMA_H_
#define XENIA_BASE_DMA_H_
#include "memory.h"
#include "threading.h"
namespace xe::dma {
struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
threading::Event* signal_on_done;
uint8_t* destination;
uint8_t* source;
size_t size;
DmaPrecall precall;
DmaPostcall postcall;
void* userdata1;
void* userdata2;
};
using DMACJobHandle = uint64_t;
class XeDMAC {
public:
virtual ~XeDMAC() {}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
virtual void WaitJobDone(DMACJobHandle handle) = 0;
virtual void WaitForIdle() = 0;
};
XeDMAC* CreateDMAC();
// written_length must be divisible by the cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace xe::dma
#endif // XENIA_BASE_DMA_H_
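A hedged usage sketch for the interface above: create a DMAC once, fill out a XeDMAJob, push it, then wait on the returned handle. signal_on_done is filled in by the implementation; the include path follows the file's location in the tree and the buffer names are placeholders:
#include "xenia/base/dma.h"
static void CopyWithDmac(uint8_t* dst, uint8_t* src, size_t size) {
  static xe::dma::XeDMAC* dmac = xe::dma::CreateDMAC();
  xe::dma::XeDMAJob job = {};
  job.destination = dst;
  job.source = src;
  job.size = size;
  // precall/postcall/userdata are optional and left null here.
  xe::dma::DMACJobHandle handle = dmac->PushDMAJob(&job);
  // ... overlap other work here ...
  dmac->WaitJobDone(handle);
}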


@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) {
return m128_i64<N>(_mm_castps_pd(v)); return m128_i64<N>(_mm_castps_pd(v));
} }
/*
   std::min/max float has handling for nans, where if either argument is nan
   the first argument is returned
   minss/maxss are different, if either argument is nan the second operand to
   the instruction is returned
   this is problematic because we have no assurances from the compiler on the
   argument ordering
   so only use in places where nan handling is not needed
*/
static float xe_minf(float x, float y) { XE_FORCEINLINE
static float ArchMin(float x, float y) {
return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
} }
static float xe_maxf(float x, float y) { XE_FORCEINLINE
static float ArchMax(float x, float y) {
return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
} }
static float xe_rcpf(float den) { XE_FORCEINLINE
static float ArchReciprocal(float den) {
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
} }
#else #else
static float xe_minf(float x, float y) { return std::min<float>(x, y); } static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float xe_maxf(float x, float y) { return std::max<float>(x, y); } static float ArchMax(float x, float y) { return std::max<float>(x, y); }
static float xe_rcpf(float den) { return 1.0f / den; } static float ArchReciprocal(float den) { return 1.0f / den; }
#endif #endif
XE_FORCEINLINE
static float RefineReciprocal(float initial, float den) {
float t0 = initial * den;
float t1 = t0 * initial;
float rcp2 = initial + initial;
return rcp2 - t1;
}
XE_FORCEINLINE
static float ArchReciprocalRefined(float den) {
return RefineReciprocal(ArchReciprocal(den), den);
}
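RefineReciprocal above is one Newton-Raphson step: given an estimate r of 1/d it returns 2r - (r*d)*r, which is algebraically r*(2 - d*r) and roughly doubles the accurate bits of the coarse rcpss estimate (about 12 bits to about 23). A small scalar sketch, assuming only the standard library:
#include <cstdio>
// One Newton-Raphson iteration for the reciprocal, same math as
// RefineReciprocal written in the r * (2 - d * r) form.
static float NewtonReciprocalStep(float r, float d) { return r * (2.0f - d * r); }
static void ReciprocalDemo() {
  float d = 3.0f;
  float r0 = 0.333f;                       // stand-in for the coarse rcpss estimate
  float r1 = NewtonReciprocalStep(r0, d);  // refined estimate
  std::printf("%g -> %g (exact %g)\n", r0, r1, 1.0f / d);
}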
// Similar to the C++ implementation of XMConvertFloatToHalf and // Similar to the C++ implementation of XMConvertFloatToHalf and
// XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the
@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) {
} }
return T(result); return T(result);
} }
namespace divisors {
union IDivExtraInfo {
uint32_t value_;
struct {
uint32_t shift_ : 31;
uint32_t add_ : 1;
} info;
};
// returns magicnum multiplier
static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
IDivExtraInfo extra;
uint32_t d = _denom;
int p;
uint32_t nc, delta, q1, r1, q2, r2;
struct {
unsigned M;
int a;
int s;
} magu;
magu.a = 0;
nc = -1 - ((uint32_t) - (int32_t)d) % d;
p = 31;
q1 = 0x80000000 / nc;
r1 = 0x80000000 - q1 * nc;
q2 = 0x7FFFFFFF / d;
r2 = 0x7FFFFFFF - q2 * d;
do {
p += 1;
if (r1 >= nc - r1) {
q1 = 2 * q1 + 1;
r1 = 2 * r1 - nc;
} else {
q1 = 2 * q1;
r1 = 2 * r1;
}
if (r2 + 1 >= d - r2) {
if (q2 >= 0x7FFFFFFF) {
magu.a = 1;
}
q2 = 2 * q2 + 1;
r2 = 2 * r2 + 1 - d;
} else {
if (q2 >= 0x80000000U) {
magu.a = 1;
}
q2 = 2 * q2;
r2 = 2 * r2 + 1;
}
delta = d - 1 - r2;
} while (p < 64 && (q1 < delta || r1 == 0));
extra.info.add_ = magu.a;
extra.info.shift_ = p - 32;
out_extra = extra.value_;
return static_cast<uint32_t>(q2 + 1);
}
static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
uint32_t extradata) {
IDivExtraInfo extra;
extra.value_ = extradata;
uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
if (extra.info.add_) {
uint32_t addend = result + num;
addend = ((addend < result ? 0x80000000 : 0) | addend);
result = addend;
}
return result >> extra.info.shift_;
}
static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
uint32_t extradata, uint32_t original) {
uint32_t dived = ApplyUint32Div(num, mul, extradata);
unsigned result = num - (dived * original);
return result;
}
struct MagicDiv {
uint32_t multiplier_;
uint32_t extradata_;
MagicDiv() : multiplier_(0), extradata_(0) {}
MagicDiv(uint32_t original) {
multiplier_ = PregenerateUint32Div(original, extradata_);
}
uint32_t Apply(uint32_t numerator) const {
return ApplyUint32Div(numerator, multiplier_, extradata_);
}
};
} // namespace divisors
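The divisors helpers implement the usual magic-number unsigned division: precompute a multiplier (plus a shift and an add flag) for a fixed divisor, then each division becomes a multiply-high and a shift. A brute-force sanity check of the intended usage, assuming math.h is included:
#include <cassert>
#include <cstdint>
static void MagicDivCheck() {
  // Precompute once for a fixed divisor, then divide many numerators cheaply.
  xe::divisors::MagicDiv div7(7);
  for (uint32_t n = 0; n < 100000; ++n) {
    assert(div7.Apply(n) == n / 7);
  }
}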
} // namespace xe } // namespace xe
#endif // XENIA_BASE_MATH_H_ #endif // XENIA_BASE_MATH_H_


@ -672,25 +672,58 @@ static void Prefetch<PrefetchTag::Level1>(const void* addr) {
#define XE_MSVC_REORDER_BARRIER() static_cast<void>(0) #define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
#endif #endif
#if XE_ARCH_AMD64 == 1 #if XE_ARCH_AMD64 == 1
union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
struct {
__m256 low32;
__m256 high32;
};
struct {
__m128i xmms[4];
};
float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)];
};
XE_FORCEINLINE XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) { static void WriteLineNT(CacheLine* XE_RESTRICT destination,
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0); const CacheLine* XE_RESTRICT source) {
__m256i low = _mm256_loadu_si256((const __m256i*)source); assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
__m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); __m256 low = _mm256_loadu_ps(&source->floats[0]);
XE_MSVC_REORDER_BARRIER(); __m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_stream_si256((__m256i*)destination, low); _mm256_stream_ps(&destination->floats[0], low);
_mm256_stream_si256(&((__m256i*)destination)[1], high); _mm256_stream_ps(&destination->floats[8], high);
} }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) { static void ReadLineNT(CacheLine* XE_RESTRICT destination,
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0); const CacheLine* XE_RESTRICT source) {
__m256i low = _mm256_stream_load_si256((const __m256i*)source); assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
__m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
XE_MSVC_REORDER_BARRIER(); __m128i first = _mm_stream_load_si128(&source->xmms[0]);
_mm256_storeu_si256((__m256i*)destination, low); __m128i second = _mm_stream_load_si128(&source->xmms[1]);
_mm256_storeu_si256(&((__m256i*)destination)[1], high); __m128i third = _mm_stream_load_si128(&source->xmms[2]);
__m128i fourth = _mm_stream_load_si128(&source->xmms[3]);
destination->xmms[0] = first;
destination->xmms[1] = second;
destination->xmms[2] = third;
destination->xmms[3] = fourth;
}
XE_FORCEINLINE
static void ReadLine(CacheLine* XE_RESTRICT destination,
const CacheLine* XE_RESTRICT source) {
assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
__m256 low = _mm256_loadu_ps(&source->floats[0]);
__m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_storeu_ps(&destination->floats[0], low);
_mm256_storeu_ps(&destination->floats[8], high);
}
XE_FORCEINLINE
static void WriteLine(CacheLine* XE_RESTRICT destination,
const CacheLine* XE_RESTRICT source) {
assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
__m256 low = _mm256_loadu_ps(&source->floats[0]);
__m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_storeu_ps(&destination->floats[0], low);
_mm256_storeu_ps(&destination->floats[8], high);
} }
XE_FORCEINLINE XE_FORCEINLINE
@ -699,19 +732,29 @@ XE_FORCEINLINE
static void ReadFence() { _mm_lfence(); } static void ReadFence() { _mm_lfence(); }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadWriteFence() { _mm_mfence(); } static void ReadWriteFence() { _mm_mfence(); }
#else #else
union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
uint8_t bvals[XE_HOST_CACHE_LINE_SIZE];
};
XE_FORCEINLINE XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) { static void WriteLineNT(CacheLine* destination, const CacheLine* source) {
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0); memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
memcpy(destination, source, 64);
} }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) { static void ReadLineNT(CacheLine* destination, const CacheLine* source) {
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0); memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
memcpy(destination, source, 64);
} }
XE_FORCEINLINE
static void WriteLine(CacheLine* destination, const CacheLine* source) {
memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
}
XE_FORCEINLINE
static void ReadLine(CacheLine* destination, const CacheLine* source) {
memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
}
XE_FORCEINLINE XE_FORCEINLINE
static void WriteFence() {} static void WriteFence() {}
XE_FORCEINLINE XE_FORCEINLINE
@ -720,6 +763,47 @@ XE_FORCEINLINE
static void ReadWriteFence() {} static void ReadWriteFence() {}
#endif #endif
} // namespace swcache } // namespace swcache
template <unsigned Size>
static void smallcpy_const(void* destination, const void* source) {
#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
if constexpr ((Size & 7) == 0) {
__movsq((unsigned long long*)destination, (const unsigned long long*)source,
Size / 8);
} else if constexpr ((Size & 3) == 0) {
__movsd((unsigned long*)destination, (const unsigned long*)source,
Size / 4);
// dont even bother with movsw, i think the operand size override prefix
// slows it down
} else {
__movsb((unsigned char*)destination, (const unsigned char*)source, Size);
}
#else
memcpy(destination, source, Size);
#endif
}
template <unsigned Size>
static void smallset_const(void* destination, unsigned char fill_value) {
#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
if constexpr ((Size & 7) == 0) {
unsigned long long fill =
static_cast<unsigned long long>(fill_value) * 0x0101010101010101ULL;
__stosq((unsigned long long*)destination, fill, Size / 8);
} else if constexpr ((Size & 3) == 0) {
unsigned long fill =
static_cast<unsigned long>(fill_value) * 0x01010101U;
__stosd((unsigned long*)destination, fill, Size / 4);
// dont even bother with stosw, i think the operand size override prefix
// slows it down
} else {
__stosb((unsigned char*)destination, fill_value, Size);
}
#else
memset(destination, fill_value, Size);
#endif
}
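smallcpy_const and smallset_const pick the widest rep movs/stos form the compile-time size allows on MSVC/AMD64 and fall back to memcpy/memset elsewhere. A short usage sketch with a hypothetical POD type:
#include <cstdint>
struct GuestRegs {  // illustrative 32-byte POD
  uint64_t r[4];
};
static void CopyRegs(GuestRegs* dst, const GuestRegs* src) {
  // Size is a multiple of 8, so the __movsq path is selected at compile time.
  xe::smallcpy_const<sizeof(GuestRegs)>(dst, src);
}
static void ClearRegs(GuestRegs* regs) {
  xe::smallset_const<sizeof(GuestRegs)>(regs, 0);  // __stosq path
}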
} // namespace xe } // namespace xe
#endif // XENIA_BASE_MEMORY_H_ #endif // XENIA_BASE_MEMORY_H_


@ -59,16 +59,8 @@ class RingBuffer {
// subtract instead // subtract instead
void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
ring_size_t read_count() const { ring_size_t read_count() const {
// chrispy: these branches are unpredictable // chrispy: these branches are unpredictable
#if 0
if (read_offset_ == write_offset_) {
return 0;
} else if (read_offset_ < write_offset_) {
return write_offset_ - read_offset_;
} else {
return (capacity_ - read_offset_) + write_offset_;
}
#else
ring_size_t read_offs = read_offset_; ring_size_t read_offs = read_offset_;
ring_size_t write_offs = write_offset_; ring_size_t write_offs = write_offset_;
ring_size_t cap = capacity_; ring_size_t cap = capacity_;
@ -77,14 +69,6 @@ class RingBuffer {
ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
ring_size_t comparison_value = read_offs <= write_offs; ring_size_t comparison_value = read_offs <= write_offs;
#if 0
size_t selector =
static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
offset_delta &= selector;
wrap_read_count &= ~selector;
return offset_delta | wrap_read_count;
#else
if (XE_LIKELY(read_offs <= write_offs)) { if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically return offset_delta; // will be 0 if they are equal, semantically
@ -93,8 +77,6 @@ class RingBuffer {
} else { } else {
return wrap_read_count; return wrap_read_count;
} }
#endif
#endif
} }
ring_size_t write_offset() const { return write_offset_; } ring_size_t write_offset() const { return write_offset_; }
@ -116,9 +98,9 @@ class RingBuffer {
void AdvanceWrite(size_t count); void AdvanceWrite(size_t count);
struct ReadRange { struct ReadRange {
const uint8_t* first; const uint8_t* XE_RESTRICT first;
const uint8_t* second; const uint8_t* XE_RESTRICT second;
ring_size_t first_length; ring_size_t first_length;
ring_size_t second_length; ring_size_t second_length;
}; };
@ -126,9 +108,11 @@ class RingBuffer {
void EndRead(ReadRange read_range); void EndRead(ReadRange read_range);
/*
    BeginRead, but if there is a second Range it will prefetch all lines of
    it
    this does not prefetch the first range, because software prefetching
    can do that faster than we can
*/
template <swcache::PrefetchTag tag> template <swcache::PrefetchTag tag>
XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
@ -138,7 +122,7 @@ class RingBuffer {
ring_size_t numlines = ring_size_t numlines =
xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) / xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
XE_HOST_CACHE_LINE_SIZE; XE_HOST_CACHE_LINE_SIZE;
//chrispy: maybe unroll? // chrispy: maybe unroll?
for (ring_size_t i = 0; i < numlines; ++i) { for (ring_size_t i = 0; i < numlines; ++i) {
swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
} }
@ -187,7 +171,7 @@ class RingBuffer {
} }
private: private:
uint8_t* buffer_ = nullptr; uint8_t* XE_RESTRICT buffer_ = nullptr;
ring_size_t capacity_ = 0; ring_size_t capacity_ = 0;
ring_size_t read_offset_ = 0; ring_size_t read_offset_ = 0;
ring_size_t write_offset_ = 0; ring_size_t write_offset_ = 0;
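As a concrete example of the read_count arithmetic above: with capacity_ = 16, read_offset_ = 12 and write_offset_ = 4 the read pointer has to wrap, so the count is (16 - 12) + 4 = 8; when read_offset_ <= write_offset_ it is simply the difference. The same computation as a free function, for illustration only:
#include <cstdint>
static uint32_t RingReadCount(uint32_t read_offs, uint32_t write_offs,
                              uint32_t capacity) {
  return read_offs <= write_offs ? write_offs - read_offs
                                 : (capacity - read_offs) + write_offs;
}
// RingReadCount(12, 4, 16) == 8, RingReadCount(4, 12, 16) == 8,
// RingReadCount(5, 5, 16) == 0.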


@ -0,0 +1,87 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2019 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_SPLIT_MAP_H_
#define XENIA_BASE_SPLIT_MAP_H_
#include <algorithm>
#include <vector>
namespace xe {
/*
a map structure that is optimized for infrequent
reallocation/resizing/erasure and frequent searches by key; implemented as
two std::vectors, one of the keys and one of the values
*/
template <typename TKey, typename TValue>
class split_map {
using key_vector = std::vector<TKey>;
using value_vector = std::vector<TValue>;
key_vector keys_;
value_vector values_;
public:
using my_type = split_map<TKey, TValue>;
uint32_t IndexForKey(const TKey& k) {
auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k);
return static_cast<uint32_t>(lbound - keys_.begin());
}
uint32_t size() const { return static_cast<uint32_t>(keys_.size()); }
key_vector& Keys() { return keys_; }
value_vector& Values() { return values_; }
void clear() {
keys_.clear();
values_.clear();
}
void resize(uint32_t new_size) {
keys_.resize(static_cast<size_t>(new_size));
values_.resize(static_cast<size_t>(new_size));
}
void reserve(uint32_t new_size) {
keys_.reserve(static_cast<size_t>(new_size));
values_.reserve(static_cast<size_t>(new_size));
}
const TKey* KeyAt(uint32_t index) const {
if (index == size()) {
return nullptr;
} else {
return &keys_[index];
}
}
const TValue* ValueAt(uint32_t index) const {
if (index == size()) {
return nullptr;
} else {
return &values_[index];
}
}
void InsertAt(TKey k, TValue v, uint32_t idx) {
uint32_t old_size = size();
bool needs_shiftup = idx != old_size;
values_.insert(values_.begin() + idx, v);
keys_.insert(keys_.begin() + idx, k);
}
void EraseAt(uint32_t idx) {
uint32_t old_size = size();
if (idx == old_size) {
return; // trying to erase nonexistent entry
} else {
values_.erase(values_.begin() + idx);
keys_.erase(keys_.begin() + idx);
}
}
};
} // namespace xe
#endif // XENIA_BASE_SPLIT_MAP_H_
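A short usage sketch for split_map: the key vector has to stay sorted for IndexForKey's lower_bound to be meaningful, so inserts go through IndexForKey followed by InsertAt. The names and values here are illustrative:
#include <cstdint>
#include <cstdio>
#include "xenia/base/split_map.h"
static void SplitMapDemo() {
  xe::split_map<uint32_t, const char*> addr_to_name;
  auto insert_sorted = [&](uint32_t key, const char* value) {
    addr_to_name.InsertAt(key, value, addr_to_name.IndexForKey(key));
  };
  insert_sorted(0x82000000, "main");
  insert_sorted(0x82100000, "helper");
  uint32_t idx = addr_to_name.IndexForKey(0x82100000);
  const uint32_t* key = addr_to_name.KeyAt(idx);  // nullptr if past the end
  if (key && *key == 0x82100000) {
    std::printf("%s\n", *addr_to_name.ValueAt(idx));  // prints "helper"
  }
}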


@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"code. The workaround may cause reduced CPU performance but is a " "code. The workaround may cause reduced CPU performance but is a "
"more accurate emulation", "more accurate emulation",
"x64"); "x64");
DEFINE_uint32(align_all_basic_blocks, 0,
"Aligns the start of all basic blocks to N bytes. Only specify a "
"power of 2, 16 is the recommended value. Results in larger "
"icache usage, but potentially faster loops",
"x64");
#if XE_X64_PROFILER_AVAILABLE == 1 #if XE_X64_PROFILER_AVAILABLE == 1
DEFINE_bool(instrument_call_times, false, DEFINE_bool(instrument_call_times, false,
"Compute time taken for functions, for profiling guest code", "Compute time taken for functions, for profiling guest code",
@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C);
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
return true; return true;
} }
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
#if 0
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
for (auto&& callsite : call_sites_) {
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
}
#else
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
#if 0
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
for (auto&& callsite : call_sites_) {
id_to_rgc[callsite.offset_] = &callsite;
}
#else
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
for (auto&& callsite : call_sites_) {
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
#endif
}
#endif
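InjectCallAddresses scans the copied code for the 0xFF opcode byte plus 32-bit id placeholder the emitter left at each resolvable guest call, rewrites the opcode to E8 (call rel32) or E9 (jmp rel32), and overwrites the id with the displacement. The displacement math, as a standalone sketch with hypothetical addresses:
#include <cstdint>
// rel32 in call/jmp is relative to the end of the 5-byte instruction,
// which is what the hunter + 1 expression above computes.
static uint32_t Rel32Displacement(uintptr_t instruction_address,
                                  uintptr_t target_address) {
  uintptr_t next_instruction = instruction_address + 5;  // 1 opcode + 4 imm
  return static_cast<uint32_t>(static_cast<intptr_t>(target_address) -
                               static_cast<intptr_t>(next_instruction));
}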
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function) { GuestFunction* function) {
// To avoid changing xbyak, we do a switcharoo here. // To avoid changing xbyak, we do a switcharoo here.
@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
if (function) { if (function) {
code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
new_execute_address, new_write_address); new_execute_address, new_write_address);
if (cvars::resolve_rel32_guest_calls) {
for (auto&& callsite : call_sites_) {
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter = reinterpret_cast<RGCEmitted*>(
reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; if (cvars::resolve_rel32_guest_calls) {
hunter->rgcid_ = InjectCallAddresses(new_execute_address);
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
} }
} else { } else {
code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
label = label->next; label = label->next;
} }
if (cvars::align_all_basic_blocks) {
align(cvars::align_all_basic_blocks, true);
}
// Process instructions. // Process instructions.
const Instr* instr = block->instr_head; const Instr* instr = block->instr_head;
while (instr) { while (instr) {
@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = {
vec128i(0x7f800000), vec128i(0x7f800000),
/* XMMThreeFloatMask */ /* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U), vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504),
/*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1),
/* /*
XMMF16UnpackLCPI2 XMMF16UnpackLCPI2
*/ */
@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = {
/*XMMXOPWordShiftMask*/ /*XMMXOPWordShiftMask*/
vec128s(15), vec128s(15),
/*XMMXOPDwordShiftMask*/ /*XMMXOPDwordShiftMask*/
vec128i(31) vec128i(31)};
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) { for (auto& vec : xmm_consts) {


@ -157,9 +157,6 @@ enum XmmConst {
XMMLVSRTableBase, XMMLVSRTableBase,
XMMSingleDenormalMask, XMMSingleDenormalMask,
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart,
XMMVSRShlByteshuf,
XMMVSRMask,
XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768
XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 XMMF16UnpackLCPI3, // 0x0x7fe000007fe000
XMMF16PackLCPI0, XMMF16PackLCPI0,
@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags {
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
kX64EmitBMI1 = 1 << 3, kX64EmitBMI1 = 1 << 3,
kX64EmitBMI2 = 1 << 4, kX64EmitBMI2 = 1 << 4,
kX64EmitF16C = 1 << 5, kX64EmitPrefetchW = 1 << 5,
kX64EmitMovbe = 1 << 6, kX64EmitMovbe = 1 << 6,
kX64EmitGFNI = 1 << 7, kX64EmitGFNI = 1 << 7,
@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags {
// inc/dec) do not introduce false dependencies on EFLAGS // inc/dec) do not introduce false dependencies on EFLAGS
// because the individual flags are treated as different vars by // because the individual flags are treated as different vars by
// the processor. (this applies to zen) // the processor. (this applies to zen)
kX64EmitPrefetchW = 1 << 16, kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx
kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx
// instructions, and FX users need the boost // instructions, and FX users need the boost
kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? kX64EmitFMA4 = 1 << 17, // todo: also use on zen1?
kX64EmitTBM = 1 << 19 kX64EmitTBM = 1 << 18,
// kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
// 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
// instructions by using the legacy sse version if we recently cleared the
// high 128 bits of the
}; };
class ResolvableGuestCall { class ResolvableGuestCall {
public: public:
@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
uint32_t debug_info_flags, FunctionDebugInfo* debug_info, uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
void** out_code_address, size_t* out_code_size, void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map); std::vector<SourceMapEntry>* out_source_map);
void InjectCallAddresses(void* new_execute_addr);
public: public:
// Reserved: rsp, rsi, rdi // Reserved: rsp, rsi, rdi


@ -43,23 +43,23 @@ enum KeyType {
KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE,
KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE,
}; };
using InstrKeyValue = uint32_t;
#pragma pack(push, 1) #pragma pack(push, 1)
union InstrKey { union InstrKey {
uint32_t value; InstrKeyValue value;
struct { struct {
uint32_t opcode : 8; InstrKeyValue opcode : 8;
uint32_t dest : 5; InstrKeyValue dest : 5;
uint32_t src1 : 5; InstrKeyValue src1 : 5;
uint32_t src2 : 5; InstrKeyValue src2 : 5;
uint32_t src3 : 5; InstrKeyValue src3 : 5;
uint32_t reserved : 4; InstrKeyValue reserved : 4;
}; };
operator uint32_t() const { return value; } operator InstrKeyValue() const { return value; }
InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
InstrKey(uint32_t v) : value(v) {} InstrKey(InstrKeyValue v) : value(v) {}
// this used to take about 1% cpu while precompiling // this used to take about 1% cpu while precompiling
// it kept reloading opcode, and also constantly repacking and unpacking the // it kept reloading opcode, and also constantly repacking and unpacking the
@ -67,16 +67,16 @@ union InstrKey {
InstrKey(const Instr* i) : value(0) { InstrKey(const Instr* i) : value(0) {
const OpcodeInfo* info = i->GetOpcodeInfo(); const OpcodeInfo* info = i->GetOpcodeInfo();
uint32_t sig = info->signature; InstrKeyValue sig = info->signature;
OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
uint32_t out_desttype = (uint32_t)dest_type; InstrKeyValue out_desttype = (InstrKeyValue)dest_type;
uint32_t out_src1type = (uint32_t)src1_type; InstrKeyValue out_src1type = (InstrKeyValue)src1_type;
uint32_t out_src2type = (uint32_t)src2_type; InstrKeyValue out_src2type = (InstrKeyValue)src2_type;
uint32_t out_src3type = (uint32_t)src3_type; InstrKeyValue out_src3type = (InstrKeyValue)src3_type;
Value* destv = i->dest; Value* destv = i->dest;
// pre-deref, even if not value // pre-deref, even if not value
@ -105,7 +105,7 @@ union InstrKey {
template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X, template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X,
KeyType SRC2 = KEY_TYPE_X, KeyType SRC3 = KEY_TYPE_X> KeyType SRC2 = KEY_TYPE_X, KeyType SRC3 = KEY_TYPE_X>
struct Construct { struct Construct {
static const uint32_t value = static const InstrKeyValue value =
(OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23);
}; };
}; };
@ -307,8 +307,8 @@ struct I<OPCODE, DEST> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
return true; return true;
} }
@ -329,8 +329,8 @@ struct I<OPCODE, DEST, SRC1> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
return true; return true;
@ -355,8 +355,8 @@ struct I<OPCODE, DEST, SRC1, SRC2> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
src2.Load(i->src2); src2.Load(i->src2);
@ -385,8 +385,8 @@ struct I<OPCODE, DEST, SRC1, SRC2, SRC3> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue ikey) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (ikey == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
src2.Load(i->src2); src2.Load(i->src2);
@ -422,9 +422,9 @@ struct Sequence {
static constexpr uint32_t head_key() { return T::key; } static constexpr uint32_t head_key() { return T::key; }
static bool Select(X64Emitter& e, const Instr* i) { static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) {
T args; T args;
if (!args.Load(i)) { if (!args.Load(i, ikey)) {
return false; return false;
} }
SEQ::Emit(e, args); SEQ::Emit(e, args);
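InstrKey packs the opcode plus the dest/src1/src2/src3 operand type codes into one 32-bit value (8 + 5 + 5 + 5 + 5 bits, 4 reserved), so selecting a sequence is a single integer compare; passing the precomputed key into Load avoids rebuilding it per candidate. The same packing as a standalone sketch, field offsets taken from Construct above:
#include <cstdint>
static constexpr uint32_t PackInstrKey(uint32_t opcode, uint32_t dest,
                                       uint32_t src1, uint32_t src2,
                                       uint32_t src3) {
  return opcode | (dest << 8) | (src1 << 13) | (src2 << 18) | (src3 << 23);
}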


@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
if (valid) { if (valid) {
auto name = i.src2.value->name; auto name = i.src2.value->name;
switch (opcode) { switch (opcode) {
case OPCODE_IS_TRUE:
e.jnz(name, e.T_NEAR);
break;
case OPCODE_IS_FALSE:
e.jz(name, e.T_NEAR);
break;
case OPCODE_COMPARE_EQ: case OPCODE_COMPARE_EQ:
e.je(name, e.T_NEAR); e.je(name, e.T_NEAR);
break; break;
@ -299,26 +293,14 @@ struct CALL_TRUE_I64
struct CALL_TRUE_F32 struct CALL_TRUE_F32
: Sequence<CALL_TRUE_F32, I<OPCODE_CALL_TRUE, VoidOp, F32Op, SymbolOp>> { : Sequence<CALL_TRUE_F32, I<OPCODE_CALL_TRUE, VoidOp, F32Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->is_guest()); assert_impossible_sequence(CALL_TRUE_F32);
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
e.L(skip);
e.ForgetMxcsrMode();
} }
}; };
struct CALL_TRUE_F64 struct CALL_TRUE_F64
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> { : Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->is_guest()); assert_impossible_sequence(CALL_TRUE_F64);
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
e.L(skip);
e.ForgetMxcsrMode();
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32
: Sequence<CALL_INDIRECT_TRUE_F32, : Sequence<CALL_INDIRECT_TRUE_F32,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32Op, I64Op>> { I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(CALL_INDIRECT_TRUE_F32);
Xbyak::Label skip;
e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2);
e.L(skip);
} }
}; };
struct CALL_INDIRECT_TRUE_F64 struct CALL_INDIRECT_TRUE_F64
: Sequence<CALL_INDIRECT_TRUE_F64, : Sequence<CALL_INDIRECT_TRUE_F64,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64Op, I64Op>> { I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(CALL_INDIRECT_TRUE_F64);
Xbyak::Label skip;
e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2);
e.L(skip);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8,
@ -486,15 +460,13 @@ struct RETURN_TRUE_I64
struct RETURN_TRUE_F32 struct RETURN_TRUE_F32
: Sequence<RETURN_TRUE_F32, I<OPCODE_RETURN_TRUE, VoidOp, F32Op>> { : Sequence<RETURN_TRUE_F32, I<OPCODE_RETURN_TRUE, VoidOp, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(RETURN_TRUE_F32);
e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
} }
}; };
struct RETURN_TRUE_F64 struct RETURN_TRUE_F64
: Sequence<RETURN_TRUE_F64, I<OPCODE_RETURN_TRUE, VoidOp, F64Op>> { : Sequence<RETURN_TRUE_F64, I<OPCODE_RETURN_TRUE, VoidOp, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(RETURN_TRUE_F64);
e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16,
@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64
struct BRANCH_TRUE_F32 struct BRANCH_TRUE_F32
: Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> { : Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && /*
i.instr->prev->dest == i.src1.value) { chrispy: right now, im not confident that we are always clearing
e.jnz(i.src2.value->name, e.T_NEAR); the upper 96 bits of registers, making vptest extremely unsafe. many
} else if (i.instr->prev && ss/sd operations copy over the upper 96 from the source, and for abs we
i.instr->prev->opcode == &OPCODE_IS_FALSE_info && negate ALL elements, making the top 64 bits contain 0x80000000 etc
i.instr->prev->dest == i.src1.value) { */
e.jz(i.src2.value->name, e.T_NEAR); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
} else { e.vmovd(e.eax, input);
e.vptest(i.src1, i.src1); e.test(e.eax, e.eax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->name, e.T_NEAR);
}
} }
}; };
struct BRANCH_TRUE_F64 struct BRANCH_TRUE_F64
: Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> { : Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
i.instr->prev->dest == i.src1.value) { e.vmovq(e.rax, input);
e.jnz(i.src2.value->name, e.T_NEAR); e.test(e.rax, e.rax);
} else if (i.instr->prev && e.jnz(i.src2.value->name, e.T_NEAR);
i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
i.instr->prev->dest == i.src1.value) {
e.jz(i.src2.value->name, e.T_NEAR);
} else {
e.vptest(i.src1, i.src1);
e.jnz(i.src2.value->name, e.T_NEAR);
}
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32
: Sequence<BRANCH_FALSE_F32, : Sequence<BRANCH_FALSE_F32,
I<OPCODE_BRANCH_FALSE, VoidOp, F32Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, F32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input);
e.test(e.eax, e.eax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->name, e.T_NEAR);
} }
}; };
@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64
: Sequence<BRANCH_FALSE_F64, : Sequence<BRANCH_FALSE_F64,
I<OPCODE_BRANCH_FALSE, VoidOp, F64Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, F64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input);
e.test(e.rax, e.rax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->name, e.T_NEAR);
} }
}; };
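The branch sequences above stop using vptest (which tests all 128 bits of the source register) and instead move just the low scalar lane to a GPR and test that, since the upper lanes of an F32/F64 value are not guaranteed to be zeroed. A scalar sketch of the semantics now being emitted, using SSE intrinsics as stand-ins for the generated vmovd/test/jnz:
#include <cstdint>
#include <immintrin.h>
static bool BranchTrueF32(__m128 value) {
  // Only the low 32 bits participate; garbage in the upper lanes is ignored.
  uint32_t bits =
      static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(value)));
  return bits != 0;  // note: -0.0f (0x80000000) still counts as taken
}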


@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) { if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) {
return false; return false;
} }
if (IsTracingData()) { // incompatible with tracing
return false;
}
uint32_t guestaddr = i->GuestAddressFor(); uint32_t guestaddr = i->GuestAddressFor();
if (!guestaddr) { if (!guestaddr) {
return false; return false;
@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
return flags && flags->accessed_mmio; return flags && flags->accessed_mmio;
} }
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
template <typename T, bool swap>
static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) {
T value;
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
value = *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr);
if (swap) {
value = xe::byte_swap(value);
}
} else {
/*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
value = gaddr->read(nullptr, gaddr->callback_context, guestaddr);
}
return value;
}
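MMIOAwareLoad and MMIOAwareStore are plain helpers the emitter calls through CallNativeSafe when a load/store site has previously faulted on an MMIO range: they re-check LookupVirtualMappedRange at runtime and either invoke the registered callback or fall through to ordinary guest memory. A sketch of the direct-call equivalent of what the generated code does for a byte-swapped 32-bit load; the wrapper name is hypothetical:
static uint32_t LoadGuestU32(void* ppc_context, uint32_t guest_address) {
  // Matches the template instantiation selected when LOAD_STORE_BYTE_SWAP is set.
  return MMIOAwareLoad<uint32_t, true>(ppc_context, guest_address);
}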
// ============================================================================ // ============================================================================
// OPCODE_LOAD_OFFSET // OPCODE_LOAD_OFFSET
// ============================================================================ // ============================================================================
@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16
struct LOAD_OFFSET_I32 struct LOAD_OFFSET_I32
: Sequence<LOAD_OFFSET_I32, I<OPCODE_LOAD_OFFSET, I32Op, I64Op, I64Op>> { : Sequence<LOAD_OFFSET_I32, I<OPCODE_LOAD_OFFSET, I32Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); if (IsPossibleMMIOInstruction(e, i.instr)) {
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]); if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
} else {
e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32());
}
e.CallNativeSafe(addrptr);
e.mov(i.dest, e.eax);
} else {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]);
} else {
e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
}
} else { } else {
e.mov(i.dest, e.dword[addr]); e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
} }
} else {
e.mov(i.dest, e.dword[addr]);
} }
} }
}; };
@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64
EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
LOAD_OFFSET_I32, LOAD_OFFSET_I64); LOAD_OFFSET_I32, LOAD_OFFSET_I64);
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
// ============================================================================ // ============================================================================
// OPCODE_STORE_OFFSET // OPCODE_STORE_OFFSET
// ============================================================================ // ============================================================================
@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence<LOAD_I16, I<OPCODE_LOAD, I16Op, I64Op>> {
}; };
struct LOAD_I32 : Sequence<LOAD_I32, I<OPCODE_LOAD, I32Op, I64Op>> { struct LOAD_I32 : Sequence<LOAD_I32, I<OPCODE_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddress(e, i.src1); if (IsPossibleMMIOInstruction(e, i.instr)) {
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]); if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
e.CallNativeSafe(addrptr);
e.mov(i.dest, e.eax);
} else {
auto addr = ComputeMemoryAddress(e, i.src1);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]);
} else {
e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
}
} else { } else {
e.mov(i.dest, e.dword[addr]); e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
} }
} else { if (IsTracingData()) {
e.mov(i.dest, e.dword[addr]); e.mov(e.GetNativeParam(1).cvt32(), i.dest);
} e.lea(e.GetNativeParam(0), e.ptr[addr]);
if (IsTracingData()) { e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
e.mov(e.GetNativeParam(1).cvt32(), i.dest); }
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
} }
} }
}; };
@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
      } else {
        e.mov(e.dword[addr], i.src2);
      }
if (IsTracingData()) {
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
}
} }
} }
if (IsTracingData()) {
auto addr = ComputeMemoryAddress(e, i.src1);
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
}
  }
};
struct STORE_I64 : Sequence<STORE_I64, I<OPCODE_STORE, VoidOp, I64Op, I64Op>> {


@ -19,15 +19,15 @@
#include "xenia/base/cvar.h" #include "xenia/base/cvar.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h"
DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); DEFINE_bool(xop_rotates, false, "rotate via xop", "x64");
DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64");
DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64");
DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64");
DEFINE_bool(xop_compares, true, "compare via xop", "X64"); DEFINE_bool(xop_compares, true, "compare via xop", "x64");
namespace xe {
namespace cpu {


@ -67,7 +67,7 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::Instr; using xe::cpu::hir::Instr;
typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey);
std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;
// ============================================================================
@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn,
} }
return false; return false;
} }
#define EMITTER_IS_TRUE(typ, tester) \
struct IS_TRUE_##typ \
: Sequence<IS_TRUE_##typ, I<OPCODE_IS_TRUE, I8Op, typ##Op>> { \
static void Emit(X64Emitter& e, const EmitArgType& i) { \
e.tester(i.src1, i.src1); \
unsigned ctxoffset = 0; \
if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \
} else { \
e.setnz(i.dest); \
} \
} \
}
#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test)
EMITTER_IS_TRUE_INT(I8);
EMITTER_IS_TRUE_INT(I16);
EMITTER_IS_TRUE_INT(I32);
EMITTER_IS_TRUE_INT(I64);
EMITTER_IS_TRUE(F32, vtestps);
EMITTER_IS_TRUE(F64, vtestpd);
EMITTER_IS_TRUE(V128, vptest);
EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32,
IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128);
#define EMITTER_IS_FALSE(typ, tester) \
struct IS_FALSE_##typ \
: Sequence<IS_FALSE_##typ, I<OPCODE_IS_FALSE, I8Op, typ##Op>> { \
static void Emit(X64Emitter& e, const EmitArgType& i) { \
e.tester(i.src1, i.src1); \
unsigned ctxoffset = 0; \
if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
e.setz(e.byte[e.GetContextReg() + ctxoffset]); \
} else { \
e.setz(i.dest); \
} \
} \
}
#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test)
EMITTER_IS_FALSE_INT(I8);
EMITTER_IS_FALSE_INT(I16);
EMITTER_IS_FALSE_INT(I32);
EMITTER_IS_FALSE_INT(I64);
EMITTER_IS_FALSE(F32, vtestps);
EMITTER_IS_FALSE(F64, vtestpd);
EMITTER_IS_FALSE(V128, vptest);
EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32,
IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128);
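The IS_TRUE/IS_FALSE emitter tables above are removed because the HIR builder now lowers truth tests into ordinary compares against zero (see the HIRBuilder::IsTrue/IsFalse changes later in this commit), so the generic COMPARE_EQ/COMPARE_NE sequences cover them. Roughly:

// IsTrue(x)  now builds  CompareNE(x, LoadZero(x->type))  -> I8 0/1
// IsFalse(x) now builds  CompareEQ(x, LoadZero(x->type))  -> I8 0/1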
// ============================================================================
// OPCODE_IS_NAN
@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {
  auto it = sequence_table.find(key);
  if (it != sequence_table.end()) {
    if (it->second(*e, i)) { if (it->second(*e, i, InstrKey(i))) {
      *new_tail = i->next;
      return true;
    }


@ -25,7 +25,7 @@ namespace x64 {
class X64Emitter;
typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey);
extern std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;
template <typename T>


@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
} }
} }
break; break;
case OPCODE_IS_TRUE:
if (i->src1.value->IsConstant()) {
if (i->src1.value->IsConstantTrue()) {
v->set_constant(uint8_t(1));
} else {
v->set_constant(uint8_t(0));
}
i->Remove();
result = true;
}
break;
case OPCODE_IS_FALSE:
if (i->src1.value->IsConstant()) {
if (i->src1.value->IsConstantFalse()) {
v->set_constant(uint8_t(1));
} else {
v->set_constant(uint8_t(0));
}
i->Remove();
result = true;
}
break;
      case OPCODE_IS_NAN:
        if (i->src1.value->IsConstant()) {
          if (i->src1.value->type == FLOAT32_TYPE &&
@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          if (should_skip_because_of_float) {
            break;
          }
          v->set_from(i->src1.value);
          v->Max(i->src2.value);
          i->Remove();


@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
  bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0;
  Value* new_value = nullptr;
if (xorop == OPCODE_IS_FALSE) { if (xorop == OPCODE_COMPARE_EQ) {
new_value = builder->IsTrue(xordef->src1.value);
} else if (xorop == OPCODE_IS_TRUE) {
new_value = builder->IsFalse(xordef->src1.value);
} else if (xorop == OPCODE_COMPARE_EQ) {
    new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value);
  } else if (xorop == OPCODE_COMPARE_NE) {
@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
  return false;
}
bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
  return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE;
}
inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation,
@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  if (!var_definition) {
    return false;
  }
// x == 0 -> !x
if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
return true;
}
// x != 0 -> !!x
if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) {
i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable);
return true;
}
  if (cmpop == OPCODE_COMPARE_ULE &&
      constant_unpacked ==
          0) {  // less than or equal to zero = (== 0) = IS_FALSE
    i->Replace(&OPCODE_IS_FALSE_info, 0); i->opcode = &OPCODE_COMPARE_EQ_info;
    i->set_src1(variable);
    return true;
  }
  // todo: OPCODE_COMPARE_NE too?
@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} }
if (cmpop == OPCODE_COMPARE_ULT && if (cmpop == OPCODE_COMPARE_ULT &&
constant_unpacked == 1) { // unsigned lt 1 means == 0 constant_unpacked == 1) { // unsigned lt 1 means == 0
i->Replace(&OPCODE_IS_FALSE_info, 0); // i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_EQ_info;
// i->set_src1(variable);
i->set_src2(builder->LoadZero(variable->type));
    return true;
  }
  if (cmpop == OPCODE_COMPARE_UGT &&
      constant_unpacked == 0) {  // unsigned gt 0 means != 0
i->Replace(&OPCODE_IS_TRUE_info, 0); // i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable); // i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_NE_info;
return true; return true;
} }
@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 &&
             constant_unpacked == 0) {
    // signbit cant be set, and checking if gt 0, so actually checking != 0
i->Replace(&OPCODE_IS_TRUE_info, 0); // i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable);
// i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_NE_info;
return true; return true;
} }
@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
Value* constant_replacement = nullptr; Value* constant_replacement = nullptr;
if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) { if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) {
repl = &OPCODE_IS_TRUE_info; repl = &OPCODE_COMPARE_NE_info;
} else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) { } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) {
repl = &OPCODE_IS_FALSE_info; repl = &OPCODE_COMPARE_EQ_info;
} else if (cmpop == OPCODE_COMPARE_UGT) { } else if (cmpop == OPCODE_COMPARE_UGT) {
// impossible, cannot be greater than mask // impossible, cannot be greater than mask
@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  if (repl) {
    i->Replace(repl, 0);
    i->set_src1(variable);
    i->set_src2(builder->LoadZero(variable->type));
    return true;
  }
  if (constant_replacement) {
@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} }
bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
hir::HIRBuilder* builder) { hir::HIRBuilder* builder) {
bool istrue = i->opcode == &OPCODE_IS_TRUE_info; bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
bool isfalse = i->opcode == &OPCODE_IS_FALSE_info; bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;
Value* input = i->src1.value; auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar();
if (!input_cosntant || input_cosntant->AsUint64() != 0) {
return false;
}
// Value* input = i->src1.value;
  TypeName input_type = input->type;
  if (!IsScalarIntegralType(input_type)) {
    return false;
@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
i->set_src1(isfalsetest); i->set_src1(isfalsetest);
} else { } else {
i->Replace(&OPCODE_IS_FALSE_info, 0); // i->Replace(&OPCODE_IS_FALSE_info, 0);
i->Replace(&OPCODE_COMPARE_EQ_info, 0);
    i->set_src1(lz_input);
    i->set_src2(builder->LoadZero(lz_input->type));
  }
  return true;
}
@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
  while (i) {
    // vector types use the same opcodes as scalar ones for AND/OR/XOR! we
    // don't handle these in our simplifications, so skip
    if (i->dest && IsScalarIntegralType(i->dest->type)) { if (i->AllScalarIntegral()) {
      Opcode iop = i->opcode->num;
      if (iop == OPCODE_OR) {
@ -1080,7 +1080,6 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
        result |= CheckAdd(i, builder);
      } else if (IsScalarBasicCmp(iop)) {
        result |= CheckScalarConstCmp(i, builder);
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
        result |= CheckIsTrueIsFalse(i, builder);
      } else if (iop == OPCODE_SHR) {
        result |= CheckSHR(i, builder);


@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
Value* HIRBuilder::Convert(Value* value, TypeName target_type,
                           RoundMode round_mode) {
  Instr* i =
      AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
  i->set_src1(value);
@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
  ASSERT_FLOAT_OR_VECTOR_TYPE(value);
  Instr* i =
      AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
  i->set_src1(value);
@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
  i->set_src2(value);
  i->src3.value = NULL;
}
Value* HIRBuilder::LoadVectorLeft(Value* address) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::LoadVectorRight(Value* address) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
void HIRBuilder::StoreVectorLeft(Value* address, Value* value) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STVL_info, 0);
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
}
void HIRBuilder::StoreVectorRight(Value* address, Value* value) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STVR_info, 0);
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
}
void HIRBuilder::Memset(Value* address, Value* value, Value* length) {
  ASSERT_ADDRESS_TYPE(address);
  ASSERT_TYPES_EQUAL(address, length);
@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) {
Value* HIRBuilder::Max(Value* value1, Value* value2) {
  ASSERT_TYPES_EQUAL(value1, value2);
  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
      value2->IsConstant()) {
    return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
  }
@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
  i->set_src3(value2);
  return i->dest;
}
static Value* OrLanes32(HIRBuilder& f, Value* value) {
hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE);
hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE);
hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE);
hir::Value* ored = f.Or(v1, v2);
hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE);
ored = f.Or(ored, v3);
ored = f.Or(ored, v4);
return ored;
}
Value* HIRBuilder::IsTrue(Value* value) { Value* HIRBuilder::IsTrue(Value* value) {
assert_true(value);
if (value->type == VEC128_TYPE) {
// chrispy: this probably doesnt happen often enough to be worth its own
// opcode or special code path but this could be optimized to not require as
// many extracts, we can shuffle and or v128 and then extract the low
return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
}
  if (value->IsConstant()) {
    return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0);
  }
Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE)); return CompareNE(value, LoadZero(value->type));
i->set_src1(value);
i->src2.value = i->src3.value = NULL;
return i->dest;
} }
Value* HIRBuilder::IsFalse(Value* value) { Value* HIRBuilder::IsFalse(Value* value) {
assert_true(value);
if (value->type == VEC128_TYPE) {
// chrispy: this probably doesnt happen often enough to be worth its own
// opcode or special code path but this could be optimized to not require as
// many extracts, we can shuffle and or v128 and then extract the low
return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
}
  if (value->IsConstant()) {
    return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0);
  }
Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE)); return CompareEQ(value, LoadZero(value->type));
i->set_src1(value);
i->src2.value = i->src3.value = NULL;
return i->dest;
} }
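The comment in IsTrue/IsFalse above notes that the extract-heavy OrLanes32 could be replaced by shuffling and ORing the vector against itself. An editorial sketch of that idea at the host SSE level (not part of this commit; expressing it in HIR would need a vector shuffle path here, which is why the extract-based OrLanes32 is used for now):

// __m128i v01 = _mm_or_si128(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
// __m128i all = _mm_or_si128(v01, _mm_shuffle_epi32(v01, _MM_SHUFFLE(1, 0, 3, 2)));
// uint32_t any_lane_bits = uint32_t(_mm_cvtsi128_si32(all));  // nonzero iff any lane is nonzero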
Value* HIRBuilder::IsNan(Value* value) { Value* HIRBuilder::IsNan(Value* value) {


@ -166,6 +166,11 @@ class HIRBuilder {
              uint32_t store_flags = 0);
  Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
void StoreVectorLeft(Value* address, Value* value);
void StoreVectorRight(Value* address, Value* value);
  void Store(Value* address, Value* value, uint32_t store_flags = 0);
  void Memset(Value* address, Value* value, Value* length);
  void CacheControl(Value* address, size_t cache_line_size,
@ -268,6 +273,7 @@ class HIRBuilder {
                     Value* new_value);
  Value* AtomicAdd(Value* address, Value* value);
  Value* AtomicSub(Value* address, Value* value);
  void SetNJM(Value* value);
 protected:


@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const {
return 0; // eek. return 0; // eek.
} }
bool Instr::AllScalarIntegral() {
bool result = true;
if (dest) {
if (!IsScalarIntegralType(dest->type)) {
return false;
}
}
VisitValueOperands([&result](Value* v, uint32_t idx) {
result = result && IsScalarIntegralType(v->type);
});
return result;
}
} // namespace hir
} // namespace cpu
} // namespace xe


@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr
  const hir::Instr* GetNonFakePrev() const;
  uint32_t GuestAddressFor() const;
bool AllScalarIntegral(); // dest and all srcs are scalar integral
}; };
} // namespace hir } // namespace hir


@ -210,10 +210,10 @@ enum Opcode {
  OPCODE_STORE,
  // chrispy: todo: implement, our current codegen for the unaligned loads is
  // very bad
OPCODE_LVLX, OPCODE_LVL,
OPCODE_LVRX, OPCODE_LVR,
OPCODE_STVLX, OPCODE_STVL,
OPCODE_STVRX, OPCODE_STVR,
  OPCODE_MEMSET,
  OPCODE_CACHE_CONTROL,
  OPCODE_MEMORY_BARRIER,
@ -222,8 +222,6 @@ enum Opcode {
  OPCODE_MIN,
  OPCODE_VECTOR_MIN,
  OPCODE_SELECT,
OPCODE_IS_TRUE,
OPCODE_IS_FALSE,
  OPCODE_IS_NAN,
  OPCODE_COMPARE_EQ,
  OPCODE_COMPARE_NE,


@ -303,17 +303,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V, OPCODE_SIG_V_V_V_V,
0) 0)
DEFINE_OPCODE(
OPCODE_IS_TRUE,
"is_true",
OPCODE_SIG_V_V,
0)
DEFINE_OPCODE(
OPCODE_IS_FALSE,
"is_false",
OPCODE_SIG_V_V,
0)
DEFINE_OPCODE(
    OPCODE_IS_NAN,
@ -706,4 +695,27 @@ DEFINE_OPCODE(
OPCODE_SIG_X_V, OPCODE_SIG_X_V,
0 0
) )
DEFINE_OPCODE(
OPCODE_LVL,
"loadv_left",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY
)
DEFINE_OPCODE(
OPCODE_LVR,
"loadv_right",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY
)
DEFINE_OPCODE(
OPCODE_STVL,
"storev_left",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_STVR,
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)


@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) {
  // Testing for all 1's and all 0's.
  // if (Rc) CR6 = all_equal | 0 | none_equal | 0
  // TODO(benvanik): efficient instruction?
// chrispy: nothing seems to write cr6_1, figure out if no documented
// instructions write anything other than 0 to it and remove these stores if
// so
  StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8());
  StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8());
  StoreContext(offsetof(PPCContext, cr6.cr6_all_equal),


@ -7,18 +7,17 @@
 ******************************************************************************
 */
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include <algorithm>
#include <cstring>
#include <sstream>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/draw_util.h" #include "xenia/gpu/draw_util.h"
@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() {
bool draw_resolution_scale_not_clamped = bool draw_resolution_scale_not_clamped =
TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x,
draw_resolution_scale_y); draw_resolution_scale_y);
if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported(
draw_resolution_scale_x, draw_resolution_scale_y, provider)) { draw_resolution_scale_x, draw_resolution_scale_y, provider)) {
draw_resolution_scale_not_clamped = false; draw_resolution_scale_not_clamped = false;
@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() {
CommandProcessor::ShutdownContext(); CommandProcessor::ShutdownContext();
} }
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
CommandProcessor::WriteRegister(index, value); CommandProcessor::WriteRegister(index, value);
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date;
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date;
if (frame_open_) { bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date;
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
if (float_constant_index >= 256) { index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
float_constant_index -= 256; cbuffer_binding_fetch_.up_to_date = false;
if (current_float_constant_map_pixel_[float_constant_index >> 6] & // texture cache is never nullptr
(1ull << (float_constant_index & 63))) { // if (texture_cache_ != nullptr) {
cbuffer_binding_float_pixel_.up_to_date = false; texture_cache_->TextureFetchConstantWritten(
} (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
} else { // }
if (current_float_constant_map_vertex_[float_constant_index >> 6] & } else {
(1ull << (float_constant_index & 63))) { if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd |
cbuffer_binding_float_vertex_.up_to_date = false; cbuf_binding_bool_loop_utd)) {
return;
}
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) {
return;
}
if (frame_open_) {
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_pixel_.up_to_date = false;
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_vertex_.up_to_date = false;
}
} }
} }
} } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
} else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { cbuffer_binding_bool_loop_.up_to_date = false;
cbuffer_binding_bool_loop_.up_to_date = false;
} else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
if (texture_cache_ != nullptr) {
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
} }
} }
} }
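The "bit-pack the bools" todo above refers to collapsing the three up_to_date flags into one mask so the early-out is a single test instead of three loads and an OR; a sketch of what that could look like (assumption, not in this commit):

// uint32_t utd_mask = (cbuffer_binding_float_pixel_.up_to_date ? 1u : 0u) |
//                     (cbuffer_binding_float_vertex_.up_to_date ? 2u : 0u) |
//                     (cbuffer_binding_bool_loop_.up_to_date ? 4u : 0u);
// if (!utd_mask) {
//   return;  // all three bindings are already dirty; nothing to invalidate
// }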
@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
draw_util::ViewportInfo viewport_info; draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo( draw_util::GetViewportInfoArgs gviargs{};
regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
gviargs.Setup(
draw_resolution_scale_x, draw_resolution_scale_y,
texture_cache_->draw_resolution_scale_x_divisor(),
texture_cache_->draw_resolution_scale_y_divisor(), true,
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
normalized_depth_control, normalized_depth_control,
host_render_targets_used && host_render_targets_used &&
render_target_cache_->depth_float24_convert_in_pixel_shader(), render_target_cache_->depth_float24_convert_in_pixel_shader(),
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), host_render_targets_used, pixel_shader && pixel_shader->writes_depth());
viewport_info); gviargs.SetupRegisterValues(regs);
if (gviargs == previous_viewport_info_args_) {
viewport_info = previous_viewport_info_;
} else {
draw_util::GetHostViewportInfo(&gviargs, viewport_info);
previous_viewport_info_args_ = gviargs;
previous_viewport_info_ = viewport_info;
}
draw_util::Scissor scissor; draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor); draw_util::GetScissor(regs, scissor);
scissor.offset[0] *= draw_resolution_scale_x; scissor.offset[0] *= draw_resolution_scale_x;
@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() {
shared_memory_->InitializeTraceCompleteDownloads(); shared_memory_->InitializeTraceCompleteDownloads();
} }
} }
static void DmaPrefunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = job->size;
void* readback_mapping;
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
xenia_assert(SUCCEEDED(mapres));
job->source = (uint8_t*)readback_mapping;
}
static void DmaPostfunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_write_range = {};
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
readback_buffer->Unmap(0, &readback_write_range);
}
bool D3D12CommandProcessor::IssueCopy() { bool D3D12CommandProcessor::IssueCopy() {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() {
readback_buffer, 0, shared_memory_buffer, written_address, readback_buffer, 0, shared_memory_buffer, written_address,
written_length); written_length);
if (AwaitAllQueueOperationsCompletion()) { if (AwaitAllQueueOperationsCompletion()) {
#if 1
D3D12_RANGE readback_range; D3D12_RANGE readback_range;
readback_range.Begin = 0; readback_range.Begin = 0;
readback_range.End = written_length; readback_range.End = written_length;
void* readback_mapping; void* readback_mapping;
if (SUCCEEDED( if (SUCCEEDED(
readback_buffer->Map(0, &readback_range, &readback_mapping))) { readback_buffer->Map(0, &readback_range, &readback_mapping))) {
std::memcpy(memory_->TranslatePhysical(written_address), // chrispy: this memcpy needs to be optimized as much as possible
readback_mapping, written_length);
auto physaddr = memory_->TranslatePhysical(written_address);
dma::vastcpy(physaddr, (uint8_t*)readback_mapping,
written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {}; D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range); readback_buffer->Unmap(0, &readback_write_range);
} }
#else
dma::XeDMAJob job{};
job.destination = memory_->TranslatePhysical(written_address);
job.size = written_length;
job.source = nullptr;
job.userdata1 = (void*)readback_buffer;
job.precall = DmaPrefunc;
job.postcall = DmaPostfunc;
readback_available_ = GetDMAC()->PushDMAJob(&job);
#endif
} }
} }
} }
@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings(
if (bool_loop_constants == nullptr) { if (bool_loop_constants == nullptr) {
return false; return false;
} }
std::memcpy(bool_loop_constants, xe::smallcpy_const<kBoolLoopConstantsSize>(
&regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, bool_loop_constants,
kBoolLoopConstantsSize); &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32);
cbuffer_binding_bool_loop_.up_to_date = true; cbuffer_binding_bool_loop_.up_to_date = true;
current_graphics_root_up_to_date_ &= current_graphics_root_up_to_date_ &=
~(1u << root_parameter_bool_loop_constants); ~(1u << root_parameter_bool_loop_constants);
@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings(
if (fetch_constants == nullptr) { if (fetch_constants == nullptr) {
return false; return false;
} }
std::memcpy(fetch_constants, xe::smallcpy_const<kFetchConstantsSize>(
&regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32);
kFetchConstantsSize);
cbuffer_binding_fetch_.up_to_date = true; cbuffer_binding_fetch_.up_to_date = true;
current_graphics_root_up_to_date_ &= current_graphics_root_up_to_date_ &=
~(1u << root_parameter_fetch_constants); ~(1u << root_parameter_fetch_constants);
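xe::smallcpy_const<N> replaces the runtime-sized memcpy for the bool/loop and fetch constant uploads; its definition is not shown in this hunk. A minimal sketch of what such a helper amounts to (assumed shape; the compile-time size lets the compiler expand the copy into straight-line moves):

// template <size_t Size>
// static void smallcpy_const(void* dest, const void* src) {
//   std::memcpy(dest, src, Size);  // Size is a compile-time constant
// }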
@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) { if (size == 0) {
return nullptr; return nullptr;
} }
#if 0
if (readback_available_) {
GetDMAC()->WaitJobDone(readback_available_);
readback_available_ = 0;
}
#endif
size = xe::align(size, kReadbackBufferSizeIncrement); size = xe::align(size, kReadbackBufferSizeIncrement);
if (size > readback_buffer_size_) { if (size > readback_buffer_size_) {
const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();
@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
readback_buffer_->Release(); readback_buffer_->Release();
} }
readback_buffer_ = buffer; readback_buffer_ = buffer;
readback_buffer_size_ = size;
} }
return readback_buffer_; return readback_buffer_;
} }


@ -1,3 +1,4 @@
/**
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project *
@ -19,6 +20,7 @@
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/dma.h"
#include "xenia/gpu/command_processor.h" #include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
ID3D12Resource* readback_buffer_ = nullptr; ID3D12Resource* readback_buffer_ = nullptr;
dma::DMACJobHandle readback_available_ = 0;
uint32_t readback_buffer_size_ = 0; uint32_t readback_buffer_size_ = 0;
std::atomic<bool> pix_capture_requested_ = false; std::atomic<bool> pix_capture_requested_ = false;
@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor {
  DxbcShaderTranslator::SystemConstants system_constants_;
  // Float constant usage masks of the last draw call.
uint64_t current_float_constant_map_vertex_[4]; // chrispy: make sure accesses to these cant cross cacheline boundaries
uint64_t current_float_constant_map_pixel_[4]; struct alignas(XE_HOST_CACHE_LINE_SIZE) {
uint64_t current_float_constant_map_vertex_[4];
uint64_t current_float_constant_map_pixel_[4];
};
  // Constant buffer bindings.
  struct ConstantBufferBinding {
    D3D12_GPU_VIRTUAL_ADDRESS address;
@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
// Current primitive topology. // Current primitive topology.
D3D_PRIMITIVE_TOPOLOGY primitive_topology_; D3D_PRIMITIVE_TOPOLOGY primitive_topology_;
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
draw_util::ViewportInfo previous_viewport_info_;
}; };
} // namespace d3d12 } // namespace d3d12


@ -406,14 +406,15 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
} }
bool D3D12SharedMemory::UploadRanges( bool D3D12SharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) { const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_page_ranges) {
  if (!num_upload_page_ranges) {
    return true;
  }
  CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
  command_processor_.SubmitBarriers();
  auto& command_list = command_processor_.GetDeferredCommandList();
//for (auto upload_range : upload_page_ranges) { // for (auto upload_range : upload_page_ranges) {
  for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
    auto& upload_range = upload_page_ranges[i];
    uint32_t upload_range_start = upload_range.first;
@ -434,10 +435,20 @@ bool D3D12SharedMemory::UploadRanges(
} }
MakeRangeValid(upload_range_start << page_size_log2(), MakeRangeValid(upload_range_start << page_size_log2(),
uint32_t(upload_buffer_size), false, false); uint32_t(upload_buffer_size), false, false);
std::memcpy(
upload_buffer_mapping, if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
memory().TranslatePhysical(upload_range_start << page_size_log2()), dma::vastcpy(
upload_buffer_size); upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
static_cast<uint32_t>(upload_buffer_size));
swcache::WriteFence();
} else {
memcpy(
upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
upload_buffer_size);
}
    command_list.D3DCopyBufferRegion(
        buffer_, upload_range_start << page_size_log2(), upload_buffer,
        UINT64(upload_buffer_offset), UINT64(upload_buffer_size));


@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
    return false;
  }
void GetHostViewportInfo(const RegisterFile& regs, static float ViewportRecip2_0(float f) {
uint32_t draw_resolution_scale_x, float f1 = ArchReciprocalRefined(f);
uint32_t draw_resolution_scale_y, return f1 + f1;
bool origin_bottom_left, uint32_t x_max, }
uint32_t y_max, bool allow_reverse_z,
reg::RB_DEPTHCONTROL normalized_depth_control, // chrispy: todo, the int/float divides and the nan-checked mins show up
bool convert_z_to_float24, bool full_float24_in_0_to_1, // relatively high on uprof when i uc to 1.7ghz
bool pixel_shader_writes_depth, void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
ViewportInfo& viewport_info_out) { ViewportInfo& viewport_info_out) {
assert_not_zero(draw_resolution_scale_x); assert_not_zero(args->draw_resolution_scale_x);
assert_not_zero(draw_resolution_scale_y); assert_not_zero(args->draw_resolution_scale_y);
// A vertex position goes the following path: // A vertex position goes the following path:
// //
@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs,
// TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for
// D24FS8 as well. // D24FS8 as well.
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); auto pa_cl_clip_cntl = args->pa_cl_clip_cntl;
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); auto pa_cl_vte_cntl = args->pa_cl_vte_cntl;
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl;
auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); auto pa_su_vtx_cntl = args->pa_su_vtx_cntl;
  // Obtain the original viewport values in a normalized way.
  float scale_xy[] = {
pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32
: 1.0f, pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f,
pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f,
: 1.0f,
}; };
float scale_z = pa_cl_vte_cntl.vport_z_scale_ena float scale_z =
? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f;
: 1.0f;
float offset_base_xy[] = { float offset_base_xy[] = {
pa_cl_vte_cntl.vport_x_offset_ena pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f,
? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f,
: 0.0f,
pa_cl_vte_cntl.vport_y_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
: 0.0f,
}; };
float offset_z = pa_cl_vte_cntl.vport_z_offset_ena float offset_z =
? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 pa_cl_vte_cntl.vport_z_offset_ena ? args->PA_CL_VPORT_ZOFFSET : 0.0f;
: 0.0f;
// Calculate all the integer.0 or integer.5 offsetting exactly at full // Calculate all the integer.0 or integer.5 offsetting exactly at full
// precision, separately so it can be used in other integer calculations // precision, separately so it can be used in other integer calculations
// without double rounding if needed. // without double rounding if needed.
float offset_add_xy[2] = {}; float offset_add_xy[2] = {};
if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); auto pa_sc_window_offset = args->pa_sc_window_offset;
offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset);
offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset);
} }
@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
  // The maximum value is at least the maximum host render target size anyway -
  // and a guest pixel is always treated as a whole with resolution scaling.
uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, // cbrispy: todo, this integer divides show up high on the profiler somehow
y_max / draw_resolution_scale_y}; // (it was a very long session, too)
uint32_t xy_max_unscaled[] = {
args->draw_resolution_scale_x_divisor.Apply(args->x_max),
args->draw_resolution_scale_y_divisor.Apply(args->y_max)};
  assert_not_zero(xy_max_unscaled[0]);
  assert_not_zero(xy_max_unscaled[1]);
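draw_resolution_scale_x_divisor.Apply() replaces the per-draw integer divide called out in the comment above. The interface is taken from the call sites here; internally such MagicDiv-style helpers use the usual multiply-by-magic-then-shift reduction, roughly (illustrative names, not the actual divisors::MagicDiv internals):

// struct MagicDivSketch {
//   uint64_t multiplier;  // precomputed from the divisor d
//   uint32_t shift;
//   uint32_t Apply(uint32_t n) const {
//     return uint32_t((uint64_t(n) * multiplier) >> (32 + shift));
//   }
// };
// With multiplier = ceil(2^(32 + shift) / d) and a shift chosen so the result
// is exact for 32-bit inputs, dividing by the small draw-resolution scales
// costs one multiply and one shift instead of a hardware divide.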
@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]);
viewport_info_out.xy_extent[i] = viewport_info_out.xy_extent[i] =
extent_axis_unscaled * extent_axis_unscaled *
(i ? draw_resolution_scale_y : draw_resolution_scale_x); (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x);
float extent_axis_unscaled_float = float(extent_axis_unscaled); float extent_axis_unscaled_float = float(extent_axis_unscaled);
float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float;
float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float);
ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis;
ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f +
offset_add_xy[i]) * offset_add_xy[i]) *
@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
// doing truncation for simplicity - since maxing with 0 is done anyway // doing truncation for simplicity - since maxing with 0 is done anyway
// (we only return viewports in the positive quarter-plane). // (we only return viewports in the positive quarter-plane).
uint32_t axis_resolution_scale = uint32_t axis_resolution_scale =
i ? draw_resolution_scale_y : draw_resolution_scale_x; i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x;
float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float offset_axis = offset_base_xy[i] + offset_add_xy[i];
float scale_axis = scale_xy[i]; float scale_axis = scale_xy[i];
float scale_axis_abs = std::abs(scale_xy[i]); float scale_axis_abs = std::abs(scale_xy[i]);
@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs,
// space, a region previously outside -W...W should end up within it, so // space, a region previously outside -W...W should end up within it, so
// the scale should be < 1. // the scale should be < 1.
float axis_extent_rounded = float(axis_extent_int); float axis_extent_rounded = float(axis_extent_int);
ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; float inv_axis_extent_rounded =
ArchReciprocalRefined(axis_extent_rounded);
ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded;
// Move the origin of the snapped coordinates back to the original one. // Move the origin of the snapped coordinates back to the original one.
ndc_offset_axis = (float(offset_axis) - ndc_offset_axis = (float(offset_axis) -
(float(axis_0_int) + axis_extent_rounded * 0.5f)) * (float(axis_0_int) + axis_extent_rounded * 0.5f)) *
2.0f / axis_extent_rounded; 2.0f * inv_axis_extent_rounded;
} else { } else {
// Empty viewport (everything outside the viewport scissor). // Empty viewport (everything outside the viewport scissor).
ndc_scale_axis = 1.0f; ndc_scale_axis = 1.0f;
@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
ndc_scale[2] = 0.5f; ndc_scale[2] = 0.5f;
ndc_offset[2] = 0.5f; ndc_offset[2] = 0.5f;
} }
if (pixel_shader_writes_depth) { if (args->pixel_shader_writes_depth) {
// Allow the pixel shader to write any depth value since // Allow the pixel shader to write any depth value since
// PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel
// shaders don't have access to the original Z in the viewport space // shaders don't have access to the original Z in the viewport space
@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
// Direct3D 12 doesn't allow reverse depth range - on some drivers it // Direct3D 12 doesn't allow reverse depth range - on some drivers it
// works, on some drivers it doesn't, actually, but it was never // works, on some drivers it doesn't, actually, but it was never
// explicitly allowed by the specification. // explicitly allowed by the specification.
if (!allow_reverse_z && z_min > z_max) { if (!args->allow_reverse_z && z_min > z_max) {
std::swap(z_min, z_max); std::swap(z_min, z_max);
ndc_scale[2] = -ndc_scale[2]; ndc_scale[2] = -ndc_scale[2];
ndc_offset[2] = 1.0f - ndc_offset[2]; ndc_offset[2] = 1.0f - ndc_offset[2];
@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs,
} }
} }
if (normalized_depth_control.z_enable && if (args->normalized_depth_control.z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format == args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
xenos::DepthRenderTargetFormat::kD24FS8) { if (args->convert_z_to_float24) {
if (convert_z_to_float24) {
// Need to adjust the bounds that the resulting depth values will be // Need to adjust the bounds that the resulting depth values will be
// clamped to after the pixel shader. Preferring adding some error to // clamped to after the pixel shader. Preferring adding some error to
// interpolated Z instead if conversion can't be done exactly, without // interpolated Z instead if conversion can't be done exactly, without
@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
} }
if (full_float24_in_0_to_1) { if (args->full_float24_in_0_to_1) {
// Remap the full [0...2) float24 range to [0...1) support data round-trip // Remap the full [0...2) float24 range to [0...1) support data round-trip
// during render target ownership transfer of EDRAM tiles through depth // during render target ownership transfer of EDRAM tiles through depth
// input without unrestricted depth range. // input without unrestricted depth range.
@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
viewport_info_out.z_min = z_min; viewport_info_out.z_min = z_min;
viewport_info_out.z_max = z_max; viewport_info_out.z_max = z_max;
if (origin_bottom_left) { if (args->origin_bottom_left) {
ndc_scale[1] = -ndc_scale[1]; ndc_scale[1] = -ndc_scale[1];
ndc_offset[1] = -ndc_offset[1]; ndc_offset[1] = -ndc_offset[1];
} }
@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs,
viewport_info_out.ndc_offset[i] = ndc_offset[i]; viewport_info_out.ndc_offset[i] = ndc_offset[i];
} }
} }
void GetScissor(const RegisterFile& regs, Scissor& scissor_out, void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
bool clamp_to_surface_pitch) { bool clamp_to_surface_pitch) {
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>(); auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::kMaxResolveSize); xenos::kMaxResolveSize);
y1 = y0 + int32_t(xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize);
} }
//fails in forza horizon 1 // fails in forza horizon 1
assert_true(x0 < x1 && y0 < y1); assert_true(x0 < x1 && y0 < y1);
if (x0 >= x1 || y0 >= y1) { if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty"); XELOGE("Resolve region is empty");


@ -277,18 +277,151 @@ struct ViewportInfo {
float ndc_scale[3]; float ndc_scale[3];
float ndc_offset[3]; float ndc_offset[3];
}; };
static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t),
"Change in depthrendertargetformat throws off "
"getviewportinfoargs by a bit");
struct GetViewportInfoArgs {
union alignas(64) {
struct {
// group 1
uint32_t x_max;
uint32_t y_max;
union {
struct {
uint32_t origin_bottom_left : 1;
uint32_t allow_reverse_z : 1;
uint32_t convert_z_to_float24 : 1;
uint32_t full_float24_in_0_to_1 : 1;
uint32_t pixel_shader_writes_depth : 1;
xenos::DepthRenderTargetFormat depth_format : 1;
};
uint32_t packed_portions;
};
reg::RB_DEPTHCONTROL normalized_depth_control;
// group 2
reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl;
reg::PA_CL_VTE_CNTL pa_cl_vte_cntl;
reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl;
reg::PA_SU_VTX_CNTL pa_su_vtx_cntl;
// group 3
reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset;
float PA_CL_VPORT_XSCALE;
float PA_CL_VPORT_YSCALE;
float PA_CL_VPORT_ZSCALE;
float PA_CL_VPORT_XOFFSET;
float PA_CL_VPORT_YOFFSET;
float PA_CL_VPORT_ZOFFSET;
uint32_t padding_set_to_0;
};
#if XE_ARCH_AMD64 == 1
struct {
__m128i first4; // x_max, y_max, packed_portions,
// normalized_depth_control
__m128i second4; // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl,
// pa_su_vtx_cntl
__m128i third4; // pa_sc_window_offset, PA_CL_VPORT_XSCALE,
// PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE
__m128i last4; // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET,
// PA_CL_VPORT_ZOFFSET, padding_set_to_0
};
#endif
};
// everything that follows here does not need to be compared
uint32_t draw_resolution_scale_x;
uint32_t draw_resolution_scale_y;
divisors::MagicDiv draw_resolution_scale_x_divisor;
divisors::MagicDiv draw_resolution_scale_y_divisor;
void Setup(uint32_t _draw_resolution_scale_x,
uint32_t _draw_resolution_scale_y,
divisors::MagicDiv _draw_resolution_scale_x_divisor,
divisors::MagicDiv _draw_resolution_scale_y_divisor,
bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max,
bool _allow_reverse_z,
reg::RB_DEPTHCONTROL _normalized_depth_control,
bool _convert_z_to_float24, bool _full_float24_in_0_to_1,
bool _pixel_shader_writes_depth) {
packed_portions = 0;
padding_set_to_0 = 0; // important to zero this
draw_resolution_scale_x = _draw_resolution_scale_x;
draw_resolution_scale_y = _draw_resolution_scale_y;
draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor;
draw_resolution_scale_y_divisor = _draw_resolution_scale_y_divisor;
origin_bottom_left = _origin_bottom_left;
x_max = _x_max;
y_max = _y_max;
allow_reverse_z = _allow_reverse_z;
normalized_depth_control = _normalized_depth_control;
convert_z_to_float24 = _convert_z_to_float24;
full_float24_in_0_to_1 = _full_float24_in_0_to_1;
pixel_shader_writes_depth = _pixel_shader_writes_depth;
}
void SetupRegisterValues(const RegisterFile& regs) {
pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;
PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;
PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32;
PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32;
PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32;
pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
depth_format = regs.Get<reg::RB_DEPTH_INFO>().depth_format;
}
XE_FORCEINLINE
bool operator==(const GetViewportInfoArgs& prev) {
#if XE_ARCH_AMD64 == 0
bool result = true;
auto accum_eq = [&result](auto x, auto y) { result &= (x == y); };
#define EQC(field) accum_eq(field, prev.field)
EQC(x_max);
EQC(y_max);
EQC(packed_portions);
EQC(normalized_depth_control.value);
EQC(pa_cl_clip_cntl.value);
EQC(pa_cl_vte_cntl.value);
EQC(pa_su_sc_mode_cntl.value);
EQC(pa_su_vtx_cntl.value);
EQC(PA_CL_VPORT_XSCALE);
EQC(PA_CL_VPORT_YSCALE);
EQC(PA_CL_VPORT_ZSCALE);
EQC(PA_CL_VPORT_XOFFSET);
EQC(PA_CL_VPORT_YOFFSET);
EQC(PA_CL_VPORT_ZOFFSET);
EQC(pa_sc_window_offset.value);
#undef EQC
return result;
#else
__m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4);
__m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4);
__m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4);
__m128i unified1 = _mm_and_si128(mask1, mask2);
__m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4);
__m128i unified2 = _mm_and_si128(unified1, mask3);
__m128i unified3 = _mm_and_si128(unified2, mask4);
return _mm_movemask_epi8(unified3) == 0xFFFF;
#endif
}
};
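How the cached comparison is used (matching the IssueDraw change earlier in this diff): build the args, compare against the previous draw's args, and only recompute the viewport when something differs. Because the SSE operator== compares every byte of the 64-byte union, Setup() has to zero packed_portions and padding_set_to_0 before the bitfields are written.

// draw_util::GetViewportInfoArgs gviargs{};
// gviargs.Setup(...);                 // fills the compared 64-byte block
// gviargs.SetupRegisterValues(regs);
// if (gviargs == previous_viewport_info_args_) {
//   viewport_info = previous_viewport_info_;
// } else {
//   draw_util::GetHostViewportInfo(&gviargs, viewport_info);
//   previous_viewport_info_args_ = gviargs;
//   previous_viewport_info_ = viewport_info;
// }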
// Converts the guest viewport (or fakes one if drawing without a viewport) to // Converts the guest viewport (or fakes one if drawing without a viewport) to
// a viewport, plus values to multiply-add the returned position by, usable on // a viewport, plus values to multiply-add the returned position by, usable on
// host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
// Direct3D clip space with 0...W Z rather than -W...W. // Direct3D clip space with 0...W Z rather than -W...W.
void GetHostViewportInfo(const RegisterFile& regs, void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
uint32_t draw_resolution_scale_x,
uint32_t draw_resolution_scale_y,
bool origin_bottom_left, uint32_t x_max,
uint32_t y_max, bool allow_reverse_z,
reg::RB_DEPTHCONTROL normalized_depth_control,
bool convert_z_to_float24, bool full_float24_in_0_to_1,
bool pixel_shader_writes_depth,
ViewportInfo& viewport_info_out); ViewportInfo& viewport_info_out);
struct Scissor { struct Scissor {


@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
} }
    // Make sure the same render target isn't bound into two different slots
    // over time.
for (uint32_t i = 1; are_accumulated_render_targets_valid_ && // chrispy: this needs optimization!
i < 1 + xenos::kMaxColorRenderTargets; if (are_accumulated_render_targets_valid_) {
++i) { for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) {
const RenderTarget* render_target = const RenderTarget* render_target =
last_update_accumulated_render_targets_[i]; last_update_accumulated_render_targets_[i];
if (!render_target) { if (!render_target) {
continue; continue;
} }
for (uint32_t j = 0; j < i; ++j) { for (uint32_t j = 0; j < i; ++j) {
if (last_update_accumulated_render_targets_[j] == render_target) { if (last_update_accumulated_render_targets_[j] == render_target) {
are_accumulated_render_targets_valid_ = false; are_accumulated_render_targets_valid_ = false;
break; goto exit_slot_check_loop;
}
} }
} }
exit_slot_check_loop:;
} }
} }
if (!are_accumulated_render_targets_valid_) { if (!are_accumulated_render_targets_valid_) {
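For reference, a standalone sketch of the aliasing check the block above performs: scan a small slot array for any non-null pointer bound more than once. The function name, array size and element type are placeholders, not xenia's, and it returns early instead of using the goto since it is a free function rather than inline code.

#include <cstddef>

// Returns true if any non-null pointer occurs in more than one slot.
template <typename T, std::size_t N>
bool HasAliasedSlot(T* const (&slots)[N]) {
  for (std::size_t i = 1; i < N; ++i) {
    if (!slots[i]) {
      continue;
    }
    for (std::size_t j = 0; j < i; ++j) {
      if (slots[j] == slots[i]) {
        return true;  // The same target is bound to two different slots.
      }
    }
  }
  return false;
}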


@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file,
    : register_file_(register_file),
      shared_memory_(shared_memory),
      draw_resolution_scale_x_(draw_resolution_scale_x),
-     draw_resolution_scale_y_(draw_resolution_scale_y) {
+     draw_resolution_scale_y_(draw_resolution_scale_y),
+     draw_resolution_scale_x_divisor_(draw_resolution_scale_x),
+     draw_resolution_scale_y_divisor_(draw_resolution_scale_y) {
  assert_true(draw_resolution_scale_x >= 1);
  assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis);
  assert_true(draw_resolution_scale_y >= 1);
@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out,
      uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x));
  uint32_t config_y =
      uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y));
  uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x);
  uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y);
  x_out = clamped_x;
@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() {
}
void TextureCache::Texture::WatchCallback(
-   [[maybe_unused]] const global_unique_lock_type& global_lock,
-   bool is_mip) {
+   [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) {
  if (is_mip) {
    assert_not_zero(GetGuestMipsSize());
    mips_outdated_ = true;
@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback(
}
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
-                                void* context,
-                                void* data, uint64_t argument, bool invalidated_by_gpu) {
+                                void* context, void* data, uint64_t argument,
+                                bool invalidated_by_gpu) {
  Texture& texture = *static_cast<Texture*>(context);
  texture.WatchCallback(global_lock, argument != 0);
  texture.texture_cache().texture_became_outdated_.store(
@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
}
void TextureCache::ScaledResolveGlobalWatchCallback(
-   const global_unique_lock_type& global_lock,
-   uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
+   const global_unique_lock_type& global_lock, uint32_t address_first,
+   uint32_t address_last, bool invalidated_by_gpu) {
  assert_true(IsDrawResolutionScaled());
  if (invalidated_by_gpu) {
    // Resolves themselves do exactly the opposite of what this should do.


@ -19,6 +19,7 @@
#include "xenia/base/assert.h" #include "xenia/base/assert.h"
#include "xenia/base/hash.h" #include "xenia/base/hash.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h" #include "xenia/base/mutex.h"
#include "xenia/gpu/register_file.h" #include "xenia/gpu/register_file.h"
#include "xenia/gpu/shared_memory.h" #include "xenia/gpu/shared_memory.h"
@ -70,6 +71,14 @@ class TextureCache {
  static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out);
  uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; }
  uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; }
+ divisors::MagicDiv draw_resolution_scale_x_divisor() const {
+   return draw_resolution_scale_x_divisor_;
+ }
+ divisors::MagicDiv draw_resolution_scale_y_divisor() const {
+   return draw_resolution_scale_y_divisor_;
+ }
  bool IsDrawResolutionScaled() const {
    return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1;
  }
@ -576,8 +585,8 @@ class TextureCache {
  // Shared memory callback for texture data invalidation.
  static void WatchCallback(const global_unique_lock_type& global_lock,
-                           void* context,
-                           void* data, uint64_t argument, bool invalidated_by_gpu);
+                           void* context, void* data, uint64_t argument,
+                           bool invalidated_by_gpu);
  // Checks if there are any pages that contain scaled resolve data within the
  // range.
@ -588,14 +597,15 @@ class TextureCache {
      const global_unique_lock_type& global_lock, void* context,
      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
  void ScaledResolveGlobalWatchCallback(
-     const global_unique_lock_type& global_lock,
-     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
+     const global_unique_lock_type& global_lock, uint32_t address_first,
+     uint32_t address_last, bool invalidated_by_gpu);
  const RegisterFile& register_file_;
  SharedMemory& shared_memory_;
  uint32_t draw_resolution_scale_x_;
  uint32_t draw_resolution_scale_y_;
+ divisors::MagicDiv draw_resolution_scale_x_divisor_;
+ divisors::MagicDiv draw_resolution_scale_y_divisor_;
  static const LoadShaderInfo load_shader_info_[kLoadShaderCount];
  xe::global_critical_region global_critical_region_;
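The divisors::MagicDiv members evidently precompute a multiply-and-shift replacement for dividing by the resolution scale, so per-texel address math avoids hardware division. The following is a hedged sketch of the general technique only (a Lemire-style fixed-point reciprocal), not xenia's actual MagicDiv API or field layout; unsigned __int128 is a GCC/Clang extension.

#include <cassert>
#include <cstdint>

// Divides 32-bit values by a fixed divisor d >= 2 using one wide multiply and
// a shift. A divisor of 1 would overflow the multiplier and is assumed to be
// special-cased by the caller.
struct MagicDivSketch {
  explicit MagicDivSketch(uint32_t d) {
    assert(d >= 2);
    multiplier_ = UINT64_MAX / d + 1;  // ceil(2^64 / d)
  }
  uint32_t Divide(uint32_t n) const {
    // High 64 bits of the 128-bit product n * multiplier_ give n / d.
    return static_cast<uint32_t>(
        (static_cast<unsigned __int128>(multiplier_) * n) >> 64);
  }
 private:
  uint64_t multiplier_;
};

// Example: MagicDivSketch(3).Divide(10) yields 3.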


@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
  // Get dynamic rasterizer state.
  draw_util::ViewportInfo viewport_info;
  // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must
  // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and
  // maxViewportDimensions must be greater than or equal to the size of the
@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
  // life. Or even disregard the viewport bounds range in the fragment shader
  // interlocks case completely - apply the viewport and the scissor offset
  // directly to pixel address and to things like ps_param_gen.
- draw_util::GetHostViewportInfo(
-     regs, 1, 1, false, device_limits.maxViewportDimensions[0],
-     device_limits.maxViewportDimensions[1], true, normalized_depth_control,
-     false, host_render_targets_used,
-     pixel_shader && pixel_shader->writes_depth(), viewport_info);
+ draw_util::GetViewportInfoArgs gviargs{};
+ gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false,
+               device_limits.maxViewportDimensions[0],
+               device_limits.maxViewportDimensions[1], true,
+               normalized_depth_control, false, host_render_targets_used,
+               pixel_shader && pixel_shader->writes_depth());
+ gviargs.SetupRegisterValues(regs);
+ draw_util::GetHostViewportInfo(&gviargs, viewport_info);
  // Update dynamic graphics pipeline state.
  UpdateDynamicState(viewport_info, primitive_polygonal,
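Splitting the call into Setup / SetupRegisterValues with a separate args struct pairs with the fast equality operator shown earlier, presumably so callers can skip the viewport math when the packed inputs have not changed since the last draw. A generic, hypothetical sketch of that caching pattern (types and the compute callable are placeholders, not the emulator's code):

#include <optional>
#include <utility>

// Caches the result of an expensive computation keyed on cheaply comparable
// packed arguments; recomputes only when the arguments changed.
template <typename Args, typename Result, typename Compute>
class ChangeCachedCompute {
 public:
  explicit ChangeCachedCompute(Compute compute) : compute_(std::move(compute)) {}
  const Result& Get(const Args& args) {
    if (!last_args_ || !(args == *last_args_)) {
      result_ = compute_(args);
      last_args_ = args;
    }
    return result_;
  }
 private:
  Compute compute_;
  std::optional<Args> last_args_;
  Result result_{};
};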


@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) {
         format == ColorRenderTargetFormat::k_32_32_FLOAT;
}

-inline uint32_t GetColorRenderTargetFormatComponentCount(
+// Packed component counts: each format stores (component count - 1) in 2 bits,
+// so stored 0 -> 1, 1 -> 2, 3 -> 4 components; decode with a shift, a mask and
+// an add of 1.
+using ColorFormatComponentTable = uint32_t;
+static constexpr uint32_t GetComponentCountConst(
    ColorRenderTargetFormat format) {
  switch (format) {
    case ColorRenderTargetFormat::k_8_8_8_8:
@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount(
    case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
    case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
-     return 4;
+     return 4 - 1;
    case ColorRenderTargetFormat::k_16_16:
    case ColorRenderTargetFormat::k_16_16_FLOAT:
    case ColorRenderTargetFormat::k_32_32_FLOAT:
-     return 2;
+     return 2 - 1;
    case ColorRenderTargetFormat::k_32_FLOAT:
-     return 1;
+     return 1 - 1;
    default:
-     assert_unhandled_case(format);
      return 0;
  }
}
namespace detail {
static constexpr uint32_t encode_format_component_table() {
uint32_t result = 0;
#define ADDFORMAT(name) \
result |= GetComponentCountConst(ColorRenderTargetFormat::name) \
<< (static_cast<uint32_t>(ColorRenderTargetFormat::name) * 2)
ADDFORMAT(k_8_8_8_8);
ADDFORMAT(k_8_8_8_8_GAMMA);
ADDFORMAT(k_2_10_10_10);
ADDFORMAT(k_2_10_10_10_FLOAT);
ADDFORMAT(k_16_16_16_16);
ADDFORMAT(k_16_16_16_16_FLOAT);
ADDFORMAT(k_2_10_10_10_AS_10_10_10_10);
ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16);
ADDFORMAT(k_16_16);
ADDFORMAT(k_16_16_FLOAT);
ADDFORMAT(k_32_32_FLOAT);
ADDFORMAT(k_32_FLOAT);
return result;
}
constexpr uint32_t color_format_component_table =
encode_format_component_table();
} // namespace detail
constexpr uint32_t GetColorRenderTargetFormatComponentCount(
ColorRenderTargetFormat format) {
return ((detail::color_format_component_table >>
(static_cast<uint32_t>(format) * 2)) &
0b11) +
1;
}
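As a quick illustration of the decode path, a few compile-time checks like the following (placed in the same header, using only the enumerators and the constexpr accessor defined above) would confirm that the packed table agrees with the switch-based constants:

static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_8_8_8_8) == 4,
              "stored 3 decodes to 4 components");
static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_16_16) == 2,
              "stored 1 decodes to 2 components");
static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_32_FLOAT) == 1,
              "stored 0 decodes to 1 component");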
// Returns the version of the format with the same packing and meaning of values
// stored in it, but without blending precision modifiers.
constexpr ColorRenderTargetFormat GetStorageColorFormat(