Merge pull request #74 from chrisps/canary_experimental
Misc optimizations
commit b4224ff3dc
@@ -379,6 +379,9 @@ std::vector<std::unique_ptr<hid::InputDriver>> EmulatorApp::CreateInputDrivers(
}

bool EmulatorApp::OnInitialize() {
#if XE_ARCH_AMD64 == 1
  amd64::InitFeatureFlags();
#endif
  Profiler::Initialize();
  Profiler::ThreadEnter("Main");
@@ -51,7 +51,7 @@ uint64_t last_guest_tick_count_ = 0;
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();

using tick_mutex_type = xe_unlikely_mutex;
using tick_mutex_type = std::mutex;

// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
// std::mutex tick_mutex_;
@@ -1,7 +1,15 @@
#include "dma.h"
#include "logging.h"
#include "mutex.h"
#include "platform_win.h"
#include "xbyak/xbyak/xbyak_util.h"

XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
                NtAlertThreadByThreadId);

template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
  char buffer[1024];
@@ -213,320 +221,140 @@ void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
          written_length);
}

#define XEDMA_NUM_WORKERS 4
class alignas(256) XeDMACGeneric : public XeDMAC {
#define MAX_INFLIGHT_DMAJOBS 65536
#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
class XeDMACGeneric : public XeDMAC {
  std::unique_ptr<xe::threading::Thread> thrd_;
  XeDMAJob* jobs_ring_;
  volatile std::atomic<uintptr_t> write_ptr_;

  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
    std::atomic<uint64_t> free_job_slots_;
    std::atomic<uint64_t> jobs_submitted_;
    std::atomic<uint64_t> jobs_completed_;
    std::atomic<uint32_t> num_workers_awoken_;
    std::atomic<uint32_t> current_job_serial_;
  } dma_volatile_;

  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];

  volatile uint32_t jobserials_[64];

  alignas(XE_HOST_CACHE_LINE_SIZE)
      std::unique_ptr<threading::Event> job_done_signals_[64];
  // really dont like using unique pointer for this...
  std::unique_ptr<threading::Event> job_submitted_signal_;
  std::unique_ptr<threading::Event> job_completed_signal_;

  std::unique_ptr<threading::Thread> scheduler_thread_;
  struct WorkSlice {
    uint8_t* destination;
    uint8_t* source;
    size_t numbytes;
    volatile std::atomic<uintptr_t> read_ptr_;
    xe_mutex push_into_ring_lock_;
  };
  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];

  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];

  // chrispy: this is bad
  static uint32_t find_free_hole_in_dword(uint64_t dw) {
    XEDMALOG("Finding free hole in 0x%llX", dw);

    for (uint32_t i = 0; i < 64; ++i) {
      if (dw & (1ULL << i)) {
        continue;
      }

      return i;
    }
    return ~0U;
  }

  uint32_t allocate_free_dma_slot() {
    XEDMALOG("Allocating free slot");
    uint32_t got_slot = 0;
    uint64_t slots;
    uint64_t allocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      got_slot = find_free_hole_in_dword(slots);
      if (!~got_slot) {
        XEDMALOG("Didn't get a slot!");
        return ~0U;
      }
      allocated_slot = slots | (1ULL << got_slot);

    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, allocated_slot)));
    XEDMALOG("Allocated slot %d", got_slot);
    return got_slot;
  }
  // chrispy: on x86 this can just be interlockedbittestandreset...
  void free_dma_slot(uint32_t slot) {
    XEDMALOG("Freeing slot %d", slot);
    uint64_t slots;

    uint64_t deallocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      deallocated_slot = slots & (~(1ULL << slot));

    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, deallocated_slot)));
  }
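  // [editor's sketch, not part of the diff] The comment above alludes to a
  // single-instruction alternative: on MSVC/x64, "lock btr" is exposed as the
  // _interlockedbittestandreset64 intrinsic, which would replace the whole
  // CAS retry loop. Assumes the atomic's storage may be addressed directly.
  void free_dma_slot_btr(uint32_t slot) {
    _interlockedbittestandreset64(
        reinterpret_cast<volatile __int64*>(&dma_volatile_.free_job_slots_),
        static_cast<__int64>(slot));
  }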
  void DoDMAJob(uint32_t idx) {
    XeDMAJob& job = jobs_[idx];
    if (job.precall) {
      job.precall(&job);
    }
    // memcpy(job.destination, job.source, job.size);

    size_t job_size = job.size;

    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;

    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;

    size_t rem = job_size - line_rounded;

    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;

    XEDMALOG(
        "Distributing %d bytes from %p to %p across %d workers, remainder is "
        "%d",
        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
    if (num_per_worker < 2048) {
      XEDMALOG("not distributing across workers, num_per_worker < 8192");
      // not worth splitting up
      memcpy(job.destination, job.source, job.size);
      job.signal_on_done->Set();
    } else {
      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
        worker_workslice_[i].destination =
            (i * num_per_worker) + job.destination;
        worker_workslice_[i].source = (i * num_per_worker) + job.source;

        worker_workslice_[i].numbytes = num_per_worker;
      }
      if (rem) {
        __movsb(job.destination + line_rounded, job.source + line_rounded, rem);
      }
      // wake them up
      worker_has_work_->Set();
      XEDMALOG("Starting waitall for job");
      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
                         false);

      XEDMALOG("Waitall for job completed!");
      job.signal_on_done->Set();
    }
    if (job.postcall) {
      job.postcall(&job);
    }
    ++dma_volatile_.jobs_completed_;
  }

  void WorkerIter(uint32_t worker_index) {
    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
    auto [dest, src, size] = worker_workslice_[worker_index];

    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
    worker_has_work_->Reset();
    //}
    xenia_assert(size < (1ULL << 32));
    // memcpy(dest, src, size);
    dma::vastcpy(dest, src, static_cast<uint32_t>(size));
  }
  XE_NOINLINE
  void WorkerMainLoop(uint32_t worker_index) {
    do {
      XEDMALOG("Worker iter for worker %d", worker_index);
      WorkerIter(worker_index);

      XEDMALOG("Worker %d is done\n", worker_index);
      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
                               worker_has_work_.get(), false);
    } while (true);
  }
  void WorkerMain(uint32_t worker_index) {
    XEDMALOG("Entered worker main loop, index %d", worker_index);
    threading::Wait(worker_has_work_.get(), false);
    XEDMALOG("First wait for worker %d completed, first job ever",
             worker_index);
    WorkerMainLoop(worker_index);
  }

  static void WorkerMainForwarder(void* ptr) {
    // we aligned XeDma to 256 bytes and encode extra info in the low 8
    uintptr_t uptr = (uintptr_t)ptr;

    uint32_t worker_index = (uint8_t)uptr;

    uptr &= ~0xFFULL;

    char name_buffer[64];
    sprintf_s(name_buffer, "dma_worker_%d", worker_index);

    xe::threading::set_name(name_buffer);

    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
  }

  void DMAMain() {
    XEDMALOG("DmaMain");
    do {
      threading::Wait(job_submitted_signal_.get(), false);

      auto slots = dma_volatile_.free_job_slots_.load();

      for (uint32_t i = 0; i < 64; ++i) {
        if (slots & (1ULL << i)) {
          XEDMALOG("Got new job at index %d in DMAMain", i);
          DoDMAJob(i);

          free_dma_slot(i);

          job_completed_signal_->Set();
          // break;
        }
      }

    } while (true);
  }

  static void DMAMainForwarder(void* ud) {
    xe::threading::set_name("dma_main");
    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
  }
  HANDLE gotjob_event;
  void WorkerWait();

 public:
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
             job->size);
    uint32_t slot;
    do {
      slot = allocate_free_dma_slot();
      if (!~slot) {
        XEDMALOG(
            "Didn't get a free slot, waiting for a job to complete before "
            "resuming.");
        threading::Wait(job_completed_signal_.get(), false);
  virtual ~XeDMACGeneric() {}
  void WorkerThreadMain();
  XeDMACGeneric() {
    threading::Thread::CreationParameters crparams;
    crparams.create_suspended = true;
    crparams.initial_priority = threading::ThreadPriority::kNormal;
    crparams.stack_size = 65536;
    gotjob_event = CreateEventA(nullptr, false, false, nullptr);
    thrd_ = std::move(threading::Thread::Create(
        crparams, [this]() { this->WorkerThreadMain(); }));

      } else {
        break;
      }
    jobs_ring_ = (XeDMAJob*)_aligned_malloc(
        MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);

    } while (true);
    jobs_[slot] = *job;
    write_ptr_ = 0;
    read_ptr_ = 0;

    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
    jobs_[slot].signal_on_done->Reset();
    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);

    uint32_t new_serial = dma_volatile_.current_job_serial_++;

    jobserials_[slot] = new_serial;

    ++dma_volatile_.jobs_submitted_;
    job_submitted_signal_->Set();
    return (static_cast<uint64_t>(new_serial) << 32) |
           static_cast<uint64_t>(slot);

    // return job_done_signals_[slot].get();
    thrd_->Resume();
  }

  bool AllJobsDone() {
    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
    // std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
    HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
    {
      job->dmac_specific_ = (uintptr_t)dmacevent;

      jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
      write_ptr_++;
      SetEvent(gotjob_event);
    }
    return (DMACJobHandle)dmacevent;
  }
  virtual void WaitJobDone(DMACJobHandle handle) override {
    uint32_t serial = static_cast<uint32_t>(handle >> 32);
    uint32_t jobid = static_cast<uint32_t>(handle);
    do {
      if (jobserials_[jobid] != serial) {
        return;  // done, our slot was reused
    while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
      // NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
      // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
      // WAIT_TIMEOUT) {
      // ;
    }
    //}

      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
                                     std::chrono::milliseconds{1});

      if (waitres == threading::WaitResult::kTimeout) {
        continue;
      } else {
        return;
      }
    } while (true);
    // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
    CloseHandle((HANDLE)handle);
  }
  virtual void WaitForIdle() override {
    while (!AllJobsDone()) {
    while (write_ptr_ != read_ptr_) {
      threading::MaybeYield();
    }
    }
  XeDMACGeneric() {
    XEDMALOG("Constructing xedma at addr %p", this);
    dma_volatile_.free_job_slots_.store(0ULL);
    dma_volatile_.jobs_submitted_.store(0ULL);
    dma_volatile_.jobs_completed_.store(0ULL);
    dma_volatile_.current_job_serial_.store(
        1ULL);  // so that a jobhandle is never 0
    std::memset(jobs_, 0, sizeof(jobs_));
    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
    job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
    threading::Thread::CreationParameters worker_params{};
    worker_params.create_suspended = false;
    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    worker_params.stack_size = 65536;  // dont need much stack at all

    for (uint32_t i = 0; i < 64; ++i) {
      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
    }
    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();

      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
      xenia_assert(!(encoded & 0xFFULL));
      xenia_assert(i < 256);

      encoded |= i;

      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
      });
    }
    threading::Thread::CreationParameters scheduler_params{};
    scheduler_params.create_suspended = false;
    scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    scheduler_params.stack_size = 65536;
    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
      XeDMACGeneric::DMAMainForwarder((void*)this);
    });
  }
};
void XeDMACGeneric::WorkerWait() {
  constexpr unsigned NUM_PAUSE_SPINS = 2048;
  constexpr unsigned NUM_YIELD_SPINS = 8;
#if 0

  for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
    if (write_ptr_ == read_ptr_) {
      _mm_pause();
    } else {
      break;
    }
  }
  for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
    if (write_ptr_ == read_ptr_) {
      threading::MaybeYield();
    } else {
      break;
    }
  }
  LARGE_INTEGER yield_execution_delay{};
  yield_execution_delay.QuadPart =
      -2000;  //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
  while (write_ptr_ == read_ptr_) {
    NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
  }
#else
  do {
    if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
      while (write_ptr_ == read_ptr_) {
        _mm_pause();
      }
    }

  } while (write_ptr_ == read_ptr_);
#endif
}
void XeDMACGeneric::WorkerThreadMain() {
  while (true) {
    this->WorkerWait();

    XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
    swcache::ReadFence();

    if (current_job.precall) {
      current_job.precall(&current_job);
    }

    size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
    size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;

    size_t line_rem = current_job.size - line_rounded;

    vastcpy(current_job.destination, current_job.source,
            static_cast<uint32_t>(line_rounded));

    if (line_rem) {
      __movsb(current_job.destination + line_rounded,
              current_job.source + line_rounded, line_rem);
    }

    if (current_job.postcall) {
      current_job.postcall(&current_job);
    }
    read_ptr_++;
    swcache::WriteFence();

    SetEvent((HANDLE)current_job.dmac_specific_);
  }
}

XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
}  // namespace xe::dma
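Worth noting about the hunk above: it interleaves two generations of PushDMAJob. The old slot-based version packs a monotonically increasing serial plus the slot index into the returned handle, which is how WaitJobDone detects that a slot was recycled; the new ring version hands back an event HANDLE directly. A hedged sketch of the old encoding (helper names are hypothetical, and it assumes DMACJobHandle is a 64-bit integer, as the casts suggest):

static DMACJobHandle MakeSlotHandle(uint32_t serial, uint32_t slot) {
  return (static_cast<uint64_t>(serial) << 32) | static_cast<uint64_t>(slot);
}
static uint32_t HandleSerial(DMACJobHandle h) {
  return static_cast<uint32_t>(h >> 32);
}
static uint32_t HandleSlot(DMACJobHandle h) {
  return static_cast<uint32_t>(h);  // low 32 bits; always < 64 here
}
// If jobserials_[HandleSlot(h)] != HandleSerial(h), the slot was reused,
// meaning the original job already completed.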
@@ -16,7 +16,8 @@ struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
  threading::Event* signal_on_done;
  // threading::Event* signal_on_done;
  uintptr_t dmac_specific_;
  uint8_t* destination;
  uint8_t* source;
  size_t size;
@@ -472,7 +472,7 @@ bool logging::internal::ShouldLog(LogLevel log_level) {
std::pair<char*, size_t> logging::internal::GetThreadBuffer() {
  return {thread_log_buffer_, sizeof(thread_log_buffer_)};
}

XE_NOALIAS
void logging::internal::AppendLogLine(LogLevel log_level,
                                      const char prefix_char, size_t written) {
  if (!logger_ || !ShouldLog(log_level) || !written) {
@@ -74,11 +74,15 @@ namespace internal {

bool ShouldLog(LogLevel log_level);
std::pair<char*, size_t> GetThreadBuffer();

XE_NOALIAS
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);

}  // namespace internal
// technically, noalias is incorrect here, these functions do in fact alias
// global memory, but msvc will not optimize the calls away, and the global
// memory modified by the calls is limited to internal logging variables, so
// it might as well be noalias
template <typename... Args>
XE_NOALIAS
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
                                                         const char prefix_char,
                                                         const char* format,
@@ -400,10 +400,91 @@ static float ArchReciprocal(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#if 0
using ArchFloatMask = float;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
}

constexpr ArchFloatMask floatmask_zero = .0f;
#else
using ArchFloatMask = __m128;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_or_ps(x, y);
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_xor_ps(x, y);
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_and_ps(x, y);
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(x) & 1);
}

constexpr ArchFloatMask floatmask_zero{.0f};
#endif
#else
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
static float ArchReciprocal(float den) { return 1.0f / den; }
using ArchFloatMask = unsigned;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return static_cast<unsigned>(-static_cast<signed>(x != y));
}

XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x | y;
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x ^ y;
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x & y;
}
constexpr ArchFloatMask floatmask_zero = 0;

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { return x >> 31; }

#endif
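// [editor's sketch, not from the diff] How the mask helpers compose: test
// whether either of two float pairs differs, branching on the collected
// mask bit instead of materializing a bool per comparison. Works under
// both the __m128 and the scalar-unsigned definitions above.
static bool AnyPairDiffers(float a0, float b0, float a1, float b1) {
  ArchFloatMask mask =
      ArchORFloatMask(ArchCmpneqFloatMask(a0, b0), ArchCmpneqFloatMask(a1, b1));
  return ArchFloatMaskSignbit(mask) != 0;
}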
XE_FORCEINLINE
static float RefineReciprocal(float initial, float den) {
@@ -115,14 +115,17 @@
#define XE_COLD __declspec(code_seg(".cold"))
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))

#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
#define XE_NOALIAS __declspec(noalias)
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_FORCEINLINE __attribute__((always_inline))
#define XE_NOINLINE __attribute__((noinline))
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)

#define XE_NOALIAS
// cant do unevaluated assume
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE
@@ -130,6 +133,9 @@

#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]]
#define XE_NOALIAS
#define XE_MSVC_ASSUME(...) static_cast<void>(0)

#endif

#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
@@ -174,5 +180,7 @@ const char kPathSeparator = '/';
const char kGuestPathSeparator = '\\';

}  // namespace xe

#if XE_ARCH_AMD64 == 1
#include "platform_amd64.h"
#endif
#endif  // XENIA_BASE_PLATFORM_H_
@@ -0,0 +1,115 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"

#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
             " 0 = x86_64 + AVX1\n"
             " 1 = AVX2\n"
             " 2 = FMA\n"
             " 4 = LZCNT\n"
             " 8 = BMI1\n"
             " 16 = BMI2\n"
             " 32 = F16C\n"
             " 64 = Movbe\n"
             " 128 = GFNI\n"
             " 256 = AVX512F\n"
             " 512 = AVX512VL\n"
             " 1024 = AVX512BW\n"
             " 2048 = AVX512DQ\n"
             " -1 = Detect and utilize all possible processor features\n",
             "x64");
namespace xe {
namespace amd64 {
static uint32_t g_feature_flags = 0U;
static bool g_did_initialize_feature_flags = false;
uint32_t GetFeatureFlags() {
  xenia_assert(g_did_initialize_feature_flags);
  return g_feature_flags;
}
XE_COLD
XE_NOINLINE
void InitFeatureFlags() {
  uint32_t feature_flags_ = 0U;

  Xbyak::util::Cpu cpu_;
#define TEST_EMIT_FEATURE(emit, ext)                \
  if ((cvars::x64_extension_mask & emit) == emit) { \
    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
  }

  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
  /*
    fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
    latest version of xbyak
  */
  unsigned int data[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, data);
  unsigned amd_flags = data[2];
  if (amd_flags & (1U << 5)) {
    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
      feature_flags_ |= kX64EmitLZCNT;
    }
  }
  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
  if (amd_flags & (1U << 16)) {
    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
      feature_flags_ |= kX64EmitFMA4;
    }
  }
  if (amd_flags & (1U << 21)) {
    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
      feature_flags_ |= kX64EmitTBM;
    }
  }
  if (amd_flags & (1U << 11)) {
    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
      feature_flags_ |= kX64EmitXOP;
    }
  }
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;
    /*
      chrispy: according to agner's tables, all amd architectures that
      we support (ones with avx) have the same timings for
      jrcxz/loop/loope/loopne as for other jmps
    */
    feature_flags_ |= kX64FastJrcx;
    feature_flags_ |= kX64FastLoop;
    if (is_zennish) {
      // ik that i heard somewhere that this is the case for zen, but i need to
      // verify. cant find my original source for that.
      // todo: ask agner?
      feature_flags_ |= kX64FlagsIndependentVars;
    }
  }
  g_feature_flags = feature_flags_;
  g_did_initialize_feature_flags = true;
}
}  // namespace amd64
}  // namespace xe
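For context, this is how the rest of the commit consumes these flags: EmulatorApp::OnInitialize calls InitFeatureFlags once at startup, and X64Emitter now does feature_flags_ = amd64::GetFeatureFlags() instead of probing cpuid itself. A minimal usage sketch:

xe::amd64::InitFeatureFlags();  // once, early (see the app.cc hunk above)

uint32_t flags = xe::amd64::GetFeatureFlags();
if (flags & xe::amd64::kX64EmitAVX2) {
  // safe to emit AVX2 code paths
}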
@@ -0,0 +1,61 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2019 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_BASE_PLATFORM_AMD64_H_
#define XENIA_BASE_PLATFORM_AMD64_H_
#include <cstdint>

namespace xe {
namespace amd64 {
enum X64FeatureFlags {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
  kX64EmitBMI1 = 1 << 3,
  kX64EmitBMI2 = 1 << 4,
  kX64EmitPrefetchW = 1 << 5,
  kX64EmitMovbe = 1 << 6,
  kX64EmitGFNI = 1 << 7,

  kX64EmitAVX512F = 1 << 8,
  kX64EmitAVX512VL = 1 << 9,

  kX64EmitAVX512BW = 1 << 10,
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
};

XE_NOALIAS
uint32_t GetFeatureFlags();
XE_COLD
void InitFeatureFlags();

}  // namespace amd64
}  // namespace xe

#endif  // XENIA_BASE_PLATFORM_AMD64_H_
@@ -0,0 +1,40 @@
#pragma once
namespace xe {
/*
  a very simple freelist, intended to be used with HIRFunction/Arena to
  eliminate our last-level cache miss problems with HIR simplifications
  not thread safe, doesnt need to be
*/
template <typename T>
struct SimpleFreelist {
  union Node {
    union Node* next_;
    T entry_;
  };
  Node* head_;

  static_assert(sizeof(T) >= sizeof(void*));
  SimpleFreelist() : head_(nullptr) {}
  T* NewEntry() {
    Node* result_node = head_;
    if (!result_node) {
      return nullptr;
    } else {
      head_ = result_node->next_;

      memset(result_node, 0, sizeof(T));
      return &result_node->entry_;
      // return new (&result_node->entry_) T(args...);
    }
  }

  void DeleteEntry(T* value) {
    memset(value, 0, sizeof(T));
    Node* node = reinterpret_cast<Node*>(value);
    node->next_ = head_;
    head_ = node;
  }
  void Reset() { head_ = nullptr; }
};
}  // namespace xe
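A minimal usage sketch (the payload type and the arena fallback are hypothetical; NewEntry returns nullptr until something has been recycled, since the freelist never allocates on its own):

struct ExampleNode {  // hypothetical payload; must be >= pointer-sized
  uint64_t a;
  uint64_t b;
};

xe::SimpleFreelist<ExampleNode> pool;

ExampleNode* node = pool.NewEntry();
if (!node) {
  node = AllocateFromArena();  // hypothetical arena fallback
}
// ... use node ...
pool.DeleteEntry(node);  // zeroes it and pushes it onto the list head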
@@ -50,6 +50,9 @@ XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
// counts
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
                NtReleaseSemaphorePointer);

XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                NtDelayExecutionPointer);
namespace xe {
namespace threading {
@@ -109,13 +112,30 @@ void set_name(const std::string_view name) {
  set_name(GetCurrentThread(), name);
}

// checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL};

void MaybeYield() {
#if 0
#if defined(XE_USE_NTDLL_FUNCTIONS)

  NtYieldExecutionPointer.invoke();
#else
  SwitchToThread();
#endif

#else
  // chrispy: SwitchToThread will only switch to a ready thread on the current
  // processor, so if one is not ready we end up spinning, constantly calling
  // switchtothread without doing any work, heating up the users cpu. sleep(0)
  // however will yield to threads on other processors and surrenders the
  // current timeslice
#if defined(XE_USE_NTDLL_FUNCTIONS)
  NtDelayExecutionPointer.invoke(0, &sleepdelay0_for_maybeyield);
#else
  ::Sleep(0);
#endif
#endif
  // memorybarrier is really not necessary here...
  MemoryBarrier();
}
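Both this function and XeDMACGeneric::WorkerWait hand NtDelayExecution a relative delay; the unit convention, per the "-10000 == 1 ms" comment in the dma hunk, is a negative count of 100 ns ticks. A hedged helper sketch (not part of the diff):

static LARGE_INTEGER MakeRelativeNtDelay(int64_t milliseconds) {
  LARGE_INTEGER delay;
  delay.QuadPart = -10000LL * milliseconds;  // -10000 == 1 ms
  return delay;
}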
@@ -26,24 +26,6 @@
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"
DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
             " 0 = x86_64 + AVX1\n"
             " 1 = AVX2\n"
             " 2 = FMA\n"
             " 4 = LZCNT\n"
             " 8 = BMI1\n"
             " 16 = BMI2\n"
             " 32 = F16C\n"
             " 64 = Movbe\n"
             " 128 = GFNI\n"
             " 256 = AVX512F\n"
             " 512 = AVX512VL\n"
             " 1024 = AVX512BW\n"
             " 2048 = AVX512DQ\n"
             " -1 = Detect and utilize all possible processor features\n",
             "x64");

DEFINE_bool(record_mmio_access_exceptions, true,
            "For guest addresses records whether we caught any mmio accesses "
@@ -103,7 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
                "FAQ for system requirements at https://xenia.jp");
    return;
  }

#if 1
  feature_flags_ = amd64::GetFeatureFlags();
#else
#define TEST_EMIT_FEATURE(emit, ext)                \
  if ((cvars::x64_extension_mask & emit) == emit) { \
    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
@@ -168,6 +170,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
      feature_flags_ |= kX64FlagsIndependentVars;
    }
  }
#endif
  may_use_membase32_as_zero_reg_ =
      static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          processor()->memory()->virtual_membase())) == 0;
@@ -913,6 +916,8 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,

static const vec128_t xmm_consts[] = {
    /* XMMZero */ vec128f(0.0f),
    /* XMMByteSwapMask */
    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
    /* XMMOne */ vec128f(1.0f),
    /* XMMOnePD */ vec128d(1.0),
    /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
@@ -937,8 +942,7 @@ static const vec128_t xmm_consts[] = {
    vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
    /* XMMAbsMaskPD */
    vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
    /* XMMByteSwapMask */
    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),

    /* XMMByteOrderMask */
    vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
    /* XMMPermuteControl15 */ vec128b(15),
@@ -34,7 +34,7 @@ namespace xe {
namespace cpu {
namespace backend {
namespace x64 {

using namespace amd64;
class X64Backend;
class X64CodeCache;
@@ -81,6 +81,7 @@ static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
}
enum XmmConst {
  XMMZero = 0,
  XMMByteSwapMask,
  XMMOne,
  XMMOnePD,
  XMMNegativeOne,
@@ -97,7 +98,7 @@ enum XmmConst {
  XMMSignMaskPD,
  XMMAbsMaskPS,
  XMMAbsMaskPD,
  XMMByteSwapMask,

  XMMByteOrderMask,
  XMMPermuteControl15,
  XMMPermuteByteMask,
@@ -189,42 +190,6 @@ class XbyakAllocator : public Xbyak::Allocator {
  virtual bool useProtect() const { return false; }
};

enum X64EmitterFeatureFlags {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
  kX64EmitBMI1 = 1 << 3,
  kX64EmitBMI2 = 1 << 4,
  kX64EmitPrefetchW = 1 << 5,
  kX64EmitMovbe = 1 << 6,
  kX64EmitGFNI = 1 << 7,

  kX64EmitAVX512F = 1 << 8,
  kX64EmitAVX512VL = 1 << 9,

  kX64EmitAVX512BW = 1 << 10,
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
};
class ResolvableGuestCall {
 public:
  bool is_jump_;
@@ -1354,15 +1354,17 @@ struct VECTOR_SHA_V128
  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
    // TODO(benvanik): native version (with shift magic).
    if (i.src2.is_constant) {
      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
        const auto& shamt = i.src2.constant();
        bool all_same = true;
        for (size_t n = 0; n < 16 - n; ++n) {
          if (shamt.u8[n] != shamt.u8[n + 1]) {
            all_same = false;
            break;
          }
      const auto& shamt = i.src2.constant();
      bool all_same = true;
      for (size_t n = 0; n < 16 - n; ++n) {
        if (shamt.u8[n] != shamt.u8[n + 1]) {
          all_same = false;
          break;
        }
      }

      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
        if (all_same) {
          // Every count is the same, so we can use gf2p8affineqb.
          const uint8_t shift_amount = shamt.u8[0] & 0b111;
@@ -1375,6 +1377,19 @@ struct VECTOR_SHA_V128
          return;
        }
      }
      else if (all_same) {
        Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);

        e.vpmovsxbw(e.xmm0, to_be_shifted);  //_mm_srai_epi16 / psraw
        e.vpunpckhqdq(e.xmm2, to_be_shifted, to_be_shifted);
        e.vpmovsxbw(e.xmm1, e.xmm2);
        e.vpsraw(e.xmm0, shamt.u8[0]);
        e.vpsraw(e.xmm1, shamt.u8[0]);
        e.vpacksswb(i.dest, e.xmm0, e.xmm1);
        return;
      }
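      // [editor's sketch, not from the diff] The same trick in plain
      // intrinsics: x86 has no per-byte arithmetic shift, so sign-extend each
      // half to 16-bit lanes (pmovsxbw, SSE4.1), shift words, and re-pack with
      // signed saturation; the shifted values always fit back into int8, so
      // packsswb is lossless here.
      // static __m128i sra_epi8(__m128i v, int amount) {
      //   __m128i count = _mm_cvtsi32_si128(amount & 7);
      //   __m128i lo = _mm_cvtepi8_epi16(v);                        // low 8
      //   __m128i hi = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(v, v)); // high 8
      //   lo = _mm_sra_epi16(lo, count);
      //   hi = _mm_sra_epi16(hi, count);
      //   return _mm_packs_epi16(lo, hi);
      // }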

      e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
    } else {
      e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -3234,7 +3234,17 @@ struct SET_ROUNDING_MODE_I32
  }
};
EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);

// ============================================================================
// OPCODE_DELAY_EXECUTION
// ============================================================================
struct DELAY_EXECUTION
    : Sequence<DELAY_EXECUTION, I<OPCODE_DELAY_EXECUTION, VoidOp>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // todo: what if they dont have smt?
    e.pause();
  }
};
EMITTER_OPCODE_TABLE(OPCODE_DELAY_EXECUTION, DELAY_EXECUTION);
// Include anchors to other sequence sources so they get included in the build.
extern volatile int anchor_control;
static int anchor_control_dest = anchor_control;
@@ -98,7 +98,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstantTrue()) {
          i->Replace(&OPCODE_DEBUG_BREAK_info, i->flags);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }
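This and the hunks that follow are one mechanical change: every constant fold that used to call i->Remove() now calls i->UnlinkAndNOP(). A hedged distillation of the repeated shape (the helper is hypothetical; it assumes, per the rename, that UnlinkAndNOP detaches the instruction without destroying it):

template <typename FoldFn>
static bool FoldUnaryConstant(hir::Value* v, hir::Instr* i, FoldFn fold) {
  if (!i->src1.value->IsConstant()) {
    return false;
  }
  v->set_from(i->src1.value);
  fold(v);            // e.g. v->Neg(), v->Not(), v->ByteSwap(), ...
  i->UnlinkAndNOP();  // previously i->Remove()
  return true;
}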
@@ -109,7 +109,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstantTrue()) {
          i->Replace(&OPCODE_TRAP_info, i->flags);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -122,7 +122,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_CALL_info, i->flags);
          i->src1.symbol = symbol;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -146,7 +146,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_CALL_INDIRECT_info, i->flags);
          i->set_src1(value);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      } else if (i->src2.value->IsConstant()) {  // chrispy: fix h3 bug from

@@ -172,7 +172,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_BRANCH_info, i->flags);
          i->src1.label = label;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -184,7 +184,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_BRANCH_info, i->flags);
          i->src1.label = label;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -195,7 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Cast(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -204,7 +204,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Convert(target_type, RoundMode(i->flags));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -212,7 +212,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Round(RoundMode(i->flags));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -221,7 +221,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->ZeroExtend(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -230,7 +230,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->SignExtend(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -239,7 +239,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Truncate(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -247,7 +247,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          if (!(i->src1.value->AsUint32() & 0xF)) {
            v->set_zero(VEC128_TYPE);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
            break;
          }

@@ -281,22 +281,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            switch (v->type) {
              case INT8_TYPE:
                v->set_constant(xe::load<uint8_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT16_TYPE:
                v->set_constant(xe::load<uint16_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT32_TYPE:
                v->set_constant(xe::load<uint32_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT64_TYPE:
                v->set_constant(xe::load<uint64_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case VEC128_TYPE:

@@ -304,7 +304,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
                val.low = xe::load<uint64_t>(host_addr);
                val.high = xe::load<uint64_t>(host_addr + 8);
                v->set_constant(val);
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              default:
@@ -357,14 +357,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
              i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
          }
        } else {
          if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
          }
        }

@@ -381,7 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          } else {
            v->set_constant(uint8_t(0));
          }
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -391,7 +391,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantEQ(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -399,7 +399,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantNE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -407,7 +407,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -415,7 +415,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -423,7 +423,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -431,7 +431,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -439,7 +439,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -447,7 +447,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -455,7 +455,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -463,7 +463,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -477,7 +477,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Add(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -489,7 +489,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            TypeName target_type = v->type;
            v->set_from(ca);
            v->ZeroExtend(target_type);
            i->Remove();
            i->UnlinkAndNOP();
          } else {
            if (i->dest->type == ca->type) {
              i->Replace(&OPCODE_ASSIGN_info, 0);

@@ -507,7 +507,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Sub(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -516,7 +516,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Mul(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src1.value->IsConstant() ||
                   i->src2.value->IsConstant()) {

@@ -548,7 +548,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -557,13 +557,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src2.value->MaybeFloaty() &&
                   i->src2.value->IsConstantZero()) {
          // division by 0 == 0 every time,
          v->set_zero(i->src2.value->type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstant()) {
          // Division by one = no-op.
@@ -592,7 +592,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          v->set_from(i->src1.value);
          v->Max(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -600,7 +600,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Neg();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -608,7 +608,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Abs();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -616,7 +616,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sqrt();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -624,7 +624,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RSqrt();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -632,7 +632,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Recip();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -640,7 +640,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->And(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -648,7 +648,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->AndNot(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -656,7 +656,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Or(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -664,13 +664,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Xor(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src1.value->IsConstant() &&
                   !i->src2.value->IsConstant() &&
                   i->src1.value == i->src2.value) {
          v->set_zero(v->type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -678,7 +678,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Not();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -687,7 +687,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shl(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;

@@ -702,7 +702,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shr(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;

@@ -716,7 +716,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sha(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -724,7 +724,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RotateLeft(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -732,7 +732,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->ByteSwap();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -740,7 +740,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_zero(v->type);
          v->CountLeadingZeros(i->src1.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -751,7 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            (i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
          v->set_from(i->src1.value);
          v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
@ -765,7 +765,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
*/
|
||||
|
||||
v->set_zero(VEC128_TYPE);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
|
||||
|
@ -777,7 +777,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -785,7 +785,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -793,7 +793,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_zero(v->type);
|
||||
v->Extract(i->src1.value, i->src2.value);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -801,7 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_zero(v->type);
|
||||
v->Splat(i->src1.value);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -809,7 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -817,7 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -825,7 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -833,7 +833,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -841,7 +841,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -850,7 +850,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->set_zero(VEC128_TYPE);
|
||||
v->VectorConvertF2I(i->src1.value,
|
||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -859,7 +859,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->set_zero(VEC128_TYPE);
|
||||
v->VectorConvertI2F(i->src1.value,
|
||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -867,7 +867,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorShl(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -875,7 +875,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorShr(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -883,7 +883,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorRol(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -894,7 +894,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorAdd(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -905,7 +905,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -917,7 +917,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorAverage(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -926,7 +926,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DenormalFlush();
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
|
|
@@ -146,7 +146,7 @@ void ContextPromotionPass::RemoveDeadStoresBlock(Block* block) {
         validity.set(static_cast<uint32_t>(offset));
       } else {
         // Already written to. Remove this store.
-        i->Remove();
+        i->UnlinkAndNOP();
       }
     }
     i = prev;
@@ -120,7 +120,8 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
       Instr* next = i->next;
       if (i->opcode == &OPCODE_NOP_info) {
        // Nop - remove!
-        i->Remove();
+        i->UnlinkAndNOP();
+        i->Deallocate();
       }
       i = next;
     }
@@ -148,7 +149,9 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
 
 void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
   i->opcode = &hir::OPCODE_NOP_info;
-  i->dest->def = NULL;
+  if (i->dest) {
+    i->dest->def = NULL;
+  }
   i->dest = NULL;
 
 #define MAKE_NOP_SRC(n) \
@@ -163,7 +166,9 @@ void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
     if (value->def && value->def != i) {              \
       MakeNopRecursive(value->def);                   \
     }                                                 \
+    HIRBuilder::GetCurrent()->DeallocateValue(value); \
   }                                                   \
+  HIRBuilder::GetCurrent()->DeallocateUse(use);       \
   }
   MAKE_NOP_SRC(1);
   MAKE_NOP_SRC(2);
@@ -189,7 +194,8 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) {
     use = use->next;
   }
 
-  i->Remove();
+  i->UnlinkAndNOP();
+  i->Deallocate();
 }
 
 bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
@@ -204,11 +210,11 @@ bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
     }
 
     // Load/store are paired. They can both be removed.
-    use_instr->Remove();
+    use_instr->UnlinkAndNOP();
   }
 
-  i->Remove();
+  i->UnlinkAndNOP();
+  i->Deallocate();
   return false;
 }
 
@@ -61,7 +61,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
       auto target = tail->src1.label;
       if (target->block == block->next) {
         // Jumping to subsequent block. Remove.
-        tail->Remove();
+        tail->UnlinkAndNOP();
       }
     }
 
@@ -46,15 +46,27 @@ namespace hir {
   (value->type) == FLOAT64_TYPE || (value->type) == VEC128_TYPE)
 #define ASSERT_TYPES_EQUAL(value1, value2) \
   assert_true((value1->type) == (value2->type))
 
+thread_local HIRBuilder* thrd_current_hirfunction = nullptr;
 HIRBuilder::HIRBuilder() {
   arena_ = new Arena();
   Reset();
 }
 
+HIRBuilder* HIRBuilder::GetCurrent() { return thrd_current_hirfunction; }
+
+void HIRBuilder::MakeCurrent() { thrd_current_hirfunction = this; }
+void HIRBuilder::RemoveCurrent() {
+  if (thrd_current_hirfunction == this) {
+    thrd_current_hirfunction = nullptr;
+  }
+}
+
 HIRBuilder::~HIRBuilder() {
   Reset();
   delete arena_;
+  if (thrd_current_hirfunction == this) {
+    thrd_current_hirfunction = nullptr;
+  }
 }
 
 void HIRBuilder::Reset() {
@@ -105,7 +117,37 @@ bool HIRBuilder::Finalize() {
   }
   return true;
 }
+Instr* HIRBuilder::AllocateInstruction() {
+  Instr* result = free_instrs_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Instr>();
+}
+
+Value* HIRBuilder::AllocateValue() {
+  Value* result = free_values_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Value>();
+}
+Value::Use* HIRBuilder::AllocateUse() {
+  Value::Use* result = free_uses_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Value::Use>();
+}
+void HIRBuilder::DeallocateInstruction(Instr* instr) {
+  // free_instrs_.DeleteEntry(instr);
+}
+void HIRBuilder::DeallocateValue(Value* value) {
+  // free_values_.DeleteEntry(value);
+}
+void HIRBuilder::DeallocateUse(Value::Use* use) {
+  // free_uses_.DeleteEntry(use);
+}
 void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
   if (value->IsConstant()) {
     switch (value->type) {
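The freelists consulted above are not shown in this diff. Below is a minimal sketch of the shape SimpleFreelist<T> would need for these call sites (NewEntry/DeleteEntry/Reset) to work; this is an assumption inferred from usage, not the actual contents of xenia/base/simple_freelist.h.

// Assumed shape of SimpleFreelist<T>: an intrusive singly linked free list.
// NewEntry() recycles a node or returns nullptr so the caller can fall back
// to the bump arena; Reset() simply forgets the list (the arena owns memory).
template <typename T>
class SimpleFreelist {
  struct Node {
    Node* next;
  };
  Node* head_ = nullptr;

 public:
  T* NewEntry() {
    Node* node = head_;
    if (!node) {
      return nullptr;  // caller falls back to arena()->Alloc<T>()
    }
    head_ = node->next;
    return reinterpret_cast<T*>(node);  // storage reused; caller reinitializes
  }
  void DeleteEntry(T* value) {
    Node* node = reinterpret_cast<Node*>(value);
    node->next = head_;  // entries must be at least pointer-sized
    head_ = node;
  }
  void Reset() { head_ = nullptr; }  // arena reset reclaims the bytes
};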
@@ -545,12 +587,12 @@ void HIRBuilder::MergeAdjacentBlocks(Block* left, Block* right) {
     auto sig = left->instr_tail->opcode->signature;
     if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_L) {
       if (left->instr_tail->src1.label->block == right) {
-        left->instr_tail->Remove();
+        left->instr_tail->UnlinkAndNOP();
       }
     }
     if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_L) {
       if (left->instr_tail->src2.label->block == right) {
-        left->instr_tail->Remove();
+        left->instr_tail->UnlinkAndNOP();
       }
     }
   }
@@ -678,7 +720,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
   }
   Block* block = current_block_;
 
-  Instr* instr = arena_->Alloc<Instr>();
+  Instr* instr = AllocateInstruction();
   instr->next = NULL;
   instr->prev = block->instr_tail;
   if (block->instr_tail) {
@@ -705,7 +747,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
 }
 
 Value* HIRBuilder::AllocValue(TypeName type) {
-  Value* value = arena_->Alloc<Value>();
+  Value* value = AllocateValue();
   value->ordinal = next_value_ordinal_++;
   value->type = type;
   value->flags = 0;
@@ -719,7 +761,7 @@ Value* HIRBuilder::AllocValue(TypeName type) {
 }
 
 Value* HIRBuilder::CloneValue(Value* source) {
-  Value* value = arena_->Alloc<Value>();
+  Value* value = AllocateValue();
   value->ordinal = next_value_ordinal_++;
   value->type = source->type;
   value->flags = source->flags;
@@ -1295,6 +1337,9 @@ void HIRBuilder::CacheControl(Value* address, size_t cache_line_size,
 
 void HIRBuilder::MemoryBarrier() { AppendInstr(OPCODE_MEMORY_BARRIER_info, 0); }
 
+void HIRBuilder::DelayExecution() {
+  AppendInstr(OPCODE_DELAY_EXECUTION_info, 0);
+}
 void HIRBuilder::SetRoundingMode(Value* value) {
   ASSERT_INTEGER_TYPE(value);
   Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
@@ -15,6 +15,8 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/arena.h"
 #include "xenia/base/string_buffer.h"
+
+#include "xenia/base/simple_freelist.h"
 #include "xenia/cpu/hir/block.h"
 #include "xenia/cpu/hir/instr.h"
 #include "xenia/cpu/hir/label.h"
@@ -31,11 +33,20 @@ enum FunctionAttributes {
 };
 
 class HIRBuilder {
+  SimpleFreelist<Instr> free_instrs_;
+  SimpleFreelist<Value> free_values_;
+  SimpleFreelist<Value::Use> free_uses_;
+
  public:
   HIRBuilder();
   virtual ~HIRBuilder();
+  static HIRBuilder* GetCurrent();
+
+  void MakeCurrent();
+  void RemoveCurrent();
 
   virtual void Reset();
 
   virtual bool Finalize();
 
   void Dump(StringBuffer* str);
@@ -66,6 +77,18 @@ class HIRBuilder {
   void RemoveBlock(Block* block);
   void MergeAdjacentBlocks(Block* left, Block* right);
 
+  Instr* AllocateInstruction();
+
+  Value* AllocateValue();
+  Value::Use* AllocateUse();
+  void DeallocateInstruction(Instr* instr);
+  void DeallocateValue(Value* value);
+  void DeallocateUse(Value::Use* use);
+  void ResetPools() {
+    free_instrs_.Reset();
+    free_uses_.Reset();
+    free_values_.Reset();
+  }
   // static allocations:
   // Value* AllocStatic(size_t length);
 
@@ -176,7 +199,7 @@ class HIRBuilder {
   void CacheControl(Value* address, size_t cache_line_size,
                     CacheControlType type);
   void MemoryBarrier();
-
+  void DelayExecution();
   void SetRoundingMode(Value* value);
   Value* Max(Value* value1, Value* value2);
   Value* VectorMax(Value* value1, Value* value2, TypeName part_type,
@@ -10,7 +10,7 @@
 #include "xenia/cpu/hir/instr.h"
 
 #include "xenia/cpu/hir/block.h"
-
+#include "xenia/cpu/hir/hir_builder.h"
 namespace xe {
 namespace cpu {
 namespace hir {
@@ -62,21 +62,35 @@ void Instr::Replace(const OpcodeInfo* new_opcode, uint16_t new_flags) {
   if (src1_use) {
     src1.value->RemoveUse(src1_use);
     src1.value = NULL;
-    src1_use = NULL;
+    // src1_use = NULL;
   }
   if (src2_use) {
     src2.value->RemoveUse(src2_use);
     src2.value = NULL;
-    src2_use = NULL;
+    // src2_use = NULL;
   }
   if (src3_use) {
     src3.value->RemoveUse(src3_use);
     src3.value = NULL;
-    src3_use = NULL;
+    // src3_use = NULL;
   }
+
+  if (src1_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src1_use);
+    src1_use = nullptr;
+  }
+  if (src2_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src2_use);
+    src2_use = nullptr;
+  }
+
+  if (src3_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src3_use);
+    src3_use = nullptr;
+  }
 }
 
-void Instr::Remove() {
+void Instr::UnlinkAndNOP() {
   // Remove all srcs/dest.
   Replace(&OPCODE_NOP_info, 0);
 
@@ -91,6 +105,10 @@ void Instr::UnlinkAndNOP() {
     block->instr_tail = prev;
   }
 }
+
+void Instr::Deallocate() {
+  HIRBuilder::GetCurrent()->DeallocateInstruction(this);
+}
 Instr* Instr::GetDestDefSkipAssigns() {
   Instr* current_def = this;
 
@@ -78,7 +78,12 @@ class Instr {
 
   void MoveBefore(Instr* other);
   void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
-  void Remove();
+  void UnlinkAndNOP();
+  // chrispy: wanted to name this one Remove, but Remove itself was renamed to
+  // UnlinkAndNOP; if changes landed in master that still used Remove, they
+  // would silently pick up the wrong behavior here, and the cause would be
+  // difficult to track down.
+  void Deallocate();
   const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
   // if opcode is null, we have bigger problems
   Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
@@ -292,7 +292,7 @@ enum Opcode {
   // as we already have OPCODE_ROUND. round double to float (
   // ppc "single" fpu instruction result rounding behavior )
   OPCODE_SET_NJM,
-
+  OPCODE_DELAY_EXECUTION,  // for db16cyc
   __OPCODE_MAX_VALUE,  // Keep at end.
 };
 
@@ -218,7 +218,7 @@ DEFINE_OPCODE(
     "context_barrier",
     OPCODE_SIG_X,
     0)
-
+DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0)
 DEFINE_OPCODE(
     OPCODE_LOAD_MMIO,
     "load_mmio",
@@ -16,13 +16,13 @@
 #include "xenia/base/assert.h"
 #include "xenia/base/byte_order.h"
 #include "xenia/base/math.h"
-
+#include "xenia/cpu/hir/hir_builder.h"
 namespace xe {
 namespace cpu {
 namespace hir {
 
 Value::Use* Value::AddUse(Arena* arena, Instr* instr) {
-  Use* use = arena->Alloc<Use>();
+  Use* use = HIRBuilder::GetCurrent()->AllocateUse();
   use->instr = instr;
   use->prev = NULL;
   use->next = use_head;
@@ -42,6 +42,8 @@ void Value::RemoveUse(Use* use) {
   if (use->next) {
     use->next->prev = use->prev;
   }
+
+  // HIRBuilder::GetCurrent()->DeallocateUse(use);
 }
 
 uint32_t Value::AsUint32() {
@@ -789,8 +789,15 @@ int InstrEmit_norx(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_orx(PPCHIRBuilder& f, const InstrData& i) {
   // RA <- (RS) | (RB)
   if (i.X.RT == i.X.RB && i.X.RT == i.X.RA && !i.X.Rc) {
-    // Sometimes used as no-op.
-    f.Nop();
+    // chrispy: this special version of orx is db16cyc and is heavily used in
+    // spinlocks. since we do not emit any code for this, we end up wasting a
+    // ton of power
+    if (i.code == 0x7FFFFB78) {
+      f.DelayExecution();
+    } else {
+      // Sometimes used as no-op.
+      f.Nop();
+    }
     return 0;
   }
   Value* ra;
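The magic number 0x7FFFFB78 checked above is the full instruction word for `or r31, r31, r31`, the idiom the 360 toolchain emits as the `db16cyc` power-saving delay hint inside spinlock bodies. A sketch of how that constant decomposes under the standard PowerPC X-form field layout (the helper name is illustrative, not from this commit):

#include <cstdint>

// Decompose a PPC "or" instruction word and test for the db16cyc idiom
// (or r31,r31,r31). Field layout follows the standard PowerPC X-form.
constexpr bool IsDb16cyc(uint32_t code) {
  uint32_t opcd = code >> 26;         // primary opcode, 31 for X-form ALU ops
  uint32_t rs = (code >> 21) & 31;    // source register
  uint32_t ra = (code >> 16) & 31;    // destination register (for "or")
  uint32_t rb = (code >> 11) & 31;    // second source register
  uint32_t xo = (code >> 1) & 0x3FF;  // extended opcode, 444 = "or"
  uint32_t rc = code & 1;             // record (condition register) bit
  return opcd == 31 && xo == 444 && rc == 0 && rs == 31 && ra == 31 &&
         rb == 31;
}
static_assert(IsDb16cyc(0x7FFFFB78), "db16cyc is or r31,r31,r31");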
@@ -117,6 +117,7 @@ bool PPCFrontend::DefineFunction(GuestFunction* function,
                                  uint32_t debug_info_flags) {
   auto translator = translator_pool_.Allocate(this);
   bool result = translator->Translate(function, debug_info_flags);
+  translator->Reset();
   translator_pool_.Release(translator);
   return result;
 }
@@ -96,10 +96,25 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) {
 
 PPCTranslator::~PPCTranslator() = default;
 
+class HirBuilderScope {
+  PPCHIRBuilder* builder_;
+
+ public:
+  HirBuilderScope(PPCHIRBuilder* builder) : builder_(builder) {
+    builder_->MakeCurrent();
+  }
+
+  ~HirBuilderScope() {
+    if (builder_) {
+      builder_->RemoveCurrent();
+    }
+  }
+};
+
 bool PPCTranslator::Translate(GuestFunction* function,
                               uint32_t debug_info_flags) {
   SCOPE_profile_cpu_f("cpu");
-
+  HirBuilderScope hir_build_scope{builder_.get()};
   // Reset() all caching when we leave.
   xe::make_reset_scope(builder_);
   xe::make_reset_scope(compiler_);
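HirBuilderScope is a plain RAII guard over the thread-local current-builder pointer introduced in hir_builder.cc above: while it is alive, allocation hooks deep inside Instr and Value (Value::AddUse, Instr::Deallocate, the MAKE_NOP_SRC macro) reach the right pools through HIRBuilder::GetCurrent() without every call site having to thread a builder pointer through. A hedged usage sketch; note the design choice that the destructor clears the slot rather than restoring a previous value, so scopes for different builders must not nest on one thread:

// Usage sketch: `builder` becomes the thread's current HIRBuilder for the
// duration of a translation pass.
bool TranslateWithScope(PPCHIRBuilder* builder) {
  HirBuilderScope scope{builder};  // ctor -> MakeCurrent()
  // ... emit HIR; Value::AddUse etc. reach `builder` via GetCurrent() ...
  return true;
}  // ~HirBuilderScope() -> RemoveCurrent() clears (does not restore) the slot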
@@ -196,7 +211,7 @@ bool PPCTranslator::Translate(GuestFunction* function,
 
   return true;
 }
-
+void PPCTranslator::Reset() { builder_->ResetPools(); }
 void PPCTranslator::DumpSource(GuestFunction* function,
                                StringBuffer* string_buffer) {
   Memory* memory = frontend_->memory();
@@ -31,7 +31,7 @@ class PPCTranslator {
   ~PPCTranslator();
 
   bool Translate(GuestFunction* function, uint32_t debug_info_flags);
-
+  void Reset();
  private:
   void DumpSource(GuestFunction* function, StringBuffer* string_buffer);
 
File diff suppressed because it is too large
@@ -19,6 +19,7 @@
 #include <string>
 #include <vector>
 
+#include "xenia/base/dma.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/base/threading.h"
 #include "xenia/gpu/register_file.h"
@@ -66,6 +67,11 @@ enum class GammaRampType {
 };
 
 class CommandProcessor {
+ protected:
+  RingBuffer
+      reader_;  // chrispy: instead of keeping the ring buffer on the stack,
+                // place it near the start of the class so it can be addressed
+                // via rel8; this also reduces the number of params to pass
  public:
   enum class SwapPostEffect {
     kNone,
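The rel8 remark is about x86-64 instruction encoding: a member within the first 127 bytes of the object can be addressed with a one-byte displacement off `this` (disp8), while later members cost a four-byte disp32 in every instruction that touches them. A toy illustration of the layout rule (sizes and names are made up):

// [this + disp] encodes with disp8 only for offsets -128..127, so members
// placed past ~128 bytes make every hot-path access one encoding larger.
// Hot members first => smaller, denser code.
struct LayoutSketch {
  char hot_state[64];    // offset 0..63: disp8 addressing everywhere
  char rarely_used[4096];  // pushes anything after it out of disp8 range
  int counter;             // disp32 at every access
};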
@@ -76,7 +82,7 @@ class CommandProcessor {
   CommandProcessor(GraphicsSystem* graphics_system,
                    kernel::KernelState* kernel_state);
   virtual ~CommandProcessor();
-
+  dma::XeDMAC* GetDMAC() const { return dmac_; }
   uint32_t counter() const { return counter_; }
   void increment_counter() { counter_++; }
 
@@ -101,7 +107,7 @@ class CommandProcessor {
   // screen right in the beginning of 4D530AA4 is not a resolved render target,
   // for instance).
   virtual void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width,
-                         uint32_t frontbuffer_height) = 0;
+                         uint32_t frontbuffer_height) {}
 
   // May be called not only from the command processor thread when the command
   // processor is paused, and the termination of this function may be explicitly
@@ -153,7 +159,7 @@ class CommandProcessor {
   // rarely needed, most register writes have no special logic here
   XE_NOINLINE
   void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
-  XE_FORCEINLINE
+
   virtual void WriteRegister(uint32_t index, uint32_t value);
 
   // mem has big-endian register values
|
|||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
XE_NOINLINE
|
||||
virtual void WriteOneRegisterFromRing(
|
||||
xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t base,
|
||||
uint32_t
|
||||
num_times); // repeatedly write a value to one register, presumably a
|
||||
// register with special handling for writes
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
||||
return gamma_ramp_256_entry_table_;
|
||||
}
|
||||
|
@@ -186,75 +233,22 @@ class CommandProcessor {
 
   uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
   virtual void OnPrimaryBufferEnd() {}
-  void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
-  bool ExecutePacket(RingBuffer* reader);
-  bool ExecutePacketType0(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType1(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType2(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType3(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType3_ME_INIT(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_NOP(RingBuffer* reader, uint32_t packet,
-                              uint32_t count);
-  bool ExecutePacketType3_INTERRUPT(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_XE_SWAP(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_INDIRECT_BUFFER(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, uint32_t packet,
-                                       uint32_t count);
-  bool ExecutePacketType3_REG_RMW(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, uint32_t packet,
-                                     uint32_t count);
-  bool ExecutePacketType3_MEM_WRITE(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_COND_WRITE(RingBuffer* reader, uint32_t packet,
-                                     uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE(RingBuffer* reader, uint32_t packet,
-                                      uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_SHD(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet,
-                              const char* opcode_name,
-                              uint32_t viz_query_condition,
-                              uint32_t count_remaining);
-  bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet,
-                                      uint32_t count);
-  bool ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, uint32_t packet,
-                                       uint32_t count);
-  bool ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t packet,
-                                        uint32_t count);
-  bool ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, uint32_t packet,
-                                            uint32_t count);
-  bool ExecutePacketType3_SET_SHADER_CONSTANTS(RingBuffer* reader,
-                                               uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_IM_LOAD(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_IM_LOAD_IMMEDIATE(RingBuffer* reader,
-                                            uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_INVALIDATE_STATE(RingBuffer* reader, uint32_t packet,
-                                           uint32_t count);
-  bool ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
+#include "pm4_command_processor_declare.h"
 
   virtual Shader* LoadShader(xenos::ShaderType shader_type,
                              uint32_t guest_address,
                              const uint32_t* host_address,
-                             uint32_t dword_count) = 0;
+                             uint32_t dword_count) {
+    return nullptr;
+  }
 
   virtual bool IssueDraw(xenos::PrimitiveType prim_type, uint32_t index_count,
                          IndexBufferInfo* index_buffer_info,
-                         bool major_mode_explicit) = 0;
-  virtual bool IssueCopy() = 0;
+                         bool major_mode_explicit) {
+    return false;
+  }
+  virtual bool IssueCopy() { return false; }
 
   // "Actual" is for the command processor thread, to be read by the
   // implementations.
@@ -267,7 +261,7 @@ class CommandProcessor {
   Memory* memory_ = nullptr;
   kernel::KernelState* kernel_state_ = nullptr;
   GraphicsSystem* graphics_system_ = nullptr;
-  RegisterFile* register_file_ = nullptr;
+  RegisterFile* XE_RESTRICT register_file_ = nullptr;
 
   TraceWriter trace_writer_;
   enum class TraceState {
@@ -316,6 +310,7 @@ class CommandProcessor {
   reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
   reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
   uint32_t gamma_ramp_rw_component_ = 0;
+  dma::XeDMAC* dmac_ = nullptr;
 };
 
 }  // namespace gpu
File diff suppressed because it is too large
@@ -1,4 +1,5 @@
 /**
+/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project *
@@ -35,6 +36,7 @@
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
+#include "xenia/kernel/user_module.h"
 #include "xenia/ui/d3d12/d3d12_descriptor_heap_pool.h"
 #include "xenia/ui/d3d12/d3d12_provider.h"
 #include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
@@ -46,6 +48,7 @@ namespace d3d12 {
 
 class D3D12CommandProcessor final : public CommandProcessor {
  public:
+#include "../pm4_command_processor_declare.h"
   explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
                                  kernel::KernelState* kernel_state);
   ~D3D12CommandProcessor();
@@ -205,22 +208,70 @@ class D3D12CommandProcessor final : public CommandProcessor {
  protected:
   bool SetupContext() override;
   void ShutdownContext() override;
+
   XE_FORCEINLINE
   void WriteRegister(uint32_t index, uint32_t value) override;
   XE_FORCEINLINE
   virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                      uint32_t num_registers) override;
+
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
+      uint32_t start_index, uint32_t* base, uint32_t num_registers);
   XE_FORCEINLINE
   virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
                                           uint32_t num_registers) override;
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromRing_WithKnownBound(
+      xe::RingBuffer* ring, uint32_t base, uint32_t num_registers);
+
+  XE_NOINLINE
+  void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring,
+                                                 uint32_t base,
+                                                 uint32_t num_registers);
-  XE_FORCEINLINE
-  virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
-                                        uint32_t num_registers);
+  XE_NOINLINE
+  virtual void WriteOneRegisterFromRing(uint32_t base,
+                                        uint32_t num_times) override;
+
+  XE_FORCEINLINE
+  void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                             uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                               uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                                   uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
+                            uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
+                              uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
+                                  uint32_t num_registers);
 
   void OnGammaRamp256EntryTableValueWritten() override;
   void OnGammaRampPWLValueWritten() override;
@@ -367,6 +418,14 @@ class D3D12CommandProcessor final : public CommandProcessor {
                            const draw_util::Scissor& scissor,
                            bool primitive_polygonal,
                            reg::RB_DEPTHCONTROL normalized_depth_control);
+
+  template <bool primitive_polygonal, bool edram_rov_used>
+  XE_NOINLINE void UpdateSystemConstantValues_Impl(
+      bool shared_memory_is_uav, uint32_t line_loop_closing_index,
+      xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
+      uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
+      uint32_t normalized_color_mask);
+
   void UpdateSystemConstantValues(bool shared_memory_is_uav,
                                   bool primitive_polygonal,
                                   uint32_t line_loop_closing_index,
@@ -619,8 +678,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
   uint32_t current_graphics_root_up_to_date_;
 
   // System shader constants.
-  alignas(XE_HOST_CACHE_LINE_SIZE)
-      DxbcShaderTranslator::SystemConstants system_constants_;
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+      DxbcShaderTranslator::SystemConstants system_constants_;
 
   // Float constant usage masks of the last draw call.
   // chrispy: make sure accesses to these can't cross cacheline boundaries
@@ -0,0 +1,122 @@
+#pragma once
+// requires windows.h
+#include <stdint.h>
+
+namespace lightweight_nvapi {
+
+using nvstatus_t = int;
+
+using nvintfid_t = unsigned int;
+
+#ifndef LIGHTWEIGHT_NVAPI_EXCLUDE_D3D12
+constexpr nvintfid_t id_NvAPI_D3D12_QueryCpuVisibleVidmem = 0x26322BC3;
+
+using cb_NvAPI_D3D12_QueryCpuVisibleVidmem = nvstatus_t (*)(
+    ID3D12Device* pDevice, uint64_t* pTotalBytes, uint64_t* pFreeBytes);
+
+constexpr nvintfid_t id_NvAPI_D3D12_UseDriverHeapPriorities = 0xF0D978A8;
+using cb_NvAPI_D3D12_UseDriverHeapPriorities =
+    nvstatus_t (*)(ID3D12Device* pDevice);
+enum NV_D3D12_RESOURCE_FLAGS {
+  NV_D3D12_RESOURCE_FLAG_NONE = 0,
+  NV_D3D12_RESOURCE_FLAG_HTEX = 1,  //!< Create HTEX texture
+  NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM =
+      2,  //!< Hint to create resource in cpuvisible vidmem
+};
+
+struct NV_RESOURCE_PARAMS {
+  uint32_t version;  //!< Version of structure. Must always be first member
+  NV_D3D12_RESOURCE_FLAGS
+      NVResourceFlags;  //!< Additional NV specific flags (set the
+                        //!< NV_D3D12_RESOURCE_FLAG_HTEX bit to create HTEX
+                        //!< texture)
+};
+
+using cb_NvAPI_D3D12_CreateCommittedResource = nvstatus_t (*)(
+    ID3D12Device* pDevice, const D3D12_HEAP_PROPERTIES* pHeapProperties,
+    D3D12_HEAP_FLAGS HeapFlags, const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialState,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue,
+    const NV_RESOURCE_PARAMS* pNVResourceParams, REFIID riid,
+    void** ppvResource, bool* pSupported);
+constexpr nvintfid_t id_NvAPI_D3D12_CreateCommittedResource = 0x27E98AEu;
+#endif
+class nvapi_state_t {
+  HMODULE nvapi64_;
+  void* (*queryinterface_)(unsigned int intfid);
+  bool available_;
+  bool init_ptrs();
+
+  bool call_init_interface();
+  void call_deinit_interface();
+
+ public:
+  nvapi_state_t() : nvapi64_(LoadLibraryA("nvapi64.dll")), available_(false) {
+    available_ = init_ptrs();
+  }
+  ~nvapi_state_t();
+  template <typename T>
+  T* query_interface(unsigned int intfid) {
+    if (queryinterface_ == nullptr) {
+      return nullptr;
+    }
+    return reinterpret_cast<T*>(queryinterface_(intfid));
+  }
+
+  bool is_available() const { return available_; }
+};
+inline bool nvapi_state_t::call_init_interface() {
+  int result = -1;
+  auto initInterfaceEx = query_interface<int(int)>(0xAD298D3F);
+  if (!initInterfaceEx) {
+    auto initInterface = query_interface<int()>(0x150E828u);
+    if (initInterface) {
+      result = initInterface();
+    }
+  } else {
+    result = initInterfaceEx(0);
+  }
+  return result == 0;
+}
+inline void nvapi_state_t::call_deinit_interface() {
+  auto deinitinterfaceex = query_interface<void(int)>(0xD7C61344);
+  if (deinitinterfaceex) {
+    deinitinterfaceex(1);  // or 0? not sure what the proper value is
+  } else {
+    auto deinitinterface = query_interface<void()>(0xD22BDD7E);
+    if (deinitinterface) {
+      deinitinterface();
+    }
+  }
+}
+inline bool nvapi_state_t::init_ptrs() {
+  if (!nvapi64_) return false;
+  queryinterface_ = reinterpret_cast<void* (*)(unsigned)>(
+      GetProcAddress(nvapi64_, "nvapi_QueryInterface"));
+
+  if (!queryinterface_) {
+    return false;
+  }
+  if (!call_init_interface()) {
+    return false;
+  }
+
+  return true;
+}
+inline nvapi_state_t::~nvapi_state_t() {
+  if (available_) {
+    call_deinit_interface();
+  }
+}
+inline void init_nvapi() {
+  // HMODULE moddy = LoadLibraryA("nvapi64.dll");
+  // FARPROC quif = GetProcAddress(moddy, "nvapi_QueryInterface");
+  nvapi_state_t nvapi{};
+
+  auto queryvisible = nvapi.query_interface<void>(0x26322BC3);
+  return;
+}
+
+}  // namespace lightweight_nvapi
@@ -108,12 +108,11 @@ bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
         size_bytes);
     return false;
   }
 
   Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesUpload,
+  if (!provider.CreateUploadResource(
           provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
-          D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&upload_resource)))) {
+          D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&upload_resource))) {
     XELOGE(
         "D3D12 primitive processor: Failed to create the built-in index "
         "buffer upload resource with {} bytes",
@@ -5492,11 +5492,19 @@ void D3D12RenderTargetCache::SetCommandListRenderTargets(
   }
 
   // Bind the render targets.
-  if (are_current_command_list_render_targets_valid_ &&
-      std::memcmp(current_command_list_render_targets_,
-                  depth_and_color_render_targets,
-                  sizeof(current_command_list_render_targets_))) {
-    are_current_command_list_render_targets_valid_ = false;
+  if (are_current_command_list_render_targets_valid_) {
+    // chrispy: the small memcmp doesn't get optimized by msvc
+    for (unsigned i = 0;
+         i < sizeof(current_command_list_render_targets_) /
+                 sizeof(current_command_list_render_targets_[0]);
+         ++i) {
+      if ((const void*)current_command_list_render_targets_[i] !=
+          (const void*)depth_and_color_render_targets[i]) {
+        are_current_command_list_render_targets_valid_ = false;
+        break;
+      }
+    }
   }
   uint32_t render_targets_are_srgb;
   if (gamma_render_target_as_srgb_) {
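A note on the rewrite above: for a tiny fixed-size array of pointers, MSVC tends to emit a real call to the memcmp runtime routine, whereas an explicit element loop compiles to a handful of compares and can stop at the first mismatch. The generic shape, with stand-in types (illustrative only, not the project's helper):

// Element-wise compare of two small pointer arrays; exits on first mismatch.
template <typename T, size_t N>
bool AnyElementDiffers(T* const (&a)[N], T* const (&b)[N]) {
  for (size_t i = 0; i < N; ++i) {
    if (a[i] != b[i]) {
      return true;
    }
  }
  return false;
}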
@@ -467,7 +467,7 @@ void D3D12TextureCache::EndFrame() {
       XELOGE("Unsupported texture formats used in the frame:");
       unsupported_header_written = true;
     }
-    XELOGE("* {}{}{}{}", FormatInfo::Get(xenos::TextureFormat(i))->name,
+    XELOGE("* {}{}{}{}", FormatInfo::GetName(xenos::TextureFormat(i)),
            unsupported_features & kUnsupportedResourceBit ? " resource" : "",
            unsupported_features & kUnsupportedUnormBit ? " unsigned" : "",
            unsupported_features & kUnsupportedSnormBit ? " signed" : "");
@@ -523,12 +523,16 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
     }
   }
 }
-
+// chrispy: optimize this further
 bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
     const TextureSRVKey* keys,
     const D3D12Shader::TextureBinding* host_shader_bindings,
     size_t host_shader_binding_count) const {
   for (size_t i = 0; i < host_shader_binding_count; ++i) {
+    if (i + 8 < host_shader_binding_count) {
+      PrefetchTextureBinding<swcache::PrefetchTag::Nontemporal>(
+          host_shader_bindings[i + 8].fetch_constant);
+    }
     const TextureSRVKey& key = keys[i];
     const TextureBinding* binding =
         GetValidTextureBinding(host_shader_bindings[i].fetch_constant);
@@ -538,8 +542,9 @@ bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
       }
       continue;
     }
-    if (key.key != binding->key || key.host_swizzle != binding->host_swizzle ||
-        key.swizzled_signs != binding->swizzled_signs) {
+    if ((key.key != binding->key) |
+        (key.host_swizzle != binding->host_swizzle) |
+        (key.swizzled_signs != binding->swizzled_signs)) {
       return false;
     }
   }
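The `|` above is deliberate: applied to bool-valued comparisons it computes all three tests and merges them as straight-line code, while `||` obliges the compiler to branch after each term to preserve short-circuit semantics. It is safe here because the comparisons have no side effects. In miniature:

// Same truth table, different codegen. The '||' version must not evaluate
// later terms when an earlier one is true, which typically costs a branch
// per term; the '|' version evaluates all three and ORs the results.
inline bool AnyDifferShortCircuit(int a, int b, int c, int d, int e, int f) {
  return (a != b) || (c != d) || (e != f);
}
inline bool AnyDifferBranchless(int a, int b, int c, int d, int e, int f) {
  return (a != b) | (c != d) | (e != f);
}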
@@ -666,8 +671,12 @@ uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex(
   }
   return descriptor_index;
 }
-
-D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
+void D3D12TextureCache::PrefetchSamplerParameters(
+    const D3D12Shader::SamplerBinding& binding) const {
+  swcache::PrefetchL1(&register_file()[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
+                                       binding.fetch_constant * 6]);
+}
+D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
     const D3D12Shader::SamplerBinding& binding) const {
   const auto& regs = register_file();
   const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
@@ -694,7 +703,7 @@ D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
                                        nullptr, nullptr, nullptr,
                                        &mip_min_level, nullptr);
   parameters.mip_min_level = mip_min_level;
-
+  // high cache miss count here, prefetch the fetch constant earlier
   // TODO(Triang3l): Disable filtering for texture formats not supporting it.
   xenos::AnisoFilter aniso_filter =
       binding.aniso_filter == xenos::AnisoFilter::kUseFetchConst
@@ -119,7 +119,8 @@ class D3D12TextureCache final : public TextureCache {
                                  D3D12_CPU_DESCRIPTOR_HANDLE handle);
   uint32_t GetActiveTextureBindlessSRVIndex(
       const D3D12Shader::TextureBinding& host_shader_binding);
-
+  void PrefetchSamplerParameters(
+      const D3D12Shader::SamplerBinding& binding) const;
   SamplerParameters GetSamplerParameters(
       const D3D12Shader::SamplerBinding& binding) const;
   void WriteSampler(SamplerParameters parameters,
@@ -712,7 +713,7 @@ class D3D12TextureCache final : public TextureCache {
   }
 
   LoadShaderIndex GetLoadShaderIndex(TextureKey key) const;
-
+  // chrispy: todo, can use simple branchless tests here
   static constexpr bool AreDimensionsCompatible(
       xenos::FetchOpDimension binding_dimension,
       xenos::DataDimension resource_dimension) {
@@ -1047,8 +1047,7 @@ bool PipelineCache::ConfigurePipeline(
   PipelineDescription& description = runtime_description.description;
 
   if (current_pipeline_ != nullptr &&
-      !std::memcmp(&current_pipeline_->description.description, &description,
-                   sizeof(description))) {
+      current_pipeline_->description.description == description) {
     *pipeline_handle_out = current_pipeline_;
     *root_signature_out = runtime_description.root_signature;
     return true;
@@ -1059,8 +1058,7 @@ bool PipelineCache::ConfigurePipeline(
   auto found_range = pipelines_.equal_range(hash);
   for (auto it = found_range.first; it != found_range.second; ++it) {
     Pipeline* found_pipeline = it->second;
-    if (!std::memcmp(&found_pipeline->description.description, &description,
-                     sizeof(description))) {
+    if (found_pipeline->description.description == description) {
       current_pipeline_ = found_pipeline;
       *pipeline_handle_out = found_pipeline;
       *root_signature_out = found_pipeline->description.root_signature;
@@ -226,6 +226,7 @@ class PipelineCache {
 
     PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];
 
+    inline bool operator==(const PipelineDescription& other) const;
     static constexpr uint32_t kVersion = 0x20210425;
   });
 
@@ -424,7 +425,34 @@ class PipelineCache {
   size_t creation_threads_shutdown_from_ = SIZE_MAX;
   std::vector<std::unique_ptr<xe::threading::Thread>> creation_threads_;
 };
+inline bool PipelineCache::PipelineDescription::operator==(
+    const PipelineDescription& other) const {
+  constexpr size_t cmp_size = sizeof(PipelineDescription);
+#if XE_ARCH_AMD64 == 1
+  if constexpr (cmp_size == 64) {
+    if (vertex_shader_hash != other.vertex_shader_hash ||
+        vertex_shader_modification != other.vertex_shader_modification) {
+      return false;
+    }
+    const __m128i* thiz = (const __m128i*)this;
+    const __m128i* thoze = (const __m128i*)&other;
+    __m128i cmp32 =
+        _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 1), _mm_loadu_si128(thoze + 1));
+
+    cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 2),
+                                                _mm_loadu_si128(thoze + 2)));
+
+    cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 3),
+                                                _mm_loadu_si128(thoze + 3)));
+
+    return _mm_movemask_epi8(cmp32) == 0xFFFF;
+  } else
+#endif
+  {
+    return !memcmp(this, &other, cmp_size);
+  }
+}
 }  // namespace d3d12
 }  // namespace gpu
 }  // namespace xe
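The operator== above splits the 64-byte description into a scalar early-out on the first 16 bytes (the vertex shader hash and modification, the fields most likely to differ) and a branchless SSE2 comparison of the remaining 48. The core 16-byte equality idiom it builds on, in isolation:

#include <emmintrin.h>  // SSE2

// Byte-wise lane compare yields 0xFF per equal byte; movemask packs the high
// bits, so 0xFFFF means all 16 bytes matched. Wider structs AND more lanes.
inline bool Equal16(const void* a, const void* b) {
  __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a));
  __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
}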
@@ -320,22 +320,38 @@ uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
   // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
   // usually exists and can't be disabled.
   auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
+
   float viewport_bottom = 0.0f;
+  uint32_t enable_window_offset =
+      regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable;
+
+  bool not_pix_center = !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center;
+
+  float window_y_offset_f = float(window_y_offset);
+
+  float yoffset = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+
   // First calculate all the integer.0 or integer.5 offsetting exactly at full
   // precision.
-  if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-    viewport_bottom += float(window_y_offset);
+  // chrispy: branch mispredicts here causing some pain according to vtune
+  float sm1 = .0f, sm2 = .0f, sm3 = .0f, sm4 = .0f;
+
+  if (enable_window_offset) {
+    sm1 = window_y_offset_f;
   }
-  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
-    viewport_bottom += 0.5f;
+  if (not_pix_center) {
+    sm2 = 0.5f;
   }
   // Then apply the floating-point viewport offset.
   if (pa_cl_vte_cntl.vport_y_offset_ena) {
-    viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+    sm3 = yoffset;
   }
-  viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
-                         ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-                         : 1.0f;
+  sm4 = pa_cl_vte_cntl.vport_y_scale_ena
+            ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
+            : 1.0f;
+
+  viewport_bottom = sm1 + sm2 + sm3 + sm4;
+
   // Using floor, or, rather, truncation (because maxing with zero anyway)
   // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
   // GPUs on Direct3D 12 (but not WARP), also like in
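The transformation above replaces conditionally-taken additions with an unconditional sum of terms, each forced to 0.0f when its condition is false; value-selecting ternaries like these typically lower to cmov/blend instructions rather than predicted branches. The pattern in miniature (illustrative helper, not from the diff):

// Branch-heavy form:          Branchless form used above:
//   if (c1) sum += v1;          float t1 = c1 ? v1 : 0.0f;  // cmov/blend
//   if (c2) sum += v2;          float t2 = c2 ? v2 : 0.0f;
//                               sum = t1 + t2;
inline float SumSelected(bool c1, float v1, bool c2, float v2) {
  float t1 = c1 ? v1 : 0.0f;
  float t2 = c2 ? v2 : 0.0f;
  return t1 + t2;  // one unconditional add chain, no mispredicts
}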
@@ -929,8 +929,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
       XELOGW(
           "Resolving to format {}, which is untested - treating like {}. "
           "Report the game to Xenia developers!",
-          FormatInfo::Get(dest_format)->name,
-          FormatInfo::Get(dest_closest_format)->name);
+          FormatInfo::GetName(dest_format),
+          FormatInfo::GetName(dest_closest_format));
     }
   }
 
@@ -1002,7 +1002,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
     }
   } else {
     XELOGE("Tried to resolve to format {}, which is not a ColorFormat",
-           dest_format_info.name);
+           FormatInfo::GetName(dest_format));
     copy_dest_extent_start = copy_dest_base_adjusted;
     copy_dest_extent_end = copy_dest_base_adjusted;
   }
@@ -1117,7 +1117,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
               xenos::DepthRenderTargetFormat(depth_edram_info.format))
         : xenos::GetColorRenderTargetFormatName(
               xenos::ColorRenderTargetFormat(color_edram_info.format)),
-      dest_format_info.name, rb_copy_dest_base, copy_dest_extent_start,
+      FormatInfo::GetName(dest_format), rb_copy_dest_base,
+      copy_dest_extent_start,
       copy_dest_extent_end);
 
   return true;
@@ -0,0 +1,106 @@
+
+void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
+
+virtual bool ExecutePacket();
+XE_NOINLINE
+bool ExecutePacketType0(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType1(uint32_t packet) XE_RESTRICT;
+
+bool ExecutePacketType2(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_ME_INIT(uint32_t packet, uint32_t count) XE_RESTRICT;
+bool ExecutePacketType3_NOP(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_INTERRUPT(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_XE_SWAP(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_INDIRECT_BUFFER(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_WAIT_REG_MEM(uint32_t packet,
+                                     uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_REG_RMW(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_REG_TO_MEM(uint32_t packet,
+                                   uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_MEM_WRITE(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_COND_WRITE(uint32_t packet,
+                                   uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_EVENT_WRITE(uint32_t packet,
+                                    uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_EVENT_WRITE_SHD(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_EVENT_WRITE_EXT(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_EVENT_WRITE_ZPD(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3Draw(uint32_t packet, const char* opcode_name,
+                            uint32_t viz_query_condition,
+                            uint32_t count_remaining) XE_RESTRICT;
+
+bool ExecutePacketType3_DRAW_INDX(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_DRAW_INDX_2(uint32_t packet,
+                                    uint32_t count) XE_RESTRICT;
+XE_FORCEINLINE
+bool ExecutePacketType3_SET_CONSTANT(uint32_t packet,
+                                     uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_SET_CONSTANT2(uint32_t packet,
+                                      uint32_t count) XE_RESTRICT;
+XE_FORCEINLINE
+bool ExecutePacketType3_LOAD_ALU_CONSTANT(uint32_t packet,
+                                          uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_SET_SHADER_CONSTANTS(uint32_t packet,
+                                             uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_IM_LOAD(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_IM_LOAD_IMMEDIATE(uint32_t packet,
+                                          uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_INVALIDATE_STATE(uint32_t packet,
+                                         uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_VIZ_QUERY(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+XE_FORCEINLINE
+void WriteEventInitiator(uint32_t value) XE_RESTRICT;
+
+XE_NOINLINE
+XE_COLD
+bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
+
+XE_NOINLINE
+XE_NOALIAS
+uint32_t GetCurrentRingReadCount();
+
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType3_CountOverflow(uint32_t count);
File diff suppressed because it is too large
@@ -233,15 +233,27 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
   // Fire per-range watches.
   for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
     WatchNode* node = watch_buckets_[i];
+    if (i + 1 <= bucket_last) {
+      WatchNode* nextnode = watch_buckets_[i + 1];
+      if (nextnode) {
+        swcache::PrefetchL1(nextnode->range);
+      }
+    }
     while (node != nullptr) {
       WatchRange* range = node->range;
       // Store the next node now since when the callback is triggered, the
       // links will be broken.
       node = node->bucket_node_next;
+      if (node) {
+        swcache::PrefetchL1(node);
+      }
       if (page_first <= range->page_last && page_last >= range->page_first) {
         range->callback(global_lock, range->callback_context,
                         range->callback_data, range->callback_argument,
                         invalidated_by_gpu);
+        if (node && node->range) {
+          swcache::PrefetchL1(node->range);
+        }
         UnlinkWatchRange(range);
       }
     }
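Each prefetch above hides the latency of the next pointer-chase step behind the current callback's work. A generic sketch of the pattern with compiler builtins (swcache::PrefetchL1 is xenia's wrapper; __builtin_prefetch is the GCC/Clang spelling and purely a hint, so the loop is correct with or without it):

// Prefetch the next node and its payload while processing the current one.
struct Node {
  Node* next;
  void* payload;
};
inline void VisitAll(Node* node, void (*visit)(void*)) {
  while (node) {
    Node* next = node->next;
    if (next) {
      __builtin_prefetch(next);           // the next link
      __builtin_prefetch(next->payload);  // the payload touched next iteration
    }
    visit(node->payload);
    node = next;
  }
}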
@@ -440,7 +440,7 @@ void TextureCache::TextureKey::LogAction(const char* action) const {
       "base at 0x{:08X} (pitch {}), mips at 0x{:08X}",
       action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "",
       GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(),
-      FormatInfo::Get(format)->name, mip_max_level + 1, packed_mips ? "" : "un",
+      FormatInfo::GetName(format), mip_max_level + 1, packed_mips ? "" : "un",
       mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5,
       mip_page << 12);
 }
@@ -453,7 +453,7 @@ void TextureCache::Texture::LogAction(const char* action) const {
       action, key_.tiled ? "tiled" : "linear",
       key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(),
       key_.GetDepthOrArraySize(), key_.GetLogDimensionName(),
-      FormatInfo::Get(key_.format)->name, key_.mip_max_level + 1,
+      FormatInfo::GetName(key_.format), key_.mip_max_level + 1,
       key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? "s" : "",
       key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(),
       key_.mip_page << 12, GetGuestMipsSize());
@@ -128,6 +128,14 @@ class TextureCache {
     return (binding->texture && binding->texture->IsResolved()) ||
            (binding->texture_signed && binding->texture_signed->IsResolved());
   }
+  template <swcache::PrefetchTag tag>
+  void PrefetchTextureBinding(uint32_t fetch_constant_index) const {
+    swcache::Prefetch<tag>(&texture_bindings_[fetch_constant_index]);
+    swcache::Prefetch<tag>(
+        &texture_bindings_[fetch_constant_index +
+                           1]);  // we may cross a cache line boundary :( size
+                                 // of the structure is 0x28
+  }

  protected:
  struct TextureKey {
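PrefetchTextureBinding is parameterized on the prefetch tag so the caller chooses which cache level to warm. A hedged usage sketch; the enumerator name below is an assumption, since the actual tag names live in swcache and are not shown in this diff:

// Warm the binding for fetch constant `i` before the draw-setup code reads it.
// swcache::PrefetchTag::Level1 is a guessed enumerator name.
texture_cache.PrefetchTextureBinding<swcache::PrefetchTag::Level1>(i);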
@@ -85,7 +85,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
       assert_unhandled_case(src.format);
       std::memset(&dds_header.pixel_format, 0xCD,
                   sizeof(dds_header.pixel_format));
-      XELOGW("Skipping {} for texture dump.", src.format_info()->name);
+      XELOGW("Skipping {} for texture dump.", src.format_name());
       return;
     }
   }
@@ -96,7 +96,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
   std::filesystem::path path = "texture_dumps";
   path /= fmt::format("{:05d}_{:08X}_{:08X}_{:08X}.dds", dump_counter++,
                       src.memory.base_address, src.memory.mip_address,
-                      src.format_info()->name);
+                      src.format_name());

   FILE* handle = filesystem::OpenFile(path, "wb");
   if (handle) {
@@ -159,151 +159,6 @@ void TextureInfo::GetMipSize(uint32_t mip, uint32_t* out_width,
   *out_height = std::max(height_pow2 >> mip, 1u);
 }

-uint32_t TextureInfo::GetMipLocation(uint32_t mip, uint32_t* offset_x,
-                                     uint32_t* offset_y, bool is_guest) const {
-  if (mip == 0) {
-    // Short-circuit. Mip 0 is always stored in base_address.
-    if (!has_packed_mips) {
-      *offset_x = 0;
-      *offset_y = 0;
-    } else {
-      GetPackedTileOffset(0, offset_x, offset_y);
-    }
-    return memory.base_address;
-  }
-
-  if (!memory.mip_address) {
-    // Short-circuit. There is no mip data.
-    *offset_x = 0;
-    *offset_y = 0;
-    return 0;
-  }
-
-  uint32_t address_base, address_offset;
-  address_base = memory.mip_address;
-  address_offset = 0;
-
-  auto bytes_per_block = format_info()->bytes_per_block();
-
-  if (!has_packed_mips) {
-    for (uint32_t i = 1; i < mip; i++) {
-      address_offset +=
-          GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
-    }
-    *offset_x = 0;
-    *offset_y = 0;
-    return address_base + address_offset;
-  }
-
-  uint32_t width_pow2 = xe::next_pow2(width + 1);
-  uint32_t height_pow2 = xe::next_pow2(height + 1);
-
-  // Walk forward to find the address of the mip.
-  uint32_t packed_mip_base = 1;
-  for (uint32_t i = packed_mip_base; i < mip; i++, packed_mip_base++) {
-    uint32_t mip_width = std::max(width_pow2 >> i, 1u);
-    uint32_t mip_height = std::max(height_pow2 >> i, 1u);
-    if (std::min(mip_width, mip_height) <= 16) {
-      // We've reached the point where the mips are packed into a single tile.
-      break;
-    }
-    address_offset += GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
-  }
-
-  // Now, check if the mip is packed at an offset.
-  GetPackedTileOffset(width_pow2 >> mip, height_pow2 >> mip, format_info(),
-                      mip - packed_mip_base, offset_x, offset_y);
-  return address_base + address_offset;
-}
-
-bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height,
-                                      const FormatInfo* format_info,
-                                      int packed_tile, uint32_t* offset_x,
-                                      uint32_t* offset_y) {
-  // Tile size is 32x32, and once textures go <=16 they are packed into a
-  // single tile together. The math here is insane. Most sourced
-  // from graph paper and looking at dds dumps.
-  //   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-  // 0 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 1 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 2 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 3 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 4 x       +.....8x8.....+ +............16x16............+
-  // 5         +.....8x8.....+ +............16x16............+
-  // 6         +.....8x8.....+ +............16x16............+
-  // 7         +.....8x8.....+ +............16x16............+
-  // 8 2x2                     +............16x16............+
-  // 9 2x2                     +............16x16............+
-  // 0                         +............16x16............+
-  // ...                       .....
-  // This only works for square textures, or textures that are some non-pot
-  // <= square. As soon as the aspect ratio goes weird, the textures start to
-  // stretch across tiles.
-  //
-  // The 2x2 and 1x1 squares are packed in their specific positions because
-  // each square is the size of at least one block (which is 4x4 pixels max)
-  //
-  // if (tile_aligned(w) > tile_aligned(h)) {
-  //   // wider than tall, so packed horizontally
-  // } else if (tile_aligned(w) < tile_aligned(h)) {
-  //   // taller than wide, so packed vertically
-  // } else {
-  //   square
-  // }
-  // It's important to use logical sizes here, as the input sizes will be
-  // for the entire packed tile set, not the actual texture.
-  // The minimum dimension is what matters most: if either width or height
-  // is <= 16 this mode kicks in.
-
-  uint32_t log2_width = xe::log2_ceil(width);
-  uint32_t log2_height = xe::log2_ceil(height);
-  if (std::min(log2_width, log2_height) > 4) {
-    // Too big, not packed.
-    *offset_x = 0;
-    *offset_y = 0;
-    return false;
-  }
-
-  // Find the block offset of the mip.
-  if (packed_tile < 3) {
-    if (log2_width > log2_height) {
-      // Wider than tall. Laid out vertically.
-      *offset_x = 0;
-      *offset_y = 16 >> packed_tile;
-    } else {
-      // Taller than wide. Laid out horizontally.
-      *offset_x = 16 >> packed_tile;
-      *offset_y = 0;
-    }
-  } else {
-    if (log2_width > log2_height) {
-      // Wider than tall. Laid out vertically.
-      *offset_x = 16 >> (packed_tile - 2);
-      *offset_y = 0;
-    } else {
-      // Taller than wide. Laid out horizontally.
-      *offset_x = 0;
-      *offset_y = 16 >> (packed_tile - 2);
-    }
-  }
-
-  *offset_x /= format_info->block_width;
-  *offset_y /= format_info->block_height;
-  return true;
-}
-
-bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
-                                      uint32_t* offset_y) const {
-  if (!has_packed_mips) {
-    *offset_x = 0;
-    *offset_y = 0;
-    return false;
-  }
-  return GetPackedTileOffset(xe::next_pow2(width + 1),
-                             xe::next_pow2(height + 1), format_info(),
-                             packed_tile, offset_x, offset_y);
-}
-
 uint64_t TextureInfo::hash() const {
   return XXH3_64bits(this, sizeof(TextureInfo));
 }
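A worked pass through the packed-tile math removed above, for a square power-of-two texture with 1x1 blocks: the 16x16 mip is packed_tile 0, so the `packed_tile < 3` branch gives offset_x = 16 >> 0 = 16; the 8x8 mip (tile 1) lands at x = 8 and the 4x4 mip (tile 2) at x = 4. The 2x2 mip is tile 3 and takes the other branch, giving offset_y = 16 >> (3 - 2) = 8, and the 1x1 mip (tile 4) sits at offset_y = 16 >> 2 = 4. Those positions match the column/row placements in the ASCII diagram; the final division by block_width/block_height converts the pixel offsets into block coordinates for compressed formats.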
@@ -181,7 +181,7 @@ inline xenos::TextureFormat DepthRenderTargetToTextureFormat(
   }
 }

-enum class FormatType {
+enum class FormatType : uint32_t {
   // Uncompressed, and is also a ColorFormat.
   kResolvable,
   // Uncompressed, but resolve or memory export cannot be done to the format.
@@ -190,12 +190,12 @@ enum class FormatType {
 };

 struct FormatInfo {
-  xenos::TextureFormat format;
-  const char* name;
-  FormatType type;
-  uint32_t block_width;
-  uint32_t block_height;
-  uint32_t bits_per_pixel;
+  const xenos::TextureFormat format;
+
+  const FormatType type;
+  const uint32_t block_width;
+  const uint32_t block_height;
+  const uint32_t bits_per_pixel;

   uint32_t bytes_per_block() const {
     return block_width * block_height * bits_per_pixel / 8;
@@ -203,6 +203,20 @@ struct FormatInfo {

   static const FormatInfo* Get(uint32_t gpu_format);

+  static const char* GetName(uint32_t gpu_format);
+  static const char* GetName(xenos::TextureFormat format) {
+    return GetName(static_cast<uint32_t>(format));
+  }
+
+  static unsigned char GetWidthShift(uint32_t gpu_format);
+  static unsigned char GetHeightShift(uint32_t gpu_format);
+
+  static unsigned char GetWidthShift(xenos::TextureFormat gpu_format) {
+    return GetWidthShift(static_cast<uint32_t>(gpu_format));
+  }
+  static unsigned char GetHeightShift(xenos::TextureFormat gpu_format) {
+    return GetHeightShift(static_cast<uint32_t>(gpu_format));
+  }
   static const FormatInfo* Get(xenos::TextureFormat format) {
     return Get(static_cast<uint32_t>(format));
   }
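The point of the static GetName/GetWidthShift/GetHeightShift overloads is that callers who only need one property no longer have to load the whole FormatInfo record. Sketch of the intended call-site change (the "before" form no longer compiles after this commit, since the name field was removed from the struct):

// Before: dereferences the FormatInfo entry just to read one field.
const char* n0 = FormatInfo::Get(format)->name;
// After: a direct index into a dedicated 64-entry name table.
const char* n1 = FormatInfo::GetName(format);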
@@ -259,7 +273,9 @@ struct TextureInfo {
   const FormatInfo* format_info() const {
     return FormatInfo::Get(static_cast<uint32_t>(format));
   }
+  const char* format_name() const {
+    return FormatInfo::GetName(static_cast<uint32_t>(format));
+  }
   bool is_compressed() const {
     return format_info()->type == FormatType::kCompressed;
   }
@@ -281,18 +297,6 @@ struct TextureInfo {

   void GetMipSize(uint32_t mip, uint32_t* width, uint32_t* height) const;

-  // Get the memory location of a mip. offset_x and offset_y are in blocks.
-  uint32_t GetMipLocation(uint32_t mip, uint32_t* offset_x, uint32_t* offset_y,
-                          bool is_guest) const;
-
-  static bool GetPackedTileOffset(uint32_t width, uint32_t height,
-                                  const FormatInfo* format_info,
-                                  int packed_tile, uint32_t* offset_x,
-                                  uint32_t* offset_y);
-
-  bool GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
-                           uint32_t* offset_y) const;
-
   uint64_t hash() const;
   bool operator==(const TextureInfo& other) const {
     return std::memcmp(this, &other, sizeof(TextureInfo)) == 0;
@@ -17,77 +17,60 @@ namespace gpu {
 using namespace xe::gpu::xenos;

 #define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
-  {xenos::TextureFormat::texture_format, #texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
+  {xenos::TextureFormat::texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
 const FormatInfo* FormatInfo::Get(uint32_t gpu_format) {
   static const FormatInfo format_infos[64] = {
-      FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
-      FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
-      FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
-      FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
-      FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
-      FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
-      FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
-      FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
-      FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
+#include "texture_info_formats.inl"
   };
   return &format_infos[gpu_format];
 }
 #undef FORMAT_INFO

+constexpr unsigned char GetShift(unsigned pow) {
+  unsigned char sh = 0;
+  while (!(pow & 1)) {
+    pow >>= 1;
+    sh++;
+  }
+  return sh;
+}
+/*
+  todo: GetWidthShift and GetHeightShift should not need a full 64-byte table
+  each. There are only 15 elements for GetWidthShift where the shift is not 0,
+  the max shift that will be returned is 2, and there are 64 elements total.
+  This means we can use a boolean table that also acts as a sparse indexer
+  (popcnt the preceding bits to get the index) and then shift and mask a
+  32-bit word to get the shift.
+*/
+unsigned char FormatInfo::GetWidthShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
+  GetShift(block_width)
+  alignas(XE_HOST_CACHE_LINE_SIZE) constexpr unsigned char wshift_table[64] = {
+#include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+  return wshift_table[gpu_format];
+}
+unsigned char FormatInfo::GetHeightShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
+  GetShift(block_height)
+  alignas(XE_HOST_CACHE_LINE_SIZE) constexpr unsigned char hshift_table[64] = {
+#include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+  return hshift_table[gpu_format];
+}
+#define FORMAT_INFO(texture_format, ...) #texture_format
+static constexpr const char* const format_name_table[64] = {
+#include "texture_info_formats.inl"
+};
+#undef FORMAT_INFO
+const char* FormatInfo::GetName(uint32_t gpu_format) {
+  return format_name_table[gpu_format];
+}
 }  // namespace gpu
 }  // namespace xe
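The todo above sketches replacing each 64-byte shift table with a 64-bit occupancy mask plus a small packed array: the popcount of the mask bits below the queried index gives the slot in the dense array. A minimal sketch of that idea; the mask and packed-shift values are placeholders that would really be generated from the same .inl data, and the names are hypothetical:

#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

// Hypothetical precomputed data: bit i of kWidthShiftMask is set iff format i
// has a nonzero width shift; kPackedWidthShifts lists those 15 nonzero shifts
// in ascending format order (each is at most 2, so 2 bits apiece would even
// fit in one 32-bit word).
static constexpr uint64_t kWidthShiftMask = 0ull;           // placeholder
static constexpr unsigned char kPackedWidthShifts[15] = {}; // placeholder

static inline unsigned Popcount64(uint64_t v) {
#if defined(_MSC_VER)
  return static_cast<unsigned>(__popcnt64(v));
#else
  return static_cast<unsigned>(__builtin_popcountll(v));
#endif
}

unsigned char GetWidthShiftSparse(uint32_t gpu_format) {
  uint64_t bit = 1ull << gpu_format;
  if (!(kWidthShiftMask & bit)) {
    return 0;  // common case: the shift is zero
  }
  // Rank of this format among the set bits below it = index into the
  // packed array.
  unsigned idx = Popcount64(kWidthShiftMask & (bit - 1));
  return kPackedWidthShifts[idx];
}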
@@ -0,0 +1,64 @@
FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
@@ -199,9 +199,8 @@ bool GetPackedMipOffset(uint32_t width, uint32_t height, uint32_t depth,
     }
   }

-  const FormatInfo* format_info = FormatInfo::Get(format);
-  x_blocks /= format_info->block_width;
-  y_blocks /= format_info->block_height;
+  x_blocks >>= FormatInfo::GetWidthShift(format);
+  y_blocks >>= FormatInfo::GetHeightShift(format);
   return true;
 }
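Replacing the block-size divisions with shifts throughout this file is sound because block_width and block_height in the format table are always powers of two (1, 2, or 4). For a DXT format with block_width 4, GetShift(4) precomputes 2, so `x_blocks / 4` becomes the cheaper `x_blocks >> 2`; for the common 1x1-block formats the shift is simply 0.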
@@ -273,9 +272,10 @@ TextureGuestLayout GetGuestTextureLayout(
   }
   layout.mips_total_extent_bytes = 0;

-  const FormatInfo* format_info = FormatInfo::Get(format);
-  uint32_t bytes_per_block = format_info->bytes_per_block();
-
+  const FormatInfo* const format_info = FormatInfo::Get(format);
+  const uint32_t bytes_per_block = format_info->bytes_per_block();
+  const unsigned char block_width_sh = FormatInfo::GetWidthShift(format);
+  const unsigned char block_height_sh = FormatInfo::GetHeightShift(format);
   // The loop counter can mean two things depending on whether the packed mip
   // tail is stored as mip 0, because in this case, it would be ambiguous since
   // both the base and the mips would be on "level 0", but stored separately and
@@ -320,10 +320,13 @@ TextureGuestLayout GetGuestTextureLayout(
       z_slice_stride_texel_rows_unaligned =
           std::max(xe::next_pow2(height_texels) >> level, uint32_t(1));
     }
-    uint32_t row_pitch_blocks_tile_aligned = xe::align(
-        xe::align(row_pitch_texels_unaligned, format_info->block_width) /
-            format_info->block_width,
-        xenos::kTextureTileWidthHeight);
+    // maybe do 1 << block_width_sh instead of format_info->block_width, since
+    // we'll have cl loaded with the shift anyway
+    uint32_t row_pitch_blocks_tile_aligned =
+        xe::align(xe::align<uint32_t>(row_pitch_texels_unaligned,
+                                      format_info->block_width) >>
+                      block_width_sh,
+                  xenos::kTextureTileWidthHeight);
     level_layout.row_pitch_bytes =
         row_pitch_blocks_tile_aligned * bytes_per_block;
     // Assuming the provided pitch is already 256-byte-aligned for linear, but
@@ -335,10 +338,11 @@ TextureGuestLayout GetGuestTextureLayout(
     }
     level_layout.z_slice_stride_block_rows =
         dimension != xenos::DataDimension::k1D
-            ? xe::align(xe::align(z_slice_stride_texel_rows_unaligned,
-                                  format_info->block_height) /
-                            format_info->block_height,
-                        xenos::kTextureTileWidthHeight)
+            ? xe::align<uint32_t>(
+                  xe::align<uint32_t>(z_slice_stride_texel_rows_unaligned,
+                                      format_info->block_height) >>
+                      block_height_sh,
+                  xenos::kTextureTileWidthHeight)
             : 1;
     level_layout.array_slice_stride_bytes =
         level_layout.row_pitch_bytes * level_layout.z_slice_stride_block_rows;
@@ -358,13 +362,13 @@ TextureGuestLayout GetGuestTextureLayout(
     // the stride. For tiled textures, this is the dimensions aligned to 32x32x4
     // blocks (or x1 for the missing dimensions).
     uint32_t level_width_blocks =
-        xe::align(std::max(width_texels >> level, uint32_t(1)),
-                  format_info->block_width) /
-        format_info->block_width;
+        xe::align<uint32_t>(std::max(width_texels >> level, uint32_t(1)),
+                            format_info->block_width) >>
+        block_width_sh;
     uint32_t level_height_blocks =
-        xe::align(std::max(height_texels >> level, uint32_t(1)),
-                  format_info->block_height) /
-        format_info->block_height;
+        xe::align<uint32_t>(std::max(height_texels >> level, uint32_t(1)),
+                            format_info->block_height) >>
+        block_height_sh;
     uint32_t level_depth = std::max(depth >> level, uint32_t(1));
     if (is_tiled) {
       level_layout.x_extent_blocks =
@@ -415,20 +419,20 @@ TextureGuestLayout GetGuestTextureLayout(
         GetPackedMipOffset(width_texels, height_texels, depth, format,
                            packed_sublevel, packed_sublevel_x_blocks,
                            packed_sublevel_y_blocks, packed_sublevel_z);
-        level_layout.x_extent_blocks = std::max(
+        level_layout.x_extent_blocks = std::max<uint32_t>(
             level_layout.x_extent_blocks,
             packed_sublevel_x_blocks +
-                xe::align(
-                    std::max(width_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_width) /
-                    format_info->block_width);
-        level_layout.y_extent_blocks = std::max(
+                (xe::align<uint32_t>(
+                     std::max(width_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_width) >>
+                 block_width_sh));
+        level_layout.y_extent_blocks = std::max<uint32_t>(
             level_layout.y_extent_blocks,
             packed_sublevel_y_blocks +
-                xe::align(
-                    std::max(height_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_height) /
-                    format_info->block_height);
+                (xe::align<uint32_t>(
+                     std::max(height_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_height) >>
+                 block_height_sh));
         level_layout.z_extent =
             std::max(level_layout.z_extent,
                      packed_sublevel_z +
@@ -743,7 +743,7 @@ void TraceViewer::DrawTextureInfo(
   ImGui::NextColumn();
   ImGui::Text("Fetch Slot: %u", texture_binding.fetch_constant);
   ImGui::Text("Guest Address: %.8X", texture_info.memory.base_address);
-  ImGui::Text("Format: %s", texture_info.format_info()->name);
+  ImGui::Text("Format: %s", texture_info.format_name());
   switch (texture_info.dimension) {
     case xenos::DataDimension::k1D:
       ImGui::Text("1D: %dpx", texture_info.width + 1);
@@ -32,10 +32,11 @@
 #include "xenia/gpu/vulkan/vulkan_shader.h"
 #include "xenia/gpu/vulkan/vulkan_shared_memory.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/user_module.h"
 #include "xenia/ui/vulkan/vulkan_presenter.h"
 #include "xenia/ui/vulkan/vulkan_provider.h"
 #include "xenia/ui/vulkan/vulkan_util.h"

 namespace xe {
 namespace gpu {
 namespace vulkan {
|
@ -4171,6 +4172,8 @@ uint32_t VulkanCommandProcessor::WriteTransientTextureBindings(
|
|||
return descriptor_set_write_count;
|
||||
}
|
||||
|
||||
#define COMMAND_PROCESSOR VulkanCommandProcessor
|
||||
#include "../pm4_command_processor_implement.h"
|
||||
} // namespace vulkan
|
||||
} // namespace gpu
|
||||
} // namespace xe
|
||||
|
|
|
@@ -53,6 +53,7 @@ class VulkanCommandProcessor final : public CommandProcessor {
     kStorageBufferCompute,
     kCount,
   };
+#include "../pm4_command_processor_declare.h"

   class ScratchBufferAcquisition {
    public:
@@ -2020,7 +2020,7 @@ bool VulkanTextureCache::Initialize() {
     // Log which formats are not supported or supported via fallbacks.
     const HostFormatPair& best_host_format = kBestHostFormats[i];
     const char* guest_format_name =
-        FormatInfo::Get(xenos::TextureFormat(i))->name;
-    if (best_host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
+        FormatInfo::GetName(xenos::TextureFormat(i));
+    assert_not_null(guest_format_name);
+    if (host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
@@ -1045,8 +1045,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
       return value;
   }
 }

-inline uint32_t GpuSwap(uint32_t value, Endian endianness) {
+XE_NOINLINE
+XE_NOALIAS
+static uint32_t GpuSwap(uint32_t value, Endian endianness) {
   switch (endianness) {
     default:
     case Endian::kNone:
@@ -511,7 +511,8 @@ template <size_t I = 0, typename... Ps>
 StringBuffer* thread_local_string_buffer();

 template <typename Tuple>
-void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
+XE_NOALIAS void PrintKernelCall(cpu::Export* export_entry,
+                                const Tuple& params) {
   auto& string_buffer = *thread_local_string_buffer();
   string_buffer.Reset();
   string_buffer.Append(export_entry->name);
@@ -526,58 +527,89 @@ void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
                       string_buffer.to_string_view());
   }
 }
+/*
+  todo: need faster string formatting/concatenation (all arguments are
+  always turned into strings except if kHighFrequency)
+*/
 template <typename F, typename Tuple, std::size_t... I>
-auto KernelTrampoline(F&& f, Tuple&& t, std::index_sequence<I...>) {
+XE_FORCEINLINE static auto KernelTrampoline(F&& f, Tuple&& t,
+                                            std::index_sequence<I...>) {
   return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
 }

 template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
-xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
-                                xe::cpu::ExportTag::type tags) {
-  static_assert(
-      std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
-      "R must be void or derive from shim::Result");
-  static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
-                "Ps must derive from shim::Param");
-  static const auto export_entry = new cpu::Export(
-      ORDINAL, xe::cpu::Export::Type::kFunction, name,
-      tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog);
-  static R (*FN)(Ps & ...) = fn;
-  struct X {
-    static void Trampoline(PPCContext* ppc_context) {
-      ++export_entry->function_data.call_count;
-      Param::Init init = {
-          ppc_context,
-          0,
-      };
-      // Using braces initializer instead of make_tuple because braces
-      // enforce execution order across compilers.
-      // The make_tuple order is undefined per the C++ standard and
-      // cause inconsitencies between msvc and clang.
-      std::tuple<Ps...> params = {Ps(init)...};
-      if (export_entry->tags & xe::cpu::ExportTag::kLog &&
-          (!(export_entry->tags & xe::cpu::ExportTag::kHighFrequency) ||
-           cvars::log_high_frequency_kernel_calls)) {
-        PrintKernelCall(export_entry, params);
-      }
-      if constexpr (std::is_void<R>::value) {
-        KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                         std::make_index_sequence<sizeof...(Ps)>());
-      } else {
-        auto result =
-            KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                             std::make_index_sequence<sizeof...(Ps)>());
-        result.Store(ppc_context);
-        if (export_entry->tags &
-            (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
-          // TODO(benvanik): log result.
-        }
-      }
-    }
-  };
-  export_entry->function_data.trampoline = &X::Trampoline;
-  return export_entry;
-}
+struct ExportRegistrerHelper {
+  template <R (*fn)(Ps&...), xe::cpu::ExportTag::type tags>
+  static xe::cpu::Export* RegisterExport(const char* name) {
+    static_assert(
+        std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
+        "R must be void or derive from shim::Result");
+    static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
+                  "Ps must derive from shim::Param");
+    constexpr auto TAGS =
+        tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog;
+
+    static const auto export_entry =
+        new cpu::Export(ORDINAL, xe::cpu::Export::Type::kFunction, name, TAGS);
+    struct X {
+      static void Trampoline(PPCContext* ppc_context) {
+        ++export_entry->function_data.call_count;
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        // Using braces initializer instead of make_tuple because braces
+        // enforce execution order across compilers.
+        // The make_tuple order is undefined per the C++ standard and
+        // causes inconsistencies between msvc and clang.
+        std::tuple<Ps...> params = {Ps(init)...};
+        if (TAGS & xe::cpu::ExportTag::kLog &&
+            (!(TAGS & xe::cpu::ExportTag::kHighFrequency) ||
+             cvars::log_high_frequency_kernel_calls)) {
+          PrintKernelCall(export_entry, params);
+        }
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+          if (TAGS &
+              (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
+            // TODO(benvanik): log result.
+          }
+        }
+      }
+    };
+    struct Y {
+      static void Trampoline(PPCContext* ppc_context) {
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        std::tuple<Ps...> params = {Ps(init)...};
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+        }
+      }
+    };
+    export_entry->function_data.trampoline = &X::Trampoline;
+    return export_entry;
+  }
+};
+template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
+auto GetRegister(R (*fngetter)(Ps&...)) {
+  return static_cast<ExportRegistrerHelper<MODULE, ORDINAL, R, Ps...>*>(
+      nullptr);
+}

 }  // namespace shim
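The payoff of hoisting `tags` from a runtime argument into a non-type template parameter is that TAGS becomes a constant expression inside the trampoline, so the `TAGS & kLog` and `TAGS & kHighFrequency` tests fold away at compile time instead of being reloaded from the Export object on every kernel call. A reduced, self-contained sketch of the same technique (names here are illustrative, not from the commit):

#include <cstdint>
#include <cstdio>

using tag_t = uint32_t;
constexpr tag_t kLogTag = 1u << 0;

// Runtime flag: the test must execute on every call.
void TrampolineRuntime(tag_t tags) {
  if (tags & kLogTag) {
    std::puts("logged");
  }
}

// Template parameter: `if constexpr` discards the branch (and the puts call)
// entirely when the tag is absent, so the instantiation contains no test.
template <tag_t Tags>
void TrampolineStatic() {
  if constexpr (Tags & kLogTag) {
    std::puts("logged");
  }
}
// usage: TrampolineStatic<kLogTag>();  TrampolineStatic<0>();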
@@ -585,13 +617,17 @@ xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
 using xe::cpu::ExportTag;

 #define DECLARE_EXPORT(module_name, name, category, tags)                   \
+  using _register_##module_name##_##name =                                  \
+      std::remove_cv_t<std::remove_reference_t<                             \
+          decltype(*xe::kernel::shim::GetRegister<                          \
+                   xe::kernel::shim::KernelModuleId::module_name,           \
+                   ordinals::name>(&name##_entry))>>;                       \
   const auto EXPORT_##module_name##_##name = RegisterExport_##module_name(  \
-      xe::kernel::shim::RegisterExport<                                     \
-          xe::kernel::shim::KernelModuleId::module_name, ordinals::name>(   \
-          &name##_entry, #name,                                             \
-          tags | (static_cast<xe::cpu::ExportTag::type>(                    \
-                      xe::cpu::ExportCategory::category)                    \
-                  << xe::cpu::ExportTag::CategoryShift)));
+      _register_##module_name##_##name ::RegisterExport<                    \
+          &name##_entry, tags | (static_cast<xe::cpu::ExportTag::type>(     \
+                                     xe::cpu::ExportCategory::category)     \
+                                 << xe::cpu::ExportTag::CategoryShift)>(    \
+          #name));

 #define DECLARE_EMPTY_REGISTER_EXPORTS(module_name, group_name)             \
   void xe::kernel::module_name::Register##group_name##Exports(              \
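For a hypothetical export NtExample in module xboxkrnl, the rewritten macro first recovers the helper type via decltype over GetRegister, then instantiates RegisterExport with the function pointer and tags baked in as template arguments, leaving only the name string as a runtime value. A sketch of the expansion, with NtExample and its ordinal entirely hypothetical:

// DECLARE_EXPORT(xboxkrnl, NtExample, Memory, kImplemented) expands roughly to:
using _register_xboxkrnl_NtExample = std::remove_cv_t<std::remove_reference_t<
    decltype(*xe::kernel::shim::GetRegister<
             xe::kernel::shim::KernelModuleId::xboxkrnl,
             ordinals::NtExample>(&NtExample_entry))>>;
const auto EXPORT_xboxkrnl_NtExample = RegisterExport_xboxkrnl(
    _register_xboxkrnl_NtExample::RegisterExport<
        &NtExample_entry,
        kImplemented | (static_cast<xe::cpu::ExportTag::type>(
                            xe::cpu::ExportCategory::Memory)
                        << xe::cpu::ExportTag::CategoryShift)>("NtExample"));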
@@ -316,8 +316,46 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
+
+XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
+#if 1
+#define HEAP_INDEX(name) \
+  offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_)
+
+  const char* heap_select = (const char*)&this->heaps_;
+
+  unsigned selected_heap_offset = 0;
+  unsigned high_nibble = address >> 28;
+
+  if (high_nibble < 0x4) {
+    selected_heap_offset = HEAP_INDEX(v00000000);
+  } else if (address < 0x7F000000) {
+    selected_heap_offset = HEAP_INDEX(v40000000);
+  } else if (high_nibble < 0x8) {
+    heap_select = nullptr;
+    // return nullptr;
+  } else if (high_nibble < 0x9) {
+    selected_heap_offset = HEAP_INDEX(v80000000);
+    // return &heaps_.v80000000;
+  } else if (high_nibble < 0xA) {
+    // return &heaps_.v90000000;
+    selected_heap_offset = HEAP_INDEX(v90000000);
+  } else if (high_nibble < 0xC) {
+    // return &heaps_.vA0000000;
+    selected_heap_offset = HEAP_INDEX(vA0000000);
+  } else if (high_nibble < 0xE) {
+    // return &heaps_.vC0000000;
+    selected_heap_offset = HEAP_INDEX(vC0000000);
+  } else if (address < 0xFFD00000) {
+    // return &heaps_.vE0000000;
+    selected_heap_offset = HEAP_INDEX(vE0000000);
+  } else {
+    // return nullptr;
+    heap_select = nullptr;
+  }
+  return reinterpret_cast<const BaseHeap*>(selected_heap_offset + heap_select);
+
+#else
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {

@@ -337,6 +375,7 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
   } else {
     return nullptr;
   }
+#endif
 }

 BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
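The rewritten LookupHeap replaces eight early returns with a single computed `base + offset`, a shape compilers can lower to conditional moves rather than a chain of branches; the nullptr cases work because selected_heap_offset stays 0 whenever heap_select is nulled, and adding 0 to a null pointer is well-defined. The same shape in miniature, with an invented three-member aggregate:

#include <cstddef>

// Branch-light member selection: compute a byte offset into the aggregate,
// then add it to a base pointer that may have been nulled.
struct Heaps {
  int a, b, c;
};
const int* Select(const Heaps* heaps, unsigned key) {
  const char* base = reinterpret_cast<const char*>(heaps);
  size_t off = 0;
  if (key < 4) {
    off = offsetof(Heaps, a);
  } else if (key < 8) {
    off = offsetof(Heaps, b);
  } else if (key < 12) {
    off = offsetof(Heaps, c);
  } else {
    base = nullptr;  // off stays 0, so base + off is still null
  }
  return reinterpret_cast<const int*>(base + off);
}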
@@ -465,8 +504,8 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
 }

 bool Memory::AccessViolationCallback(
-    global_unique_lock_type global_lock_locked_once,
-    void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* host_address,
+    bool is_write) {
   // Access via physical_membase_ is special, when need to bypass everything
   // (for instance, for a data provider to actually write the data) so only
   // triggering callbacks on virtual memory regions.
@@ -493,16 +532,15 @@ bool Memory::AccessViolationCallback(
 }

 bool Memory::AccessViolationCallbackThunk(
-    global_unique_lock_type global_lock_locked_once,
-    void* context, void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* context,
+    void* host_address, bool is_write) {
   return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
       std::move(global_lock_locked_once), host_address, is_write);
 }

 bool Memory::TriggerPhysicalMemoryCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap->heap_type() == HeapType::kGuestPhysical) {
     auto physical_heap = static_cast<PhysicalHeap*>(heap);
@@ -1711,9 +1749,8 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
 }

 bool PhysicalHeap::TriggerCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
@@ -473,8 +473,9 @@ class Memory {
   void SystemHeapFree(uint32_t address);

   // Gets the heap for the address space containing the given address.
+  XE_NOALIAS
   const BaseHeap* LookupHeap(uint32_t address) const;

+  XE_NOALIAS
   inline BaseHeap* LookupHeap(uint32_t address) {
     return const_cast<BaseHeap*>(
         const_cast<const Memory*>(this)->LookupHeap(address));
@@ -17,7 +17,7 @@
 #include "xenia/base/math.h"
 #include "xenia/ui/d3d12/d3d12_immediate_drawer.h"
 #include "xenia/ui/d3d12/d3d12_presenter.h"

 #include "xenia/ui/d3d12/d3d12_util.h"
 DEFINE_bool(d3d12_debug, false, "Enable Direct3D 12 and DXGI debug layer.",
             "D3D12");
 DEFINE_bool(d3d12_break_on_error, false,
@@ -35,6 +35,8 @@ DEFINE_int32(
     "system responsibility)",
     "D3D12");

+DEFINE_bool(d3d12_nvapi_use_driver_heap_priorities, false, "nvidia stuff",
+            "D3D12");
 namespace xe {
 namespace ui {
 namespace d3d12 {
@@ -61,6 +63,7 @@ std::unique_ptr<D3D12Provider> D3D12Provider::Create() {
         "supported GPUs.");
     return nullptr;
   }
+
   return provider;
 }
@@ -476,10 +479,69 @@ bool D3D12Provider::Initialize() {
   // Get the graphics analysis interface, will silently fail if PIX is not
   // attached.
   pfn_dxgi_get_debug_interface1_(0, IID_PPV_ARGS(&graphics_analysis_));
+  if (GetAdapterVendorID() == ui::GraphicsProvider::GpuVendorID::kNvidia) {
+    nvapi_ = new lightweight_nvapi::nvapi_state_t();
+    if (!nvapi_->is_available()) {
+      delete nvapi_;
+      nvapi_ = nullptr;
+    } else {
+      using namespace lightweight_nvapi;
+
+      nvapi_createcommittedresource_ =
+          (cb_NvAPI_D3D12_CreateCommittedResource)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_CreateCommittedResource);
+      nvapi_querycpuvisiblevidmem_ =
+          (cb_NvAPI_D3D12_QueryCpuVisibleVidmem)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_QueryCpuVisibleVidmem);
+      nvapi_usedriverheappriorities_ =
+          (cb_NvAPI_D3D12_UseDriverHeapPriorities)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_UseDriverHeapPriorities);
+
+      if (nvapi_usedriverheappriorities_) {
+        if (cvars::d3d12_nvapi_use_driver_heap_priorities) {
+          if (nvapi_usedriverheappriorities_(device_) != 0) {
+            XELOGI("Failed to enable driver heap priorities");
+          }
+        }
+      }
+    }
+  }
   return true;
 }
+uint32_t D3D12Provider::CreateUploadResource(
+    D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+    void** ppvResource, bool try_create_cpuvisible,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue) const {
+  auto device = GetDevice();
+
+  if (try_create_cpuvisible && nvapi_createcommittedresource_) {
+    lightweight_nvapi::NV_RESOURCE_PARAMS nvrp;
+    nvrp.NVResourceFlags =
+        lightweight_nvapi::NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM;
+    nvrp.version = 0;  // nothing checks the version
+
+    if (nvapi_createcommittedresource_(
+            device, &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+            InitialResourceState, pOptimizedClearValue, &nvrp, riidResource,
+            ppvResource, nullptr) != 0) {
+      XELOGI(
+          "Failed to create CPUVISIBLE_VIDMEM upload resource, will just do "
+          "normal CreateCommittedResource");
+    } else {
+      return UPLOAD_RESULT_CREATE_CPUVISIBLE;
+    }
+  }
+  if (FAILED(device->CreateCommittedResource(
+          &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+          InitialResourceState, pOptimizedClearValue, riidResource,
+          ppvResource))) {
+    XELOGE("Failed to create the gamma ramp upload buffer");
+    return UPLOAD_RESULT_CREATE_FAILED;
+  }
+
+  return UPLOAD_RESULT_CREATE_SUCCESS;
+}
 std::unique_ptr<Presenter> D3D12Provider::CreatePresenter(
     Presenter::HostGpuLossCallback host_gpu_loss_callback) {
   return D3D12Presenter::Create(host_gpu_loss_callback, *this);
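Call sites can treat CreateUploadResource's return value as a tri-state: 0 means failure, 1 a plain upload-heap resource, 2 a CPU-visible VRAM resource via NvAPI. A hedged usage sketch; the surrounding variables are assumed, not from the commit:

// Hypothetical call site: opt in to CPU-visible VRAM and remember which kind
// of memory was actually obtained.
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
uint32_t result = provider.CreateUploadResource(
    D3D12_HEAP_FLAG_CREATE_NOT_ZEROED, &buffer_desc,
    D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer),
    /*try_create_cpuvisible=*/true);
if (result == UPLOAD_RESULT_CREATE_FAILED) {
  return false;
}
bool in_vidmem = (result == UPLOAD_RESULT_CREATE_CPUVISIBLE);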
@@ -14,13 +14,21 @@

 #include "xenia/ui/d3d12/d3d12_api.h"
 #include "xenia/ui/graphics_provider.h"
+
+// chrispy: this is here to prevent clang format from moving d3d12_nvapi above
+// the headers it depends on
+#define HEADERFENCE
+#undef HEADERFENCE
+#include "xenia/gpu/d3d12/d3d12_nvapi.hpp"
 #define XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES 1

 namespace xe {
 namespace ui {
 namespace d3d12 {

+enum {
+  UPLOAD_RESULT_CREATE_FAILED = 0,
+  UPLOAD_RESULT_CREATE_SUCCESS = 1,
+  UPLOAD_RESULT_CREATE_CPUVISIBLE = 2
+};
 class D3D12Provider : public GraphicsProvider {
  public:
   ~D3D12Provider();
@@ -34,6 +42,11 @@ class D3D12Provider : public GraphicsProvider {
       Presenter::FatalErrorHostGpuLossCallback) override;

   std::unique_ptr<ImmediateDrawer> CreateImmediateDrawer() override;
+  uint32_t CreateUploadResource(
+      D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+      D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+      void** ppvResource, bool try_create_cpuvisible = false,
+      const D3D12_CLEAR_VALUE* pOptimizedClearValue = nullptr) const;

   IDXGIFactory2* GetDXGIFactory() const { return dxgi_factory_; }
   // nullptr if PIX not attached.
@@ -193,6 +206,14 @@ class D3D12Provider : public GraphicsProvider {
   bool ps_specified_stencil_reference_supported_;
   bool rasterizer_ordered_views_supported_;
   bool unaligned_block_textures_supported_;
+
+  lightweight_nvapi::nvapi_state_t* nvapi_;
+  lightweight_nvapi::cb_NvAPI_D3D12_CreateCommittedResource
+      nvapi_createcommittedresource_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_UseDriverHeapPriorities
+      nvapi_usedriverheappriorities_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_QueryCpuVisibleVidmem
+      nvapi_querycpuvisiblevidmem_ = nullptr;
 };

 }  // namespace d3d12
|
@ -81,10 +81,10 @@ D3D12UploadBufferPool::CreatePageImplementation() {
|
|||
util::FillBufferResourceDesc(buffer_desc, page_size_,
|
||||
D3D12_RESOURCE_FLAG_NONE);
|
||||
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
|
||||
if (FAILED(provider_.GetDevice()->CreateCommittedResource(
|
||||
&util::kHeapPropertiesUpload, provider_.GetHeapFlagCreateNotZeroed(),
|
||||
&buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
|
||||
IID_PPV_ARGS(&buffer)))) {
|
||||
|
||||
if (!provider_.CreateUploadResource(
|
||||
provider_.GetHeapFlagCreateNotZeroed(), &buffer_desc,
|
||||
D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer))) {
|
||||
XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
|
||||
return nullptr;
|
||||
}
|
||||
|
|