diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc
index 7024db5a8..74847efed 100644
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
@@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
   // Allocate ffmpeg stuff:
   av_packet_ = av_packet_alloc();
   assert_not_null(av_packet_);
-
+  // chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for
+  // every packet; these allocations were showing up hot in RtlSubsegmentInitialize.
+  av_packet_->buf = av_buffer_alloc(128 * 1024);
   // find the XMA2 audio decoder
   av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
   if (!av_codec_) {
@@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard lock(lock_);
-  if (!is_allocated() || !is_enabled()) {
+
+  if (!is_enabled() || !is_allocated()) {
     return false;
   }
+  {
+    std::lock_guard lock(lock_);
+    set_is_enabled(false);
 
-  set_is_enabled(false);
-
-  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
-  XMA_CONTEXT_DATA data(context_ptr);
-  Decode(&data);
-  data.Store(context_ptr);
-  return true;
+    auto context_ptr = memory()->TranslateVirtual(guest_ptr());
+    XMA_CONTEXT_DATA data(context_ptr);
+    Decode(&data);
+    data.Store(context_ptr);
+    return true;
+  }
 }
 
 void XmaContext::Enable() {
diff --git a/src/xenia/apu/xma_context.h b/src/xenia/apu/xma_context.h
index a348c7836..cc0d0f8d5 100644
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -201,8 +201,8 @@ class XmaContext {
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
   xe_mutex lock_;
-  bool is_allocated_ = false;
-  bool is_enabled_ = false;
+  volatile bool is_allocated_ = false;
+  volatile bool is_enabled_ = false;
   // bool is_dirty_ = true;
 
   // ffmpeg structures
diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc
deleted file mode 100644
index 520cbdbc3..000000000
--- a/src/xenia/base/dma.cc
+++ /dev/null
@@ -1,348 +0,0 @@
-#include "dma.h"
-#include "logging.h"
-#include "mutex.h"
-#include "platform_win.h"
-
-XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
-                NtDelayExecutionPointer);
-XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
-XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
-                NtAlertThreadByThreadId);
-
-template <size_t N, typename... Ts>
-static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
-  char buffer[1024];
-  sprintf_s(buffer, fmt, args...);
-  XELOGI("%s", buffer);
-}
-
-//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
-//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
-#define XEDMALOG(...)
static_cast(0) -using xe::swcache::CacheLine; -static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); -#if defined(__clang__) -XE_FORCEINLINE -static void mvdir64b(void* to, const void* from) { - __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory"); -} -#define _movdir64b mvdir64b -#endif -XE_FORCEINLINE -static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, - CacheLine* XE_RESTRICT from) { - uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; - - CacheLine* dest1 = to; - CacheLine* src1 = from; - - CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; - CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; - - CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); - CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); - - CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); - CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) - for (uint32_t i = 0; i < num_lines_for_8k; ++i) { - xe::swcache::CacheLine line0, line1, line2, line3; - - xe::swcache::ReadLine(&line0, src1 + i); - xe::swcache::ReadLine(&line1, src2 + i); - xe::swcache::ReadLine(&line2, src3 + i); - xe::swcache::ReadLine(&line3, src4 + i); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(dest1 + i, &line0); - xe::swcache::WriteLineNT(dest2 + i, &line1); - - xe::swcache::WriteLineNT(dest3 + i, &line2); - xe::swcache::WriteLineNT(dest4 + i, &line3); - } - XE_MSVC_REORDER_BARRIER(); -} -XE_FORCEINLINE -static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, - CacheLine* XE_RESTRICT from) { - uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; - - CacheLine* dest1 = to; - CacheLine* src1 = from; - - CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; - CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; - - CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); - CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); - - CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); - CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) - for (uint32_t i = 0; i < num_lines_for_8k; ++i) { -#if 0 - xe::swcache::CacheLine line0, line1, line2, line3; - - xe::swcache::ReadLine(&line0, src1 + i); - xe::swcache::ReadLine(&line1, src2 + i); - xe::swcache::ReadLine(&line2, src3 + i); - xe::swcache::ReadLine(&line3, src4 + i); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(dest1 + i, &line0); - xe::swcache::WriteLineNT(dest2 + i, &line1); - - xe::swcache::WriteLineNT(dest3 + i, &line2); - xe::swcache::WriteLineNT(dest4 + i, &line3); -#else - _movdir64b(dest1 + i, src1 + i); - _movdir64b(dest2 + i, src2 + i); - _movdir64b(dest3 + i, src3 + i); - _movdir64b(dest4 + i, src4 + i); -#endif - } - XE_MSVC_REORDER_BARRIER(); -} - -namespace xe::dma { -using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length); -static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; - - while (written_length >= 16384) { - XeCopy16384StreamingAVX(physaddr, rdmapping); - - physaddr += NUM_LINES_FOR_16K; - rdmapping += NUM_LINES_FOR_16K; - - written_length -= 16384; - } - - if (!written_length) { - return; - } - uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; - - uint32_t i = 0; - - for (; i + 1 < num_written_lines; i += 2) { - xe::swcache::CacheLine line0, line1; - - xe::swcache::ReadLine(&line0, rdmapping + i); - - 
xe::swcache::ReadLine(&line1, rdmapping + i + 1); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(physaddr + i, &line0); - xe::swcache::WriteLineNT(physaddr + i + 1, &line1); - } - - if (i < num_written_lines) { - xe::swcache::CacheLine line0; - - xe::swcache::ReadLine(&line0, rdmapping + i); - xe::swcache::WriteLineNT(physaddr + i, &line0); - } -} - -static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; - - while (written_length >= 16384) { - XeCopy16384Movdir64M(physaddr, rdmapping); - - physaddr += NUM_LINES_FOR_16K; - rdmapping += NUM_LINES_FOR_16K; - - written_length -= 16384; - } - - if (!written_length) { - return; - } - uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; - - uint32_t i = 0; - - for (; i + 1 < num_written_lines; i += 2) { - _movdir64b(physaddr + i, rdmapping + i); - _movdir64b(physaddr + i + 1, rdmapping + i + 1); - } - - if (i < num_written_lines) { - _movdir64b(physaddr + i, rdmapping + i); - } -} - -XE_COLD -static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length); - -static VastCpyDispatch vastcpy_dispatch = first_vastcpy; - -XE_COLD -static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - VastCpyDispatch dispatch_to_use = nullptr; - if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { - XELOGI("Selecting MOVDIR64M vastcpy."); - dispatch_to_use = vastcpy_impl_movdir64m; - } else { - XELOGI("Selecting generic AVX vastcpy."); - dispatch_to_use = vastcpy_impl_avx; - } - - vastcpy_dispatch = - dispatch_to_use; // all future calls will go through our selected path - return vastcpy_dispatch(physaddr, rdmapping, written_length); -} - -XE_NOINLINE -void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, - uint32_t written_length) { - return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping, - written_length); -} - -#define MAX_INFLIGHT_DMAJOBS 65536 -#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1) -class XeDMACGeneric : public XeDMAC { - std::unique_ptr thrd_; - XeDMAJob* jobs_ring_; - volatile std::atomic write_ptr_; - - struct alignas(XE_HOST_CACHE_LINE_SIZE) { - volatile std::atomic read_ptr_; - xe_mutex push_into_ring_lock_; - }; - HANDLE gotjob_event; - void WorkerWait(); - - public: - virtual ~XeDMACGeneric() {} - void WorkerThreadMain(); - XeDMACGeneric() { - threading::Thread::CreationParameters crparams; - crparams.create_suspended = true; - crparams.initial_priority = threading::ThreadPriority::kNormal; - crparams.stack_size = 65536; - gotjob_event = CreateEventA(nullptr, false, false, nullptr); - thrd_ = std::move(threading::Thread::Create( - crparams, [this]() { this->WorkerThreadMain(); })); - - jobs_ring_ = (XeDMAJob*)_aligned_malloc( - MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE); - - write_ptr_ = 0; - read_ptr_ = 0; - - thrd_->Resume(); - } - - virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override { - // std::unique_lock pushlock{push_into_ring_lock_}; - HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr); - { - job->dmac_specific_ = (uintptr_t)dmacevent; - - jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job; - write_ptr_++; - SetEvent(gotjob_event); - } - return (DMACJobHandle)dmacevent; - } - virtual void WaitJobDone(DMACJobHandle handle) override { - 
while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) { - // NtAlertThreadByThreadId.invoke(thrd_->system_id()); - // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) == - // WAIT_TIMEOUT) { - // ; - } - //} - - // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false); - CloseHandle((HANDLE)handle); - } - virtual void WaitForIdle() override { - while (write_ptr_ != read_ptr_) { - threading::MaybeYield(); - } - } -}; -void XeDMACGeneric::WorkerWait() { - constexpr unsigned NUM_PAUSE_SPINS = 2048; - constexpr unsigned NUM_YIELD_SPINS = 8; -#if 0 - - for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) { - if (write_ptr_ == read_ptr_) { - _mm_pause(); - } else { - break; - } - } - for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) { - if (write_ptr_ == read_ptr_) { - threading::MaybeYield(); - } else { - break; - } - } - LARGE_INTEGER yield_execution_delay{}; - yield_execution_delay.QuadPart = - -2000; //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds - while (write_ptr_ == read_ptr_) { - NtDelayExecutionPointer.invoke(0, &yield_execution_delay); - } -#else - do { - if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) { - while (write_ptr_ == read_ptr_) { - _mm_pause(); - } - } - - } while (write_ptr_ == read_ptr_); -#endif -} -void XeDMACGeneric::WorkerThreadMain() { - while (true) { - this->WorkerWait(); - - XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS]; - swcache::ReadFence(); - - if (current_job.precall) { - current_job.precall(¤t_job); - } - - size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE; - size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE; - - size_t line_rem = current_job.size - line_rounded; - - vastcpy(current_job.destination, current_job.source, - static_cast(line_rounded)); - - if (line_rem) { - __movsb(current_job.destination + line_rounded, - current_job.source + line_rounded, line_rem); - } - - if (current_job.postcall) { - current_job.postcall(¤t_job); - } - read_ptr_++; - swcache::WriteFence(); - - SetEvent((HANDLE)current_job.dmac_specific_); - } -} - -XeDMAC* CreateDMAC() { return new XeDMACGeneric(); } -} // namespace xe::dma diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h deleted file mode 100644 index 9406a31cd..000000000 --- a/src/xenia/base/dma.h +++ /dev/null @@ -1,47 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_BASE_DMA_H_ -#define XENIA_BASE_DMA_H_ -#include "memory.h" -#include "threading.h" -namespace xe::dma { -struct XeDMAJob; -using DmaPrecall = void (*)(XeDMAJob* job); -using DmaPostcall = void (*)(XeDMAJob* job); -struct XeDMAJob { - //threading::Event* signal_on_done; - uintptr_t dmac_specific_; - uint8_t* destination; - uint8_t* source; - size_t size; - DmaPrecall precall; - DmaPostcall postcall; - void* userdata1; - void* userdata2; -}; -using DMACJobHandle = uint64_t; - -class XeDMAC { - public: - virtual ~XeDMAC() {} - virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0; - virtual void WaitJobDone(DMACJobHandle handle) = 0; - virtual void WaitForIdle() = 0; -}; - -XeDMAC* CreateDMAC(); -// must be divisible by cache line size -XE_NOINLINE -void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, - uint32_t written_length); - -} // namespace xe::dma - -#endif // XENIA_BASE_DMA_H_ diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index ad2af543a..7b9063084 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -44,8 +44,18 @@ template constexpr bool is_pow2(T value) { return (value & (value - 1)) == 0; } +/* + Use this in place of the shift + and not sequence that is being used currently in bit iteration code. This is more efficient + because it does not introduce a dependency on to the previous bit scanning operation. The shift and not sequence does get translated to a single instruction (the bit test and reset instruction), + but this code can be executed alongside the scan +*/ +template +constexpr T clear_lowest_bit(T value) { + static_assert(std::is_integral_v); -// Rounds up the given value to the given alignment. + return (value - static_cast(1)) & value; +} + // Rounds up the given value to the given alignment. 
template constexpr T align(T value, T alignment) { return (value + alignment - 1) & ~(alignment - 1); diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 5c1763282..26f34318e 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -10,6 +10,7 @@ #include "xenia/base/memory.h" #include "xenia/base/cvar.h" #include "xenia/base/platform.h" +#include "xenia/base/logging.h" #if XE_ARCH_ARM64 #include @@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() { cvars::writable_executable_memory; } +using xe::swcache::CacheLine; + +static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); + +#if defined(__clang__) +XE_FORCEINLINE +static void mvdir64b(void* to, const void* from) { + __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory"); +} +#define _movdir64b mvdir64b +#endif +XE_FORCEINLINE +static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { + xe::swcache::CacheLine line0, line1, line2, line3; + + xe::swcache::ReadLine(&line0, src1 + i); + xe::swcache::ReadLine(&line1, src2 + i); + xe::swcache::ReadLine(&line2, src3 + i); + xe::swcache::ReadLine(&line3, src4 + i); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(dest1 + i, &line0); + xe::swcache::WriteLineNT(dest2 + i, &line1); + + xe::swcache::WriteLineNT(dest3 + i, &line2); + xe::swcache::WriteLineNT(dest4 + i, &line3); + } + XE_MSVC_REORDER_BARRIER(); +} +XE_FORCEINLINE +static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { + _movdir64b(dest1 + i, src1 + i); + _movdir64b(dest2 + i, src2 + i); + _movdir64b(dest3 + i, src3 + i); + _movdir64b(dest4 + i, src4 + i); + } + XE_MSVC_REORDER_BARRIER(); +} +using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); +static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384StreamingAVX(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + xe::swcache::CacheLine line0, line1; + + 
xe::swcache::ReadLine(&line0, rdmapping + i); + + xe::swcache::ReadLine(&line1, rdmapping + i + 1); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(physaddr + i, &line0); + xe::swcache::WriteLineNT(physaddr + i + 1, &line1); + } + + if (i < num_written_lines) { + xe::swcache::CacheLine line0; + + xe::swcache::ReadLine(&line0, rdmapping + i); + xe::swcache::WriteLineNT(physaddr + i, &line0); + } +} + +static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384Movdir64M(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + _movdir64b(physaddr + i, rdmapping + i); + _movdir64b(physaddr + i + 1, rdmapping + i + 1); + } + + if (i < num_written_lines) { + _movdir64b(physaddr + i, rdmapping + i); + } +} + +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); + +static VastCpyDispatch vastcpy_dispatch = first_vastcpy; + +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + VastCpyDispatch dispatch_to_use = nullptr; + if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { + XELOGI("Selecting MOVDIR64M vastcpy."); + dispatch_to_use = vastcpy_impl_movdir64m; + } else { + XELOGI("Selecting generic AVX vastcpy."); + dispatch_to_use = vastcpy_impl_avx; + } + + vastcpy_dispatch = + dispatch_to_use; // all future calls will go through our selected path + return vastcpy_dispatch(physaddr, rdmapping, written_length); +} + +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length) { + return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping, + written_length); +} } // namespace memory // TODO(benvanik): fancy AVX versions. diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index b20b33b97..bd7081418 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -17,9 +17,9 @@ #include #include -#include "xenia/base/assert.h" + #include "xenia/base/byte_order.h" -#include "xenia/base/platform.h" + namespace xe { namespace memory { @@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) { seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2); return hash_combine(seed, vs...); } +// must be divisible by cache line size +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length); } // namespace memory diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h index 92c3a9405..b7fc09896 100644 --- a/src/xenia/base/mutex.h +++ b/src/xenia/base/mutex.h @@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock; // }; class global_critical_region { public: + constexpr global_critical_region() {} static global_mutex_type& mutex(); // Acquires a lock on the global critical section. @@ -144,18 +145,18 @@ class global_critical_region { } // Acquires a lock on the global critical section. 
- inline global_unique_lock_type Acquire() { + static inline global_unique_lock_type Acquire() { return global_unique_lock_type(mutex()); } // Acquires a deferred lock on the global critical section. - inline global_unique_lock_type AcquireDeferred() { + static inline global_unique_lock_type AcquireDeferred() { return global_unique_lock_type(mutex(), std::defer_lock); } // Tries to acquire a lock on the glboal critical section. // Check owns_lock() to see if the lock was successfully acquired. - inline global_unique_lock_type TryAcquire() { + static inline global_unique_lock_type TryAcquire() { return global_unique_lock_type(mutex(), std::try_to_lock); } }; diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index 84d5893b5..e481f4f27 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap() { xenia_assert(this->capacity_ >= 4); ring_size_t next_read_offset = read_offset + 4; -#if 0 - size_t zerotest = next_read_offset - this->capacity_; - // unpredictable branch, use bit arith instead - // todo: it would be faster to use lzcnt, but we need to figure out if all - // machines we support support it - next_read_offset &= -static_cast(!!zerotest); -#else + if (XE_UNLIKELY(next_read_offset == this->capacity_)) { next_read_offset = 0; // todo: maybe prefetch next? or should that happen much earlier? } -#endif this->read_offset_ = next_read_offset; - unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset]; + uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset]; return xe::byte_swap(ring_value); } } // namespace xe diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index 604819950..1be26b373 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -271,7 +271,10 @@ inline std::pair WaitAny( return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable, timeout); } - +struct EventInfo { + uint32_t type; + uint32_t state; +}; // Models a Win32-like event object. // https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx class Event : public WaitHandle { @@ -299,6 +302,8 @@ class Event : public WaitHandle { // the nonsignaled state after releasing the appropriate number of waiting // threads. virtual void Pulse() = 0; + + virtual EventInfo Query() = 0; #if XE_PLATFORM_WIN32 ==1 //SetEvent, but if there is a waiter we immediately transfer execution to it virtual void SetBoostPriority() = 0; diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 01a4eb9be..ed7874458 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore, XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution, NtDelayExecutionPointer); - +XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer); namespace xe { namespace threading { @@ -255,20 +255,20 @@ std::pair WaitMultiple(WaitHandle* wait_handles[], size_t wait_handle_count, bool wait_all, bool is_alertable, std::chrono::milliseconds timeout) { - std::vector handles( - wait_handle_count); // max handles is like 64, so it would make more - // sense to just do a fixed size array here + xenia_assert(wait_handle_count <= 64); + HANDLE handles[64]; + for (size_t i = 0; i < wait_handle_count; ++i) { handles[i] = wait_handles[i]->native_handle(); } DWORD result = WaitForMultipleObjectsEx( - DWORD(handles.size()), handles.data(), wait_all ? 
TRUE : FALSE, + static_cast(wait_handle_count), handles, wait_all ? TRUE : FALSE, DWORD(timeout.count()), is_alertable ? TRUE : FALSE); - if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) { + if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) { return std::pair(WaitResult::kSuccess, result - WAIT_OBJECT_0); } else if (result >= WAIT_ABANDONED_0 && - result < WAIT_ABANDONED_0 + handles.size()) { + result < WAIT_ABANDONED_0 + wait_handle_count) { return std::pair(WaitResult::kAbandoned, result - WAIT_ABANDONED_0); } @@ -293,7 +293,15 @@ class Win32Event : public Win32Handle { void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); } void SetBoostPriority() override { // no previous state for boostpriority - NtSetEventBoostPriorityPointer.invoke(handle_); + // Boost priority is unimplemented under wine probably because it's not used + // anywhere in user mode except by us. Maybe some Windows internals uses it + // see: + // https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228 + if (NtSetEventBoostPriorityPointer) { + NtSetEventBoostPriorityPointer.invoke(handle_); + } else { + NtSetEventPointer.invoke(handle_, nullptr); + } } #else void Set() override { SetEvent(handle_); } @@ -305,6 +313,11 @@ class Win32Event : public Win32Handle { SetEvent(handle_); } #endif + + EventInfo Query() { EventInfo result{}; + NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr); + return result; + } }; std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 57b038613..1b6c40f44 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -39,7 +39,7 @@ #include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" -XE_MSVC_OPTIMIZE_SMALL() + DEFINE_bool(use_fast_dot_product, false, "Experimental optimization, much shorter sequence on dot products, " "treating inf as overflow instead of using mcxsr" diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 5e0ea6c1a..d759e97b3 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -9,23 +9,16 @@ #include "xenia/gpu/command_processor.h" -#include #include -#include -#include #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/byte_stream.h" -#include "xenia/base/cvar.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/base/ring_buffer.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/graphics_system.h" #include "xenia/gpu/sampler_info.h" #include "xenia/gpu/texture_info.h" -#include "xenia/gpu/xenos.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" @@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system, write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)), write_ptr_index_(0) { assert_not_null(write_ptr_index_event_); -#if 0 - dmac_ = dma::CreateDMAC(); -#else - dmac_ = nullptr; -#endif } CommandProcessor::~CommandProcessor() = default; @@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); } void CommandProcessor::ReturnFromWait() {} -uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, - uint32_t write_index) { - SCOPE_profile_cpu_f("gpu"); 
-#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 - // If we have a pending trace stream open it now. That way we ensure we get - // all commands. - if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) { - uint32_t title_id = kernel_state_->GetExecutableModule() - ? kernel_state_->GetExecutableModule()->title_id() - : 0; - auto file_name = fmt::format("{:08X}_stream.xtr", title_id); - auto path = trace_stream_path_ / file_name; - trace_writer_.Open(path, title_id); - InitializeTrace(); - } -#endif - // Adjust pointer base. - uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t); - start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF); - uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t); - end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF); - - trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index); - - // Execute commands! - - RingBuffer old_reader = reader_; - new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_), - primary_buffer_size_); - - reader_.set_read_offset(read_index * sizeof(uint32_t)); - reader_.set_write_offset(write_index * sizeof(uint32_t)); - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - GetCurrentRingReadCount()); - do { - if (!ExecutePacket()) { - // This probably should be fatal - but we're going to continue anyways. - XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet."); - assert_always(); - break; - } - } while (reader_.read_count()); - - OnPrimaryBufferEnd(); - - trace_writer_.WritePrimaryBufferEnd(); - - reader_ = old_reader; - return write_index; -} - -void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) { - // Execute commands! 
- RingBuffer old_reader = reader_; - - new (&reader_) - RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)}; - - reader_.set_write_offset(count * sizeof(uint32_t)); - - do { - if (!ExecutePacket()) { - XELOGE("**** ExecutePacket: Failed to execute packet."); - assert_always(); - break; - } - } while (reader_.read_count()); - reader_ = old_reader; -} void CommandProcessor::InitializeTrace() { // Write the initial register values, to be loaded directly into the diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index 4e96391b8..0553a4184 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -19,11 +19,8 @@ #include #include -#include "xenia/base/dma.h" #include "xenia/base/ring_buffer.h" -#include "xenia/base/threading.h" #include "xenia/gpu/register_file.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/xthread.h" @@ -82,7 +79,6 @@ class CommandProcessor { CommandProcessor(GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); virtual ~CommandProcessor(); - dma::XeDMAC* GetDMAC() const { return dmac_; } uint32_t counter() const { return counter_; } void increment_counter() { counter_++; } @@ -135,7 +131,7 @@ class CommandProcessor { void UpdateWritePointer(uint32_t value); - void ExecutePacket(uint32_t ptr, uint32_t count); + bool is_paused() const { return paused_; } void Pause(); @@ -172,7 +168,7 @@ class CommandProcessor { uint32_t num_registers); XE_NOINLINE - virtual void WriteOneRegisterFromRing( + void WriteOneRegisterFromRing( uint32_t base, uint32_t num_times); // repeatedly write a value to one register, presumably a @@ -221,7 +217,7 @@ class CommandProcessor { virtual void PrepareForWait(); virtual void ReturnFromWait(); - uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); + virtual void OnPrimaryBufferEnd() {} #include "pm4_command_processor_declare.h" @@ -300,7 +296,6 @@ class CommandProcessor { reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {}; reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {}; uint32_t gamma_ramp_rw_component_ = 0; - dma::XeDMAC* dmac_ = nullptr; }; } // namespace gpu diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index a24d468ae..584917e45 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // validity is tracked. 
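   // (Assumed rationale for the change below: all valid vertex fetch
   // constants are first gathered into fixed-size arrays (96 slots, scanned
   // one 32-bit bitmap word at a time), and the shared-memory RequestRange
   // calls are then issued as a second pass, so the global critical region
   // can be acquired once per draw rather than once per fetch constant.)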
   const Shader::ConstantRegisterMap& constant_map_vertex =
       vertex_shader->constant_register_map();
-  for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
-       ++i) {
-    uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
-    uint32_t j;
-    while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
-      vfetch_bits_remaining &= ~(uint32_t(1) << j);
-      uint32_t vfetch_index = i * 32 + j;
-      const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
-          XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
-      switch (vfetch_constant.type) {
-        case xenos::FetchConstantType::kVertex:
-          break;
-        case xenos::FetchConstantType::kInvalidVertex:
-          if (cvars::gpu_allow_invalid_fetch_constants) {
+  {
+    uint32_t vfetch_addresses[96];
+    uint32_t vfetch_sizes[96];
+    uint32_t vfetch_current_queued = 0;
+    for (uint32_t i = 0;
+         i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
+      uint32_t vfetch_bits_remaining =
+          constant_map_vertex.vertex_fetch_bitmap[i];
+      uint32_t j;
+      while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
+        vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
+        uint32_t vfetch_index = i * 32 + j;
+        const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
+            XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
+        switch (vfetch_constant.type) {
+          case xenos::FetchConstantType::kVertex:
             break;
-          }
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
-              "This is incorrect behavior, but you can try bypassing this by "
-              "launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-        default:
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
+          case xenos::FetchConstantType::kInvalidVertex:
+            if (cvars::gpu_allow_invalid_fetch_constants) {
+              break;
+            }
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
+                "type! "
+                "This is incorrect behavior, but you can try bypassing this by "
+                "launching Xenia with "
+                "--gpu_allow_invalid_fetch_constants=true.",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+          default:
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) is completely "
+                "invalid!",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+        }
+        vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
+        vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
       }
-      if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
-                                        vfetch_constant.size << 2)) {
-        XELOGE(
-            "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
-            "shared memory",
-            vfetch_constant.address << 2, vfetch_constant.size << 2);
-        return false;
+    }
+
+    if (vfetch_current_queued) {
+      // So far I have never seen vfetch_current_queued > 4; 1 is most common,
+      // 2 happens occasionally, though not many games were tested.
+      // Pre-acquire the critical region so we're not repeatedly re-acquiring
+      // it in RequestRange.
+      auto shared_memory_request_range_hoisted =
+          global_critical_region::Acquire();
+
+      for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
+        if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
+                                          vfetch_sizes[i] << 2)) {
+          XELOGE(
+              "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
+              "shared memory",
+              vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
+          return false;
+        }
+      }
     }
   }
-
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
@@ -3076,24 +3097,6 @@ void D3D12CommandProcessor::InitializeTrace() {
     shared_memory_->InitializeTraceCompleteDownloads();
   }
 }
-static void DmaPrefunc(dma::XeDMAJob* job) {
-  D3D12_RANGE readback_range;
-  readback_range.Begin = 0;
-  readback_range.End = job->size;
-  void* readback_mapping;
-  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
-
-  HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
-  xenia_assert(SUCCEEDED(mapres));
-
-  job->source = (uint8_t*)readback_mapping;
-}
-
-static void DmaPostfunc(dma::XeDMAJob* job) {
-  D3D12_RANGE readback_write_range = {};
-  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
-  readback_buffer->Unmap(0, &readback_write_range);
-}
 
 bool D3D12CommandProcessor::IssueCopy() {
 #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@@ -3102,7 +3105,6 @@
   if (!BeginSubmission(true)) {
     return false;
   }
-
   if (!cvars::d3d12_readback_resolve) {
     uint32_t written_address, written_length;
     return render_target_cache_->Resolve(*memory_, *shared_memory_,
@@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
       readback_buffer, 0, shared_memory_buffer, written_address,
       written_length);
   if (AwaitAllQueueOperationsCompletion()) {
-#if 1
-    D3D12_RANGE readback_range;
-    readback_range.Begin = 0;
-    readback_range.End = written_length;
-    void* readback_mapping;
-    if (SUCCEEDED(
-            readback_buffer->Map(0, &readback_range, &readback_mapping))) {
-      // chrispy: this memcpy needs to be optimized as much as possible
+    D3D12_RANGE readback_range;
+    readback_range.Begin = 0;
+    readback_range.End = written_length;
+    void* readback_mapping;
+    if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                       &readback_mapping))) {
+      // chrispy: this memcpy needs to be optimized as much as possible
 
-      auto physaddr = memory_->TranslatePhysical(written_address);
-      dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
-      // XEDmaCpy(physaddr, readback_mapping, written_length);
-      D3D12_RANGE readback_write_range = {};
-      readback_buffer->Unmap(0, &readback_write_range);
-    }
-
-#else
-    dma::XeDMAJob job{};
-    job.destination = memory_->TranslatePhysical(written_address);
-    job.size = written_length;
-    job.source = nullptr;
-    job.userdata1 = (void*)readback_buffer;
-    job.precall = DmaPrefunc;
-    job.postcall = DmaPostfunc;
-
-    readback_available_ = GetDMAC()->PushDMAJob(&job);
-
-#endif
+      auto physaddr = memory_->TranslatePhysical(written_address);
+      memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
+                      written_length);
+      // XEDmaCpy(physaddr, readback_mapping, written_length);
+      D3D12_RANGE readback_write_range = {};
+      readback_buffer->Unmap(0, &readback_write_range);
+    }
     }
   }
 }
@@ -3833,7 +3822,8 @@ XE_NOINLINE void
D3D12CommandProcessor::UpdateSystemConstantValues_Impl( uint32_t user_clip_plane_index; while (xe::bit_scan_forward(user_clip_planes_remaining, &user_clip_plane_index)) { - user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index); + user_clip_planes_remaining = + xe::clear_lowest_bit(user_clip_planes_remaining); const float* user_clip_plane = ®s[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32; if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane, @@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl( uint32_t textures_remaining = used_texture_mask; uint32_t texture_index; while (xe::bit_scan_forward(textures_remaining, &texture_index)) { - textures_remaining &= ~(uint32_t(1) << texture_index); + textures_remaining = xe::clear_lowest_bit(textures_remaining); uint32_t& texture_signs_uint = system_constants_.texture_swizzled_signs[texture_index >> 2]; uint32_t texture_signs_shift = (texture_index & 3) * 8; @@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } -#if 1 - if (readback_available_) { - GetDMAC()->WaitJobDone(readback_available_); - readback_available_ = 0; - } -#endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index ba2c17a82..9412116ac 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -21,7 +21,6 @@ #include #include "xenia/base/assert.h" -#include "xenia/base/dma.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" @@ -50,8 +49,10 @@ struct MemExportRange { uint32_t size_dwords; }; class D3D12CommandProcessor final : public CommandProcessor { - public: + protected: #include "../pm4_command_processor_declare.h" + + public: explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); ~D3D12CommandProcessor(); @@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor { uint32_t base, uint32_t num_registers); XE_NOINLINE - virtual void WriteOneRegisterFromRing(uint32_t base, - uint32_t num_times) override; + void WriteOneRegisterFromRing(uint32_t base, + uint32_t num_times); XE_FORCEINLINE void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base, @@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor { static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; ID3D12Resource* readback_buffer_ = nullptr; - dma::DMACJobHandle readback_available_ = 0; uint32_t readback_buffer_size_ = 0; std::atomic pix_capture_requested_ = false; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc index 1e6f61685..91bb139e3 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc @@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange( bool D3D12SharedMemory::UploadRanges( const std::pair* upload_page_ranges, - unsigned num_upload_page_ranges) { + uint32_t num_upload_page_ranges) { if (!num_upload_page_ranges) { return true; } CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); command_processor_.SubmitBarriers(); auto& command_list = 
command_processor_.GetDeferredCommandList(); - // for (auto upload_range : upload_page_ranges) { - for (unsigned int i = 0; i < num_upload_page_ranges; ++i) { + for (uint32_t i = 0; i < num_upload_page_ranges; ++i) { auto& upload_range = upload_page_ranges[i]; uint32_t upload_range_start = upload_range.first; uint32_t upload_range_length = upload_range.second; @@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges( uint32_t(upload_buffer_size), false, false); if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) { - dma::vastcpy( + memory::vastcpy( upload_buffer_mapping, memory().TranslatePhysical(upload_range_start << page_size_log2()), static_cast(upload_buffer_size)); diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index 76200ef7f..73cff2e7b 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory { bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations, uint32_t length_allocations) override; - bool UploadRanges(const std::pair* - upload_page_ranges, unsigned num_ranges) override; + bool UploadRanges(const std::pair* upload_page_ranges, + uint32_t num_ranges) override; private: D3D12CommandProcessor& command_processor_; diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 0cfaaaa41..eb0ada4e0 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) { uint32_t textures_remaining = used_texture_mask; uint32_t index; while (xe::bit_scan_forward(textures_remaining, &index)) { - textures_remaining &= ~(uint32_t(1) << index); + textures_remaining = xe::clear_lowest_bit(textures_remaining); const TextureBinding* binding = GetValidTextureBinding(index); if (!binding) { continue; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 5c62c50c3..af977d4d5 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -9,21 +9,14 @@ #include "xenia/gpu/draw_util.h" -#include -#include #include -#include "xenia/base/assert.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/gpu/gpu_flags.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/texture_cache.h" -#include "xenia/gpu/texture_info.h" -#include "xenia/gpu/texture_util.h" -#include "xenia/gpu/xenos.h" #include "xenia/ui/graphics_util.h" // Very prominent in 545407F2. 
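memory::vastcpy, called by UploadRanges above, resolves its implementation once, on first call, through a function pointer. A minimal sketch of that one-time dispatch pattern (names here are hypothetical, and CpuHasMovdir64b() stands in for the amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M check in this diff):

using CopyFn = void (*)(uint8_t* dst, const uint8_t* src, uint32_t len);

static void copy_avx(uint8_t* dst, const uint8_t* src, uint32_t len);
static void copy_movdir64b(uint8_t* dst, const uint8_t* src, uint32_t len);
static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len);

// Starts as the selector; after one call it points at the chosen
// implementation, so steady-state calls pay no feature-check branch.
static CopyFn copy_dispatch = first_copy;

static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len) {
  // Benign if raced: concurrent first calls all store the same value.
  copy_dispatch = CpuHasMovdir64b() ? copy_movdir64b : copy_avx;
  copy_dispatch(dst, src, len);
}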
diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h
index 15c014520..ececbaac9 100644
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@@ -16,7 +16,6 @@
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
-#include "xenia/gpu/registers.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h
index 2da2fb72c..8db802d13 100644
--- a/src/xenia/gpu/pm4_command_processor_declare.h
+++ b/src/xenia/gpu/pm4_command_processor_declare.h
@@ -1,9 +1,14 @@
 void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
-
+virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index,
+                                      uint32_t end_index) XE_RESTRICT;
 virtual bool ExecutePacket();
-XE_NOINLINE
+
+public:
+void ExecutePacket(uint32_t ptr, uint32_t count);
+
+protected:
+XE_NOINLINE
 bool ExecutePacketType0(
     uint32_t packet) XE_RESTRICT;
 XE_NOINLINE
 bool ExecutePacketType1(
     uint32_t packet) XE_RESTRICT;
@@ -103,4 +108,7 @@
 uint32_t GetCurrentRingReadCount();
 XE_NOINLINE
 XE_COLD
-bool ExecutePacketType3_CountOverflow(uint32_t count);
\ No newline at end of file
+bool ExecutePacketType3_CountOverflow(uint32_t count);
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType0_CountOverflow(uint32_t count);
\ No newline at end of file
diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h
index 48795abba..bb6b94051 100644
--- a/src/xenia/gpu/pm4_command_processor_implement.h
+++ b/src/xenia/gpu/pm4_command_processor_implement.h
@@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
   }
 }
 XE_NOINLINE
+XE_COLD
+bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
+  XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
+         COMMAND_PROCESSOR::GetCurrentRingReadCount(),
+         count * sizeof(uint32_t));
+  return false;
+}
+/*
+  Todo: optimize this function; it and ExecutePacketType3 are the most
+  frequently called functions for PM4.
+*/
+XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
   // Type-0 packet.
   // Write count registers in sequence to the registers starting at
   // (base_index << 2).
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
-  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
-    XELOGE(
-        "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
-        COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
-    return false;
-  }
-
-  trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
-  uint32_t base_index = (packet & 0x7FFF);
-  uint32_t write_one_reg = (packet >> 15) & 0x1;
+  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
+      count * sizeof(uint32_t)) {
+    trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
 
-  if (!write_one_reg) {
-    COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
+    uint32_t base_index = (packet & 0x7FFF);
+    uint32_t write_one_reg = (packet >> 15) & 0x1;
+    if (!write_one_reg) {
+      COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
+                                                    count);
+
+    } else {
+      COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    }
+
+    trace_writer_.WritePacketEnd();
+    return true;
   } else {
-    COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
   }
-
-  trace_writer_.WritePacketEnd();
-  return true;
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
@@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
 XE_NOINLINE
 static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
+  /*
+    Todo: subtract the values twice with the sides inverted, build a mask from
+    the sign bits, and use wait_info to select the bits implementing the
+    condition; equal if neither sign bit is set. (Sketch at end of this diff.)
+  */
   bool matched = false;
   switch (wait_info & 0x7) {
     case 0x0:  // Never.
@@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
   return true;
 }
 
+/*
+  Todo: shouldn't this do something?
+*/
+
 bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
     uint32_t packet, uint32_t count) XE_RESTRICT {
   // selective invalidation of state pointers
@@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
   return true;
 }
+
+uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
+                                                 uint32_t write_index) {
+  SCOPE_profile_cpu_f("gpu");
+#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
+  // If we have a pending trace stream open it now. That way we ensure we get
+  // all commands.
+  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
+    uint32_t title_id = kernel_state_->GetExecutableModule()
+                            ? kernel_state_->GetExecutableModule()->title_id()
+                            : 0;
+    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
+    auto path = trace_stream_path_ / file_name;
+    trace_writer_.Open(path, title_id);
+    InitializeTrace();
+  }
+#endif
+  // Adjust pointer base.
+  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
+  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
+  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
+  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
+
+  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
+
+  // Execute commands!
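+  // (Assumed rationale, describing the lines below: reader_ is saved and
+  // re-seated via placement new over the primary buffer's physical mapping;
+  // nested ExecutePacket calls consume from this ring view until it drains,
+  // and the previous reader is restored at the end.)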
+ + RingBuffer old_reader = reader_; + new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_), + primary_buffer_size_); + + reader_.set_read_offset(read_index * sizeof(uint32_t)); + reader_.set_write_offset(write_index * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + GetCurrentRingReadCount()); + do { + if (!COMMAND_PROCESSOR::ExecutePacket()) { + // This probably should be fatal - but we're going to continue anyways. + XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet."); + assert_always(); + break; + } + } while (reader_.read_count()); + + COMMAND_PROCESSOR::OnPrimaryBufferEnd(); + + trace_writer_.WritePrimaryBufferEnd(); + + reader_ = old_reader; + return write_index; +} + +void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) { + // Execute commands! + RingBuffer old_reader = reader_; + + new (&reader_) + RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)}; + + reader_.set_write_offset(count * sizeof(uint32_t)); + + do { + if (!COMMAND_PROCESSOR::ExecutePacket()) { + XELOGE("**** ExecutePacket: Failed to execute packet."); + assert_always(); + break; + } + } while (reader_.read_count()); + reader_ = old_reader; +} diff --git a/src/xenia/gpu/sampler_info.h b/src/xenia/gpu/sampler_info.h index c36cae07d..5a169aed8 100644 --- a/src/xenia/gpu/sampler_info.h +++ b/src/xenia/gpu/sampler_info.h @@ -11,7 +11,6 @@ #define XENIA_GPU_SAMPLER_INFO_H_ #include "xenia/gpu/shader.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index f07c0ee97..e9c21d801 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -24,7 +24,6 @@ #include "xenia/base/string_buffer.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc index 9e1084397..6eda12f42 100644 --- a/src/xenia/gpu/shader_interpreter.cc +++ b/src/xenia/gpu/shader_interpreter.cc @@ -13,13 +13,6 @@ #include #include -#include "xenia/base/assert.h" -#include "xenia/base/byte_order.h" -#include "xenia/base/math.h" -#include "xenia/gpu/registers.h" -#include "xenia/gpu/trace_writer.h" -#include "xenia/gpu/xenos.h" - namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader_interpreter.h b/src/xenia/gpu/shader_interpreter.h index dca530221..759f606eb 100644 --- a/src/xenia/gpu/shader_interpreter.h +++ b/src/xenia/gpu/shader_interpreter.h @@ -14,12 +14,9 @@ #include #include -#include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/trace_writer.h" -#include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" #include "xenia/memory.h" namespace xe { diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 4530f57d4..88c7f95f4 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -9,15 +9,9 @@ #include "xenia/gpu/shader_translator.h" -#include #include -#include -#include -#include -#include "xenia/base/assert.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" #include "xenia/gpu/gpu_flags.h" namespace xe { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 593907952..0e764fe30 100644 --- a/src/xenia/gpu/shader_translator.h +++ 
b/src/xenia/gpu/shader_translator.h @@ -11,16 +11,7 @@ #define XENIA_GPU_SHADER_TRANSLATOR_H_ #include -#include -#include -#include - -#include "xenia/base/math.h" -#include "xenia/base/string_buffer.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/shader.h" -#include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index 38a8c54e9..c15da8a9b 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -10,15 +10,11 @@ #include "xenia/gpu/shared_memory.h" #include -#include #include "xenia/base/assert.h" #include "xenia/base/bit_range.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" -#include "xenia/base/memory.h" #include "xenia/base/profiling.h" -#include "xenia/memory.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 7f8d3a892..e721fe8af 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -10,12 +10,6 @@ #ifndef XENIA_GPU_SHARED_MEMORY_H_ #define XENIA_GPU_SHARED_MEMORY_H_ -#include -#include -#include -#include - -#include "xenia/base/mutex.h" #include "xenia/memory.h" namespace xe { @@ -141,7 +135,7 @@ class SharedMemory { // overall bounds of pages to be uploaded. virtual bool UploadRanges( const std::pair* upload_page_ranges, - unsigned num_upload_ranges) = 0; + uint32_t num_upload_ranges) = 0; const std::vector>& trace_download_ranges() { return trace_download_ranges_; @@ -183,14 +177,10 @@ class SharedMemory { FixedVMemVector)> upload_ranges_; - // GPU-written memory downloading for traces. . - std::vector> trace_download_ranges_; - uint32_t trace_download_page_count_ = 0; - // Mutex between the guest memory subsystem and the command processor, to be // locked when checking or updating validity of pages/ranges and when firing // watches. - xe::global_critical_region global_critical_region_; + static constexpr xe::global_critical_region global_critical_region_{}; // *************************************************************************** // Things below should be fully protected by global_critical_region. @@ -266,6 +256,11 @@ class SharedMemory { uint32_t watch_node_current_pool_allocated_ = 0; WatchRange* watch_range_first_free_ = nullptr; WatchNode* watch_node_first_free_ = nullptr; + + // GPU-written memory downloading for traces. . + std::vector> trace_download_ranges_; + uint32_t trace_download_page_count_ = 0; + // Triggers the watches (global and per-range), removing triggered range // watches. 
diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc
index dfc3264ca..14af42d0d 100644
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -9,21 +9,11 @@
 
 #include "xenia/gpu/texture_cache.h"
 
-#include
-#include
-#include
-
-#include "xenia/base/assert.h"
 #include "xenia/base/clock.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
-#include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
 #include "xenia/gpu/gpu_flags.h"
-#include "xenia/gpu/register_file.h"
-#include "xenia/gpu/texture_info.h"
-#include "xenia/gpu/texture_util.h"
-#include "xenia/gpu/xenos.h"
 
 DEFINE_int32(
     draw_resolution_scale_x, 1,
@@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
   uint32_t bindings_changed = 0;
   uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
   uint32_t index = 0;
+
+  Texture* textures_to_load[64];  // at most 32 fetch constants, each of which
+                                  // may bind an unsigned and a signed texture,
+                                  // so at most 64 entries
+  uint32_t num_textures_to_load = 0;
   while (xe::bit_scan_forward(textures_remaining, &index)) {
     uint32_t index_bit = UINT32_C(1) << index;
-    textures_remaining &= ~index_bit;
+    textures_remaining = xe::clear_lowest_bit(textures_remaining);
     TextureBinding& binding = texture_bindings_[index];
     const auto& fetch = regs.Get(
         XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);
@@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
       binding.texture_signed = nullptr;
     }
     if (load_unsigned_data && binding.texture != nullptr) {
-      LoadTextureData(*binding.texture);
+      textures_to_load[num_textures_to_load++] = binding.texture;
     }
     if (load_signed_data && binding.texture_signed != nullptr) {
-      LoadTextureData(*binding.texture_signed);
+      textures_to_load[num_textures_to_load++] = binding.texture_signed;
     }
   }
+
+  LoadTexturesData(textures_to_load, num_textures_to_load);
+
   if (bindings_changed) {
     UpdateTextureBindingsImpl(bindings_changed);
   }
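The loop above replaces textures_remaining &= ~index_bit with xe::clear_lowest_bit(textures_remaining); both clear the bit that bit_scan_forward just found, but the latter maps to a single BLSR-style operation. A stand-alone sketch of the idiom, with stand-ins for the xe:: helpers (assumed behavior, not the emulator's implementations):

    #include <cstdint>

    // Portable stand-in for xe::bit_scan_forward: index of the lowest set bit.
    static bool bit_scan_forward(uint32_t v, uint32_t* out_index) {
      if (!v) return false;
      uint32_t i = 0;
      while (!(v & (UINT32_C(1) << i))) ++i;
      *out_index = i;
      return true;
    }

    // Stand-in for xe::clear_lowest_bit: v & (v - 1) clears exactly one bit.
    static uint32_t clear_lowest_bit(uint32_t v) { return v & (v - 1); }

    void VisitSetBits(uint32_t mask) {
      uint32_t index;
      while (bit_scan_forward(mask, &index)) {
        // ... handle binding `index` ...
        mask = clear_lowest_bit(mask);  // strictly shrinks mask, so this ends
      }
    }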
@@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
   texture->LogAction("Created");
   return texture;
 }
+void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
+  assert_true(n_textures <= 64);
+  if (n_textures < 2) {
+    if (!n_textures) {
+      return;
+    } else {
+      LoadTextureData(*textures[0]);
+      return;
+    }
+  }
+  uint64_t index_base_outdated = 0;
+  uint64_t index_mips_outdated = 0;
+  uint32_t nkept = 0;
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    for (uint32_t i = 0; i < n_textures; ++i) {
+      Texture* current = textures[i];
+
+      auto base_outdated = current->base_outdated(global_lock);
+      auto mips_outdated = current->mips_outdated(global_lock);
+
+      index_base_outdated |= static_cast(base_outdated) << i;
+      index_mips_outdated |= static_cast(mips_outdated) << i;
+      if (!base_outdated && !mips_outdated) {
+        textures[i] = nullptr;
+      } else {
+        nkept++;
+      }
+    }
+  }
+
+  if (nkept == 0) {
+    return;
+  }
+
+  for (uint32_t i = 0; i < n_textures; ++i) {
+    Texture* p_texture = textures[i];
+    if (!p_texture) {
+      continue;
+    }
+    textures[i] = nullptr;
+    Texture& texture = *p_texture;
+
+    TextureKey texture_key = texture.key();
+    // The implementation may load multiple blocks at once via accesses of up
+    // to 128 bits (R32G32B32A32_UINT), so the size is aligned to that value
+    // to make sure that if the texture is small (especially if it's linear),
+    // the last blocks won't be cut off (hosts may return 0, 0, 0, 0 for the
+    // whole R32G32B32A32_UINT access for the non-16-aligned tail even if
+    // 1...15 bytes are actually provided for it).
+
+    // Request uploading of the texture data to the shared memory.
+    // This is also necessary when resolution scaling is used - the texture
+    // cache relies on shared memory for invalidation of both unscaled and
+    // scaled textures. Plus a texture may be unscaled partially, when only a
+    // portion of its pages is invalidated, in this case we'll need the
+    // texture from the shared memory to load the unscaled parts.
+    // TODO(Triang3l): Load unscaled parts.
+    bool base_resolved = texture.GetBaseResolved();
+    if (index_base_outdated & (1ULL << i)) {
+      if (!shared_memory().RequestRange(
+              texture_key.base_page << 12,
+              xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
+              texture_key.scaled_resolve ? nullptr : &base_resolved)) {
+        continue;
+      }
+    }
+    bool mips_resolved = texture.GetMipsResolved();
+    if (index_mips_outdated & (1ULL << i)) {
+      if (!shared_memory().RequestRange(
+              texture_key.mip_page << 12,
+              xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
+              texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
+        continue;
+      }
+    }
+    if (texture_key.scaled_resolve) {
+      // Make sure all the scaled resolve memory is resident and accessible
+      // from the shader, including any possible padding that hasn't yet been
+      // touched by an actual resolve, but is still included in the texture
+      // size, so the GPU won't be trying to access unmapped memory.
+      if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
+                                              texture.GetGuestBaseSize(), 4)) {
+        continue;
+      }
+      if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
+                                              texture.GetGuestMipsSize(), 4)) {
+        continue;
+      }
+    }
+
+    // Actually load the texture data.
+    if (!LoadTextureDataFromResidentMemoryImpl(
+            texture, (index_base_outdated & (1ULL << i)) != 0,
+            (index_mips_outdated & (1ULL << i)) != 0)) {
+      continue;
+    }
+
+    // Update the source of the texture (resolve vs. CPU or memexport) for
+    // purposes of handling piecewise gamma emulation via sRGB and for
+    // resolution scale in sampling offsets.
+    if (!texture_key.scaled_resolve) {
+      texture.SetBaseResolved(base_resolved);
+      texture.SetMipsResolved(mips_resolved);
+    }
+    // requeue for MakeUpToDateAndWatch below
+    textures[i] = &texture;
+  }
+  {
+    auto crit = global_critical_region_.Acquire();
+
+    for (uint32_t i = 0; i < n_textures; ++i) {
+      auto texture = textures[i];
+      if (!texture) {
+        continue;
+      }
+      // Mark the ranges as uploaded and watch them. This is needed for scaled
+      // resolves as well to detect when the CPU wants to reuse the memory for
+      // a regular texture or a vertex buffer, and thus the scaled resolve
+      // version is not up to date anymore.
+      texture->MakeUpToDateAndWatch(crit);
+
+      texture->LogAction("Loaded");
+    }
+  }
+}
 bool TextureCache::LoadTextureData(Texture& texture) {
   // Check what needs to be uploaded.
   bool base_outdated, mips_outdated;
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 197124e37..ff29fcb38 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -559,6 +559,7 @@ class TextureCache {
     return load_shader_info_[load_shader_index];
   }
   bool LoadTextureData(Texture& texture);
+  void LoadTexturesData(Texture** textures, uint32_t n_textures);
   // Writes the texture data (for base, mips or both - but not neither) from the
   // shared memory or the scaled resolve memory. The shared memory management is
   // done outside this function, the implementation just needs to load the data
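LoadTexturesData restructures N independent lock-check-load-publish sequences into three phases: classify every texture under one acquisition of the global critical region, do the expensive shared-memory work unlocked, then re-acquire once to watch and publish. A generic sketch of that shape (types and method names are illustrative, not the cache's API):

    #include <cstdint>
    #include <mutex>

    struct Item {
      bool dirty = true;
      bool NeedsWork() const { return dirty; }  // call with the lock held
      bool DoSlowWork() { return true; }        // safe without the lock
      void Publish() { dirty = false; }         // call with the lock held
    };

    void LoadBatch(std::mutex& m, Item** items, uint32_t n) {
      uint64_t pending = 0;  // n <= 64, one bit per item
      {
        std::lock_guard<std::mutex> lock(m);  // one acquisition for the batch
        for (uint32_t i = 0; i < n; ++i) {
          if (items[i]->NeedsWork()) pending |= uint64_t(1) << i;
        }
      }
      for (uint32_t i = 0; i < n; ++i) {
        // Work happens unlocked; failures drop out of the publish pass.
        if (!(pending & (uint64_t(1) << i)) || !items[i]->DoSlowWork()) {
          pending &= ~(uint64_t(1) << i);
        }
      }
      std::lock_guard<std::mutex> lock(m);  // second and last acquisition
      for (uint32_t i = 0; i < n; ++i) {
        if (pending & (uint64_t(1) << i)) items[i]->Publish();
      }
    }

The 64-bit masks play the same role as index_base_outdated and index_mips_outdated above: they record, per slot, what was decided while the lock was held.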
diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc
index 67c465b33..7a617af9b 100644
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@@ -8,14 +8,7 @@
  */
 
 #include "xenia/gpu/texture_info.h"
-
-#include
-#include
-#include
-
 #include "xenia/base/logging.h"
-#include "xenia/base/math.h"
-#include "xenia/base/memory.h"
 #include "xenia/base/xxhash.h"
 
 namespace xe {
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
index 5c514f61a..567e2cc48 100644
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include "xenia/base/assert.h"
 #include "xenia/gpu/xenos.h"
 
 namespace xe {
diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc
index cbe6c62bd..74f02b3d6 100644
--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@@ -8,13 +8,6 @@
  */
 
 #include "xenia/gpu/texture_util.h"
-
-#include
-#include
-
-#include "xenia/base/assert.h"
-#include "xenia/base/math.h"
-
 namespace xe {
 namespace gpu {
 namespace texture_util {
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index 3b0875e25..edecafe7f 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -10,11 +10,6 @@
 #ifndef XENIA_GPU_UCODE_H_
 #define XENIA_GPU_UCODE_H_
 
-#include
-
-#include "xenia/base/assert.h"
-#include "xenia/base/math.h"
-#include "xenia/base/platform.h"
 #include "xenia/gpu/xenos.h"
 
 // The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index b1b2eb1cd..f5fd3e1af 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -46,6 +46,9 @@ namespace gpu {
 namespace vulkan {
 
 class VulkanCommandProcessor final : public CommandProcessor {
+ protected:
+#include "../pm4_command_processor_declare.h"
+
  public:
   // Single-descriptor layouts for use within a single frame.
   enum class SingleTransientDescriptorLayout {
@@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
     kStorageBufferCompute,
     kCount,
   };
-#include "../pm4_command_processor_declare.h"
 
   class ScratchBufferAcquisition {
    public:
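Moving #include "../pm4_command_processor_declare.h" under protected: works because inclusion is textual: whatever declarations the header expands to simply take the access specifier active at the point of inclusion. A hypothetical miniature of the mechanism:

    // methods.inc (hypothetical declare header) would contain:
    //   void HandlePm4Packet();
    //   uint32_t pm4_packet_count_;

    class Processor {
     protected:
      // #include "methods.inc" here would expand to exactly the two members
      // below, which therefore become protected rather than public:
      void HandlePm4Packet();
      uint32_t pm4_packet_count_;

     public:
      void Work();
    };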
diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
index eb6403441..33c348988 100644
--- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
+++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
@@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
 
 bool VulkanSharedMemory::UploadRanges(
     const std::pair* upload_page_ranges,
-    unsigned num_upload_ranges) {
+    uint32_t num_upload_ranges) {
   if (!num_upload_ranges) {
     return true;
   }
diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h
index b83877a38..d8a72cb8c 100644
--- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h
+++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h
@@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
   bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                         uint32_t length_allocations) override;
 
-  bool UploadRanges(const std::pair*
-                        upload_page_ranges, unsigned num_ranges) override;
+  bool UploadRanges(const std::pair* upload_page_ranges,
+                    uint32_t num_ranges) override;
 
  private:
   void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,
diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc
index 997e9a48a..05bb30014 100644
--- a/src/xenia/gpu/xenos.cc
+++ b/src/xenia/gpu/xenos.cc
@@ -9,10 +9,6 @@
 
 #include "xenia/gpu/xenos.h"
 
-#include
-
-#include "xenia/base/math.h"
-
 namespace xe {
 namespace gpu {
 namespace xenos {
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index 8e9fd5c11..6c39daf45 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -10,13 +10,10 @@
 #ifndef XENIA_GPU_XENOS_H_
 #define XENIA_GPU_XENOS_H_
 
-#include
-#include "xenia/base/assert.h"
-#include "xenia/base/byte_order.h"
-#include "xenia/base/math.h"
 #include "xenia/base/memory.h"
-#include "xenia/base/platform.h"
+#include "xenia/base/math.h"
+
 namespace xe {
 namespace gpu {
diff --git a/src/xenia/kernel/util/object_table.cc b/src/xenia/kernel/util/object_table.cc
index d1250791c..f560cc837 100644
--- a/src/xenia/kernel/util/object_table.cc
+++ b/src/xenia/kernel/util/object_table.cc
@@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {
 
 // Generic lookup
 template <>
-object_ref ObjectTable::LookupObject(X_HANDLE handle) {
-  auto object = ObjectTable::LookupObject(handle, false);
+object_ref ObjectTable::LookupObject(
+    X_HANDLE handle, bool already_locked) {
+  auto object = ObjectTable::LookupObject(handle, already_locked);
   auto result = object_ref(reinterpret_cast(object));
   return result;
 }
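The object-table changes in this file and the ones below all thread an already_locked flag through the lookup path so that batched callers can take the global mutex once instead of once per handle. A compilable sketch of the convention, with a plain recursive mutex and integer handles standing in for the real types:

    #include <cstddef>
    #include <mutex>

    static std::recursive_mutex g_table_mutex;
    static int g_table[16];

    // The real body; callers must hold g_table_mutex.
    static int* LookupLocked(int handle) {
      return (handle >= 0 && handle < 16) ? &g_table[handle] : nullptr;
    }

    int* Lookup(int handle, bool already_locked = false) {
      if (already_locked) {
        return LookupLocked(handle);  // caller holds the mutex already
      }
      std::lock_guard<std::recursive_mutex> lock(g_table_mutex);
      return LookupLocked(handle);
    }

    // Batched caller: one acquisition instead of n.
    void LookupMany(const int* handles, int** out, size_t n) {
      std::lock_guard<std::recursive_mutex> lock(g_table_mutex);
      for (size_t i = 0; i < n; ++i) {
        out[i] = Lookup(handles[i], /*already_locked=*/true);
      }
    }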
diff --git a/src/xenia/kernel/util/object_table.h b/src/xenia/kernel/util/object_table.h
index 6ac67d1b7..7722c28d1 100644
--- a/src/xenia/kernel/util/object_table.h
+++ b/src/xenia/kernel/util/object_table.h
@@ -46,10 +46,9 @@ class ObjectTable {
   // Restores a XObject reference with a handle. Mainly for internal use - do
   // not use.
   X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);
-
   template
-  object_ref LookupObject(X_HANDLE handle) {
-    auto object = LookupObject(handle, false);
+  object_ref LookupObject(X_HANDLE handle, bool already_locked = false) {
+    auto object = LookupObject(handle, already_locked);
     if (T::kObjectType == XObject::Type::Socket) {
       object = LookupObject((handle | 0xF8000000), false);
     }
@@ -110,7 +109,8 @@ class ObjectTable {
 
 // Generic lookup
 template <>
-object_ref ObjectTable::LookupObject(X_HANDLE handle);
+object_ref ObjectTable::LookupObject(
+    X_HANDLE handle, bool already_locked);
 
 }  // namespace util
 }  // namespace kernel
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
index 3e9c39cbd..0cf538ffa 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
@@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);
 
 dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
                                                       lpdword_t driver_ptr) {
+  if (!callback_ptr) {
+    return X_E_INVALIDARG;
+  }
+
   uint32_t callback = callback_ptr[0];
+
+  if (!callback) {
+    return X_E_INVALIDARG;
+  }
   uint32_t callback_arg = callback_ptr[1];
 
   auto audio_system = kernel_state()->emulator()->audio_system();
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
index b5bb6c57b..95b26dfb3 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
@@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
 }
 DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
                          kHighFrequency);
+dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
+  X_STATUS result = X_STATUS_SUCCESS;
+  auto ev = kernel_state()->object_table()->LookupObject(handle);
+  if (ev) {
+    uint32_t type_tmp, state_tmp;
+
+    ev->Query(&type_tmp, &state_tmp);
+
+    out_struc[0] = type_tmp;
+    out_struc[1] = state_tmp;
+
+  } else {
+    result = X_STATUS_INVALID_HANDLE;
+  }
+
+  return result;
+}
+DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
+                         kHighFrequency);
 
 uint32_t xeNtClearEvent(uint32_t handle) {
   X_STATUS result = X_STATUS_SUCCESS;
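For context, the two dwords NtQueryEvent_entry writes appear to mirror the layout of NT's EVENT_BASIC_INFORMATION (EventType, then EventState, both 32-bit). A hypothetical guest-side view of the output buffer, shown only to document the layout the handler implies:

    #include <cstdint>

    // Hypothetical mirror of the buffer NtQueryEvent fills; field meanings
    // follow NT's EVENT_BASIC_INFORMATION.
    struct X_EVENT_BASIC_INFORMATION {
      uint32_t event_type;   // 0 = notification (manual-reset),
                             // 1 = synchronization (auto-reset)
      uint32_t event_state;  // nonzero while the event is signaled
    };

    bool IsEventSignaled(const uint32_t* out_struc) {
      X_EVENT_BASIC_INFORMATION info;
      info.event_type = out_struc[0];
      info.event_state = out_struc[1];
      return info.event_state != 0;
    }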
@@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
     lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
   assert_true(wait_type <= 1);
 
-  std::vector> objects;
-  for (uint32_t n = 0; n < count; n++) {
-    auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
-    auto object_ref =
-        XObject::GetNativeObject(kernel_state(), object_ptr);
-    if (!object_ref) {
-      return X_STATUS_INVALID_PARAMETER;
+  assert_true(count <= 64);
+  object_ref objects[64];
+  {
+    auto crit = global_critical_region::AcquireDirect();
+    for (uint32_t n = 0; n < count; n++) {
+      auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
+      auto object_ref = XObject::GetNativeObject(kernel_state(),
+                                                 object_ptr, -1, true);
+      if (!object_ref) {
+        return X_STATUS_INVALID_PARAMETER;
+      }
+
+      objects[n] = std::move(object_ref);
     }
-
-    objects.push_back(std::move(object_ref));
   }
 
   uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
-  return XObject::WaitMultiple(uint32_t(objects.size()),
-                               reinterpret_cast(objects.data()),
-                               wait_type, wait_reason, processor_mode,
-                               alertable, timeout_ptr ? &timeout : nullptr);
+  return XObject::WaitMultiple(
+      uint32_t(count), reinterpret_cast(&objects[0]), wait_type,
+      wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
 }
 DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
                          kBlocking, kHighFrequency);
@@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be* handles,
                                       uint64_t* timeout_ptr) {
   assert_true(wait_type <= 1);
-  std::vector> objects;
-  for (uint32_t n = 0; n < count; n++) {
-    uint32_t object_handle = handles[n];
-    auto object =
-        kernel_state()->object_table()->LookupObject(object_handle);
-    if (!object) {
-      return X_STATUS_INVALID_PARAMETER;
+  assert_true(count <= 64);
+  object_ref objects[64];
+
+  /*
+     Reserving to squash the constant reallocations: in a benchmark of one
+     particular game over a period of five minutes, roughly 11% of CPU time
+     was spent inside a helper function to Windows' heap allocation function,
+     and 7% of that was traced back to here.
+
+     edit: actually switched to a fixed-size array, as there can never be
+     more than 64 events specified
+  */
+  {
+    auto crit = global_critical_region::AcquireDirect();
+    for (uint32_t n = 0; n < count; n++) {
+      uint32_t object_handle = handles[n];
+      auto object = kernel_state()->object_table()->LookupObject(
+          object_handle, true);
+      if (!object) {
+        return X_STATUS_INVALID_PARAMETER;
+      }
+      objects[n] = std::move(object);
     }
-    objects.push_back(std::move(object));
   }
 
-  return XObject::WaitMultiple(count,
-                               reinterpret_cast(objects.data()),
+  return XObject::WaitMultiple(count, reinterpret_cast(&objects[0]),
                                wait_type, 6, wait_mode, alertable,
                                timeout_ptr);
 }
@@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
     dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
     dword_t alertable, lpqword_t timeout_ptr) {
   uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
+  if (!count || count > 64 || (wait_type != 0 && wait_type != 1)) {
+    return X_STATUS_INVALID_PARAMETER;
+  }
   return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
                                       alertable,
                                       timeout_ptr ? &timeout : nullptr);
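The benchmark comment above is the motivation for the same change made in several places in this diff: a std::vector built and torn down on every call shows up as heap-allocator time in profiles, while a stack array of at most 64 move-assigned refs never allocates. A stand-in sketch of the shape (the ref type only approximates object_ref's move semantics):

    #include <cstdint>
    #include <utility>

    template <typename T>
    struct object_ref_like {  // stand-in for object_ref<T>
      T* ptr = nullptr;
      object_ref_like() = default;
      object_ref_like(object_ref_like&& o) noexcept : ptr(o.ptr) {
        o.ptr = nullptr;
      }
      object_ref_like& operator=(object_ref_like&& o) noexcept {
        ptr = o.ptr;
        o.ptr = nullptr;
        return *this;
      }
    };

    struct XObjectStub {};

    uint32_t GatherWaitObjects(uint32_t count) {
      if (!count || count > 64) return 0xC000000Du;  // X_STATUS_INVALID_PARAMETER
      object_ref_like<XObjectStub> objects[64];  // stack storage, no heap use
      for (uint32_t n = 0; n < count; ++n) {
        object_ref_like<XObjectStub> found;  // ... handle lookup would go here ...
        objects[n] = std::move(found);       // move-assign into the fixed slot
      }
      return 0;  // X_STATUS_SUCCESS
    }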
@@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
                                                       dword_t r6,
                                                       lpqword_t timeout_ptr) {
   X_STATUS result = X_STATUS_SUCCESS;
+  // pre-lock for these two handle lookups
+  global_critical_region::mutex().lock();
 
-  auto signal_object =
-      kernel_state()->object_table()->LookupObject(signal_handle);
+  auto signal_object = kernel_state()->object_table()->LookupObject(
+      signal_handle, true);
   auto wait_object =
-      kernel_state()->object_table()->LookupObject(wait_handle);
+      kernel_state()->object_table()->LookupObject(wait_handle, true);
+  global_critical_region::mutex().unlock();
   if (signal_object && wait_object) {
     uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
     result =
diff --git a/src/xenia/kernel/xevent.cc b/src/xenia/kernel/xevent.cc
index 03c947c7e..bf1176af8 100644
--- a/src/xenia/kernel/xevent.cc
+++ b/src/xenia/kernel/xevent.cc
@@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
   event_->Reset();
   return 1;
 }
 
+void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
+  auto [type, state] = event_->Query();
+  *out_type = type;
+  *out_state = state;
+}
 void XEvent::Clear() { event_->Reset(); }
 
 bool XEvent::Save(ByteStream* stream) {
diff --git a/src/xenia/kernel/xevent.h b/src/xenia/kernel/xevent.h
index b5f4accf5..4fd174cd0 100644
--- a/src/xenia/kernel/xevent.h
+++ b/src/xenia/kernel/xevent.h
@@ -36,6 +36,7 @@ class XEvent : public XObject {
   int32_t Set(uint32_t priority_increment, bool wait);
   int32_t Pulse(uint32_t priority_increment, bool wait);
   int32_t Reset();
+  void Query(uint32_t* out_type, uint32_t* out_state);
   void Clear();
 
   bool Save(ByteStream* stream) override;
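XEvent::Query assumes the underlying threading event exposes a Query() returning a (type, state) pair; the structured binding unpacks it before the values are copied out through raw pointers for the kernel ABI. A compilable sketch of that assumed shape:

    #include <cstdint>
    #include <utility>

    struct EventStub {
      uint32_t type = 1;   // hypothetical: 1 = synchronization (auto-reset)
      uint32_t state = 0;  // hypothetical: 1 = signaled
      std::pair<uint32_t, uint32_t> Query() const { return {type, state}; }
    };

    void QueryOut(const EventStub& ev, uint32_t* out_type, uint32_t* out_state) {
      auto [type, state] = ev.Query();  // decomposes the pair by position
      *out_type = type;
      *out_state = state;
    }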
diff --git a/src/xenia/kernel/xobject.cc b/src/xenia/kernel/xobject.cc
index bf189c2af..bd7374b74 100644
--- a/src/xenia/kernel/xobject.cc
+++ b/src/xenia/kernel/xobject.cc
@@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                                uint32_t wait_type, uint32_t wait_reason,
                                uint32_t processor_mode, uint32_t alertable,
                                uint64_t* opt_timeout) {
-  std::vector wait_handles(count);
+  xe::threading::WaitHandle* wait_handles[64];
+
   for (size_t i = 0; i < count; ++i) {
     wait_handles[i] = objects[i]->GetWaitHandle();
     assert_not_null(wait_handles[i]);
@@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                    : std::chrono::milliseconds::max();
 
   if (wait_type) {
-    auto result = xe::threading::WaitAny(std::move(wait_handles),
+    auto result = xe::threading::WaitAny(wait_handles, count,
                                          alertable ? true : false, timeout_ms);
     switch (result.first) {
       case xe::threading::WaitResult::kSuccess:
@@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
       return X_STATUS_UNSUCCESSFUL;
     }
   } else {
-    auto result = xe::threading::WaitAll(std::move(wait_handles),
+    auto result = xe::threading::WaitAll(wait_handles, count,
                                          alertable ? true : false, timeout_ms);
     switch (result) {
       case xe::threading::WaitResult::kSuccess:
@@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
 }
 
 object_ref XObject::GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr,
-                                    int32_t as_type) {
+                                    void* native_ptr, int32_t as_type,
+                                    bool already_locked) {
   assert_not_null(native_ptr);
 
   // Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@@ -373,10 +374,12 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
   // We identify this by setting wait_list_flink to a magic value. When set,
   // wait_list_blink will hold a handle to our object.
 
-  auto global_lock = xe::global_critical_region::AcquireDirect();
+  if (!already_locked) {
+    global_critical_region::mutex().lock();
+  }
 
   auto header = reinterpret_cast(native_ptr);
-
+  XObject* result;
   if (as_type == -1) {
     as_type = header->type;
   }
@@ -385,10 +388,12 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
     // Already initialized.
     // TODO: assert if the type of the object != as_type
     uint32_t handle = header->wait_list_blink;
-    auto object = kernel_state->object_table()->LookupObject(handle);
-
+    result = kernel_state->object_table()
+                 ->LookupObject(handle, true)
+                 .release();
+    goto return_result;
     // TODO(benvanik): assert nothing has been changed in the struct.
-    return object;
+    // return object;
   } else {
     // First use, create new.
     // https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html
@@ -430,14 +435,22 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
       case 24:  // ThreadedDpcObject
       default:
        assert_always();
-        return NULL;
+        result = nullptr;
+        goto return_result;
+
+        // return NULL;
     }
 
     // Stash pointer in struct.
     // FIXME: This assumes the object contains a dispatch header (some don't!)
     StashHandle(header, object->handle());
+    result = object;
 
-    return object_ref(object);
+  return_result:
+    if (!already_locked) {
+      global_critical_region::mutex().unlock();
+    }
+    return object_ref(result);
   }
 }
diff --git a/src/xenia/kernel/xobject.h b/src/xenia/kernel/xobject.h
index 67518b35c..df067b55d 100644
--- a/src/xenia/kernel/xobject.h
+++ b/src/xenia/kernel/xobject.h
@@ -192,10 +192,12 @@ class XObject {
 
   static object_ref GetNativeObject(KernelState* kernel_state,
                                     void* native_ptr,
-                                    int32_t as_type = -1);
+                                    int32_t as_type = -1,
+                                    bool already_locked = false);
 
   template
   static object_ref GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr, int32_t as_type = -1);
+                                    void* native_ptr, int32_t as_type = -1,
+                                    bool already_locked = false);
 
  protected:
   bool SaveObject(ByteStream* stream);
@@ -362,9 +364,11 @@ object_ref retain_object(T* ptr) {
 
 template
 object_ref XObject::GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr, int32_t as_type) {
+                                    void* native_ptr, int32_t as_type,
+                                    bool already_locked) {
   return object_ref(reinterpret_cast(
-      GetNativeObject(kernel_state, native_ptr, as_type).release()));
+      GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
+          .release()));
 }
 
 }  // namespace kernel
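GetNativeObject now pairs a conditional lock() with a single return_result: label so that every path unlocks exactly once. An equivalent structure without the goto is a guard that only engages when the caller does not already hold the mutex, sketched below with a plain recursive mutex standing in for the global critical region:

    #include <mutex>

    static std::recursive_mutex g_global_mutex;

    class ConditionalLock {
     public:
      explicit ConditionalLock(bool already_locked) : engaged_(!already_locked) {
        if (engaged_) g_global_mutex.lock();
      }
      ~ConditionalLock() {
        if (engaged_) g_global_mutex.unlock();  // runs on every return path
      }
      ConditionalLock(const ConditionalLock&) = delete;
      ConditionalLock& operator=(const ConditionalLock&) = delete;

     private:
      bool engaged_;
    };

    int* GetNativeObjectSketch(bool already_locked) {
      ConditionalLock lock(already_locked);
      // Early returns here no longer need a goto to reach the unlock.
      return nullptr;
    }

std::unique_lock with std::defer_lock can express the same thing; the explicit guard just makes the already_locked contract visible at the call site.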