Merge pull request #81 from chrisps/canary_experimental
global lock changes, minor kernel changes, premake fix
commit 08d38bdff6
@@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
   // Allocate ffmpeg stuff:
   av_packet_ = av_packet_alloc();
   assert_not_null(av_packet_);
 
+  // chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for every packet,
+  // these allocations were causing RtlSubsegmentInitialize
+  av_packet_->buf = av_buffer_alloc(128 * 1024);
   // find the XMA2 audio decoder
   av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
   if (!av_codec_) {
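For context, a minimal hedged sketch of what the preallocation above buys (FFmpeg's real av_packet_alloc/av_buffer_alloc APIs; the loop, av_context, and helper names are illustrative, not the emulator's code): the packet keeps one reference-counted buffer alive across decode calls instead of allocating per packet.

// Hedged sketch: reuse one preallocated AVBufferRef across submissions so
// the decoder never triggers a per-packet heap allocation.
AVPacket* pkt = av_packet_alloc();
pkt->buf = av_buffer_alloc(128 * 1024);  // single up-front allocation
pkt->data = pkt->buf->data;
while (have_xma_packets()) {               // have_xma_packets(): illustrative
  pkt->size = fill_next_packet(pkt->data); // fill_next_packet(): illustrative
  avcodec_send_packet(av_context, pkt);    // av_context: illustrative AVCodecContext*
}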
@@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard<xe_mutex> lock(lock_);
-  if (!is_allocated() || !is_enabled()) {
+  if (!is_enabled() || !is_allocated()) {
     return false;
   }
-  set_is_enabled(false);
+  {
+    std::lock_guard<xe_mutex> lock(lock_);
+    set_is_enabled(false);
 
-  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
-  XMA_CONTEXT_DATA data(context_ptr);
-  Decode(&data);
-  data.Store(context_ptr);
-  return true;
+    auto context_ptr = memory()->TranslateVirtual(guest_ptr());
+    XMA_CONTEXT_DATA data(context_ptr);
+    Decode(&data);
+    data.Store(context_ptr);
+    return true;
+  }
 }
 
 void XmaContext::Enable() {
@@ -201,8 +201,8 @@ class XmaContext {
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
   xe_mutex lock_;
-  bool is_allocated_ = false;
-  bool is_enabled_ = false;
+  volatile bool is_allocated_ = false;
+  volatile bool is_enabled_ = false;
   // bool is_dirty_ = true;
 
   // ffmpeg structures
@@ -1,348 +0,0 @@
-#include "dma.h"
-#include "logging.h"
-#include "mutex.h"
-#include "platform_win.h"
-
-XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
-                NtDelayExecutionPointer);
-XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
-XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
-                NtAlertThreadByThreadId);
-
-template <size_t N, typename... Ts>
-static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
-  char buffer[1024];
-  sprintf_s(buffer, fmt, args...);
-  XELOGI("%s", buffer);
-}
-
-//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
-//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
-#define XEDMALOG(...) static_cast<void>(0)
-using xe::swcache::CacheLine;
-static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
-#if defined(__clang__)
-XE_FORCEINLINE
-static void mvdir64b(void* to, const void* from) {
-  __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
-}
-#define _movdir64b mvdir64b
-#endif
-XE_FORCEINLINE
-static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
-                                    CacheLine* XE_RESTRICT from) {
-  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
-
-  CacheLine* dest1 = to;
-  CacheLine* src1 = from;
-
-  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
-  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
-
-  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
-  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
-
-  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
-  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
-#pragma loop(no_vector)
-  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
-    xe::swcache::CacheLine line0, line1, line2, line3;
-
-    xe::swcache::ReadLine(&line0, src1 + i);
-    xe::swcache::ReadLine(&line1, src2 + i);
-    xe::swcache::ReadLine(&line2, src3 + i);
-    xe::swcache::ReadLine(&line3, src4 + i);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(dest1 + i, &line0);
-    xe::swcache::WriteLineNT(dest2 + i, &line1);
-
-    xe::swcache::WriteLineNT(dest3 + i, &line2);
-    xe::swcache::WriteLineNT(dest4 + i, &line3);
-  }
-  XE_MSVC_REORDER_BARRIER();
-}
-XE_FORCEINLINE
-static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
-                                 CacheLine* XE_RESTRICT from) {
-  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
-
-  CacheLine* dest1 = to;
-  CacheLine* src1 = from;
-
-  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
-  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
-
-  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
-  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
-
-  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
-  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
-#pragma loop(no_vector)
-  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
-#if 0
-    xe::swcache::CacheLine line0, line1, line2, line3;
-
-    xe::swcache::ReadLine(&line0, src1 + i);
-    xe::swcache::ReadLine(&line1, src2 + i);
-    xe::swcache::ReadLine(&line2, src3 + i);
-    xe::swcache::ReadLine(&line3, src4 + i);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(dest1 + i, &line0);
-    xe::swcache::WriteLineNT(dest2 + i, &line1);
-
-    xe::swcache::WriteLineNT(dest3 + i, &line2);
-    xe::swcache::WriteLineNT(dest4 + i, &line3);
-#else
-    _movdir64b(dest1 + i, src1 + i);
-    _movdir64b(dest2 + i, src2 + i);
-    _movdir64b(dest3 + i, src3 + i);
-    _movdir64b(dest4 + i, src4 + i);
-#endif
-  }
-  XE_MSVC_REORDER_BARRIER();
-}
-
-namespace xe::dma {
-using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
-                                 CacheLine* XE_RESTRICT rdmapping,
-                                 uint32_t written_length);
-static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
-                             CacheLine* XE_RESTRICT rdmapping,
-                             uint32_t written_length) {
-  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
-
-  while (written_length >= 16384) {
-    XeCopy16384StreamingAVX(physaddr, rdmapping);
-
-    physaddr += NUM_LINES_FOR_16K;
-    rdmapping += NUM_LINES_FOR_16K;
-
-    written_length -= 16384;
-  }
-
-  if (!written_length) {
-    return;
-  }
-  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
-
-  uint32_t i = 0;
-
-  for (; i + 1 < num_written_lines; i += 2) {
-    xe::swcache::CacheLine line0, line1;
-
-    xe::swcache::ReadLine(&line0, rdmapping + i);
-
-    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(physaddr + i, &line0);
-    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
-  }
-
-  if (i < num_written_lines) {
-    xe::swcache::CacheLine line0;
-
-    xe::swcache::ReadLine(&line0, rdmapping + i);
-    xe::swcache::WriteLineNT(physaddr + i, &line0);
-  }
-}
-
-static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
-                                   CacheLine* XE_RESTRICT rdmapping,
-                                   uint32_t written_length) {
-  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
-
-  while (written_length >= 16384) {
-    XeCopy16384Movdir64M(physaddr, rdmapping);
-
-    physaddr += NUM_LINES_FOR_16K;
-    rdmapping += NUM_LINES_FOR_16K;
-
-    written_length -= 16384;
-  }
-
-  if (!written_length) {
-    return;
-  }
-  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
-
-  uint32_t i = 0;
-
-  for (; i + 1 < num_written_lines; i += 2) {
-    _movdir64b(physaddr + i, rdmapping + i);
-    _movdir64b(physaddr + i + 1, rdmapping + i + 1);
-  }
-
-  if (i < num_written_lines) {
-    _movdir64b(physaddr + i, rdmapping + i);
-  }
-}
-
-XE_COLD
-static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
-                          CacheLine* XE_RESTRICT rdmapping,
-                          uint32_t written_length);
-
-static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
-
-XE_COLD
-static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
-                          CacheLine* XE_RESTRICT rdmapping,
-                          uint32_t written_length) {
-  VastCpyDispatch dispatch_to_use = nullptr;
-  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
-    XELOGI("Selecting MOVDIR64M vastcpy.");
-    dispatch_to_use = vastcpy_impl_movdir64m;
-  } else {
-    XELOGI("Selecting generic AVX vastcpy.");
-    dispatch_to_use = vastcpy_impl_avx;
-  }
-
-  vastcpy_dispatch =
-      dispatch_to_use;  // all future calls will go through our selected path
-  return vastcpy_dispatch(physaddr, rdmapping, written_length);
-}
-
-XE_NOINLINE
-void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
-             uint32_t written_length) {
-  return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
-                          written_length);
-}
-
-#define MAX_INFLIGHT_DMAJOBS 65536
-#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
-class XeDMACGeneric : public XeDMAC {
-  std::unique_ptr<xe::threading::Thread> thrd_;
-  XeDMAJob* jobs_ring_;
-  volatile std::atomic<uintptr_t> write_ptr_;
-
-  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
-    volatile std::atomic<uintptr_t> read_ptr_;
-    xe_mutex push_into_ring_lock_;
-  };
-  HANDLE gotjob_event;
-  void WorkerWait();
-
- public:
-  virtual ~XeDMACGeneric() {}
-  void WorkerThreadMain();
-  XeDMACGeneric() {
-    threading::Thread::CreationParameters crparams;
-    crparams.create_suspended = true;
-    crparams.initial_priority = threading::ThreadPriority::kNormal;
-    crparams.stack_size = 65536;
-    gotjob_event = CreateEventA(nullptr, false, false, nullptr);
-    thrd_ = std::move(threading::Thread::Create(
-        crparams, [this]() { this->WorkerThreadMain(); }));
-
-    jobs_ring_ = (XeDMAJob*)_aligned_malloc(
-        MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
-
-    write_ptr_ = 0;
-    read_ptr_ = 0;
-
-    thrd_->Resume();
-  }
-
-  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
-    // std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
-    HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
-    {
-      job->dmac_specific_ = (uintptr_t)dmacevent;
-
-      jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
-      write_ptr_++;
-      SetEvent(gotjob_event);
-    }
-    return (DMACJobHandle)dmacevent;
-  }
-  virtual void WaitJobDone(DMACJobHandle handle) override {
-    while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
-      // NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
-      // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
-      //        WAIT_TIMEOUT) {
-      //   ;
-    }
-    //}
-
-    // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
-    CloseHandle((HANDLE)handle);
-  }
-  virtual void WaitForIdle() override {
-    while (write_ptr_ != read_ptr_) {
-      threading::MaybeYield();
-    }
-  }
-};
-void XeDMACGeneric::WorkerWait() {
-  constexpr unsigned NUM_PAUSE_SPINS = 2048;
-  constexpr unsigned NUM_YIELD_SPINS = 8;
-#if 0
-  for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
-    if (write_ptr_ == read_ptr_) {
-      _mm_pause();
-    } else {
-      break;
-    }
-  }
-  for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
-    if (write_ptr_ == read_ptr_) {
-      threading::MaybeYield();
-    } else {
-      break;
-    }
-  }
-  LARGE_INTEGER yield_execution_delay{};
-  yield_execution_delay.QuadPart =
-      -2000;  // -10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
-  while (write_ptr_ == read_ptr_) {
-    NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
-  }
-#else
-  do {
-    if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
-      while (write_ptr_ == read_ptr_) {
-        _mm_pause();
-      }
-    }
-  } while (write_ptr_ == read_ptr_);
-#endif
-}
-void XeDMACGeneric::WorkerThreadMain() {
-  while (true) {
-    this->WorkerWait();
-
-    XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
-    swcache::ReadFence();
-
-    if (current_job.precall) {
-      current_job.precall(&current_job);
-    }
-
-    size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
-    size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
-
-    size_t line_rem = current_job.size - line_rounded;
-
-    vastcpy(current_job.destination, current_job.source,
-            static_cast<uint32_t>(line_rounded));
-
-    if (line_rem) {
-      __movsb(current_job.destination + line_rounded,
-              current_job.source + line_rounded, line_rem);
-    }
-
-    if (current_job.postcall) {
-      current_job.postcall(&current_job);
-    }
-    read_ptr_++;
-    swcache::WriteFence();
-
-    SetEvent((HANDLE)current_job.dmac_specific_);
-  }
-}
-
-XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
-}  // namespace xe::dma
@@ -1,47 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project                                 *
- ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
- * Released under the BSD license - see LICENSE in the root for more details. *
- ******************************************************************************
- */
-
-#ifndef XENIA_BASE_DMA_H_
-#define XENIA_BASE_DMA_H_
-#include "memory.h"
-#include "threading.h"
-namespace xe::dma {
-struct XeDMAJob;
-using DmaPrecall = void (*)(XeDMAJob* job);
-using DmaPostcall = void (*)(XeDMAJob* job);
-struct XeDMAJob {
-  // threading::Event* signal_on_done;
-  uintptr_t dmac_specific_;
-  uint8_t* destination;
-  uint8_t* source;
-  size_t size;
-  DmaPrecall precall;
-  DmaPostcall postcall;
-  void* userdata1;
-  void* userdata2;
-};
-using DMACJobHandle = uint64_t;
-
-class XeDMAC {
- public:
-  virtual ~XeDMAC() {}
-  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
-  virtual void WaitJobDone(DMACJobHandle handle) = 0;
-  virtual void WaitForIdle() = 0;
-};
-
-XeDMAC* CreateDMAC();
-// must be divisible by cache line size
-XE_NOINLINE
-void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
-             uint32_t written_length);
-
-}  // namespace xe::dma
-
-#endif  // XENIA_BASE_DMA_H_
@@ -44,8 +44,18 @@ template <typename T>
 constexpr bool is_pow2(T value) {
   return (value & (value - 1)) == 0;
 }
+/*
+  Use this in place of the shift + and-not sequence currently used in bit
+  iteration code. This is more efficient because it does not introduce a
+  dependency on the previous bit scanning operation. The shift + and-not
+  sequence does get translated to a single instruction (the bit test and
+  reset instruction), but this code can be executed alongside the scan.
+*/
+template <typename T>
+constexpr T clear_lowest_bit(T value) {
+  static_assert(std::is_integral_v<T>);
+
+  return (value - static_cast<T>(1)) & value;
+}
 // Rounds up the given value to the given alignment.
 template <typename T>
 constexpr T align(T value, T alignment) {
   return (value + alignment - 1) & ~(alignment - 1);
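A short usage sketch, mirroring how later hunks in this commit rewrite their bit-iteration loops (bits and visit() are illustrative stand-ins): the clear depends only on the mask itself, so it can issue in parallel with the next scan.

// Before: mask &= ~(uint32_t(1) << index);  // depends on the bsf result
// After: clear_lowest_bit needs only the mask, not the scanned index.
uint32_t mask = bits;  // bits: whatever bitmap is being walked
uint32_t index;
while (xe::bit_scan_forward(mask, &index)) {
  mask = xe::clear_lowest_bit(mask);
  visit(index);  // visit(): illustrative loop body
}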
@@ -10,6 +10,7 @@
 #include "xenia/base/memory.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/platform.h"
+#include "xenia/base/logging.h"
 
 #if XE_ARCH_ARM64
 #include <arm_neon.h>
@@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() {
          cvars::writable_executable_memory;
 }
 
+using xe::swcache::CacheLine;
+
+static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
+
+#if defined(__clang__)
+XE_FORCEINLINE
+static void mvdir64b(void* to, const void* from) {
+  __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
+}
+#define _movdir64b mvdir64b
+#endif
+XE_FORCEINLINE
+static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
+                                    CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    xe::swcache::CacheLine line0, line1, line2, line3;
+
+    xe::swcache::ReadLine(&line0, src1 + i);
+    xe::swcache::ReadLine(&line1, src2 + i);
+    xe::swcache::ReadLine(&line2, src3 + i);
+    xe::swcache::ReadLine(&line3, src4 + i);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(dest1 + i, &line0);
+    xe::swcache::WriteLineNT(dest2 + i, &line1);
+
+    xe::swcache::WriteLineNT(dest3 + i, &line2);
+    xe::swcache::WriteLineNT(dest4 + i, &line3);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+XE_FORCEINLINE
+static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
+                                 CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    _movdir64b(dest1 + i, src1 + i);
+    _movdir64b(dest2 + i, src2 + i);
+    _movdir64b(dest3 + i, src3 + i);
+    _movdir64b(dest4 + i, src4 + i);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
+                                 CacheLine* XE_RESTRICT rdmapping,
+                                 uint32_t written_length);
+static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
+                             CacheLine* XE_RESTRICT rdmapping,
+                             uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384StreamingAVX(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    xe::swcache::CacheLine line0, line1;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+
+    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
+  }
+
+  if (i < num_written_lines) {
+    xe::swcache::CacheLine line0;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+  }
+}
+
+static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
+                                   CacheLine* XE_RESTRICT rdmapping,
+                                   uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384Movdir64M(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    _movdir64b(physaddr + i, rdmapping + i);
+    _movdir64b(physaddr + i + 1, rdmapping + i + 1);
+  }
+
+  if (i < num_written_lines) {
+    _movdir64b(physaddr + i, rdmapping + i);
+  }
+}
+
+XE_COLD
+static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
+                          CacheLine* XE_RESTRICT rdmapping,
+                          uint32_t written_length);
+
+static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
+
+XE_COLD
+static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
+                          CacheLine* XE_RESTRICT rdmapping,
+                          uint32_t written_length) {
+  VastCpyDispatch dispatch_to_use = nullptr;
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
+    XELOGI("Selecting MOVDIR64M vastcpy.");
+    dispatch_to_use = vastcpy_impl_movdir64m;
+  } else {
+    XELOGI("Selecting generic AVX vastcpy.");
+    dispatch_to_use = vastcpy_impl_avx;
+  }
+
+  vastcpy_dispatch =
+      dispatch_to_use;  // all future calls will go through our selected path
+  return vastcpy_dispatch(physaddr, rdmapping, written_length);
+}
+
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length) {
+  return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
+                          written_length);
+}
 }  // namespace memory
 
 // TODO(benvanik): fancy AVX versions.
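The first_vastcpy idiom above is a one-time dispatch: the function pointer boots on a cold resolver that probes CPU features once, then rewrites the pointer so every later call goes straight to the selected implementation. A minimal self-contained sketch of the pattern (all names illustrative, not the emulator's code):

#include <cstdint>
#include <cstring>

using CopyFn = void (*)(void*, const void*, uint32_t);

static void copy_generic(void* d, const void* s, uint32_t n) {
  std::memcpy(d, s, n);
}
static void copy_fast(void* d, const void* s, uint32_t n) {
  std::memcpy(d, s, n);  // stand-in for a CPU-feature-gated fast path
}
static bool cpu_has_fast_path() { return false; }  // stand-in feature probe

static void copy_first_call(void* d, const void* s, uint32_t n);
static CopyFn copy_dispatch = copy_first_call;  // starts on the cold resolver

static void copy_first_call(void* d, const void* s, uint32_t n) {
  copy_dispatch = cpu_has_fast_path() ? copy_fast : copy_generic;
  copy_dispatch(d, s, n);  // forward the first call; later calls skip the probe
}

void copy(void* d, const void* s, uint32_t n) { copy_dispatch(d, s, n); }

Like the original, this sketch tolerates the benign race of two threads resolving concurrently: both write the same pointer value.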
@@ -17,9 +17,9 @@
 #include <string>
 #include <string_view>
 
 #include "xenia/base/assert.h"
-
 #include "xenia/base/byte_order.h"
+#include "xenia/base/platform.h"
 
 namespace xe {
 namespace memory {
@@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
   seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2);
   return hash_combine(seed, vs...);
 }
+// must be divisible by cache line size
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length);
 
 }  // namespace memory
@@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock<global_mutex_type>;
 // };
 class global_critical_region {
  public:
+  constexpr global_critical_region() {}
   static global_mutex_type& mutex();
 
   // Acquires a lock on the global critical section.
@@ -144,18 +145,18 @@ class global_critical_region {
   }
 
   // Acquires a lock on the global critical section.
-  inline global_unique_lock_type Acquire() {
+  static inline global_unique_lock_type Acquire() {
     return global_unique_lock_type(mutex());
   }
 
   // Acquires a deferred lock on the global critical section.
-  inline global_unique_lock_type AcquireDeferred() {
+  static inline global_unique_lock_type AcquireDeferred() {
     return global_unique_lock_type(mutex(), std::defer_lock);
   }
 
   // Tries to acquire a lock on the global critical section.
   // Check owns_lock() to see if the lock was successfully acquired.
-  inline global_unique_lock_type TryAcquire() {
+  static inline global_unique_lock_type TryAcquire() {
     return global_unique_lock_type(mutex(), std::try_to_lock);
   }
 };
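With the accessors now static, callers no longer need a global_critical_region instance to take the global lock. A hedged usage sketch (the class lives in namespace xe):

auto guard = xe::global_critical_region::Acquire();          // locked until scope exit
auto maybe = xe::global_critical_region::TryAcquire();       // check maybe.owns_lock()
auto later = xe::global_critical_region::AcquireDeferred();  // call later.lock() when needed

This is exactly what the IssueDraw hunk further down relies on when it hoists one Acquire() around its RequestRange loop.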
@@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
   xenia_assert(this->capacity_ >= 4);
 
   ring_size_t next_read_offset = read_offset + 4;
-#if 0
-  size_t zerotest = next_read_offset - this->capacity_;
-  // unpredictable branch, use bit arith instead
-  // todo: it would be faster to use lzcnt, but we need to figure out if all
-  // machines we support support it
-  next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
-#else
 
   if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
     next_read_offset = 0;
     // todo: maybe prefetch next? or should that happen much earlier?
   }
-#endif
   this->read_offset_ = next_read_offset;
-  unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
+  uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset];
   return xe::byte_swap(ring_value);
 }
 }  // namespace xe
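For clarity on the #if 0 branch this hunk deletes, the bit-arithmetic version cleared the offset with a mask instead of a branch; worked through with illustrative numbers:

// zerotest = next_read_offset - capacity_
//   capacity_ = 64, next_read_offset = 64 -> zerotest == 0
//     -> !!0 == 0 -> mask = -(ptrdiff_t)0 = 0 -> offset &= 0 -> wrap to 0
//   capacity_ = 64, next_read_offset = 32 -> zerotest != 0
//     -> !!x == 1 -> mask = all ones        -> offset unchanged
// The XE_UNLIKELY branch kept by this commit trades that for readability
// plus a natural spot to prefetch on wrap.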
@@ -271,7 +271,10 @@ inline std::pair<WaitResult, size_t> WaitAny(
   return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable,
                  timeout);
 }
-
+struct EventInfo {
+  uint32_t type;
+  uint32_t state;
+};
 // Models a Win32-like event object.
 // https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx
 class Event : public WaitHandle {
@@ -299,6 +302,8 @@ class Event : public WaitHandle {
   // the nonsignaled state after releasing the appropriate number of waiting
   // threads.
   virtual void Pulse() = 0;
+
+  virtual EventInfo Query() = 0;
 #if XE_PLATFORM_WIN32 == 1
   // SetEvent, but if there is a waiter we immediately transfer execution to it
   virtual void SetBoostPriority() = 0;
@@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
 
 XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                 NtDelayExecutionPointer);
-
+XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer);
 namespace xe {
 namespace threading {
@@ -255,20 +255,20 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
                                            size_t wait_handle_count,
                                            bool wait_all, bool is_alertable,
                                            std::chrono::milliseconds timeout) {
-  std::vector<HANDLE> handles(
-      wait_handle_count);  // max handles is like 64, so it would make more
-                           // sense to just do a fixed size array here
+  xenia_assert(wait_handle_count <= 64);
+  HANDLE handles[64];
 
   for (size_t i = 0; i < wait_handle_count; ++i) {
     handles[i] = wait_handles[i]->native_handle();
   }
   DWORD result = WaitForMultipleObjectsEx(
-      DWORD(handles.size()), handles.data(), wait_all ? TRUE : FALSE,
+      static_cast<DWORD>(wait_handle_count), handles, wait_all ? TRUE : FALSE,
       DWORD(timeout.count()), is_alertable ? TRUE : FALSE);
-  if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) {
+  if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) {
     return std::pair<WaitResult, size_t>(WaitResult::kSuccess,
                                          result - WAIT_OBJECT_0);
   } else if (result >= WAIT_ABANDONED_0 &&
-             result < WAIT_ABANDONED_0 + handles.size()) {
+             result < WAIT_ABANDONED_0 + wait_handle_count) {
     return std::pair<WaitResult, size_t>(WaitResult::kAbandoned,
                                          result - WAIT_ABANDONED_0);
   }
@@ -293,7 +293,15 @@ class Win32Event : public Win32Handle<Event> {
   void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
   void SetBoostPriority() override {
     // no previous state for boostpriority
-    NtSetEventBoostPriorityPointer.invoke(handle_);
+    // Boost priority is unimplemented under Wine, probably because it's not
+    // used anywhere in user mode except by us. Maybe some Windows internals
+    // use it; see:
+    // https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228
+    if (NtSetEventBoostPriorityPointer) {
+      NtSetEventBoostPriorityPointer.invoke(handle_);
+    } else {
+      NtSetEventPointer.invoke(handle_, nullptr);
+    }
   }
 #else
   void Set() override { SetEvent(handle_); }
@@ -305,6 +313,11 @@ class Win32Event : public Win32Handle<Event> {
     SetEvent(handle_);
   }
 #endif
+  EventInfo Query() {
+    EventInfo result{};
+    NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
+    return result;
+  }
 };
 
 std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
@@ -39,7 +39,7 @@
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/processor.h"
-
+XE_MSVC_OPTIMIZE_SMALL()
 DEFINE_bool(use_fast_dot_product, false,
             "Experimental optimization, much shorter sequence on dot products, "
             "treating inf as overflow instead of using mcxsr"
@@ -10,11 +10,7 @@ project("xenia-cpu")
     "xenia-base",
     "mspack",
   })
-  filter({"configurations:Release", "platforms:Windows"})
-  buildoptions({
-      "/Os",
-      "/O1"
-  })
 
   includedirs({
     project_root.."/third_party/llvm/include",
   })
@@ -27,3 +23,8 @@ project("xenia-cpu")
 
 include("testing")
 include("ppc/testing")
+filter({"configurations:Release", "platforms:Windows"})
+  buildoptions({
+      "/Os",
+      "/O1"
+  })
@@ -17,6 +17,7 @@ project("xenia-debug-ui")
       "/Os",
       "/O1"
   })
+  filter{}
   defines({
   })
   includedirs({
@@ -9,23 +9,16 @@
 
 #include "xenia/gpu/command_processor.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <cmath>
 #include <cstring>
 
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/byte_stream.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/graphics_system.h"
 #include "xenia/gpu/sampler_info.h"
 #include "xenia/gpu/texture_info.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/user_module.h"
@@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
       write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
       write_ptr_index_(0) {
   assert_not_null(write_ptr_index_event_);
-#if 0
-  dmac_ = dma::CreateDMAC();
-#else
-  dmac_ = nullptr;
-#endif
 }
 
 CommandProcessor::~CommandProcessor() = default;
@@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
 
 void CommandProcessor::ReturnFromWait() {}
 
-uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
-                                                uint32_t write_index) {
-  SCOPE_profile_cpu_f("gpu");
-#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
-  // If we have a pending trace stream open it now. That way we ensure we get
-  // all commands.
-  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
-    uint32_t title_id = kernel_state_->GetExecutableModule()
-                            ? kernel_state_->GetExecutableModule()->title_id()
-                            : 0;
-    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
-    auto path = trace_stream_path_ / file_name;
-    trace_writer_.Open(path, title_id);
-    InitializeTrace();
-  }
-#endif
-  // Adjust pointer base.
-  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
-  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
-  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
-  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
-
-  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
-
-  // Execute commands!
-
-  RingBuffer old_reader = reader_;
-  new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
-                            primary_buffer_size_);
-
-  reader_.set_read_offset(read_index * sizeof(uint32_t));
-  reader_.set_write_offset(write_index * sizeof(uint32_t));
-  // prefetch the wraparound range
-  // it likely is already in L3 cache, but in a zen system it may be another
-  // chiplets l3
-  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
-      GetCurrentRingReadCount());
-  do {
-    if (!ExecutePacket()) {
-      // This probably should be fatal - but we're going to continue anyways.
-      XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
-      assert_always();
-      break;
-    }
-  } while (reader_.read_count());
-
-  OnPrimaryBufferEnd();
-
-  trace_writer_.WritePrimaryBufferEnd();
-
-  reader_ = old_reader;
-  return write_index;
-}
-
-void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
-  // Execute commands!
-  RingBuffer old_reader = reader_;
-
-  new (&reader_)
-      RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
-
-  reader_.set_write_offset(count * sizeof(uint32_t));
-
-  do {
-    if (!ExecutePacket()) {
-      XELOGE("**** ExecutePacket: Failed to execute packet.");
-      assert_always();
-      break;
-    }
-  } while (reader_.read_count());
-  reader_ = old_reader;
-}
-
 void CommandProcessor::InitializeTrace() {
   // Write the initial register values, to be loaded directly into the
@@ -19,11 +19,8 @@
 #include <string>
 #include <vector>
 
-#include "xenia/base/dma.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/base/threading.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/xthread.h"
@@ -82,7 +79,6 @@ class CommandProcessor {
   CommandProcessor(GraphicsSystem* graphics_system,
                    kernel::KernelState* kernel_state);
   virtual ~CommandProcessor();
-  dma::XeDMAC* GetDMAC() const { return dmac_; }
   uint32_t counter() const { return counter_; }
   void increment_counter() { counter_++; }
@@ -135,7 +131,7 @@ class CommandProcessor {
 
   void UpdateWritePointer(uint32_t value);
 
-  void ExecutePacket(uint32_t ptr, uint32_t count);
 
   bool is_paused() const { return paused_; }
   void Pause();
@@ -172,7 +168,7 @@ class CommandProcessor {
                                uint32_t num_registers);
 
   XE_NOINLINE
-  virtual void WriteOneRegisterFromRing(
+  void WriteOneRegisterFromRing(
       uint32_t base,
       uint32_t
          num_times);  // repeatedly write a value to one register, presumably a
@@ -221,7 +217,7 @@ class CommandProcessor {
   virtual void PrepareForWait();
   virtual void ReturnFromWait();
 
-  uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
 
   virtual void OnPrimaryBufferEnd() {}
 
 #include "pm4_command_processor_declare.h"
@@ -300,7 +296,6 @@ class CommandProcessor {
   reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
   reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
   uint32_t gamma_ramp_rw_component_ = 0;
-  dma::XeDMAC* dmac_ = nullptr;
 };
 
 }  // namespace gpu
@@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // validity is tracked.
   const Shader::ConstantRegisterMap& constant_map_vertex =
       vertex_shader->constant_register_map();
-  for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
-       ++i) {
-    uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
-    uint32_t j;
-    while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
-      vfetch_bits_remaining &= ~(uint32_t(1) << j);
-      uint32_t vfetch_index = i * 32 + j;
-      const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
-          XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
-      switch (vfetch_constant.type) {
-        case xenos::FetchConstantType::kVertex:
-          break;
-        case xenos::FetchConstantType::kInvalidVertex:
-          if (cvars::gpu_allow_invalid_fetch_constants) {
-            break;
-          }
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
-              "This is incorrect behavior, but you can try bypassing this by "
-              "launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-        default:
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-      }
-      if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
-                                        vfetch_constant.size << 2)) {
-        XELOGE(
-            "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
-            "shared memory",
-            vfetch_constant.address << 2, vfetch_constant.size << 2);
-        return false;
-      }
-    }
-  }
+  {
+    uint32_t vfetch_addresses[96];
+    uint32_t vfetch_sizes[96];
+    uint32_t vfetch_current_queued = 0;
+    for (uint32_t i = 0;
+         i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
+      uint32_t vfetch_bits_remaining =
+          constant_map_vertex.vertex_fetch_bitmap[i];
+      uint32_t j;
+      while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
+        vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
+        uint32_t vfetch_index = i * 32 + j;
+        const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
+            XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
+        switch (vfetch_constant.type) {
+          case xenos::FetchConstantType::kVertex:
+            break;
+          case xenos::FetchConstantType::kInvalidVertex:
+            if (cvars::gpu_allow_invalid_fetch_constants) {
+              break;
+            }
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
+                "type! "
+                "This is incorrect behavior, but you can try bypassing this "
+                "by launching Xenia with "
+                "--gpu_allow_invalid_fetch_constants=true.",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+          default:
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) is completely "
+                "invalid!",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+        }
+        vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
+        vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
+      }
+    }
+
+    if (vfetch_current_queued) {
+      // so far, i have never seen vfetch_current_queued > 4. 1 is most
+      // common, 2 happens occasionally. did not test many games though
+      // pre-acquire the critical region so we're not repeatedly re-acquiring
+      // it in requestrange
+      auto shared_memory_request_range_hoisted =
+          global_critical_region::Acquire();
+
+      for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
+        if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
+                                          vfetch_sizes[i] << 2)) {
+          XELOGE(
+              "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
+              "shared memory",
+              vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
+          return false;
+        }
+      }
+    }
+  }
 
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
|
|||
shared_memory_->InitializeTraceCompleteDownloads();
|
||||
}
|
||||
}
|
||||
static void DmaPrefunc(dma::XeDMAJob* job) {
|
||||
D3D12_RANGE readback_range;
|
||||
readback_range.Begin = 0;
|
||||
readback_range.End = job->size;
|
||||
void* readback_mapping;
|
||||
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
|
||||
|
||||
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
|
||||
xenia_assert(SUCCEEDED(mapres));
|
||||
|
||||
job->source = (uint8_t*)readback_mapping;
|
||||
}
|
||||
|
||||
static void DmaPostfunc(dma::XeDMAJob* job) {
|
||||
D3D12_RANGE readback_write_range = {};
|
||||
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
|
||||
readback_buffer->Unmap(0, &readback_write_range);
|
||||
}
|
||||
|
||||
bool D3D12CommandProcessor::IssueCopy() {
|
||||
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
|
@@ -3102,7 +3105,6 @@ bool D3D12CommandProcessor::IssueCopy() {
   if (!BeginSubmission(true)) {
     return false;
   }
-
   if (!cvars::d3d12_readback_resolve) {
     uint32_t written_address, written_length;
     return render_target_cache_->Resolve(*memory_, *shared_memory_,
@@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
         readback_buffer, 0, shared_memory_buffer, written_address,
         written_length);
     if (AwaitAllQueueOperationsCompletion()) {
-#if 1
       D3D12_RANGE readback_range;
       readback_range.Begin = 0;
       readback_range.End = written_length;
       void* readback_mapping;
-      if (SUCCEEDED(
-              readback_buffer->Map(0, &readback_range, &readback_mapping))) {
+      if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                         &readback_mapping))) {
         // chrispy: this memcpy needs to be optimized as much as possible
         auto physaddr = memory_->TranslatePhysical(written_address);
-        dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
+        memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
+                        written_length);
         // XEDmaCpy(physaddr, readback_mapping, written_length);
         D3D12_RANGE readback_write_range = {};
         readback_buffer->Unmap(0, &readback_write_range);
       }
-
-#else
-      dma::XeDMAJob job{};
-      job.destination = memory_->TranslatePhysical(written_address);
-      job.size = written_length;
-      job.source = nullptr;
-      job.userdata1 = (void*)readback_buffer;
-      job.precall = DmaPrefunc;
-      job.postcall = DmaPostfunc;
-
-      readback_available_ = GetDMAC()->PushDMAJob(&job);
-
-#endif
     }
   }
@@ -3833,7 +3822,8 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
     uint32_t user_clip_plane_index;
     while (xe::bit_scan_forward(user_clip_planes_remaining,
                                 &user_clip_plane_index)) {
-      user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index);
+      user_clip_planes_remaining =
+          xe::clear_lowest_bit(user_clip_planes_remaining);
       const float* user_clip_plane =
          &regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32;
       if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane,
@@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
     uint32_t textures_remaining = used_texture_mask;
     uint32_t texture_index;
     while (xe::bit_scan_forward(textures_remaining, &texture_index)) {
-      textures_remaining &= ~(uint32_t(1) << texture_index);
+      textures_remaining = xe::clear_lowest_bit(textures_remaining);
       uint32_t& texture_signs_uint =
           system_constants_.texture_swizzled_signs[texture_index >> 2];
       uint32_t texture_signs_shift = (texture_index & 3) * 8;
@@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
   if (size == 0) {
     return nullptr;
   }
-#if 1
-  if (readback_available_) {
-    GetDMAC()->WaitJobDone(readback_available_);
-    readback_available_ = 0;
-  }
-#endif
   size = xe::align(size, kReadbackBufferSizeIncrement);
   if (size > readback_buffer_size_) {
     const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();
@@ -21,7 +21,6 @@
 #include <utility>
 
 #include "xenia/base/assert.h"
-#include "xenia/base/dma.h"
 #include "xenia/gpu/command_processor.h"
 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
 #include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@@ -50,8 +49,10 @@ struct MemExportRange {
   uint32_t size_dwords;
 };
 class D3D12CommandProcessor final : public CommandProcessor {
+ protected:
+#include "../pm4_command_processor_declare.h"
+
  public:
   explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
                                  kernel::KernelState* kernel_state);
   ~D3D12CommandProcessor();
@@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
                                uint32_t base,
                                uint32_t num_registers);
   XE_NOINLINE
-  virtual void WriteOneRegisterFromRing(uint32_t base,
-                                        uint32_t num_times) override;
+  void WriteOneRegisterFromRing(uint32_t base,
+                                uint32_t num_times);
 
   XE_FORCEINLINE
   void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
@@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
 
   static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
   ID3D12Resource* readback_buffer_ = nullptr;
-  dma::DMACJobHandle readback_available_ = 0;
   uint32_t readback_buffer_size_ = 0;
 
   std::atomic<bool> pix_capture_requested_ = false;
@@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
 
 bool D3D12SharedMemory::UploadRanges(
     const std::pair<uint32_t, uint32_t>* upload_page_ranges,
-    unsigned num_upload_page_ranges) {
+    uint32_t num_upload_page_ranges) {
   if (!num_upload_page_ranges) {
     return true;
   }
   CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
   command_processor_.SubmitBarriers();
   auto& command_list = command_processor_.GetDeferredCommandList();
   // for (auto upload_range : upload_page_ranges) {
-  for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
+  for (uint32_t i = 0; i < num_upload_page_ranges; ++i) {
     auto& upload_range = upload_page_ranges[i];
     uint32_t upload_range_start = upload_range.first;
     uint32_t upload_range_length = upload_range.second;
@@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges(
         uint32_t(upload_buffer_size), false, false);
 
     if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
-      dma::vastcpy(
+      memory::vastcpy(
           upload_buffer_mapping,
           memory().TranslatePhysical(upload_range_start << page_size_log2()),
          static_cast<uint32_t>(upload_buffer_size));
@@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
   bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                         uint32_t length_allocations) override;
 
-  bool UploadRanges(const std::pair<uint32_t, uint32_t>*
-                        upload_page_ranges, unsigned num_ranges) override;
+  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
+                    uint32_t num_ranges) override;
 
  private:
   D3D12CommandProcessor& command_processor_;
@@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
   uint32_t textures_remaining = used_texture_mask;
   uint32_t index;
   while (xe::bit_scan_forward(textures_remaining, &index)) {
-    textures_remaining &= ~(uint32_t(1) << index);
+    textures_remaining = xe::clear_lowest_bit(textures_remaining);
     const TextureBinding* binding = GetValidTextureBinding(index);
     if (!binding) {
       continue;
@@ -9,21 +9,14 @@
 
 #include "xenia/gpu/draw_util.h"
 
 #include <algorithm>
 #include <cmath>
 #include <cstring>
 
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/memory.h"
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/texture_cache.h"
 #include "xenia/gpu/texture_info.h"
 #include "xenia/gpu/texture_util.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/graphics_util.h"
 
 // Very prominent in 545407F2.
@@ -16,7 +16,6 @@
 
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
@@ -1,9 +1,14 @@
 
 void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
 
+virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) XE_RESTRICT;
 virtual bool ExecutePacket();
+
+public:
+void ExecutePacket(uint32_t ptr, uint32_t count);
+
+protected:
 XE_NOINLINE
 bool ExecutePacketType0(uint32_t packet) XE_RESTRICT;
 XE_NOINLINE
 bool ExecutePacketType1(uint32_t packet) XE_RESTRICT;
@@ -103,4 +108,7 @@ uint32_t GetCurrentRingReadCount();
 
 XE_NOINLINE
 XE_COLD
 bool ExecutePacketType3_CountOverflow(uint32_t count);
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType0_CountOverflow(uint32_t count);
@@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
   }
 }
+XE_NOINLINE
+XE_COLD
+bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
+  XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
+         COMMAND_PROCESSOR::GetCurrentRingReadCount(),
+         count * sizeof(uint32_t));
+  return false;
+}
+/*
+  Todo: optimize this function; this one, along with ExecutePacketType3, is
+  among the most frequently called functions for PM4.
+*/
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
   // Type-0 packet.
   // Write count registers in sequence to the registers starting at
   // (base_index << 2).
 
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
-  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
-    XELOGE(
-        "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
-        COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
-    return false;
-  }
-
-  trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
-
-  uint32_t base_index = (packet & 0x7FFF);
-  uint32_t write_one_reg = (packet >> 15) & 0x1;
+  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
+      count * sizeof(uint32_t)) {
+    trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
 
-  if (!write_one_reg) {
-    COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
+    uint32_t base_index = (packet & 0x7FFF);
+    uint32_t write_one_reg = (packet >> 15) & 0x1;
+
+    if (!write_one_reg) {
+      COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
+                                                    count);
+    } else {
+      COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    }
+
+    trace_writer_.WritePacketEnd();
+    return true;
   } else {
-    COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
   }
-
-  trace_writer_.WritePacketEnd();
-  return true;
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
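The shape of that refactor, as a minimal hedged sketch (helper names illustrative): the rare failure report moves into a separate XE_NOINLINE/XE_COLD function, so the hot packet path compiles down to a compare plus a rarely taken forwarding call.

XE_NOINLINE XE_COLD static bool ReportOverflow(uint32_t count) {
  // logging lives out of line; the hot loop never inlines this
  return false;
}
static bool ExecuteTypeN(uint32_t packet, uint32_t readable_bytes) {
  uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
  if (readable_bytes >= count * sizeof(uint32_t)) {
    // ... hot path: write registers, trace, return true ...
    return true;
  }
  return ReportOverflow(count);  // cold, out of line
}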
@@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
 
 XE_NOINLINE
 static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
+  /*
+    Todo: should subtract the values from each other twice, with the sides
+    inverted the second time, and then create a mask from the sign bits; then
+    use the wait_info value to select the bits that correctly implement the
+    condition. If neither subtraction has the sign bit set, the values are equal.
+  */
   bool matched = false;
   switch (wait_info & 0x7) {
     case 0x0:  // Never.
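A hedged sketch of what that Todo describes (not the committed code): two widened subtractions yield "less" and "greater" sign bits, equality is "neither set", and a small table remaps the condition codes handled by the switch above (never, <, <=, ==, !=, >=, >, always) onto a relation mask.

static bool MatchValueAndRefBranchless(uint32_t value, uint32_t ref,
                                       uint32_t wait_info) {
  // 64-bit subtraction keeps the sign bit meaningful for unsigned 32-bit
  // inputs, sidestepping wraparound.
  uint64_t lt = (uint64_t(value) - uint64_t(ref)) >> 63;  // 1 iff value < ref
  uint64_t gt = (uint64_t(ref) - uint64_t(value)) >> 63;  // 1 iff value > ref
  uint32_t relation = uint32_t(lt | ((1 ^ (lt | gt)) << 1) | (gt << 2));
  // Condition code -> {lt=1, eq=2, gt=4} mask, in the switch's order:
  // never, <, <=, ==, !=, >=, >, always.
  static constexpr uint8_t kMasks[8] = {0, 1, 3, 2, 5, 6, 4, 7};
  return (kMasks[wait_info & 0x7] & relation) != 0;
}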
@@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
   return true;
 }
 
+/*
+  todo: shouldn't this do something?
+*/
+
 bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
     uint32_t packet, uint32_t count) XE_RESTRICT {
   // selective invalidation of state pointers
@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
|
|||
|
||||
  return true;
}

uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
                                                 uint32_t write_index) {
  SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
  // If we have a pending trace stream, open it now. That way we ensure we get
  // all commands.
  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
    uint32_t title_id = kernel_state_->GetExecutableModule()
                            ? kernel_state_->GetExecutableModule()->title_id()
                            : 0;
    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
    auto path = trace_stream_path_ / file_name;
    trace_writer_.Open(path, title_id);
    InitializeTrace();
  }
#endif
  // Adjust pointer base.
  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);

  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);

  // Execute commands!
  RingBuffer old_reader = reader_;
  new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
                            primary_buffer_size_);
  reader_.set_read_offset(read_index * sizeof(uint32_t));
  reader_.set_write_offset(write_index * sizeof(uint32_t));
  // Prefetch the wraparound range; it is likely already in L3 cache, but on a
  // Zen system it may be in another chiplet's L3.
  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
      GetCurrentRingReadCount());
  do {
    if (!COMMAND_PROCESSOR::ExecutePacket()) {
      // This probably should be fatal - but we're going to continue anyways.
      XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
      assert_always();
      break;
    }
  } while (reader_.read_count());

  COMMAND_PROCESSOR::OnPrimaryBufferEnd();

  trace_writer_.WritePrimaryBufferEnd();

  reader_ = old_reader;
  return write_index;
}
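The "Adjust pointer base" step above pins the high bits of the computed start/end pointers to the ring base so the low 29 bits (a 512 MB physical window) wrap instead of carrying upward. A worked example with made-up values:

uint32_t primary_buffer_ptr = 0x3FF00000;  // hypothetical ring base
uint32_t read_index = 0x10;
uint32_t start_ptr = primary_buffer_ptr + read_index * sizeof(uint32_t);
// start_ptr == 0x3FF00040
start_ptr = (primary_buffer_ptr & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
// == 0x20000000 | 0x1FF00040 == 0x3FF00040: the high bits stay pinned to the
// ring base while the low 29 bits advance (and would wrap) with the index.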

void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) {
  // Execute commands!
  RingBuffer old_reader = reader_;

  new (&reader_)
      RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};

  reader_.set_write_offset(count * sizeof(uint32_t));

  do {
    if (!COMMAND_PROCESSOR::ExecutePacket()) {
      XELOGE("**** ExecutePacket: Failed to execute packet.");
      assert_always();
      break;
    }
  } while (reader_.read_count());
  reader_ = old_reader;
}

@@ -11,7 +11,6 @@
#define XENIA_GPU_SAMPLER_INFO_H_

#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -24,7 +24,6 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -13,13 +13,6 @@
#include <cmath>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -14,12 +14,9 @@
#include <cstddef>
#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"

namespace xe {

@@ -9,15 +9,9 @@

#include "xenia/gpu/shader_translator.h"

#include <algorithm>
#include <cstdarg>
#include <cstring>
#include <set>
#include <string>

#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/gpu_flags.h"

namespace xe {

@@ -11,16 +11,7 @@
#define XENIA_GPU_SHADER_TRANSLATOR_H_

#include <memory>
#include <set>
#include <string>
#include <vector>

#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -10,15 +10,11 @@
#include "xenia/gpu/shared_memory.h"

#include <algorithm>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/bit_range.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

@@ -10,12 +10,6 @@
#ifndef XENIA_GPU_SHARED_MEMORY_H_
#define XENIA_GPU_SHARED_MEMORY_H_

#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>

#include "xenia/base/mutex.h"
#include "xenia/memory.h"

namespace xe {
@@ -141,7 +135,7 @@ class SharedMemory {
  // overall bounds of pages to be uploaded.
  virtual bool UploadRanges(
      const std::pair<uint32_t, uint32_t>* upload_page_ranges,
      unsigned num_upload_ranges) = 0;
      uint32_t num_upload_ranges) = 0;

  const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
    return trace_download_ranges_;

@@ -183,14 +177,10 @@ class SharedMemory {
  FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
      upload_ranges_;

  // GPU-written memory downloading for traces. <Start address, length>.
  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
  uint32_t trace_download_page_count_ = 0;

  // Mutex between the guest memory subsystem and the command processor, to be
  // locked when checking or updating validity of pages/ranges and when firing
  // watches.
  xe::global_critical_region global_critical_region_;
  static constexpr xe::global_critical_region global_critical_region_{};

  // ***************************************************************************
  // Things below should be fully protected by global_critical_region.

@@ -266,6 +256,11 @@ class SharedMemory {
  uint32_t watch_node_current_pool_allocated_ = 0;
  WatchRange* watch_range_first_free_ = nullptr;
  WatchNode* watch_node_first_free_ = nullptr;

  // GPU-written memory downloading for traces. <Start address, length>.
  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
  uint32_t trace_download_page_count_ = 0;

  // Triggers the watches (global and per-range), removing triggered range
  // watches.
  void FireWatches(uint32_t page_first, uint32_t page_last,
@@ -9,21 +9,11 @@

#include "xenia/gpu/texture_cache.h"

#include <algorithm>
#include <cstdint>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"

DEFINE_int32(
    draw_resolution_scale_x, 1,
@@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
  uint32_t bindings_changed = 0;
  uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
  uint32_t index = 0;

  // The mask has at most 32 bits set, and each slot may bind both an unsigned
  // and a signed texture, so the array never needs more than 64 entries.
  Texture* textures_to_load[64];
  uint32_t num_textures_to_load = 0;
  while (xe::bit_scan_forward(textures_remaining, &index)) {
    uint32_t index_bit = UINT32_C(1) << index;
    textures_remaining &= ~index_bit;
    textures_remaining = xe::clear_lowest_bit(textures_remaining);
    TextureBinding& binding = texture_bindings_[index];
    const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
        XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);

@@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
      binding.texture_signed = nullptr;
    }
    if (load_unsigned_data && binding.texture != nullptr) {
      LoadTextureData(*binding.texture);
      textures_to_load[num_textures_to_load++] = binding.texture;
    }
    if (load_signed_data && binding.texture_signed != nullptr) {
      LoadTextureData(*binding.texture_signed);
      textures_to_load[num_textures_to_load++] = binding.texture_signed;
    }
  }

  LoadTexturesData(textures_to_load, num_textures_to_load);

  if (bindings_changed) {
    UpdateTextureBindingsImpl(bindings_changed);
  }
@@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
  texture->LogAction("Created");
  return texture;
}

void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
  assert_true(n_textures <= 64);
  if (n_textures < 2) {
    if (!n_textures) {
      return;
    } else {
      LoadTextureData(*textures[0]);
      return;
    }
  }

  uint64_t index_base_outdated = 0;
  uint64_t index_mips_outdated = 0;
  uint32_t nkept = 0;
  {
    auto global_lock = global_critical_region_.Acquire();
    for (uint32_t i = 0; i < n_textures; ++i) {
      Texture* current = textures[i];

      auto base_outdated = current->base_outdated(global_lock);
      auto mips_outdated = current->mips_outdated(global_lock);

      index_base_outdated |= static_cast<uint64_t>(base_outdated) << i;
      index_mips_outdated |= static_cast<uint64_t>(mips_outdated) << i;
      if (!base_outdated && !mips_outdated) {
        textures[i] = nullptr;
      } else {
        nkept++;
      }
    }
  }

  if (nkept == 0) {
    return;
  }

  for (uint32_t i = 0; i < n_textures; ++i) {
    Texture* p_texture = textures[i];
    if (!p_texture) {
      continue;
    }
    textures[i] = nullptr;
    Texture& texture = *p_texture;

    TextureKey texture_key = texture.key();
    // The implementation may load multiple blocks at once via accesses of up
    // to 128 bits (R32G32B32A32_UINT), so the size is aligned to this value to
    // make sure that if the texture is small (especially if it's linear), the
    // last blocks won't be cut off (hosts may return 0, 0, 0, 0 for the whole
    // R32G32B32A32_UINT access for the non-16-aligned tail even if 1...15
    // bytes are actually provided for it).

    // Request uploading of the texture data to the shared memory.
    // This is also necessary when resolution scaling is used - the texture
    // cache relies on shared memory for invalidation of both unscaled and
    // scaled textures. Plus a texture may be unscaled partially, when only a
    // portion of its pages is invalidated, in this case we'll need the texture
    // from the shared memory to load the unscaled parts.
    // TODO(Triang3l): Load unscaled parts.
    bool base_resolved = texture.GetBaseResolved();
    if (index_base_outdated & (1ULL << i)) {
      if (!shared_memory().RequestRange(
              texture_key.base_page << 12,
              xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
              texture_key.scaled_resolve ? nullptr : &base_resolved)) {
        continue;
      }
    }
    bool mips_resolved = texture.GetMipsResolved();
    if (index_mips_outdated & (1ULL << i)) {
      if (!shared_memory().RequestRange(
              texture_key.mip_page << 12,
              xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
              texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
        continue;
      }
    }
    if (texture_key.scaled_resolve) {
      // Make sure all the scaled resolve memory is resident and accessible
      // from the shader, including any possible padding that hasn't yet been
      // touched by an actual resolve, but is still included in the texture
      // size, so the GPU won't be trying to access unmapped memory.
      if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
                                              texture.GetGuestBaseSize(), 4)) {
        continue;
      }
      if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
                                              texture.GetGuestMipsSize(), 4)) {
        continue;
      }
    }

    // Actually load the texture data.
    if (!LoadTextureDataFromResidentMemoryImpl(
            texture, (index_base_outdated & (1ULL << i)) != 0,
            (index_mips_outdated & (1ULL << i)) != 0)) {
      continue;
    }

    // Update the source of the texture (resolve vs. CPU or memexport) for
    // purposes of handling piecewise gamma emulation via sRGB and for
    // resolution scale in sampling offsets.
    if (!texture_key.scaled_resolve) {
      texture.SetBaseResolved(base_resolved);
      texture.SetMipsResolved(mips_resolved);
    }
    // Re-queue the texture for MakeUpToDateAndWatch below.
    textures[i] = &texture;
  }
  {
    auto crit = global_critical_region_.Acquire();

    for (uint32_t i = 0; i < n_textures; ++i) {
      auto texture = textures[i];
      if (!texture) {
        continue;
      }
      // Mark the ranges as uploaded and watch them. This is needed for scaled
      // resolves as well to detect when the CPU wants to reuse the memory for
      // a regular texture or a vertex buffer, and thus the scaled resolve
      // version is not up to date anymore.
      texture->MakeUpToDateAndWatch(crit);

      texture->LogAction("Loaded");
    }
  }
}
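LoadTexturesData above splits the batch into three phases so the global critical region is held only for the cheap outdated-check and the final MakeUpToDateAndWatch, not for the uploads in between. A minimal generic sketch of that locking pattern (the types and names here are illustrative, not the emulator's API):

#include <cstdint>
#include <mutex>

// Hypothetical item standing in for TextureCache::Texture.
struct Item {
  bool dirty = true;
  bool NeedsWork() const { return dirty; }  // cheap check (under the lock)
  bool DoExpensiveWork() { return true; }   // slow part (lock released)
  void Publish() { dirty = false; }         // mark up to date (under the lock)
};

void ProcessBatch(Item** items, uint32_t n, std::recursive_mutex& mutex) {
  {  // Phase 1: classify under the lock, drop items with nothing to do.
    std::lock_guard<std::recursive_mutex> lock(mutex);
    for (uint32_t i = 0; i < n; ++i) {
      if (items[i] && !items[i]->NeedsWork()) items[i] = nullptr;
    }
  }
  // Phase 2: expensive work with the lock released.
  for (uint32_t i = 0; i < n; ++i) {
    if (items[i] && !items[i]->DoExpensiveWork()) items[i] = nullptr;
  }
  {  // Phase 3: publish results under the lock again.
    std::lock_guard<std::recursive_mutex> lock(mutex);
    for (uint32_t i = 0; i < n; ++i) {
      if (items[i]) items[i]->Publish();
    }
  }
}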

bool TextureCache::LoadTextureData(Texture& texture) {
  // Check what needs to be uploaded.
  bool base_outdated, mips_outdated;

@@ -559,6 +559,7 @@ class TextureCache {
    return load_shader_info_[load_shader_index];
  }
  bool LoadTextureData(Texture& texture);
  void LoadTexturesData(Texture** textures, uint32_t n_textures);
  // Writes the texture data (for base, mips or both - but not neither) from the
  // shared memory or the scaled resolve memory. The shared memory management is
  // done outside this function, the implementation just needs to load the data
@@ -8,14 +8,7 @@
 */

#include "xenia/gpu/texture_info.h"

#include <algorithm>
#include <cmath>
#include <cstring>

#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/xxhash.h"

namespace xe {

@@ -13,7 +13,6 @@
#include <array>
#include <cstring>
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/gpu/xenos.h"

namespace xe {

@@ -8,13 +8,6 @@
 */

#include "xenia/gpu/texture_util.h"

#include <algorithm>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/base/math.h"

namespace xe {
namespace gpu {
namespace texture_util {

@@ -10,11 +10,6 @@
#ifndef XENIA_GPU_UCODE_H_
#define XENIA_GPU_UCODE_H_

#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/xenos.h"

// The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,

@@ -46,6 +46,9 @@ namespace gpu {
namespace vulkan {

class VulkanCommandProcessor final : public CommandProcessor {
 protected:
#include "../pm4_command_processor_declare.h"

 public:
  // Single-descriptor layouts for use within a single frame.
  enum class SingleTransientDescriptorLayout {

@@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
    kStorageBufferCompute,
    kCount,
  };
#include "../pm4_command_processor_declare.h"

  class ScratchBufferAcquisition {
   public:

@@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(

bool VulkanSharedMemory::UploadRanges(
    const std::pair<uint32_t, uint32_t>* upload_page_ranges,
    unsigned num_upload_ranges) {
    uint32_t num_upload_ranges) {
  if (!num_upload_ranges) {
    return true;
  }

@@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
  bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                        uint32_t length_allocations) override;

  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
                    unsigned num_ranges) override;
  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
                    uint32_t num_ranges) override;

 private:
  void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,

@@ -9,10 +9,6 @@

#include "xenia/gpu/xenos.h"

#include <cmath>

#include "xenia/base/math.h"

namespace xe {
namespace gpu {
namespace xenos {

@@ -10,13 +10,10 @@
#ifndef XENIA_GPU_XENOS_H_
#define XENIA_GPU_XENOS_H_

#include <algorithm>

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/math.h"

namespace xe {
namespace gpu {
@@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {

// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle) {
  auto object = ObjectTable::LookupObject(handle, false);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
    X_HANDLE handle, bool already_locked) {
  auto object = ObjectTable::LookupObject(handle, already_locked);
  auto result = object_ref<XObject>(reinterpret_cast<XObject*>(object));
  return result;
}

@@ -46,10 +46,9 @@ class ObjectTable {
  // Restores a XObject reference with a handle. Mainly for internal use - do
  // not use.
  X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);

  template <typename T>
  object_ref<T> LookupObject(X_HANDLE handle) {
    auto object = LookupObject(handle, false);
  object_ref<T> LookupObject(X_HANDLE handle, bool already_locked = false) {
    auto object = LookupObject(handle, already_locked);
    if (T::kObjectType == XObject::Type::Socket) {
      object = LookupObject((handle | 0xF8000000), false);
    }

@@ -110,7 +109,8 @@ class ObjectTable {

// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
    X_HANDLE handle, bool already_locked);

}  // namespace util
}  // namespace kernel
@@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);

dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
                                                      lpdword_t driver_ptr) {
  if (!callback_ptr) {
    return X_E_INVALIDARG;
  }

  uint32_t callback = callback_ptr[0];

  if (!callback) {
    return X_E_INVALIDARG;
  }
  uint32_t callback_arg = callback_ptr[1];

  auto audio_system = kernel_state()->emulator()->audio_system();
@@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
}
DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
                         kHighFrequency);
dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
  X_STATUS result = X_STATUS_SUCCESS;

  auto ev = kernel_state()->object_table()->LookupObject<XEvent>(handle);
  if (ev) {
    uint32_t type_tmp, state_tmp;

    ev->Query(&type_tmp, &state_tmp);

    out_struc[0] = type_tmp;
    out_struc[1] = state_tmp;
  } else {
    result = X_STATUS_INVALID_HANDLE;
  }

  return result;
}
DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
                         kHighFrequency);
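The two dwords NtQueryEvent_entry stores through out_struc mirror NT's EVENT_BASIC_INFORMATION; a guest-side sketch of that layout (the struct name and field comments are assumptions, not taken from this commit):

struct X_EVENT_BASIC_INFORMATION {
  xe::be<uint32_t> event_type;   // 0 = NotificationEvent, 1 = SynchronizationEvent
  xe::be<uint32_t> event_state;  // nonzero while the event is signaled
};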
uint32_t xeNtClearEvent(uint32_t handle) {
  X_STATUS result = X_STATUS_SUCCESS;
@@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
    lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
  assert_true(wait_type <= 1);

  std::vector<object_ref<XObject>> objects;
  for (uint32_t n = 0; n < count; n++) {
    auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
    auto object_ref =
        XObject::GetNativeObject<XObject>(kernel_state(), object_ptr);
    if (!object_ref) {
      return X_STATUS_INVALID_PARAMETER;
  assert_true(count <= 64);
  object_ref<XObject> objects[64];
  {
    auto crit = global_critical_region::AcquireDirect();
    for (uint32_t n = 0; n < count; n++) {
      auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
      auto object_ref = XObject::GetNativeObject<XObject>(kernel_state(),
                                                          object_ptr, -1, true);
      if (!object_ref) {
        return X_STATUS_INVALID_PARAMETER;
      }

      objects[n] = std::move(object_ref);
    }

    objects.push_back(std::move(object_ref));
  }

  uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
  return XObject::WaitMultiple(uint32_t(objects.size()),
                               reinterpret_cast<XObject**>(objects.data()),
                               wait_type, wait_reason, processor_mode,
                               alertable, timeout_ptr ? &timeout : nullptr);
  return XObject::WaitMultiple(
      uint32_t(count), reinterpret_cast<XObject**>(&objects[0]), wait_type,
      wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
}
DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
                         kBlocking, kHighFrequency);
@@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be<uint32_t>* handles,
                                      uint64_t* timeout_ptr) {
  assert_true(wait_type <= 1);

  std::vector<object_ref<XObject>> objects;
  for (uint32_t n = 0; n < count; n++) {
    uint32_t object_handle = handles[n];
    auto object =
        kernel_state()->object_table()->LookupObject<XObject>(object_handle);
    if (!object) {
      return X_STATUS_INVALID_PARAMETER;
  assert_true(count <= 64);
  object_ref<XObject> objects[64];

  /*
    Originally reserved the vector to squash the constant reallocations: in a
    benchmark of one particular game over a period of five minutes, roughly
    11% of CPU time was spent inside a helper for Windows' heap allocation
    function, and 7% of that traced back to here.

    Edit: switched to a fixed-size array instead, as there can never be more
    than 64 events specified.
  */
  {
    auto crit = global_critical_region::AcquireDirect();
    for (uint32_t n = 0; n < count; n++) {
      uint32_t object_handle = handles[n];
      auto object = kernel_state()->object_table()->LookupObject<XObject>(
          object_handle, true);
      if (!object) {
        return X_STATUS_INVALID_PARAMETER;
      }
      objects[n] = std::move(object);
    }
    objects.push_back(std::move(object));
  }

  return XObject::WaitMultiple(count,
                               reinterpret_cast<XObject**>(objects.data()),
  return XObject::WaitMultiple(count, reinterpret_cast<XObject**>(&objects[0]),
                               wait_type, 6, wait_mode, alertable, timeout_ptr);
}
@@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
    dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
    dword_t alertable, lpqword_t timeout_ptr) {
  uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
  if (!count || count > 64 || (wait_type && wait_type != 1)) {
    return X_STATUS_INVALID_PARAMETER;
  }
  return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
                                      alertable,
                                      timeout_ptr ? &timeout : nullptr);
@@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
                                                      dword_t r6,
                                                      lpqword_t timeout_ptr) {
  X_STATUS result = X_STATUS_SUCCESS;
  // Pre-lock for these two handle lookups.
  global_critical_region::mutex().lock();

  auto signal_object =
      kernel_state()->object_table()->LookupObject<XObject>(signal_handle);
  auto signal_object = kernel_state()->object_table()->LookupObject<XObject>(
      signal_handle, true);
  auto wait_object =
      kernel_state()->object_table()->LookupObject<XObject>(wait_handle);
      kernel_state()->object_table()->LookupObject<XObject>(wait_handle, true);
  global_critical_region::mutex().unlock();
  if (signal_object && wait_object) {
    uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
    result =
@@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
  event_->Reset();
  return 1;
}
void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
  auto [type, state] = event_->Query();

  *out_type = type;
  *out_state = state;
}
void XEvent::Clear() { event_->Reset(); }

bool XEvent::Save(ByteStream* stream) {

@@ -36,6 +36,7 @@ class XEvent : public XObject {
  int32_t Set(uint32_t priority_increment, bool wait);
  int32_t Pulse(uint32_t priority_increment, bool wait);
  int32_t Reset();
  void Query(uint32_t* out_type, uint32_t* out_state);
  void Clear();

  bool Save(ByteStream* stream) override;
@@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                               uint32_t wait_type, uint32_t wait_reason,
                               uint32_t processor_mode, uint32_t alertable,
                               uint64_t* opt_timeout) {
  std::vector<xe::threading::WaitHandle*> wait_handles(count);
  xe::threading::WaitHandle* wait_handles[64];

  for (size_t i = 0; i < count; ++i) {
    wait_handles[i] = objects[i]->GetWaitHandle();
    assert_not_null(wait_handles[i]);

@@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                             : std::chrono::milliseconds::max();

  if (wait_type) {
    auto result = xe::threading::WaitAny(std::move(wait_handles),
    auto result = xe::threading::WaitAny(wait_handles, count,
                                         alertable ? true : false, timeout_ms);
    switch (result.first) {
      case xe::threading::WaitResult::kSuccess:

@@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
        return X_STATUS_UNSUCCESSFUL;
    }
  } else {
    auto result = xe::threading::WaitAll(std::move(wait_handles),
    auto result = xe::threading::WaitAll(wait_handles, count,
                                         alertable ? true : false, timeout_ms);
    switch (result) {
      case xe::threading::WaitResult::kSuccess:

@@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
}

object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
                                             void* native_ptr,
                                             int32_t as_type) {
                                             void* native_ptr, int32_t as_type,
                                             bool already_locked) {
  assert_not_null(native_ptr);

  // Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@@ -373,10 +374,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
  // We identify this by setting wait_list_flink to a magic value. When set,
  // wait_list_blink will hold a handle to our object.

  auto global_lock = xe::global_critical_region::AcquireDirect();
  if (!already_locked) {
    global_critical_region::mutex().lock();
  }

  auto header = reinterpret_cast<X_DISPATCH_HEADER*>(native_ptr);

  XObject* result;
  if (as_type == -1) {
    as_type = header->type;
  }

@@ -385,10 +388,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
    // Already initialized.
    // TODO: assert if the type of the object != as_type
    uint32_t handle = header->wait_list_blink;
    auto object = kernel_state->object_table()->LookupObject<XObject>(handle);

    result = kernel_state->object_table()
                 ->LookupObject<XObject>(handle, true)
                 .release();
    goto return_result;
    // TODO(benvanik): assert nothing has been changed in the struct.
    return object;
    // return object;
  } else {
    // First use, create new.
    // https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html

@@ -430,14 +435,22 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
      case 24:  // ThreadedDpcObject
      default:
        assert_always();
        return NULL;
        result = nullptr;
        goto return_result;
        // return NULL;
    }

    // Stash pointer in struct.
    // FIXME: This assumes the object contains a dispatch header (some don't!)
    StashHandle(header, object->handle());
    result = object;

    return object_ref<XObject>(object);
  return_result:
    if (!already_locked) {
      global_critical_region::mutex().unlock();
    }
    return object_ref<XObject>(result);
  }
}
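GetNativeObject now pairs a conditional lock() with a single unlock() behind the return_result label. The same conditional locking can be expressed with a small RAII guard, sketched here under the assumption that the global critical region is backed by a recursive mutex (the guard class is hypothetical, not part of this commit):

#include <mutex>

// Acquires the mutex only when the caller does not already hold it, so the
// same helper can be called from locked and unlocked paths.
class OptionalLockGuard {
 public:
  OptionalLockGuard(std::recursive_mutex& mutex, bool already_locked)
      : mutex_(mutex), owns_(!already_locked) {
    if (owns_) mutex_.lock();
  }
  ~OptionalLockGuard() {
    if (owns_) mutex_.unlock();
  }
  OptionalLockGuard(const OptionalLockGuard&) = delete;
  OptionalLockGuard& operator=(const OptionalLockGuard&) = delete;

 private:
  std::recursive_mutex& mutex_;
  bool owns_;
};

With such a guard, each early return releases the lock automatically and the goto label becomes unnecessary.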
@@ -192,10 +192,12 @@ class XObject {

  static object_ref<XObject> GetNativeObject(KernelState* kernel_state,
                                             void* native_ptr,
                                             int32_t as_type = -1);
                                             int32_t as_type = -1,
                                             bool already_locked = false);
  template <typename T>
  static object_ref<T> GetNativeObject(KernelState* kernel_state,
                                       void* native_ptr, int32_t as_type = -1);
                                       void* native_ptr, int32_t as_type = -1,
                                       bool already_locked = false);

 protected:
  bool SaveObject(ByteStream* stream);

@@ -362,9 +364,11 @@ object_ref<T> retain_object(T* ptr) {

template <typename T>
object_ref<T> XObject::GetNativeObject(KernelState* kernel_state,
                                       void* native_ptr, int32_t as_type) {
                                       void* native_ptr, int32_t as_type,
                                       bool already_locked) {
  return object_ref<T>(reinterpret_cast<T*>(
      GetNativeObject(kernel_state, native_ptr, as_type).release()));
      GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
          .release()));
}

}  // namespace kernel
@@ -15,6 +15,7 @@ project("xenia-ui-d3d12")
      "/Os",
      "/O1"
    })
  filter {}
  local_platform_files()
  files({
    "../shaders/bytecode/d3d12_5_1/*.h",

@@ -15,6 +15,7 @@ project("xenia-ui-vulkan")
      "/Os",
      "/O1"
    })
  filter {}
  includedirs({
    project_root.."/third_party/Vulkan-Headers/include",
  })

@@ -16,6 +16,8 @@ project("xenia-vfs")
      "/Os",
      "/O1"
    })
  filter {}

  recursive_platform_files()
  removefiles({"vfs_dump.cc"})

@@ -16,6 +16,8 @@ project("capstone")
      "/Os",
      "/O1"
    })
  filter {}

  includedirs({
    "capstone",
    "capstone/include",

@@ -13,6 +13,8 @@ project("fmt")
      "/Os",
      "/O1"
    })
  filter {}

  includedirs({
    "fmt/include",
  })

@@ -15,6 +15,8 @@ project("glslang-spirv")
      "/Os",
      "/O1"
    })
  filter {}

  files({
    "glslang/SPIRV/bitutils.h",
    "glslang/SPIRV/disassemble.cpp",

@@ -16,6 +16,7 @@ project("imgui")
      "/Os",
      "/O1"
    })
  filter{}
  files({
    "imgui/imconfig.h",
    "imgui/imgui.cpp",