Fixed Wine crash from use of NtSetEventBoostPriority

Add xe::clear_lowest_bit and use it in place of the shift + and-not sequence in some bit iteration code
Make is_allocated_ and is_enabled_ volatile in XmaContext
Preallocate the AVPacket buffer in XmaContext::Setup; the repeated reallocations of that buffer inside FFmpeg were showing up in profiles
Check is_enabled and is_allocated BEFORE locking an XmaContext; the XMA worker was spending most of its time locking and unlocking contexts
Removed XeDMAC and the dma:: namespace. It was a bad idea and I couldn't make it work in the end. Kept vastcpy and moved it to the memory namespace instead
Made the rest of global_critical_region's members static; they never needed an instance.
Removed ifdef'd-out code from ring_buffer.h
Added EventInfo struct to threading, added Event::Query to aid with implementing NtQueryEvent.
Removed the std::vector from WaitMultiple; instead use a fixed array of 64 handles that we populate. WaitForMultipleObjects cannot handle more than 64 objects.
Remove the XE_MSVC_OPTIMIZE_SMALL() use in x64_sequences; the x64 backend is now always size-optimized because of premake
Make global_critical_region_ static constexpr in shared_memory.h to get rid of 8 bytes of waste (an empty class is 1 byte, and alignment for the next member rounds that up to 8)
Move trace-related data to the tail of SharedMemory to keep more important data together
In IssueDraw, build an array of fetch constant addresses/sizes, then pre-lock the global lock before calling RequestRange for each of them, instead of locking and unlocking inside RequestRange for every one individually
Use a consistent access specifier (protected) for pm4_command_processor_declare
Devirtualize WriteOneRegisterFromRing.
Move ExecutePacket and ExecutePrimaryBuffer to pm4_command_buffer_x
Remove many redundant header inclusions across xenia-gpu
Minor microoptimization of ExecutePacketType0

Add TextureCache::RequestTextures for batch invocation of LoadTexturesData

Add TextureCache::LoadTexturesData for reducing the number of times we release and reacquire the global lock.
Ideally you should hold the global lock for as little time as possible, but if you are constantly acquiring and releasing it you are actually more likely to run into contention
Add an already_locked param to ObjectTable::LookupObject to help reduce lock acquire/release pairs (a sketch of this pattern follows the change list below)
Add missing checks to XAudioRegisterRenderDriverClient_entry. This is unlikely to fix anything; it was just an easy thing to do
Add NtQueryEvent system call implementation. I don't actually know of any games that need it.
Instead of using std::vector + push_back in KeWaitForMultipleObjects and xeNtWaitForMultipleObjectsEx, use a fixed-size array of 64 and track the count. More than 64 objects is not permitted by the kernel. The repeated reallocations from push_back were appearing unusually high in the profiler, but until now were masked by WaitForMultipleObjects' natural overhead
Pre-lock the global lock before looking up each handle for xeNtWaitForMultipleObjectsEx and KeWaitForMultipleObjects.
Pre-lock before looking up the signal and waiter in NtSignalAndWaitForSingleObjectEx
Add missing checks to NtWaitForMultipleObjectsEx
Support pre-locking in XObject::GetNativeObject
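A minimal sketch of the pre-locking pattern described above, assuming the usual kernel object-table helpers (kernel_state()->object_table(), LookupObject<XObject>) and the new already_locked parameter; the surrounding names, variables, and exact signature are illustrative, not copied from this commit:

// Hypothetical sketch: resolve every wait handle under a single acquisition
// of the global critical section instead of letting each LookupObject call
// acquire and release it on its own.
xenia_assert(count <= 64);  // the kernel rejects waits on more than 64 objects
object_ref<XObject> objects[64];
{
  auto global_lock = xe::global_critical_region::Acquire();
  for (uint32_t i = 0; i < count; ++i) {
    objects[i] = kernel_state()->object_table()->LookupObject<XObject>(
        handles[i], /*already_locked=*/true);
  }
}  // the lock is released once, after all handles have been resolved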
chss95cs@gmail.com 2022-10-08 09:55:17 -07:00
parent bae63b95c5
commit 8f7f7dc6ad
50 changed files with 722 additions and 788 deletions

View File

@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
// Allocate ffmpeg stuff:
av_packet_ = av_packet_alloc();
assert_not_null(av_packet_);
// chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for
// every packet; these allocations were causing RtlSubsegmentInitialize to
// show up in profiles
av_packet_->buf = av_buffer_alloc(128 * 1024);
// find the XMA2 audio decoder
av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
if (!av_codec_) {
@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}
bool XmaContext::Work() {
std::lock_guard<xe_mutex> lock(lock_);
if (!is_allocated() || !is_enabled()) {
if (!is_enabled() || !is_allocated()) {
return false;
}
{
std::lock_guard<xe_mutex> lock(lock_);
set_is_enabled(false);
set_is_enabled(false);
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
Decode(&data);
data.Store(context_ptr);
return true;
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
Decode(&data);
data.Store(context_ptr);
return true;
}
}
void XmaContext::Enable() {

View File

@ -201,8 +201,8 @@ class XmaContext {
uint32_t id_ = 0;
uint32_t guest_ptr_ = 0;
xe_mutex lock_;
bool is_allocated_ = false;
bool is_enabled_ = false;
volatile bool is_allocated_ = false;
volatile bool is_enabled_ = false;
// bool is_dirty_ = true;
// ffmpeg structures

View File

@ -1,348 +0,0 @@
#include "dma.h"
#include "logging.h"
#include "mutex.h"
#include "platform_win.h"
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
NtAlertThreadByThreadId);
template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
char buffer[1024];
sprintf_s(buffer, fmt, args...);
XELOGI("%s", buffer);
}
//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
#define XEDMALOG(...) static_cast<void>(0)
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
#if defined(__clang__)
XE_FORCEINLINE
static void mvdir64b(void* to, const void* from) {
__asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
}
#define _movdir64b mvdir64b
#endif
XE_FORCEINLINE
static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
XE_FORCEINLINE
static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
#if 0
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
#else
_movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i);
_movdir64b(dest3 + i, src3 + i);
_movdir64b(dest4 + i, src4 + i);
#endif
}
XE_MSVC_REORDER_BARRIER();
}
namespace xe::dma {
using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384StreamingAVX(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Movdir64M(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
_movdir64b(physaddr + i, rdmapping + i);
_movdir64b(physaddr + i + 1, rdmapping + i + 1);
}
if (i < num_written_lines) {
_movdir64b(physaddr + i, rdmapping + i);
}
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
VastCpyDispatch dispatch_to_use = nullptr;
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m;
} else {
XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx;
}
vastcpy_dispatch =
dispatch_to_use; // all future calls will go through our selected path
return vastcpy_dispatch(physaddr, rdmapping, written_length);
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
#define MAX_INFLIGHT_DMAJOBS 65536
#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
class XeDMACGeneric : public XeDMAC {
std::unique_ptr<xe::threading::Thread> thrd_;
XeDMAJob* jobs_ring_;
volatile std::atomic<uintptr_t> write_ptr_;
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
volatile std::atomic<uintptr_t> read_ptr_;
xe_mutex push_into_ring_lock_;
};
HANDLE gotjob_event;
void WorkerWait();
public:
virtual ~XeDMACGeneric() {}
void WorkerThreadMain();
XeDMACGeneric() {
threading::Thread::CreationParameters crparams;
crparams.create_suspended = true;
crparams.initial_priority = threading::ThreadPriority::kNormal;
crparams.stack_size = 65536;
gotjob_event = CreateEventA(nullptr, false, false, nullptr);
thrd_ = std::move(threading::Thread::Create(
crparams, [this]() { this->WorkerThreadMain(); }));
jobs_ring_ = (XeDMAJob*)_aligned_malloc(
MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
write_ptr_ = 0;
read_ptr_ = 0;
thrd_->Resume();
}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
// std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
{
job->dmac_specific_ = (uintptr_t)dmacevent;
jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
write_ptr_++;
SetEvent(gotjob_event);
}
return (DMACJobHandle)dmacevent;
}
virtual void WaitJobDone(DMACJobHandle handle) override {
while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
// NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
// while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
// WAIT_TIMEOUT) {
// ;
}
//}
// SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
CloseHandle((HANDLE)handle);
}
virtual void WaitForIdle() override {
while (write_ptr_ != read_ptr_) {
threading::MaybeYield();
}
}
};
void XeDMACGeneric::WorkerWait() {
constexpr unsigned NUM_PAUSE_SPINS = 2048;
constexpr unsigned NUM_YIELD_SPINS = 8;
#if 0
for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
if (write_ptr_ == read_ptr_) {
_mm_pause();
} else {
break;
}
}
for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
if (write_ptr_ == read_ptr_) {
threading::MaybeYield();
} else {
break;
}
}
LARGE_INTEGER yield_execution_delay{};
yield_execution_delay.QuadPart =
-2000; //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
while (write_ptr_ == read_ptr_) {
NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
}
#else
do {
if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
while (write_ptr_ == read_ptr_) {
_mm_pause();
}
}
} while (write_ptr_ == read_ptr_);
#endif
}
void XeDMACGeneric::WorkerThreadMain() {
while (true) {
this->WorkerWait();
XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
swcache::ReadFence();
if (current_job.precall) {
current_job.precall(&current_job);
}
size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
size_t line_rem = current_job.size - line_rounded;
vastcpy(current_job.destination, current_job.source,
static_cast<uint32_t>(line_rounded));
if (line_rem) {
__movsb(current_job.destination + line_rounded,
current_job.source + line_rounded, line_rem);
}
if (current_job.postcall) {
current_job.postcall(&current_job);
}
read_ptr_++;
swcache::WriteFence();
SetEvent((HANDLE)current_job.dmac_specific_);
}
}
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
} // namespace xe::dma

View File

@ -1,47 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_DMA_H_
#define XENIA_BASE_DMA_H_
#include "memory.h"
#include "threading.h"
namespace xe::dma {
struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
//threading::Event* signal_on_done;
uintptr_t dmac_specific_;
uint8_t* destination;
uint8_t* source;
size_t size;
DmaPrecall precall;
DmaPostcall postcall;
void* userdata1;
void* userdata2;
};
using DMACJobHandle = uint64_t;
class XeDMAC {
public:
virtual ~XeDMAC() {}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
virtual void WaitJobDone(DMACJobHandle handle) = 0;
virtual void WaitForIdle() = 0;
};
XeDMAC* CreateDMAC();
// must be divisible by cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace xe::dma
#endif // XENIA_BASE_DMA_H_

View File

@ -44,8 +44,18 @@ template <typename T>
constexpr bool is_pow2(T value) {
return (value & (value - 1)) == 0;
}
/*
Use this in place of the shift + and-not sequence currently used in bit
iteration code. It is more efficient because it does not introduce a
dependency on the result of the previous bit-scan operation. The shift +
and-not sequence does get translated to a single instruction (the bit test
and reset instruction), but this expression can execute alongside the scan.
*/
template<typename T>
constexpr T clear_lowest_bit(T value) {
static_assert(std::is_integral_v<T>);
// Equivalent to value & (value - 1): clears the lowest set bit of value.
return (value - static_cast<T>(1)) & value;
}
// Rounds up the given value to the given alignment.
template <typename T>
constexpr T align(T value, T alignment) {
return (value + alignment - 1) & ~(alignment - 1);
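A small before/after sketch of the bit-iteration pattern this helper targets, mirroring the loops changed later in this commit (mask and ProcessBit are illustrative names):

uint32_t bits = mask;
uint32_t index;
while (xe::bit_scan_forward(bits, &index)) {
  // old form: bits &= ~(uint32_t(1) << index);  // depends on the scan result
  bits = xe::clear_lowest_bit(bits);  // no dependency on the scanned index
  ProcessBit(index);
}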

View File

@ -10,6 +10,7 @@
#include "xenia/base/memory.h"
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "xenia/base/logging.h"
#if XE_ARCH_ARM64
#include <arm_neon.h>
@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() {
cvars::writable_executable_memory;
}
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
#if defined(__clang__)
XE_FORCEINLINE
static void mvdir64b(void* to, const void* from) {
__asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
}
#define _movdir64b mvdir64b
#endif
XE_FORCEINLINE
static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
XE_FORCEINLINE
static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
_movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i);
_movdir64b(dest3 + i, src3 + i);
_movdir64b(dest4 + i, src4 + i);
}
XE_MSVC_REORDER_BARRIER();
}
using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384StreamingAVX(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Movdir64M(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
_movdir64b(physaddr + i, rdmapping + i);
_movdir64b(physaddr + i + 1, rdmapping + i + 1);
}
if (i < num_written_lines) {
_movdir64b(physaddr + i, rdmapping + i);
}
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
VastCpyDispatch dispatch_to_use = nullptr;
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m;
} else {
XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx;
}
vastcpy_dispatch =
dispatch_to_use; // all future calls will go through our selected path
return vastcpy_dispatch(physaddr, rdmapping, written_length);
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
} // namespace memory
// TODO(benvanik): fancy AVX versions.

View File

@ -17,9 +17,9 @@
#include <string>
#include <string_view>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/platform.h"
namespace xe {
namespace memory {
@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2);
return hash_combine(seed, vs...);
}
// must be divisible by cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace memory
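A hedged usage sketch of the relocated routine; the tail handling mirrors what the removed XeDMACGeneric worker did with __movsb, and the variable names are illustrative:

// Copy a written range back into guest physical memory. vastcpy only handles
// whole cache lines, so the caller copies any remainder itself.
uint8_t* src = static_cast<uint8_t*>(readback_mapping);
uint32_t line_bytes = written_length & ~uint32_t(XE_HOST_CACHE_LINE_SIZE - 1);
xe::memory::vastcpy(physaddr, src, line_bytes);
if (uint32_t tail_bytes = written_length - line_bytes) {
  std::memcpy(physaddr + line_bytes, src + line_bytes, tail_bytes);  // <cstring>
}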

View File

@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// };
class global_critical_region {
public:
constexpr global_critical_region() {}
static global_mutex_type& mutex();
// Acquires a lock on the global critical section.
@ -144,18 +145,18 @@ class global_critical_region {
}
// Acquires a lock on the global critical section.
inline global_unique_lock_type Acquire() {
static inline global_unique_lock_type Acquire() {
return global_unique_lock_type(mutex());
}
// Acquires a deferred lock on the global critical section.
inline global_unique_lock_type AcquireDeferred() {
static inline global_unique_lock_type AcquireDeferred() {
return global_unique_lock_type(mutex(), std::defer_lock);
}
// Tries to acquire a lock on the glboal critical section.
// Check owns_lock() to see if the lock was successfully acquired.
inline global_unique_lock_type TryAcquire() {
static inline global_unique_lock_type TryAcquire() {
return global_unique_lock_type(mutex(), std::try_to_lock);
}
};
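With the accessors static, callers can take the lock without holding a global_critical_region member; a minimal usage sketch:

// No instance required: the constexpr constructor is kept only for the
// remaining members such as SharedMemory::global_critical_region_.
auto lock = xe::global_critical_region::Acquire();
// ... touch state guarded by the global critical section ...
// The returned unique_lock releases the mutex when it goes out of scope.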

View File

@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
xenia_assert(this->capacity_ >= 4);
ring_size_t next_read_offset = read_offset + 4;
#if 0
size_t zerotest = next_read_offset - this->capacity_;
// unpredictable branch, use bit arith instead
// todo: it would be faster to use lzcnt, but we need to figure out if all
// machines we support support it
next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
next_read_offset = 0;
// todo: maybe prefetch next? or should that happen much earlier?
}
#endif
this->read_offset_ = next_read_offset;
unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset];
return xe::byte_swap(ring_value);
}
} // namespace xe

View File

@ -271,7 +271,10 @@ inline std::pair<WaitResult, size_t> WaitAny(
return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable,
timeout);
}
struct EventInfo {
uint32_t type;
uint32_t state;
};
// Models a Win32-like event object.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx
class Event : public WaitHandle {
@ -299,6 +302,8 @@ class Event : public WaitHandle {
// the nonsignaled state after releasing the appropriate number of waiting
// threads.
virtual void Pulse() = 0;
virtual EventInfo Query() = 0;
#if XE_PLATFORM_WIN32 ==1
//SetEvent, but if there is a waiter we immediately transfer execution to it
virtual void SetBoostPriority() = 0;

View File

@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer);
namespace xe {
namespace threading {
@ -255,20 +255,20 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
size_t wait_handle_count,
bool wait_all, bool is_alertable,
std::chrono::milliseconds timeout) {
std::vector<HANDLE> handles(
wait_handle_count); // max handles is like 64, so it would make more
// sense to just do a fixed size array here
xenia_assert(wait_handle_count <= 64);
HANDLE handles[64];
for (size_t i = 0; i < wait_handle_count; ++i) {
handles[i] = wait_handles[i]->native_handle();
}
DWORD result = WaitForMultipleObjectsEx(
DWORD(handles.size()), handles.data(), wait_all ? TRUE : FALSE,
static_cast<DWORD>(wait_handle_count), handles, wait_all ? TRUE : FALSE,
DWORD(timeout.count()), is_alertable ? TRUE : FALSE);
if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) {
if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) {
return std::pair<WaitResult, size_t>(WaitResult::kSuccess,
result - WAIT_OBJECT_0);
} else if (result >= WAIT_ABANDONED_0 &&
result < WAIT_ABANDONED_0 + handles.size()) {
result < WAIT_ABANDONED_0 + wait_handle_count) {
return std::pair<WaitResult, size_t>(WaitResult::kAbandoned,
result - WAIT_ABANDONED_0);
}
@ -293,7 +293,15 @@ class Win32Event : public Win32Handle<Event> {
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
void SetBoostPriority() override {
// no previous state for boostpriority
NtSetEventBoostPriorityPointer.invoke(handle_);
// Boost priority is unimplemented under wine probably because it's not used
// anywhere in user mode except by us. Maybe some Windows internals uses it
// see:
// https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228
if (NtSetEventBoostPriorityPointer) {
NtSetEventBoostPriorityPointer.invoke(handle_);
} else {
NtSetEventPointer.invoke(handle_, nullptr);
}
}
#else
void Set() override { SetEvent(handle_); }
@ -305,6 +313,11 @@ class Win32Event : public Win32Handle<Event> {
SetEvent(handle_);
}
#endif
EventInfo Query() override {
EventInfo result{};
NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
return result;
}
};
std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
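A hedged sketch of what the new accessor returns, assuming the EVENT_BASIC_INFORMATION layout that NtQueryEvent reports (type 0 = notification/manual-reset, 1 = synchronization/auto-reset); the usage itself is illustrative, not taken from the diff:

auto event = xe::threading::Event::CreateManualResetEvent(false);
event->Set();
xe::threading::EventInfo info = event->Query();
// Expected: info.type == 0 for a manual-reset event and info.state != 0
// because the event is currently signaled.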

View File

@ -39,7 +39,7 @@
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"
XE_MSVC_OPTIMIZE_SMALL()
DEFINE_bool(use_fast_dot_product, false,
"Experimental optimization, much shorter sequence on dot products, "
"treating inf as overflow instead of using mcxsr"

View File

@ -9,23 +9,16 @@
#include "xenia/gpu/command_processor.h"
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/base/ring_buffer.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/graphics_system.h"
#include "xenia/gpu/sampler_info.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
write_ptr_index_(0) {
assert_not_null(write_ptr_index_event_);
#if 0
dmac_ = dma::CreateDMAC();
#else
dmac_ = nullptr;
#endif
}
CommandProcessor::~CommandProcessor() = default;
@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
void CommandProcessor::ReturnFromWait() {}
uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) {
SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get
// all commands.
if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
uint32_t title_id = kernel_state_->GetExecutableModule()
? kernel_state_->GetExecutableModule()->title_id()
: 0;
auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
auto path = trace_stream_path_ / file_name;
trace_writer_.Open(path, title_id);
InitializeTrace();
}
#endif
// Adjust pointer base.
uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
primary_buffer_size_);
reader_.set_read_offset(read_index * sizeof(uint32_t));
reader_.set_write_offset(write_index * sizeof(uint32_t));
// prefetch the wraparound range
// it likely is already in L3 cache, but in a zen system it may be another
// chiplets l3
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
GetCurrentRingReadCount());
do {
if (!ExecutePacket()) {
// This probably should be fatal - but we're going to continue anyways.
XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
OnPrimaryBufferEnd();
trace_writer_.WritePrimaryBufferEnd();
reader_ = old_reader;
return write_index;
}
void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_)
RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
reader_.set_write_offset(count * sizeof(uint32_t));
do {
if (!ExecutePacket()) {
XELOGE("**** ExecutePacket: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
reader_ = old_reader;
}
void CommandProcessor::InitializeTrace() {
// Write the initial register values, to be loaded directly into the

View File

@ -19,11 +19,8 @@
#include <string>
#include <vector>
#include "xenia/base/dma.h"
#include "xenia/base/ring_buffer.h"
#include "xenia/base/threading.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/xthread.h"
@ -82,7 +79,6 @@ class CommandProcessor {
CommandProcessor(GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state);
virtual ~CommandProcessor();
dma::XeDMAC* GetDMAC() const { return dmac_; }
uint32_t counter() const { return counter_; }
void increment_counter() { counter_++; }
@ -135,7 +131,7 @@ class CommandProcessor {
void UpdateWritePointer(uint32_t value);
void ExecutePacket(uint32_t ptr, uint32_t count);
bool is_paused() const { return paused_; }
void Pause();
@ -172,7 +168,7 @@ class CommandProcessor {
uint32_t num_registers);
XE_NOINLINE
virtual void WriteOneRegisterFromRing(
void WriteOneRegisterFromRing(
uint32_t base,
uint32_t
num_times); // repeatedly write a value to one register, presumably a
@ -221,7 +217,7 @@ class CommandProcessor {
virtual void PrepareForWait();
virtual void ReturnFromWait();
uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
virtual void OnPrimaryBufferEnd() {}
#include "pm4_command_processor_declare.h"
@ -300,7 +296,6 @@ class CommandProcessor {
reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
uint32_t gamma_ramp_rw_component_ = 0;
dma::XeDMAC* dmac_ = nullptr;
};
} // namespace gpu

View File

@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// validity is tracked.
const Shader::ConstantRegisterMap& constant_map_vertex =
vertex_shader->constant_register_map();
for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
++i) {
uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
uint32_t j;
while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
vfetch_bits_remaining &= ~(uint32_t(1) << j);
uint32_t vfetch_index = i * 32 + j;
const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
switch (vfetch_constant.type) {
case xenos::FetchConstantType::kVertex:
break;
case xenos::FetchConstantType::kInvalidVertex:
if (cvars::gpu_allow_invalid_fetch_constants) {
{
uint32_t vfetch_addresses[96];
uint32_t vfetch_sizes[96];
uint32_t vfetch_current_queued = 0;
for (uint32_t i = 0;
i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
uint32_t vfetch_bits_remaining =
constant_map_vertex.vertex_fetch_bitmap[i];
uint32_t j;
while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
uint32_t vfetch_index = i * 32 + j;
const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
switch (vfetch_constant.type) {
case xenos::FetchConstantType::kVertex:
break;
}
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
"This is incorrect behavior, but you can try bypassing this by "
"launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
default:
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
case xenos::FetchConstantType::kInvalidVertex:
if (cvars::gpu_allow_invalid_fetch_constants) {
break;
}
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
"type! "
"This is incorrect behavior, but you can try bypassing this by "
"launching Xenia with "
"--gpu_allow_invalid_fetch_constants=true.",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
default:
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) is completely "
"invalid!",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
}
vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
}
if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
vfetch_constant.size << 2)) {
XELOGE(
"Failed to request vertex buffer at 0x{:08X} (size {}) in the "
"shared memory",
vfetch_constant.address << 2, vfetch_constant.size << 2);
return false;
}
if (vfetch_current_queued) {
// So far I have never seen vfetch_current_queued > 4; 1 is most common and
// 2 happens occasionally (did not test many games though).
// Pre-acquire the critical region so we're not repeatedly re-acquiring it
// in RequestRange.
auto shared_memory_request_range_hoisted =
global_critical_region::Acquire();
for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
vfetch_sizes[i] << 2)) {
XELOGE(
"Failed to request vertex buffer at 0x{:08X} (size {}) in the "
"shared memory",
vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
return false;
}
}
}
}
// Gather memexport ranges and ensure the heaps for them are resident, and
// also load the data surrounding the export and to fill the regions that
// won't be modified by the shaders.
@ -3076,24 +3097,6 @@ void D3D12CommandProcessor::InitializeTrace() {
shared_memory_->InitializeTraceCompleteDownloads();
}
}
static void DmaPrefunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = job->size;
void* readback_mapping;
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
xenia_assert(SUCCEEDED(mapres));
job->source = (uint8_t*)readback_mapping;
}
static void DmaPostfunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_write_range = {};
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
readback_buffer->Unmap(0, &readback_write_range);
}
bool D3D12CommandProcessor::IssueCopy() {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@ -3102,7 +3105,6 @@ bool D3D12CommandProcessor::IssueCopy() {
if (!BeginSubmission(true)) {
return false;
}
if (!cvars::d3d12_readback_resolve) {
uint32_t written_address, written_length;
return render_target_cache_->Resolve(*memory_, *shared_memory_,
@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
readback_buffer, 0, shared_memory_buffer, written_address,
written_length);
if (AwaitAllQueueOperationsCompletion()) {
#if 1
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = written_length;
void* readback_mapping;
if (SUCCEEDED(
readback_buffer->Map(0, &readback_range, &readback_mapping))) {
// chrispy: this memcpy needs to be optimized as much as possible
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = written_length;
void* readback_mapping;
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
&readback_mapping))) {
// chrispy: this memcpy needs to be optimized as much as possible
auto physaddr = memory_->TranslatePhysical(written_address);
dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
}
#else
dma::XeDMAJob job{};
job.destination = memory_->TranslatePhysical(written_address);
job.size = written_length;
job.source = nullptr;
job.userdata1 = (void*)readback_buffer;
job.precall = DmaPrefunc;
job.postcall = DmaPostfunc;
readback_available_ = GetDMAC()->PushDMAJob(&job);
#endif
auto physaddr = memory_->TranslatePhysical(written_address);
memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
}
}
}
}
@ -3833,7 +3822,8 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
uint32_t user_clip_plane_index;
while (xe::bit_scan_forward(user_clip_planes_remaining,
&user_clip_plane_index)) {
user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index);
user_clip_planes_remaining =
xe::clear_lowest_bit(user_clip_planes_remaining);
const float* user_clip_plane =
&regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32;
if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane,
@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
uint32_t textures_remaining = used_texture_mask;
uint32_t texture_index;
while (xe::bit_scan_forward(textures_remaining, &texture_index)) {
textures_remaining &= ~(uint32_t(1) << texture_index);
textures_remaining = xe::clear_lowest_bit(textures_remaining);
uint32_t& texture_signs_uint =
system_constants_.texture_swizzled_signs[texture_index >> 2];
uint32_t texture_signs_shift = (texture_index & 3) * 8;
@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) {
return nullptr;
}
#if 1
if (readback_available_) {
GetDMAC()->WaitJobDone(readback_available_);
readback_available_ = 0;
}
#endif
size = xe::align(size, kReadbackBufferSizeIncrement);
if (size > readback_buffer_size_) {
const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();

View File

@ -21,7 +21,6 @@
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/dma.h"
#include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@ -50,8 +49,10 @@ struct MemExportRange {
uint32_t size_dwords;
};
class D3D12CommandProcessor final : public CommandProcessor {
public:
protected:
#include "../pm4_command_processor_declare.h"
public:
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state);
~D3D12CommandProcessor();
@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
uint32_t base,
uint32_t num_registers);
XE_NOINLINE
virtual void WriteOneRegisterFromRing(uint32_t base,
uint32_t num_times) override;
void WriteOneRegisterFromRing(uint32_t base,
uint32_t num_times);
XE_FORCEINLINE
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
ID3D12Resource* readback_buffer_ = nullptr;
dma::DMACJobHandle readback_available_ = 0;
uint32_t readback_buffer_size_ = 0;
std::atomic<bool> pix_capture_requested_ = false;

View File

@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
bool D3D12SharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_page_ranges) {
uint32_t num_upload_page_ranges) {
if (!num_upload_page_ranges) {
return true;
}
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
command_processor_.SubmitBarriers();
auto& command_list = command_processor_.GetDeferredCommandList();
// for (auto upload_range : upload_page_ranges) {
for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
for (uint32_t i = 0; i < num_upload_page_ranges; ++i) {
auto& upload_range = upload_page_ranges[i];
uint32_t upload_range_start = upload_range.first;
uint32_t upload_range_length = upload_range.second;
@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges(
uint32_t(upload_buffer_size), false, false);
if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
dma::vastcpy(
memory::vastcpy(
upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
static_cast<uint32_t>(upload_buffer_size));

View File

@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
uint32_t num_ranges) override;
private:
D3D12CommandProcessor& command_processor_;

View File

@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
uint32_t textures_remaining = used_texture_mask;
uint32_t index;
while (xe::bit_scan_forward(textures_remaining, &index)) {
textures_remaining &= ~(uint32_t(1) << index);
textures_remaining = xe::clear_lowest_bit(textures_remaining);
const TextureBinding* binding = GetValidTextureBinding(index);
if (!binding) {
continue;

View File

@ -9,21 +9,14 @@
#include "xenia/gpu/draw_util.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/texture_cache.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"
// Very prominent in 545407F2.

View File

@ -16,7 +16,6 @@
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"

View File

@ -1,9 +1,14 @@
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) XE_RESTRICT;
virtual bool ExecutePacket();
XE_NOINLINE
public:
void ExecutePacket(uint32_t ptr, uint32_t count);
protected:
XE_NOINLINE
bool ExecutePacketType0( uint32_t packet) XE_RESTRICT;
XE_NOINLINE
bool ExecutePacketType1( uint32_t packet) XE_RESTRICT;
@ -103,4 +108,7 @@ uint32_t GetCurrentRingReadCount();
XE_NOINLINE
XE_COLD
bool ExecutePacketType3_CountOverflow(uint32_t count);
bool ExecutePacketType3_CountOverflow(uint32_t count);
XE_NOINLINE
XE_COLD
bool ExecutePacketType0_CountOverflow(uint32_t count);

View File

@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
}
}
XE_NOINLINE
XE_COLD
bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
COMMAND_PROCESSOR::GetCurrentRingReadCount(),
count * sizeof(uint32_t));
return false;
}
/*
Todo: optimize this function; it and ExecutePacketType3 are the most
frequently called functions for PM4.
*/
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
// Type-0 packet.
// Write count registers in sequence to the registers starting at
// (base_index << 2).
uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
XELOGE(
"ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
return false;
}
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
uint32_t base_index = (packet & 0x7FFF);
uint32_t write_one_reg = (packet >> 15) & 0x1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
count * sizeof(uint32_t)) {
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
if (!write_one_reg) {
COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
uint32_t base_index = (packet & 0x7FFF);
uint32_t write_one_reg = (packet >> 15) & 0x1;
if (!write_one_reg) {
COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
count);
} else {
COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
}
trace_writer_.WritePacketEnd();
return true;
} else {
COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
}
trace_writer_.WritePacketEnd();
return true;
}
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
XE_NOINLINE
static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
/*
Todo: subtract the values from each other twice with the operands swapped and
build a mask from the sign bits, then use the wait_info value to select the
bits that implement the condition. If neither subtraction has the sign bit
set, the values are equal.
*/
bool matched = false;
switch (wait_info & 0x7) {
case 0x0: // Never.
@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
return true;
}
/*
todo: shouldn't this do something?
*/
bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
uint32_t packet, uint32_t count) XE_RESTRICT {
// selective invalidation of state pointers
@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
return true;
}
uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) {
SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get
// all commands.
if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
uint32_t title_id = kernel_state_->GetExecutableModule()
? kernel_state_->GetExecutableModule()->title_id()
: 0;
auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
auto path = trace_stream_path_ / file_name;
trace_writer_.Open(path, title_id);
InitializeTrace();
}
#endif
// Adjust pointer base.
uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
primary_buffer_size_);
reader_.set_read_offset(read_index * sizeof(uint32_t));
reader_.set_write_offset(write_index * sizeof(uint32_t));
// prefetch the wraparound range
// it likely is already in L3 cache, but in a zen system it may be another
// chiplets l3
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
GetCurrentRingReadCount());
do {
if (!COMMAND_PROCESSOR::ExecutePacket()) {
// This probably should be fatal - but we're going to continue anyways.
XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
COMMAND_PROCESSOR::OnPrimaryBufferEnd();
trace_writer_.WritePrimaryBufferEnd();
reader_ = old_reader;
return write_index;
}
void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) {
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_)
RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
reader_.set_write_offset(count * sizeof(uint32_t));
do {
if (!COMMAND_PROCESSOR::ExecutePacket()) {
XELOGE("**** ExecutePacket: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
reader_ = old_reader;
}

View File

@ -11,7 +11,6 @@
#define XENIA_GPU_SAMPLER_INFO_H_
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -24,7 +24,6 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -13,13 +13,6 @@
#include <cmath>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -14,12 +14,9 @@
#include <cstddef>
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
namespace xe {

View File

@ -9,15 +9,9 @@
#include "xenia/gpu/shader_translator.h"
#include <algorithm>
#include <cstdarg>
#include <cstring>
#include <set>
#include <string>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/gpu_flags.h"
namespace xe {

View File

@ -11,16 +11,7 @@
#define XENIA_GPU_SHADER_TRANSLATOR_H_
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -10,15 +10,11 @@
#include "xenia/gpu/shared_memory.h"
#include <algorithm>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/bit_range.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {

View File

@ -10,12 +10,6 @@
#ifndef XENIA_GPU_SHARED_MEMORY_H_
#define XENIA_GPU_SHARED_MEMORY_H_
#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>
#include "xenia/base/mutex.h"
#include "xenia/memory.h"
namespace xe {
@ -141,7 +135,7 @@ class SharedMemory {
// overall bounds of pages to be uploaded.
virtual bool UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) = 0;
uint32_t num_upload_ranges) = 0;
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
return trace_download_ranges_;
@ -183,14 +177,10 @@ class SharedMemory {
FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
upload_ranges_;
// GPU-written memory downloading for traces. <Start address, length>.
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
uint32_t trace_download_page_count_ = 0;
// Mutex between the guest memory subsystem and the command processor, to be
// locked when checking or updating validity of pages/ranges and when firing
// watches.
xe::global_critical_region global_critical_region_;
static constexpr xe::global_critical_region global_critical_region_{};
// ***************************************************************************
// Things below should be fully protected by global_critical_region.
@ -266,6 +256,11 @@ class SharedMemory {
uint32_t watch_node_current_pool_allocated_ = 0;
WatchRange* watch_range_first_free_ = nullptr;
WatchNode* watch_node_first_free_ = nullptr;
// GPU-written memory downloading for traces. <Start address, length>.
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
uint32_t trace_download_page_count_ = 0;
// Triggers the watches (global and per-range), removing triggered range
// watches.
void FireWatches(uint32_t page_first, uint32_t page_last,
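An illustrative check of the size claim from the commit message: an empty member occupies a byte that alignment rounds up to a full pointer-sized slot on a typical 64-bit target. The struct names are made up for the demonstration:

struct Empty {};
struct WithMember { Empty e; void* next; };  // empty member still takes storage
struct Without { void* next; };
static_assert(sizeof(Without) == sizeof(void*),
              "no overhead without the empty member");
static_assert(sizeof(WithMember) == sizeof(Without) + sizeof(void*),
              "the empty member plus padding wastes a pointer-sized slot");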

View File

@ -9,21 +9,11 @@
#include "xenia/gpu/texture_cache.h"
#include <algorithm>
#include <cstdint>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"
DEFINE_int32(
draw_resolution_scale_x, 1,
@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
uint32_t bindings_changed = 0;
uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
uint32_t index = 0;
Texture* textures_to_load[64];  // 32 fetch constant bits; each binding may
// have both an unsigned and a signed
// texture, so at most 64 entries
uint32_t num_textures_to_load = 0;
while (xe::bit_scan_forward(textures_remaining, &index)) {
uint32_t index_bit = UINT32_C(1) << index;
textures_remaining &= ~index_bit;
textures_remaining = xe::clear_lowest_bit(textures_remaining);
TextureBinding& binding = texture_bindings_[index];
const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);
@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
binding.texture_signed = nullptr;
}
if (load_unsigned_data && binding.texture != nullptr) {
LoadTextureData(*binding.texture);
textures_to_load[num_textures_to_load++] = binding.texture;
}
if (load_signed_data && binding.texture_signed != nullptr) {
LoadTextureData(*binding.texture_signed);
textures_to_load[num_textures_to_load++] = binding.texture_signed;
}
}
LoadTexturesData(textures_to_load, num_textures_to_load);
if (bindings_changed) {
UpdateTextureBindingsImpl(bindings_changed);
}
@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
texture->LogAction("Created");
return texture;
}
void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
assert_true(n_textures <= 64);
if (n_textures < 2) {
if (!n_textures) {
return;
} else {
LoadTextureData(*textures[0]);
return;
}
}
uint64_t index_base_outdated = 0;
uint64_t index_mips_outdated = 0;
uint32_t nkept = 0;
{
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* current = textures[i];
auto base_outdated = current->base_outdated(global_lock);
auto mips_outdated = current->mips_outdated(global_lock);
index_base_outdated |= static_cast<uint64_t>(base_outdated) << i;
index_mips_outdated |= static_cast<uint64_t>(mips_outdated) << i;
if (!base_outdated && !mips_outdated) {
textures[i] = nullptr;
} else {
nkept++;
}
}
}
if (nkept == 0) {
return;
}
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* p_texture = textures[i];
if (!p_texture) {
continue;
}
textures[i] = nullptr;
Texture& texture = *p_texture;
TextureKey texture_key = texture.key();
// The implementation may load multiple blocks at once via accesses of up to
// 128 bits (R32G32B32A32_UINT), so the size is aligned to this value to make
// sure that if the texture is small (especially if it's linear), the last
// blocks won't be cut off (hosts may return 0, 0, 0, 0 for the whole
// R32G32B32A32_UINT access for a non-16-aligned tail even if 1...15 bytes are
// actually provided for it).
// Request uploading of the texture data to the shared memory.
// This is also necessary when resolution scaling is used - the texture
// cache relies on shared memory for invalidation of both unscaled and
// scaled textures. Plus a texture may be unscaled partially, when only a
// portion of its pages is invalidated, in this case we'll need the texture
// from the shared memory to load the unscaled parts.
// TODO(Triang3l): Load unscaled parts.
bool base_resolved = texture.GetBaseResolved();
if (index_base_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.base_page << 12,
xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &base_resolved)) {
continue;
}
}
bool mips_resolved = texture.GetMipsResolved();
if (index_mips_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.mip_page << 12,
xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
continue;
}
}
if (texture_key.scaled_resolve) {
// Make sure all the scaled resolve memory is resident and accessible from
// the shader, including any possible padding that hasn't yet been touched
// by an actual resolve, but is still included in the texture size, so the
// GPU won't be trying to access unmapped memory.
if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
texture.GetGuestBaseSize(), 4)) {
continue;
}
if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
texture.GetGuestMipsSize(), 4)) {
continue;
}
}
// Actually load the texture data.
if (!LoadTextureDataFromResidentMemoryImpl(
texture, (index_base_outdated & (1ULL << i)) != 0,
(index_mips_outdated & (1ULL << i)) != 0)) {
continue;
}
// Update the source of the texture (resolve vs. CPU or memexport) for
// purposes of handling piecewise gamma emulation via sRGB and for
// resolution scale in sampling offsets.
if (!texture_key.scaled_resolve) {
texture.SetBaseResolved(base_resolved);
texture.SetMipsResolved(mips_resolved);
}
// Re-queue this texture for MakeUpToDateAndWatch in the locked pass below.
textures[i] = &texture;
}
{
auto crit = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
auto texture = textures[i];
if (!texture) {
continue;
}
// Mark the ranges as uploaded and watch them. This is needed for scaled
// resolves as well to detect when the CPU wants to reuse the memory for a
// regular texture or a vertex buffer, and thus the scaled resolve version
// is not up to date anymore.
texture->MakeUpToDateAndWatch(crit);
texture->LogAction("Loaded");
}
}
}
bool TextureCache::LoadTextureData(Texture& texture) {
// Check what needs to be uploaded.
bool base_outdated, mips_outdated;
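LoadTexturesData above restructures the per-texture path so the global critical region is explicitly acquired twice per batch (once to classify what is outdated, once to mark the survivors up to date and watch them) rather than a lock/unlock pair per texture for those steps. A stripped-down, self-contained sketch of that shape with illustrative types (not the actual texture cache interfaces):

#include <cstdint>
#include <mutex>

struct Item {
  bool outdated = true;
  bool loaded = false;
};

static std::mutex g_lock;  // stand-in for the global critical region

void ProcessBatch(Item** items, uint32_t count) {
  {
    std::lock_guard<std::mutex> lock(g_lock);  // one acquisition for the batch
    for (uint32_t i = 0; i < count; ++i) {
      if (!items[i]->outdated) items[i] = nullptr;  // already up to date
    }
  }
  for (uint32_t i = 0; i < count; ++i) {
    if (items[i]) items[i]->loaded = true;  // expensive per-item work, unlocked
  }
  {
    std::lock_guard<std::mutex> lock(g_lock);  // one acquisition to publish
    for (uint32_t i = 0; i < count; ++i) {
      if (items[i]) items[i]->outdated = false;  // mark up to date / watch
    }
  }
}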

View File

@ -559,6 +559,7 @@ class TextureCache {
return load_shader_info_[load_shader_index];
}
bool LoadTextureData(Texture& texture);
void LoadTexturesData(Texture** textures, uint32_t n_textures);
// Writes the texture data (for base, mips or both - but not neither) from the
// shared memory or the scaled resolve memory. The shared memory management is
// done outside this function, the implementation just needs to load the data

View File

@ -8,14 +8,7 @@
*/
#include "xenia/gpu/texture_info.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/xxhash.h"
namespace xe {

View File

@ -13,7 +13,6 @@
#include <array>
#include <cstring>
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/gpu/xenos.h"
namespace xe {

View File

@ -8,13 +8,6 @@
*/
#include "xenia/gpu/texture_util.h"
#include <algorithm>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
namespace xe {
namespace gpu {
namespace texture_util {

View File

@ -10,11 +10,6 @@
#ifndef XENIA_GPU_UCODE_H_
#define XENIA_GPU_UCODE_H_
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/xenos.h"
// The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,

View File

@ -46,6 +46,9 @@ namespace gpu {
namespace vulkan {
class VulkanCommandProcessor final : public CommandProcessor {
protected:
#include "../pm4_command_processor_declare.h"
public:
// Single-descriptor layouts for use within a single frame.
enum class SingleTransientDescriptorLayout {
@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
kStorageBufferCompute,
kCount,
};
#include "../pm4_command_processor_declare.h"
class ScratchBufferAcquisition {
public:
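Placing the include directly under protected: at the top of the class pins the access level of whatever pm4_command_processor_declare.h expands to: since the file expands into member declarations, the access specifier in force at the point of inclusion is the one those members receive. An illustrative fragment of the pattern (not the real header contents):

class ExampleCommandProcessor : public CommandProcessor {
 protected:
#include "../pm4_command_processor_declare.h"  // members declared as protected

 public:
  // Backend-specific interface follows...
};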

View File

@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
bool VulkanSharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) {
uint32_t num_upload_ranges) {
if (!num_upload_ranges) {
return true;
}

View File

@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
uint32_t num_ranges) override;
private:
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,

View File

@ -9,10 +9,6 @@
#include "xenia/gpu/xenos.h"
#include <cmath>
#include "xenia/base/math.h"
namespace xe {
namespace gpu {
namespace xenos {

View File

@ -10,13 +10,10 @@
#ifndef XENIA_GPU_XENOS_H_
#define XENIA_GPU_XENOS_H_
#include <algorithm>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/math.h"
namespace xe {
namespace gpu {

View File

@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {
// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle) {
auto object = ObjectTable::LookupObject(handle, false);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
X_HANDLE handle, bool already_locked) {
auto object = ObjectTable::LookupObject(handle, already_locked);
auto result = object_ref<XObject>(reinterpret_cast<XObject*>(object));
return result;
}

View File

@ -46,10 +46,9 @@ class ObjectTable {
// Restores a XObject reference with a handle. Mainly for internal use - do
// not use.
X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);
template <typename T>
object_ref<T> LookupObject(X_HANDLE handle) {
auto object = LookupObject(handle, false);
object_ref<T> LookupObject(X_HANDLE handle, bool already_locked = false) {
auto object = LookupObject(handle, already_locked);
if (T::kObjectType == XObject::Type::Socket) {
object = LookupObject((handle | 0xF8000000), false);
}
@ -110,7 +109,8 @@ class ObjectTable {
// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
X_HANDLE handle, bool already_locked);
} // namespace util
} // namespace kernel
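The new already_locked parameter lets a caller that needs several handles take the global lock once and resolve them all without a per-lookup acquire/release pair. A sketch of the intended call pattern, mirroring the wait-object paths updated later in this change (handles, count, and objects are assumed caller-side names):

auto crit = global_critical_region::AcquireDirect();
for (uint32_t n = 0; n < count; n++) {
  auto object = kernel_state()->object_table()->LookupObject<XObject>(
      handles[n], true);
  if (!object) {
    return X_STATUS_INVALID_PARAMETER;
  }
  objects[n] = std::move(object);
}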

View File

@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);
dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
lpdword_t driver_ptr) {
if (!callback_ptr) {
return X_E_INVALIDARG;
}
uint32_t callback = callback_ptr[0];
if (!callback) {
return X_E_INVALIDARG;
}
uint32_t callback_arg = callback_ptr[1];
auto audio_system = kernel_state()->emulator()->audio_system();

View File

@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
}
DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
kHighFrequency);
dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
X_STATUS result = X_STATUS_SUCCESS;
auto ev = kernel_state()->object_table()->LookupObject<XEvent>(handle);
if (ev) {
uint32_t type_tmp, state_tmp;
ev->Query(&type_tmp, &state_tmp);
out_struc[0] = type_tmp;
out_struc[1] = state_tmp;
} else {
result = X_STATUS_INVALID_HANDLE;
}
return result;
}
DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
kHighFrequency);
uint32_t xeNtClearEvent(uint32_t handle) {
X_STATUS result = X_STATUS_SUCCESS;
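The two dwords NtQueryEvent writes above line up with NT's EVENT_BASIC_INFORMATION (EventType followed by EventState). A hedged sketch of the guest-visible layout this presumably corresponds to (the struct name and the use of xe::be are illustrative assumptions, not taken from the Xenia source):

struct X_EVENT_BASIC_INFORMATION {
  xe::be<uint32_t> event_type;   // 0 = notification (manual-reset),
                                 // 1 = synchronization (auto-reset)
  xe::be<uint32_t> event_state;  // nonzero while the event is signaled
};
// out_struc[0] and out_struc[1] above map to event_type and event_state.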
@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
assert_true(wait_type <= 1);
std::vector<object_ref<XObject>> objects;
for (uint32_t n = 0; n < count; n++) {
auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
auto object_ref =
XObject::GetNativeObject<XObject>(kernel_state(), object_ptr);
if (!object_ref) {
return X_STATUS_INVALID_PARAMETER;
assert_true(count <= 64);
object_ref<XObject> objects[64];
{
auto crit = global_critical_region::AcquireDirect();
for (uint32_t n = 0; n < count; n++) {
auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
auto object_ref = XObject::GetNativeObject<XObject>(kernel_state(),
object_ptr, -1, true);
if (!object_ref) {
return X_STATUS_INVALID_PARAMETER;
}
objects[n] = std::move(object_ref);
}
objects.push_back(std::move(object_ref));
}
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
return XObject::WaitMultiple(uint32_t(objects.size()),
reinterpret_cast<XObject**>(objects.data()),
wait_type, wait_reason, processor_mode,
alertable, timeout_ptr ? &timeout : nullptr);
return XObject::WaitMultiple(
uint32_t(count), reinterpret_cast<XObject**>(&objects[0]), wait_type,
wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
}
DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
kBlocking, kHighFrequency);
@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be<uint32_t>* handles,
uint64_t* timeout_ptr) {
assert_true(wait_type <= 1);
std::vector<object_ref<XObject>> objects;
for (uint32_t n = 0; n < count; n++) {
uint32_t object_handle = handles[n];
auto object =
kernel_state()->object_table()->LookupObject<XObject>(object_handle);
if (!object) {
return X_STATUS_INVALID_PARAMETER;
assert_true(count <= 64);
object_ref<XObject> objects[64];
/*
Originally reserved vector capacity up front to squash the constant
reallocations: in a benchmark of one particular game over a period of five
minutes, roughly 11% of CPU time was spent inside a helper of Windows' heap
allocation function, and 7% of that was traced back to here.
Edit: switched to a fixed-size array instead, since the kernel never permits
more than 64 wait objects to be specified.
*/
{
auto crit = global_critical_region::AcquireDirect();
for (uint32_t n = 0; n < count; n++) {
uint32_t object_handle = handles[n];
auto object = kernel_state()->object_table()->LookupObject<XObject>(
object_handle, true);
if (!object) {
return X_STATUS_INVALID_PARAMETER;
}
objects[n] = std::move(object);
}
objects.push_back(std::move(object));
}
return XObject::WaitMultiple(count,
reinterpret_cast<XObject**>(objects.data()),
return XObject::WaitMultiple(count, reinterpret_cast<XObject**>(&objects[0]),
wait_type, 6, wait_mode, alertable, timeout_ptr);
}
@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
dword_t alertable, lpqword_t timeout_ptr) {
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
if (!count || count > 64 || (wait_type != 0 && wait_type != 1)) {
return X_STATUS_INVALID_PARAMETER;
}
return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
alertable,
timeout_ptr ? &timeout : nullptr);
@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
dword_t r6,
lpqword_t timeout_ptr) {
X_STATUS result = X_STATUS_SUCCESS;
// pre-lock for these two handle lookups
global_critical_region::mutex().lock();
auto signal_object =
kernel_state()->object_table()->LookupObject<XObject>(signal_handle);
auto signal_object = kernel_state()->object_table()->LookupObject<XObject>(
signal_handle, true);
auto wait_object =
kernel_state()->object_table()->LookupObject<XObject>(wait_handle);
kernel_state()->object_table()->LookupObject<XObject>(wait_handle, true);
global_critical_region::mutex().unlock();
if (signal_object && wait_object) {
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
result =
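The explicit lock()/unlock() pair above holds the lock across the two lookups without a scope guard; the same pre-lock shape can also be expressed with the scoped acquisition used by the other call sites in this change. A sketch (signal_object and wait_object as above):

object_ref<XObject> signal_object, wait_object;
{
  auto crit = global_critical_region::AcquireDirect();
  signal_object = kernel_state()->object_table()->LookupObject<XObject>(
      signal_handle, true);
  wait_object =
      kernel_state()->object_table()->LookupObject<XObject>(wait_handle, true);
}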

View File

@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
event_->Reset();
return 1;
}
void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
auto [type, state] = event_->Query();
*out_type = type;
*out_state = state;
}
void XEvent::Clear() { event_->Reset(); }
bool XEvent::Save(ByteStream* stream) {

View File

@ -36,6 +36,7 @@ class XEvent : public XObject {
int32_t Set(uint32_t priority_increment, bool wait);
int32_t Pulse(uint32_t priority_increment, bool wait);
int32_t Reset();
void Query(uint32_t* out_type, uint32_t* out_state);
void Clear();
bool Save(ByteStream* stream) override;

View File

@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
uint32_t wait_type, uint32_t wait_reason,
uint32_t processor_mode, uint32_t alertable,
uint64_t* opt_timeout) {
std::vector<xe::threading::WaitHandle*> wait_handles(count);
xe::threading::WaitHandle* wait_handles[64];
for (size_t i = 0; i < count; ++i) {
wait_handles[i] = objects[i]->GetWaitHandle();
assert_not_null(wait_handles[i]);
@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
: std::chrono::milliseconds::max();
if (wait_type) {
auto result = xe::threading::WaitAny(std::move(wait_handles),
auto result = xe::threading::WaitAny(wait_handles, count,
alertable ? true : false, timeout_ms);
switch (result.first) {
case xe::threading::WaitResult::kSuccess:
@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
return X_STATUS_UNSUCCESSFUL;
}
} else {
auto result = xe::threading::WaitAll(std::move(wait_handles),
auto result = xe::threading::WaitAll(wait_handles, count,
alertable ? true : false, timeout_ms);
switch (result) {
case xe::threading::WaitResult::kSuccess:
@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
}
object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
void* native_ptr,
int32_t as_type) {
void* native_ptr, int32_t as_type,
bool already_locked) {
assert_not_null(native_ptr);
// Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@ -373,10 +374,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
// We identify this by setting wait_list_flink to a magic value. When set,
// wait_list_blink will hold a handle to our object.
auto global_lock = xe::global_critical_region::AcquireDirect();
if (!already_locked) {
global_critical_region::mutex().lock();
}
auto header = reinterpret_cast<X_DISPATCH_HEADER*>(native_ptr);
XObject* result;
if (as_type == -1) {
as_type = header->type;
}
@ -385,10 +388,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
// Already initialized.
// TODO: assert if the type of the object != as_type
uint32_t handle = header->wait_list_blink;
auto object = kernel_state->object_table()->LookupObject<XObject>(handle);
result = kernel_state->object_table()
->LookupObject<XObject>(handle, true)
.release();
goto return_result;
// TODO(benvanik): assert nothing has been changed in the struct.
return object;
// return object;
} else {
// First use, create new.
// https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html
@ -430,14 +435,22 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
case 24: // ThreadedDpcObject
default:
assert_always();
return NULL;
result = nullptr;
goto return_result;
// return NULL;
}
// Stash pointer in struct.
// FIXME: This assumes the object contains a dispatch header (some don't!)
StashHandle(header, object->handle());
result = object;
return object_ref<XObject>(object);
return_result:
if (!already_locked) {
global_critical_region::mutex().unlock();
}
return object_ref<XObject>(result);
}
}
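GetNativeObject now takes the lock only when the caller has not already done so, and funnels every exit through return_result so the conditional unlock is never skipped. A small illustrative guard expressing the same already_locked contract (a hypothetical helper, not part of the Xenia source); with such a guard, early returns would release the lock automatically:

class ConditionalLock {
 public:
  explicit ConditionalLock(bool already_locked) : owns_(!already_locked) {
    if (owns_) global_critical_region::mutex().lock();
  }
  ~ConditionalLock() {
    if (owns_) global_critical_region::mutex().unlock();
  }
  ConditionalLock(const ConditionalLock&) = delete;
  ConditionalLock& operator=(const ConditionalLock&) = delete;

 private:
  bool owns_;  // true if this guard owns the lock and must release it
};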

View File

@ -192,10 +192,12 @@ class XObject {
static object_ref<XObject> GetNativeObject(KernelState* kernel_state,
void* native_ptr,
int32_t as_type = -1);
int32_t as_type = -1,
bool already_locked = false);
template <typename T>
static object_ref<T> GetNativeObject(KernelState* kernel_state,
void* native_ptr, int32_t as_type = -1);
void* native_ptr, int32_t as_type = -1,
bool already_locked = false);
protected:
bool SaveObject(ByteStream* stream);
@ -362,9 +364,11 @@ object_ref<T> retain_object(T* ptr) {
template <typename T>
object_ref<T> XObject::GetNativeObject(KernelState* kernel_state,
void* native_ptr, int32_t as_type) {
void* native_ptr, int32_t as_type,
bool already_locked) {
return object_ref<T>(reinterpret_cast<T*>(
GetNativeObject(kernel_state, native_ptr, as_type).release()));
GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
.release()));
}
} // namespace kernel