Merge pull request #61 from chrisps/canary_experimental
performance improvements, kernel fixes, cpu accuracy improvements

commit 0b013fdc6b

@@ -979,7 +979,12 @@ void EmulatorWindow::ToggleDisplayConfigDialog() {
}

void EmulatorWindow::ToggleControllerVibration() {
  emulator()->input_system()->ToggleVibration();
  auto input_sys = emulator()->input_system();
  if (input_sys) {
    auto input_lock = input_sys->lock();

    input_sys->ToggleVibration();
  }
}

void EmulatorWindow::ShowCompatibility() {

@@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}

bool XmaContext::Work() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  if (!is_allocated() || !is_enabled()) {
    return false;
  }

@@ -106,7 +106,7 @@ bool XmaContext::Work() {
}

void XmaContext::Enable() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);

  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
  XMA_CONTEXT_DATA data(context_ptr);

@@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
}

void XmaContext::Clear() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  XELOGAPU("XmaContext: reset context {}", id());

  auto context_ptr = memory()->TranslateVirtual(guest_ptr());

@@ -151,14 +151,14 @@ void XmaContext::Clear() {
}

void XmaContext::Disable() {
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  XELOGAPU("XmaContext: disabling context {}", id());
  set_is_enabled(false);
}

void XmaContext::Release() {
  // Lock it in case the decoder thread is working on it now.
  std::lock_guard<std::mutex> lock(lock_);
  std::lock_guard<xe_mutex> lock(lock_);
  assert_true(is_allocated_ == true);

  set_is_allocated(false);

@@ -200,7 +200,7 @@ class XmaContext {

  uint32_t id_ = 0;
  uint32_t guest_ptr_ = 0;
  std::mutex lock_;
  xe_mutex lock_;
  bool is_allocated_ = false;
  bool is_enabled_ = false;
  // bool is_dirty_ = true;

@@ -15,6 +15,13 @@

#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"

#if defined(_WIN32)
#include "xenia/base/platform_win.h"
#endif

DEFINE_bool(clock_no_scaling, false,
            "Disable scaling code. Time management and locking is bypassed. "

@@ -42,8 +49,13 @@ std::pair<uint64_t, uint64_t> guest_tick_ratio_ = std::make_pair(1, 1);
uint64_t last_guest_tick_count_ = 0;
// Last sampled host tick count.
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();

using tick_mutex_type = xe_unlikely_mutex;

// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
std::mutex tick_mutex_;
// std::mutex tick_mutex_;
static tick_mutex_type tick_mutex_;

void RecomputeGuestTickScalar() {
  // Create a rational number with numerator (first) and denominator (second)

@@ -61,7 +73,7 @@ void RecomputeGuestTickScalar() {
  // Keep this a rational calculation and reduce the fraction
  reduce_fraction(frac);

  std::lock_guard<std::mutex> lock(tick_mutex_);
  std::lock_guard<tick_mutex_type> lock(tick_mutex_);
  guest_tick_ratio_ = frac;
}

@@ -75,7 +87,7 @@ uint64_t UpdateGuestClock() {
    return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second;
  }

  std::unique_lock<std::mutex> lock(tick_mutex_, std::defer_lock);
  std::unique_lock<tick_mutex_type> lock(tick_mutex_, std::defer_lock);
  if (lock.try_lock()) {
    // Translate host tick count to guest tick count.
    uint64_t host_tick_delta = host_tick_count > last_host_tick_count_

@@ -107,7 +119,6 @@ inline uint64_t QueryGuestSystemTimeOffset() {

  return guest_tick_count * numerator / denominator;
}

uint64_t Clock::QueryHostTickFrequency() {
#if XE_CLOCK_RAW_AVAILABLE
  if (cvars::clock_source_raw) {

@@ -137,7 +148,7 @@ void Clock::set_guest_time_scalar(double scalar) {
}

std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
  std::lock_guard<std::mutex> lock(tick_mutex_);
  std::lock_guard<tick_mutex_type> lock(tick_mutex_);
  return guest_tick_ratio_;
}

@@ -159,6 +170,7 @@ uint64_t Clock::QueryGuestTickCount() {
  return guest_tick_count;
}

uint64_t* Clock::GetGuestTickCountPointer() { return &last_guest_tick_count_; }
uint64_t Clock::QueryGuestSystemTime() {
  if (cvars::clock_no_scaling) {
    return Clock::QueryHostSystemTime();

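A note on the rational scaling above: keeping the guest:host tick ratio as a reduced numerator/denominator pair rather than a floating-point scalar avoids cumulative rounding drift when host ticks are translated to guest ticks. A minimal standalone sketch of the same idea, with hypothetical names (the real logic lives in RecomputeGuestTickScalar() and UpdateGuestClock() above):

#include <cstdint>
#include <numeric>
#include <utility>

// Reduce guest_freq/host_freq to lowest terms, mirroring reduce_fraction().
std::pair<uint64_t, uint64_t> MakeTickRatio(uint64_t guest_freq,
                                            uint64_t host_freq) {
  uint64_t g = std::gcd(guest_freq, host_freq);
  return {guest_freq / g, host_freq / g};
}

// Multiply before dividing so truncation error stays below one guest tick.
// (Overflow handling is ignored here; the real code works on tick deltas.)
uint64_t HostTicksToGuestTicks(uint64_t host_ticks,
                               const std::pair<uint64_t, uint64_t>& ratio) {
  return host_ticks * ratio.first / ratio.second;
}
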
@@ -33,11 +33,15 @@ class Clock {
  // Either from platform suplied time source or from hardware directly.
  static uint64_t host_tick_frequency_platform();
#if XE_CLOCK_RAW_AVAILABLE
  XE_NOINLINE
  static uint64_t host_tick_frequency_raw();
#endif
  // Host tick count. Generally QueryHostTickCount() should be used.
  static uint64_t host_tick_count_platform();
#if XE_CLOCK_RAW_AVAILABLE
  //chrispy: the way msvc was ordering the branches was causing rdtsc to be speculatively executed each time
  //the branch history was lost
  XE_NOINLINE
  static uint64_t host_tick_count_raw();
#endif

@@ -70,6 +74,8 @@ class Clock {
  // Queries the current guest tick count, accounting for frequency adjustment
  // and scaling.
  static uint64_t QueryGuestTickCount();

  static uint64_t* GetGuestTickCountPointer();
  // Queries the guest time, in FILETIME format, accounting for scaling.
  static uint64_t QueryGuestSystemTime();
  // Queries the milliseconds since the guest began, accounting for scaling.

@@ -12,7 +12,17 @@
#include "xenia/base/platform_win.h"

namespace xe {
#if XE_USE_KUSER_SHARED == 1
uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; }

uint64_t Clock::host_tick_count_platform() {
  return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}
uint64_t Clock::QueryHostSystemTime() {
  return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}

#else
uint64_t Clock::host_tick_frequency_platform() {
  LARGE_INTEGER frequency;
  QueryPerformanceFrequency(&frequency);

@@ -27,13 +37,13 @@ uint64_t Clock::host_tick_count_platform() {
  }
  return time;
}

uint64_t Clock::QueryHostSystemTime() {
  FILETIME t;
  GetSystemTimeAsFileTime(&t);
  return (uint64_t(t.dwHighDateTime) << 32) | t.dwLowDateTime;
}

#endif
uint64_t Clock::QueryHostUptimeMillis() {
  return host_tick_count_platform() * 1000 / host_tick_frequency_platform();
}

@@ -41,10 +41,14 @@
    "\n" \
    "Set the cvar 'clock_source_raw' to 'false'.");

namespace xe {
// Getting the TSC frequency can be a bit tricky. This method here only works on
// Intel as it seems. There is no easy way to get the frequency outside of ring0
// on AMD, so we fail gracefully if not possible.
XE_NOINLINE
uint64_t Clock::host_tick_frequency_raw() {
  uint32_t eax, ebx, ecx, edx;

@@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() {
    return 0;
  }

  if (max_cpuid >= 0x15) {
    // 15H Get TSC/Crystal ratio and Crystal Hz.
    xe_cpu_cpuid(0x15, eax, ebx, ecx, edx);

@@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() {
    return cpu_base_freq;
  }

  CLOCK_FATAL("The clock frequency could not be determined.");
  return 0;
}

XE_NOINLINE
uint64_t Clock::host_tick_count_raw() { return xe_cpu_rdtsc(); }

} // namespace xe

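For context on the 15H path in the hunk above: CPUID leaf 0x15 reports the TSC/crystal ratio in EAX (denominator) and EBX (numerator) and the crystal frequency in ECX, so the TSC rate is crystal_hz * EBX / EAX. A hedged sketch using MSVC's __cpuid in place of the project's xe_cpu_cpuid wrapper; any of the three fields may read as zero, in which case the caller must fall back, as the real function does:

#include <intrin.h>
#include <cstdint>

uint64_t QueryTscHzViaCpuid15() {
  int regs[4];  // eax, ebx, ecx, edx
  __cpuid(regs, 0x15);
  uint32_t denominator = static_cast<uint32_t>(regs[0]);
  uint32_t numerator = static_cast<uint32_t>(regs[1]);
  uint32_t crystal_hz = static_cast<uint32_t>(regs[2]);
  if (!denominator || !numerator || !crystal_hz) {
    return 0;  // not reported; caller must use another method
  }
  return static_cast<uint64_t>(crystal_hz) * numerator / denominator;
}
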
@@ -19,7 +19,7 @@ namespace xe {

// TODO(Triang3l): Set the default depending on the actual subsystem. Currently
// it inhibits message boxes.
static bool has_console_attached_ = true;
static bool has_console_attached_ = false;

bool has_console_attached() { return has_console_attached_; }

@@ -78,17 +78,25 @@ std::pair<char*, size_t> GetThreadBuffer();
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);

} // namespace internal

// Appends a line to the log with {fmt}-style formatting.
template <typename... Args>
void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
                                                         const char prefix_char,
                                                         const char* format,
                                                         const Args&... args) {
  auto target = internal::GetThreadBuffer();
  auto result = fmt::format_to_n(target.first, target.second, format, args...);
  internal::AppendLogLine(log_level, prefix_char, result.size);
}

// Appends a line to the log with {fmt}-style formatting.
//chrispy: inline the initial check, outline the append. the append should happen rarely for end users
template <typename... Args>
XE_FORCEINLINE static void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
                                               const char* format, const Args&... args) {
  if (!internal::ShouldLog(log_level)) {
    return;
  }
  auto target = internal::GetThreadBuffer();
  auto result = fmt::format_to_n(target.first, target.second, format, args...);
  internal::AppendLogLine(log_level, prefix_char, result.size);
  AppendLogLineFormat_Impl(log_level, prefix_char, format, args...);
}

// Appends a line to the log.

@@ -98,18 +106,19 @@ void AppendLogLine(LogLevel log_level, const char prefix_char,
} // namespace logging

// Logs a fatal error and aborts the program.
void FatalError(const std::string_view str);
[[noreturn]] void FatalError(const std::string_view str);

} // namespace xe

#if XE_OPTION_ENABLE_LOGGING

template <typename... Args>
void XELOGE(const char* format, const Args&... args) {
XE_COLD void XELOGE(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Error, '!', format, args...);
}

template <typename... Args>
XE_COLD
void XELOGW(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Warning, 'w', format, args...);
}

@@ -131,12 +140,12 @@ void XELOGCPU(const char* format, const Args&... args) {

template <typename... Args>
void XELOGAPU(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'A', format, args...);
  xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'A', format, args...);
}

template <typename... Args>
void XELOGGPU(const char* format, const Args&... args) {
  xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'G', format, args...);
  xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'G', format, args...);
}

template <typename... Args>

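The shape of the logging change above, in isolation: the level check stays inlined at every call site while the formatting and append work is outlined, so the common "level filtered out" case costs only a predictable branch. A minimal sketch with made-up names; the real header marks the wrapper XE_FORCEINLINE and the outlined body XE_NOINLINE XE_COLD:

#include <cstdio>

namespace sketch {
// Outlined slow path; reached rarely for end users.
static void AppendSlowPath(const char* msg) { std::fputs(msg, stderr); }

// Inlined hot path: cheap check, then a call into the outlined body.
inline void AppendIfEnabled(bool should_log, const char* msg) {
  if (!should_log) {
    return;
  }
  AppendSlowPath(msg);
}
}  // namespace sketch
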
@@ -376,6 +376,29 @@ template <int N>
int64_t m128_i64(const __m128& v) {
  return m128_i64<N>(_mm_castps_pd(v));
}
/*
  std::min/max float has handling for nans, where if either argument is nan the first argument is returned

  minss/maxss are different, if either argument is nan the second operand to the instruction is returned
  this is problematic because we have no assurances from the compiler on the argument ordering

  so only use in places where nan handling is not needed
*/
static float xe_minf(float x, float y) {
  return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_maxf(float x, float y) {
  return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_rcpf(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#else
static float xe_minf(float x, float y) { return std::min<float>(x, y); }
static float xe_maxf(float x, float y) { return std::max<float>(x, y); }
static float xe_rcpf(float den) { return 1.0f / den; }
#endif

// Similar to the C++ implementation of XMConvertFloatToHalf and

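A quick demonstration of the NaN asymmetry the comment block above warns about: minss returns its second operand when either input is NaN, while std::min returns its first argument, so xe_minf and std::min disagree exactly when a NaN is involved. Standalone check, assumes an SSE host:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

int main() {
  float nan = std::nanf("");
  // minss(nan, 1.0f) -> second operand -> 1.0f
  float via_minss =
      _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(nan), _mm_set_ss(1.0f)));
  // std::min(nan, 1.0f) -> first argument -> nan
  float via_std = std::min(nan, 1.0f);
  std::printf("minss: %f  std::min: %f\n", via_minss, via_std);
  return 0;
}
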
@@ -467,6 +467,259 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
  return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
}

// chrispy::todo:use for command stream vector, resize happens a ton and has to
// call memset
template <size_t sz>
class FixedVMemVector {
  static_assert((sz & 65535) == 0,
                "Always give fixed_vmem_vector a size divisible by 65536 to "
                "avoid wasting memory on windows");

  uint8_t* data_;
  size_t nbytes_;

 public:
  FixedVMemVector()
      : data_((uint8_t*)memory::AllocFixed(
            nullptr, sz, memory::AllocationType::kReserveCommit,
            memory::PageAccess::kReadWrite)),
        nbytes_(0) {}
  ~FixedVMemVector() {
    if (data_) {
      memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease);
      data_ = nullptr;
    }
    nbytes_ = 0;
  }

  uint8_t* data() const { return data_; }
  size_t size() const { return nbytes_; }

  void resize(size_t newsize) {
    nbytes_ = newsize;
    xenia_assert(newsize < sz);
  }
  size_t alloc() const { return sz; }

  void clear() {
    resize(0);  // todo:maybe zero out
  }
  void reserve(size_t size) { xenia_assert(size < sz); }
};
// software prefetches/cache operations
namespace swcache {
/*
    warning, prefetchw's current behavior is not consistent across msvc and
    clang, for clang it will only compile to prefetchw if the set architecture
    supports it, for msvc however it will unconditionally compile to prefetchw!
    so prefetchw support is still in process

    only use these if you're absolutely certain you know what you're doing;
    you can easily tank performance through misuse CPUS have excellent automatic
    prefetchers that can predict patterns, but in situations where memory
    accesses are super unpredictable and follow no pattern you can make use of
    them

    another scenario where it can be handy is when crossing page boundaries,
    as many automatic prefetchers do not allow their streams to cross pages (no
    idea what this means for huge pages)

    I believe software prefetches do not kick off an automatic prefetcher
    stream, so you can't just prefetch one line of the data you're about to
    access and be fine, you need to go all the way

    prefetchnta is implementation dependent, and that makes its use a bit
    limited. For intel cpus, i believe it only prefetches the line into one way
    of the L3

    for amd cpus, it marks the line as requiring immediate eviction, the
    next time an entry is needed in the set it resides in it will be evicted. ms
    does dumb shit for memcpy, like looping over the contents of the source
    buffer and doing prefetchnta on them, likely evicting some of the data they
    just prefetched by the end of the buffer, and probably messing up data that
    was already in the cache

    another warning for these: this bypasses what i think is called
    "critical word load", the data will always become available starting from the
    very beginning of the line instead of from the piece that is needed

    L1I cache is not prefetchable, however likely all cpus can fulfill
    requests for the L1I from L2, so prefetchL2 on instructions should be fine

    todo: clwb, clflush
*/
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1

XE_FORCEINLINE
static void PrefetchW(const void* addr) { __builtin_prefetch(addr, 1, 0); }
XE_FORCEINLINE
static void PrefetchNTA(const void* addr) { __builtin_prefetch(addr, 0, 0); }
XE_FORCEINLINE
static void PrefetchL3(const void* addr) { __builtin_prefetch(addr, 0, 1); }
XE_FORCEINLINE
static void PrefetchL2(const void* addr) { __builtin_prefetch(addr, 0, 2); }
XE_FORCEINLINE
static void PrefetchL1(const void* addr) { __builtin_prefetch(addr, 0, 3); }
#elif XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
XE_FORCEINLINE
static void PrefetchW(const void* addr) { _m_prefetchw(addr); }

XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_NTA);
}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T2);
}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T1);
}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {
  _mm_prefetch((const char*)addr, _MM_HINT_T0);
}

#else
XE_FORCEINLINE
static void PrefetchW(const void* addr) {}

XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {}

#endif

enum class PrefetchTag { Write, Nontemporal, Level3, Level2, Level1 };

template <PrefetchTag tag>
static void Prefetch(const void* addr) {
  static_assert(false, "Unknown tag");
}

template <>
static void Prefetch<PrefetchTag::Write>(const void* addr) {
  PrefetchW(addr);
}
template <>
static void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
  PrefetchNTA(addr);
}
template <>
static void Prefetch<PrefetchTag::Level3>(const void* addr) {
  PrefetchL3(addr);
}
template <>
static void Prefetch<PrefetchTag::Level2>(const void* addr) {
  PrefetchL2(addr);
}
template <>
static void Prefetch<PrefetchTag::Level1>(const void* addr) {
  PrefetchL1(addr);
}
// todo: does aarch64 have streaming stores/loads?

/*
    non-temporal stores/loads

    the stores allow cacheable memory to behave like write-combining memory.
    on the first nt store to a line, an intermediate buffer will be
    allocated by the cpu for stores that come after. once the entire contents of
    the line have been written the intermediate buffer will be transmitted to
    memory

    the written line will not be cached and if it is in the cache it will be
    invalidated from all levels of the hierarchy

    the cpu in this case does not have to read line from memory when we
    first write to it if it is not anywhere in the cache, so we use half the
    memory bandwidth using these stores

    non-temporal loads are... loads, but they dont use the cache. you need
    to manually insert memory barriers (_ReadWriteBarrier, ReadBarrier, etc, do
    not use any barriers that generate actual code) if on msvc to prevent it from
    moving the load of the data to just before the use of the data (immediately
    requiring the memory to be available = big stall)

*/

#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
#define XE_MSVC_REORDER_BARRIER _ReadWriteBarrier

#else
// if the compiler actually has pipelining for instructions we dont need a
// barrier
#define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
#endif
#if XE_ARCH_AMD64 == 1

XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
  __m256i low = _mm256_loadu_si256((const __m256i*)source);
  __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]);
  XE_MSVC_REORDER_BARRIER();
  _mm256_stream_si256((__m256i*)destination, low);
  _mm256_stream_si256(&((__m256i*)destination)[1], high);
}

XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
  __m256i low = _mm256_stream_load_si256((const __m256i*)source);
  __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
  XE_MSVC_REORDER_BARRIER();
  _mm256_storeu_si256((__m256i*)destination, low);
  _mm256_storeu_si256(&((__m256i*)destination)[1], high);
}

XE_FORCEINLINE
static void WriteFence() { _mm_sfence(); }
XE_FORCEINLINE
static void ReadFence() { _mm_lfence(); }
XE_FORCEINLINE
static void ReadWriteFence() { _mm_mfence(); }
#else

XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
  memcpy(destination, source, 64);
}

XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
  memcpy(destination, source, 64);
}
XE_FORCEINLINE
static void WriteFence() {}
XE_FORCEINLINE
static void ReadFence() {}
XE_FORCEINLINE
static void ReadWriteFence() {}
#endif
} // namespace swcache
} // namespace xe

#endif // XENIA_BASE_MEMORY_H_

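How the swcache helpers above are intended to be combined, as a hedged sketch: stream a large 64-byte-aligned buffer with WriteLineNT so the destination never lands in the cache, then issue WriteFence before publishing a pointer or flag to another thread. Everything other than the swcache functions themselves is hypothetical:

#include <cstddef>
#include <cstdint>
#include "xenia/base/memory.h"

// byte_count must be a multiple of 64 and destination 64-byte aligned,
// matching the asserts inside WriteLineNT.
void StreamCopy(uint8_t* destination, const uint8_t* source,
                size_t byte_count) {
  for (size_t i = 0; i < byte_count; i += 64) {
    xe::swcache::WriteLineNT(destination + i, source + i);
  }
  // Non-temporal stores are weakly ordered; fence before making the data
  // visible to readers.
  xe::swcache::WriteFence();
}
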
@@ -8,11 +8,79 @@
 */

#include "xenia/base/mutex.h"
#if XE_PLATFORM_WIN32 == 1
#include "xenia/base/platform_win.h"
#endif

namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
// default spincount for entercriticalsection is insane on windows, 0x20007D0i64
// (33556432 times!!) when a lock is highly contended performance degrades
// sharply on some processors todo: perhaps we should have a set of optional
// jobs that processors can do instead of spinning, for instance, sorting a list
// so we have better locality later or something
#define XE_CRIT_SPINCOUNT 128
/*
  chrispy: todo, if a thread exits before releasing the global mutex we need to
  check this and release the mutex one way to do this is by using FlsAlloc and
  PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a
  thread exits
*/
thread_local unsigned global_mutex_depth = 0;
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}

xe_global_mutex::xe_global_mutex() {
  InitializeCriticalSectionEx(global_critical_section(this), XE_CRIT_SPINCOUNT,
                              CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_global_mutex ::~xe_global_mutex() {
  DeleteCriticalSection(global_critical_section(this));
}
void xe_global_mutex::lock() {
  if (global_mutex_depth) {
  } else {
    EnterCriticalSection(global_critical_section(this));
  }
  global_mutex_depth++;
}
void xe_global_mutex::unlock() {
  if (--global_mutex_depth == 0) {
    LeaveCriticalSection(global_critical_section(this));
  }
}
bool xe_global_mutex::try_lock() {
  if (global_mutex_depth) {
    ++global_mutex_depth;
    return true;
  } else {
    BOOL success = TryEnterCriticalSection(global_critical_section(this));
    if (success) {
      ++global_mutex_depth;
    }
    return success;
  }
}

CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_fast_mutex::xe_fast_mutex() {
  InitializeCriticalSectionEx(fast_crit(this), XE_CRIT_SPINCOUNT,
                              CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }

void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
bool xe_fast_mutex::try_lock() {
  return TryEnterCriticalSection(fast_crit(this));
}
#endif
// chrispy: moved this out of body of function to eliminate the initialization
// guards
static std::recursive_mutex global_mutex;
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
static global_mutex_type global_mutex;
global_mutex_type& global_critical_region::mutex() { return global_mutex; }

} // namespace xe

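What the thread_local global_mutex_depth counter above buys, shown as a sketch: re-locking the global mutex on the same thread only bumps the counter, so nested guards never re-enter the CRITICAL_SECTION and never self-deadlock. Hypothetical call sites:

#include <mutex>
#include "xenia/base/mutex.h"

static void Inner(xe::global_mutex_type& m) {
  // Same thread, depth 1 -> 2: no EnterCriticalSection call is made.
  std::lock_guard<xe::global_mutex_type> inner_lock(m);
}

static void Outer(xe::global_mutex_type& m) {
  // Depth 0 -> 1: this is the only acquisition that touches the OS object.
  std::lock_guard<xe::global_mutex_type> outer_lock(m);
  Inner(m);
}
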
@@ -9,11 +9,90 @@

#ifndef XENIA_BASE_MUTEX_H_
#define XENIA_BASE_MUTEX_H_

#include <mutex>
#include "platform.h"

#define XE_ENABLE_FAST_WIN32_MUTEX 1
namespace xe {

#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
/*
  must conform to
  BasicLockable:https://en.cppreference.com/w/cpp/named_req/BasicLockable as
  well as Lockable: https://en.cppreference.com/w/cpp/named_req/Lockable

  this emulates a recursive mutex, except with far less overhead
*/

class alignas(4096) xe_global_mutex {
  char detail[64];

 public:
  xe_global_mutex();
  ~xe_global_mutex();

  void lock();
  void unlock();
  bool try_lock();
};
using global_mutex_type = xe_global_mutex;

class alignas(64) xe_fast_mutex {
  char detail[64];

 public:
  xe_fast_mutex();
  ~xe_fast_mutex();

  void lock();
  void unlock();
  bool try_lock();
};
// a mutex that is extremely unlikely to ever be locked
// use for race conditions that have extremely remote odds of happening
class xe_unlikely_mutex {
  std::atomic<uint32_t> mut;
  bool _tryget() {
    uint32_t lock_expected = 0;
    return mut.compare_exchange_strong(lock_expected, 1);
  }

 public:
  xe_unlikely_mutex() : mut(0) {}
  ~xe_unlikely_mutex() { mut = 0; }

  void lock() {
    uint32_t lock_expected = 0;

    if (XE_LIKELY(_tryget())) {
      return;
    } else {
      do {
        // chrispy: warning, if no SMT, mm_pause does nothing...
#if XE_ARCH_AMD64 == 1
        _mm_pause();
#endif

      } while (!_tryget());
    }
  }
  void unlock() { mut.exchange(0); }
  bool try_lock() { return _tryget(); }
};
using xe_mutex = xe_fast_mutex;
#else
using global_mutex_type = std::recursive_mutex;
using xe_mutex = std::mutex;
using xe_unlikely_mutex = std::mutex;
#endif
struct null_mutex {
 public:
  static void lock() {}
  static void unlock() {}
  static bool try_lock() { return true; }
};

using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// The global critical region mutex singleton.
// This must guard any operation that may suspend threads or be sensitive to
// being suspended such as global table locks and such.

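Because xe_unlikely_mutex implements lock/unlock/try_lock it satisfies Lockable and drops into the standard guards unchanged; contention just spins on the single atomic word, which is why it is only meant for paths that are almost never contended. A small hypothetical usage sketch:

#include <cstdint>
#include <mutex>
#include "xenia/base/mutex.h"

static xe::xe_unlikely_mutex rarely_contended_lock;
static uint64_t rarely_touched_state = 0;

void TouchRareState(uint64_t value) {
  std::lock_guard<xe::xe_unlikely_mutex> guard(rarely_contended_lock);
  rarely_touched_state = value;
}
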
@@ -54,30 +133,30 @@ namespace xe {
// };
class global_critical_region {
 public:
  static std::recursive_mutex& mutex();
  static global_mutex_type& mutex();

  // Acquires a lock on the global critical section.
  // Use this when keeping an instance is not possible. Otherwise, prefer
  // to keep an instance of global_critical_region near the members requiring
  // it to keep things readable.
  static std::unique_lock<std::recursive_mutex> AcquireDirect() {
    return std::unique_lock<std::recursive_mutex>(mutex());
  static global_unique_lock_type AcquireDirect() {
    return global_unique_lock_type(mutex());
  }

  // Acquires a lock on the global critical section.
  inline std::unique_lock<std::recursive_mutex> Acquire() {
    return std::unique_lock<std::recursive_mutex>(mutex());
  inline global_unique_lock_type Acquire() {
    return global_unique_lock_type(mutex());
  }

  // Acquires a deferred lock on the global critical section.
  inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
    return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
  inline global_unique_lock_type AcquireDeferred() {
    return global_unique_lock_type(mutex(), std::defer_lock);
  }

  // Tries to acquire a lock on the glboal critical section.
  // Check owns_lock() to see if the lock was successfully acquired.
  inline std::unique_lock<std::recursive_mutex> TryAcquire() {
    return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
  inline global_unique_lock_type TryAcquire() {
    return global_unique_lock_type(mutex(), std::try_to_lock);
  }
};

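Call sites of global_critical_region are unaffected by the type swap above, since Acquire() now returns global_unique_lock_type instead of std::unique_lock<std::recursive_mutex> but is used the same way. A representative, hypothetical member-usage sketch:

#include <cstdint>
#include "xenia/base/mutex.h"

class ThreadTable {
 public:
  void Register(uint32_t thread_id) {
    auto global_lock = global_critical_region_.Acquire();
    last_registered_thread_id_ = thread_id;
    // ... mutate any state that suspend-sensitive code also touches ...
  }

 private:
  xe::global_critical_region global_critical_region_;
  uint32_t last_registered_thread_id_ = 0;
};
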
@@ -122,6 +122,7 @@
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)

#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE

@@ -129,6 +130,24 @@
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#endif
// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which
// acts as __restrict across the board todo: __restrict is part of the type
// system, we might actually have to still emit it on clang and gcc
#if XE_COMPILER_CLANG_CL == 0 && XE_COMPILER_MSVC == 1

#define XE_RESTRICT __restrict
#else
#define XE_RESTRICT
#endif

#if XE_ARCH_AMD64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#elif XE_ARCH_ARM64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#else

#error unknown cache line size for unknown architecture!
#endif

namespace xe {

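Where XE_RESTRICT is meant to be applied, per the comment above (it expands to __restrict only under MSVC): on pointer parameters the author knows never alias, so the optimizer can vectorize without runtime overlap checks. Hypothetical helper, not from the tree:

#include <cstddef>
#include "xenia/base/platform.h"

void AddArrays(float* XE_RESTRICT dst, const float* XE_RESTRICT a,
               const float* XE_RESTRICT b, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    dst[i] = a[i] + b[i];
  }
}
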
@@ -34,31 +34,48 @@
#undef DeleteFile
#undef GetFirstChild

#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
#define XE_USE_NTDLL_FUNCTIONS 1
//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
//
#define XE_USE_KUSER_SHARED 0
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
  ntdll versions of functions often skip through a lot of extra garbage in KernelBase
  ntdll versions of functions often skip through a lot of extra garbage in
  KernelBase
*/
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
  static class cls { \
   public: \
    FARPROC fn;\
    cls() : fn(nullptr) {\
      auto ntdll = GetModuleHandleA("ntdll.dll");\
      if (ntdll) { \
        fn = GetProcAddress(ntdll, #name );\
      }\
    } \
    template <typename TRet = void, typename... TArgs> \
    inline TRet invoke(TArgs... args) {\
      return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...);\
    }\
    inline operator bool() const {\
      return fn!=nullptr;\
    }\
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
  static class cls { \
   public: \
    FARPROC fn; \
    cls() : fn(nullptr) { \
      auto ntdll = GetModuleHandleA("ntdll.dll"); \
      if (ntdll) { \
        fn = GetProcAddress(ntdll, #name); \
      } \
    } \
    template <typename TRet = void, typename... TArgs> \
    inline TRet invoke(TArgs... args) { \
      return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...); \
    } \
    inline operator bool() const { return fn != nullptr; } \
  } clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false

#endif
#if XE_USE_KUSER_SHARED==1
// KUSER_SHARED
struct __declspec(align(4)) _KSYSTEM_TIME {
  unsigned int LowPart;
  int High1Time;
  int High2Time;
};

static constexpr size_t KSUER_SHARED_SYSTEMTIME_OFFSET = 0x14;
static unsigned char* KUserShared() { return (unsigned char*)0x7FFE0000ULL; }
static volatile _KSYSTEM_TIME* GetKUserSharedSystemTime() {
  return reinterpret_cast<volatile _KSYSTEM_TIME*>(
      KUserShared() + KSUER_SHARED_SYSTEMTIME_OFFSET);
}
#endif
#endif // XENIA_BASE_PLATFORM_WIN_H_

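A hypothetical use of the XE_NTDLL_IMPORT helper above: bind one ntdll export at static-initialization time and call through it, falling back when the symbol is missing. NtYieldExecution is a real ntdll export taking no arguments; whether xenia binds this particular function is not implied here.

#include "xenia/base/platform_win.h"

XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution, NtYieldExecutionPointer);

void YieldViaNtdllIfAvailable() {
  if (NtYieldExecutionPointer) {             // operator bool(): symbol resolved
    NtYieldExecutionPointer.invoke<long>();  // NTSTATUS returned as a long
  } else {
    SwitchToThread();                        // ordinary Win32 fallback
  }
}
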
@@ -8,46 +8,52 @@
 */

#include "xenia/base/ring_buffer.h"

#include <algorithm>
#include <cstring>

namespace xe {

RingBuffer::RingBuffer(uint8_t* buffer, size_t capacity)
    : buffer_(buffer), capacity_(capacity) {}
    : buffer_(buffer),
      capacity_(static_cast<ring_size_t>(capacity)),
      read_offset_(0),
      write_offset_(0) {}

void RingBuffer::AdvanceRead(size_t count) {
void RingBuffer::AdvanceRead(size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  if (read_offset_ + count < capacity_) {
    read_offset_ += count;
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    read_offset_ = right_half;
  }
}

void RingBuffer::AdvanceWrite(size_t count) {
void RingBuffer::AdvanceWrite(size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);

  if (write_offset_ + count < capacity_) {
    write_offset_ += count;
  } else {
    size_t left_half = capacity_ - write_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - write_offset_;
    ring_size_t right_half = count - left_half;
    write_offset_ = right_half;
  }
}

RingBuffer::ReadRange RingBuffer::BeginRead(size_t count) {
  count = std::min(count, capacity_);
RingBuffer::ReadRange RingBuffer::BeginRead(size_t _count) {
  ring_size_t count =
      std::min<ring_size_t>(static_cast<ring_size_t>(_count), capacity_);
  if (!count) {
    return {0};
  }
  if (read_offset_ + count < capacity_) {
    return {buffer_ + read_offset_, count, nullptr, 0};
    return {buffer_ + read_offset_, nullptr, count, 0};
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    return {buffer_ + read_offset_, left_half, buffer_, right_half};
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    return {buffer_ + read_offset_, buffer_, left_half, right_half};
  }
}

@@ -59,7 +65,8 @@ void RingBuffer::EndRead(ReadRange read_range) {
  }
}

size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  count = std::min(count, capacity_);
  if (!count) {
    return 0;

@@ -69,7 +76,7 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
  if (read_offset_ < write_offset_) {
    assert_true(read_offset_ + count <= write_offset_);
  } else if (read_offset_ + count >= capacity_) {
    size_t left_half = capacity_ - read_offset_;
    ring_size_t left_half = capacity_ - read_offset_;
    assert_true(count - left_half <= write_offset_);
  }

@@ -77,8 +84,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
    std::memcpy(buffer, buffer_ + read_offset_, count);
    read_offset_ += count;
  } else {
    size_t left_half = capacity_ - read_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - read_offset_;
    ring_size_t right_half = count - left_half;
    std::memcpy(buffer, buffer_ + read_offset_, left_half);
    std::memcpy(buffer + left_half, buffer_, right_half);
    read_offset_ = right_half;

@@ -87,7 +94,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
  return count;
}

size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
  ring_size_t count = static_cast<ring_size_t>(_count);
  count = std::min(count, capacity_);
  if (!count) {
    return 0;

@@ -105,8 +113,8 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
    std::memcpy(buffer_ + write_offset_, buffer, count);
    write_offset_ += count;
  } else {
    size_t left_half = capacity_ - write_offset_;
    size_t right_half = count - left_half;
    ring_size_t left_half = capacity_ - write_offset_;
    ring_size_t right_half = count - left_half;
    std::memcpy(buffer_ + write_offset_, buffer, left_half);
    std::memcpy(buffer_, buffer + left_half, right_half);
    write_offset_ = right_half;

@@ -17,6 +17,8 @@

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"

namespace xe {
/*

@@ -39,18 +41,24 @@ namespace xe {
  that the registers no longer need the rex prefix, shrinking the generated
  code a bit.. like i said, every bit helps in this class
*/
using ring_size_t = uint32_t;
class RingBuffer {
 public:
  RingBuffer(uint8_t* buffer, size_t capacity);

  uint8_t* buffer() const { return buffer_; }
  size_t capacity() const { return capacity_; }
  ring_size_t capacity() const { return capacity_; }
  bool empty() const { return read_offset_ == write_offset_; }

  size_t read_offset() const { return read_offset_; }
  uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
  ring_size_t read_offset() const { return read_offset_; }
  uintptr_t read_ptr() const {
    return uintptr_t(buffer_) + static_cast<uintptr_t>(read_offset_);
  }

  // todo: offset/ capacity_ is probably always 1 when its over, just check and
  // subtract instead
  void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
  size_t read_count() const {
  ring_size_t read_count() const {
// chrispy: these branches are unpredictable
#if 0
    if (read_offset_ == write_offset_) {

@@ -61,14 +69,14 @@ class RingBuffer {
      return (capacity_ - read_offset_) + write_offset_;
    }
#else
    size_t read_offs = read_offset_;
    size_t write_offs = write_offset_;
    size_t cap = capacity_;
    ring_size_t read_offs = read_offset_;
    ring_size_t write_offs = write_offset_;
    ring_size_t cap = capacity_;

    size_t offset_delta = write_offs - read_offs;
    size_t wrap_read_count = (cap - read_offs) + write_offs;
    ring_size_t offset_delta = write_offs - read_offs;
    ring_size_t wrap_read_count = (cap - read_offs) + write_offs;

    size_t comparison_value = read_offs <= write_offs;
    ring_size_t comparison_value = read_offs <= write_offs;
#if 0
    size_t selector =
        static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));

@@ -89,10 +97,12 @@ class RingBuffer {
#endif
  }

  size_t write_offset() const { return write_offset_; }
  ring_size_t write_offset() const { return write_offset_; }
  uintptr_t write_ptr() const { return uintptr_t(buffer_) + write_offset_; }
  void set_write_offset(size_t offset) { write_offset_ = offset % capacity_; }
  size_t write_count() const {
  void set_write_offset(size_t offset) {
    write_offset_ = static_cast<ring_size_t>(offset) % capacity_;
  }
  ring_size_t write_count() const {
    if (read_offset_ == write_offset_) {
      return capacity_;
    } else if (write_offset_ < read_offset_) {

@@ -107,13 +117,35 @@ class RingBuffer {

  struct ReadRange {
    const uint8_t* first;
    size_t first_length;

    const uint8_t* second;
    size_t second_length;
    ring_size_t first_length;
    ring_size_t second_length;
  };
  ReadRange BeginRead(size_t count);
  void EndRead(ReadRange read_range);

  /*
    BeginRead, but if there is a second Range it will prefetch all lines of it

    this does not prefetch the first range, because software prefetching can do that faster than we can
  */
  template <swcache::PrefetchTag tag>
  XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
    ReadRange range = BeginRead(count);

    if (range.second) {
      ring_size_t numlines =
          xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
          XE_HOST_CACHE_LINE_SIZE;
      //chrispy: maybe unroll?
      for (ring_size_t i = 0; i < numlines; ++i) {
        swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
      }
    }
    return range;
  }

  size_t Read(uint8_t* buffer, size_t count);
  template <typename T>
  size_t Read(T* buffer, size_t count) {

@@ -156,29 +188,29 @@ class RingBuffer {

 private:
  uint8_t* buffer_ = nullptr;
  size_t capacity_ = 0;
  size_t read_offset_ = 0;
  size_t write_offset_ = 0;
  ring_size_t capacity_ = 0;
  ring_size_t read_offset_ = 0;
  ring_size_t write_offset_ = 0;
};

template <>
inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
  size_t read_offset = this->read_offset_;
  ring_size_t read_offset = this->read_offset_;
  xenia_assert(this->capacity_ >= 4);

  size_t next_read_offset = read_offset + 4;
#if 0
  ring_size_t next_read_offset = read_offset + 4;
#if 0
  size_t zerotest = next_read_offset - this->capacity_;
  // unpredictable branch, use bit arith instead
  // todo: it would be faster to use lzcnt, but we need to figure out if all
  // machines we support support it
  next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
#else
  if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
    next_read_offset = 0;
    //todo: maybe prefetch next? or should that happen much earlier?
    // todo: maybe prefetch next? or should that happen much earlier?
  }
#endif
#endif
  this->read_offset_ = next_read_offset;
  unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
  return xe::byte_swap(ring_value);

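Consuming a possibly wrapped range with the reordered ReadRange layout above (both pointers first, then both lengths). A sketch, not taken from the tree; the caller is assumed to have checked read_count() beforehand:

#include <cstring>
#include "xenia/base/ring_buffer.h"

void CopyOut(xe::RingBuffer& rb, uint8_t* dst, size_t n) {
  auto range = rb.BeginRead(n);
  std::memcpy(dst, range.first, range.first_length);
  if (range.second) {
    // Wrapped: the tail continues at the start of the backing buffer.
    std::memcpy(dst + range.first_length, range.second, range.second_length);
  }
  rb.EndRead(range);  // advances the read offset past both spans
}
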
@@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value);
// be kept short or else all timers will be impacted. This is a simplified
// wrapper around QueueTimerRecurring which automatically cancels the timer on
// destruction.
//only used by XboxkrnlModule::XboxkrnlModule
class HighResolutionTimer {
  HighResolutionTimer(std::chrono::milliseconds interval,
                      std::function<void()> callback) {

@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
  edit: actually had to change it back, when i was testing it only worked because i fixed disruptorplus' code to compile (it gives wrong args to condition_variable::wait_until) but now builds

*/
using WaitStrat = dp::spin_wait_strategy; //dp::blocking_wait_strategy;
using WaitStrat = dp::blocking_wait_strategy;

class TimerQueue {
 public:

@@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() {
    spinner.spin_once();
  }
}

//unused
std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
                                       void* userdata,
                                       WaitItem::clock::time_point due) {

@@ -213,7 +213,7 @@ std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
      std::make_shared<WaitItem>(std::move(callback), userdata, &timer_queue_,
                                 due, WaitItem::clock::duration::zero()));
}

// only used by HighResolutionTimer
std::weak_ptr<WaitItem> QueueTimerRecurring(
    std::function<void(void*)> callback, void* userdata,
    WaitItem::clock::time_point due, WaitItem::clock::duration interval) {

@@ -10,7 +10,7 @@
#include "xenia/cpu/backend/x64/x64_backend.h"

#include <stddef.h>

#include <algorithm>
#include "third_party/capstone/include/capstone/capstone.h"
#include "third_party/capstone/include/capstone/x86.h"

@@ -50,6 +50,9 @@ DEFINE_bool(record_mmio_access_exceptions, true,
            "for them. This info can then be used on a subsequent run to "
            "instruct the recompiler to emit checks",
            "CPU");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif

namespace xe {
namespace cpu {

@@ -96,6 +99,68 @@ static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
  reinterpret_cast<X64Backend*>(context)
      ->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
#if XE_X64_PROFILER_AVAILABLE == 1
// todo: better way of passing to atexit. maybe do in destructor instead?
// nope, destructor is never called
static GuestProfilerData* backend_profiler_data = nullptr;

static uint64_t nanosecond_lifetime_start = 0;
static void WriteGuestProfilerData() {
  if (cvars::instrument_call_times) {
    uint64_t end = Clock::QueryHostSystemTime();

    uint64_t total = end - nanosecond_lifetime_start;

    double totaltime_divisor = static_cast<double>(total);

    FILE* output_file = nullptr;
    std::vector<std::pair<uint32_t, uint64_t>> unsorted_profile{};
    for (auto&& entry : *backend_profiler_data) {
      if (entry.second) {  // skip times of 0
        unsorted_profile.emplace_back(entry.first, entry.second);
      }
    }

    std::sort(unsorted_profile.begin(), unsorted_profile.end(),
              [](auto& x, auto& y) { return x.second < y.second; });

    fopen_s(&output_file, "profile_times.txt", "w");
    FILE* idapy_file = nullptr;
    fopen_s(&idapy_file, "profile_print_times.py", "w");

    for (auto&& sorted_entry : unsorted_profile) {
      // double time_in_seconds =
      //     static_cast<double>(sorted_entry.second) / 10000000.0;
      double time_in_milliseconds =
          static_cast<double>(sorted_entry.second) / (10000000.0 / 1000.0);

      double slice = static_cast<double>(sorted_entry.second) /
                     static_cast<double>(totaltime_divisor);

      fprintf(output_file,
              "%X took %.20f milliseconds, totaltime slice percentage %.20f \n",
              sorted_entry.first, time_in_milliseconds, slice);

      fprintf(idapy_file,
              "print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n",
              sorted_entry.first, time_in_milliseconds, slice);
    }

    fclose(output_file);
    fclose(idapy_file);
  }
}

static void GuestProfilerUpdateThreadProc() {
  nanosecond_lifetime_start = Clock::QueryHostSystemTime();

  do {
    xe::threading::Sleep(std::chrono::seconds(30));
    WriteGuestProfilerData();
  } while (true);
}
static std::unique_ptr<xe::threading::Thread> g_profiler_update_thread{};
#endif

bool X64Backend::Initialize(Processor* processor) {
  if (!Backend::Initialize(processor)) {

@@ -159,6 +224,21 @@ bool X64Backend::Initialize(Processor* processor) {

  processor->memory()->SetMMIOExceptionRecordingCallback(
      ForwardMMIOAccessForRecording, (void*)this);

#if XE_X64_PROFILER_AVAILABLE == 1
  if (cvars::instrument_call_times) {
    backend_profiler_data = &profiler_data_;
    xe::threading::Thread::CreationParameters slimparams;

    slimparams.create_suspended = false;
    slimparams.initial_priority = xe::threading::ThreadPriority::kLowest;
    slimparams.stack_size = 65536 * 4;

    g_profiler_update_thread = std::move(xe::threading::Thread::Create(
        slimparams, GuestProfilerUpdateThreadProc));
  }
#endif

  return true;
}

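A unit note on the divisor in WriteGuestProfilerData above: the profiler accumulates KUSER_SHARED/FILETIME ticks, which are 100 ns units, so 10,000 ticks equal one millisecond and (10000000.0 / 1000.0) is just that constant written out.

#include <cstdint>

double FiletimeTicksToMilliseconds(uint64_t ticks_100ns) {
  return static_cast<double>(ticks_100ns) / 10000.0;  // == 1e7 / 1e3
}
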
@@ -734,6 +814,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
const uint32_t mxcsr_table[8] = {
    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,

@@ -747,6 +828,23 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
  bctx->mxcsr_fpu = mxcsr_table[control];
  ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
}

#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
  // who knows, we might want to compile different versions of a function one
  // day
  auto entry = profiler_data_.find(guest_address);

  if (entry != profiler_data_.end()) {
    return &entry->second;
  } else {
    profiler_data_[guest_address] = 0;

    return &profiler_data_[guest_address];
  }
}

#endif
} // namespace x64
} // namespace backend
} // namespace cpu

@@ -15,6 +15,14 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/backend.h"

#if XE_PLATFORM_WIN32 == 1
// we use KUSER_SHARED's systemtime field, which is at a fixed address and
// obviously windows specific, to get the start/end time for a function using
// rdtsc would be too slow and skew the results by consuming extra cpu time, so
// we have lower time precision but better overall accuracy
#define XE_X64_PROFILER_AVAILABLE 1
#endif

DECLARE_int32(x64_extension_mask);

namespace xe {

@@ -24,6 +32,8 @@ namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
// mapping of guest function addresses to total nanoseconds taken in the func
using GuestProfilerData = std::map<uint32_t, uint64_t>;

class X64CodeCache;

@@ -37,8 +47,10 @@ typedef void (*ResolveFunctionThunk)();
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  uint64_t* guest_tick_count;

  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  unsigned int mxcsr_vmx;
  unsigned int flags;   // bit 0 = 0 if mxcsr is fpu, else it is vmx
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted

@@ -93,7 +105,9 @@ class X64Backend : public Backend {
  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;

  void RecordMMIOExceptionForGuestInstruction(void* host_address);

#if XE_X64_PROFILER_AVAILABLE == 1
  uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
 private:
  static bool ExceptionCallbackThunk(Exception* ex, void* data);
  bool ExceptionCallback(Exception* ex);

@@ -106,6 +120,10 @@ class X64Backend : public Backend {
  HostToGuestThunk host_to_guest_thunk_;
  GuestToHostThunk guest_to_host_thunk_;
  ResolveFunctionThunk resolve_function_thunk_;

#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;
#endif
};

} // namespace x64

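What GuestProfilerData holds after a run, in isolation: a guest function address mapped to its accumulated 100 ns ticks. A hypothetical dump helper along the lines of the text/Python emitters in x64_backend.cc above:

#include <cstdint>
#include <cstdio>
#include <map>

using GuestProfilerData = std::map<uint32_t, uint64_t>;

void DumpProfile(const GuestProfilerData& data) {
  for (const auto& [guest_address, ticks] : data) {
    std::printf("%08X: %.3f ms\n", guest_address, ticks / 10000.0);
  }
}
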
@@ -57,6 +57,12 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
            "code. The workaround may cause reduced CPU performance but is a "
            "more accurate emulation",
            "x64");

#if XE_X64_PROFILER_AVAILABLE == 1
DEFINE_bool(instrument_call_times, false,
            "Compute time taken for functions, for profiling guest code",
            "x64");
#endif
namespace xe {
namespace cpu {
namespace backend {

@@ -120,28 +126,37 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  */
  unsigned int data[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, data);
  if (data[2] & (1U << 5)) {
  unsigned amd_flags = data[2];
  if (amd_flags & (1U << 5)) {
    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
      feature_flags_ |= kX64EmitLZCNT;
    }
  }
  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
  if (amd_flags & (1U << 16)) {
    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
      feature_flags_ |= kX64EmitFMA4;
    }
  }
  if (amd_flags & (1U << 21)) {
    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
      feature_flags_ |= kX64EmitTBM;
    }
  }
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;

    /*
      chrispy: according to agner's tables, all amd architectures that
      we support (ones with avx) have the same timings for
      jrcxz/loop/loope/loopne as for other jmps
    */
    feature_flags_ |= kX64FastJrcx;
    feature_flags_ |= kX64FastLoop;
    if (is_zennish) {
      // ik that i heard somewhere that this is the case for zen, but i need to
      // verify. cant find my original source for that.
      // todo: ask agner?
      feature_flags_ |= kX64FlagsIndependentVars;
      feature_flags_ |= kX64FastJrcx;

      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;

      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // todo:figure out at model zen+ became zen2, this is just the model
         // for my cpu, which is ripper90
    }
  }
  may_use_membase32_as_zero_reg_ =

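The bit positions tested in the hunk above, spelled out: CPUID leaf 0x80000001 ECX bit 5 is ABM (LZCNT), bit 16 is FMA4 and bit 21 is TBM on AMD parts. A standalone sketch using MSVC's __cpuid instead of Xbyak's wrapper:

#include <intrin.h>

struct AmdExtFeatures {
  bool lzcnt;
  bool fma4;
  bool tbm;
};

AmdExtFeatures QueryAmdExtFeatures() {
  int regs[4];  // eax, ebx, ecx, edx
  __cpuid(regs, 0x80000001);
  unsigned ecx = static_cast<unsigned>(regs[2]);
  return {(ecx & (1u << 5)) != 0, (ecx & (1u << 16)) != 0,
          (ecx & (1u << 21)) != 0};
}
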
@@ -157,6 +172,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
                      std::vector<SourceMapEntry>* out_source_map) {
  SCOPE_profile_cpu_f("cpu");
  guest_module_ = dynamic_cast<XexModule*>(function->module());
  current_guest_function_ = function->address();
  // Reset.
  debug_info_ = debug_info;
  debug_info_flags_ = debug_info_flags;

@ -286,10 +302,19 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
* chrispy: removed this, it serves no purpose
|
||||
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
|
||||
*/
|
||||
|
||||
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
|
||||
|
||||
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0
|
||||
|
||||
#if XE_X64_PROFILER_AVAILABLE == 1
|
||||
if (cvars::instrument_call_times) {
|
||||
mov(rdx, 0x7ffe0014); // load pointer to kusershared systemtime
|
||||
mov(rdx, qword[rdx]);
|
||||
mov(qword[rsp + StackLayout::GUEST_PROFILER_START],
|
||||
rdx); // save time for end of function
|
||||
}
|
||||
#endif
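As a side note, the two mov instructions emitted above read the qword at 0x7FFE0014. Assuming the usual Windows KUSER_SHARED_DATA layout (fixed read-only mapping at 0x7FFE0000 with SystemTime at offset 0x14, in 100ns units), the host-side equivalent is roughly this Windows-only, illustrative sketch.

#include <cstdint>
#include <cstdio>

int main() {
  // Reading the qword at 0x7FFE0014 mirrors the two movs the emitter
  // generates for the profiler prologue.
  const auto* system_time =
      reinterpret_cast<const volatile uint64_t*>(uintptr_t(0x7FFE0014));
  uint64_t start = *system_time;
  // ... the guest function body would run here ...
  uint64_t elapsed = *system_time - start;
  std::printf("elapsed: %llu x 100ns\n",
              static_cast<unsigned long long>(elapsed));
  return 0;
}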
|
||||
// Safe now to do some tracing.
|
||||
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
|
||||
// We require 32-bit addresses.
|
||||
|
@ -363,6 +388,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
|
||||
*/
|
||||
code_offsets.epilog = getSize();
|
||||
EmitProfilerEpilogue();
|
||||
|
||||
add(rsp, (uint32_t)stack_size);
|
||||
ret();
|
||||
|
@ -391,6 +417,27 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
|
||||
return true;
|
||||
}
|
||||
// don't use rax; we do this in tail call handling
|
||||
void X64Emitter::EmitProfilerEpilogue() {
|
||||
#if XE_X64_PROFILER_AVAILABLE == 1
|
||||
if (cvars::instrument_call_times) {
|
||||
uint64_t* profiler_entry =
|
||||
backend()->GetProfilerRecordForFunction(current_guest_function_);
|
||||
mov(ecx, 0x7ffe0014);
|
||||
mov(rdx, qword[rcx]);
|
||||
mov(rbx, (uintptr_t)profiler_entry);
|
||||
sub(rdx, qword[rsp + StackLayout::GUEST_PROFILER_START]);
|
||||
|
||||
// atomic add our time to the profiler entry
|
||||
// This could be atomic-free if we had per-thread profile counts and, on a
// thread's exit, locked and summed them into the global counts, which would
// make this a few cycles less intrusive; it's good enough for now.
// For now, just try it without atomics.
|
||||
// lock();
|
||||
add(qword[rbx], rdx);
|
||||
}
|
||||
#endif
|
||||
}
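A sketch of the atomic-free alternative described in the comment above: per-thread counters that are folded into a global table under a lock only when the thread exits. The names (kMaxFunctions, global_profile, ThreadProfile) are hypothetical and only illustrate the idea.

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

constexpr size_t kMaxFunctions = 1 << 20;  // arbitrary bound for the sketch
static std::vector<uint64_t> global_profile(kMaxFunctions, 0);
static std::mutex global_profile_mutex;

struct ThreadProfile {
  std::vector<uint64_t> local = std::vector<uint64_t>(kMaxFunctions, 0);
  // Hot path: a plain add, no lock and no atomic.
  void Record(uint32_t function_index, uint64_t elapsed) {
    local[function_index] += elapsed;
  }
  // Runs once when the guest thread exits; the only synchronized step.
  ~ThreadProfile() {
    std::lock_guard<std::mutex> guard(global_profile_mutex);
    for (size_t i = 0; i < kMaxFunctions; ++i) {
      global_profile[i] += local[i];
    }
  }
};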
|
||||
|
||||
void X64Emitter::MarkSourceOffset(const Instr* i) {
|
||||
auto entry = source_map_arena_.Alloc<SourceMapEntry>();
|
||||
|
@ -558,7 +605,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
|
|||
if (instr->flags & hir::CALL_TAIL) {
|
||||
// Since we skip the prolog we need to mark the return here.
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
EmitProfilerEpilogue();
|
||||
// Pass the callers return address over.
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
|
@ -602,7 +649,7 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
|
|||
if (instr->flags & hir::CALL_TAIL) {
|
||||
// Since we skip the prolog we need to mark the return here.
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
EmitProfilerEpilogue();
|
||||
// Pass the callers return address over.
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
|
@ -952,7 +999,34 @@ static const vec128_t xmm_consts[] = {
|
|||
/*XMMVSRShlByteshuf*/
|
||||
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
|
||||
// XMMVSRMask
|
||||
vec128b(1)};
|
||||
vec128b(1),
|
||||
/*
|
||||
XMMF16UnpackLCPI2
|
||||
*/
|
||||
|
||||
vec128i(0x38000000),
|
||||
/*
|
||||
XMMF16UnpackLCPI3
|
||||
*/
|
||||
vec128q(0x7fe000007fe000ULL),
|
||||
|
||||
/* XMMF16PackLCPI0*/
|
||||
vec128i(0x8000000),
|
||||
/*XMMF16PackLCPI2*/
|
||||
vec128i(0x47ffe000),
|
||||
/*XMMF16PackLCPI3*/
|
||||
vec128i(0xc7800000),
|
||||
/*XMMF16PackLCPI4
|
||||
*/
|
||||
vec128i(0xf7fdfff),
|
||||
/*XMMF16PackLCPI5*/
|
||||
vec128i(0x7fff),
|
||||
/*
|
||||
XMMF16PackLCPI6
|
||||
*/
|
||||
vec128i(0x8000)
|
||||
|
||||
};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
for (auto& vec : xmm_consts) {
@ -159,7 +159,15 @@ enum XmmConst {
|
|||
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
|
||||
XMMXenosF16ExtRangeStart,
|
||||
XMMVSRShlByteshuf,
|
||||
XMMVSRMask
|
||||
XMMVSRMask,
|
||||
XMMF16UnpackLCPI2,  // 0x38000000, 1/32768
XMMF16UnpackLCPI3,  // 0x007fe000007fe000
|
||||
XMMF16PackLCPI0,
|
||||
XMMF16PackLCPI2,
|
||||
XMMF16PackLCPI3,
|
||||
XMMF16PackLCPI4,
|
||||
XMMF16PackLCPI5,
|
||||
XMMF16PackLCPI6
|
||||
};
|
||||
// X64Backend specific Instr->runtime_flags
|
||||
enum : uint32_t {
|
||||
|
@ -177,7 +185,7 @@ class XbyakAllocator : public Xbyak::Allocator {
|
|||
enum X64EmitterFeatureFlags {
|
||||
kX64EmitAVX2 = 1 << 0,
|
||||
kX64EmitFMA = 1 << 1,
|
||||
kX64EmitLZCNT = 1 << 2,
|
||||
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
|
||||
kX64EmitBMI1 = 1 << 3,
|
||||
kX64EmitBMI2 = 1 << 4,
|
||||
kX64EmitF16C = 1 << 5,
|
||||
|
@ -201,7 +209,11 @@ enum X64EmitterFeatureFlags {
|
|||
// inc/dec) do not introduce false dependencies on EFLAGS
|
||||
// because the individual flags are treated as different vars by
|
||||
// the processor. (this applies to zen)
|
||||
kX64EmitPrefetchW = 1 << 16
|
||||
kX64EmitPrefetchW = 1 << 16,
|
||||
kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx
|
||||
// instructions, and FX users need the boost
|
||||
kX64EmitFMA4 = 1 << 18, // todo: also use on zen1?
|
||||
kX64EmitTBM = 1 << 19
|
||||
};
|
||||
class ResolvableGuestCall {
|
||||
public:
|
||||
|
@ -337,6 +349,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
XexModule* GuestModule() { return guest_module_; }
|
||||
|
||||
void EmitProfilerEpilogue();
|
||||
|
||||
protected:
|
||||
void* Emplace(const EmitFunctionInfo& func_info,
|
||||
GuestFunction* function = nullptr);
|
||||
|
@ -352,7 +366,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
XexModule* guest_module_ = nullptr;
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint32_t feature_flags_ = 0;
|
||||
|
||||
uint32_t current_guest_function_ = 0;
|
||||
Xbyak::Label* epilog_label_ = nullptr;
|
||||
|
||||
hir::Instr* current_instr_ = nullptr;
|
||||
|
|
|
@ -19,10 +19,6 @@
|
|||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
|
||||
DEFINE_bool(use_extended_range_half, true,
|
||||
"Emulate extended range half-precision, may be slower on games "
|
||||
"that use it heavily",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -1982,6 +1978,137 @@ struct PERMUTE_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128);
|
||||
|
||||
#define LCPI(name, quad1) const __m128i name = _mm_set1_epi32(quad1)
|
||||
// xmm0 is precasted to int, but contains float
|
||||
// chrispy: todo: make available to gpu code
|
||||
static __m128i xenos_float4_to_float16_x4(__m128i xmm0) {
|
||||
LCPI(LCPI0_0, 2147483647);
|
||||
LCPI(LCPI0_1, 1207951360);
|
||||
LCPI(LCPI0_2, 134217728);
|
||||
LCPI(LCPI0_3, 3347054592);
|
||||
LCPI(LCPI0_4, 260038655);
|
||||
LCPI(LCPI0_5, 32767);
|
||||
LCPI(LCPI0_6, 4294934528);
|
||||
|
||||
__m128i xmm1 = _mm_and_si128(xmm0, LCPI0_0);
|
||||
|
||||
__m128i xmm2 = LCPI0_1;
|
||||
|
||||
__m128i xmm3 = _mm_add_epi32(xmm0, LCPI0_2);
|
||||
xmm2 = _mm_cmpgt_epi32(xmm2, xmm1);
|
||||
xmm3 = _mm_srli_epi32(xmm3, 13);
|
||||
xmm1 = _mm_add_epi32(xmm1, LCPI0_3);
|
||||
__m128i xmm4 = _mm_min_epu32(xmm1, LCPI0_4);
|
||||
xmm1 = _mm_cmpeq_epi32(xmm1, xmm4);
|
||||
xmm4 = LCPI0_5;
|
||||
xmm3 = _mm_and_si128(xmm3, xmm4);
|
||||
xmm1 = _mm_and_si128(xmm1, xmm3);
|
||||
|
||||
xmm1 = _mm_castps_si128(_mm_blendv_ps(
|
||||
_mm_castsi128_ps(xmm4), _mm_castsi128_ps(xmm1), _mm_castsi128_ps(xmm2)));
|
||||
xmm0 = _mm_srli_epi32(xmm0, 16);
|
||||
xmm0 = _mm_and_si128(xmm0, LCPI0_6);
|
||||
xmm0 = _mm_or_si128(xmm1, xmm0);
|
||||
xmm0 = _mm_packus_epi32(xmm0, _mm_setzero_si128());
|
||||
return xmm0;
|
||||
}
|
||||
// returns floats, uncasted
|
||||
// chrispy: todo, make this available to gpu code?
|
||||
static __m128i xenos_halves_to_floats(__m128i xmm0) {
|
||||
LCPI(LCPI3_0, 0x1f);
|
||||
LCPI(LCPI3_1, 0x80000000);
|
||||
LCPI(LCPI3_2, 0x38000000);
|
||||
LCPI(LCPI3_3, 0x7fe000);
|
||||
|
||||
__m128i xmm1, xmm2, xmm3, xmm4;
|
||||
|
||||
xmm1 = _mm_cvtepu16_epi32(xmm0);
|
||||
|
||||
xmm2 = _mm_srli_epi32(xmm1, 10);
|
||||
|
||||
xmm2 = _mm_and_si128(xmm2, LCPI3_0);
|
||||
|
||||
xmm0 = _mm_cvtepi16_epi32(xmm0);
|
||||
|
||||
xmm0 = _mm_and_si128(xmm0, LCPI3_1);
|
||||
|
||||
xmm3 = _mm_setzero_si128();
|
||||
|
||||
xmm4 = _mm_slli_epi32(xmm2, 23);
|
||||
|
||||
xmm4 = _mm_add_epi32(xmm4, LCPI3_2);
|
||||
|
||||
xmm2 = _mm_cmpeq_epi32(xmm2, xmm3);
|
||||
|
||||
xmm1 = _mm_slli_epi32(xmm1, 13);
|
||||
|
||||
xmm1 = _mm_and_si128(xmm1, LCPI3_3);
|
||||
|
||||
xmm3 = _mm_andnot_si128(xmm2, xmm4);
|
||||
|
||||
xmm1 = _mm_andnot_si128(xmm2, xmm1);
|
||||
|
||||
xmm0 = _mm_or_si128(xmm1, xmm0);
|
||||
xmm0 = _mm_or_si128(xmm0, xmm3);
|
||||
return xmm0;
|
||||
}
|
||||
|
||||
#undef LCPI
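For comparison, a scalar sketch of what the vectorized helper above computes for a single lane, covering only the normal and zero cases: shift the 5-bit exponent into float position, rebias it by adding 0x38000000, move the 10 mantissa bits up by 13, and keep the sign; a zero exponent produces zero, matching the cmpeq mask. The extended-range and infinity handling of the full helper is omitted.

#include <cstdint>
#include <cstdio>
#include <cstring>

float half_bits_to_float(uint16_t h) {
  uint32_t sign = (static_cast<uint32_t>(h) & 0x8000u) << 16;
  uint32_t exponent = (h >> 10) & 0x1Fu;
  uint32_t bits = 0;
  if (exponent != 0) {
    uint32_t mantissa = (static_cast<uint32_t>(h) << 13) & 0x007FE000u;
    bits = (exponent << 23) + 0x38000000u + mantissa;
  }
  bits |= sign;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::printf("%f\n", half_bits_to_float(0x3C00));  // 1.0
  std::printf("%f\n", half_bits_to_float(0xC000));  // -2.0
  return 0;
}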
|
||||
template <typename Inst>
|
||||
static void emit_fast_f16_unpack(X64Emitter& e, const Inst& i,
|
||||
XmmConst initial_shuffle) {
|
||||
auto src1 = i.src1;
|
||||
|
||||
e.vpshufb(i.dest, src1, e.GetXmmConstPtr(initial_shuffle));
|
||||
e.vpmovsxwd(e.xmm1, i.dest);
|
||||
|
||||
e.vpsrld(e.xmm2, e.xmm1, 10);
|
||||
e.vpmovsxwd(e.xmm0, i.dest);
|
||||
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
e.vpand(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
|
||||
|
||||
e.vpslld(e.xmm3, e.xmm2, 23);
|
||||
|
||||
e.vpaddd(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMF16UnpackLCPI2));
|
||||
|
||||
e.vpcmpeqd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero));
|
||||
|
||||
e.vpslld(e.xmm1, e.xmm1, 13);
|
||||
|
||||
e.vpandn(e.xmm1, e.xmm2, e.xmm1);
|
||||
e.vpandn(e.xmm2, e.xmm2, e.xmm3);
|
||||
|
||||
e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMF16UnpackLCPI3));
|
||||
e.vpor(e.xmm0, e.xmm1, e.xmm0);
|
||||
e.vpor(i.dest, e.xmm0, e.xmm2);
|
||||
}
|
||||
template <typename Inst>
|
||||
static void emit_fast_f16_pack(X64Emitter& e, const Inst& i,
|
||||
XmmConst final_shuffle) {
|
||||
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMF16PackLCPI0));
|
||||
e.vpand(e.xmm2, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
|
||||
e.vmovdqa(e.xmm3, e.GetXmmConstPtr(XMMF16PackLCPI2));
|
||||
|
||||
e.vpcmpgtd(e.xmm3, e.xmm3, e.xmm2);
|
||||
e.vpsrld(e.xmm1, e.xmm1, 13);
|
||||
|
||||
e.vpaddd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI3));
|
||||
e.vpminud(e.xmm0, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI4));
|
||||
|
||||
e.vpcmpeqd(e.xmm2, e.xmm2, e.xmm0);
|
||||
e.vmovdqa(e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI5));
|
||||
e.vpand(e.xmm1, e.xmm1, e.xmm0);
|
||||
e.vpand(e.xmm1, e.xmm2, e.xmm1);
|
||||
e.vpxor(e.xmm2, e.xmm2, e.xmm2);
|
||||
|
||||
e.vblendvps(e.xmm1, e.xmm0, e.xmm1, e.xmm3);
|
||||
|
||||
e.vpsrld(e.xmm0, i.src1, 16);
|
||||
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI6));
|
||||
e.vorps(e.xmm0, e.xmm1, e.xmm0);
|
||||
e.vpackusdw(i.dest, e.xmm0, e.xmm2);
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(final_shuffle));
|
||||
}
|
||||
// ============================================================================
|
||||
// OPCODE_SWIZZLE
|
||||
// ============================================================================
|
||||
|
@ -2081,14 +2208,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[7 - i] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
|
@ -2098,70 +2220,26 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
// http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
|
||||
// dest = [(src1.x | src1.y), 0, 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// 0|0|0|0|W|Z|Y|X
|
||||
e.vcvtps2ph(i.dest, src, 0b00000011);
|
||||
// Shuffle to X|Y|0|0|0|0|0|0
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2));
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
}
|
||||
static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
|
||||
alignas(16) float a[4];
|
||||
alignas(16) uint16_t b[8];
|
||||
_mm_store_ps(a, src1);
|
||||
std::memset(b, 0, sizeof(b));
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] =
|
||||
half_float::detail::float2half<std::round_toward_zero>(a[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
|
||||
}
|
||||
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_true(i.src2.value->IsConstantZero());
|
||||
// dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// 0|0|0|0|W|Z|Y|X
|
||||
e.vcvtps2ph(i.dest, src, 0b00000011);
|
||||
// Shuffle to Z|W|X|Y|0|0|0|0
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4));
|
||||
if (!i.src1.is_constant) {
|
||||
emit_fast_f16_pack(e, i, XMMPackFLOAT16_4);
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
vec128_t result = vec128b(0);
|
||||
for (unsigned idx = 0; idx < 4; ++idx) {
|
||||
result.u16[(7 - (idx ^ 2))] =
|
||||
float_to_xenos_half(i.src1.constant().f32[idx]);
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
e.LoadConstantXmm(i.dest, result);
|
||||
}
|
||||
}
|
||||
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -2508,15 +2586,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
|
||||
}
|
||||
|
||||
// Constants, or something
|
||||
b[2] = 0.f;
|
||||
b[3] = 1.f;
|
||||
|
@ -2536,74 +2609,28 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
|||
// Also zero out the high end.
|
||||
// TODO(benvanik): special case constant unpacks that just get 0/1/etc.
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) &&
|
||||
!cvars::use_extended_range_half) { // todo: can use cvtph and bit logic
|
||||
// to implement
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
}
|
||||
// sx = src.iw >> 16;
|
||||
// sy = src.iw & 0xFFFF;
|
||||
// dest = { XMConvertHalfToFloat(sx),
|
||||
// XMConvertHalfToFloat(sy),
|
||||
// 0.0,
|
||||
// 1.0 };
|
||||
// Shuffle to 0|0|0|0|0|0|Y|X
|
||||
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2));
|
||||
e.vcvtph2ps(i.dest, i.dest);
|
||||
e.vpshufd(i.dest, i.dest, 0b10100100);
|
||||
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
}
|
||||
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
|
||||
alignas(16) uint16_t a[8];
|
||||
alignas(16) float b[4];
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||
if (!cvars::use_extended_range_half) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]);
|
||||
}
|
||||
}
|
||||
|
||||
return _mm_load_ps(b);
|
||||
}
|
||||
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
|
||||
// src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
|
||||
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
|
||||
Xmm src;
|
||||
if (i.src1.is_constant) {
|
||||
src = i.dest;
|
||||
e.LoadConstantXmm(src, i.src1.constant());
|
||||
} else {
|
||||
src = i.src1;
|
||||
if (i.src1.is_constant) {
|
||||
vec128_t result{};
|
||||
|
||||
for (int idx = 0; idx < 4; ++idx) {
|
||||
result.f32[idx] =
|
||||
xenos_half_to_float(i.src1.constant().u16[VEC128_W(4 + idx)]);
|
||||
}
|
||||
// Shuffle to 0|0|0|0|W|Z|Y|X
|
||||
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4));
|
||||
e.vcvtph2ps(i.dest, i.dest);
|
||||
|
||||
e.LoadConstantXmm(i.dest, result);
|
||||
|
||||
} else {
|
||||
if (i.src1.is_constant) {
|
||||
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
}
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
emit_fast_f16_unpack(e, i, XMMUnpackFLOAT16_4);
|
||||
}
|
||||
}
|
||||
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
@ -50,6 +50,10 @@ DEFINE_bool(no_round_to_single, false,
|
|||
"Not for users, breaks games. Skip rounding double values to "
|
||||
"single precision and back",
|
||||
"CPU");
|
||||
DEFINE_bool(
|
||||
inline_loadclock, false,
|
||||
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -475,33 +479,39 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128);
|
|||
// ============================================================================
|
||||
struct LOAD_CLOCK : Sequence<LOAD_CLOCK, I<OPCODE_LOAD_CLOCK, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// When scaling is disabled and the raw clock source is selected, the code
|
||||
// in the Clock class is actually just forwarding tick counts after one
|
||||
// simple multiply and division. In that case we rather bake the scaling in
|
||||
// here to cut extra function calls with CPU cache misses and stack frame
|
||||
// overhead.
|
||||
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
|
||||
auto ratio = Clock::guest_tick_ratio();
|
||||
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
|
||||
// mfence/lfence magic the rdtsc instruction can be executed sooner or
|
||||
// later in the cache window. Since it's resolution however is much higher
|
||||
// than the 360's mftb instruction this can safely be ignored.
|
||||
|
||||
// Read time stamp in edx (high part) and eax (low part).
|
||||
e.rdtsc();
|
||||
// Make it a 64 bit number in rax.
|
||||
e.shl(e.rdx, 32);
|
||||
e.or_(e.rax, e.rdx);
|
||||
// Apply tick frequency scaling.
|
||||
e.mov(e.rcx, ratio.first);
|
||||
e.mul(e.rcx);
|
||||
// We actually now have a 128 bit number in rdx:rax.
|
||||
e.mov(e.rcx, ratio.second);
|
||||
e.div(e.rcx);
|
||||
e.mov(i.dest, e.rax);
|
||||
if (cvars::inline_loadclock) {
|
||||
e.mov(e.rcx,
|
||||
e.GetBackendCtxPtr(offsetof(X64BackendContext, guest_tick_count)));
|
||||
e.mov(i.dest, e.qword[e.rcx]);
|
||||
} else {
|
||||
e.CallNative(LoadClock);
|
||||
e.mov(i.dest, e.rax);
|
||||
// When scaling is disabled and the raw clock source is selected, the code
|
||||
// in the Clock class is actually just forwarding tick counts after one
|
||||
// simple multiply and division. In that case we rather bake the scaling
|
||||
// in here to cut extra function calls with CPU cache misses and stack
|
||||
// frame overhead.
|
||||
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
|
||||
auto ratio = Clock::guest_tick_ratio();
|
||||
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
|
||||
// mfence/lfence magic the rdtsc instruction can be executed sooner or
|
||||
// later in the cache window. Since its resolution, however, is much
// higher than the 360's mftb instruction, this can safely be ignored.
|
||||
|
||||
// Read time stamp in edx (high part) and eax (low part).
|
||||
e.rdtsc();
|
||||
// Make it a 64 bit number in rax.
|
||||
e.shl(e.rdx, 32);
|
||||
e.or_(e.rax, e.rdx);
|
||||
// Apply tick frequency scaling.
|
||||
e.mov(e.rcx, ratio.first);
|
||||
e.mul(e.rcx);
|
||||
// We actually now have a 128 bit number in rdx:rax.
|
||||
e.mov(e.rcx, ratio.second);
|
||||
e.div(e.rcx);
|
||||
e.mov(i.dest, e.rax);
|
||||
} else {
|
||||
e.CallNative(LoadClock);
|
||||
e.mov(i.dest, e.rax);
|
||||
}
|
||||
}
|
||||
}
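A host-side sketch of the same scaling the raw-clock rdtsc path performs: multiply the TSC value by the ratio numerator and divide by the denominator while keeping a 128-bit intermediate so large tick counts do not overflow. The frequencies are made-up example values and the 128-bit type is the GCC/Clang extension, so this is illustrative only.

#include <cstdint>
#include <cstdio>
#include <x86intrin.h>

int main() {
  const uint64_t guest_freq = 50000000ull;   // assumed guest tick frequency
  const uint64_t host_freq = 3400000000ull;  // assumed host TSC frequency
  uint64_t tsc = __rdtsc();
  // 128-bit intermediate, mirroring the mul rcx / div rcx pair emitted above.
  unsigned __int128 scaled = static_cast<unsigned __int128>(tsc) * guest_freq;
  uint64_t guest_ticks = static_cast<uint64_t>(scaled / host_freq);
  std::printf("guest ticks: %llu\n",
              static_cast<unsigned long long>(guest_ticks));
  return 0;
}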
|
||||
static uint64_t LoadClock(void* raw_context) {
|
||||
|
@ -539,10 +549,12 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
|
|||
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vmaxps(dest, src1, src2);
|
||||
});
|
||||
// If the inputs are +0 and -0, return +0 (the opposite of the min path);
// see the standalone sketch after this opcode table.
|
||||
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
e.vmaxps(e.xmm2, src1, src2);
|
||||
e.vmaxps(e.xmm3, src2, src1);
|
||||
e.vandps(i.dest, e.xmm2, e.xmm3);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128);
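A standalone illustration of the +0/-0 handling above: _mm_max_ps returns its second operand when the inputs compare equal, so the two operand orders differ only in the sign of zero, and ANDing the two results clears the sign bit (the min path ORs them to keep -0 instead).

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 pos_zero = _mm_set1_ps(0.0f);
  __m128 neg_zero = _mm_set1_ps(-0.0f);
  __m128 ab = _mm_max_ps(pos_zero, neg_zero);  // -0: second operand wins
  __m128 ba = _mm_max_ps(neg_zero, pos_zero);  // +0
  __m128 fixed = _mm_and_ps(ab, ba);           // sign bit cleared: +0
  float out[4];
  _mm_storeu_ps(out, fixed);
  std::printf("%+f\n", out[0]);  // prints +0.000000
  return 0;
}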
|
||||
|
@ -597,10 +609,11 @@ struct MIN_F64 : Sequence<MIN_F64, I<OPCODE_MIN, F64Op, F64Op, F64Op>> {
|
|||
struct MIN_V128 : Sequence<MIN_V128, I<OPCODE_MIN, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vminps(dest, src1, src2);
|
||||
});
|
||||
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
e.vminps(e.xmm2, src1, src2);
|
||||
e.vminps(e.xmm3, src2, src1);
|
||||
e.vorps(i.dest, e.xmm2, e.xmm3);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32,
|
||||
|
@ -768,6 +781,7 @@ struct SELECT_V128_V128
|
|||
} else if (mayblend == PermittedBlend::Ps) {
|
||||
e.vblendvps(i.dest, src2, src3, src1);
|
||||
} else {
|
||||
//ideally we would have an xop path here...
|
||||
// src1 ? src2 : src3;
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
|
@ -1932,6 +1946,53 @@ struct MUL_ADD_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
|
||||
|
||||
struct NEGATED_MUL_ADD_F64
|
||||
: Sequence<NEGATED_MUL_ADD_F64,
|
||||
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Fpu);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovapd(e.xmm3, src1);
|
||||
e.vfmadd213sd(e.xmm3, src2, src3);
|
||||
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulsd(e.xmm3, src1, src2);
|
||||
e.vaddsd(i.dest, e.xmm3, src3);
|
||||
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
}
|
||||
}
|
||||
};
|
||||
struct NEGATED_MUL_ADD_V128
|
||||
: Sequence<NEGATED_MUL_ADD_V128,
|
||||
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovaps(e.xmm3, src1);
|
||||
e.vfmadd213ps(e.xmm3, src2, src3);
|
||||
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulps(e.xmm3, src1, src2);
|
||||
e.vaddps(i.dest, e.xmm3, src3);
|
||||
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
|
||||
NEGATED_MUL_ADD_V128);
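The FMA path above is equivalent to computing the fused multiply-add and then flipping the sign bit of the rounded result; a scalar sketch:

#include <cmath>
#include <cstdio>

double negated_mul_add(double a, double b, double c) {
  // One rounding step from fma, then a sign flip, i.e. -((a * b) + c).
  return -std::fma(a, b, c);
}

int main() {
  std::printf("%f\n", negated_mul_add(2.0, 3.0, 4.0));  // prints -10.000000
  return 0;
}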
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_MUL_SUB
|
||||
// ============================================================================
|
||||
|
@ -1944,12 +2005,7 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
|
|||
// - 132 -> $1 = $1 * $3 - $2
|
||||
// - 213 -> $1 = $2 * $1 - $3
|
||||
// - 231 -> $1 = $2 * $3 - $1
|
||||
struct MUL_SUB_F32
|
||||
: Sequence<MUL_SUB_F32, I<OPCODE_MUL_SUB, F32Op, F32Op, F32Op, F32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_impossible_sequence(MUL_SUB_F32);
|
||||
}
|
||||
};
|
||||
|
||||
struct MUL_SUB_F64
|
||||
: Sequence<MUL_SUB_F64, I<OPCODE_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -1991,7 +2047,54 @@ struct MUL_SUB_V128
|
|||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128);
|
||||
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
|
||||
|
||||
struct NEGATED_MUL_SUB_F64
|
||||
: Sequence<NEGATED_MUL_SUB_F64,
|
||||
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Fpu);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovapd(e.xmm3, src1);
|
||||
e.vfmsub213sd(e.xmm3, src2, src3);
|
||||
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulsd(e.xmm3, src1, src2);
|
||||
e.vsubsd(i.dest, e.xmm3, src3);
|
||||
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
|
||||
}
|
||||
}
|
||||
};
|
||||
struct NEGATED_MUL_SUB_V128
|
||||
: Sequence<NEGATED_MUL_SUB_V128,
|
||||
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
|
||||
if (e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// todo: this is garbage
|
||||
e.vmovaps(e.xmm3, src1);
|
||||
e.vfmsub213ps(e.xmm3, src2, src3);
|
||||
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
} else {
|
||||
// todo: might need to use x87 in this case...
|
||||
e.vmulps(e.xmm3, src1, src2);
|
||||
e.vsubps(i.dest, e.xmm3, src3);
|
||||
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
|
||||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
|
||||
NEGATED_MUL_SUB_V128);
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_NEG
|
||||
|
@ -2264,7 +2367,7 @@ struct DOT_PRODUCT_3_V128
|
|||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
|
||||
|
||||
// this is going to hurt a bit...
|
||||
/*
|
||||
|
@ -2380,7 +2483,7 @@ struct DOT_PRODUCT_4_V128
|
|||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
// todo: add fast_dot_product path that just checks for infinity instead of
|
||||
// using mxcsr
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
|
||||
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
|
||||
|
||||
bool is_lensqr = i.instr->src1.value == i.instr->src2.value;
|
||||
|
||||
|
@ -3162,9 +3265,9 @@ struct SET_ROUNDING_MODE_I32
|
|||
// backends dont have to worry about it
|
||||
if (i.src1.is_constant) {
|
||||
e.mov(e.eax, mxcsr_table[i.src1.constant()]);
|
||||
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64], e.eax);
|
||||
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
|
||||
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
|
||||
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]);
|
||||
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);
|
||||
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1);
@ -123,7 +123,10 @@ class StackLayout {
|
|||
*/
|
||||
static const size_t GUEST_STACK_SIZE = 104;
|
||||
// Was GUEST_CTX_HOME; can't be removed because that would throw stack
// alignment off. Instead it can be used as a temporary in sequences.
|
||||
static const size_t GUEST_SCRATCH64 = 80;
|
||||
static const size_t GUEST_SCRATCH = 0;
|
||||
|
||||
// When profiling is on, this stores the timestamp sampled at the start of the function.
|
||||
static const size_t GUEST_PROFILER_START = 80;
|
||||
static const size_t GUEST_RET_ADDR = 88;
|
||||
static const size_t GUEST_CALL_RET_ADDR = 96;
|
||||
};
|
||||
|
|
|
@ -600,6 +600,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
break;
|
||||
case OPCODE_MAX:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
if (should_skip_because_of_float) {
|
||||
break;
|
||||
}
|
||||
v->set_from(i->src1.value);
|
||||
v->Max(i->src2.value);
|
||||
i->Remove();
|
||||
|
|
|
@ -1636,15 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
|
|||
Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
Value* dest = CloneValue(value1);
|
||||
dest->Mul(value2);
|
||||
return Add(dest, value3);
|
||||
}
|
||||
#endif
|
||||
|
||||
Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
@ -1655,15 +1647,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
|
|||
Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
#if 0
|
||||
bool c1 = value1->IsConstant();
|
||||
bool c2 = value2->IsConstant();
|
||||
if (c1 && c2) {
|
||||
Value* dest = CloneValue(value1);
|
||||
dest->Mul(value2);
|
||||
return Sub(dest, value3);
|
||||
}
|
||||
#endif
|
||||
|
||||
Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
|
@ -1671,6 +1655,30 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
|
|||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->set_src3(value3);
|
||||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
ASSERT_TYPES_EQUAL(value1, value3);
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
|
||||
i->set_src1(value1);
|
||||
i->set_src2(value2);
|
||||
i->set_src3(value3);
|
||||
return i->dest;
|
||||
}
|
||||
|
||||
Value* HIRBuilder::Neg(Value* value) {
|
||||
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
|
||||
i->set_src1(value);
|
||||
|
|
|
@ -214,6 +214,10 @@ class HIRBuilder {
|
|||
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
|
||||
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
|
||||
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
|
||||
Value* NegatedMulAdd(Value* value1, Value* value2,
|
||||
Value* value3); // -((1 * 2) + 3)
|
||||
Value* NegatedMulSub(Value* value1, Value* value2,
|
||||
Value* value3); // -((1 * 2) - 3)
|
||||
Value* Neg(Value* value);
|
||||
Value* Abs(Value* value);
|
||||
Value* Sqrt(Value* value);
|
||||
|
@ -265,6 +269,7 @@ class HIRBuilder {
|
|||
Value* AtomicAdd(Value* address, Value* value);
|
||||
Value* AtomicSub(Value* address, Value* value);
|
||||
void SetNJM(Value* value);
|
||||
|
||||
protected:
|
||||
void DumpValue(StringBuffer* str, Value* value);
|
||||
void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);
|
||||
|
|
|
@ -208,6 +208,12 @@ enum Opcode {
|
|||
OPCODE_STORE_OFFSET,
|
||||
OPCODE_LOAD,
|
||||
OPCODE_STORE,
|
||||
// chrispy: todo: implement, our current codegen for the unaligned loads is
|
||||
// very bad
|
||||
OPCODE_LVLX,
|
||||
OPCODE_LVRX,
|
||||
OPCODE_STVLX,
|
||||
OPCODE_STVRX,
|
||||
OPCODE_MEMSET,
|
||||
OPCODE_CACHE_CONTROL,
|
||||
OPCODE_MEMORY_BARRIER,
|
||||
|
@ -244,7 +250,9 @@ enum Opcode {
|
|||
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
|
||||
OPCODE_DIV,
|
||||
OPCODE_MUL_ADD,
|
||||
OPCODE_NEGATED_MUL_ADD,
|
||||
OPCODE_MUL_SUB,
|
||||
OPCODE_NEGATED_MUL_SUB,
|
||||
OPCODE_NEG,
|
||||
OPCODE_ABS,
|
||||
OPCODE_SQRT,
|
||||
|
@ -284,7 +292,8 @@ enum Opcode {
|
|||
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
|
||||
// as we already have OPCODE_ROUND. round double to float (
|
||||
// ppc "single" fpu instruction result rounding behavior )
|
||||
OPCODE_SET_NJM,
|
||||
OPCODE_SET_NJM,
|
||||
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
|
|
|
@ -464,6 +464,19 @@ DEFINE_OPCODE(
|
|||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEGATED_MUL_ADD,
|
||||
"negated_mul_add",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEGATED_MUL_SUB,
|
||||
"negated_mul_sub",
|
||||
OPCODE_SIG_V_V_V_V,
|
||||
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
|
||||
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_NEG,
|
||||
"neg",
|
||||
|
@ -692,4 +705,5 @@ DEFINE_OPCODE(
|
|||
"set_njm",
|
||||
OPCODE_SIG_X_V,
|
||||
0
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
@ -613,10 +613,12 @@ class Value {
|
|||
// returns true if every single use is as an operand to a single instruction
|
||||
// (add var2, var1, var1)
|
||||
bool AllUsesByOneInsn() const;
|
||||
//the maybe is here because this includes vec128, which is untyped data that can be treated as float or int depending on the context
|
||||
// the maybe is here because this includes vec128, which is untyped data that
|
||||
// can be treated as float or int depending on the context
|
||||
bool MaybeFloaty() const {
|
||||
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -48,7 +48,7 @@ class MMIOHandler {
|
|||
typedef uint32_t (*HostToGuestVirtual)(const void* context,
|
||||
const void* host_address);
|
||||
typedef bool (*AccessViolationCallback)(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,  // not passed by const reference like the others?
|
||||
void* context, void* host_address, bool is_write);
|
||||
|
||||
// access_violation_callback is called with global_critical_region locked once
|
||||
|
|
|
@ -14,8 +14,8 @@
|
|||
#include <mutex>
|
||||
#include <string>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
#include "xenia/base/vec128.h"
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
class Processor;
|
||||
|
@ -390,9 +390,11 @@ typedef struct alignas(64) PPCContext_s {
|
|||
// These are split to make it easier to do DCE on unused stores.
|
||||
uint64_t cr() const;
|
||||
void set_cr(uint64_t value);
|
||||
|
||||
// todo: remove, saturation should be represented by a vector
|
||||
uint8_t vscr_sat;
|
||||
|
||||
uint32_t vrsave;
|
||||
|
||||
// uint32_t get_fprf() {
|
||||
// return fpscr.value & 0x000F8000;
|
||||
// }
|
||||
|
@ -405,7 +407,7 @@ typedef struct alignas(64) PPCContext_s {
|
|||
|
||||
// Global interrupt lock, held while interrupts are disabled or interrupts are
|
||||
// executing. This is shared among all threads and comes from the processor.
|
||||
std::recursive_mutex* global_mutex;
|
||||
global_mutex_type* global_mutex;
|
||||
|
||||
// Used to shuttle data into externs. Contents volatile.
|
||||
uint64_t scratch;
|
||||
|
|
|
@ -1197,7 +1197,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
|
|||
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
|
||||
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
|
||||
|
||||
Value* v = f.Neg(f.MulSub(a, c, b));
|
||||
Value* v = f.NegatedMulSub(a, c, b);
|
||||
f.StoreVR(vd, v);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -16,6 +16,12 @@
|
|||
#include "xenia/cpu/ppc/ppc_hir_builder.h"
|
||||
|
||||
#include <stddef.h>
|
||||
// chrispy: added this, we can have simpler control flow and do dce on the
|
||||
// inputs
|
||||
DEFINE_bool(ignore_trap_instructions, true,
|
||||
"Generate no code for powerpc trap instructions, can result in "
|
||||
"better performance in games that aggressively check with trap.",
|
||||
"CPU");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -449,6 +455,9 @@ constexpr uint32_t TRAP_SLT = 1 << 4, TRAP_SGT = 1 << 3, TRAP_EQ = 1 << 2,
|
|||
|
||||
int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
|
||||
uint32_t TO) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
// if (a > b) & TO[1] then TRAP
|
||||
// if (a = b) & TO[2] then TRAP
|
||||
|
@ -521,6 +530,9 @@ int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
|
|||
}
|
||||
|
||||
int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- (RA)
|
||||
// b <- (RB)
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
|
@ -534,6 +546,9 @@ int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- (RA)
|
||||
// if (a < EXTS(SI)) & TO[0] then TRAP
|
||||
// if (a > EXTS(SI)) & TO[1] then TRAP
|
||||
|
@ -546,6 +561,9 @@ int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- EXTS((RA)[32:63])
|
||||
// b <- EXTS((RB)[32:63])
|
||||
// if (a < b) & TO[0] then TRAP
|
||||
|
@ -561,6 +579,9 @@ int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
int InstrEmit_twi(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (cvars::ignore_trap_instructions) {
|
||||
return 0;
|
||||
}
|
||||
// a <- EXTS((RA)[32:63])
|
||||
// if (a < EXTS(SI)) & TO[0] then TRAP
|
||||
// if (a > EXTS(SI)) & TO[1] then TRAP
|
||||
|
@ -645,7 +666,9 @@ int InstrEmit_mfspr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
break;
|
||||
case 256:
|
||||
// VRSAVE
|
||||
v = f.LoadZeroInt64();
|
||||
|
||||
v = f.ZeroExtend(f.LoadContext(offsetof(PPCContext, vrsave), INT32_TYPE),
|
||||
INT64_TYPE);
|
||||
break;
|
||||
case 268:
|
||||
// TB
|
||||
|
@ -749,6 +772,8 @@ int InstrEmit_mtspr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
f.StoreCTR(rt);
|
||||
break;
|
||||
case 256:
|
||||
|
||||
f.StoreContext(offsetof(PPCContext, vrsave), f.Truncate(rt, INT32_TYPE));
|
||||
// VRSAVE
|
||||
break;
|
||||
default:
|
||||
|
@ -768,6 +793,7 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// bit 48 = EE; interrupt enabled
|
||||
// bit 62 = RI; recoverable interrupt
|
||||
// return 8000h if unlocked (interrupts enabled), else 0
|
||||
#if 0
|
||||
f.MemoryBarrier();
|
||||
if (cvars::disable_global_lock || true) {
|
||||
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
|
||||
|
@ -777,63 +803,23 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
f.StoreGPR(i.X.RT,
|
||||
f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
|
||||
}
|
||||
#else
|
||||
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (i.X.RA & 0x01) {
|
||||
// L = 1
|
||||
// iff storing from r13
|
||||
f.MemoryBarrier();
|
||||
f.StoreContext(
|
||||
offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
|
||||
#if 0
|
||||
if (i.X.RT == 13) {
|
||||
// iff storing from r13 we are taking a lock (disable interrupts).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->enter_global_lock);
|
||||
}
|
||||
} else {
|
||||
// Otherwise we are restoring interrupts (probably).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->leave_global_lock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
} else {
|
||||
// L = 0
|
||||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
f.StoreContext(
|
||||
offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
|
||||
if (i.X.RA & 0x01) {
|
||||
// L = 1
|
||||
f.MemoryBarrier();
|
||||
f.StoreContext(offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
|
||||
#if 0
|
||||
if (i.X.RT == 13) {
|
||||
// iff storing from r13 we are taking a lock (disable interrupts).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->enter_global_lock);
|
||||
}
|
||||
} else {
|
||||
// Otherwise we are restoring interrupts (probably).
|
||||
if (!cvars::disable_global_lock) {
|
||||
f.CallExtern(f.builtins()->leave_global_lock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
} else {
|
||||
// L = 0
|
||||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
f.StoreContext(offsetof(PPCContext, scratch),
|
||||
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
|
||||
return 0;
|
||||
}
|
||||
|
||||
void RegisterEmitCategoryControl() {
@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] + frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] + frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
|
@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] - frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- -([frA x frC] - frB)
|
||||
Value* v = f.Neg(
|
||||
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
|
||||
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
|
||||
f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
|
|
|
@ -834,6 +834,7 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
// Issue memory barrier for when we go out of lock and want others to see our
|
||||
// updates.
|
||||
|
||||
f.MemoryBarrier();
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -77,7 +77,8 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
|||
|
||||
// Allocate with 64b alignment.
|
||||
|
||||
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
|
||||
context_ = reinterpret_cast<ppc::PPCContext*>(
|
||||
AllocateContext());
|
||||
processor->backend()->InitializeBackendContext(context_);
|
||||
assert_true(((uint64_t)context_ & 0x3F) == 0);
|
||||
std::memset(context_, 0, sizeof(ppc::PPCContext));
|
||||
|
@ -93,6 +94,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
|||
// Set initial registers.
|
||||
context_->r[1] = stack_base;
|
||||
context_->r[13] = pcr_address;
|
||||
// fixme: VSCR must be set here!
|
||||
}
|
||||
|
||||
ThreadState::~ThreadState() {
|
||||
|
@ -105,7 +107,7 @@ ThreadState::~ThreadState() {
|
|||
if (context_) {
|
||||
FreeContext(reinterpret_cast<void*>(context_));
|
||||
}
|
||||
// memory::AlignedFree(context_);
|
||||
// memory::AlignedFree(context_);
|
||||
}
|
||||
|
||||
void ThreadState::Bind(ThreadState* thread_state) {
|
||||
|
|
|
@ -29,10 +29,20 @@
|
|||
#include "xenia/kernel/kernel_state.h"
|
||||
#include "xenia/kernel/user_module.h"
|
||||
|
||||
#if defined(NDEBUG)
|
||||
static constexpr bool should_log_unknown_reg_writes() { return false; }
|
||||
|
||||
#else
|
||||
|
||||
DEFINE_bool(log_unknown_register_writes, false,
|
||||
"Log writes to unknown registers from "
|
||||
"CommandProcessor::WriteRegister. Has significant performance hit.",
|
||||
"GPU");
|
||||
static bool should_log_unknown_reg_writes() {
|
||||
return cvars::log_unknown_register_writes;
|
||||
}
|
||||
#endif
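A minimal reproduction of the pattern above: in NDEBUG builds the predicate is a constexpr false, so the compiler removes the logging branch entirely, while debug builds consult the runtime flag. The flag name here is a stand-in for the cvar.

#include <cstdio>

#if defined(NDEBUG)
static constexpr bool verbose_logging_enabled() { return false; }
#else
static bool g_verbose_logging = false;  // stand-in for the runtime cvar
static bool verbose_logging_enabled() { return g_verbose_logging; }
#endif

void MaybeLog(unsigned index, unsigned value) {
  // In NDEBUG builds this condition folds to false and the call disappears.
  if (verbose_logging_enabled()) {
    std::printf("write %04X = %08X\n", index, value);
  }
}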
|
||||
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
|
||||
|
@ -465,7 +475,7 @@ void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
|
|||
}
|
||||
}
|
||||
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||
if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
|
||||
if (should_log_unknown_reg_writes()) {
|
||||
// chrispy: rearrange check order, place set after checks
|
||||
if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
|
||||
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
|
||||
|
@ -493,15 +503,45 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
// very unlikely. these ORS here are meant to be bitwise ors, so that we do
|
||||
// not do branching evaluation of the conditions (we will almost always take
|
||||
// all of the branches)
|
||||
if (XE_UNLIKELY(
|
||||
(index - XE_GPU_REG_SCRATCH_REG0 < 8) |
|
||||
(index == XE_GPU_REG_COHER_STATUS_HOST) |
|
||||
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
|
||||
|
||||
unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
|
||||
(index == XE_GPU_REG_COHER_STATUS_HOST) |
|
||||
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||
// chrispy: reordered for msvc branch probability (assumes if is taken and
|
||||
// else is not)
|
||||
if (XE_LIKELY(expr == 0)) {
|
||||
} else {
|
||||
HandleSpecialRegisterWrite(index, value);
|
||||
}
|
||||
}
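A small sketch of the bitwise-OR trick used above: with the bitwise | operator every range check is evaluated and combined without intermediate branches, leaving one easily predicted branch, instead of one branch per || clause. The register offsets below are placeholder values, not the real XE_GPU_REG constants.

#include <cstdint>

// Placeholder offsets for illustration only.
constexpr uint32_t kScratchBase = 0x0578;
constexpr uint32_t kCoherStatusHost = 0x07E8;
constexpr uint32_t kLutRwIndex = 0x6488;
constexpr uint32_t kLut30Color = 0x6490;

bool NeedsSpecialHandling(uint32_t index) {
  // Unsigned wraparound makes (index - base < n) a single range check, and
  // the bitwise ORs evaluate all three checks branch-free.
  unsigned expr = (index - kScratchBase < 8u) |
                  (index == kCoherStatusHost) |
                  ((index - kLutRwIndex) <= (kLut30Color - kLutRwIndex));
  return expr != 0;
}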
|
||||
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
this->WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
|
||||
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = ring->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(base + i, data);
|
||||
}
|
||||
}
|
||||
|
||||
void CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_times) {
|
||||
for (uint32_t m = 0; m < num_times; m++) {
|
||||
uint32_t reg_data = ring->ReadAndSwap<uint32_t>();
|
||||
uint32_t target_index = base;
|
||||
WriteRegister(target_index, reg_data);
|
||||
}
|
||||
}
|
||||
void CommandProcessor::MakeCoherent() {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
|
@ -623,15 +663,20 @@ void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
|
|||
}
|
||||
|
||||
bool CommandProcessor::ExecutePacket(RingBuffer* reader) {
|
||||
// Prefetch the wraparound range.
// It is likely already in L3 cache, but on a Zen system it may be in another
// chiplet's L3.
|
||||
reader->BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
reader->read_count());
|
||||
const uint32_t packet = reader->ReadAndSwap<uint32_t>();
|
||||
const uint32_t packet_type = packet >> 30;
|
||||
if (packet == 0 || packet == 0x0BADF00D) {
|
||||
if (XE_UNLIKELY(packet == 0 || packet == 0x0BADF00D)) {
|
||||
trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1);
|
||||
trace_writer_.WritePacketEnd();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (packet == 0xCDCDCDCD) {
|
||||
if (XE_UNLIKELY(packet == 0xCDCDCDCD)) {
|
||||
XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!");
|
||||
}
|
||||
|
||||
|
@ -667,10 +712,10 @@ bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) {
|
|||
|
||||
uint32_t base_index = (packet & 0x7FFF);
|
||||
uint32_t write_one_reg = (packet >> 15) & 0x1;
|
||||
for (uint32_t m = 0; m < count; m++) {
|
||||
uint32_t reg_data = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t target_index = write_one_reg ? base_index : base_index + m;
|
||||
WriteRegister(target_index, reg_data);
|
||||
if (write_one_reg) {
|
||||
WriteOneRegisterFromRing(reader, base_index, count);
|
||||
} else {
|
||||
WriteRegisterRangeFromRing(reader, base_index, count);
|
||||
}
|
||||
|
||||
trace_writer_.WritePacketEnd();
|
||||
|
@ -934,7 +979,7 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader,
|
|||
uint32_t count) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
XELOGI("XE_SWAP");
|
||||
XELOGD("XE_SWAP");
|
||||
|
||||
Profiler::Flip();
|
||||
|
||||
|
@ -1467,10 +1512,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader,
|
|||
reader->AdvanceRead((count - 1) * sizeof(uint32_t));
|
||||
return true;
|
||||
}
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1479,10 +1523,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader,
|
|||
uint32_t count) {
|
||||
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t index = offset_type & 0xFFFF;
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1517,12 +1560,12 @@ bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader,
|
|||
assert_always();
|
||||
return true;
|
||||
}
|
||||
|
||||
trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4);
|
||||
for (uint32_t n = 0; n < size_dwords; n++, index++) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(
|
||||
memory_->TranslatePhysical(address + n * 4));
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegistersFromMem(index, (uint32_t*)memory_->TranslatePhysical(address),
|
||||
size_dwords);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1530,10 +1573,9 @@ bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS(
|
|||
RingBuffer* reader, uint32_t packet, uint32_t count) {
|
||||
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
|
||||
uint32_t index = offset_type & 0xFFFF;
|
||||
for (uint32_t n = 0; n < count - 1; n++, index++) {
|
||||
uint32_t data = reader->ReadAndSwap<uint32_t>();
|
||||
WriteRegister(index, data);
|
||||
}
|
||||
|
||||
WriteRegisterRangeFromRing(reader, index, count - 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -153,8 +153,24 @@ class CommandProcessor {
|
|||
// rarely needed, most register writes have no special logic here
|
||||
XE_NOINLINE
|
||||
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegister(uint32_t index, uint32_t value);
|
||||
|
||||
// mem has big-endian register values
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteOneRegisterFromRing(
|
||||
xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t
|
||||
num_times); // repeatedly write a value to one register, presumably a
|
||||
// register with special handling for writes
|
||||
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
||||
return gamma_ramp_256_entry_table_;
|
||||
}
|
||||
|
|
|
@ -1710,7 +1710,60 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
}
|
||||
}
|
||||
}
|
||||
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_registers) {
|
||||
// we already brought it into L2 earlier
|
||||
RingBuffer::ReadRange range =
|
||||
ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(num_registers *
|
||||
sizeof(uint32_t));
|
||||
|
||||
uint32_t num_regs_firstrange =
|
||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||
num_regs_firstrange);
|
||||
if (range.second) {
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base + num_regs_firstrange,
|
||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||
num_registers - num_regs_firstrange);
|
||||
}
|
||||
ring->EndRead(range);
|
||||
}
|
||||
void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_times) {
|
||||
auto read = ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(
|
||||
num_times * sizeof(uint32_t));
|
||||
|
||||
uint32_t first_length = read.first_length / sizeof(uint32_t);
|
||||
|
||||
for (uint32_t i = 0; i < first_length; ++i) {
|
||||
D3D12CommandProcessor::WriteRegister(
|
||||
base, xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
|
||||
}
|
||||
|
||||
if (read.second) {
|
||||
uint32_t second_length = read.second_length / sizeof(uint32_t);
|
||||
|
||||
for (uint32_t i = 0; i < second_length; ++i) {
|
||||
D3D12CommandProcessor::WriteRegister(
|
||||
base,
|
||||
xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
|
||||
}
|
||||
}
|
||||
ring->EndRead(read);
|
||||
}
|
||||
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
|
||||
gamma_ramp_256_entry_table_up_to_date_ = false;
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ namespace xe {
|
|||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
class D3D12CommandProcessor : public CommandProcessor {
|
||||
class D3D12CommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
|
||||
kernel::KernelState* kernel_state);
|
||||
|
@ -203,9 +203,17 @@ class D3D12CommandProcessor : public CommandProcessor {
|
|||
protected:
|
||||
bool SetupContext() override;
|
||||
void ShutdownContext() override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times) override;
|
||||
void OnGammaRamp256EntryTableValueWritten() override;
|
||||
void OnGammaRampPWLValueWritten() override;
|
||||
|
||||
|
|
|
@ -406,14 +406,16 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
|
|||
}
|
||||
|
||||
bool D3D12SharedMemory::UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
|
||||
if (upload_page_ranges.empty()) {
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) {
|
||||
if (!num_upload_page_ranges) {
|
||||
return true;
|
||||
}
|
||||
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
|
||||
command_processor_.SubmitBarriers();
|
||||
auto& command_list = command_processor_.GetDeferredCommandList();
|
||||
for (auto upload_range : upload_page_ranges) {
|
||||
//for (auto upload_range : upload_page_ranges) {
|
||||
for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
|
||||
auto& upload_range = upload_page_ranges[i];
|
||||
uint32_t upload_range_start = upload_range.first;
|
||||
uint32_t upload_range_length = upload_range.second;
|
||||
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
|
||||
|
|
|
@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
|
|||
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||
uint32_t length_allocations) override;
|
||||
|
||||
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
||||
upload_page_ranges) override;
|
||||
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
|
||||
upload_page_ranges, unsigned num_ranges) override;
|
||||
|
||||
private:
|
||||
D3D12CommandProcessor& command_processor_;
|
||||
|
|
|
@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
const uintmax_t* stream = command_stream_.data();
|
||||
size_t stream_remaining = command_stream_.size();
|
||||
const uintmax_t* stream = (const uintmax_t*)command_stream_.data();
|
||||
size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t);
|
||||
ID3D12PipelineState* current_pipeline_state = nullptr;
|
||||
while (stream_remaining != 0) {
|
||||
const CommandHeader& header =
|
||||
|
@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
|
||||
void* DeferredCommandList::WriteCommand(Command command,
|
||||
size_t arguments_size_bytes) {
|
||||
|
||||
size_t arguments_size_elements =
|
||||
(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
|
||||
|
||||
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
#if 0
|
||||
size_t offset = command_stream_.size();
|
||||
command_stream_.resize(offset + kCommandHeaderSizeElements +
|
||||
arguments_size_elements);
|
||||
|
@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command,
|
|||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
|
||||
#else
|
||||
|
||||
size_t offset = command_stream_.size();
|
||||
constexpr size_t kCommandHeaderSizeBytes =
|
||||
kCommandHeaderSizeElements * sizeof(uintmax_t);
|
||||
command_stream_.resize(offset + kCommandHeaderSizeBytes +
|
||||
arguments_size_elements);
|
||||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
|
||||
#endif
|
||||
}
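For example, with sizeof(uintmax_t) == 8, a 20-byte argument payload is padded up to 24 bytes: the stream grows by kCommandHeaderSizeBytes + 24, and the header records 24 / 8 = 3 argument elements (despite its name, the arguments_size_elements local holds the padded byte count at that point).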
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include "xenia/base/literals.h"
|
||||
#include "xenia/base/math.h"
|
||||
#include "xenia/ui/d3d12/d3d12_api.h"
|
||||
|
||||
#include "xenia/base/memory.h"
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
@ -30,8 +30,12 @@ class D3D12CommandProcessor;
|
|||
|
||||
class DeferredCommandList {
|
||||
public:
|
||||
static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128;  // 65536 * 128 bytes = 8 MiB
|
||||
/*
|
||||
chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps
|
||||
*/
|
||||
DeferredCommandList(const D3D12CommandProcessor& command_processor,
|
||||
size_t initial_size_bytes = 1_MiB);
|
||||
size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST);
|
||||
|
||||
void Reset();
|
||||
void Execute(ID3D12GraphicsCommandList* command_list,
|
||||
|
@ -562,7 +566,8 @@ class DeferredCommandList {
|
|||
const D3D12CommandProcessor& command_processor_;
|
||||
|
||||
// uintmax_t to ensure uint64_t and pointer alignment of all structures.
|
||||
std::vector<uintmax_t> command_stream_;
|
||||
//std::vector<uintmax_t> command_stream_;
|
||||
FixedVMemVector<MAX_SIZEOF_COMMANDLIST> command_stream_;
|
||||
};
|
||||
|
||||
} // namespace d3d12
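Rough sketch of the fixed-reservation idea behind FixedVMemVector: the buffer's full capacity is backed once, so resize() never reallocates and never invalidates pointers into the stream, which is what removes the "frequent resizes" cost noted above. The real class is not shown in this diff (only data()/size()/resize() are used here), so the Win32-based implementation and the name FixedReservationVector below are illustrative assumptions; the actual class may commit pages lazily rather than up front.

#include <cstddef>
#include <cstdint>
#include <Windows.h>

template <size_t kMaxCapacityBytes>
class FixedReservationVector {
 public:
  FixedReservationVector()
      : data_(static_cast<uint8_t*>(
            VirtualAlloc(nullptr, kMaxCapacityBytes, MEM_RESERVE | MEM_COMMIT,
                         PAGE_READWRITE))),
        size_(0) {}
  ~FixedReservationVector() { VirtualFree(data_, 0, MEM_RELEASE); }

  uint8_t* data() { return data_; }
  size_t size() const { return size_; }
  // Growth is just a bounds check plus a size update; no reallocation ever
  // happens, so existing pointers into the buffer stay valid.
  void resize(size_t new_size) {
    if (new_size <= kMaxCapacityBytes) {
      size_ = new_size;
    }
  }
  void clear() { size_ = 0; }

 private:
  uint8_t* data_;
  size_t size_;
};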
|
||||
|
|
|
@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
xenos::kMaxResolveSize);
|
||||
y1 = y0 + int32_t(xenos::kMaxResolveSize);
|
||||
}
|
||||
|
||||
// fails in Forza Horizon 1
|
||||
assert_true(x0 < x1 && y0 < y1);
|
||||
if (x0 >= x1 || y0 >= y1) {
|
||||
XELOGE("Resolve region is empty");
|
||||
|
|
|
@ -883,7 +883,7 @@ class PrimitiveProcessor {
|
|||
// Must be called in a global critical region.
|
||||
void UpdateCacheBucketsNonEmptyL2(
|
||||
uint32_t bucket_index_div_64,
|
||||
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
|
||||
[[maybe_unused]] const global_unique_lock_type&
|
||||
global_lock) {
|
||||
uint64_t& cache_buckets_non_empty_l2_ref =
|
||||
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
|
||||
|
|
|
@ -320,8 +320,7 @@ void Shader::GatherVertexFetchInformation(
|
|||
for (auto& vertex_binding : vertex_bindings_) {
|
||||
if (vertex_binding.fetch_constant == op.fetch_constant_index()) {
|
||||
// It may not hold that all strides are equal, but I hope it does.
|
||||
assert_true(!fetch_instr.attributes.stride ||
|
||||
vertex_binding.stride_words == fetch_instr.attributes.stride);
|
||||
|
||||
vertex_binding.attributes.push_back({});
|
||||
attrib = &vertex_binding.attributes.back();
|
||||
break;
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/base/bit_range.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/base/math.h"
|
||||
#include "xenia/base/memory.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
|
@ -344,7 +345,7 @@ void SharedMemory::UnlinkWatchRange(WatchRange* range) {
|
|||
range->next_free = watch_range_first_free_;
|
||||
watch_range_first_free_ = range;
|
||||
}
|
||||
|
||||
// TODO: optimize; an enormous amount of CPU time (1.34%) is spent here.
|
||||
bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
||||
bool* any_data_resolved_out) {
|
||||
if (!length) {
|
||||
|
@ -364,14 +365,20 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
return false;
|
||||
}
|
||||
|
||||
unsigned int current_upload_range = 0;
|
||||
uint32_t page_first = start >> page_size_log2_;
|
||||
uint32_t page_last = (start + length - 1) >> page_size_log2_;
|
||||
|
||||
upload_ranges_.clear();
|
||||
|
||||
std::pair<uint32_t, uint32_t>* uploads =
|
||||
reinterpret_cast<std::pair<uint32_t, uint32_t>*>(upload_ranges_.data());
|
||||
|
||||
bool any_data_resolved = false;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (uint32_t i = block_first; i <= block_last; ++i) {
|
||||
|
@ -412,8 +419,13 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
|
||||
break;
|
||||
}
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, (i << 6) + block_page - range_start));
|
||||
if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) {
|
||||
xe::FatalError(
|
||||
"Hit max upload ranges in shared_memory.cc, tell a dev to "
|
||||
"raise the limit!");
|
||||
}
|
||||
uploads[current_upload_range++] =
|
||||
std::make_pair(range_start, (i << 6) + block_page - range_start);
|
||||
// In the next iteration within this block, consider this range valid
|
||||
// since it has been queued for upload.
|
||||
block_valid |= (uint64_t(1) << block_page) - 1;
|
||||
|
@ -423,17 +435,17 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
}
|
||||
}
|
||||
if (range_start != UINT32_MAX) {
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, page_last + 1 - range_start));
|
||||
uploads[current_upload_range++] =
|
||||
(std::make_pair(range_start, page_last + 1 - range_start));
|
||||
}
|
||||
if (any_data_resolved_out) {
|
||||
*any_data_resolved_out = any_data_resolved;
|
||||
}
|
||||
if (upload_ranges_.empty()) {
|
||||
if (!current_upload_range) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return UploadRanges(upload_ranges_);
|
||||
return UploadRanges(uploads, current_upload_range);
|
||||
}
|
||||
|
||||
std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
|
||||
|
|
|
@ -35,7 +35,7 @@ class SharedMemory {
|
|||
virtual void SetSystemPageBlocksValidWithGpuDataWritten();
|
||||
|
||||
typedef void (*GlobalWatchCallback)(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
typedef void* GlobalWatchHandle;
|
||||
// Registers a callback invoked when something is invalidated in the GPU
|
||||
|
@ -49,9 +49,9 @@ class SharedMemory {
|
|||
GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
|
||||
void* callback_context);
|
||||
void UnregisterGlobalWatch(GlobalWatchHandle handle);
|
||||
typedef void (*WatchCallback)(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu);
|
||||
typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
|
||||
void* context, void* data, uint64_t argument,
|
||||
bool invalidated_by_gpu);
|
||||
typedef void* WatchHandle;
|
||||
// Registers a callback invoked when the specified memory range is invalidated
|
||||
// in the GPU memory copy by the CPU or (if triggered explicitly - such as by
|
||||
|
@ -140,7 +140,8 @@ class SharedMemory {
|
|||
// ascending address order, so front and back can be used to determine the
|
||||
// overall bounds of pages to be uploaded.
|
||||
virtual bool UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
|
||||
unsigned num_upload_ranges) = 0;
|
||||
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
|
||||
return trace_download_ranges_;
|
||||
|
@ -174,10 +175,13 @@ class SharedMemory {
|
|||
|
||||
void* memory_invalidation_callback_handle_ = nullptr;
|
||||
void* memory_data_provider_handle_ = nullptr;
|
||||
static constexpr unsigned int MAX_UPLOAD_RANGES = 65536;
|
||||
|
||||
// Ranges that need to be uploaded, generated by GetRangesToUpload (a
|
||||
// persistently allocated vector).
|
||||
std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
|
||||
// std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
|
||||
FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
|
||||
upload_ranges_;
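For scale: with MAX_UPLOAD_RANGES = 65536 and sizeof(std::pair<uint32_t, uint32_t>) == 8, the upload-range scratch buffer is sized at 65536 * 8 = 512 KiB.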
|
||||
|
||||
// GPU-written memory downloading for traces. <Start address, length>.
|
||||
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
|
||||
|
|
|
@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
|
|||
}
|
||||
|
||||
void TextureCache::Texture::MakeUpToDateAndWatch(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) {
|
||||
const global_unique_lock_type& global_lock) {
|
||||
SharedMemory& shared_memory = texture_cache().shared_memory();
|
||||
if (base_outdated_) {
|
||||
assert_not_zero(GetGuestBaseSize());
|
||||
|
@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
|
|||
}
|
||||
|
||||
void TextureCache::Texture::WatchCallback(
|
||||
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
[[maybe_unused]] const global_unique_lock_type& global_lock,
|
||||
bool is_mip) {
|
||||
if (is_mip) {
|
||||
assert_not_zero(GetGuestMipsSize());
|
||||
|
@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
|
|||
}
|
||||
}
|
||||
|
||||
void TextureCache::WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
|
||||
void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu) {
|
||||
Texture& texture = *static_cast<Texture*>(context);
|
||||
texture.WatchCallback(global_lock, argument != 0);
|
||||
|
@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
|
|||
}
|
||||
|
||||
void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
|
||||
TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
|
||||
texture_cache->ScaledResolveGlobalWatchCallback(
|
||||
|
@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
|
|||
}
|
||||
|
||||
void TextureCache::ScaledResolveGlobalWatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
const global_unique_lock_type& global_lock,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
|
||||
assert_true(IsDrawResolutionScaled());
|
||||
if (invalidated_by_gpu) {
|
||||
|
|
|
@ -230,19 +230,15 @@ class TextureCache {
|
|||
}
|
||||
bool IsResolved() const { return base_resolved_ || mips_resolved_; }
|
||||
|
||||
bool base_outdated(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) const {
|
||||
bool base_outdated(const global_unique_lock_type& global_lock) const {
|
||||
return base_outdated_;
|
||||
}
|
||||
bool mips_outdated(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock) const {
|
||||
bool mips_outdated(const global_unique_lock_type& global_lock) const {
|
||||
return mips_outdated_;
|
||||
}
|
||||
void MakeUpToDateAndWatch(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock);
|
||||
void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
|
||||
|
||||
void WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
|
||||
void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
|
||||
|
||||
// For LRU caching - updates the last usage frame and moves the texture to
|
||||
// the end of the usage queue. Must be called any time the texture is
|
||||
|
@ -579,8 +575,8 @@ class TextureCache {
|
|||
void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
|
||||
|
||||
// Shared memory callback for texture data invalidation.
|
||||
static void WatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
static void WatchCallback(const global_unique_lock_type& global_lock,
|
||||
void* context,
|
||||
void* data, uint64_t argument, bool invalidated_by_gpu);
|
||||
|
||||
// Checks if there are any pages that contain scaled resolve data within the
|
||||
|
@ -589,10 +585,10 @@ class TextureCache {
|
|||
// Global shared memory invalidation callback for invalidating scaled resolved
|
||||
// texture data.
|
||||
static void ScaledResolveGlobalWatchCallbackThunk(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
|
||||
const global_unique_lock_type& global_lock, void* context,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
void ScaledResolveGlobalWatchCallback(
|
||||
const std::unique_lock<std::recursive_mutex>& global_lock,
|
||||
const global_unique_lock_type& global_lock,
|
||||
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
|
||||
|
||||
const RegisterFile& register_file_;
|
||||
|
|
|
@ -1157,7 +1157,14 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void VulkanCommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
VulkanCommandProcessor::WriteRegister(start_index + i, data);
|
||||
}
|
||||
}
|
||||
void VulkanCommandProcessor::SparseBindBuffer(
|
||||
VkBuffer buffer, uint32_t bind_count, const VkSparseMemoryBind* binds,
|
||||
VkPipelineStageFlags wait_stage_mask) {
|
||||
|
|
|
@ -45,7 +45,7 @@ namespace xe {
|
|||
namespace gpu {
|
||||
namespace vulkan {
|
||||
|
||||
class VulkanCommandProcessor : public CommandProcessor {
|
||||
class VulkanCommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
// Single-descriptor layouts for use within a single frame.
|
||||
enum class SingleTransientDescriptorLayout {
|
||||
|
@ -259,8 +259,11 @@ class VulkanCommandProcessor : public CommandProcessor {
|
|||
protected:
|
||||
bool SetupContext() override;
|
||||
void ShutdownContext() override;
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||
XE_FORCEINLINE
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) override;
|
||||
|
||||
void OnGammaRamp256EntryTableValueWritten() override;
|
||||
void OnGammaRampPWLValueWritten() override;
|
||||
|
|
|
@ -376,18 +376,21 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
|
|||
}
|
||||
|
||||
bool VulkanSharedMemory::UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
|
||||
if (upload_page_ranges.empty()) {
|
||||
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
|
||||
unsigned num_upload_ranges) {
|
||||
if (!num_upload_ranges) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto& range_front = upload_page_ranges[0];
|
||||
auto& range_back = upload_page_ranges[num_upload_ranges - 1];
|
||||
|
||||
// upload_page_ranges are sorted, use them to determine the range for the
|
||||
// ordering barrier.
|
||||
Use(Usage::kTransferDestination,
|
||||
std::make_pair(
|
||||
upload_page_ranges.front().first << page_size_log2(),
|
||||
(upload_page_ranges.back().first + upload_page_ranges.back().second -
|
||||
upload_page_ranges.front().first)
|
||||
<< page_size_log2()));
|
||||
std::make_pair(range_front.first << page_size_log2(),
|
||||
(range_back.first + range_back.second - range_front.first)
|
||||
<< page_size_log2()));
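For example, if the sorted ranges run from {16, 4} (pages 16-19) to {96, 2} (pages 96-97), the barrier starts at 16 << page_size_log2() with length (96 + 2 - 16) = 82 pages << page_size_log2(), i.e. it spans pages 16 through 97 inclusive.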
|
||||
command_processor_.SubmitBarriers(true);
|
||||
DeferredCommandBuffer& command_buffer =
|
||||
command_processor_.deferred_command_buffer();
|
||||
|
@ -395,9 +398,11 @@ bool VulkanSharedMemory::UploadRanges(
|
|||
bool successful = true;
|
||||
upload_regions_.clear();
|
||||
VkBuffer upload_buffer_previous = VK_NULL_HANDLE;
|
||||
for (auto upload_range : upload_page_ranges) {
|
||||
uint32_t upload_range_start = upload_range.first;
|
||||
uint32_t upload_range_length = upload_range.second;
|
||||
|
||||
// for (auto upload_range : upload_page_ranges) {
|
||||
for (unsigned int i = 0; i < num_upload_ranges; ++i) {
|
||||
uint32_t upload_range_start = upload_page_ranges[i].first;
|
||||
uint32_t upload_range_length = upload_page_ranges[i].second;
|
||||
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
|
||||
upload_range_length << page_size_log2());
|
||||
while (upload_range_length) {
|
||||
|
|
|
@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
|
|||
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||
uint32_t length_allocations) override;
|
||||
|
||||
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
||||
upload_page_ranges) override;
|
||||
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
|
||||
upload_page_ranges, unsigned num_ranges) override;
|
||||
|
||||
private:
|
||||
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,
|
||||
|
|
|
@ -137,6 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
|
|||
modified_vibration.right_motor_speed = 0;
|
||||
return modified_vibration;
|
||||
}
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_unlikely_mutex>{lock_};
|
||||
}
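Callers are expected to hold the returned guard across any sequence of InputSystem calls; the XAM input exports later in this diff follow exactly this pattern:

  auto input_system = kernel_state()->emulator()->input_system();
  auto lock = input_system->lock();
  return input_system->GetState(user_index, input_state);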
|
||||
} // namespace hid
|
||||
} // namespace xe
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
#include "xenia/hid/input.h"
|
||||
#include "xenia/hid/input_driver.h"
|
||||
#include "xenia/xbox.h"
|
||||
|
@ -48,6 +48,8 @@ class InputSystem {
|
|||
void UpdateUsedSlot(uint8_t slot, bool connected);
|
||||
uint8_t GetConnectedSlots() const { return connected_slot; }
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> lock();
|
||||
|
||||
private:
|
||||
xe::ui::Window* window_ = nullptr;
|
||||
|
||||
|
@ -55,6 +57,7 @@ class InputSystem {
|
|||
|
||||
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
|
||||
uint8_t connected_slot = 0b0001;
|
||||
xe_unlikely_mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace hid
|
||||
|
|
|
@ -14,9 +14,9 @@
|
|||
|
||||
#include <xinput.h> // NOLINT(build/include_order)
|
||||
|
||||
#include "xenia/base/clock.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/hid/hid_flags.h"
|
||||
|
||||
namespace xe {
|
||||
namespace hid {
|
||||
namespace xinput {
|
||||
|
@ -81,13 +81,39 @@ X_STATUS XInputInputDriver::Setup() {
|
|||
}
|
||||
return X_STATUS_SUCCESS;
|
||||
}
|
||||
constexpr uint64_t SKIP_INVALID_CONTROLLER_TIME = 1100;
|
||||
static uint64_t last_invalid_time[4];
|
||||
|
||||
static DWORD should_skip(uint32_t user_index) {
|
||||
uint64_t time = last_invalid_time[user_index];
|
||||
if (time) {
|
||||
uint64_t deltatime = xe::Clock::QueryHostUptimeMillis() - time;
|
||||
|
||||
if (deltatime < SKIP_INVALID_CONTROLLER_TIME) {
|
||||
return ERROR_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
last_invalid_time[user_index] = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void set_skip(uint32_t user_index) {
|
||||
last_invalid_time[user_index] = xe::Clock::QueryHostUptimeMillis();
|
||||
}
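These two helpers cache a failed controller query: once XInput reports ERROR_DEVICE_NOT_CONNECTED for a slot, further queries for that slot are short-circuited for SKIP_INVALID_CONTROLLER_TIME (1100 ms), presumably because polling a disconnected XInput device is expensive. Roughly:

  // t = 0 ms    : XInputGetState fails -> set_skip(user_index) records the time
  // t = 500 ms  : should_skip(user_index) != 0 -> return immediately, no XInput call
  // t = 1200 ms : should_skip(user_index) clears the entry and returns 0, so the
  //               real XInput query runs again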
|
||||
|
||||
X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
|
||||
X_INPUT_CAPABILITIES* out_caps) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_CAPABILITIES native_caps;
|
||||
auto xigc = (decltype(&XInputGetCapabilities))XInputGetCapabilities_;
|
||||
DWORD result = xigc(user_index, flags, &native_caps);
|
||||
if (result) {
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -110,10 +136,18 @@ X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
|
|||
|
||||
X_RESULT XInputInputDriver::GetState(uint32_t user_index,
|
||||
X_INPUT_STATE* out_state) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_STATE native_state;
|
||||
auto xigs = (decltype(&XInputGetState))XInputGetState_;
|
||||
|
||||
DWORD result = xigs(user_index, &native_state);
|
||||
if (result) {
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -131,11 +165,18 @@ X_RESULT XInputInputDriver::GetState(uint32_t user_index,
|
|||
|
||||
X_RESULT XInputInputDriver::SetState(uint32_t user_index,
|
||||
X_INPUT_VIBRATION* vibration) {
|
||||
DWORD skipper = should_skip(user_index);
|
||||
if (skipper) {
|
||||
return skipper;
|
||||
}
|
||||
XINPUT_VIBRATION native_vibration;
|
||||
native_vibration.wLeftMotorSpeed = vibration->left_motor_speed;
|
||||
native_vibration.wRightMotorSpeed = vibration->right_motor_speed;
|
||||
auto xiss = (decltype(&XInputSetState))XInputSetState_;
|
||||
DWORD result = xiss(user_index, &native_vibration);
|
||||
if (result == ERROR_DEVICE_NOT_CONNECTED) {
|
||||
set_skip(user_index);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -948,7 +948,11 @@ bool KernelState::Restore(ByteStream* stream) {
|
|||
}
|
||||
|
||||
uint8_t KernelState::GetConnectedUsers() const {
|
||||
return emulator_->input_system()->GetConnectedSlots();
|
||||
auto input_sys = emulator_->input_system();
|
||||
|
||||
auto lock = input_sys->lock();
|
||||
|
||||
return input_sys->GetConnectedSlots();
|
||||
}
|
||||
|
||||
void KernelState::UpdateUsedUserProfiles() {
|
||||
|
|
|
@ -58,6 +58,7 @@ dword_result_t XamInputGetCapabilities_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetCapabilities(actual_user_index, flags, caps);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetCapabilities, kInput, kSketchy);
|
||||
|
@ -81,6 +82,7 @@ dword_result_t XamInputGetCapabilitiesEx_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetCapabilities(actual_user_index, flags, caps);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
|
||||
|
@ -88,6 +90,13 @@ DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
|
|||
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetstate(v=vs.85).aspx
|
||||
dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
|
||||
pointer_t<X_INPUT_STATE> input_state) {
|
||||
if (input_state) {
|
||||
memset((void*)input_state.host_address(), 0, sizeof(X_INPUT_STATE));
|
||||
}
|
||||
if (user_index >= 4) {
|
||||
return X_ERROR_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
|
||||
// Games call this with a NULL state ptr, probably as a query.
|
||||
|
||||
if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
|
||||
|
@ -96,12 +105,14 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
|
|||
}
|
||||
|
||||
uint32_t actual_user_index = user_index;
|
||||
// chrispy: change this, logic is not right
|
||||
if ((actual_user_index & 0xFF) == 0xFF || (flags & XINPUT_FLAG_ANY_USER)) {
|
||||
// Always pin user to 0.
|
||||
actual_user_index = 0;
|
||||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetState(user_index, input_state);
|
||||
}
|
||||
DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
|
||||
|
@ -109,6 +120,9 @@ DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
|
|||
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
|
||||
dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
|
||||
pointer_t<X_INPUT_VIBRATION> vibration) {
|
||||
if (user_index >= 4) {
|
||||
return X_E_DEVICE_NOT_CONNECTED;
|
||||
}
|
||||
if (!vibration) {
|
||||
return X_ERROR_BAD_ARGUMENTS;
|
||||
}
|
||||
|
@ -120,6 +134,7 @@ dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->SetState(user_index, vibration);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputSetState, kInput, kImplemented);
|
||||
|
@ -147,6 +162,7 @@ dword_result_t XamInputGetKeystroke_entry(
|
|||
}
|
||||
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
auto lock = input_system->lock();
|
||||
return input_system->GetKeystroke(user_index, flags, keystroke);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamInputGetKeystroke, kInput, kImplemented);
|
||||
|
@ -166,14 +182,15 @@ dword_result_t XamInputGetKeystrokeEx_entry(
|
|||
|
||||
uint32_t user_index = *user_index_ptr;
|
||||
auto input_system = kernel_state()->emulator()->input_system();
|
||||
|
||||
auto lock = input_system->lock();
|
||||
if ((user_index & 0xFF) == 0xFF) {
|
||||
// Always pin user to 0.
|
||||
user_index = 0;
|
||||
}
|
||||
|
||||
if (flags & XINPUT_FLAG_ANY_USER) {
|
||||
// That flag means we should iterate over every connected controller and check which one have pending request.
|
||||
// That flag means we should iterate over every connected controller and
|
||||
// check which one has a pending request.
|
||||
auto result = X_ERROR_DEVICE_NOT_CONNECTED;
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
auto result = input_system->GetKeystroke(i, flags, keystroke);
|
||||
|
@ -188,6 +205,7 @@ dword_result_t XamInputGetKeystrokeEx_entry(
|
|||
}
|
||||
|
||||
auto result = input_system->GetKeystroke(user_index, flags, keystroke);
|
||||
|
||||
if (XSUCCEEDED(result)) {
|
||||
*user_index_ptr = keystroke->user_index;
|
||||
}
|
||||
|
@ -202,7 +220,8 @@ X_HRESULT_result_t XamUserGetDeviceContext_entry(dword_t user_index,
|
|||
// If this function fails they assume zero, so let's fail AND
|
||||
// set zero just to be safe.
|
||||
*out_ptr = 0;
|
||||
if (kernel_state()->IsUserSignedIn(user_index) || (user_index & 0xFF) == 0xFF) {
|
||||
if (kernel_state()->IsUserSignedIn(user_index) ||
|
||||
(user_index & 0xFF) == 0xFF) {
|
||||
*out_ptr = (uint32_t)user_index;
|
||||
return X_E_SUCCESS;
|
||||
} else {
|
||||
|
|
|
@ -121,7 +121,8 @@ dword_result_t NtAllocateVirtualMemory_entry(lpdword_t base_addr_ptr,
|
|||
? -int32_t(region_size_ptr.value())
|
||||
: region_size_ptr.value();
|
||||
|
||||
adjusted_size = xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
|
||||
adjusted_size =
|
||||
xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
|
||||
|
||||
// Allocate.
|
||||
uint32_t allocation_type = 0;
|
||||
|
@ -295,10 +296,19 @@ struct X_MEMORY_BASIC_INFORMATION {
|
|||
be<uint32_t> protect;
|
||||
be<uint32_t> type;
|
||||
};
|
||||
|
||||
// chrispy: added region_type ? guessed name, haven't seen any value except 0 used
|
||||
dword_result_t NtQueryVirtualMemory_entry(
|
||||
dword_t base_address,
|
||||
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr) {
|
||||
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr,
|
||||
dword_t region_type) {
|
||||
switch (region_type) {
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
break;
|
||||
default:
|
||||
return X_STATUS_INVALID_PARAMETER;
|
||||
}
|
||||
auto heap = kernel_state()->memory()->LookupHeap(base_address);
|
||||
HeapAllocationInfo alloc_info;
|
||||
if (heap == nullptr || !heap->QueryRegionInfo(base_address, &alloc_info)) {
|
||||
|
@ -373,8 +383,9 @@ dword_result_t MmAllocatePhysicalMemoryEx_entry(
|
|||
// min_addr_range/max_addr_range are bounds in physical memory, not virtual.
|
||||
uint32_t heap_base = heap->heap_base();
|
||||
uint32_t heap_physical_address_offset = heap->GetPhysicalAddress(heap_base);
|
||||
// TODO(Gliniak): Games like 545108B4 compares min_addr_range with value returned.
|
||||
// 0x1000 offset causes it to go below that minimal range and goes haywire
|
||||
// TODO(Gliniak): Games like 545108B4 compare min_addr_range with the value
|
||||
// returned. The 0x1000 offset causes it to go below that minimal range and go
|
||||
// haywire.
|
||||
if (min_addr_range && max_addr_range) {
|
||||
heap_physical_address_offset = 0;
|
||||
}
|
||||
|
|
|
@ -53,21 +53,20 @@ DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemory, kMemory, kImplemented);
|
|||
// https://msdn.microsoft.com/en-us/library/ff552123
|
||||
dword_result_t RtlCompareMemoryUlong_entry(lpvoid_t source, dword_t length,
|
||||
dword_t pattern) {
|
||||
// Return 0 if source/length not aligned
|
||||
if (source.guest_address() % 4 || length % 4) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t num_compared_bytes = 0;
|
||||
|
||||
uint32_t n = 0;
|
||||
for (uint32_t i = 0; i < (length / 4); i++) {
|
||||
// FIXME: This assumes as_array returns xe::be
|
||||
uint32_t val = source.as_array<uint32_t>()[i];
|
||||
if (val == pattern) {
|
||||
n++;
|
||||
uint32_t swapped_pattern = xe::byte_swap(pattern.value());
|
||||
|
||||
char* host_source = (char*)source.host_address();
|
||||
|
||||
for (uint32_t aligned_length = length & 0xFFFFFFFCU; aligned_length;
|
||||
num_compared_bytes += 4) {
|
||||
if (*(uint32_t*)(host_source + num_compared_bytes) != swapped_pattern) {
|
||||
break;
|
||||
}
|
||||
aligned_length = aligned_length - 4;
|
||||
}
|
||||
|
||||
return n;
|
||||
return num_compared_bytes;
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemoryUlong, kMemory, kImplemented);
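Worked example of the new behavior: the guest buffer is big-endian, so the comparison uses the byte-swapped pattern against host memory and stops at the first mismatching dword. For a guest buffer holding the dwords { 0x11223344, 0x11223344, 0xAABBCCDD } with length == 12 and pattern == 0x11223344, the first two dwords match and the third does not, so the function returns 8.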
|
||||
|
||||
|
@ -85,23 +84,61 @@ void RtlFillMemoryUlong_entry(lpvoid_t destination, dword_t length,
|
|||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlFillMemoryUlong, kMemory, kImplemented);
|
||||
|
||||
dword_result_t RtlUpperChar_entry(dword_t in) {
|
||||
char c = in & 0xFF;
|
||||
if (c >= 'a' && c <= 'z') {
|
||||
return c ^ 0x20;
|
||||
}
|
||||
static constexpr const unsigned char rtl_lower_table[256] = {
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
|
||||
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
|
||||
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
|
||||
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
|
||||
0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
|
||||
0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
|
||||
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
|
||||
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
|
||||
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
|
||||
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
|
||||
0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
|
||||
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
|
||||
0xFC, 0xFD, 0xFE, 0xFF};
|
||||
|
||||
return c;
|
||||
static constexpr const unsigned char rtl_upper_table[256] = {
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
|
||||
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
|
||||
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
|
||||
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
|
||||
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
|
||||
0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
|
||||
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
|
||||
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
|
||||
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
|
||||
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
|
||||
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
|
||||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xC0, 0xC1, 0xC2, 0xC3,
|
||||
0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xF7, 0xD8, 0xD9, 0xDA, 0xDB,
|
||||
0xDC, 0xDD, 0xDE, 0x3F};
|
||||
|
||||
dword_result_t RtlUpperChar_entry(dword_t in) {
|
||||
return rtl_upper_table[in & 0xff];
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlUpperChar, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlLowerChar_entry(dword_t in) {
|
||||
char c = in & 0xFF;
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
return c ^ 0x20;
|
||||
}
|
||||
|
||||
return c;
|
||||
return rtl_lower_table[in & 0xff];
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented);
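A few sample lookups from the tables above (which appear to mirror the Win32 RTL Latin-1 behavior): RtlUpperChar(0x61 'a') == 0x41 'A', RtlUpperChar(0xE0 'à') == 0xC0 'À', RtlUpperChar(0xF7 '÷') stays 0xF7, RtlUpperChar(0xFF 'ÿ') == 0x3F '?', and RtlLowerChar(0xD7 '×') stays 0xD7.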
|
||||
|
||||
|
|
|
@ -473,7 +473,8 @@ void VdSwap_entry(
|
|||
dwords[i] = xenos::MakePacketType2();
|
||||
}
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT2(VdSwap, kVideo, kImplemented, kImportant);
|
||||
DECLARE_XBOXKRNL_EXPORT3(VdSwap, kVideo, kImplemented, kHighFrequency,
|
||||
kImportant);
|
||||
|
||||
void RegisterVideoExports(xe::cpu::ExportResolver* export_resolver,
|
||||
KernelState* kernel_state) {
|
||||
|
|
|
@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
|
|||
}
|
||||
|
||||
bool Memory::AccessViolationCallback(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* host_address, bool is_write) {
|
||||
// Access via physical_membase_ is special, when need to bypass everything
|
||||
// (for instance, for a data provider to actually write the data) so only
|
||||
|
@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
|
|||
}
|
||||
|
||||
bool Memory::AccessViolationCallbackThunk(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* context, void* host_address, bool is_write) {
|
||||
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
|
||||
std::move(global_lock_locked_once), host_address, is_write);
|
||||
}
|
||||
|
||||
bool Memory::TriggerPhysicalMemoryCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect) {
|
||||
BaseHeap* heap = LookupHeap(virtual_address);
|
||||
|
@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
|
|||
}
|
||||
|
||||
bool PhysicalHeap::TriggerCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect) {
|
||||
// TODO(Triang3l): Support read watches.
|
||||
|
|
|
@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
|
|||
bool enable_invalidation_notifications,
|
||||
bool enable_data_providers);
|
||||
// Returns true if any page in the range was watched.
|
||||
bool TriggerCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect = true);
|
||||
|
||||
|
@ -459,7 +458,7 @@ class Memory {
|
|||
// TODO(Triang3l): Implement data providers - this is why locking depth of 1
|
||||
// will be required in the future.
|
||||
bool TriggerPhysicalMemoryCallbacks(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
||||
bool unwatch_exact_range, bool unprotect = true);
|
||||
|
||||
|
@ -508,11 +507,10 @@ class Memory {
|
|||
static uint32_t HostToGuestVirtualThunk(const void* context,
|
||||
const void* host_address);
|
||||
|
||||
bool AccessViolationCallback(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
|
||||
void* host_address, bool is_write);
|
||||
static bool AccessViolationCallbackThunk(
|
||||
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
|
||||
global_unique_lock_type global_lock_locked_once,
|
||||
void* context, void* host_address, bool is_write);
|
||||
|
||||
std::filesystem::path file_name_;
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 15ece0882e8d5875051ff5b73c5a8326f7cee9f5
|
||||
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7
|
|
@ -1 +1 @@
|
|||
Subproject commit 5787e9cb7551c8c79cf9ce14f7be860dc907e9a4
|
||||
Subproject commit 302b6e03e829c6d6a70415f10d818a5088cb6ccf
|