Huge set of performance improvements; combined with an architecture-specific build and clang-cl, users have reported absurd gains over master for some games, in the range of 50%-90%.

But for normal MSVC builds I would put it at around 30-50%.
Added per-XexModule caching of per-instruction information; this can be used to remember what code needs compiling at startup.
Record which guest addresses wrote to MMIO and backpropagate that to future runs, eliminating the dependence on exception trapping. This makes many games, like H3, actually tolerable to run under a debugger.
Fixed a number of errors where temporaries were being passed by reference/pointer.
Can now be compiled with clang-cl 14.0.1; requires turning -Werror off, though, and some other solution/project changes.
Added macros wrapping compiler extensions like noinline, forceinline, __expect, and cold.
Removed the "global lock" in guest code completely. It does not properly emulate the behavior of mfmsrd/mtmsr and it seriously cripples amd cpus. Removing this yielded around a 3x speedup in Halo Reach for me.
Disabled the microprofiler for now. The microprofiler has a huge performance cost associated with it. Developers can re-enable it in the base/profiling header if they really need it
Disable the trace writer in release builds. despite just returning after checking if the file was open the trace functions were consuming about 0.60% cpu time total
Add IsValidReg, GetRegisterInfo is a huge (about 45k) branching function and using that to check if a register was valid consumed a significant chunk of time
Optimized RingBuffer::ReadAndSwap and RingBuffer::read_count. This gave us the largest overall boost in performance. The memcpies were unnecessary and one of them was always a no-op
Added simplification rules for multiplicative patterns like (x+x) and (x<<1)+x.
For the most frequently called Win32 functions, I added code to call their underlying NT implementations, which lets us skip a lot of MS code that we don't care about / isn't relevant to our use cases.
^ This can be toggled off in the platform_win header.
Handle indirect-call-true with a constant function pointer; this was occurring in H3.
Look up the host format swizzle in a denser array.
By default, don't check whether a GPU register is unknown; instead just check whether it's out of range. Controlled by a cvar.
^ Looking up whether it's known or not took approx. 0.3% of CPU time.
Changed some things in /cpu to make the project UNITYBUILD-friendly.
The timer thread was spinning way too much and consuming a ton of CPU; changed it to use a blocking wait instead.
Tagged some conditions as XE_UNLIKELY/XE_LIKELY based on profiler feedback (will only affect clang builds).
Shifted around some code in CommandProcessor::WriteRegister based on how frequently it was executed
Added support for docdecaduple-precision floating point so that we can represent our performance gains numerically.
Tons of other stuff I'm probably forgetting.
chss95cs@gmail.com 2022-08-13 12:59:00 -07:00
parent 2f59487bf3
commit cb85fe401c
49 changed files with 1462 additions and 483 deletions

View File

@ -46,7 +46,9 @@ static_assert((std::endian::native == std::endian::big) ||
namespace xe {
#if XE_COMPILER_MSVC
// chrispy: added workaround for clang, otherwise byteswap_ulong becomes calls
// to ucrtbase
#if XE_COMPILER_MSVC == 1 && !defined(__clang__)
#define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
#define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
#define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64
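
A minimal sketch of how the other branch of this guard presumably reads — the #else side falls outside this hunk, and the mapping to the GNU builtins is an assumption (it would explain why clang-cl otherwise emits ucrtbase calls instead of a single bswap):

#else
#define XENIA_BASE_BYTE_SWAP_16 __builtin_bswap16
#define XENIA_BASE_BYTE_SWAP_32 __builtin_bswap32
#define XENIA_BASE_BYTE_SWAP_64 __builtin_bswap64
#endif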

View File

@ -28,7 +28,8 @@ namespace xe {
class Win32MappedMemory : public MappedMemory {
public:
// CreateFile returns INVALID_HANDLE_VALUE in case of failure.
static constexpr HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
// chrispy: made inline const to get around clang error
static inline const HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
// CreateFileMapping returns nullptr in case of failure.
static constexpr HANDLE kMappingHandleInvalid = nullptr;
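
A minimal repro of the clang error the comment refers to: INVALID_HANDLE_VALUE expands to ((HANDLE)(LONG_PTR)-1), and clang rejects that integer-to-pointer cast in a constant expression, while MSVC accepts it as an extension.

// clang-cl: "constexpr variable must be initialized by a constant
// expression" (the cast is not constant-evaluable)
static constexpr HANDLE kBad = INVALID_HANDLE_VALUE;
// fine: ordinary dynamic initialization at load time
static inline const HANDLE kOk = INVALID_HANDLE_VALUE;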

View File

@ -15,7 +15,15 @@
WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
#define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
#endif
/*
these two dont bypass much ms garbage compared to the threading ones,
but Protect is used by PhysicalHeap::EnableAccessCallbacks which eats a lot
of cpu time, so every bit counts
*/
XE_NTDLL_IMPORT(NtProtectVirtualMemory, cls_NtProtectVirtualMemory,
NtProtectVirtualMemoryPointer);
XE_NTDLL_IMPORT(NtQueryVirtualMemory, cls_NtQueryVirtualMemory,
NtQueryVirtualMemoryPointer);
namespace xe {
namespace memory {
@ -139,6 +147,18 @@ bool Protect(void* base_address, size_t length, PageAccess access,
*out_old_access = PageAccess::kNoAccess;
}
DWORD new_protect = ToWin32ProtectFlags(access);
#if XE_USE_NTDLL_FUNCTIONS == 1
DWORD old_protect = 0;
SIZE_T MemoryLength = length;
PVOID MemoryCache = base_address;
BOOL result = NtProtectVirtualMemoryPointer.invoke<NTSTATUS>(
(HANDLE)0xFFFFFFFFFFFFFFFFLL, &MemoryCache, &MemoryLength,
new_protect, &old_protect) >= 0;
#else
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
DWORD old_protect = 0;
BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect);
@ -146,6 +166,7 @@ bool Protect(void* base_address, size_t length, PageAccess access,
ULONG old_protect = 0;
BOOL result = VirtualProtectFromApp(base_address, length, ULONG(new_protect),
&old_protect);
#endif
#endif
if (!result) {
return false;
@ -161,8 +182,17 @@ bool QueryProtect(void* base_address, size_t& length, PageAccess& access_out) {
MEMORY_BASIC_INFORMATION info;
ZeroMemory(&info, sizeof(info));
#if XE_USE_NTDLL_FUNCTIONS == 1
ULONG_PTR ResultLength;
NTSTATUS query_result = NtQueryVirtualMemoryPointer.invoke<NTSTATUS>(
(HANDLE)0xFFFFFFFFFFFFFFFFLL, (PVOID)base_address,
0 /* MemoryBasicInformation*/, &info, length, &ResultLength);
SIZE_T result = query_result >= 0 ? ResultLength : 0;
#else
SIZE_T result = VirtualQuery(base_address, &info, length);
#endif
if (!result) {
return false;
}
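
A sketch of the calling convention both hunks rely on: (HANDLE)-1 is the NtCurrentProcess pseudo-handle, and NT APIs report success as any non-negative NTSTATUS, which is what the >= 0 comparisons test. A hypothetical call through the wrapper (the region is illustrative):

static char region[0x1000];          // stand-in for a page-aligned region
PVOID base = region;
SIZE_T size = sizeof(region);
ULONG old_protect = 0;
NTSTATUS st = NtProtectVirtualMemoryPointer.invoke<NTSTATUS>(
    (HANDLE)0xFFFFFFFFFFFFFFFFLL,    // NtCurrentProcess pseudo-handle
    &base, &size, PAGE_READWRITE, &old_protect);
bool ok = st >= 0;                   // same test as NT_SUCCESS(st)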

View File

@ -10,10 +10,9 @@
#include "xenia/base/mutex.h"
namespace xe {
std::recursive_mutex& global_critical_region::mutex() {
static std::recursive_mutex global_mutex;
return global_mutex;
}
// chrispy: moved this out of body of function to eliminate the initialization
// guards
static std::recursive_mutex global_mutex;
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
} // namespace xe
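
For context, a sketch of what moving the static eliminates, assuming a typical ABI: a function-local static is lazily initialized, so every call pays at least a guard check (and potentially __cxa_guard_acquire under the Itanium ABI); a namespace-scope static is initialized once during startup, so the accessor reduces to returning an address.

// before: guard variable tested on every call
std::recursive_mutex& guarded() {
  static std::recursive_mutex m;
  return m;
}
// after: initialized at startup, accessor is a plain address return
static std::recursive_mutex g_m;
std::recursive_mutex& unguarded() { return g_m; }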

View File

@ -41,19 +41,33 @@
#error Unsupported target OS.
#endif
#if defined(__clang__)
#if defined(__clang__) && !defined(_MSC_VER) // chrispy: support clang-cl
#define XE_COMPILER_CLANG 1
#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
#elif defined(__GNUC__)
#define XE_COMPILER_GNUC 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#elif defined(_MSC_VER)
#define XE_COMPILER_MSVC 1
#define XE_COMPILER_HAS_MSVC_EXTENSIONS 1
#elif defined(__MINGW32)
#define XE_COMPILER_MINGW32 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#elif defined(__INTEL_COMPILER)
#define XE_COMPILER_INTEL 1
#else
#define XE_COMPILER_UNKNOWN 1
#endif
// chrispy: had to place this here.
#if defined(__clang__) && defined(_MSC_VER)
#define XE_COMPILER_CLANG_CL 1
#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
#endif
// clang extensions == superset of gnu extensions
#if XE_COMPILER_HAS_CLANG_EXTENSIONS == 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#endif
#if defined(_M_AMD64) || defined(__amd64__)
#define XE_ARCH_AMD64 1
@ -93,6 +107,29 @@
#define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value)
#define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value)
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
#define XE_FORCEINLINE __forceinline
#define XE_NOINLINE __declspec(noinline)
// can't properly emulate "cold" in msvc, but can still segregate the function
// into its own seg
#define XE_COLD __declspec(code_seg(".cold"))
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_FORCEINLINE __attribute__((always_inline))
#define XE_NOINLINE __attribute__((noinline))
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE
#define XE_COLD
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#endif
namespace xe {
#if XE_PLATFORM_WIN32
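
A hypothetical usage sketch for the new annotations (the function names are illustrative, not from the commit):

#include <cstdlib>
// cold: segregated into the .cold code_seg on MSVC, attribute((cold)) on
// compilers with GNU extensions
XE_NOINLINE XE_COLD static void HandleRareFailure() { std::abort(); }

void Consume(uint32_t v, uint32_t limit) {
  if (XE_UNLIKELY(v >= limit)) {  // __builtin_expect hint on clang/gcc only
    HandleRareFailure();
  }
}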

View File

@ -34,4 +34,31 @@
#undef DeleteFile
#undef GetFirstChild
#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
/*
ntdll versions of functions often skip through a lot of extra garbage in KernelBase
*/
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
static class cls { \
public: \
FARPROC fn;\
cls() : fn(nullptr) {\
auto ntdll = GetModuleHandleA("ntdll.dll");\
if (ntdll) { \
fn = GetProcAddress(ntdll, #name );\
}\
} \
template <typename TRet = void, typename... TArgs> \
inline TRet invoke(TArgs... args) {\
return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...);\
}\
inline operator bool() const {\
return fn!=nullptr;\
}\
} clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false
#endif
#endif // XENIA_BASE_PLATFORM_WIN_H_
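
A hypothetical usage sketch, assuming XE_USE_NTDLL_FUNCTIONS == 1 (NtClose is illustrative; the operator bool lets callers fall back to the Win32 wrapper if the ntdll lookup failed):

XE_NTDLL_IMPORT(NtClose, cls_NtClose, NtClosePointer);

void CloseQuick(HANDLE h) {
  if (NtClosePointer) {                // resolved once at static-init time
    NtClosePointer.invoke<NTSTATUS>(h);
  } else {
    CloseHandle(h);                    // ordinary kernelbase path
  }
}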

View File

@ -20,7 +20,7 @@
#include "xenia/ui/virtual_key.h"
#include "xenia/ui/window_listener.h"
#if XE_PLATFORM_WIN32
#if XE_PLATFORM_WIN32 && 0
#define XE_OPTION_PROFILING 1
#define XE_OPTION_PROFILING_UI 1
#else

View File

@ -19,7 +19,26 @@
#include "xenia/base/byte_order.h"
namespace xe {
/*
todo: this class is CRITICAL to the performance of the entire emulator.
currently, about 0.74% cpu time is still taken up by ReadAndSwap and 0.23%
is used by read_count. I believe that part of the issue is that smaller
ringbuffers are kicking off an automatic prefetcher stream that ends up
reading ahead of the end of the ring, because it can only go in a straight
line; it then gets a cache miss when it eventually wraps around to the start
of the ring? really hard to tell whats going on there honestly. maybe we can
occasionally prefetch the first line of the ring to L1? For the automatic
prefetching i don't think there are any good options. I don't know if we have
any control over where these buffers will be (they seem to be in guest memory
:/), but if we did we could right-justify the buffer so that the final byte
of the ring ends at the end of a page; i think most automatic prefetchers
cannot cross page boundaries. it does feel like something isnt right here
though.
todo: microoptimization, we can change our size members to be uint32 so
that the registers no longer need the REX prefix, shrinking the generated
code a bit... like i said, every bit helps in this class
*/
class RingBuffer {
public:
RingBuffer(uint8_t* buffer, size_t capacity);
@ -32,6 +51,8 @@ class RingBuffer {
uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
size_t read_count() const {
// chrispy: these branches are unpredictable
#if 0
if (read_offset_ == write_offset_) {
return 0;
} else if (read_offset_ < write_offset_) {
@ -39,6 +60,33 @@ class RingBuffer {
} else {
return (capacity_ - read_offset_) + write_offset_;
}
#else
size_t read_offs = read_offset_;
size_t write_offs = write_offset_;
size_t cap = capacity_;
size_t offset_delta = write_offs - read_offs;
size_t wrap_read_count = (cap - read_offs) + write_offs;
size_t comparison_value = read_offs <= write_offs;
#if 0
size_t selector =
static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
offset_delta &= selector;
wrap_read_count &= ~selector;
return offset_delta | wrap_read_count;
#else
if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically
// identical to old code (i checked the asm, msvc
// does not automatically do this)
} else {
return wrap_read_count;
}
#endif
#endif
}
size_t write_offset() const { return write_offset_; }
@ -113,6 +161,28 @@ class RingBuffer {
size_t write_offset_ = 0;
};
template <>
inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
size_t read_offset = this->read_offset_;
xenia_assert(this->capacity_ >= 4);
size_t next_read_offset = read_offset + 4;
#if 0
size_t zerotest = next_read_offset - this->capacity_;
// unpredictable branch, use bit arith instead
// todo: it would be faster to use lzcnt, but we need to figure out if all
// machines we support support it
next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
next_read_offset = 0;
//todo: maybe prefetch next? or should that happen much earlier?
}
#endif
this->read_offset_ = next_read_offset;
unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
return xe::byte_swap(ring_value);
}
} // namespace xe
#endif // XENIA_BASE_RING_BUFFER_H_
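
A usage sketch for the specialized path, assuming the matching set_write_offset mutator (values illustrative):

uint8_t backing[16] = {0x12, 0x34, 0x56, 0x78};
xe::RingBuffer rb(backing, sizeof(backing));
rb.set_write_offset(4);
// pops one big-endian u32 and swaps it to host order; the wrap back to
// offset 0 is the predicted-untaken branch above rather than a modulo
uint32_t value = rb.ReadAndSwap<uint32_t>();  // 0x12345678 on little-endian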

View File

@ -10,12 +10,12 @@
#include <algorithm>
#include <forward_list>
#include "third_party/disruptorplus/include/disruptorplus/blocking_wait_strategy.hpp"
#include "third_party/disruptorplus/include/disruptorplus/multi_threaded_claim_strategy.hpp"
#include "third_party/disruptorplus/include/disruptorplus/ring_buffer.hpp"
#include "third_party/disruptorplus/include/disruptorplus/sequence_barrier.hpp"
#include "third_party/disruptorplus/include/disruptorplus/spin_wait.hpp"
#include "third_party/disruptorplus/include/disruptorplus/spin_wait_strategy.hpp"
#include "xenia/base/assert.h"
#include "xenia/base/threading.h"
#include "xenia/base/threading_timer_queue.h"
@ -26,6 +26,12 @@ namespace xe {
namespace threading {
using WaitItem = TimerQueueWaitItem;
/*
chrispy: changed this to a blocking wait from a spin-wait, the spin was
monopolizing a ton of cpu time (depending on the game 2-4% of total cpu time)
on my 3990x. no complaints since that change
*/
using WaitStrat = dp::blocking_wait_strategy;
class TimerQueue {
public:
@ -147,9 +153,10 @@ class TimerQueue {
// This ring buffer will be used to introduce timers queued by the public API
static constexpr size_t kWaitCount = 512;
dp::ring_buffer<std::shared_ptr<WaitItem>> buffer_;
dp::spin_wait_strategy wait_strategy_;
dp::multi_threaded_claim_strategy<dp::spin_wait_strategy> claim_strategy_;
dp::sequence_barrier<dp::spin_wait_strategy> consumed_;
WaitStrat wait_strategy_;
dp::multi_threaded_claim_strategy<WaitStrat> claim_strategy_;
dp::sequence_barrier<WaitStrat> consumed_;
// This is a _sorted_ (ascending due_) list of active timers managed by a
// dedicated thread

View File

@ -7,19 +7,49 @@
******************************************************************************
*/
#include <winternl.h>
#include "xenia/base/assert.h"
#include "xenia/base/chrono_steady_cast.h"
#include "xenia/base/logging.h"
#include "xenia/base/platform_win.h"
#include "xenia/base/threading.h"
#include "xenia/base/threading_timer_queue.h"
#if defined(__clang__)
// chrispy: i do not understand why this is an error for clang here
// something about the quoted __FUNCTION__ freaks it out (clang 14.0.1)
#define LOG_LASTERROR() \
{ XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); }
do { \
XELOGI("Win32 Error 0x{:08X} in {} (...)", GetLastError(), __FUNCTION__); \
} while (false)
#else
#define LOG_LASTERROR() \
do { \
XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); \
} while (false)
#endif
typedef HANDLE (*SetThreadDescriptionFn)(HANDLE hThread,
PCWSTR lpThreadDescription);
// sys function for ntyieldexecution, by calling it we sidestep
// RtlGetCurrentUmsThread
XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution,
NtYieldExecutionPointer);
// sidestep the activation context/remapping special windows handles like stdout
XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
NtWaitForSingleObjectPointer);
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
// the event's state prior to the call, but we don't need that. might need to
// check whether one or the other is faster in the kernel though... yeah, just
// checked: the code in ntoskrnl is way simpler for clearevent than resetevent
XE_NTDLL_IMPORT(NtClearEvent, cls_NtClearEvent, NtClearEventPointer);
XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
// heavily called, we dont skip much garbage by calling this, but every bit
// counts
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
NtReleaseSemaphorePointer);
namespace xe {
namespace threading {
@ -80,7 +110,13 @@ void set_name(const std::string_view name) {
}
void MaybeYield() {
#if defined(XE_USE_NTDLL_FUNCTIONS)
NtYieldExecutionPointer.invoke();
#else
SwitchToThread();
#endif
// memorybarrier is really not necessary here...
MemoryBarrier();
}
@ -134,8 +170,26 @@ class Win32Handle : public T {
WaitResult Wait(WaitHandle* wait_handle, bool is_alertable,
std::chrono::milliseconds timeout) {
HANDLE handle = wait_handle->native_handle();
DWORD result = WaitForSingleObjectEx(handle, DWORD(timeout.count()),
is_alertable ? TRUE : FALSE);
DWORD result;
DWORD timeout_dw = DWORD(timeout.count());
BOOL bAlertable = is_alertable ? TRUE : FALSE;
// todo: we might actually be able to use NtWaitForSingleObject even if its
// alertable, just need to study whether
// RtlDeactivateActivationContextUnsafeFast/RtlActivateActivationContext are
// actually needed for us
#if XE_USE_NTDLL_FUNCTIONS == 1
if (bAlertable) {
result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
} else {
LARGE_INTEGER timeout_big;
timeout_big.QuadPart = -10000LL * static_cast<int64_t>(timeout_dw);
result = NtWaitForSingleObjectPointer.invoke<NTSTATUS>(
handle, bAlertable, timeout_dw == INFINITE ? nullptr : &timeout_big);
}
#else
result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
#endif
switch (result) {
case WAIT_OBJECT_0:
return WaitResult::kSuccess;
@ -178,7 +232,9 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
size_t wait_handle_count,
bool wait_all, bool is_alertable,
std::chrono::milliseconds timeout) {
std::vector<HANDLE> handles(wait_handle_count);
std::vector<HANDLE> handles(
wait_handle_count); // max handles is like 64, so it would make more
// sense to just do a fixed size array here
for (size_t i = 0; i < wait_handle_count; ++i) {
handles[i] = wait_handles[i]->native_handle();
}
@ -208,9 +264,16 @@ class Win32Event : public Win32Handle<Event> {
public:
explicit Win32Event(HANDLE handle) : Win32Handle(handle) {}
~Win32Event() override = default;
#if XE_USE_NTDLL_FUNCTIONS == 1
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
void Reset() override { NtClearEventPointer.invoke(handle_); }
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
#else
void Set() override { SetEvent(handle_); }
void Reset() override { ResetEvent(handle_); }
void Pulse() override { PulseEvent(handle_); }
#endif
};
std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
@ -220,6 +283,7 @@ std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
return std::make_unique<Win32Event>(handle);
} else {
LOG_LASTERROR();
return nullptr;
}
}
@ -240,10 +304,15 @@ class Win32Semaphore : public Win32Handle<Semaphore> {
explicit Win32Semaphore(HANDLE handle) : Win32Handle(handle) {}
~Win32Semaphore() override = default;
bool Release(int release_count, int* out_previous_count) override {
#if XE_USE_NTDLL_FUNCTIONS == 1
return NtReleaseSemaphorePointer.invoke<NTSTATUS>(handle_, release_count,
out_previous_count) >= 0;
#else
return ReleaseSemaphore(handle_, release_count,
reinterpret_cast<LPLONG>(out_previous_count))
? true
: false;
#endif
}
};

View File

@ -82,8 +82,9 @@ std::string upper_ascii(const std::string_view view) {
template <bool LOWER>
inline size_t hash_fnv1a(const std::string_view view) {
const size_t offset_basis = 0xCBF29CE484222325ull;
// chrispy: constant capture errors on clang
auto work = [](size_t hash, uint8_t byte_of_data) {
const size_t prime = 0x00000100000001B3ull;
auto work = [&prime](size_t hash, uint8_t byte_of_data) {
hash ^= byte_of_data;
hash *= prime;
return hash;

View File

@ -25,7 +25,7 @@
#include "xenia/cpu/breakpoint.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"
DEFINE_int32(x64_extension_mask, -1,
"Allow the detection and utilization of specific instruction set "
"features.\n"
@ -45,6 +45,12 @@ DEFINE_int32(x64_extension_mask, -1,
" -1 = Detect and utilize all possible processor features\n",
"x64");
DEFINE_bool(record_mmio_access_exceptions, true,
"For guest addresses records whether we caught any mmio accesses "
"for them. This info can then be used on a subsequent run to "
"instruct the recompiler to emit checks",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@ -86,6 +92,11 @@ X64Backend::~X64Backend() {
ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
}
static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
reinterpret_cast<X64Backend*>(context)
->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
bool X64Backend::Initialize(Processor* processor) {
if (!Backend::Initialize(processor)) {
return false;
@ -146,6 +157,8 @@ bool X64Backend::Initialize(Processor* processor) {
// Setup exception callback
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
processor->memory()->SetMMIOExceptionRecordingCallback(
ForwardMMIOAccessForRecording, (void*)this);
return true;
}
@ -390,7 +403,28 @@ bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) {
auto backend = reinterpret_cast<X64Backend*>(data);
return backend->ExceptionCallback(ex);
}
void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) {
uint64_t host_addr_u64 = (uint64_t)host_address;
auto fnfor = code_cache()->LookupFunction(host_addr_u64);
if (fnfor) {
uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64);
Module* guest_module = fnfor->module();
if (guest_module) {
XexModule* xex_guest_module = dynamic_cast<XexModule*>(guest_module);
if (xex_guest_module) {
cpu::InfoCacheFlags* icf =
xex_guest_module->GetInstructionAddressFlags(guestaddr);
if (icf) {
icf->accessed_mmio = true;
}
}
}
}
}
bool X64Backend::ExceptionCallback(Exception* ex) {
if (ex->code() != Exception::Code::kIllegalInstruction) {
// We only care about illegal instructions. Other things will be handled by
@ -399,6 +433,8 @@ bool X64Backend::ExceptionCallback(Exception* ex) {
return false;
}
// processor_->memory()->LookupVirtualMappedRange()
// Verify an expected illegal instruction.
auto instruction_bytes =
xe::load_and_swap<uint16_t>(reinterpret_cast<void*>(ex->pc()));

View File

@ -92,6 +92,8 @@ class X64Backend : public Backend {
}
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);

View File

@ -156,7 +156,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map) {
SCOPE_profile_cpu_f("cpu");
guest_module_ = dynamic_cast<XexModule*>(function->module());
// Reset.
debug_info_ = debug_info;
debug_info_flags_ = debug_info_flags;

View File

@ -18,8 +18,8 @@
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/hir/instr.h"
#include "xenia/cpu/hir/value.h"
#include "xenia/cpu/xex_module.h"
#include "xenia/memory.h"
// NOTE: must be included last as it expects windows.h to already be included.
#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
@ -65,11 +65,7 @@ enum class SimdDomain : uint32_t {
// CONFLICTING means its used in multiple domains)
};
enum class MXCSRMode : uint32_t {
Unknown,
Fpu,
Vmx
};
enum class MXCSRMode : uint32_t { Unknown, Fpu, Vmx };
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
if (dom1 == dom2) {
@ -326,16 +322,21 @@ class X64Emitter : public Xbyak::CodeGenerator {
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
void ForgetMxcsrMode() {
mxcsr_mode_ = MXCSRMode::Unknown;
}
void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
/*
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip clearing the overflow flag, as it will never be set in the vmx fpscr
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip
clearing the overflow flag, as it will never be set in the vmx fpscr
*/
bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set=false);//already_set means that the caller already did vldmxcsr, used for SET_ROUNDING_MODE
bool ChangeMxcsrMode(
MXCSRMode new_mode,
bool already_set = false); // already_set means that the caller already
// did vldmxcsr, used for SET_ROUNDING_MODE
void LoadFpuMxcsrDirect(); // unsafe, does not change mxcsr_mode_
void LoadVmxMxcsrDirect(); // unsafe, does not change mxcsr_mode_
XexModule* GuestModule() { return guest_module_; }
void LoadFpuMxcsrDirect(); //unsafe, does not change mxcsr_mode_
void LoadVmxMxcsrDirect(); //unsafe, does not change mxcsr_mode_
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);
@ -348,6 +349,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
X64Backend* backend_ = nullptr;
X64CodeCache* code_cache_ = nullptr;
XbyakAllocator* allocator_ = nullptr;
XexModule* guest_module_ = nullptr;
Xbyak::util::Cpu cpu_;
uint32_t feature_flags_ = 0;

View File

@ -60,23 +60,46 @@ union InstrKey {
InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
InstrKey(uint32_t v) : value(v) {}
// this used to take about 1% cpu while precompiling
// it kept reloading opcode, and also constantly repacking and unpacking the
// bitfields. instead, we pack the fields at the very end
InstrKey(const Instr* i) : value(0) {
opcode = i->opcode->num;
uint32_t sig = i->opcode->signature;
dest =
GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0;
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
if (src1 == OPCODE_SIG_TYPE_V) {
src1 += i->src1.value->type;
const OpcodeInfo* info = i->GetOpcodeInfo();
uint32_t sig = info->signature;
OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
uint32_t out_desttype = (uint32_t)dest_type;
uint32_t out_src1type = (uint32_t)src1_type;
uint32_t out_src2type = (uint32_t)src2_type;
uint32_t out_src3type = (uint32_t)src3_type;
Value* destv = i->dest;
// pre-deref, even if not value
Value* src1v = i->src1.value;
Value* src2v = i->src2.value;
Value* src3v = i->src3.value;
if (out_src1type == OPCODE_SIG_TYPE_V) {
out_src1type += src1v->type;
}
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
if (src2 == OPCODE_SIG_TYPE_V) {
src2 += i->src2.value->type;
if (out_src2type == OPCODE_SIG_TYPE_V) {
out_src2type += src2v->type;
}
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
if (src3 == OPCODE_SIG_TYPE_V) {
src3 += i->src3.value->type;
if (out_src3type == OPCODE_SIG_TYPE_V) {
out_src3type += src3v->type;
}
opcode = info->num;
dest = out_desttype ? OPCODE_SIG_TYPE_V + destv->type : 0;
src1 = out_src1type;
src2 = out_src2type;
src3 = out_src3type;
}
template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X,

View File

@ -18,7 +18,7 @@
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/cpu/processor.h"
DEFINE_bool(
elide_e0_check, false,
"Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
@ -27,6 +27,10 @@ DEFINE_bool(enable_rmw_context_merging, false,
"Permit merging read-modify-write HIR instr sequences together "
"into x86 instructions that use a memory operand.",
"x64");
DEFINE_bool(emit_mmio_aware_stores_for_recorded_exception_addresses, true,
"Uses info gathered via record_mmio_access_exceptions to emit "
"special stores that are faster than trapping the exception",
"CPU");
namespace xe {
namespace cpu {
@ -965,6 +969,21 @@ struct STORE_MMIO_I32
}
};
EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32);
// according to triangle we dont support mmio reads atm so no point in
// implementing this for them
static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) {
return false;
}
uint32_t guestaddr = i->GuestAddressFor();
if (!guestaddr) {
return false;
}
auto flags = e.GuestModule()->GetInstructionAddressFlags(guestaddr);
return flags && flags->accessed_mmio;
}
// ============================================================================
// OPCODE_LOAD_OFFSET
@ -1030,6 +1049,28 @@ struct LOAD_OFFSET_I64
EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
LOAD_OFFSET_I32, LOAD_OFFSET_I64);
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues; found, by comparing the values used with exceptions
to these, that the value was reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
// ============================================================================
// OPCODE_STORE_OFFSET
// ============================================================================
@ -1038,6 +1079,7 @@ struct STORE_OFFSET_I8
I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I8Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.src3.is_constant) {
e.mov(e.byte[addr], i.src3.constant());
} else {
@ -1076,6 +1118,30 @@ struct STORE_OFFSET_I32
: Sequence<STORE_OFFSET_I32,
I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (IsPossibleMMIOInstruction(e, i.instr)) {
void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
} else {
e.add(e.GetNativeParam(0).cvt32(), i.src2);
}
if (i.src3.is_constant) {
e.mov(e.GetNativeParam(1).cvt32(), i.src3.constant());
} else {
e.mov(e.GetNativeParam(1).cvt32(), i.src3);
}
e.CallNativeSafe(addrptr);
} else {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src3.is_constant);
@ -1096,6 +1162,7 @@ struct STORE_OFFSET_I32
}
}
}
}
};
struct STORE_OFFSET_I64
@ -1290,6 +1357,25 @@ struct STORE_I16 : Sequence<STORE_I16, I<OPCODE_STORE, VoidOp, I64Op, I16Op>> {
};
struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (IsPossibleMMIOInstruction(e, i.instr)) {
void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.mov(e.GetNativeParam(1).cvt32(), i.src2.constant());
} else {
e.mov(e.GetNativeParam(1).cvt32(), i.src2);
}
e.CallNativeSafe(addrptr);
} else {
auto addr = ComputeMemoryAddress(e, i.src1);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src2.is_constant);
@ -1305,8 +1391,9 @@ struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
e.mov(e.dword[addr], i.src2);
}
}
}
if (IsTracingData()) {
addr = ComputeMemoryAddress(e, i.src1);
auto addr = ComputeMemoryAddress(e, i.src1);
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));

View File

@ -1683,6 +1683,9 @@ struct DIV_I16 : Sequence<DIV_I16, I<OPCODE_DIV, I16Op, I16Op, I16Op>> {
assert_impossible_sequence(DIV_I16);
}
};
/*
TODO: hoist the overflow/zero checks into HIR
*/
struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label skip;
@ -1766,6 +1769,9 @@ struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
e.mov(i.dest, e.eax);
}
};
/*
TODO: hoist the overflow/zero checks into HIR
*/
struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label skip;
@ -1811,7 +1817,7 @@ struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
} else {
// check for signed overflow
if (i.src1.is_constant) {
if (i.src1.constant() != (1 << 31)) {
if (i.src1.constant() != (1ll << 63)) {
// we're good, overflow is impossible
} else {
e.cmp(i.src2, -1); // otherwise, if src2 is -1 then we have
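
A worked example of why the constant had to change: the only signed 64-bit division that overflows is INT64_MIN / -1, and (1 << 31) is a 32-bit constant (INT32_MIN after widening), so the old check could never match a 64-bit minimum and the overflow case was wrongly treated as impossible.

int64_t num = 1ll << 63;  // INT64_MIN, the value the fixed check tests for
int64_t den = -1;
// num / den would be +2^63, unrepresentable in int64_t; x86 idiv raises
// #DE here, so the emitter must special-case exactly this operand pair
bool would_overflow = (num == (1ll << 63)) && (den == -1);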

View File

@ -149,7 +149,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
i->Remove();
}
result = true;
} else if (i->src2.value->IsConstant()) { // chrispy: fix h3 bug from
// const indirect call true
auto function = processor_->LookupFunction(
uint32_t(i->src2.value->constant.i32));
if (!function) {
break;
}
// i->Replace(&OPCODE_CALL_TRUE_info, i->flags);
i->opcode = &OPCODE_CALL_TRUE_info;
i->set_src2(nullptr);
i->src2.symbol = function;
result = true;
}
break;
case OPCODE_BRANCH_TRUE:

View File

@ -796,10 +796,13 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
if (var_definition) {
var_definition = var_definition->GetDestDefSkipAssigns();
if (var_definition != NULL)
{
if (!var_definition) {
return false;
}
def_opcode = var_definition->opcode->num;
}
if (!var_definition) {
return false;
}
// x == 0 -> !x
if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
@ -1231,13 +1234,12 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
result = false;
return value;
}
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
bool SimplificationPass::SimplifyAddWithSHL(hir::Instr* i,
hir::HIRBuilder* builder) {
/*
example: (x << 1) + x == (x * 3)
*/
*/
auto [shlinsn, addend] =
i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
if (!shlinsn) {
@ -1278,11 +1280,81 @@ bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
return true;
}
bool SimplificationPass::SimplifyAddToSelf(hir::Instr* i,
hir::HIRBuilder* builder) {
/*
heres a super easy one
*/
if (i->src1.value != i->src2.value) {
return false;
}
i->opcode = &OPCODE_SHL_info;
i->set_src2(builder->LoadConstantUint8(1));
return true;
}
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
hir::HIRBuilder* builder) {
if (SimplifyAddWithSHL(i, builder)) {
return true;
}
if (SimplifyAddToSelf(i, builder)) {
return true;
}
return false;
}
bool SimplificationPass::SimplifySubArith(hir::Instr* i,
hir::HIRBuilder* builder) {
/*
todo: handle expressions like (x*8) - (x*5) == (x*3)...if these can even
happen of course */
return false;
}
bool SimplificationPass::SimplifySHLArith(hir::Instr* i,
hir::HIRBuilder* builder) {
Value* sh = i->src2.value;
Value* shifted = i->src1.value;
if (!sh->IsConstant()) {
return false;
}
hir::Instr* definition = shifted->GetDefSkipAssigns();
if (!definition) {
return false;
}
if (definition->GetOpcodeNum() != OPCODE_MUL) {
return false;
}
if (definition->flags != ARITHMETIC_UNSIGNED) {
return false;
}
auto [mulconst, mulnonconst] = definition->BinaryValueArrangeAsConstAndVar();
if (!mulconst) {
return false;
}
auto newmul = builder->AllocValue(mulconst->type);
newmul->set_from(mulconst);
newmul->Shl(sh);
i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
i->set_src1(mulnonconst);
i->set_src2(newmul);
return true;
}
bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
hir::HIRBuilder* builder) {
if (!i->dest) {
@ -1301,6 +1373,9 @@ bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
case OPCODE_SUB: {
return SimplifySubArith(i, builder);
}
case OPCODE_SHL: {
return SimplifySHLArith(i, builder);
}
}
return false;
}
@ -1317,6 +1392,97 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
}
return result;
}
/*
todo: add load-store simplification pass
do things like load-store byteswap elimination, for instance,
if a value is loaded, ored with a constant mask, and then stored, we
simply have to byteswap the mask it will be ored with and then we can
eliminate the two byteswaps
the same can be done for and, or, xor, andn with constant masks
this can also be done for comparisons with 0 for equality and not equal
another optimization: with ppc you cannot move a floating point register
directly to a gp one, a gp one directly to a floating point register, or a
vmx one to either. so guest code will store the result to the stack, and then
load it to the register it needs. in HIR we can sidestep this: we will still
need to byteswap and store the result for correctness, but we can eliminate
the load and byteswap by grabbing the original value from the store
skyth's sanic idb, 0x824D7724
lis r11,
lfs f0, flt_8200CBCC@l(r11)
fmuls f0, time, f0
fctidz f0, f0 # vcvttss2si
stfd f0, 0x190+var_138(r1)
lwz r30, 0x190+var_138+4(r1)
cmplwi cr6, r30, 0x63 # 'c'
ble cr6, counter_op
*/
/*
todo: simple loop unrolling
skyth sanic 0x831D9908
mr r30, r4
mr r29, r5
mr r11, r7
li r31, 0
loc_831D9928:
slwi r9, r11, 1
addi r10, r11, 1
addi r8, r1, 0xD0+var_80
clrlwi r11, r10, 16
cmplwi cr6, r11, 0x10
sthx r31, r9, r8
ble cr6, loc_831D9928
v5 = 1;
do
{
v6 = 2 * v5;
v5 = (unsigned __int16)(v5 + 1);
*(_WORD *)&v24[v6] = 0;
}
while ( v5 <= 0x10 );
v7 = 0;
do
{
v8 = __ROL4__(*(unsigned __int8 *)(v7 + a2), 1);
v7 = (unsigned __int16)(v7 + 1);
++*(_WORD *)&v24[v8];
}
while ( v7 < 8 );
v9 = 1;
v25[0] = 0;
do
{
v10 = 2 * v9;
v11 = 16 - v9;
v9 = (unsigned __int16)(v9 + 1);
v25[v10 / 2] = (*(_WORD *)&v24[v10] << v11) + *(_WORD
*)&v24[v10 + 48];
}
while ( v9 <= 0x10 );
skyth sanic:
sub_831BBAE0
sub_831A41A8
*/
} // namespace passes
} // namespace compiler
} // namespace cpu

View File

@ -36,9 +36,11 @@ class SimplificationPass : public ConditionalGroupSubpass {
// handles simple multiplication/addition rules
bool SimplifyBasicArith(hir::HIRBuilder* builder);
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifySHLArith(hir::Instr* i, hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -200,6 +200,20 @@ const Instr* Instr::GetNonFakePrev() const {
}
return curr;
}
uint32_t Instr::GuestAddressFor() const {
Instr* srch = prev;
while (srch) {
if (srch->GetOpcodeNum() == OPCODE_SOURCE_OFFSET) {
return (uint32_t)srch->src1.offset;
}
srch = srch->prev;
}
return 0; // eek.
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -169,6 +169,8 @@ if both are constant, return nullptr, nullptr
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
// OPCODE_SOURCE_OFFSET
const hir::Instr* GetNonFakePrev() const;
uint32_t GuestAddressFor() const;
};
} // namespace hir

View File

@ -30,7 +30,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context) {
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context) {
// There can be only one handler at a time.
assert_null(global_handler_);
if (global_handler_) {
@ -40,7 +41,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
auto handler = std::unique_ptr<MMIOHandler>(new MMIOHandler(
virtual_membase, physical_membase, membase_end, host_to_guest_virtual,
host_to_guest_virtual_context, access_violation_callback,
access_violation_callback_context));
access_violation_callback_context, record_mmio_callback,
record_mmio_context));
// Install the exception handler directed at the MMIOHandler.
ExceptionHandler::Install(ExceptionCallbackThunk, handler.get());
@ -54,14 +56,18 @@ MMIOHandler::MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context)
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback,
void* record_mmio_context)
: virtual_membase_(virtual_membase),
physical_membase_(physical_membase),
memory_end_(membase_end),
host_to_guest_virtual_(host_to_guest_virtual),
host_to_guest_virtual_context_(host_to_guest_virtual_context),
access_violation_callback_(access_violation_callback),
access_violation_callback_context_(access_violation_callback_context) {}
access_violation_callback_context_(access_violation_callback_context),
record_mmio_callback_(record_mmio_callback),
record_mmio_context_(record_mmio_context) {}
MMIOHandler::~MMIOHandler() {
ExceptionHandler::Uninstall(ExceptionCallbackThunk, this);
@ -412,6 +418,8 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
// Quick kill anything outside our mapping.
return false;
}
uint64_t hostip = ex->pc();
void* fault_host_address = reinterpret_cast<void*>(ex->fault_address());
// Access violations are pretty rare, so we can do a linear search here.
@ -561,6 +569,13 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
}
#endif // XE_ARCH_ARM64
if (record_mmio_callback_) {
// record that the guest address corresponding to the faulting instructions'
// host address reads/writes mmio. we can backpropagate this info on future
// compilations
record_mmio_callback_(record_mmio_context_, (void*)ex->pc());
}
// Advance RIP to the next instruction so that we resume properly.
ex->set_resume_pc(rip + decoded_load_store.length);

View File

@ -29,7 +29,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context,
uint32_t addr);
typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context,
uint32_t addr, uint32_t value);
typedef void (*MmioAccessRecordCallback)(void* context,
void* host_insn_address);
struct MMIORange {
uint32_t address;
uint32_t mask;
@ -58,7 +59,8 @@ class MMIOHandler {
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context);
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context);
static MMIOHandler* global_handler() { return global_handler_; }
bool RegisterRange(uint32_t virtual_address, uint32_t mask, uint32_t size,
@ -68,13 +70,20 @@ class MMIOHandler {
bool CheckLoad(uint32_t virtual_address, uint32_t* out_value);
bool CheckStore(uint32_t virtual_address, uint32_t value);
void SetMMIOExceptionRecordingCallback(MmioAccessRecordCallback callback,
void* context) {
record_mmio_context_ = context;
record_mmio_callback_ = callback;
}
protected:
MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context);
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback,
void* record_mmio_context);
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);
@ -90,7 +99,9 @@ class MMIOHandler {
AccessViolationCallback access_violation_callback_;
void* access_violation_callback_context_;
MmioAccessRecordCallback record_mmio_callback_;
void* record_mmio_context_;
static MMIOHandler* global_handler_;
xe::global_critical_region global_critical_region_;

View File

@ -1439,11 +1439,23 @@ int InstrEmit_vsel(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_vsel128(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_vsel_(f, VX128_VD128, VX128_VA128, VX128_VB128, VX128_VD128);
}
// chrispy: this is test code for checking whether a game takes advantage of the
// VSR/VSL undocumented/undefined variable shift behavior
static void AssertShiftElementsOk(PPCHIRBuilder& f, Value* v) {
#if 0
Value* splatted = f.Splat(f.Extract(v, (uint8_t)0, INT8_TYPE), VEC128_TYPE);
Value* checkequal = f.Xor(splatted, v);
f.DebugBreakTrue(f.IsTrue(checkequal));
#endif
}
int InstrEmit_vsl(PPCHIRBuilder& f, const InstrData& i) {
Value* v = f.Shl(f.LoadVR(i.VX.VA),
f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
f.LoadConstantInt8(0b111)));
Value* va = f.LoadVR(i.VX.VA);
Value* vb = f.LoadVR(i.VX.VB);
AssertShiftElementsOk(f, vb);
Value* v =
f.Shl(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
f.StoreVR(i.VX.VD, v);
return 0;
}
@ -1623,9 +1635,13 @@ int InstrEmit_vspltisw128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_vsr(PPCHIRBuilder& f, const InstrData& i) {
Value* v = f.Shr(f.LoadVR(i.VX.VA),
f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
f.LoadConstantInt8(0b111)));
Value* va = f.LoadVR(i.VX.VA);
Value* vb = f.LoadVR(i.VX.VB);
AssertShiftElementsOk(f, vb);
Value* v =
f.Shr(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
f.StoreVR(i.VX.VD, v);
return 0;
}

View File

@ -769,8 +769,14 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked (interrupts enabled), else 0
f.MemoryBarrier();
if (cvars::disable_global_lock || true) {
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
} else {
f.CallExtern(f.builtins()->check_global_lock);
f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
f.StoreGPR(i.X.RT,
f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
}
return 0;
}
@ -782,6 +788,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
@ -793,6 +800,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0
@ -807,6 +815,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
f.MemoryBarrier();
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
@ -818,6 +827,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0

View File

@ -5406,6 +5406,7 @@ PPCOpcodeDisasmInfo ppc_opcode_disasm_table[] = {
INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral, "XOR Immediate Shifted" , (PPCOpcodeField::kRS,PPCOpcodeField::kUIMM), (PPCOpcodeField::kRA), PrintDisasm_xoris),
INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral, "XOR" , (PPCOpcodeField::kRS,PPCOpcodeField::kRB), (PPCOpcodeField::kRA,PPCOpcodeField::kCRcond), PrintDisasm_xorx),
};
#undef INSTRUCTION
static_assert(sizeof(ppc_opcode_disasm_table) / sizeof(PPCOpcodeDisasmInfo) == static_cast<int>(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen");
const PPCOpcodeDisasmInfo& GetOpcodeDisasmInfo(PPCOpcode opcode) {

View File

@ -470,6 +470,7 @@ PPCOpcodeInfo ppc_opcode_table[] = {
INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral),
INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral),
};
#undef INSTRUCTION
static_assert(sizeof(ppc_opcode_table) / sizeof(PPCOpcodeInfo) == static_cast<int>(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen");
const PPCOpcodeInfo& GetOpcodeInfo(PPCOpcode opcode) {

View File

@ -257,11 +257,22 @@ Function* Processor::ResolveFunction(uint32_t address) {
// Grab symbol declaration.
auto function = LookupFunction(address);
if (!function) {
entry->status = Entry::STATUS_FAILED;
return nullptr;
}
auto module_for = function->module();
auto xexmod = dynamic_cast<XexModule*>(module_for);
if (xexmod) {
auto addr_flags = xexmod->GetInstructionAddressFlags(address);
if (addr_flags) {
addr_flags->was_resolved = 1;
}
}
if (!DemandFunction(function)) {
entry->status = Entry::STATUS_FAILED;
return nullptr;

View File

@ -14,13 +14,16 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/cpu_flags.h"
#include "xenia/cpu/export_resolver.h"
#include "xenia/cpu/lzx.h"
#include "xenia/cpu/processor.h"
#include "xenia/emulator.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/xmodule.h"
@ -29,6 +32,14 @@
#include "third_party/crypto/rijndael-alg-fst.h"
#include "third_party/pe/pe_image.h"
DEFINE_bool(disable_instruction_infocache, false,
"Disables caching records of called instructions/mmio accesses.",
"CPU");
DEFINE_bool(disable_function_precompilation, true,
"Disables pre-compiling guest functions that we know we've called "
"on previous runs",
"CPU");
static const uint8_t xe_xex2_retail_key[16] = {
0x20, 0xB1, 0x85, 0xA5, 0x9D, 0x28, 0xFD, 0xC3,
0x40, 0x58, 0x3F, 0xBB, 0x08, 0x96, 0xBF, 0x91};
@ -977,6 +988,7 @@ bool XexModule::LoadContinue() {
// Scan and find the low/high addresses.
// All code sections are continuous, so this should be easy.
// could use a source for the above information
auto heap = memory()->LookupHeap(base_address_);
auto page_size = heap->page_size();
@ -1045,7 +1057,24 @@ bool XexModule::LoadContinue() {
library_offset += library->size;
}
}
sha1::SHA1 final_image_sha_;
final_image_sha_.reset();
unsigned high_code = this->high_address_ - this->low_address_;
final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_),
high_code);
final_image_sha_.finalize(image_sha_bytes_);
char fmtbuf[16];
for (unsigned i = 0; i < 16; ++i) {
sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]);
image_sha_str_ += &fmtbuf[0];
}
info_cache_.Init(this);
// Find __savegprlr_* and __restgprlr_* and the others.
// We can flag these for special handling (inlining/etc).
if (!FindSaveRest()) {
@ -1288,7 +1317,68 @@ std::unique_ptr<Function> XexModule::CreateFunction(uint32_t address) {
return std::unique_ptr<Function>(
processor_->backend()->CreateGuestFunction(this, address));
}
void XexInfoCache::Init(XexModule* xexmod) {
if (cvars::disable_instruction_infocache) {
return;
}
auto emu = xexmod->kernel_state_->emulator();
std::filesystem::path infocache_path = emu->cache_root();
infocache_path.append(L"modules");
infocache_path.append(xexmod->image_sha_str_);
std::filesystem::create_directories(infocache_path);
infocache_path.append("executable_addr_flags.bin");
unsigned num_codebytes = xexmod->high_address_ - xexmod->low_address_;
num_codebytes += 3; // round up to nearest multiple of 4
num_codebytes &= ~3;
bool did_exist = true;
if (!std::filesystem::exists(infocache_path)) {
xe::filesystem::CreateEmptyFile(infocache_path);
did_exist = false;
}
// todo: prepopulate with stuff from pdata, dll exports
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
sizeof(InfoCacheFlagsHeader) +
(sizeof(InfoCacheFlags) *
(num_codebytes /
4)))); // one infocacheflags entry for each PPC instr-sized addr
if (did_exist) {
xexmod->PrecompileKnownFunctions();
}
}
InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
if (guest_addr < low_address_ || guest_addr > high_address_) {
return nullptr;
}
guest_addr -= low_address_;
return info_cache_.LookupFlags(guest_addr);
}
void XexModule::PrecompileKnownFunctions() {
if (cvars::disable_function_precompilation) {
return;
}
uint32_t start = 0;
uint32_t end = (high_address_ - low_address_) / 4;
auto flags = info_cache_.LookupFlags(0);
if (!flags) {
return;
}
for (uint32_t i = 0; i < end; i++) {
if (flags[i].was_resolved) {
processor_->ResolveFunction(low_address_ + (i * 4));
}
}
}
bool XexModule::FindSaveRest() {
// Special stack save/restore functions.
// http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm

View File

@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "xenia/base/mapped_memory.h"
#include "xenia/cpu/module.h"
#include "xenia/kernel/util/xex2_info.h"
@ -30,6 +30,39 @@ constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
class Runtime;
struct InfoCacheFlags {
uint32_t was_resolved : 1; // has this address ever been called/requested
// via resolvefunction?
uint32_t accessed_mmio : 1;
uint32_t reserved : 30;
};
struct XexInfoCache {
struct InfoCacheFlagsHeader {
unsigned char reserved[256]; // put xenia version here
InfoCacheFlags* LookupFlags(unsigned offset) {
return &reinterpret_cast<InfoCacheFlags*>(&this[1])[offset];
}
};
/*
for every 4-byte aligned address, records a 4 byte set of flags.
*/
std::unique_ptr<MappedMemory> executable_addr_flags_;
void Init(class XexModule*);
InfoCacheFlags* LookupFlags(unsigned offset) {
offset /= 4;
if (!executable_addr_flags_) {
return nullptr;
}
uint8_t* data = executable_addr_flags_->data();
if (!data) {
return nullptr;
}
return reinterpret_cast<InfoCacheFlagsHeader*>(data)->LookupFlags(offset);
}
};
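
Layout sketch of the mapped cache file implied by the struct above (the arithmetic is inferred from Init and LookupFlags, not spelled out in the commit; the address and module variable are hypothetical):

// [ InfoCacheFlagsHeader : 256 reserved bytes ]
// [ InfoCacheFlags[ num_codebytes / 4 ] : one 4-byte record per
//   4-byte-aligned guest instruction address ]
// entry index = (guest_addr - low_address_) / 4
InfoCacheFlags* f =
    xex_module->GetInstructionAddressFlags(0x82001000);  // hypothetical addr
if (f && f->accessed_mmio) {
  // the next compilation emits the MMIO-aware store instead of trapping
}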
class XexModule : public xe::cpu::Module {
public:
@ -174,10 +207,14 @@ class XexModule : public xe::cpu::Module {
XEX_MODULE_PATCH_FULL));
}
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
void PrecompileKnownFunctions();
protected:
std::unique_ptr<Function> CreateFunction(uint32_t address) override;
private:
friend struct XexInfoCache;
void ReadSecurityInfo();
int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key);
@ -217,6 +254,10 @@ class XexModule : public xe::cpu::Module {
XexFormat xex_format_ = kFormatUnknown;
SecurityInfoContext security_info_ = {};
uint8_t image_sha_bytes_[16];
std::string image_sha_str_;
XexInfoCache info_cache_;
};
} // namespace cpu

View File

@ -16,6 +16,7 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
@ -28,6 +29,10 @@
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
DEFINE_bool(log_unknown_register_writes, false,
"Log writes to unknown registers from "
"CommandProcessor::WriteRegister. Has significant performance hit.",
"GPU");
namespace xe {
namespace gpu {
@ -329,19 +334,9 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) {
write_ptr_index_ = value;
write_ptr_index_event_->Set();
}
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
uint32_t value) {
RegisterFile& regs = *register_file_;
if (index >= RegisterFile::kRegisterCount) {
XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
return;
}
regs.values[index].u32 = value;
if (!regs.GetRegisterInfo(index)) {
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
}
// Scratch register writeback.
if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
@ -469,6 +464,43 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
}
}
}
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
// chrispy: rearrange check order, place set after checks
if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
check_reg_out_of_bounds:
if (XE_UNLIKELY(index >= RegisterFile::kRegisterCount)) {
XELOGW("CommandProcessor::WriteRegister index out of bounds: {}",
index);
return;
}
}
} else {
goto check_reg_out_of_bounds;
}
register_file_->values[index].u32 = value;
// regs with extra logic on write: XE_GPU_REG_COHER_STATUS_HOST,
// XE_GPU_REG_DC_LUT_RW_INDEX, XE_GPU_REG_DC_LUT_SEQ_COLOR,
// XE_GPU_REG_DC_LUT_PWL_DATA and XE_GPU_REG_DC_LUT_30_COLOR.
// quick pre-test: this predicate is very unlikely to pass (there are far
// more registers than these), so it is marked XE_UNLIKELY and
// HandleSpecialRegisterWrite is noinline. the ORs here are deliberately
// bitwise, not logical: the conditions almost never hold, so evaluating
// all of them without branches is cheaper than the branch per condition
// that short-circuit evaluation would emit.
if (XE_UNLIKELY(
(index - XE_GPU_REG_SCRATCH_REG0 < 8) |
(index == XE_GPU_REG_COHER_STATUS_HOST) |
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
HandleSpecialRegisterWrite(index, value);
}
}
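The range tests above rely on the unsigned-subtraction idiom, where a single compare covers a whole register span; a minimal standalone sketch:
// True for index in [first, first + count): when index < first the
// subtraction wraps to a huge value and the compare fails.
constexpr bool InRegisterSpan(uint32_t index, uint32_t first, uint32_t count) {
  return (index - first) < count;
}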
void CommandProcessor::MakeCoherent() {
SCOPE_profile_cpu_f("gpu");
@ -570,7 +602,7 @@ void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) {
// Return up a level if we encounter a bad packet.
XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet.");
assert_always();
//break;
// break;
}
} while (reader.read_count());

View File

@ -150,7 +150,9 @@ class CommandProcessor {
void WorkerThreadMain();
virtual bool SetupContext() = 0;
virtual void ShutdownContext() = 0;
// rarely needed; most register writes have no special logic here
XE_NOINLINE
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
virtual void WriteRegister(uint32_t index, uint32_t value);
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {

View File

@ -712,7 +712,7 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
if (ff_viewport_update_needed_) {
if (XE_UNLIKELY(ff_viewport_update_needed_)) {
ff_viewport_ = viewport;
deferred_command_list_.RSSetViewport(ff_viewport_);
ff_viewport_update_needed_ = false;

View File

@ -4799,18 +4799,16 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears(
if (!current_transfers.empty()) {
are_current_command_list_render_targets_valid_ = false;
if (dest_rt_key.is_depth) {
command_list.D3DOMSetRenderTargets(
0, nullptr, FALSE, &dest_d3d12_rt.descriptor_draw().GetHandle());
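// hoist the handle into a named local: clang rejects taking the
// address of the temporary returned by GetHandle()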
auto handle = dest_d3d12_rt.descriptor_draw().GetHandle();
command_list.D3DOMSetRenderTargets(0, nullptr, FALSE, &handle);
if (!use_stencil_reference_output_) {
command_processor_.SetStencilReference(UINT8_MAX);
}
} else {
command_list.D3DOMSetRenderTargets(
1,
&(dest_d3d12_rt.descriptor_load_separate().IsValid()
auto handle = dest_d3d12_rt.descriptor_load_separate().IsValid()
? dest_d3d12_rt.descriptor_load_separate().GetHandle()
: dest_d3d12_rt.descriptor_draw().GetHandle()),
FALSE, nullptr);
: dest_d3d12_rt.descriptor_draw().GetHandle();
command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr);
}
uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles();
@ -5425,12 +5423,12 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears(
dest_d3d12_rt.SetResourceState(D3D12_RESOURCE_STATE_RENDER_TARGET),
D3D12_RESOURCE_STATE_RENDER_TARGET);
if (clear_via_drawing) {
command_list.D3DOMSetRenderTargets(
1,
&(dest_d3d12_rt.descriptor_load_separate().IsValid()
auto handle =
(dest_d3d12_rt.descriptor_load_separate().IsValid()
? dest_d3d12_rt.descriptor_load_separate().GetHandle()
: dest_d3d12_rt.descriptor_draw().GetHandle()),
FALSE, nullptr);
: dest_d3d12_rt.descriptor_draw().GetHandle());
command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr);
are_current_command_list_render_targets_valid_ = true;
D3D12_VIEWPORT clear_viewport;
clear_viewport.TopLeftX = float(clear_rect.left);

View File

@ -78,314 +78,24 @@ namespace shaders {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/texture_load_r5g6b5_b5g6r5_scaled_cs.h"
} // namespace shaders
const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = {
// k_1_REVERSE
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1_5_5_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM,
kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_5_6_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_6_5_5
// On the host, green bits in blue, blue bits in green.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)},
// k_8_8_8_8
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10
{DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_A
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_B
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_8
{DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_Cr_Y1_Cb_Y0_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for
// the signed version, separate unsigned and signed load shaders completely
// (as one doesn't need decompression for this format, while another does).
{DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM,
kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexGBGR8ToRGB8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_Y1_Cr_Y0_Cb_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for
// the signed version, separate unsigned and signed load shaders completely
// (as one doesn't need decompression for this format, while another does).
{DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM,
kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexBGRG8ToRGB8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_A
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_4_4_4_4
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT1
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_16_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// R32_FLOAT for depth because shaders would require an additional SRV to
// sample stencil, which we don't provide.
// k_24_8
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_24_8_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16
{DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16
{DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_EXPAND
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_EXPAND
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_EXPAND
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_FLOAT
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_FLOAT
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_FLOAT
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32_FLOAT
{DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndex64bpb,
DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32_FLOAT
{DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_AS_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXN
{DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8_UNORM,
kLoadShaderIndexDXNToRG8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_AS_16_16_16_16
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT1_AS_16_16_16_16
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3_AS_16_16_16_16
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5_AS_16_16_16_16
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_AS_16_16_16_16
{DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_32_32_32_FLOAT
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT3A
// R8_UNORM has the same size as BC2, but doesn't have the 4x4 size
// alignment requirement.
{DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_DXT5A
{DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM,
kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_CTX1
{DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXT3A_AS_1_1_1_1
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_8_8_8_GAMMA_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_FLOAT_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
};
/*
chrispy: we're getting cache misses in GetHostFormatSwizzle; use a denser
array. todo: not all 65536 possible swizzles are used, so this could
probably be packed into one cache line.
*/
using SwizzleArray = std::array<unsigned short, 64>;
static constexpr SwizzleArray build_xenos_swizzle_for_format() {
SwizzleArray result{0};
for (int i = 0; i < 64; ++i) {
result[i] =
static_cast<uint16_t>(D3D12TextureCache::host_formats_[i].swizzle);
}
return result;
}
alignas(64) constexpr SwizzleArray xenos_swizzle_for_format =
build_xenos_swizzle_for_format();
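Two notes on the table above: build_xenos_swizzle_for_format() reads D3D12TextureCache::host_formats_ at compile time, which is why the format table becomes a constexpr in-header definition (see the texture cache header diff below); and the packed array is 64 entries x 2 bytes = 128 bytes, i.e. two 64-byte cache lines, hence the todo about shrinking it to one.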
D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file,
D3D12SharedMemory& shared_memory,
@ -1544,7 +1254,8 @@ bool D3D12TextureCache::IsScaledResolveSupportedForFormat(
}
uint32_t D3D12TextureCache::GetHostFormatSwizzle(TextureKey key) const {
return host_formats_[uint32_t(key.format)].swizzle;
// return host_formats_[uint32_t(key.format)].swizzle;
return xenos_swizzle_for_format[uint32_t(key.format)];
}
uint32_t D3D12TextureCache::GetMaxHostTextureWidthHeight(

View File

@ -160,29 +160,6 @@ class D3D12TextureCache final : public TextureCache {
ID3D12Resource* RequestSwapTexture(
D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out,
xenos::TextureFormat& format_out);
protected:
bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
uint32_t GetHostFormatSwizzle(TextureKey key) const override;
uint32_t GetMaxHostTextureWidthHeight(
xenos::DataDimension dimension) const override;
uint32_t GetMaxHostTextureDepthOrArraySize(
xenos::DataDimension dimension) const override;
std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
// This binds pipelines, allocates descriptors, and copies!
bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
bool load_mips) override;
void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
private:
static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
struct HostFormat {
// Format info for the regular case.
// DXGI format (typeless when different signedness or number representation
@ -223,6 +200,352 @@ class D3D12TextureCache final : public TextureCache {
// Mapping of Xenos swizzle components to DXGI format components.
uint32_t swizzle;
};
static constexpr HostFormat host_formats_[64]{
// k_1_REVERSE
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1_5_5_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM,
kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_5_6_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_6_5_5
// On the host, green bits in blue, blue bits in green.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)},
// k_8_8_8_8
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10
{DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_A
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_B
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_8
{DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_Cr_Y1_Cb_Y0_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable
// for the signed version; separate the unsigned and signed load shaders
// completely (as one doesn't need decompression for this format, while
// the other does).
{DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM,
kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndexGBGR8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_Y1_Cr_Y0_Cb_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable
// for the signed version; separate the unsigned and signed load shaders
// completely (as one doesn't need decompression for this format, while
// the other does).
{DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM,
kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndexBGRG8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_A
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_4_4_4_4
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT1
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_16_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// R32_FLOAT for depth because shaders would require an additional SRV to
// sample stencil, which we don't provide.
// k_24_8
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_24_8_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16
{DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16
{DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_EXPAND
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_EXPAND
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_EXPAND
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_FLOAT
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_FLOAT
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_FLOAT
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32_FLOAT
{DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32_FLOAT
{DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_AS_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXN
{DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexDXNToRG8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_AS_16_16_16_16
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT1_AS_16_16_16_16
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3_AS_16_16_16_16
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5_AS_16_16_16_16
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_AS_16_16_16_16
{DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_32_32_32_FLOAT
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT3A
// R8_UNORM has the same size as BC2, but doesn't have the 4x4 size
// alignment requirement.
{DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_DXT5A
{DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM,
kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_CTX1
{DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXT3A_AS_1_1_1_1
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_8_8_8_GAMMA_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_FLOAT_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
};
protected:
bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
uint32_t GetHostFormatSwizzle(TextureKey key) const override;
uint32_t GetMaxHostTextureWidthHeight(
xenos::DataDimension dimension) const override;
uint32_t GetMaxHostTextureDepthOrArraySize(
xenos::DataDimension dimension) const override;
std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
// This binds pipelines, allocates descriptors, and copies!
bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
bool load_mips) override;
void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
private:
static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
class D3D12Texture final : public Texture {
public:
@ -467,8 +790,6 @@ class D3D12TextureCache final : public TextureCache {
xenos::ClampMode NormalizeClampMode(xenos::ClampMode clamp_mode) const;
static const HostFormat host_formats_[64];
D3D12CommandProcessor& command_processor_;
bool bindless_resources_used_;

View File

@ -198,7 +198,7 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) {
// maximum [width(0x0FFF), height(0x0FFF)]
return 0x050002D0;
default:
if (!register_file_.GetRegisterInfo(r)) {
if (!register_file_.IsValidRegister(r)) {
XELOGE("GPU: Read from unknown register ({:04X})", r);
}
}

View File

@ -8,7 +8,7 @@
*/
#include "xenia/gpu/register_file.h"
#include <array>
#include <cstdint>
#include <cstring>
#include "xenia/base/math.h"
@ -17,6 +17,52 @@ namespace xe {
namespace gpu {
RegisterFile::RegisterFile() { std::memset(values, 0, sizeof(values)); }
constexpr unsigned int GetHighestRegisterNumber() {
uint32_t highest = 0;
#define XE_GPU_REGISTER(index, type, name) \
highest = std::max<uint32_t>(highest, index);
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
return highest;
}
constexpr unsigned int GetLowestRegisterNumber() {
uint32_t lowest = UINT32_MAX;
#define XE_GPU_REGISTER(index, type, name) \
lowest = std::min<uint32_t>(lowest, index);
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
return lowest;
}
static constexpr uint32_t lowest_register = GetLowestRegisterNumber();
static constexpr uint32_t highest_register = GetHighestRegisterNumber();
static constexpr uint32_t total_num_registers =
    (highest_register - lowest_register) + 1;  // the range is inclusive
static constexpr uint32_t num_required_words_for_registers =
    ((total_num_registers + 63) & ~63) / 64;
// can't use std::bitset: it isn't constexpr in C++17
using ValidRegisterBitset =
    std::array<uint64_t, num_required_words_for_registers>;
static constexpr ValidRegisterBitset BuildValidRegisterBitset() {
ValidRegisterBitset result{};
#define XE_GPU_REGISTER(index, type, name) \
result[(index - lowest_register) / 64] |= \
1ULL << ((index - lowest_register) % 64);
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
return result;
}
static constexpr ValidRegisterBitset valid_register_bitset =
BuildValidRegisterBitset();
const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) {
switch (index) {
@ -34,6 +80,18 @@ const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) {
return nullptr;
}
}
/*
todo: this still uses a lot of cpu! our bitset is too large
*/
bool RegisterFile::IsValidRegister(uint32_t index) {
if (XE_UNLIKELY(index < lowest_register) ||
XE_UNLIKELY(index > highest_register)) {
return false;
}
uint32_t register_linear_index = index - lowest_register;
return (valid_register_bitset[register_linear_index / 64] &
(1ULL << (register_linear_index % 64))) != 0;
}
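A worked example of the lookup arithmetic (the register number is hypothetical): for index = lowest_register + 130, register_linear_index is 130, so the function tests bit 130 % 64 = 2 of word 130 / 64 = 2 — one range check and one bit test instead of walking the GetRegisterInfo switch.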
} // namespace gpu
} // namespace xe

View File

@ -32,7 +32,7 @@ class RegisterFile {
RegisterFile();
static const RegisterInfo* GetRegisterInfo(uint32_t index);
static bool IsValidRegister(uint32_t index);
static constexpr size_t kRegisterCount = 0x5003;
union RegisterValue {
uint32_t u32;

View File

@ -41,9 +41,6 @@
#include "xenia/ui/windowed_app_context.h"
#include "xenia/xbox.h"
DEFINE_string(target_trace_file, "", "Specifies the trace file to load.",
"GPU");
namespace xe {
namespace gpu {
@ -66,7 +63,7 @@ TraceViewer::TraceViewer(xe::ui::WindowedAppContext& app_context,
TraceViewer::~TraceViewer() = default;
bool TraceViewer::OnInitialize() {
std::string path = cvars::target_trace_file;
std::string path = cvars::target_trace_file.u8string();
// If no path passed, ask the user.
// On Android, however, there's no synchronous file picker, and the trace file

View File

@ -12,6 +12,7 @@
#include <string_view>
#include "xenia/base/cvar.h"
#include "xenia/emulator.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_player.h"
@ -24,7 +25,7 @@
#include "xenia/ui/window.h"
#include "xenia/ui/window_listener.h"
#include "xenia/ui/windowed_app.h"
DECLARE_path(target_trace_file);
namespace xe {
namespace gpu {

View File

@ -25,7 +25,7 @@
namespace xe {
namespace gpu {
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
TraceWriter::TraceWriter(uint8_t* membase)
: membase_(membase), file_(nullptr) {}
@ -362,6 +362,6 @@ void TraceWriter::WriteGammaRamp(
fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_);
}
}
#endif
} // namespace gpu
} // namespace xe

View File

@ -17,11 +17,22 @@
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_protocol.h"
// Only enable the trace writer in debug builds. The measured hit from the
// trace function calls (even if they just immediately return) is 0.40-0.60%
// of total cpu time, and with inlining they just bloat the caller and
// negatively impact its register allocation.
#ifdef NDEBUG
#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 0
#else
#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 1
#endif
namespace xe {
namespace gpu {
class TraceWriter {
public:
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
explicit TraceWriter(uint8_t* membase);
~TraceWriter();
@ -61,6 +72,49 @@ class TraceWriter {
bool compress_output_ = true;
size_t compression_threshold_ = 1024; // Min. number of bytes to compress.
#else
// this could be annoying to maintain if new methods are added or the
// signatures change
constexpr explicit TraceWriter(uint8_t* membase) {}
static constexpr bool is_open() { return false; }
static constexpr bool Open(const std::filesystem::path& path,
uint32_t title_id) {
return false;
}
static constexpr void Flush() {}
static constexpr void Close() {}
static constexpr void WritePrimaryBufferStart(uint32_t base_ptr,
uint32_t count) {}
static constexpr void WritePrimaryBufferEnd() {}
static constexpr void WriteIndirectBufferStart(uint32_t base_ptr,
uint32_t count) {}
static constexpr void WriteIndirectBufferEnd() {}
static constexpr void WritePacketStart(uint32_t base_ptr, uint32_t count) {}
static constexpr void WritePacketEnd() {}
static constexpr void WriteMemoryRead(uint32_t base_ptr, size_t length,
const void* host_ptr = nullptr) {}
static constexpr void WriteMemoryReadCached(uint32_t base_ptr,
size_t length) {}
static constexpr void WriteMemoryReadCachedNop(uint32_t base_ptr,
size_t length) {}
static constexpr void WriteMemoryWrite(uint32_t base_ptr, size_t length,
const void* host_ptr = nullptr) {}
static constexpr void WriteEdramSnapshot(const void* snapshot) {}
static constexpr void WriteEvent(EventCommand::Type event_type) {}
static constexpr void WriteRegisters(uint32_t first_register,
const uint32_t* register_values,
uint32_t register_count,
bool execute_callbacks_on_play) {}
static constexpr void WriteGammaRamp(
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table,
const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb,
uint32_t gamma_ramp_rw_component) {}
#endif
};
} // namespace gpu
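With instrumentation compiled out, every call site collapses to an empty constexpr inline that the optimizer deletes outright; e.g. (the trace_writer_ member name is an assumption, not shown in this diff):
trace_writer_.WritePacketStart(base_ptr, count);  // compiles to nothing under NDEBUG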

View File

@ -225,6 +225,7 @@ X_STATUS UserModule::LoadContinue() {
ldr_data->xex_header_base = guest_xex_header_;
ldr_data->full_image_size = security_header->image_size;
ldr_data->image_base = this->xex_module()->base_address();
ldr_data->entry_point = entry_point_;
OnLoad();

View File

@ -198,7 +198,8 @@ bool Memory::Initialize() {
// Add handlers for MMIO.
mmio_handler_ = cpu::MMIOHandler::Install(
virtual_membase_, physical_membase_, physical_membase_ + 0x1FFFFFFF,
HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this);
HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this,
nullptr, nullptr);
if (!mmio_handler_) {
XELOGE("Unable to install MMIO handlers");
assert_always();
@ -213,6 +214,11 @@ bool Memory::Initialize() {
return true;
}
void Memory::SetMMIOExceptionRecordingCallback(
cpu::MmioAccessRecordCallback callback, void* context) {
mmio_handler_->SetMMIOExceptionRecordingCallback(callback, context);
}
static const struct {
uint64_t virtual_address_start;
uint64_t virtual_address_end;
@ -1530,7 +1536,8 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address,
bool PhysicalHeap::AllocSystemHeap(uint32_t size, uint32_t alignment,
uint32_t allocation_type, uint32_t protect,
bool top_down, uint32_t* out_address) {
return Alloc(size, alignment, allocation_type, protect, top_down, out_address);
return Alloc(size, alignment, allocation_type, protect, top_down,
out_address);
}
bool PhysicalHeap::Decommit(uint32_t address, uint32_t size) {

View File

@ -498,6 +498,9 @@ class Memory {
bool Save(ByteStream* stream);
bool Restore(ByteStream* stream);
void SetMMIOExceptionRecordingCallback(cpu::MmioAccessRecordCallback callback,
void* context);
private:
int MapViews(uint8_t* mapping_base);
void UnmapViews();
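A sketch of wiring the recording hook declared above (RecordGuestMmioAccess and record_context are hypothetical; the function must match cpu::MmioAccessRecordCallback, which is declared in the CPU headers rather than in this diff):
memory->SetMMIOExceptionRecordingCallback(&RecordGuestMmioAccess, &record_context);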

View File

@ -181,7 +181,6 @@ bool Win32Window::OpenImpl() {
SetWindowPlacement(hwnd_, &initial_dpi_placement);
}
}
// Disable rounded corners starting with Windows 11 (or silently receive and
// ignore E_INVALIDARG on Windows versions before 10.0.22000.0), primarily to
// preserve all pixels of the guest output.
@ -189,7 +188,6 @@ bool Win32Window::OpenImpl() {
DwmSetWindowAttribute(hwnd_, DWMWA_WINDOW_CORNER_PREFERENCE,
&window_corner_preference,
sizeof(window_corner_preference));
// Disable flicks.
ATOM atom = GlobalAddAtomW(L"MicrosoftTabletPenServiceProperty");
const DWORD_PTR dwHwndTabletProperty =
@ -1047,7 +1045,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
} break;
case WM_MOVE: {
OnMonitorUpdate(MonitorUpdateEvent(this, false));
// chrispy: fix clang error from passing a temporary; use a named local
MonitorUpdateEvent update_event{this, false};
OnMonitorUpdate(update_event);
} break;
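A minimal sketch of the pattern clang rejects here, assuming OnMonitorUpdate takes its event by non-const reference (MSVC permits binding a temporary to one as a language extension; clang does not):
OnMonitorUpdate(MonitorUpdateEvent(this, false));  // ill-formed under clang
MonitorUpdateEvent update_event{this, false};
OnMonitorUpdate(update_event);                     // ok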
case WM_SIZE: {
@ -1084,7 +1084,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
} break;
case WM_DISPLAYCHANGE: {
OnMonitorUpdate(MonitorUpdateEvent(this, true));
// chrispy: fix clang error from passing a temporary; use a named local
MonitorUpdateEvent update_event{this, true};
OnMonitorUpdate(update_event);
} break;
case WM_DPICHANGED: {