Merge branch 'master' of https://github.com/xenia-project/xenia into canary_new

This commit is contained in:
Gliniak 2020-11-15 19:01:55 +01:00
commit 7abe6312be
64 changed files with 3001 additions and 1293 deletions

10
.gdbinit Normal file
View File

@ -0,0 +1,10 @@
# Ignore HighResolutionTimer custom event
handle SIG34 nostop noprint
# Ignore PosixTimer custom event
handle SIG35 nostop noprint
# Ignore PosixThread exit event
handle SIG32 nostop noprint
# Ignore PosixThread suspend event
handle SIG36 nostop noprint
# Ignore PosixThread user callback event
handle SIG37 nostop noprint

3
.gitmodules vendored
View File

@ -64,3 +64,6 @@
[submodule "third_party/DirectXShaderCompiler"]
path = third_party/DirectXShaderCompiler
url = https://github.com/microsoft/DirectXShaderCompiler.git
[submodule "third_party/premake-cmake"]
path = third_party/premake-cmake
url = https://github.com/Enhex/premake-cmake.git

View File

@ -28,9 +28,9 @@ addons:
jobs:
include:
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 LINT=true
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Debug
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Release
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 LINT=true
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Debug
- env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Release
git:
# We handle submodules ourselves in xenia-build setup.
@ -40,8 +40,10 @@ before_script:
- export LIBVULKAN_VERSION=1.1.70
- export CXX=$CXX_COMPILER
- export CC=$C_COMPILER
- export AR=$AR_COMPILER
# Dump useful info.
- $CXX --version
- $AR_COMPILER --version
- python3 --version
- clang-format-9 --version
- clang-format-9 -style=file -dump-config

View File

@ -91,12 +91,14 @@ Linux support is extremely experimental and presently incomplete.
The build script uses LLVM/Clang 9. GCC, while it should work in theory, is not easily
interchangeable right now.
[CodeLite](https://codelite.org) is the supported IDE and `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date so check their website.
Normal building via `xb build` uses Make.
* Normal building via `xb build` uses Make.
* [CodeLite](https://codelite.org) is supported. `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date so check their website.
* Experimental CMake generation is available to facilitate use of other IDEs such as [CLion](https://www.jetbrains.com/clion/). If `clion` is available inside `$PATH`, `xb devenv` will start it. Otherwise `build/CMakeLists.txt` needs to be generated by invoking `xb premake --devenv=cmake` manually.
Clang-9 or newer should be available from system repositories on all up to date distributions.
You will also need some development libraries. To get them on an Ubuntu system:
```
```bash
sudo apt-get install libgtk-3-dev libpthread-stubs0-dev liblz4-dev libx11-dev libvulkan-dev libsdl2-dev libiberty-dev libunwind-dev libc++-dev libc++abi-dev
```

View File

@ -1,5 +1,6 @@
include("tools/build")
require("third_party/premake-export-compile-commands/export-compile-commands")
require("third_party/premake-cmake/cmake")
location(build_root)
targetdir(build_bin)
@ -24,6 +25,9 @@ defines({
"UNICODE",
})
cppdialect("C++17")
symbols("On")
-- TODO(DrChat): Find a way to disable this on other architectures.
if ARCH ~= "ppc64" then
filter("architecture:x86_64")
@ -44,30 +48,29 @@ filter("kind:StaticLib")
filter("configurations:Checked")
runtime("Debug")
optimize("Off")
defines({
"DEBUG",
})
runtime("Debug")
filter({"configurations:Checked", "platforms:Windows"})
buildoptions({
"/RTCsu", -- Full Run-Time Checks.
"/RTCsu", -- Full Run-Time Checks.
})
filter({"configurations:Checked", "platforms:Linux"})
defines({
"_GLIBCXX_DEBUG", -- libstdc++ debug mode
})
filter("configurations:Debug")
runtime("Debug")
runtime("Release")
optimize("Off")
defines({
"DEBUG",
"_NO_DEBUG_HEAP=1",
})
runtime("Release")
filter({"configurations:Debug", "platforms:Windows"})
linkoptions({
"/NODEFAULTLIB:MSVCRTD",
})
filter({"configurations:Debug", "platforms:Linux"})
buildoptions({
"-g",
defines({
"_GLIBCXX_DEBUG", -- make dbg symbols work on some distros
})
filter("configurations:Release")
@ -76,26 +79,18 @@ filter("configurations:Release")
"NDEBUG",
"_NO_DEBUG_HEAP=1",
})
optimize("speed")
optimize("Speed")
inlining("Auto")
floatingpoint("Fast")
flags({
"LinkTimeOptimization",
})
runtime("Release")
filter({"configurations:Release", "platforms:Windows"})
linkoptions({
"/NODEFAULTLIB:MSVCRTD",
})
filter("platforms:Linux")
system("linux")
toolset("clang")
cppdialect("C++17")
buildoptions({
-- "-mlzcnt", -- (don't) Assume lzcnt is supported.
"`pkg-config --cflags gtk+-x11-3.0`",
"-fno-lto", -- Premake doesn't support LTO on clang
({os.outputof("pkg-config --cflags gtk+-x11-3.0")})[1],
})
links({
"stdc++fs",
@ -105,14 +100,13 @@ filter("platforms:Linux")
"rt",
})
linkoptions({
"`pkg-config --libs gtk+-3.0`",
({os.outputof("pkg-config --libs gtk+-3.0")})[1],
})
filter({"platforms:Linux", "kind:*App"})
linkgroups("On")
filter({"platforms:Linux", "language:C++", "toolset:gcc"})
cppdialect("C++17")
links({
})
disablewarnings({
@ -147,13 +141,11 @@ filter({"platforms:Linux", "language:C++", "toolset:clang", "files:*.cc or *.cpp
filter("platforms:Windows")
system("windows")
toolset("msc")
cppdialect("C++17")
buildoptions({
"/MP", -- Multiprocessor compilation.
"/utf-8", -- 'build correctly on systems with non-Latin codepages'.
-- Mark warnings as severe
"/w14839", -- non-standard use of class 'type' as an argument to a variadic function
"/w14840", -- non-portable use of class 'type' as an argument to a variadic function
"/w14839", -- non-standard use of class 'type' as an argument to a variadic function
"/w14840", -- non-portable use of class 'type' as an argument to a variadic function
-- Disable warnings
"/wd4100", -- Unreferenced parameters are ok.
"/wd4201", -- Nameless struct/unions are ok.
@ -163,10 +155,10 @@ filter("platforms:Windows")
"/wd4189", -- 'local variable is initialized but not referenced'.
})
flags({
"NoMinimalRebuild", -- Required for /MP above.
"MultiProcessorCompile", -- Multiprocessor compilation.
"NoMinimalRebuild", -- Required for /MP above.
})
symbols("On")
defines({
"_CRT_NONSTDC_NO_DEPRECATE",
"_CRT_SECURE_NO_WARNINGS",

View File

@ -71,8 +71,8 @@ std::unique_ptr<EmulatorWindow> EmulatorWindow::Create(Emulator* emulator) {
std::unique_ptr<EmulatorWindow> emulator_window(new EmulatorWindow(emulator));
emulator_window->loop()->PostSynchronous([&emulator_window]() {
xe::threading::set_name("Win32 Loop");
xe::Profiler::ThreadEnter("Win32 Loop");
xe::threading::set_name("Windowing Loop");
xe::Profiler::ThreadEnter("Windowing Loop");
if (!emulator_window->Initialize()) {
xe::FatalError("Failed to initialize main window");

View File

@ -8,19 +8,6 @@ project("xenia-app")
targetname("xenia_canary")
language("C++")
links({
"aes_128",
"capstone",
"fmt",
"dxbc",
"discord-rpc",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xenia-app-discord",
"xenia-apu",
"xenia-apu-nop",
@ -43,6 +30,21 @@ project("xenia-app")
"xenia-ui-vulkan",
"xenia-patcher",
"xenia-vfs",
})
links({
"aes_128",
"capstone",
"fmt",
"dxbc",
"discord-rpc",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xxhash",
})
defines({

View File

@ -302,6 +302,7 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) {
// No available data.
if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) {
data->output_buffer_valid = 0;
return;
}

View File

@ -144,7 +144,7 @@ X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) {
WorkerThreadMain();
return 0;
}));
worker_thread_->set_name("XMA Decoder Worker");
worker_thread_->set_name("XMA Decoder");
worker_thread_->set_can_debugger_suspend(true);
worker_thread_->Create();

View File

@ -9,21 +9,51 @@
#include "xenia/base/debugging.h"
#include <signal.h>
#include <csignal>
#include <cstdarg>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include "xenia/base/string_buffer.h"
namespace xe {
namespace debugging {
bool IsDebuggerAttached() { return false; }
void Break() { raise(SIGTRAP); }
// Detects an attached debugger by scanning /proc/self/status for the
// TracerPid field, which the kernel fills with the id of the tracing
// process (0 when nothing is ptrace-attached).
bool IsDebuggerAttached() {
  std::ifstream status_file("/proc/self/status");
  if (!status_file.is_open()) {
    return false;
  }
  for (std::string status_line; std::getline(status_file, status_line);) {
    std::istringstream field_stream(status_line);
    std::string field_name;
    field_stream >> field_name;
    if (field_name != "TracerPid:") {
      continue;
    }
    uint32_t tracer_pid;
    field_stream >> tracer_pid;
    return tracer_pid != 0;
  }
  // No TracerPid line found (or the file was empty).
  return false;
}
// Breaks into an attached debugger by raising SIGTRAP. On the first call a
// handler is installed (exactly once, via std::call_once) that swallows the
// trap and restores the default disposition, so a single Break() without a
// debugger attached does not immediately terminate the process.
void Break() {
static std::once_flag flag;
std::call_once(flag, []() {
// Install handler for sigtrap only once
std::signal(SIGTRAP, [](int) {
// Forward signal to default handler after being caught
std::signal(SIGTRAP, SIG_DFL);
});
});
std::raise(SIGTRAP);
}
namespace internal {
void DebugPrint(const char* s) {
// TODO: proper implementation.
}
// Writes the message to the standard log stream followed by a newline and
// an explicit flush (equivalent to streaming std::endl), so output is
// visible immediately.
void DebugPrint(const char* s) {
  std::clog << s << '\n';
  std::clog.flush();
}
} // namespace internal
} // namespace debugging

View File

@ -93,7 +93,7 @@ class Logger {
write_thread_ =
xe::threading::Thread::Create({}, [this]() { WriteThread(); });
write_thread_->set_name("xe::FileLogSink Writer");
write_thread_->set_name("Logging Writer");
}
~Logger() {

View File

@ -76,14 +76,12 @@
#endif // XE_PLATFORM_MAC
#if XE_COMPILER_MSVC
#define XEPACKEDSTRUCT(name, value) \
__pragma(pack(push, 1)) struct name##_s value __pragma(pack(pop)); \
typedef struct name##_s name;
#define XEPACKEDSTRUCT(name, value) \
__pragma(pack(push, 1)) struct name value __pragma(pack(pop));
#define XEPACKEDSTRUCTANONYMOUS(value) \
__pragma(pack(push, 1)) struct value __pragma(pack(pop));
#define XEPACKEDUNION(name, value) \
__pragma(pack(push, 1)) union name##_s value __pragma(pack(pop)); \
typedef union name##_s name;
#define XEPACKEDUNION(name, value) \
__pragma(pack(push, 1)) union name value __pragma(pack(pop));
#else
#define XEPACKEDSTRUCT(name, value) struct __attribute__((packed)) name value;
#define XEPACKEDSTRUCTANONYMOUS(value) struct __attribute__((packed)) value;

View File

@ -10,11 +10,15 @@
#ifndef XENIA_BASE_STRING_UTIL_H_
#define XENIA_BASE_STRING_UTIL_H_
#include <algorithm>
#include <charconv>
#include <cstddef>
#include <cstring>
#include <string>
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/string.h"
#include "xenia/base/vec128.h"
@ -30,6 +34,40 @@
namespace xe {
namespace string_util {
// Copies as much of |source| as fits into |dest|, whose total capacity
// (including the NUL terminator) is |dest_buffer_count| characters. The
// destination is always NUL-terminated when the buffer has any room.
// Returns the number of characters copied, excluding the terminator.
inline size_t copy_truncating(char* dest, const std::string_view source,
                              size_t dest_buffer_count) {
  if (dest_buffer_count == 0) {
    return 0;
  }
  const size_t capacity = dest_buffer_count - 1;
  const size_t count = source.size() < capacity ? source.size() : capacity;
  std::memcpy(dest, source.data(), count);
  dest[count] = '\0';
  return count;
}
// UTF-16 overload: copies up to |dest_buffer_count| - 1 code units from
// |source| into |dest| and always NUL-terminates when the buffer has any
// room. Returns the number of code units copied, excluding the terminator.
inline size_t copy_truncating(char16_t* dest, const std::u16string_view source,
                              size_t dest_buffer_count) {
  if (dest_buffer_count == 0) {
    return 0;
  }
  const size_t capacity = dest_buffer_count - 1;
  const size_t count = source.size() < capacity ? source.size() : capacity;
  // Byte count, not element count, for memcpy.
  std::memcpy(dest, source.data(), count * sizeof(char16_t));
  dest[count] = u'\0';
  return count;
}
// Like the UTF-16 copy_truncating, but byte-swaps each code unit while
// copying (via xe::copy_and_swap) — used for big-endian guest strings.
// Returns the number of code units copied, excluding the NUL terminator.
inline size_t copy_and_swap_truncating(char16_t* dest,
                                       const std::u16string_view source,
                                       size_t dest_buffer_count) {
  if (dest_buffer_count == 0) {
    return 0;
  }
  const size_t capacity = dest_buffer_count - 1;
  const size_t count = source.size() < capacity ? source.size() : capacity;
  xe::copy_and_swap(dest, source.data(), count);
  // Terminator is written unswapped; u'\0' is byte-order invariant anyway.
  dest[count] = u'\0';
  return count;
}
// Formats |value| as exactly eight uppercase hexadecimal digits, zero
// padded (e.g. 0x1A2B -> "00001A2B").
inline std::string to_hex_string(uint32_t value) {
return fmt::format("{:08X}", value);
}

View File

@ -0,0 +1,967 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <array>
#include "xenia/base/threading.h"
#include "third_party/catch/include/catch.hpp"
namespace xe {
namespace base {
namespace test {
using namespace threading;
using namespace std::chrono_literals;
// Exercises threading::Fence: signaling with no waiter, waiting after one
// or two signals, reuse across successive signal/wait cycles, and a single
// signal releasing several blocked threads at once.
// Fixes: removed the unused HighResolutionTimer local, corrected the stray
// "// namespace test" comment on the closing brace, and cast size() to int
// to avoid signed/unsigned comparison warnings.
TEST_CASE("Fence") {
  std::unique_ptr<threading::Fence> pFence;

  // Signal without wait.
  pFence = std::make_unique<threading::Fence>();
  pFence->Signal();

  // Signal once and wait.
  pFence = std::make_unique<threading::Fence>();
  pFence->Signal();
  pFence->Wait();

  // Signal twice and wait.
  pFence = std::make_unique<threading::Fence>();
  pFence->Signal();
  pFence->Signal();
  pFence->Wait();

  // Signal and wait two times.
  pFence = std::make_unique<threading::Fence>();
  pFence->Signal();
  pFence->Wait();
  pFence->Signal();
  pFence->Wait();

  // Test to synchronize multiple threads.
  std::atomic<int> started(0);
  std::atomic<int> finished(0);
  pFence = std::make_unique<threading::Fence>();
  auto func = [&pFence, &started, &finished] {
    started.fetch_add(1);
    pFence->Wait();
    finished.fetch_add(1);
  };
  auto threads = std::array<std::thread, 5>({
      std::thread(func),
      std::thread(func),
      std::thread(func),
      std::thread(func),
      std::thread(func),
  });
  Sleep(100ms);
  REQUIRE(started.load() == static_cast<int>(threads.size()));
  REQUIRE(finished.load() == 0);
  pFence->Signal();
  for (auto& t : threads) t.join();
  REQUIRE(finished.load() == static_cast<int>(threads.size()));
}
// logical_processor_count() must agree with the standard library's view of
// hardware concurrency and stay stable when queried repeatedly.
TEST_CASE("Get number of logical processors") {
  const auto expected = std::thread::hardware_concurrency();
  for (int query = 0; query < 3; ++query) {
    REQUIRE(logical_processor_count() == expected);
  }
}
// Smoke test: must not crash or throw.
TEST_CASE("Enable process to set thread affinity") {
EnableAffinityConfiguration();
}
// Smoke test: yielding the current thread is always legal.
TEST_CASE("Yield Current Thread", "MaybeYield") {
// Run to see if there are any errors
MaybeYield();
}
// Smoke test: issuing a full memory barrier is always legal.
TEST_CASE("Sync with Memory Barrier", "SyncMemory") {
// Run to see if there are any errors
SyncMemory();
}
// Sleep may overshoot the requested duration but must never return early.
TEST_CASE("Sleep Current Thread", "Sleep") {
auto wait_time = 50ms;
auto start = std::chrono::steady_clock::now();
Sleep(wait_time);
auto duration = std::chrono::steady_clock::now() - start;
REQUIRE(duration >= wait_time);
}
// AlertableSleep with no pending APC behaves like Sleep and reports
// kSuccess (it ran to completion without being alerted).
TEST_CASE("Sleep Current Thread in Alertable State", "Sleep") {
auto wait_time = 50ms;
auto start = std::chrono::steady_clock::now();
auto result = threading::AlertableSleep(wait_time);
auto duration = std::chrono::steady_clock::now() - start;
REQUIRE(duration >= wait_time);
REQUIRE(result == threading::SleepResult::kSuccess);
// TODO(bwrsandman): Test a Thread to return kAlerted.
// Need callback to call extended I/O function (ReadFileEx or WriteFileEx)
}
// Covers the TLS slot API: allocate/free, double-free rejection, storing a
// pointer-sized value, and per-thread isolation of slot contents.
TEST_CASE("TlsHandle") {
// Test Allocate
auto handle = threading::AllocateTlsHandle();
// Test Free
REQUIRE(threading::FreeTlsHandle(handle));
// Freeing twice (or freeing the invalid handle) must fail.
REQUIRE(!threading::FreeTlsHandle(handle));
REQUIRE(!threading::FreeTlsHandle(threading::kInvalidTlsHandle));
// Test setting values
handle = threading::AllocateTlsHandle();
// A fresh slot reads back as zero.
REQUIRE(threading::GetTlsValue(handle) == 0);
uint32_t value = 0xDEADBEEF;
// Store the address of a stack local; read it back through the slot.
threading::SetTlsValue(handle, reinterpret_cast<uintptr_t>(&value));
auto p_received_value = threading::GetTlsValue(handle);
REQUIRE(threading::GetTlsValue(handle) != 0);
auto received_value = *reinterpret_cast<uint32_t*>(p_received_value);
REQUIRE(received_value == value);
uintptr_t non_thread_local_value = 0;
// TLS is per-thread: a different thread must observe 0 in the same slot.
auto thread = Thread::Create({}, [&non_thread_local_value, &handle] {
non_thread_local_value = threading::GetTlsValue(handle);
});
auto result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
REQUIRE(non_thread_local_value == 0);
// Cleanup
REQUIRE(threading::FreeTlsHandle(handle));
}
// Verifies that repeating high-resolution timers fire approximately
// wait_time / interval times, alone and with two timers running
// concurrently.
// Fix: the counters were default-constructed std::atomic<uint64_t>, whose
// value is indeterminate before C++20 — they must be zero-initialized or
// the REQUIRE bounds below are meaningless.
TEST_CASE("HighResolutionTimer") {
  // The wait time is 500ms with an interval of 50ms.
  // Smaller values are not as precise and fail the test.
  const auto wait_time = 500ms;

  // Time the actual sleep duration.
  {
    const auto interval = 50ms;
    std::atomic<uint64_t> counter(0);
    auto start = std::chrono::steady_clock::now();
    auto cb = [&counter] { ++counter; };
    auto pTimer = HighResolutionTimer::CreateRepeating(interval, cb);
    Sleep(wait_time);
    pTimer.reset();
    auto duration = std::chrono::steady_clock::now() - start;

    // Should have run as many times as wait_time / timer_interval plus or
    // minus 1 due to imprecision of Sleep.
    REQUIRE(duration.count() >= wait_time.count());
    auto ratio = static_cast<uint64_t>(duration / interval);
    REQUIRE(counter >= ratio - 1);
    REQUIRE(counter <= ratio + 1);
  }

  // Test concurrent timers.
  {
    const auto interval1 = 100ms;
    const auto interval2 = 200ms;
    std::atomic<uint64_t> counter1(0);
    std::atomic<uint64_t> counter2(0);
    auto start = std::chrono::steady_clock::now();
    auto cb1 = [&counter1] { ++counter1; };
    auto cb2 = [&counter2] { ++counter2; };
    auto pTimer1 = HighResolutionTimer::CreateRepeating(interval1, cb1);
    auto pTimer2 = HighResolutionTimer::CreateRepeating(interval2, cb2);
    Sleep(wait_time);
    pTimer1.reset();
    pTimer2.reset();
    auto duration = std::chrono::steady_clock::now() - start;

    // Should have run as many times as wait_time / timer_interval plus or
    // minus 1 due to imprecision of Sleep.
    REQUIRE(duration.count() >= wait_time.count());
    auto ratio1 = static_cast<uint64_t>(duration / interval1);
    auto ratio2 = static_cast<uint64_t>(duration / interval2);
    REQUIRE(counter1 >= ratio1 - 1);
    REQUIRE(counter1 <= ratio1 + 1);
    REQUIRE(counter2 >= ratio2 - 1);
    REQUIRE(counter2 <= ratio2 + 1);
  }

  // TODO(bwrsandman): Check on which thread callbacks are executed when
  // spawned from differing threads.
}
TEST_CASE("Wait on Multiple Handles", "Wait") {
auto mutant = Mutant::Create(true);
auto semaphore = Semaphore::Create(10, 10);
auto event_ = Event::CreateManualResetEvent(false);
auto thread = Thread::Create({}, [&mutant, &semaphore, &event_] {
event_->Set();
Wait(mutant.get(), false, 25ms);
semaphore->Release(1, nullptr);
Wait(mutant.get(), false, 25ms);
mutant->Release();
});
std::vector<WaitHandle*> handles = {
mutant.get(),
semaphore.get(),
event_.get(),
thread.get(),
};
auto any_result = WaitAny(handles, false, 100ms);
REQUIRE(any_result.first == WaitResult::kSuccess);
REQUIRE(any_result.second == 0);
auto all_result = WaitAll(handles, false, 100ms);
REQUIRE(all_result == WaitResult::kSuccess);
}
TEST_CASE("Signal and Wait") {
WaitResult result;
auto mutant = Mutant::Create(true);
auto event_ = Event::CreateAutoResetEvent(false);
auto thread = Thread::Create({}, [&mutant, &event_] {
Wait(mutant.get(), false);
event_->Set();
});
result = Wait(event_.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
result = SignalAndWait(mutant.get(), event_.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Auto-reset event semantics: waiting on an unset event times out, a set
// event satisfies exactly one wait, and that wait consumes the signal.
TEST_CASE("Wait on Event", "Event") {
auto evt = Event::CreateAutoResetEvent(false);
WaitResult result;
// Call wait on unset Event
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
// Call wait on set Event
evt->Set();
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
// Call wait on now consumed Event
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
}
TEST_CASE("Reset Event", "Event") {
auto evt = Event::CreateAutoResetEvent(false);
WaitResult result;
// Call wait on reset Event
evt->Set();
evt->Reset();
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
// Test resetting the unset event
evt->Reset();
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
// Test setting the reset event
evt->Set();
result = Wait(evt.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
TEST_CASE("Wait on Multiple Events", "Event") {
auto events = std::array<std::unique_ptr<Event>, 4>{
Event::CreateAutoResetEvent(false),
Event::CreateAutoResetEvent(false),
Event::CreateAutoResetEvent(false),
Event::CreateManualResetEvent(false),
};
std::array<char, 8> order = {0};
std::atomic_uint index(0);
auto sign_in = [&order, &index](uint32_t id) {
auto i = index.fetch_add(1, std::memory_order::memory_order_relaxed);
order[i] = static_cast<char>('0' + id);
};
auto threads = std::array<std::thread, 4>{
std::thread([&events, &sign_in] {
auto res = WaitAll({events[1].get(), events[3].get()}, false, 100ms);
if (res == WaitResult::kSuccess) {
sign_in(1);
}
}),
std::thread([&events, &sign_in] {
auto res = WaitAny({events[0].get(), events[2].get()}, false, 100ms);
if (res.first == WaitResult::kSuccess) {
sign_in(2);
}
}),
std::thread([&events, &sign_in] {
auto res = WaitAll({events[0].get(), events[2].get(), events[3].get()},
false, 100ms);
if (res == WaitResult::kSuccess) {
sign_in(3);
}
}),
std::thread([&events, &sign_in] {
auto res = WaitAny({events[1].get(), events[3].get()}, false, 100ms);
if (res.first == WaitResult::kSuccess) {
sign_in(4);
}
}),
};
Sleep(10ms);
events[3]->Set(); // Signals thread id=4 and stays on for 1 and 3
Sleep(10ms);
events[1]->Set(); // Signals thread id=1
Sleep(10ms);
events[0]->Set(); // Signals thread id=2
Sleep(10ms);
events[2]->Set(); // Partial signals thread id=3
events[0]->Set(); // Signals thread id=3
for (auto& t : threads) {
t.join();
}
INFO(order.data());
REQUIRE(order[0] == '4');
// TODO(bwrsandman): Order is not always maintained on linux
// REQUIRE(order[1] == '1');
// REQUIRE(order[2] == '2');
// REQUIRE(order[3] == '3');
}
TEST_CASE("Wait on Semaphore", "Semaphore") {
WaitResult result;
std::unique_ptr<Semaphore> sem;
int previous_count = 0;
// Wait on semaphore with no room
sem = Semaphore::Create(0, 5);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kTimeout);
// Add room in semaphore
REQUIRE(sem->Release(2, &previous_count));
REQUIRE(previous_count == 0);
REQUIRE(sem->Release(1, &previous_count));
REQUIRE(previous_count == 2);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
REQUIRE(sem->Release(1, &previous_count));
REQUIRE(previous_count == 2);
// Set semaphore over maximum_count
sem = Semaphore::Create(5, 5);
previous_count = -1;
REQUIRE_FALSE(sem->Release(1, &previous_count));
REQUIRE(previous_count == -1);
REQUIRE_FALSE(sem->Release(10, &previous_count));
REQUIRE(previous_count == -1);
sem = Semaphore::Create(0, 5);
REQUIRE_FALSE(sem->Release(10, &previous_count));
REQUIRE(previous_count == -1);
REQUIRE_FALSE(sem->Release(10, &previous_count));
REQUIRE(previous_count == -1);
// Test invalid Release parameters
REQUIRE_FALSE(sem->Release(0, &previous_count));
REQUIRE(previous_count == -1);
REQUIRE_FALSE(sem->Release(-1, &previous_count));
REQUIRE(previous_count == -1);
// Wait on fully available semaphore
sem = Semaphore::Create(5, 5);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kTimeout);
// Semaphore between threads
sem = Semaphore::Create(5, 5);
Sleep(10ms);
// Occupy the semaphore with 5 threads
auto func = [&sem] {
auto res = Wait(sem.get(), false, 100ms);
Sleep(500ms);
if (res == WaitResult::kSuccess) {
sem->Release(1, nullptr);
}
};
auto threads = std::array<std::thread, 5>{
std::thread(func), std::thread(func), std::thread(func),
std::thread(func), std::thread(func),
};
// Give threads time to acquire semaphore
Sleep(10ms);
// Attempt to acquire full semaphore with current (6th) thread
result = Wait(sem.get(), false, 20ms);
REQUIRE(result == WaitResult::kTimeout);
// Give threads time to release semaphore
for (auto& t : threads) {
t.join();
}
result = Wait(sem.get(), false, 10ms);
REQUIRE(result == WaitResult::kSuccess);
sem->Release(1, &previous_count);
REQUIRE(previous_count == 4);
// Test invalid construction parameters
// These are invalid according to documentation
// TODO(bwrsandman): Many of these invalid invocations succeed
sem = Semaphore::Create(-1, 5);
// REQUIRE(sem.get() == nullptr);
sem = Semaphore::Create(10, 5);
// REQUIRE(sem.get() == nullptr);
sem = Semaphore::Create(0, 0);
// REQUIRE(sem.get() == nullptr);
sem = Semaphore::Create(0, -1);
// REQUIRE(sem.get() == nullptr);
}
TEST_CASE("Wait on Multiple Semaphores", "Semaphore") {
WaitResult all_result;
std::pair<WaitResult, size_t> any_result;
int previous_count;
std::unique_ptr<Semaphore> sem0, sem1;
// Test Wait all which should fail
sem0 = Semaphore::Create(0, 5);
sem1 = Semaphore::Create(5, 5);
all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms);
REQUIRE(all_result == WaitResult::kTimeout);
previous_count = -1;
REQUIRE(sem0->Release(1, &previous_count));
REQUIRE(previous_count == 0);
previous_count = -1;
REQUIRE_FALSE(sem1->Release(1, &previous_count));
REQUIRE(previous_count == -1);
// Test Wait all again which should succeed
sem0 = Semaphore::Create(1, 5);
sem1 = Semaphore::Create(5, 5);
all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms);
REQUIRE(all_result == WaitResult::kSuccess);
previous_count = -1;
REQUIRE(sem0->Release(1, &previous_count));
REQUIRE(previous_count == 0);
previous_count = -1;
REQUIRE(sem1->Release(1, &previous_count));
REQUIRE(previous_count == 4);
// Test Wait Any which should fail
sem0 = Semaphore::Create(0, 5);
sem1 = Semaphore::Create(0, 5);
any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms);
REQUIRE(any_result.first == WaitResult::kTimeout);
REQUIRE(any_result.second == 0);
previous_count = -1;
REQUIRE(sem0->Release(1, &previous_count));
REQUIRE(previous_count == 0);
previous_count = -1;
REQUIRE(sem1->Release(1, &previous_count));
REQUIRE(previous_count == 0);
// Test Wait Any which should succeed
sem0 = Semaphore::Create(0, 5);
sem1 = Semaphore::Create(5, 5);
any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms);
REQUIRE(any_result.first == WaitResult::kSuccess);
REQUIRE(any_result.second == 1);
previous_count = -1;
REQUIRE(sem0->Release(1, &previous_count));
REQUIRE(previous_count == 0);
previous_count = -1;
REQUIRE(sem1->Release(1, &previous_count));
REQUIRE(previous_count == 4);
}
TEST_CASE("Wait on Mutant", "Mutant") {
WaitResult result;
std::unique_ptr<Mutant> mut;
// Release on initially owned mutant
mut = Mutant::Create(true);
REQUIRE(mut->Release());
REQUIRE_FALSE(mut->Release());
// Release on initially not-owned mutant
mut = Mutant::Create(false);
REQUIRE_FALSE(mut->Release());
// Wait on initially owned mutant
mut = Mutant::Create(true);
result = Wait(mut.get(), false, 1ms);
REQUIRE(result == WaitResult::kSuccess);
REQUIRE(mut->Release());
REQUIRE(mut->Release());
REQUIRE_FALSE(mut->Release());
// Wait on initially not owned mutant
mut = Mutant::Create(false);
result = Wait(mut.get(), false, 1ms);
REQUIRE(result == WaitResult::kSuccess);
REQUIRE(mut->Release());
REQUIRE_FALSE(mut->Release());
// Multiple waits (or locks)
mut = Mutant::Create(false);
for (int i = 0; i < 10; ++i) {
result = Wait(mut.get(), false, 1ms);
REQUIRE(result == WaitResult::kSuccess);
}
for (int i = 0; i < 10; ++i) {
REQUIRE(mut->Release());
}
REQUIRE_FALSE(mut->Release());
// Test mutants on other threads
auto thread1 = std::thread([&mut] {
Sleep(5ms);
mut = Mutant::Create(true);
Sleep(100ms);
mut->Release();
});
Sleep(10ms);
REQUIRE_FALSE(mut->Release());
Sleep(10ms);
result = Wait(mut.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
thread1.join();
result = Wait(mut.get(), false, 1ms);
REQUIRE(result == WaitResult::kSuccess);
REQUIRE(mut->Release());
}
TEST_CASE("Wait on Multiple Mutants", "Mutant") {
WaitResult all_result;
std::pair<WaitResult, size_t> any_result;
std::unique_ptr<Mutant> mut0, mut1;
// Test which should fail for WaitAll and WaitAny
auto thread0 = std::thread([&mut0, &mut1] {
mut0 = Mutant::Create(true);
mut1 = Mutant::Create(true);
Sleep(50ms);
mut0->Release();
mut1->Release();
});
Sleep(10ms);
all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(all_result == WaitResult::kTimeout);
REQUIRE_FALSE(mut0->Release());
REQUIRE_FALSE(mut1->Release());
any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(any_result.first == WaitResult::kTimeout);
REQUIRE(any_result.second == 0);
REQUIRE_FALSE(mut0->Release());
REQUIRE_FALSE(mut1->Release());
thread0.join();
// Test which should fail for WaitAll but not WaitAny
auto thread1 = std::thread([&mut0, &mut1] {
mut0 = Mutant::Create(true);
mut1 = Mutant::Create(false);
Sleep(50ms);
mut0->Release();
});
Sleep(10ms);
all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(all_result == WaitResult::kTimeout);
REQUIRE_FALSE(mut0->Release());
REQUIRE_FALSE(mut1->Release());
any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(any_result.first == WaitResult::kSuccess);
REQUIRE(any_result.second == 1);
REQUIRE_FALSE(mut0->Release());
REQUIRE(mut1->Release());
thread1.join();
// Test which should pass for WaitAll and WaitAny
auto thread2 = std::thread([&mut0, &mut1] {
mut0 = Mutant::Create(false);
mut1 = Mutant::Create(false);
Sleep(50ms);
});
Sleep(10ms);
all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(all_result == WaitResult::kSuccess);
REQUIRE(mut0->Release());
REQUIRE(mut1->Release());
any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms);
REQUIRE(any_result.first == WaitResult::kSuccess);
REQUIRE(any_result.second == 0);
REQUIRE(mut0->Release());
REQUIRE_FALSE(mut1->Release());
thread2.join();
}
TEST_CASE("Wait on Timer", "Timer") {
WaitResult result;
std::unique_ptr<Timer> timer;
// Test Manual Reset
timer = Timer::CreateManualResetTimer();
result = Wait(timer.get(), false, 1ms);
REQUIRE(result == WaitResult::kTimeout);
REQUIRE(timer->SetOnce(1ms)); // Signals it
result = Wait(timer.get(), false, 2ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(timer.get(), false, 1ms);
REQUIRE(result == WaitResult::kSuccess); // Did not reset
// Test Synchronization
timer = Timer::CreateSynchronizationTimer();
result = Wait(timer.get(), false, 1ms);
REQUIRE(result == WaitResult::kTimeout);
REQUIRE(timer->SetOnce(1ms)); // Signals it
result = Wait(timer.get(), false, 2ms);
REQUIRE(result == WaitResult::kSuccess);
result = Wait(timer.get(), false, 1ms);
REQUIRE(result == WaitResult::kTimeout); // Did reset
// TODO(bwrsandman): This test unexpectedly fails under windows
// Test long due time
// timer = Timer::CreateSynchronizationTimer();
// REQUIRE(timer->SetOnce(10s));
// result = Wait(timer.get(), false, 10ms); // Still signals under windows
// REQUIRE(result == WaitResult::kTimeout);
// Test Repeating
REQUIRE(timer->SetRepeating(1ms, 10ms));
for (int i = 0; i < 10; ++i) {
result = Wait(timer.get(), false, 20ms);
INFO(i);
REQUIRE(result == WaitResult::kSuccess);
}
MaybeYield();
Sleep(10ms); // Skip a few events
for (int i = 0; i < 10; ++i) {
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Cancel it
timer->Cancel();
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kTimeout);
MaybeYield();
Sleep(10ms); // Skip a few events
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kTimeout);
// Cancel with SetOnce
REQUIRE(timer->SetRepeating(1ms, 10ms));
for (int i = 0; i < 10; ++i) {
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kSuccess);
}
REQUIRE(timer->SetOnce(1ms));
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kSuccess); // Signal from Set Once
result = Wait(timer.get(), false, 20ms);
REQUIRE(result == WaitResult::kTimeout); // No more signals from repeating
}
TEST_CASE("Wait on Multiple Timers", "Timer") {
  // One auto-resetting (synchronization) timer and one manual-reset timer,
  // exercised through WaitAll/WaitAny in every signal combination.
  auto sync_timer = Timer::CreateSynchronizationTimer();
  auto manual_timer = Timer::CreateManualResetTimer();
  WaitResult wait_all_result;
  std::pair<WaitResult, size_t> wait_any_result;
  // Neither timer signaled: both wait forms time out; WaitAny reports index 0.
  wait_all_result = WaitAll({sync_timer.get(), manual_timer.get()}, false, 1ms);
  REQUIRE(wait_all_result == WaitResult::kTimeout);
  wait_any_result = WaitAny({sync_timer.get(), manual_timer.get()}, false, 1ms);
  REQUIRE(wait_any_result.first == WaitResult::kTimeout);
  REQUIRE(wait_any_result.second == 0);
  // Only the manual-reset timer signaled: WaitAll still times out, WaitAny
  // succeeds with index 1.
  REQUIRE(manual_timer->SetOnce(1ms));
  wait_all_result =
      WaitAll({sync_timer.get(), manual_timer.get()}, false, 100ms);
  REQUIRE(wait_all_result == WaitResult::kTimeout);
  wait_any_result =
      WaitAny({sync_timer.get(), manual_timer.get()}, false, 100ms);
  REQUIRE(wait_any_result.first == WaitResult::kSuccess);
  REQUIRE(wait_any_result.second == 1);
  // Both timers signaled: WaitAll finally succeeds.
  REQUIRE(sync_timer->SetOnce(1ms));
  wait_all_result =
      WaitAll({sync_timer.get(), manual_timer.get()}, false, 100ms);
  REQUIRE(wait_all_result == WaitResult::kSuccess);
  REQUIRE(sync_timer->SetOnce(1ms));
  Sleep(1ms);
  wait_any_result =
      WaitAny({sync_timer.get(), manual_timer.get()}, false, 100ms);
  REQUIRE(wait_any_result.first == WaitResult::kSuccess);
  REQUIRE(wait_any_result.second == 0);
  // The synchronization timer auto-reset after the wait above, so only the
  // manual-reset timer (index 1) is still signaled.
  wait_any_result =
      WaitAny({sync_timer.get(), manual_timer.get()}, false, 100ms);
  REQUIRE(wait_any_result.first == WaitResult::kSuccess);
  REQUIRE(wait_any_result.second == 1);
}
TEST_CASE("Create and Trigger Timer Callbacks", "Timer") {
  // Placeholder: timer callback delivery is not yet verified here.
  // TODO(bwrsandman): Check which thread performs callback and timing of
  // callback
  REQUIRE(true);
}
TEST_CASE("Set and Test Current Thread ID", "Thread") {
  // The OS-assigned thread identifier must be a positive value.
  const auto os_thread_id = current_thread_system_id();
  REQUIRE(os_thread_id > 0);
  // Until overridden, the logical thread id mirrors the system id.
  const auto logical_id = current_thread_id();
  REQUIRE(logical_id == os_thread_id);
  // Override the logical id and confirm the override is visible.
  const uint32_t kOverrideId = 0xDEADBEEF;
  set_current_thread_id(kOverrideId);
  REQUIRE(current_thread_id() == kOverrideId);
  // Passing the maximum uint32_t value restores the system-provided id.
  set_current_thread_id(std::numeric_limits<uint32_t>::max());
  REQUIRE(current_thread_id() == os_thread_id);
  // TODO(bwrsandman): Test on Thread object
}
TEST_CASE("Set and Test Current Thread Name", "Thread") {
  // Renaming the current thread must not throw, and the previous name must be
  // restorable afterwards.
  auto self = Thread::GetCurrentThread();
  REQUIRE(self);
  auto saved_name = self->name();
  std::string temporary_name = "Threading Test";
  REQUIRE_NOTHROW(set_name(temporary_name));
  // Put the original (catch.hpp-assigned) name back.
  REQUIRE_NOTHROW(set_name(saved_name));
}
TEST_CASE("Create and Run Thread", "Thread") {
  Thread::CreationParameters params = {};
  auto sleep_body = [] { Sleep(20ms); };
  std::unique_ptr<Thread> worker;
  WaitResult wait_result;
  // Simplest case: default parameters and a short-lived body.
  worker = Thread::Create(params, sleep_body);
  REQUIRE(worker->native_handle() != nullptr);
  REQUIRE_NOTHROW(worker->affinity_mask());
  REQUIRE(worker->name().empty());
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // A freshly created thread has no name until one is assigned.
  std::string assigned_name = "Test thread name";
  worker = Thread::Create(params, sleep_body);
  auto initial_name = worker->name();
  INFO(initial_name.c_str());
  REQUIRE(initial_name.empty());
  worker->set_name(assigned_name);
  REQUIRE(worker->name() == assigned_name);
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // Terminate() forcibly ends a thread stuck in an infinite loop.
  worker = Thread::Create(params, [] {
    while (true) {
      Sleep(1ms);
    }
  });
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  worker->Terminate(-1);
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // Thread::Exit() ends the calling thread from inside its own body.
  worker = Thread::Create(params, [] {
    while (true) {
      Thread::Exit(-1);
    }
  });
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // Waiting on the current thread cannot succeed; expect a timeout.
  wait_result = Wait(Thread::GetCurrentThread(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  // A small explicit stack size must still produce a runnable thread.
  params.stack_size = 16 * 1024;
  worker = Thread::Create(params, [] {
    while (true) {
      Thread::Exit(-1);
    }
  });
  REQUIRE(worker != nullptr);
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // TODO(bwrsandman): Test with different priorities
  // TODO(bwrsandman): Test setting and getting thread affinity
}
TEST_CASE("Test Suspending Thread", "Thread") {
  Thread::CreationParameters params = {};
  auto sleep_body = [] { Sleep(20ms); };
  std::unique_ptr<Thread> worker;
  WaitResult wait_result;
  // A thread created suspended must not run until Resume() is called.
  params.create_suspended = true;
  worker = Thread::Create(params, sleep_body);
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  worker->Resume();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  params.create_suspended = false;
  // Suspending an already-running thread parks it until resumed.
  worker = Thread::Create(params, sleep_body);
  worker->Suspend();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  worker->Resume();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // Suspension nests: every Suspend() needs a matching Resume().
  worker = Thread::Create(params, sleep_body);
  worker->Suspend();
  worker->Suspend();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  worker->Resume();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kTimeout);
  worker->Resume();
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
  // Suspend()/Resume() report the suspend count prior to the call.
  uint32_t previous_count = 0;
  worker = Thread::Create(params, sleep_body);
  worker->Suspend(&previous_count);
  REQUIRE(previous_count == 0);
  worker->Suspend(&previous_count);
  REQUIRE(previous_count == 1);
  worker->Suspend(&previous_count);
  REQUIRE(previous_count == 2);
  worker->Resume(&previous_count);
  REQUIRE(previous_count == 3);
  worker->Resume(&previous_count);
  REQUIRE(previous_count == 2);
  worker->Resume(&previous_count);
  REQUIRE(previous_count == 1);
  worker->Suspend(&previous_count);
  REQUIRE(previous_count == 0);
  worker->Resume(&previous_count);
  REQUIRE(previous_count == 1);
  wait_result = Wait(worker.get(), false, 50ms);
  REQUIRE(wait_result == WaitResult::kSuccess);
}
TEST_CASE("Test Thread QueueUserCallback", "Thread") {
  std::unique_ptr<Thread> thread;
  WaitResult result;
  Thread::CreationParameters params = {};
  // Shared sequence counter: records the order in which the queued callback
  // and the thread body reach their completion points.
  std::atomic_int order;
  // Sequence index observed by the queued callback; -1 means it never ran.
  int is_modified;
  // Sequence index observed at the end of the thread body; -1 means the body
  // never reached its end.
  int has_finished;
  auto callback = [&is_modified, &order] {
    is_modified = std::atomic_fetch_add_explicit(
        &order, 1, std::memory_order::memory_order_relaxed);
  };
  // Without alertable
  order = 0;
  is_modified = -1;
  has_finished = -1;
  thread = Thread::Create(params, [&has_finished, &order] {
    // Not using Alertable so callback is not registered
    Sleep(90ms);
    has_finished = std::atomic_fetch_add_explicit(
        &order, 1, std::memory_order::memory_order_relaxed);
  });
  result = Wait(thread.get(), true, 50ms);
  REQUIRE(result == WaitResult::kTimeout);
  REQUIRE(is_modified == -1);
  thread->QueueUserCallback(callback);
  result = Wait(thread.get(), true, 100ms);
  REQUIRE(result == WaitResult::kSuccess);
  // The plain Sleep() above is not alertable, so the queued callback never
  // ran (-1) and only the body incremented the counter (index 0).
  REQUIRE(is_modified == -1);
  REQUIRE(has_finished == 0);
  // With alertable
  order = 0;
  is_modified = -1;
  has_finished = -1;
  thread = Thread::Create(params, [&has_finished, &order] {
    // Using Alertable so callback is registered
    AlertableSleep(90ms);
    has_finished = std::atomic_fetch_add_explicit(
        &order, 1, std::memory_order::memory_order_relaxed);
  });
  result = Wait(thread.get(), true, 50ms);
  REQUIRE(result == WaitResult::kTimeout);
  REQUIRE(is_modified == -1);
  thread->QueueUserCallback(callback);
  result = Wait(thread.get(), true, 100ms);
  REQUIRE(result == WaitResult::kSuccess);
  // The alertable sleep delivered the callback first (index 0), then the body
  // finished (index 1).
  REQUIRE(is_modified == 0);
  REQUIRE(has_finished == 1);
  // Test Exit command with QueueUserCallback
  order = 0;
  is_modified = -1;
  has_finished = -1;
  thread = Thread::Create(params, [&is_modified, &has_finished, &order] {
    is_modified = std::atomic_fetch_add_explicit(
        &order, 1, std::memory_order::memory_order_relaxed);
    // Using Alertable so callback is registered
    AlertableSleep(200ms);
    has_finished = std::atomic_fetch_add_explicit(
        &order, 1, std::memory_order::memory_order_relaxed);
  });
  result = Wait(thread.get(), true, 100ms);
  REQUIRE(result == WaitResult::kTimeout);
  // Exiting from inside the queued callback terminates the thread before the
  // body can reach its final increment, so has_finished stays -1.
  thread->QueueUserCallback([] { Thread::Exit(0); });
  result = Wait(thread.get(), true, 500ms);
  REQUIRE(result == WaitResult::kSuccess);
  REQUIRE(is_modified == 0);
  REQUIRE(has_finished == -1);
  // TODO(bwrsandman): Test alertable wait returning kUserCallback by using IO
  // callbacks.
}
} // namespace test
} // namespace base
} // namespace xe

View File

@ -24,29 +24,56 @@
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
namespace xe {
namespace threading {
// This is more like an Event with self-reset when returning from Wait()
class Fence {
public:
Fence() : signaled_(false) {}
Fence() : signal_state_(0) {}
void Signal() {
std::unique_lock<std::mutex> lock(mutex_);
signaled_.store(true);
signal_state_ |= SIGMASK_;
cond_.notify_all();
}
// Wait for the Fence to be signaled. Clears the signal on return.
void Wait() {
std::unique_lock<std::mutex> lock(mutex_);
while (!signaled_.load()) {
assert_true((signal_state_ & ~SIGMASK_) < (SIGMASK_ - 1) &&
"Too many threads?");
// keep local copy to minimize loads
auto signal_state = ++signal_state_;
for (; !(signal_state & SIGMASK_); signal_state = signal_state_) {
cond_.wait(lock);
}
signaled_.store(false);
// We can't just clear the signal as other threads may not have read it yet
assert_true((signal_state & ~SIGMASK_) > 0); // wait_count > 0
if (signal_state == (1 | SIGMASK_)) { // wait_count == 1
// Last one out turn off the lights
signal_state_ = 0;
} else {
// Oops, another thread is still waiting, set the new count and keep the
// signal.
signal_state_ = --signal_state;
}
}
private:
using state_t_ = uint_fast32_t;
static constexpr state_t_ SIGMASK_ = state_t_(1)
<< (sizeof(state_t_) * 8 - 1);
std::mutex mutex_;
std::condition_variable cond_;
std::atomic<bool> signaled_;
// Use the highest bit (sign bit) as the signal flag and the rest to count
// waiting threads.
volatile state_t_ signal_state_;
};
// Returns the total number of logical processors in the host system.
@ -308,12 +335,12 @@ class Timer : public WaitHandle {
std::chrono::milliseconds period,
std::function<void()> opt_callback = nullptr) = 0;
template <typename Rep, typename Period>
void SetRepeating(std::chrono::nanoseconds due_time,
bool SetRepeating(std::chrono::nanoseconds due_time,
std::chrono::duration<Rep, Period> period,
std::function<void()> opt_callback = nullptr) {
SetRepeating(due_time,
std::chrono::duration_cast<std::chrono::milliseconds>(period),
std::move(opt_callback));
return SetRepeating(
due_time, std::chrono::duration_cast<std::chrono::milliseconds>(period),
std::move(opt_callback));
}
// Stops the timer before it can be set to the signaled state and cancels
@ -391,7 +418,7 @@ class Thread : public WaitHandle {
// Decrements a thread's suspend count. When the suspend count is decremented
// to zero, the execution of the thread is resumed.
virtual bool Resume(uint32_t* out_new_suspend_count = nullptr) = 0;
virtual bool Resume(uint32_t* out_previous_suspend_count = nullptr) = 0;
// Suspends the specified thread.
virtual bool Suspend(uint32_t* out_previous_suspend_count = nullptr) = 0;

File diff suppressed because it is too large Load Diff

View File

@ -388,16 +388,16 @@ class Win32Thread : public Win32Handle<Thread> {
QueueUserAPC(DispatchApc, handle_, reinterpret_cast<ULONG_PTR>(apc_data));
}
bool Resume(uint32_t* out_new_suspend_count = nullptr) override {
if (out_new_suspend_count) {
*out_new_suspend_count = 0;
bool Resume(uint32_t* out_previous_suspend_count = nullptr) override {
if (out_previous_suspend_count) {
*out_previous_suspend_count = 0;
}
DWORD result = ResumeThread(handle_);
if (result == UINT_MAX) {
return false;
}
if (out_new_suspend_count) {
*out_new_suspend_count = result;
if (out_previous_suspend_count) {
*out_previous_suspend_count = result;
}
return true;
}

View File

@ -30,7 +30,7 @@ ExportResolver::Table::Table(const std::string_view module_name,
}
std::sort(
exports_by_name_.begin(), exports_by_name_.end(),
[](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; });
[](Export* a, Export* b) { return std::strcmp(a->name, b->name) < 0; });
}
ExportResolver::ExportResolver() = default;
@ -51,7 +51,7 @@ void ExportResolver::RegisterTable(
}
std::sort(
all_exports_by_name_.begin(), all_exports_by_name_.end(),
[](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; });
[](Export* a, Export* b) { return std::strcmp(a->name, b->name) < 0; });
}
Export* ExportResolver::GetExportByOrdinal(const std::string_view module_name,

View File

@ -73,7 +73,7 @@ bool CommandProcessor::Initialize(
WorkerThreadMain();
return 0;
}));
worker_thread_->set_name("GraphicsSystem Command Processor");
worker_thread_->set_name("GPU Commands");
worker_thread_->Create();
return true;
@ -731,12 +731,20 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) {
} break;
case PM4_CONTEXT_UPDATE: {
assert_true(count == 1);
uint64_t value = reader->ReadAndSwap<uint32_t>();
uint32_t value = reader->ReadAndSwap<uint32_t>();
XELOGGPU("GPU context update = {:08X}", value);
assert_true(value == 0);
result = true;
break;
}
case PM4_WAIT_FOR_IDLE: {
// This opcode is used by "Duke Nukem Forever" while going/being ingame
assert_true(count == 1);
uint32_t value = reader->ReadAndSwap<uint32_t>();
XELOGGPU("GPU wait for idle = {:08X}", value);
result = true;
break;
}
default:
XELOGGPU("Unimplemented GPU OPCODE: 0x{:02X}\t\tCOUNT: {}\n", opcode,

View File

@ -21,6 +21,7 @@
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_util.h"
@ -387,7 +388,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
sampler_count_vertex);
return nullptr;
}
root_signatures_bindful_.insert({index, root_signature});
root_signatures_bindful_.emplace(index, root_signature);
return root_signature;
}
@ -745,12 +746,11 @@ void D3D12CommandProcessor::SetSamplePositions(
current_sample_positions_ = sample_positions;
}
void D3D12CommandProcessor::SetComputePipelineState(
ID3D12PipelineState* pipeline_state) {
if (current_external_pipeline_state_ != pipeline_state) {
deferred_command_list_.D3DSetPipelineState(pipeline_state);
current_external_pipeline_state_ = pipeline_state;
current_cached_pipeline_state_ = nullptr;
void D3D12CommandProcessor::SetComputePipeline(ID3D12PipelineState* pipeline) {
if (current_external_pipeline_ != pipeline) {
deferred_command_list_.D3DSetPipelineState(pipeline);
current_external_pipeline_ = pipeline;
current_cached_pipeline_ = nullptr;
}
}
@ -773,8 +773,16 @@ std::string D3D12CommandProcessor::GetWindowTitleText() const {
}
// Currently scaling is only supported with ROV.
if (texture_cache_ != nullptr && texture_cache_->IsResolutionScale2X()) {
return "Direct3D 12 - 2x";
return "Direct3D 12 - ROV 2x";
}
// Rasterizer-ordered views are a feature very rarely used as of 2020 and
// that faces adoption complications (outside of Direct3D - on Vulkan - at
// least), but crucial to Xenia - raise awareness of its usage.
// https://github.com/KhronosGroup/Vulkan-Ecosystem/issues/27#issuecomment-455712319
// "In Xenia's title bar "D3D12 ROV" can be seen, which was a surprise, as I
// wasn't aware that Xenia D3D12 backend was using Raster Order Views
// feature" - oscarbg in that issue.
return "Direct3D 12 - ROV";
}
return "Direct3D 12";
}
@ -1196,7 +1204,7 @@ bool D3D12CommandProcessor::SetupContext() {
*this, *register_file_, bindless_resources_used_, edram_rov_used_,
texture_cache_->IsResolutionScale2X() ? 2 : 1);
if (!pipeline_cache_->Initialize()) {
XELOGE("Failed to initialize the graphics pipeline state cache");
XELOGE("Failed to initialize the graphics pipeline cache");
return false;
}
@ -1526,8 +1534,7 @@ void D3D12CommandProcessor::ShutdownContext() {
// Shut down binding - bindless descriptors may be owned by subsystems like
// the texture cache.
// Root signatured are used by pipeline states, thus freed after the pipeline
// states.
// Root signatures are used by pipelines, thus freed after the pipelines.
ui::d3d12::util::ReleaseAndNull(root_signature_bindless_ds_);
ui::d3d12::util::ReleaseAndNull(root_signature_bindless_vs_);
for (auto it : root_signatures_bindful_) {
@ -1878,7 +1885,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
xenos::VertexShaderExportMode::kMultipass ||
(primitive_two_faced && pa_su_sc_mode_cntl.cull_front &&
pa_su_sc_mode_cntl.cull_back))) {
// All faces are culled - can't be expressed in the pipeline state.
// All faces are culled - can't be expressed in the pipeline.
return true;
}
@ -1954,7 +1961,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
line_loop_closing_index = 0;
}
// Update the textures - this may bind pipeline state objects.
// Update the textures - this may bind pipelines.
uint32_t used_texture_mask =
vertex_shader->GetUsedTextureMask() |
(pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0);
@ -1972,21 +1979,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
early_z = true;
}
// Create the pipeline state object if needed and bind it.
void* pipeline_state_handle;
// Create the pipeline if needed and bind it.
void* pipeline_handle;
ID3D12RootSignature* root_signature;
if (!pipeline_cache_->ConfigurePipeline(
vertex_shader, pixel_shader, primitive_type_converted,
indexed ? index_buffer_info->format : xenos::IndexFormat::kInt16,
early_z, pipeline_render_targets, &pipeline_state_handle,
early_z, pipeline_render_targets, &pipeline_handle,
&root_signature)) {
return false;
}
if (current_cached_pipeline_state_ != pipeline_state_handle) {
if (current_cached_pipeline_ != pipeline_handle) {
deferred_command_list_.SetPipelineStateHandle(
reinterpret_cast<void*>(pipeline_state_handle));
current_cached_pipeline_state_ = pipeline_state_handle;
current_external_pipeline_state_ = nullptr;
reinterpret_cast<void*>(pipeline_handle));
current_cached_pipeline_ = pipeline_handle;
current_external_pipeline_ = nullptr;
}
// Update viewport, scissor, blend factor and stencil reference.
@ -2005,14 +2012,15 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
// Must not call anything that can change the descriptor heap from now on!
// Ensure vertex and index buffers are resident and draw.
// Ensure vertex buffers are resident.
// TODO(Triang3l): Cache residency for ranges in a way similar to how texture
// validity will be tracked.
// validity is tracked.
uint64_t vertex_buffers_resident[2] = {};
for (const auto& vertex_binding : vertex_shader->vertex_bindings()) {
for (const Shader::VertexBinding& vertex_binding :
vertex_shader->vertex_bindings()) {
uint32_t vfetch_index = vertex_binding.fetch_constant;
if (vertex_buffers_resident[vfetch_index >> 6] &
(1ull << (vfetch_index & 63))) {
(uint64_t(1) << (vfetch_index & 63))) {
continue;
}
const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
@ -2045,7 +2053,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
vfetch_constant.address << 2, vfetch_constant.size << 2);
return false;
}
vertex_buffers_resident[vfetch_index >> 6] |= 1ull << (vfetch_index & 63);
vertex_buffers_resident[vfetch_index >> 6] |= uint64_t(1)
<< (vfetch_index & 63);
}
// Gather memexport ranges and ensure the heaps for them are resident, and
@ -2517,8 +2526,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) {
submission_open_ = true;
// Start a new deferred command list - will submit it to the real one in the
// end of the submission (when async pipeline state object creation requests
// are fulfilled).
// end of the submission (when async pipeline creation requests are
// fulfilled).
deferred_command_list_.Reset();
// Reset cached state of the command list.
@ -2527,8 +2536,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) {
ff_blend_factor_update_needed_ = true;
ff_stencil_ref_update_needed_ = true;
current_sample_positions_ = xenos::MsaaSamples::k1X;
current_cached_pipeline_state_ = nullptr;
current_external_pipeline_state_ = nullptr;
current_cached_pipeline_ = nullptr;
current_external_pipeline_ = nullptr;
current_graphics_root_signature_ = nullptr;
current_graphics_root_up_to_date_ = 0;
if (bindless_resources_used_) {
@ -2724,7 +2733,7 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) {
}
bool D3D12CommandProcessor::CanEndSubmissionImmediately() const {
return !submission_open_ || !pipeline_cache_->IsCreatingPipelineStates();
return !submission_open_ || !pipeline_cache_->IsCreatingPipelines();
}
void D3D12CommandProcessor::ClearCommandAllocatorCache() {
@ -2745,12 +2754,12 @@ void D3D12CommandProcessor::ClearCommandAllocatorCache() {
}
void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) {
auto& regs = *register_file_;
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
const RegisterFile& regs = *register_file_;
// Window parameters.
// http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h
// See r200UpdateWindow:
@ -2838,34 +2847,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) {
}
// Scissor.
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
D3D12_RECT scissor;
scissor.left = pa_sc_window_scissor_tl.tl_x;
scissor.top = pa_sc_window_scissor_tl.tl_y;
scissor.right = pa_sc_window_scissor_br.br_x;
scissor.bottom = pa_sc_window_scissor_br.br_y;
if (!pa_sc_window_scissor_tl.window_offset_disable) {
scissor.left =
std::max(scissor.left + pa_sc_window_offset.window_x_offset, LONG(0));
scissor.top =
std::max(scissor.top + pa_sc_window_offset.window_y_offset, LONG(0));
scissor.right =
std::max(scissor.right + pa_sc_window_offset.window_x_offset, LONG(0));
scissor.bottom =
std::max(scissor.bottom + pa_sc_window_offset.window_y_offset, LONG(0));
}
scissor.left *= pixel_size_x;
scissor.top *= pixel_size_y;
scissor.right *= pixel_size_x;
scissor.bottom *= pixel_size_y;
ff_scissor_update_needed_ |= ff_scissor_.left != scissor.left;
ff_scissor_update_needed_ |= ff_scissor_.top != scissor.top;
ff_scissor_update_needed_ |= ff_scissor_.right != scissor.right;
ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor.bottom;
draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor);
D3D12_RECT scissor_rect;
scissor_rect.left = LONG(scissor.left * pixel_size_x);
scissor_rect.top = LONG(scissor.top * pixel_size_y);
scissor_rect.right = LONG((scissor.left + scissor.width) * pixel_size_x);
scissor_rect.bottom = LONG((scissor.top + scissor.height) * pixel_size_y);
ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left;
ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top;
ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right;
ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom;
if (ff_scissor_update_needed_) {
ff_scissor_ = scissor;
deferred_command_list_.RSSetScissorRect(scissor);
ff_scissor_ = scissor_rect;
deferred_command_list_.RSSetScissorRect(scissor_rect);
ff_scissor_update_needed_ = false;
}
@ -2915,12 +2910,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
uint32_t line_loop_closing_index, xenos::Endian index_endian,
uint32_t used_texture_mask, bool early_z, uint32_t color_mask,
const RenderTargetCache::PipelineRenderTarget render_targets[4]) {
auto& regs = *register_file_;
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
const RegisterFile& regs = *register_file_;
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
@ -3103,14 +3097,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
dirty |= system_constants_.line_loop_closing_index != line_loop_closing_index;
system_constants_.line_loop_closing_index = line_loop_closing_index;
// Vertex index offset.
dirty |= system_constants_.vertex_base_index != vgt_indx_offset;
system_constants_.vertex_base_index = vgt_indx_offset;
// Index or tessellation edge factor buffer endianness.
dirty |= system_constants_.vertex_index_endian != index_endian;
system_constants_.vertex_index_endian = index_endian;
// Vertex index offset.
dirty |= system_constants_.vertex_base_index != vgt_indx_offset;
system_constants_.vertex_base_index = vgt_indx_offset;
// User clip planes (UCP_ENA_#), when not CLIP_DISABLE.
if (!pa_cl_clip_cntl.clip_disable) {
for (uint32_t i = 0; i < 6; ++i) {
@ -3574,7 +3568,7 @@ bool D3D12CommandProcessor::UpdateBindings(
float_constant_map_vertex.float_bitmap[i];
// If no float constants at all, we can reuse any buffer for them, so not
// invalidating.
if (float_constant_map_vertex.float_count != 0) {
if (float_constant_count_vertex) {
cbuffer_binding_float_vertex_.up_to_date = false;
}
}
@ -3589,7 +3583,7 @@ bool D3D12CommandProcessor::UpdateBindings(
float_constant_map_pixel.float_bitmap[i]) {
current_float_constant_map_pixel_[i] =
float_constant_map_pixel.float_bitmap[i];
if (float_constant_map_pixel.float_count != 0) {
if (float_constant_count_pixel) {
cbuffer_binding_float_pixel_.up_to_date = false;
}
}
@ -3889,8 +3883,8 @@ bool D3D12CommandProcessor::UpdateBindings(
sampler_parameters,
provider.OffsetSamplerDescriptor(
sampler_bindless_heap_cpu_start_, sampler_index));
texture_cache_bindless_sampler_map_.insert(
{sampler_parameters.value, sampler_index});
texture_cache_bindless_sampler_map_.emplace(
sampler_parameters.value, sampler_index);
}
current_sampler_bindless_indices_vertex_[j] = sampler_index;
}
@ -3921,8 +3915,8 @@ bool D3D12CommandProcessor::UpdateBindings(
sampler_parameters,
provider.OffsetSamplerDescriptor(
sampler_bindless_heap_cpu_start_, sampler_index));
texture_cache_bindless_sampler_map_.insert(
{sampler_parameters.value, sampler_index});
texture_cache_bindless_sampler_map_.emplace(
sampler_parameters.value, sampler_index);
}
current_sampler_bindless_indices_pixel_[j] = sampler_index;
}

View File

@ -190,19 +190,17 @@ class D3D12CommandProcessor : public CommandProcessor {
// render targets or copying to depth render targets.
void SetSamplePositions(xenos::MsaaSamples sample_positions);
// Returns a pipeline state object with deferred creation by its handle. May
// return nullptr if failed to create the pipeline state object.
inline ID3D12PipelineState* GetD3D12PipelineStateByHandle(
void* handle) const {
return pipeline_cache_->GetD3D12PipelineStateByHandle(handle);
// Returns a pipeline with deferred creation by its handle. May return nullptr
// if failed to create the pipeline.
ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const {
return pipeline_cache_->GetD3D12PipelineByHandle(handle);
}
// Sets the current pipeline state to a compute one. This is for cache
// invalidation primarily. A submission must be open.
void SetComputePipelineState(ID3D12PipelineState* pipeline_state);
// Sets the current pipeline to a compute one. This is for cache invalidation
// primarily. A submission must be open.
void SetComputePipeline(ID3D12PipelineState* pipeline);
// For the pipeline state cache to call when binding layout UIDs may be
// reused.
// For the pipeline cache to call when binding layout UIDs may be reused.
void NotifyShaderBindingsLayoutUIDsInvalidated();
// Returns the text to display in the GPU backend name in the window title.
@ -327,8 +325,8 @@ class D3D12CommandProcessor : public CommandProcessor {
bool EndSubmission(bool is_swap);
// Checks if ending a submission right now would not cause potentially more
// delay than it would reduce by making the GPU start working earlier - such
// as when there are unfinished graphics pipeline state creation requests that
// would need to be fulfilled before actually submitting the command list.
// as when there are unfinished graphics pipeline creation requests that would
// need to be fulfilled before actually submitting the command list.
bool CanEndSubmissionImmediately() const;
bool AwaitAllQueueOperationsCompletion() {
CheckSubmissionFence(submission_current_);
@ -512,7 +510,7 @@ class D3D12CommandProcessor : public CommandProcessor {
return cvars::internal_tile_height;
}
inline std::pair<uint32_t, uint32_t> GetSwapTextureSize() const {
std::pair<uint32_t, uint32_t> GetSwapTextureSize() const {
if (texture_cache_->IsResolutionScale2X()) {
return std::make_pair(kSwapTextureWidth() * 2, kSwapTextureHeight() * 2);
}
@ -557,13 +555,12 @@ class D3D12CommandProcessor : public CommandProcessor {
// Current SSAA sample positions (to be updated by the render target cache).
xenos::MsaaSamples current_sample_positions_;
// Currently bound pipeline state, either a graphics pipeline state object
// from the pipeline state cache (with potentially deferred creation -
// current_external_pipeline_state_ is nullptr in this case) or a non-Xenos
// graphics or compute pipeline state object (current_cached_pipeline_state_
// is nullptr in this case).
void* current_cached_pipeline_state_;
ID3D12PipelineState* current_external_pipeline_state_;
// Currently bound pipeline, either a graphics pipeline from the pipeline
// cache (with potentially deferred creation - current_external_pipeline_ is
// nullptr in this case) or a non-Xenos graphics or compute pipeline
// (current_cached_pipeline_ is nullptr in this case).
void* current_cached_pipeline_;
ID3D12PipelineState* current_external_pipeline_;
// Currently bound graphics root signature.
ID3D12RootSignature* current_graphics_root_signature_;

View File

@ -157,7 +157,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
stretch_pipeline_desc.SampleDesc.Count = 1;
if (FAILED(device->CreateGraphicsPipelineState(
&stretch_pipeline_desc, IID_PPV_ARGS(&stretch_pipeline_)))) {
XELOGE("Failed to create the front buffer stretch pipeline state");
XELOGE("Failed to create the front buffer stretch pipeline");
stretch_gamma_root_signature_->Release();
stretch_gamma_root_signature_ = nullptr;
stretch_root_signature_->Release();
@ -170,8 +170,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
if (FAILED(device->CreateGraphicsPipelineState(
&stretch_pipeline_desc, IID_PPV_ARGS(&stretch_gamma_pipeline_)))) {
XELOGE(
"Failed to create the gamma-correcting front buffer stretch "
"pipeline state");
"Failed to create the gamma-correcting front buffer stretch pipeline");
stretch_pipeline_->Release();
stretch_pipeline_ = nullptr;
stretch_gamma_root_signature_->Release();

View File

@ -85,7 +85,7 @@ class D3D12Shader : public Shader {
return sampler_bindings_.data();
}
// For owning subsystems like the pipeline state cache, accessors for unique
// For owning subsystems like the pipeline cache, accessors for unique
// identifiers (used instead of hashes to make sure collisions can't happen)
// of binding layouts used by the shader, for invalidation if a shader with an
// incompatible layout was bound.

View File

@ -48,7 +48,7 @@ class D3D12SharedMemory : public SharedMemory {
// UseForReading or UseForWriting.
// Makes the buffer usable for vertices, indices and texture untiling.
inline void UseForReading() {
void UseForReading() {
// Vertex fetch is also allowed in pixel shaders.
CommitUAVWritesAndTransitionBuffer(
D3D12_RESOURCE_STATE_INDEX_BUFFER |
@ -56,18 +56,18 @@ class D3D12SharedMemory : public SharedMemory {
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
}
// Makes the buffer usable for texture tiling after a resolve.
inline void UseForWriting() {
void UseForWriting() {
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
}
// Makes the buffer usable as a source for copy commands.
inline void UseAsCopySource() {
void UseAsCopySource() {
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE);
}
// Must be called when doing draws/dispatches modifying data within the shared
// memory buffer as a UAV, to make sure that when UseForWriting is called the
// next time, a UAV barrier will be done, and subsequent overlapping UAV
// writes and reads are ordered.
inline void MarkUAVWritesCommitNeeded() {
void MarkUAVWritesCommitNeeded() {
if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
buffer_uav_writes_commit_needed_ = true;
}

View File

@ -209,9 +209,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
}
} break;
case Command::kSetPipelineStateHandle: {
current_pipeline_state =
command_processor_.GetD3D12PipelineStateByHandle(
*reinterpret_cast<void* const*>(stream));
current_pipeline_state = command_processor_.GetD3D12PipelineByHandle(
*reinterpret_cast<void* const*>(stream));
if (current_pipeline_state) {
command_list->SetPipelineState(current_pipeline_state);
}

View File

@ -33,7 +33,7 @@ class DeferredCommandList {
void Execute(ID3D12GraphicsCommandList* command_list,
ID3D12GraphicsCommandList1* command_list_1);
inline void D3DClearUnorderedAccessViewUint(
void D3DClearUnorderedAccessViewUint(
D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap,
D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle, ID3D12Resource* resource,
const UINT values[4], UINT num_rects, const D3D12_RECT* rects) {
@ -51,9 +51,9 @@ class DeferredCommandList {
}
}
inline void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset,
ID3D12Resource* src_buffer, UINT64 src_offset,
UINT64 num_bytes) {
void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset,
ID3D12Resource* src_buffer, UINT64 src_offset,
UINT64 num_bytes) {
auto& args = *reinterpret_cast<D3DCopyBufferRegionArguments*>(WriteCommand(
Command::kD3DCopyBufferRegion, sizeof(D3DCopyBufferRegionArguments)));
args.dst_buffer = dst_buffer;
@ -63,26 +63,26 @@ class DeferredCommandList {
args.num_bytes = num_bytes;
}
inline void D3DCopyResource(ID3D12Resource* dst_resource,
ID3D12Resource* src_resource) {
void D3DCopyResource(ID3D12Resource* dst_resource,
ID3D12Resource* src_resource) {
auto& args = *reinterpret_cast<D3DCopyResourceArguments*>(WriteCommand(
Command::kD3DCopyResource, sizeof(D3DCopyResourceArguments)));
args.dst_resource = dst_resource;
args.src_resource = src_resource;
}
inline void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst,
const D3D12_TEXTURE_COPY_LOCATION& src) {
void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst,
const D3D12_TEXTURE_COPY_LOCATION& src) {
auto& args = *reinterpret_cast<CopyTextureArguments*>(
WriteCommand(Command::kCopyTexture, sizeof(CopyTextureArguments)));
std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
}
inline void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst,
UINT dst_x, UINT dst_y, UINT dst_z,
const D3D12_TEXTURE_COPY_LOCATION& src,
const D3D12_BOX& src_box) {
void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, UINT dst_x,
UINT dst_y, UINT dst_z,
const D3D12_TEXTURE_COPY_LOCATION& src,
const D3D12_BOX& src_box) {
auto& args = *reinterpret_cast<CopyTextureRegionArguments*>(WriteCommand(
Command::kCopyTextureRegion, sizeof(CopyTextureRegionArguments)));
std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
@ -93,8 +93,8 @@ class DeferredCommandList {
args.src_box = src_box;
}
inline void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y,
UINT thread_group_count_z) {
void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y,
UINT thread_group_count_z) {
auto& args = *reinterpret_cast<D3DDispatchArguments*>(
WriteCommand(Command::kD3DDispatch, sizeof(D3DDispatchArguments)));
args.thread_group_count_x = thread_group_count_x;
@ -102,11 +102,10 @@ class DeferredCommandList {
args.thread_group_count_z = thread_group_count_z;
}
inline void D3DDrawIndexedInstanced(UINT index_count_per_instance,
UINT instance_count,
UINT start_index_location,
INT base_vertex_location,
UINT start_instance_location) {
void D3DDrawIndexedInstanced(UINT index_count_per_instance,
UINT instance_count, UINT start_index_location,
INT base_vertex_location,
UINT start_instance_location) {
auto& args = *reinterpret_cast<D3DDrawIndexedInstancedArguments*>(
WriteCommand(Command::kD3DDrawIndexedInstanced,
sizeof(D3DDrawIndexedInstancedArguments)));
@ -117,9 +116,9 @@ class DeferredCommandList {
args.start_instance_location = start_instance_location;
}
inline void D3DDrawInstanced(UINT vertex_count_per_instance,
UINT instance_count, UINT start_vertex_location,
UINT start_instance_location) {
void D3DDrawInstanced(UINT vertex_count_per_instance, UINT instance_count,
UINT start_vertex_location,
UINT start_instance_location) {
auto& args = *reinterpret_cast<D3DDrawInstancedArguments*>(WriteCommand(
Command::kD3DDrawInstanced, sizeof(D3DDrawInstancedArguments)));
args.vertex_count_per_instance = vertex_count_per_instance;
@ -128,7 +127,7 @@ class DeferredCommandList {
args.start_instance_location = start_instance_location;
}
inline void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) {
void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) {
auto& args = *reinterpret_cast<D3D12_INDEX_BUFFER_VIEW*>(WriteCommand(
Command::kD3DIASetIndexBuffer, sizeof(D3D12_INDEX_BUFFER_VIEW)));
if (view != nullptr) {
@ -142,14 +141,13 @@ class DeferredCommandList {
}
}
inline void D3DIASetPrimitiveTopology(
D3D12_PRIMITIVE_TOPOLOGY primitive_topology) {
void D3DIASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitive_topology) {
auto& arg = *reinterpret_cast<D3D12_PRIMITIVE_TOPOLOGY*>(WriteCommand(
Command::kD3DIASetPrimitiveTopology, sizeof(D3D12_PRIMITIVE_TOPOLOGY)));
arg = primitive_topology;
}
inline void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) {
void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) {
auto args = reinterpret_cast<FLOAT*>(
WriteCommand(Command::kD3DOMSetBlendFactor, 4 * sizeof(FLOAT)));
args[0] = blend_factor[0];
@ -158,7 +156,7 @@ class DeferredCommandList {
args[3] = blend_factor[3];
}
inline void D3DOMSetRenderTargets(
void D3DOMSetRenderTargets(
UINT num_render_target_descriptors,
const D3D12_CPU_DESCRIPTOR_HANDLE* render_target_descriptors,
BOOL rts_single_handle_to_descriptor_range,
@ -185,14 +183,14 @@ class DeferredCommandList {
}
}
inline void D3DOMSetStencilRef(UINT stencil_ref) {
void D3DOMSetStencilRef(UINT stencil_ref) {
auto& arg = *reinterpret_cast<UINT*>(
WriteCommand(Command::kD3DOMSetStencilRef, sizeof(UINT)));
arg = stencil_ref;
}
inline void D3DResourceBarrier(UINT num_barriers,
const D3D12_RESOURCE_BARRIER* barriers) {
void D3DResourceBarrier(UINT num_barriers,
const D3D12_RESOURCE_BARRIER* barriers) {
if (num_barriers == 0) {
return;
}
@ -207,21 +205,22 @@ class DeferredCommandList {
num_barriers * sizeof(D3D12_RESOURCE_BARRIER));
}
inline void RSSetScissorRect(const D3D12_RECT& rect) {
void RSSetScissorRect(const D3D12_RECT& rect) {
auto& arg = *reinterpret_cast<D3D12_RECT*>(
WriteCommand(Command::kRSSetScissorRect, sizeof(D3D12_RECT)));
arg = rect;
}
inline void RSSetViewport(const D3D12_VIEWPORT& viewport) {
void RSSetViewport(const D3D12_VIEWPORT& viewport) {
auto& arg = *reinterpret_cast<D3D12_VIEWPORT*>(
WriteCommand(Command::kRSSetViewport, sizeof(D3D12_VIEWPORT)));
arg = viewport;
}
inline void D3DSetComputeRoot32BitConstants(
UINT root_parameter_index, UINT num_32bit_values_to_set,
const void* src_data, UINT dest_offset_in_32bit_values) {
void D3DSetComputeRoot32BitConstants(UINT root_parameter_index,
UINT num_32bit_values_to_set,
const void* src_data,
UINT dest_offset_in_32bit_values) {
if (num_32bit_values_to_set == 0) {
return;
}
@ -235,9 +234,10 @@ class DeferredCommandList {
std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t));
}
inline void D3DSetGraphicsRoot32BitConstants(
UINT root_parameter_index, UINT num_32bit_values_to_set,
const void* src_data, UINT dest_offset_in_32bit_values) {
void D3DSetGraphicsRoot32BitConstants(UINT root_parameter_index,
UINT num_32bit_values_to_set,
const void* src_data,
UINT dest_offset_in_32bit_values) {
if (num_32bit_values_to_set == 0) {
return;
}
@ -251,7 +251,7 @@ class DeferredCommandList {
std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t));
}
inline void D3DSetComputeRootConstantBufferView(
void D3DSetComputeRootConstantBufferView(
UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) {
auto& args = *reinterpret_cast<SetRootConstantBufferViewArguments*>(
WriteCommand(Command::kD3DSetComputeRootConstantBufferView,
@ -260,7 +260,7 @@ class DeferredCommandList {
args.buffer_location = buffer_location;
}
inline void D3DSetGraphicsRootConstantBufferView(
void D3DSetGraphicsRootConstantBufferView(
UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) {
auto& args = *reinterpret_cast<SetRootConstantBufferViewArguments*>(
WriteCommand(Command::kD3DSetGraphicsRootConstantBufferView,
@ -269,7 +269,7 @@ class DeferredCommandList {
args.buffer_location = buffer_location;
}
inline void D3DSetComputeRootDescriptorTable(
void D3DSetComputeRootDescriptorTable(
UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) {
auto& args = *reinterpret_cast<SetRootDescriptorTableArguments*>(
WriteCommand(Command::kD3DSetComputeRootDescriptorTable,
@ -278,7 +278,7 @@ class DeferredCommandList {
args.base_descriptor.ptr = base_descriptor.ptr;
}
inline void D3DSetGraphicsRootDescriptorTable(
void D3DSetGraphicsRootDescriptorTable(
UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) {
auto& args = *reinterpret_cast<SetRootDescriptorTableArguments*>(
WriteCommand(Command::kD3DSetGraphicsRootDescriptorTable,
@ -287,42 +287,40 @@ class DeferredCommandList {
args.base_descriptor.ptr = base_descriptor.ptr;
}
inline void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) {
void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) {
auto& arg = *reinterpret_cast<ID3D12RootSignature**>(WriteCommand(
Command::kD3DSetComputeRootSignature, sizeof(ID3D12RootSignature*)));
arg = root_signature;
}
inline void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) {
void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) {
auto& arg = *reinterpret_cast<ID3D12RootSignature**>(WriteCommand(
Command::kD3DSetGraphicsRootSignature, sizeof(ID3D12RootSignature*)));
arg = root_signature;
}
inline void SetDescriptorHeaps(
ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap,
ID3D12DescriptorHeap* sampler_descriptor_heap) {
void SetDescriptorHeaps(ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap,
ID3D12DescriptorHeap* sampler_descriptor_heap) {
auto& args = *reinterpret_cast<SetDescriptorHeapsArguments*>(WriteCommand(
Command::kSetDescriptorHeaps, sizeof(SetDescriptorHeapsArguments)));
args.cbv_srv_uav_descriptor_heap = cbv_srv_uav_descriptor_heap;
args.sampler_descriptor_heap = sampler_descriptor_heap;
}
inline void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) {
void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) {
auto& arg = *reinterpret_cast<ID3D12PipelineState**>(WriteCommand(
Command::kD3DSetPipelineState, sizeof(ID3D12PipelineState*)));
arg = pipeline_state;
}
inline void SetPipelineStateHandle(void* pipeline_state_handle) {
void SetPipelineStateHandle(void* pipeline_state_handle) {
auto& arg = *reinterpret_cast<void**>(
WriteCommand(Command::kSetPipelineStateHandle, sizeof(void*)));
arg = pipeline_state_handle;
}
inline void D3DSetSamplePositions(
UINT num_samples_per_pixel, UINT num_pixels,
const D3D12_SAMPLE_POSITION* sample_positions) {
void D3DSetSamplePositions(UINT num_samples_per_pixel, UINT num_pixels,
const D3D12_SAMPLE_POSITION* sample_positions) {
auto& args = *reinterpret_cast<D3DSetSamplePositionsArguments*>(
WriteCommand(Command::kD3DSetSamplePositions,
sizeof(D3DSetSamplePositionsArguments)));

View File

@ -43,10 +43,10 @@ DEFINE_bool(
"D3D12");
DEFINE_int32(
d3d12_pipeline_creation_threads, -1,
"Number of threads used for graphics pipeline state object creation. -1 to "
"calculate automatically (75% of logical CPU cores), a positive number to "
"specify the number of threads explicitly (up to the number of logical CPU "
"cores), 0 to disable multithreaded pipeline state object creation.",
"Number of threads used for graphics pipeline creation. -1 to calculate "
"automatically (75% of logical CPU cores), a positive number to specify "
"the number of threads explicitly (up to the number of logical CPU cores), "
"0 to disable multithreaded pipeline creation.",
"D3D12");
DEFINE_bool(d3d12_tessellation_wireframe, false,
"Display tessellated surfaces as wireframe for debugging.",
@ -125,8 +125,8 @@ bool PipelineCache::Initialize() {
logical_processor_count = 6;
}
// Initialize creation thread synchronization data even if not using creation
// threads because they may be used anyway to create pipeline state objects
// from the storage.
// threads because they may be used anyway to create pipelines from the
// storage.
creation_threads_busy_ = 0;
creation_completion_event_ =
xe::threading::Event::CreateManualResetEvent(true);
@ -145,7 +145,7 @@ bool PipelineCache::Initialize() {
for (size_t i = 0; i < creation_thread_count; ++i) {
std::unique_ptr<xe::threading::Thread> creation_thread =
xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); });
creation_thread->set_name("D3D12 Pipeline States");
creation_thread->set_name("D3D12 Pipelines");
creation_threads_.push_back(std::move(creation_thread));
}
}
@ -184,13 +184,12 @@ void PipelineCache::ClearCache(bool shutting_down) {
}
ShutdownShaderStorage();
// Remove references to the current pipeline state object.
current_pipeline_state_ = nullptr;
// Remove references to the current pipeline.
current_pipeline_ = nullptr;
if (!creation_threads_.empty()) {
// Empty the pipeline state object creation queue and make sure there are no
// threads currently creating pipeline state objects because pipeline states
// are going to be deleted.
// Empty the pipeline creation queue and make sure there are no threads
// currently creating pipelines because pipelines are going to be deleted.
bool await_creation_completion_event = false;
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
@ -207,13 +206,13 @@ void PipelineCache::ClearCache(bool shutting_down) {
}
}
// Destroy all pipeline state objects.
for (auto it : pipeline_states_) {
// Destroy all pipelines.
for (auto it : pipelines_) {
it.second->state->Release();
delete it.second;
}
pipeline_states_.clear();
COUNT_profile_set("gpu/pipeline_cache/pipeline_states", 0);
pipelines_.clear();
COUNT_profile_set("gpu/pipeline_cache/pipelines", 0);
// Destroy all shaders.
command_processor_.NotifyShaderBindingsLayoutUIDsInvalidated();
@ -223,10 +222,10 @@ void PipelineCache::ClearCache(bool shutting_down) {
}
texture_binding_layout_map_.clear();
texture_binding_layouts_.clear();
for (auto it : shader_map_) {
for (auto it : shaders_) {
delete it.second;
}
shader_map_.clear();
shaders_.clear();
if (reinitialize_shader_storage) {
InitializeShaderStorage(shader_storage_root, shader_storage_title_id,
@ -374,8 +373,7 @@ void PipelineCache::InitializeShaderStorage(
}
size_t ucode_byte_count =
shader_header.ucode_dword_count * sizeof(uint32_t);
if (shader_map_.find(shader_header.ucode_data_hash) !=
shader_map_.end()) {
if (shaders_.find(shader_header.ucode_data_hash) != shaders_.end()) {
// Already added - usually shaders aren't added without the intention of
// translating them imminently, so don't do additional checks to
// actually ensure that translation happens right now (they would cause
@ -402,7 +400,7 @@ void PipelineCache::InitializeShaderStorage(
D3D12Shader* shader =
new D3D12Shader(shader_header.type, ucode_data_hash,
ucode_dwords.data(), shader_header.ucode_dword_count);
shader_map_.insert({ucode_data_hash, shader});
shaders_.emplace(ucode_data_hash, shader);
// Create new threads if the currently existing threads can't keep up with
// file reading, but not more than the number of logical processors minus
// one.
@ -439,7 +437,7 @@ void PipelineCache::InitializeShaderStorage(
}
shader_translation_threads.clear();
for (D3D12Shader* shader : shaders_failed_to_translate) {
shader_map_.erase(shader->ucode_data_hash());
shaders_.erase(shader->ucode_data_hash());
delete shader;
}
}
@ -460,72 +458,66 @@ void PipelineCache::InitializeShaderStorage(
}
// 'DXRO' or 'DXRT'.
const uint32_t pipeline_state_storage_magic_api =
const uint32_t pipeline_storage_magic_api =
edram_rov_used_ ? 0x4F525844 : 0x54525844;
// Initialize the pipeline state storage stream.
uint64_t pipeline_state_storage_initialization_start_ =
// Initialize the pipeline storage stream.
uint64_t pipeline_storage_initialization_start_ =
xe::Clock::QueryHostTickCount();
auto pipeline_state_storage_file_path =
auto pipeline_storage_file_path =
shader_storage_shareable_root /
fmt::format("{:08X}.{}.d3d12.xpso", title_id,
edram_rov_used_ ? "rov" : "rtv");
pipeline_state_storage_file_ =
xe::filesystem::OpenFile(pipeline_state_storage_file_path, "a+b");
if (!pipeline_state_storage_file_) {
pipeline_storage_file_ =
xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b");
if (!pipeline_storage_file_) {
XELOGE(
"Failed to open the Direct3D 12 pipeline state description storage "
"file for writing, persistent shader storage will be disabled: {}",
xe::path_to_utf8(pipeline_state_storage_file_path));
"Failed to open the Direct3D 12 pipeline description storage file for "
"writing, persistent shader storage will be disabled: {}",
xe::path_to_utf8(pipeline_storage_file_path));
fclose(shader_storage_file_);
shader_storage_file_ = nullptr;
return;
}
pipeline_state_storage_file_flush_needed_ = false;
pipeline_storage_file_flush_needed_ = false;
// 'XEPS'.
const uint32_t pipeline_state_storage_magic = 0x53504558;
const uint32_t pipeline_storage_magic = 0x53504558;
struct {
uint32_t magic;
uint32_t magic_api;
uint32_t version_swapped;
} pipeline_state_storage_file_header;
if (fread(&pipeline_state_storage_file_header,
sizeof(pipeline_state_storage_file_header), 1,
pipeline_state_storage_file_) &&
pipeline_state_storage_file_header.magic ==
pipeline_state_storage_magic &&
pipeline_state_storage_file_header.magic_api ==
pipeline_state_storage_magic_api &&
xe::byte_swap(pipeline_state_storage_file_header.version_swapped) ==
} pipeline_storage_file_header;
if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
1, pipeline_storage_file_) &&
pipeline_storage_file_header.magic == pipeline_storage_magic &&
pipeline_storage_file_header.magic_api == pipeline_storage_magic_api &&
xe::byte_swap(pipeline_storage_file_header.version_swapped) ==
PipelineDescription::kVersion) {
uint64_t pipeline_state_storage_valid_bytes =
sizeof(pipeline_state_storage_file_header);
// Enqueue pipeline state descriptions written by previous Xenia executions
// until the end of the file or until a corrupted one is detected.
xe::filesystem::Seek(pipeline_state_storage_file_, 0, SEEK_END);
int64_t pipeline_state_storage_told_end =
xe::filesystem::Tell(pipeline_state_storage_file_);
size_t pipeline_state_storage_told_count =
size_t(pipeline_state_storage_told_end >=
int64_t(pipeline_state_storage_valid_bytes)
? (uint64_t(pipeline_state_storage_told_end) -
pipeline_state_storage_valid_bytes) /
sizeof(PipelineStoredDescription)
: 0);
if (pipeline_state_storage_told_count &&
xe::filesystem::Seek(pipeline_state_storage_file_,
int64_t(pipeline_state_storage_valid_bytes),
SEEK_SET)) {
uint64_t pipeline_storage_valid_bytes =
sizeof(pipeline_storage_file_header);
// Enqueue pipeline descriptions written by previous Xenia executions until
// the end of the file or until a corrupted one is detected.
xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END);
int64_t pipeline_storage_told_end =
xe::filesystem::Tell(pipeline_storage_file_);
size_t pipeline_storage_told_count = size_t(
pipeline_storage_told_end >= int64_t(pipeline_storage_valid_bytes)
? (uint64_t(pipeline_storage_told_end) -
pipeline_storage_valid_bytes) /
sizeof(PipelineStoredDescription)
: 0);
if (pipeline_storage_told_count &&
xe::filesystem::Seek(pipeline_storage_file_,
int64_t(pipeline_storage_valid_bytes), SEEK_SET)) {
std::vector<PipelineStoredDescription> pipeline_stored_descriptions;
pipeline_stored_descriptions.resize(pipeline_state_storage_told_count);
pipeline_stored_descriptions.resize(fread(
pipeline_stored_descriptions.data(),
sizeof(PipelineStoredDescription), pipeline_state_storage_told_count,
pipeline_state_storage_file_));
pipeline_stored_descriptions.resize(pipeline_storage_told_count);
pipeline_stored_descriptions.resize(
fread(pipeline_stored_descriptions.data(),
sizeof(PipelineStoredDescription), pipeline_storage_told_count,
pipeline_storage_file_));
if (!pipeline_stored_descriptions.empty()) {
// Launch additional creation threads to use all cores to create
// pipeline state objects faster. Will also be using the main thread, so
// minus 1.
// pipelines faster. Will also be using the main thread, so minus 1.
size_t creation_thread_original_count = creation_threads_.size();
size_t creation_thread_needed_count =
std::max(std::min(pipeline_stored_descriptions.size(),
@ -539,10 +531,10 @@ void PipelineCache::InitializeShaderStorage(
{}, [this, creation_thread_index]() {
CreationThread(creation_thread_index);
});
creation_thread->set_name("D3D12 Pipeline States Additional");
creation_thread->set_name("D3D12 Pipelines");
creation_threads_.push_back(std::move(creation_thread));
}
size_t pipeline_states_created = 0;
size_t pipelines_created = 0;
for (const PipelineStoredDescription& pipeline_stored_description :
pipeline_stored_descriptions) {
const PipelineDescription& pipeline_description =
@ -554,30 +546,28 @@ void PipelineCache::InitializeShaderStorage(
0) != pipeline_stored_description.description_hash) {
break;
}
pipeline_state_storage_valid_bytes +=
sizeof(PipelineStoredDescription);
// Skip already known pipeline states - those have already been
// enqueued.
auto found_range = pipeline_states_.equal_range(
pipeline_storage_valid_bytes += sizeof(PipelineStoredDescription);
// Skip already known pipelines - those have already been enqueued.
auto found_range = pipelines_.equal_range(
pipeline_stored_description.description_hash);
bool pipeline_state_found = false;
bool pipeline_found = false;
for (auto it = found_range.first; it != found_range.second; ++it) {
PipelineState* found_pipeline_state = it->second;
if (!std::memcmp(&found_pipeline_state->description.description,
Pipeline* found_pipeline = it->second;
if (!std::memcmp(&found_pipeline->description.description,
&pipeline_description,
sizeof(pipeline_description))) {
pipeline_state_found = true;
pipeline_found = true;
break;
}
}
if (pipeline_state_found) {
if (pipeline_found) {
continue;
}
PipelineRuntimeDescription pipeline_runtime_description;
auto vertex_shader_it =
shader_map_.find(pipeline_description.vertex_shader_hash);
if (vertex_shader_it == shader_map_.end()) {
shaders_.find(pipeline_description.vertex_shader_hash);
if (vertex_shader_it == shaders_.end()) {
continue;
}
pipeline_runtime_description.vertex_shader = vertex_shader_it->second;
@ -586,8 +576,8 @@ void PipelineCache::InitializeShaderStorage(
}
if (pipeline_description.pixel_shader_hash) {
auto pixel_shader_it =
shader_map_.find(pipeline_description.pixel_shader_hash);
if (pixel_shader_it == shader_map_.end()) {
shaders_.find(pipeline_description.pixel_shader_hash);
if (pixel_shader_it == shaders_.end()) {
continue;
}
pipeline_runtime_description.pixel_shader = pixel_shader_it->second;
@ -607,36 +597,33 @@ void PipelineCache::InitializeShaderStorage(
std::memcpy(&pipeline_runtime_description.description,
&pipeline_description, sizeof(pipeline_description));
PipelineState* new_pipeline_state = new PipelineState;
new_pipeline_state->state = nullptr;
std::memcpy(&new_pipeline_state->description,
&pipeline_runtime_description,
Pipeline* new_pipeline = new Pipeline;
new_pipeline->state = nullptr;
std::memcpy(&new_pipeline->description, &pipeline_runtime_description,
sizeof(pipeline_runtime_description));
pipeline_states_.insert(
std::make_pair(pipeline_stored_description.description_hash,
new_pipeline_state));
COUNT_profile_set("gpu/pipeline_cache/pipeline_states",
pipeline_states_.size());
pipelines_.emplace(pipeline_stored_description.description_hash,
new_pipeline);
COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
if (!creation_threads_.empty()) {
// Submit the pipeline for creation to any available thread.
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
creation_queue_.push_back(new_pipeline_state);
creation_queue_.push_back(new_pipeline);
}
creation_request_cond_.notify_one();
} else {
new_pipeline_state->state =
CreateD3D12PipelineState(pipeline_runtime_description);
new_pipeline->state =
CreateD3D12Pipeline(pipeline_runtime_description);
}
++pipeline_states_created;
++pipelines_created;
}
CreateQueuedPipelineStatesOnProcessorThread();
CreateQueuedPipelinesOnProcessorThread();
if (creation_threads_.size() > creation_thread_original_count) {
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
creation_threads_shutdown_from_ = creation_thread_original_count;
// Assuming the queue is empty because of
// CreateQueuedPipelineStatesOnProcessorThread.
// CreateQueuedPipelinesOnProcessorThread.
}
creation_request_cond_.notify_all();
while (creation_threads_.size() > creation_thread_original_count) {
@ -664,26 +651,23 @@ void PipelineCache::InitializeShaderStorage(
}
}
XELOGGPU(
"Created {} graphics pipeline state objects from the storage in {} "
"milliseconds",
pipeline_states_created,
"Created {} graphics pipelines from the storage in {} milliseconds",
pipelines_created,
(xe::Clock::QueryHostTickCount() -
pipeline_state_storage_initialization_start_) *
pipeline_storage_initialization_start_) *
1000 / xe::Clock::QueryHostTickFrequency());
}
}
xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_,
pipeline_state_storage_valid_bytes);
xe::filesystem::TruncateStdioFile(pipeline_storage_file_,
pipeline_storage_valid_bytes);
} else {
xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, 0);
pipeline_state_storage_file_header.magic = pipeline_state_storage_magic;
pipeline_state_storage_file_header.magic_api =
pipeline_state_storage_magic_api;
pipeline_state_storage_file_header.version_swapped =
xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0);
pipeline_storage_file_header.magic = pipeline_storage_magic;
pipeline_storage_file_header.magic_api = pipeline_storage_magic_api;
pipeline_storage_file_header.version_swapped =
xe::byte_swap(PipelineDescription::kVersion);
fwrite(&pipeline_state_storage_file_header,
sizeof(pipeline_state_storage_file_header), 1,
pipeline_state_storage_file_);
fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
1, pipeline_storage_file_);
}
shader_storage_root_ = storage_root;
@ -691,7 +675,7 @@ void PipelineCache::InitializeShaderStorage(
// Start the storage writing thread.
storage_write_flush_shaders_ = false;
storage_write_flush_pipeline_states_ = false;
storage_write_flush_pipelines_ = false;
storage_write_thread_shutdown_ = false;
storage_write_thread_ =
xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); });
@ -708,12 +692,12 @@ void PipelineCache::ShutdownShaderStorage() {
storage_write_thread_.reset();
}
storage_write_shader_queue_.clear();
storage_write_pipeline_state_queue_.clear();
storage_write_pipeline_queue_.clear();
if (pipeline_state_storage_file_) {
fclose(pipeline_state_storage_file_);
pipeline_state_storage_file_ = nullptr;
pipeline_state_storage_file_flush_needed_ = false;
if (pipeline_storage_file_) {
fclose(pipeline_storage_file_);
pipeline_storage_file_ = nullptr;
pipeline_storage_file_flush_needed_ = false;
}
if (shader_storage_file_) {
@ -728,30 +712,29 @@ void PipelineCache::ShutdownShaderStorage() {
void PipelineCache::EndSubmission() {
if (shader_storage_file_flush_needed_ ||
pipeline_state_storage_file_flush_needed_) {
pipeline_storage_file_flush_needed_) {
{
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
if (shader_storage_file_flush_needed_) {
storage_write_flush_shaders_ = true;
}
if (pipeline_state_storage_file_flush_needed_) {
storage_write_flush_pipeline_states_ = true;
if (pipeline_storage_file_flush_needed_) {
storage_write_flush_pipelines_ = true;
}
}
storage_write_request_cond_.notify_one();
shader_storage_file_flush_needed_ = false;
pipeline_state_storage_file_flush_needed_ = false;
pipeline_storage_file_flush_needed_ = false;
}
if (!creation_threads_.empty()) {
CreateQueuedPipelineStatesOnProcessorThread();
// Await creation of all queued pipeline state objects.
CreateQueuedPipelinesOnProcessorThread();
// Await creation of all queued pipelines.
bool await_creation_completion_event;
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
// Assuming the creation queue is already empty (because the processor
// thread also worked on creating the leftover pipeline state objects), so
// only check if there are threads with pipeline state objects currently
// being created.
// thread also worked on creating the leftover pipelines), so only check
// if there are threads with pipelines currently being created.
await_creation_completion_event = creation_threads_busy_ != 0;
if (await_creation_completion_event) {
creation_completion_event_->Reset();
@ -765,7 +748,7 @@ void PipelineCache::EndSubmission() {
}
}
bool PipelineCache::IsCreatingPipelineStates() {
bool PipelineCache::IsCreatingPipelines() {
if (creation_threads_.empty()) {
return false;
}
@ -779,8 +762,8 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
uint32_t dword_count) {
// Hash the input memory and lookup the shader.
uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0);
auto it = shader_map_.find(data_hash);
if (it != shader_map_.end()) {
auto it = shaders_.find(data_hash);
if (it != shaders_.end()) {
// Shader has been previously loaded.
return it->second;
}
@ -790,7 +773,7 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
// again.
D3D12Shader* shader =
new D3D12Shader(shader_type, data_hash, host_address, dword_count);
shader_map_.insert({data_hash, shader});
shaders_.emplace(data_hash, shader);
return shader;
}
@ -798,11 +781,11 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid()
const {
// If the values this functions returns are changed, INVALIDATE THE SHADER
// STORAGE (increase kVersion for BOTH shaders and pipeline states)! The
// exception is when the function originally returned "unsupported", but
// started to return a valid value (in this case the shader wouldn't be cached
// in the first place). Otherwise games will not be able to locate shaders for
// draws for which the host vertex shader type has changed!
// STORAGE (increase kVersion for BOTH shaders and pipelines)! The exception
// is when the function originally returned "unsupported", but started to
// return a valid value (in this case the shader wouldn't be cached in the
// first place). Otherwise games will not be able to locate shaders for draws
// for which the host vertex shader type has changed!
const auto& regs = register_file_;
auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
if (!xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
@ -929,13 +912,12 @@ bool PipelineCache::ConfigurePipeline(
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
bool early_z,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
void** pipeline_state_handle_out,
ID3D12RootSignature** root_signature_out) {
void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
assert_not_null(pipeline_state_handle_out);
assert_not_null(pipeline_handle_out);
assert_not_null(root_signature_out);
PipelineRuntimeDescription runtime_description;
@ -946,24 +928,24 @@ bool PipelineCache::ConfigurePipeline(
}
PipelineDescription& description = runtime_description.description;
if (current_pipeline_state_ != nullptr &&
!std::memcmp(&current_pipeline_state_->description.description,
&description, sizeof(description))) {
*pipeline_state_handle_out = current_pipeline_state_;
if (current_pipeline_ != nullptr &&
!std::memcmp(&current_pipeline_->description.description, &description,
sizeof(description))) {
*pipeline_handle_out = current_pipeline_;
*root_signature_out = runtime_description.root_signature;
return true;
}
// Find an existing pipeline state object in the cache.
// Find an existing pipeline in the cache.
uint64_t hash = XXH64(&description, sizeof(description), 0);
auto found_range = pipeline_states_.equal_range(hash);
auto found_range = pipelines_.equal_range(hash);
for (auto it = found_range.first; it != found_range.second; ++it) {
PipelineState* found_pipeline_state = it->second;
if (!std::memcmp(&found_pipeline_state->description.description,
&description, sizeof(description))) {
current_pipeline_state_ = found_pipeline_state;
*pipeline_state_handle_out = found_pipeline_state;
*root_signature_out = found_pipeline_state->description.root_signature;
Pipeline* found_pipeline = it->second;
if (!std::memcmp(&found_pipeline->description.description, &description,
sizeof(description))) {
current_pipeline_ = found_pipeline;
*pipeline_handle_out = found_pipeline;
*root_signature_out = found_pipeline->description.root_signature;
return true;
}
}
@ -974,33 +956,32 @@ bool PipelineCache::ConfigurePipeline(
return false;
}
PipelineState* new_pipeline_state = new PipelineState;
new_pipeline_state->state = nullptr;
std::memcpy(&new_pipeline_state->description, &runtime_description,
Pipeline* new_pipeline = new Pipeline;
new_pipeline->state = nullptr;
std::memcpy(&new_pipeline->description, &runtime_description,
sizeof(runtime_description));
pipeline_states_.insert(std::make_pair(hash, new_pipeline_state));
COUNT_profile_set("gpu/pipeline_cache/pipeline_states",
pipeline_states_.size());
pipelines_.emplace(hash, new_pipeline);
COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
if (!creation_threads_.empty()) {
// Submit the pipeline state object for creation to any available thread.
// Submit the pipeline for creation to any available thread.
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
creation_queue_.push_back(new_pipeline_state);
creation_queue_.push_back(new_pipeline);
}
creation_request_cond_.notify_one();
} else {
new_pipeline_state->state = CreateD3D12PipelineState(runtime_description);
new_pipeline->state = CreateD3D12Pipeline(runtime_description);
}
if (pipeline_state_storage_file_) {
if (pipeline_storage_file_) {
assert_not_null(storage_write_thread_);
pipeline_state_storage_file_flush_needed_ = true;
pipeline_storage_file_flush_needed_ = true;
{
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
storage_write_pipeline_state_queue_.emplace_back();
storage_write_pipeline_queue_.emplace_back();
PipelineStoredDescription& stored_description =
storage_write_pipeline_state_queue_.back();
storage_write_pipeline_queue_.back();
stored_description.description_hash = hash;
std::memcpy(&stored_description.description, &description,
sizeof(description));
@ -1008,8 +989,8 @@ bool PipelineCache::ConfigurePipeline(
storage_write_request_cond_.notify_all();
}
current_pipeline_state_ = new_pipeline_state;
*pipeline_state_handle_out = new_pipeline_state;
current_pipeline_ = new_pipeline;
*pipeline_handle_out = new_pipeline;
*root_signature_out = runtime_description.root_signature;
return true;
}
@ -1136,8 +1117,8 @@ bool PipelineCache::TranslateShader(
std::memcpy(
texture_binding_layouts_.data() + new_uid.vector_span_offset,
texture_bindings, texture_binding_layout_bytes);
texture_binding_layout_map_.insert(
{texture_binding_layout_hash, new_uid});
texture_binding_layout_map_.emplace(texture_binding_layout_hash,
new_uid);
}
}
if (bindless_sampler_count) {
@ -1179,8 +1160,8 @@ bool PipelineCache::TranslateShader(
vector_bindless_sampler_layout[i] =
sampler_bindings[i].bindless_descriptor_index;
}
bindless_sampler_layout_map_.insert(
{bindless_sampler_layout_hash, new_uid});
bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash,
new_uid);
}
}
}
@ -1508,8 +1489,7 @@ bool PipelineCache::GetCurrentStateDescription(
/* 16 */ PipelineBlendFactor::kSrcAlphaSat,
};
// Like kBlendFactorMap, but with color modes changed to alpha. Some
// pipeline state objects aren't created in Prey because a color mode is
// used for alpha.
// pipelines aren't created in Prey because a color mode is used for alpha.
static const PipelineBlendFactor kBlendFactorAlphaMap[32] = {
/* 0 */ PipelineBlendFactor::kZero,
/* 1 */ PipelineBlendFactor::kOne,
@ -1569,18 +1549,16 @@ bool PipelineCache::GetCurrentStateDescription(
return true;
}
ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState(
ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
const PipelineRuntimeDescription& runtime_description) {
const PipelineDescription& description = runtime_description.description;
if (runtime_description.pixel_shader != nullptr) {
XELOGGPU(
"Creating graphics pipeline state with VS {:016X}"
", PS {:016X}",
runtime_description.vertex_shader->ucode_data_hash(),
runtime_description.pixel_shader->ucode_data_hash());
XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}",
runtime_description.vertex_shader->ucode_data_hash(),
runtime_description.pixel_shader->ucode_data_hash());
} else {
XELOGGPU("Creating graphics pipeline state with VS {:016X}",
XELOGGPU("Creating graphics pipeline with VS {:016X}",
runtime_description.vertex_shader->ucode_data_hash());
}
@ -1893,20 +1871,18 @@ ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState(
}
}
// Create the pipeline state object.
// Create the D3D12 pipeline state object.
auto device =
command_processor_.GetD3D12Context().GetD3D12Provider().GetDevice();
ID3D12PipelineState* state;
if (FAILED(device->CreateGraphicsPipelineState(&state_desc,
IID_PPV_ARGS(&state)))) {
if (runtime_description.pixel_shader != nullptr) {
XELOGE(
"Failed to create graphics pipeline state with VS {:016X}"
", PS {:016X}",
runtime_description.vertex_shader->ucode_data_hash(),
runtime_description.pixel_shader->ucode_data_hash());
XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}",
runtime_description.vertex_shader->ucode_data_hash(),
runtime_description.pixel_shader->ucode_data_hash());
} else {
XELOGE("Failed to create graphics pipeline state with VS {:016X}",
XELOGE("Failed to create graphics pipeline with VS {:016X}",
runtime_description.vertex_shader->ucode_data_hash());
}
return nullptr;
@ -1933,7 +1909,7 @@ void PipelineCache::StorageWriteThread() {
ucode_guest_endian.reserve(0xFFFF);
bool flush_shaders = false;
bool flush_pipeline_states = false;
bool flush_pipelines = false;
while (true) {
if (flush_shaders) {
@ -1941,15 +1917,15 @@ void PipelineCache::StorageWriteThread() {
assert_not_null(shader_storage_file_);
fflush(shader_storage_file_);
}
if (flush_pipeline_states) {
flush_pipeline_states = false;
assert_not_null(pipeline_state_storage_file_);
fflush(pipeline_state_storage_file_);
if (flush_pipelines) {
flush_pipelines = false;
assert_not_null(pipeline_storage_file_);
fflush(pipeline_storage_file_);
}
std::pair<const Shader*, reg::SQ_PROGRAM_CNTL> shader_pair = {};
PipelineStoredDescription pipeline_description;
bool write_pipeline_state = false;
bool write_pipeline = false;
{
std::unique_lock<std::mutex> lock(storage_write_request_lock_);
if (storage_write_thread_shutdown_) {
@ -1962,17 +1938,17 @@ void PipelineCache::StorageWriteThread() {
storage_write_flush_shaders_ = false;
flush_shaders = true;
}
if (!storage_write_pipeline_state_queue_.empty()) {
if (!storage_write_pipeline_queue_.empty()) {
std::memcpy(&pipeline_description,
&storage_write_pipeline_state_queue_.front(),
&storage_write_pipeline_queue_.front(),
sizeof(pipeline_description));
storage_write_pipeline_state_queue_.pop_front();
write_pipeline_state = true;
} else if (storage_write_flush_pipeline_states_) {
storage_write_flush_pipeline_states_ = false;
flush_pipeline_states = true;
storage_write_pipeline_queue_.pop_front();
write_pipeline = true;
} else if (storage_write_flush_pipelines_) {
storage_write_flush_pipelines_ = false;
flush_pipelines = true;
}
if (!shader_pair.first && !write_pipeline_state) {
if (!shader_pair.first && !write_pipeline) {
storage_write_request_cond_.wait(lock);
continue;
}
@ -1999,27 +1975,26 @@ void PipelineCache::StorageWriteThread() {
}
}
if (write_pipeline_state) {
assert_not_null(pipeline_state_storage_file_);
if (write_pipeline) {
assert_not_null(pipeline_storage_file_);
fwrite(&pipeline_description, sizeof(pipeline_description), 1,
pipeline_state_storage_file_);
pipeline_storage_file_);
}
}
}
void PipelineCache::CreationThread(size_t thread_index) {
while (true) {
PipelineState* pipeline_state_to_create = nullptr;
Pipeline* pipeline_to_create = nullptr;
// Check if need to shut down or set the completion event and dequeue the
// pipeline state if there is any.
// pipeline if there is any.
{
std::unique_lock<std::mutex> lock(creation_request_lock_);
if (thread_index >= creation_threads_shutdown_from_ ||
creation_queue_.empty()) {
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
// Last pipeline state object in the queue created - signal the event
// if requested.
// Last pipeline in the queue created - signal the event if requested.
creation_completion_set_event_ = false;
creation_completion_event_->Set();
}
@ -2029,23 +2004,22 @@ void PipelineCache::CreationThread(size_t thread_index) {
creation_request_cond_.wait(lock);
continue;
}
// Take the pipeline state from the queue and increment the busy thread
// count until the pipeline state object is created - other threads must
// be able to dequeue requests, but can't set the completion event until
// the pipeline state objects are fully created (rather than just started
// creating).
pipeline_state_to_create = creation_queue_.front();
// Take the pipeline from the queue and increment the busy thread count
// until the pipeline is created - other threads must be able to dequeue
// requests, but can't set the completion event until the pipelines are
// fully created (rather than just started creating).
pipeline_to_create = creation_queue_.front();
creation_queue_.pop_front();
++creation_threads_busy_;
}
// Create the D3D12 pipeline state object.
pipeline_state_to_create->state =
CreateD3D12PipelineState(pipeline_state_to_create->description);
pipeline_to_create->state =
CreateD3D12Pipeline(pipeline_to_create->description);
// Pipeline state object created - the thread is not busy anymore, safe to
// set the completion event if needed (at the next iteration, or in some
// other thread).
// Pipeline created - the thread is not busy anymore, safe to set the
// completion event if needed (at the next iteration, or in some other
// thread).
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
--creation_threads_busy_;
@ -2053,20 +2027,20 @@ void PipelineCache::CreationThread(size_t thread_index) {
}
}
void PipelineCache::CreateQueuedPipelineStatesOnProcessorThread() {
void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
assert_false(creation_threads_.empty());
while (true) {
PipelineState* pipeline_state_to_create;
Pipeline* pipeline_to_create;
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
if (creation_queue_.empty()) {
break;
}
pipeline_state_to_create = creation_queue_.front();
pipeline_to_create = creation_queue_.front();
creation_queue_.pop_front();
}
pipeline_state_to_create->state =
CreateD3D12PipelineState(pipeline_state_to_create->description);
pipeline_to_create->state =
CreateD3D12Pipeline(pipeline_to_create->description);
}
}

View File

@ -29,6 +29,7 @@
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_api.h"
namespace xe {
namespace gpu {
@ -54,7 +55,7 @@ class PipelineCache {
void ShutdownShaderStorage();
void EndSubmission();
bool IsCreatingPipelineStates();
bool IsCreatingPipelines();
D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address,
const uint32_t* host_address, uint32_t dword_count);
@ -73,14 +74,12 @@ class PipelineCache {
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
bool early_z,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
void** pipeline_state_handle_out,
ID3D12RootSignature** root_signature_out);
void** pipeline_handle_out, ID3D12RootSignature** root_signature_out);
// Returns a pipeline state object with deferred creation by its handle. May
// return nullptr if failed to create the pipeline state object.
inline ID3D12PipelineState* GetD3D12PipelineStateByHandle(
void* handle) const {
return reinterpret_cast<const PipelineState*>(handle)->state;
// Returns a pipeline with deferred creation by its handle. May return nullptr
// if failed to create the pipeline.
ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const {
return reinterpret_cast<const Pipeline*>(handle)->state;
}
private:
@ -237,7 +236,7 @@ class PipelineCache {
const RenderTargetCache::PipelineRenderTarget render_targets[5],
PipelineRuntimeDescription& runtime_description_out);
ID3D12PipelineState* CreateD3D12PipelineState(
ID3D12PipelineState* CreateD3D12Pipeline(
const PipelineRuntimeDescription& runtime_description);
D3D12CommandProcessor& command_processor_;
@ -255,9 +254,9 @@ class PipelineCache {
IDxcUtils* dxc_utils_ = nullptr;
IDxcCompiler* dxc_compiler_ = nullptr;
// All loaded shaders mapped by their guest hash key.
// Ucode hash -> shader.
std::unordered_map<uint64_t, D3D12Shader*, xe::hash::IdentityHasher<uint64_t>>
shader_map_;
shaders_;
struct LayoutUID {
size_t uid;
@ -285,21 +284,20 @@ class PipelineCache {
// Xenos pixel shader provided.
std::vector<uint8_t> depth_only_pixel_shader_;
struct PipelineState {
struct Pipeline {
// nullptr if creation has failed.
ID3D12PipelineState* state;
PipelineRuntimeDescription description;
};
// All previously generated pipeline state objects identified by hash and the
// description.
std::unordered_multimap<uint64_t, PipelineState*,
// All previously generated pipelines identified by hash and the description.
std::unordered_multimap<uint64_t, Pipeline*,
xe::hash::IdentityHasher<uint64_t>>
pipeline_states_;
pipelines_;
// Previously used pipeline state object. This matches our current state
// settings and allows us to quickly(ish) reuse the pipeline state if no
// registers have changed.
PipelineState* current_pipeline_state_ = nullptr;
// Previously used pipeline. This matches our current state settings and
// allows us to quickly(ish) reuse the pipeline if no registers have been
// changed.
Pipeline* current_pipeline_ = nullptr;
// Currently open shader storage path.
std::filesystem::path shader_storage_root_;
@ -309,10 +307,9 @@ class PipelineCache {
FILE* shader_storage_file_ = nullptr;
bool shader_storage_file_flush_needed_ = false;
// Pipeline state storage output stream, for preload in the next emulator
// runs.
FILE* pipeline_state_storage_file_ = nullptr;
bool pipeline_state_storage_file_flush_needed_ = false;
// Pipeline storage output stream, for preload in the next emulator runs.
FILE* pipeline_storage_file_ = nullptr;
bool pipeline_storage_file_flush_needed_ = false;
// Thread for asynchronous writing to the storage streams.
void StorageWriteThread();
@ -322,28 +319,27 @@ class PipelineCache {
// thread is notified about its change via storage_write_request_cond_.
std::deque<std::pair<const Shader*, reg::SQ_PROGRAM_CNTL>>
storage_write_shader_queue_;
std::deque<PipelineStoredDescription> storage_write_pipeline_state_queue_;
std::deque<PipelineStoredDescription> storage_write_pipeline_queue_;
bool storage_write_flush_shaders_ = false;
bool storage_write_flush_pipeline_states_ = false;
bool storage_write_flush_pipelines_ = false;
bool storage_write_thread_shutdown_ = false;
std::unique_ptr<xe::threading::Thread> storage_write_thread_;
// Pipeline state object creation threads.
// Pipeline creation threads.
void CreationThread(size_t thread_index);
void CreateQueuedPipelineStatesOnProcessorThread();
void CreateQueuedPipelinesOnProcessorThread();
std::mutex creation_request_lock_;
std::condition_variable creation_request_cond_;
// Protected with creation_request_lock_, notify_one creation_request_cond_
// when set.
std::deque<PipelineState*> creation_queue_;
// Number of threads that are currently creating a pipeline state object -
// incremented when a pipeline state object is dequeued (the completion event
// can't be triggered before this is zero). Protected with
// creation_request_lock_.
std::deque<Pipeline*> creation_queue_;
// Number of threads that are currently creating a pipeline - incremented when
// a pipeline is dequeued (the completion event can't be triggered before this
// is zero). Protected with creation_request_lock_.
size_t creation_threads_busy_ = 0;
// Manual-reset event set when the last queued pipeline state object is
// created and there are no more pipeline state objects to create. This is
// triggered by the thread creating the last pipeline state object.
// Manual-reset event set when the last queued pipeline is created and there
// are no more pipelines to create. This is triggered by the thread creating
// the last pipeline.
std::unique_ptr<xe::threading::Event> creation_completion_event_;
// Whether setting the event on completion is queued. Protected with
// creation_request_lock_, notify_one creation_request_cond_ when set.

View File

@ -25,15 +25,6 @@ project("xenia-gpu-d3d12-trace-viewer")
kind("WindowedApp")
language("C++")
links({
"aes_128",
"capstone",
"dxbc",
"fmt",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"xenia-apu",
"xenia-apu-nop",
"xenia-base",
@ -49,6 +40,17 @@ project("xenia-gpu-d3d12-trace-viewer")
"xenia-ui-d3d12",
"xenia-vfs",
"xenia-patcher",
})
links({
"aes_128",
"capstone",
"dxbc",
"fmt",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"xxhash",
})
files({
@ -71,15 +73,6 @@ project("xenia-gpu-d3d12-trace-dump")
kind("ConsoleApp")
language("C++")
links({
"aes_128",
"capstone",
"dxbc",
"fmt",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"xenia-apu",
"xenia-apu-nop",
"xenia-base",
@ -95,6 +88,17 @@ project("xenia-gpu-d3d12-trace-dump")
"xenia-ui-d3d12",
"xenia-vfs",
"xenia-patcher",
})
links({
"aes_128",
"capstone",
"dxbc",
"fmt",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"xxhash",
})
files({
@ -109,4 +113,4 @@ project("xenia-gpu-d3d12-trace-dump")
"2>&1",
"1>scratch/stdout-trace-dump.txt",
})
end
end

View File

@ -454,8 +454,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
// again and again and exit.
if (!conversion_needed || converted_index_count == 0) {
converted_indices.gpu_address = 0;
converted_indices_cache_.insert(
std::make_pair(converted_indices.key.value, converted_indices));
converted_indices_cache_.emplace(converted_indices.key.value,
converted_indices);
memory_regions_used_ |= memory_regions_used_bits;
return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty
: ConversionResult::kConversionNotNeeded;
@ -670,8 +670,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
// Cache and return the indices.
converted_indices.gpu_address = gpu_address;
converted_indices_cache_.insert(
std::make_pair(converted_indices.key.value, converted_indices));
converted_indices_cache_.emplace(converted_indices.key.value,
converted_indices);
memory_regions_used_ |= memory_regions_used_bits;
gpu_address_out = gpu_address;
index_count_out = converted_index_count;

View File

@ -277,20 +277,19 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
return false;
}
// Create the EDRAM load/store pipeline state objects.
// Create the EDRAM load/store pipelines.
for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) {
const EdramLoadStoreModeInfo& mode_info = edram_load_store_mode_info_[i];
edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState(
edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
device, mode_info.load_shader, mode_info.load_shader_size,
edram_load_store_root_signature_);
edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState(
edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
device, mode_info.store_shader, mode_info.store_shader_size,
edram_load_store_root_signature_);
if (edram_load_pipelines_[i] == nullptr ||
edram_store_pipelines_[i] == nullptr) {
XELOGE(
"Failed to create the EDRAM load/store pipeline states for mode {}",
i);
XELOGE("Failed to create the EDRAM load/store pipelines for mode {}",
i);
Shutdown();
return false;
}
@ -299,7 +298,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
}
}
// Create the resolve root signatures and pipeline state objects.
// Create the resolve root signatures and pipelines.
D3D12_ROOT_PARAMETER resolve_root_parameters[3];
// Copying root signature.
@ -369,7 +368,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
return false;
}
// Copying pipeline state objects.
// Copying pipelines.
uint32_t resolution_scale = resolution_scale_2x_ ? 2 : 1;
for (size_t i = 0; i < size_t(draw_util::ResolveCopyShaderIndex::kCount);
++i) {
@ -381,63 +380,61 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
continue;
}
const auto& resolve_copy_shader = resolve_copy_shaders_[i];
ID3D12PipelineState* resolve_copy_pipeline_state =
ui::d3d12::util::CreateComputePipelineState(
ID3D12PipelineState* resolve_copy_pipeline =
ui::d3d12::util::CreateComputePipeline(
device, resolve_copy_shader.first, resolve_copy_shader.second,
resolve_copy_root_signature_);
if (resolve_copy_pipeline_state == nullptr) {
XELOGE("Failed to create {} resolve copy pipeline state",
if (resolve_copy_pipeline == nullptr) {
XELOGE("Failed to create {} resolve copy pipeline",
resolve_copy_shader_info.debug_name);
}
resolve_copy_pipeline_state->SetName(reinterpret_cast<LPCWSTR>(
resolve_copy_pipeline->SetName(reinterpret_cast<LPCWSTR>(
xe::to_utf16(resolve_copy_shader_info.debug_name).c_str()));
resolve_copy_pipeline_states_[i] = resolve_copy_pipeline_state;
resolve_copy_pipelines_[i] = resolve_copy_pipeline;
}
// Clearing pipeline state objects.
resolve_clear_32bpp_pipeline_state_ =
ui::d3d12::util::CreateComputePipelineState(
device,
resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs
: resolve_clear_32bpp_cs,
resolution_scale_2x_ ? sizeof(resolve_clear_32bpp_2xres_cs)
: sizeof(resolve_clear_32bpp_cs),
resolve_clear_root_signature_);
if (resolve_clear_32bpp_pipeline_state_ == nullptr) {
XELOGE("Failed to create the 32bpp resolve clear pipeline state");
// Clearing pipelines.
resolve_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
device,
resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs
: resolve_clear_32bpp_cs,
resolution_scale_2x_ ? sizeof(resolve_clear_32bpp_2xres_cs)
: sizeof(resolve_clear_32bpp_cs),
resolve_clear_root_signature_);
if (resolve_clear_32bpp_pipeline_ == nullptr) {
XELOGE("Failed to create the 32bpp resolve clear pipeline");
Shutdown();
return false;
}
resolve_clear_32bpp_pipeline_state_->SetName(L"Resolve Clear 32bpp");
resolve_clear_64bpp_pipeline_state_ =
ui::d3d12::util::CreateComputePipelineState(
device,
resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs
: resolve_clear_64bpp_cs,
resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs)
: sizeof(resolve_clear_64bpp_cs),
resolve_clear_root_signature_);
if (resolve_clear_64bpp_pipeline_state_ == nullptr) {
XELOGE("Failed to create the 64bpp resolve clear pipeline state");
resolve_clear_32bpp_pipeline_->SetName(L"Resolve Clear 32bpp");
resolve_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
device,
resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs
: resolve_clear_64bpp_cs,
resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs)
: sizeof(resolve_clear_64bpp_cs),
resolve_clear_root_signature_);
if (resolve_clear_64bpp_pipeline_ == nullptr) {
XELOGE("Failed to create the 64bpp resolve clear pipeline");
Shutdown();
return false;
}
resolve_clear_64bpp_pipeline_state_->SetName(L"Resolve Clear 64bpp");
resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp");
if (!edram_rov_used_) {
assert_false(resolution_scale_2x_);
resolve_clear_depth_24_32_pipeline_state_ =
ui::d3d12::util::CreateComputePipelineState(
resolve_clear_depth_24_32_pipeline_ =
ui::d3d12::util::CreateComputePipeline(
device, resolve_clear_depth_24_32_cs,
sizeof(resolve_clear_depth_24_32_cs),
resolve_clear_root_signature_);
if (resolve_clear_depth_24_32_pipeline_state_ == nullptr) {
if (resolve_clear_depth_24_32_pipeline_ == nullptr) {
XELOGE(
"Failed to create the 24-bit and 32-bit depth resolve clear pipeline "
"state");
Shutdown();
return false;
}
resolve_clear_64bpp_pipeline_state_->SetName(
resolve_clear_64bpp_pipeline_->SetName(
L"Resolve Clear 24-bit & 32-bit Depth");
}
@ -451,12 +448,12 @@ void RenderTargetCache::Shutdown() {
edram_snapshot_restore_pool_.reset();
ui::d3d12::util::ReleaseAndNull(edram_snapshot_download_buffer_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_state_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_state_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_state_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(resolve_clear_root_signature_);
for (size_t i = 0; i < xe::countof(resolve_copy_pipeline_states_); ++i) {
ui::d3d12::util::ReleaseAndNull(resolve_copy_pipeline_states_[i]);
for (size_t i = 0; i < xe::countof(resolve_copy_pipelines_); ++i) {
ui::d3d12::util::ReleaseAndNull(resolve_copy_pipelines_[i]);
}
ui::d3d12::util::ReleaseAndNull(resolve_copy_root_signature_);
for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) {
@ -1209,8 +1206,8 @@ bool RenderTargetCache::Resolve(const Memory& memory,
0, sizeof(copy_shader_constants) / sizeof(uint32_t),
&copy_shader_constants, 0);
}
command_processor_.SetComputePipelineState(
resolve_copy_pipeline_states_[size_t(copy_shader)]);
command_processor_.SetComputePipeline(
resolve_copy_pipelines_[size_t(copy_shader)]);
command_processor_.SubmitBarriers();
command_list.D3DDispatch(copy_group_count_x, copy_group_count_y, 1);
@ -1279,9 +1276,9 @@ bool RenderTargetCache::Resolve(const Memory& memory,
command_list.D3DSetComputeRoot32BitConstants(
0, sizeof(depth_clear_constants) / sizeof(uint32_t),
&depth_clear_constants, 0);
command_processor_.SetComputePipelineState(
clear_float32_depth ? resolve_clear_depth_24_32_pipeline_state_
: resolve_clear_32bpp_pipeline_state_);
command_processor_.SetComputePipeline(
clear_float32_depth ? resolve_clear_depth_24_32_pipeline_
: resolve_clear_32bpp_pipeline_);
command_processor_.SubmitBarriers();
command_list.D3DDispatch(clear_group_count.first,
clear_group_count.second, 1);
@ -1301,10 +1298,10 @@ bool RenderTargetCache::Resolve(const Memory& memory,
0, sizeof(color_clear_constants) / sizeof(uint32_t),
&color_clear_constants, 0);
}
command_processor_.SetComputePipelineState(
command_processor_.SetComputePipeline(
resolve_info.color_edram_info.format_is_64bpp
? resolve_clear_64bpp_pipeline_state_
: resolve_clear_32bpp_pipeline_state_);
? resolve_clear_64bpp_pipeline_
: resolve_clear_32bpp_pipeline_);
command_processor_.SubmitBarriers();
command_list.D3DDispatch(clear_group_count.first,
clear_group_count.second, 1);
@ -1816,7 +1813,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
render_target->footprints, nullptr, nullptr,
&copy_buffer_size);
render_target->copy_buffer_size = uint32_t(copy_buffer_size);
render_targets_.insert(std::make_pair(key.value, render_target));
render_targets_.emplace(key.value, render_target);
COUNT_profile_set("gpu/render_target_cache/render_targets",
render_targets_.size());
#if 0
@ -2015,8 +2012,7 @@ void RenderTargetCache::StoreRenderTargetsToEdram() {
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_.SetComputePipelineState(
edram_store_pipelines_[size_t(mode)]);
command_processor_.SetComputePipeline(edram_store_pipelines_[size_t(mode)]);
// 1 group per 80x16 samples.
command_list.D3DDispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);
@ -2140,8 +2136,7 @@ void RenderTargetCache::LoadRenderTargetsFromEdram(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_.SetComputePipelineState(
edram_load_pipelines_[size_t(mode)]);
command_processor_.SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
// 1 group per 80x16 samples.
command_list.D3DDispatch(render_target->key.width_ss_div_80, edram_rows, 1);

View File

@ -237,14 +237,13 @@ class D3D12CommandProcessor;
// get each of the 4 host pixels for each sample.
class RenderTargetCache {
public:
// Direct3D 12 debug layer does some kaschenit-style trolling by giving errors
// that contradict each other when you use null RTV descriptors - if you set
// a valid format in RTVFormats in the pipeline state, it says that null
// descriptors can only be used if the format in the pipeline state is
// DXGI_FORMAT_UNKNOWN, however, if DXGI_FORMAT_UNKNOWN is set, it complains
// that the format in the pipeline doesn't match the RTV format. So we have to
// make render target bindings consecutive and remap the output indices in
// pixel shaders.
// Direct3D 12 debug layer is giving errors that contradict each other when
// you use null RTV descriptors - if you set a valid format in RTVFormats in
// the pipeline state, it says that null descriptors can only be used if the
// format in the pipeline state is DXGI_FORMAT_UNKNOWN, however, if
// DXGI_FORMAT_UNKNOWN is set, it complains that the format in the pipeline
// state doesn't match the RTV format. So we have to make render target
// bindings consecutive and remap the output indices in pixel shaders.
struct PipelineRenderTarget {
uint32_t guest_render_target;
DXGI_FORMAT format;
@ -304,8 +303,7 @@ class RenderTargetCache {
// performance difference, but with EDRAM loads/stores less conversion should
// be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and
// it's probably more accurate.
static inline DXGI_FORMAT GetDepthDXGIFormat(
xenos::DepthRenderTargetFormat format) {
static DXGI_FORMAT GetDepthDXGIFormat(xenos::DepthRenderTargetFormat format) {
return format == xenos::DepthRenderTargetFormat::kD24FS8
? DXGI_FORMAT_D32_FLOAT_S8X24_UINT
: DXGI_FORMAT_D24_UNORM_S8_UINT;
@ -537,7 +535,7 @@ class RenderTargetCache {
// 16: - EDRAM pitch in tiles.
uint32_t base_samples_2x_depth_pitch;
};
// EDRAM pipeline states for the RTV/DSV path.
// EDRAM pipelines for the RTV/DSV path.
static const EdramLoadStoreModeInfo
edram_load_store_mode_info_[size_t(EdramLoadStoreMode::kCount)];
ID3D12PipelineState*
@ -546,20 +544,20 @@ class RenderTargetCache {
ID3D12PipelineState*
edram_store_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {};
// Resolve root signatures and pipeline state objects.
// Resolve root signatures and pipelines.
ID3D12RootSignature* resolve_copy_root_signature_ = nullptr;
static const std::pair<const uint8_t*, size_t>
resolve_copy_shaders_[size_t(draw_util::ResolveCopyShaderIndex::kCount)];
ID3D12PipelineState* resolve_copy_pipeline_states_[size_t(
ID3D12PipelineState* resolve_copy_pipelines_[size_t(
draw_util::ResolveCopyShaderIndex::kCount)] = {};
ID3D12RootSignature* resolve_clear_root_signature_ = nullptr;
// Clearing 32bpp color, depth with ROV, or unorm depth without ROV.
ID3D12PipelineState* resolve_clear_32bpp_pipeline_state_ = nullptr;
ID3D12PipelineState* resolve_clear_32bpp_pipeline_ = nullptr;
// Clearing 64bpp color.
ID3D12PipelineState* resolve_clear_64bpp_pipeline_state_ = nullptr;
ID3D12PipelineState* resolve_clear_64bpp_pipeline_ = nullptr;
// Clearing float depth without ROV, both the float24 and the host float32
// versions.
ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_state_ = nullptr;
ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_ = nullptr;
// FIXME(Triang3l): Investigate what's wrong with placed RTV/DSV aliasing on
// Nvidia Maxwell 1st generation and older.

View File

@ -918,27 +918,24 @@ bool TextureCache::Initialize(bool edram_rov_used) {
return false;
}
// Create the loading pipeline state objects.
// Create the loading pipelines.
for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) {
const LoadModeInfo& mode_info = load_mode_info_[i];
load_pipeline_states_[i] = ui::d3d12::util::CreateComputePipelineState(
load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
device, mode_info.shader, mode_info.shader_size, load_root_signature_);
if (load_pipeline_states_[i] == nullptr) {
XELOGE(
"Failed to create the texture loading pipeline state object for mode "
"{}",
i);
if (load_pipelines_[i] == nullptr) {
XELOGE("Failed to create the texture loading pipeline for mode {}", i);
Shutdown();
return false;
}
if (IsResolutionScale2X() && mode_info.shader_2x != nullptr) {
load_pipeline_states_2x_[i] = ui::d3d12::util::CreateComputePipelineState(
load_pipelines_2x_[i] = ui::d3d12::util::CreateComputePipeline(
device, mode_info.shader_2x, mode_info.shader_2x_size,
load_root_signature_);
if (load_pipeline_states_2x_[i] == nullptr) {
if (load_pipelines_2x_[i] == nullptr) {
XELOGE(
"Failed to create the 2x-scaled texture loading pipeline state "
"for mode {}",
"Failed to create the 2x-scaled texture loading pipeline for mode "
"{}",
i);
Shutdown();
return false;
@ -1024,8 +1021,8 @@ void TextureCache::Shutdown() {
ui::d3d12::util::ReleaseAndNull(null_srv_descriptor_heap_);
for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) {
ui::d3d12::util::ReleaseAndNull(load_pipeline_states_2x_[i]);
ui::d3d12::util::ReleaseAndNull(load_pipeline_states_[i]);
ui::d3d12::util::ReleaseAndNull(load_pipelines_2x_[i]);
ui::d3d12::util::ReleaseAndNull(load_pipelines_[i]);
}
ui::d3d12::util::ReleaseAndNull(load_root_signature_);
@ -1892,7 +1889,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
if (IsResolutionScale2X() && key.tiled) {
LoadMode load_mode = GetLoadMode(key);
if (load_mode != LoadMode::kUnknown &&
load_pipeline_states_2x_[uint32_t(load_mode)] != nullptr) {
load_pipelines_2x_[uint32_t(load_mode)] != nullptr) {
uint32_t base_size = 0, mip_size = 0;
texture_util::GetTextureTotalSize(
key.dimension, key.width, key.height, key.depth, key.format,
@ -2047,7 +2044,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
}
texture->base_watch_handle = nullptr;
texture->mip_watch_handle = nullptr;
textures_.insert(std::make_pair(map_key, texture));
textures_.emplace(map_key, texture);
COUNT_profile_set("gpu/texture_cache/textures", textures_.size());
textures_total_size_ += texture->resource_size;
COUNT_profile_set("gpu/texture_cache/total_size_mb",
@ -2079,10 +2076,10 @@ bool TextureCache::LoadTextureData(Texture* texture) {
return false;
}
bool scaled_resolve = texture->key.scaled_resolve ? true : false;
ID3D12PipelineState* pipeline_state =
scaled_resolve ? load_pipeline_states_2x_[uint32_t(load_mode)]
: load_pipeline_states_[uint32_t(load_mode)];
if (pipeline_state == nullptr) {
ID3D12PipelineState* pipeline = scaled_resolve
? load_pipelines_2x_[uint32_t(load_mode)]
: load_pipelines_[uint32_t(load_mode)];
if (pipeline == nullptr) {
return false;
}
const LoadModeInfo& load_mode_info = load_mode_info_[uint32_t(load_mode)];
@ -2296,7 +2293,7 @@ bool TextureCache::LoadTextureData(Texture* texture) {
load_mode_info.srv_bpe_log2);
}
}
command_processor_.SetComputePipelineState(pipeline_state);
command_processor_.SetComputePipeline(pipeline);
command_list.D3DSetComputeRootSignature(load_root_signature_);
command_list.D3DSetComputeRootDescriptorTable(2, descriptor_dest.second);
@ -2597,7 +2594,7 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture,
}
device->CreateShaderResourceView(
texture.resource, &desc, GetTextureDescriptorCPUHandle(descriptor_index));
texture.srv_descriptors.insert({descriptor_key, descriptor_index});
texture.srv_descriptors.emplace(descriptor_key, descriptor_index);
return descriptor_index;
}

View File

@ -106,18 +106,18 @@ class TextureCache {
bool operator!=(const TextureKey& key) const {
return GetMapKey() != key.GetMapKey() || bucket_key != key.bucket_key;
}
inline uint64_t GetMapKey() const {
uint64_t GetMapKey() const {
return uint64_t(map_key[0]) | (uint64_t(map_key[1]) << 32);
}
inline void SetMapKey(uint64_t key) {
void SetMapKey(uint64_t key) {
map_key[0] = uint32_t(key);
map_key[1] = uint32_t(key >> 32);
}
inline bool IsInvalid() const {
bool IsInvalid() const {
// Zero base and zero width is enough for a binding to be invalid.
return map_key[0] == 0;
}
inline void MakeInvalid() {
void MakeInvalid() {
// Reset all for a stable hash.
SetMapKey(0);
bucket_key = 0;
@ -222,9 +222,7 @@ class TextureCache {
void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled);
inline bool IsResolutionScale2X() const {
return scaled_resolve_buffer_ != nullptr;
}
bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr; }
ID3D12Resource* GetScaledResolveBuffer() const {
return scaled_resolve_buffer_;
}
@ -233,7 +231,7 @@ class TextureCache {
uint32_t length_unscaled);
void UseScaledResolveBufferForReading();
void UseScaledResolveBufferForWriting();
inline void MarkScaledResolveBufferUAVWritesCommitNeeded() {
void MarkScaledResolveBufferUAVWritesCommitNeeded() {
if (scaled_resolve_buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
scaled_resolve_buffer_uav_writes_commit_needed_ = true;
}
@ -432,7 +430,7 @@ class TextureCache {
// Whether the signed version of the texture has a different representation on
// the host than its unsigned version (for example, if it's a fixed-point
// texture emulated with a larger host pixel format).
static inline bool IsSignedVersionSeparate(xenos::TextureFormat format) {
static bool IsSignedVersionSeparate(xenos::TextureFormat format) {
const HostFormat& host_format = host_formats_[uint32_t(format)];
return host_format.load_mode_snorm != LoadMode::kUnknown &&
host_format.load_mode_snorm != host_format.load_mode;
@ -441,26 +439,24 @@ class TextureCache {
// of block-compressed textures with 4x4-aligned dimensions on PC).
static bool IsDecompressionNeeded(xenos::TextureFormat format, uint32_t width,
uint32_t height);
static inline DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format,
uint32_t width,
uint32_t height) {
static DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format,
uint32_t width, uint32_t height) {
const HostFormat& host_format = host_formats_[uint32_t(format)];
return IsDecompressionNeeded(format, width, height)
? host_format.dxgi_format_uncompressed
: host_format.dxgi_format_resource;
}
static inline DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) {
static DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) {
return GetDXGIResourceFormat(key.format, key.width, key.height);
}
static inline DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format,
uint32_t width,
uint32_t height) {
static DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format,
uint32_t width, uint32_t height) {
const HostFormat& host_format = host_formats_[uint32_t(format)];
return IsDecompressionNeeded(format, width, height)
? host_format.dxgi_format_uncompressed
: host_format.dxgi_format_unorm;
}
static inline DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) {
static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) {
return GetDXGIUnormFormat(key.format, key.width, key.height);
}
@ -550,9 +546,9 @@ class TextureCache {
static const LoadModeInfo load_mode_info_[];
ID3D12RootSignature* load_root_signature_ = nullptr;
ID3D12PipelineState* load_pipeline_states_[size_t(LoadMode::kCount)] = {};
// Load pipeline state objects for 2x-scaled resolved targets.
ID3D12PipelineState* load_pipeline_states_2x_[size_t(LoadMode::kCount)] = {};
ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {};
// Load pipelines for 2x-scaled resolved targets.
ID3D12PipelineState* load_pipelines_2x_[size_t(LoadMode::kCount)] = {};
std::unordered_multimap<uint64_t, Texture*> textures_;
uint64_t textures_total_size_ = 0;

View File

@ -111,6 +111,34 @@ int32_t FloatToD3D11Fixed16p8(float f32) {
return result.s;
}
void GetScissor(const RegisterFile& regs, Scissor& scissor_out) {
// FIXME(Triang3l): Screen scissor isn't applied here, but it seems to be
// unused on Xbox 360 Direct3D 9.
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
uint32_t tl_x = pa_sc_window_scissor_tl.tl_x;
uint32_t tl_y = pa_sc_window_scissor_tl.tl_y;
uint32_t br_x = pa_sc_window_scissor_br.br_x;
uint32_t br_y = pa_sc_window_scissor_br.br_y;
if (!pa_sc_window_scissor_tl.window_offset_disable) {
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
tl_x = uint32_t(std::max(
int32_t(tl_x) + pa_sc_window_offset.window_x_offset, int32_t(0)));
tl_y = uint32_t(std::max(
int32_t(tl_y) + pa_sc_window_offset.window_y_offset, int32_t(0)));
br_x = uint32_t(std::max(
int32_t(br_x) + pa_sc_window_offset.window_x_offset, int32_t(0)));
br_y = uint32_t(std::max(
int32_t(br_y) + pa_sc_window_offset.window_y_offset, int32_t(0)));
}
br_x = std::max(br_x, tl_x);
br_y = std::max(br_y, tl_y);
scissor_out.left = tl_x;
scissor_out.top = tl_y;
scissor_out.width = br_x - tl_x;
scissor_out.height = br_y - tl_y;
}
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) {

View File

@ -33,6 +33,14 @@ namespace draw_util {
// for use with the top-left rasterization rule later.
int32_t FloatToD3D11Fixed16p8(float f32);
struct Scissor {
uint32_t left;
uint32_t top;
uint32_t width;
uint32_t height;
};
void GetScissor(const RegisterFile& regs, Scissor& scissor_out);
// To avoid passing values that the shader won't understand (even though
// Direct3D 9 shouldn't pass them anyway).
xenos::CopySampleSelect SanitizeCopySampleSelect(

View File

@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
break;
case AluVectorOpcode::kMul:
case AluVectorOpcode::kMad: {
bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad;
if (is_mad) {
DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]);
} else {
DxbcOpMul(per_component_dest, operands[0], operands[1]);
}
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t absolute_different =
// Not using DXBC mad to prevent fused multiply-add (mul followed by add
// may be optimized into non-fused mad by the driver in the identical
// operands case also).
DxbcOpMul(per_component_dest, operands[0], operands[1]);
uint32_t multiplicands_different =
used_result_components &
~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
~instr.vector_operands[0].GetIdenticalComponents(
instr.vector_operands[1]);
if (absolute_different) {
if (multiplicands_different) {
// Shader Model 3: +-0 or denormal * anything = +0.
uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different),
DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different),
operands[0].Abs(), operands[1].Abs());
// min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different),
DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different),
DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different),
DxbcSrc::R(is_zero_temp),
is_mad ? operands[2] : DxbcSrc::LF(0.0f),
// Not replacing true `0 + term` with movc of the term because +0 + -0
// should result in +0, not -0.
DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different),
DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_));
// Release is_zero_temp.
PopSystemTemp();
}
if (instr.vector_opcode == AluVectorOpcode::kMad) {
DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_),
operands[2]);
}
} break;
case AluVectorOpcode::kMax:
@ -179,69 +181,40 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
component_count = 4;
}
result_swizzle = DxbcSrc::kXXXX;
uint32_t absolute_different =
uint32_t((1 << component_count) - 1) &
~instr.vector_operands[0].GetAbsoluteIdenticalComponents(
instr.vector_operands[1]);
if (absolute_different) {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
// Add component products only if non-zero. For dp4, 16 scalar
// operations in the worst case (as opposed to always 20 for
// eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands
// for dp4).
DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001),
operands[0].SelectFromSwizzled(0),
operands[1].SelectFromSwizzled(0));
if (absolute_different & 0b0001) {
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010),
operands[0].SelectFromSwizzled(0).Abs(),
operands[1].SelectFromSwizzled(0).Abs());
DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
uint32_t different = uint32_t((1 << component_count) - 1) &
~instr.vector_operands[0].GetIdenticalComponents(
instr.vector_operands[1]);
for (uint32_t i = 0; i < component_count; ++i) {
DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
operands[0].SelectFromSwizzled(i),
operands[1].SelectFromSwizzled(i));
if ((different & (1 << i)) != 0) {
// Shader Model 3: +-0 or denormal * anything = +0 (also not replacing
// true `0 + term` with movc of the term because +0 + -0 should result
// in +0, not -0).
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
operands[0].SelectFromSwizzled(i).Abs(),
operands[1].SelectFromSwizzled(i).Abs());
DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
}
for (uint32_t i = 1; i < component_count; ++i) {
bool component_different = (absolute_different & (1 << i)) != 0;
DxbcOpMAd(DxbcDest::R(system_temp_result_,
component_different ? 0b0010 : 0b0001),
operands[0].SelectFromSwizzled(i),
operands[1].SelectFromSwizzled(i),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
if (component_different) {
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
operands[0].SelectFromSwizzled(i).Abs(),
operands[1].SelectFromSwizzled(i).Abs());
DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
}
DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_,
i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX));
}
} else {
if (component_count == 2) {
DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
} else if (component_count == 3) {
DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
} else {
assert_true(component_count == 4);
DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
if (i) {
// Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar
// as of 2020 anyway, and not using mad for the same reason (mul
// followed by add may be optimized into non-fused mad by the driver
// in the identical operands case also).
DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
}
}
if (component_count == 2) {
// Add the third operand. Since floating-point addition isn't
// associative, even though adding this in multiply-add for the first
// component would be faster, it's safer to add here, in the end.
DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
operands[2].SelectFromSwizzled(0));
@ -592,14 +565,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f));
}
if (used_result_components & 0b0010) {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010),
operands[0].SelectFromSwizzled(1),
operands[1].SelectFromSwizzled(1));
if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents(
if (!(instr.vector_operands[0].GetIdenticalComponents(
instr.vector_operands[1]) &
0b0010)) {
// Shader Model 3: +-0 or denormal * anything = +0.
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
operands[0].SelectFromSwizzled(1).Abs(),
operands[1].SelectFromSwizzled(1).Abs());
@ -700,8 +672,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
DxbcOpMul(ps_dest, operand_0_a, operand_0_b);
if (instr.scalar_operands[0].components[0] !=
instr.scalar_operands[0].components[1]) {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
// Shader Model 3: +-0 or denormal * anything = +0.
uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
operand_0_b.Abs());
@ -714,58 +685,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
PopSystemTemp();
}
break;
case AluScalarOpcode::kMulsPrev: {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
ps_src.Abs());
// min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001),
DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
DxbcOpMul(ps_dest, operand_0_a, ps_src);
DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX),
DxbcSrc::LF(0.0f), ps_src);
// Release is_zero_temp.
PopSystemTemp();
} break;
case AluScalarOpcode::kMulsPrev:
case AluScalarOpcode::kMulsPrev2: {
uint32_t test_temp = PushSystemTemp();
// Check if need to select the src0.a * ps case.
// ps != -FLT_MAX.
DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
// isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX
// is already loaded to an SGPR, this is also false if it's NaN.
DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
DxbcSrc::LF(-FLT_MAX));
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
// isfinite(src0.b).
DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(),
DxbcSrc::LF(-FLT_MAX));
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
// src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
// for NaN).
DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b);
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
// Shader Model 3: 0 or denormal * anything = 0.
// ps is already known to be not NaN or Infinity, so multiplying it by 0
// will result in 0. However, src0.a can be anything, so the result should
// be zero if ps is zero.
// FIXME(Triang3l): Signed zero needs research and handling.
DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f));
if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
// Check if need to select the src0.a * ps case.
// ps != -FLT_MAX.
DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
// isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since
// -FLT_MAX is already loaded to an SGPR, this is also false if it's
// NaN.
DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
DxbcSrc::LF(-FLT_MAX));
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
// isfinite(src0.b).
DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(),
DxbcSrc::LF(-FLT_MAX));
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
// src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
// for NaN).
DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f),
operand_0_b);
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
}
// Shader Model 3: +-0 or denormal * anything = +0.
DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(),
ps_src.Abs());
// min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
DxbcOpMul(ps_dest, operand_0_a, ps_src);
DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::LF(0.0f), ps_src);
DxbcOpElse();
DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
DxbcOpEndIf();
if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
DxbcOpElse();
DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
DxbcOpEndIf();
}
// Release test_temp.
PopSystemTemp();
} break;
@ -1023,11 +986,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
case AluScalarOpcode::kMulsc0:
case AluScalarOpcode::kMulsc1:
DxbcOpMul(ps_dest, operand_0_a, operand_1);
if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents(
if (!(instr.scalar_operands[0].GetIdenticalComponents(
instr.scalar_operands[1]) &
0b0001)) {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
// Shader Model 3: +-0 or denormal * anything = +0.
uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
operand_1.Abs());

View File

@ -99,8 +99,8 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
DxbcOpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
DxbcSrc::LU(~uint32_t(3)));
}
// Add the word offset from the instruction, plus the offset of the first
// needed word within the element.
// Add the word offset from the instruction (signed), plus the offset of the
// first needed word within the element.
uint32_t first_word_index;
xe::bit_scan_forward(needed_words, &first_word_index);
int32_t first_word_buffer_offset =
@ -1730,10 +1730,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
}
uint32_t texture_binding_index_unsigned =
FindOrAddTextureBinding(tfetch_index, srv_dimension, false);
const TextureBinding& texture_binding_unsigned =
texture_bindings_[texture_binding_index_unsigned];
uint32_t texture_binding_index_signed =
FindOrAddTextureBinding(tfetch_index, srv_dimension, true);
const TextureBinding& texture_binding_unsigned =
texture_bindings_[texture_binding_index_unsigned];
const TextureBinding& texture_binding_signed =
texture_bindings_[texture_binding_index_signed];
DxbcSrc srv_unsigned(DxbcSrc::LF(0.0f)), srv_signed(DxbcSrc::LF(0.0f));

View File

@ -135,7 +135,7 @@ X_STATUS GraphicsSystem::Setup(cpu::Processor* processor,
}));
// As we run vblank interrupts the debugger must be able to suspend us.
vsync_worker_thread_->set_can_debugger_suspend(true);
vsync_worker_thread_->set_name("GraphicsSystem Vsync");
vsync_worker_thread_->set_name("GPU VSync");
vsync_worker_thread_->Create();
if (cvars::trace_gpu_stream) {

View File

@ -65,17 +65,17 @@ enum class InstructionStorageTarget {
// disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both
// skipped components and zeros, which cannot be encoded, and therefore it will
// not).
constexpr uint32_t GetInstructionStorageTargetUsedComponents(
constexpr uint32_t GetInstructionStorageTargetUsedComponentCount(
InstructionStorageTarget target) {
switch (target) {
case InstructionStorageTarget::kNone:
return 0b0000;
return 0;
case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex:
return 0b0111;
return 3;
case InstructionStorageTarget::kDepth:
return 0b0001;
return 1;
default:
return 0b1111;
return 4;
}
}
@ -136,8 +136,9 @@ struct InstructionResult {
// Returns the write mask containing only components actually present in the
// target.
uint32_t GetUsedWriteMask() const {
return original_write_mask &
GetInstructionStorageTargetUsedComponents(storage_target);
uint32_t target_component_count =
GetInstructionStorageTargetUsedComponentCount(storage_target);
return original_write_mask & ((1 << target_component_count) - 1);
}
// True if the components are in their 'standard' swizzle arrangement (xyzw).
bool IsStandardSwizzle() const {
@ -161,6 +162,28 @@ struct InstructionResult {
}
return used_components;
}
// Returns which components of the used write mask are constant, and what
// values they have.
uint32_t GetUsedConstantComponents(uint32_t& constant_values_out) const {
uint32_t constant_components = 0;
uint32_t constant_values = 0;
uint32_t used_write_mask = GetUsedWriteMask();
for (uint32_t i = 0; i < 4; ++i) {
if (!(used_write_mask & (1 << i))) {
continue;
}
SwizzleSource component = components[i];
if (component >= SwizzleSource::kX && component <= SwizzleSource::kW) {
continue;
}
constant_components |= 1 << i;
if (component == SwizzleSource::k1) {
constant_values |= 1 << i;
}
}
constant_values_out = constant_values;
return constant_components;
}
};
enum class InstructionStorageSource {
@ -212,14 +235,18 @@ struct InstructionOperand {
return false;
}
// Returns which components of two operands are identical, but may have
// different signs (for simplicity of usage with GetComponent, treating the
// rightmost component as replicated).
uint32_t GetAbsoluteIdenticalComponents(
const InstructionOperand& other) const {
// Returns which components of two operands will always be bitwise equal
// (disregarding component_count for simplicity of usage with GetComponent,
// treating the rightmost component as replicated). This, strictly with all
// conditions, must be used when emulating Shader Model 3 +-0 * x = +0
// multiplication behavior with IEEE-compliant multiplication (because
// -0 * |-0|, or -0 * +0, is -0, while the result must be +0).
uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
if (storage_source != other.storage_source ||
storage_index != other.storage_index ||
storage_addressing_mode != other.storage_addressing_mode) {
storage_addressing_mode != other.storage_addressing_mode ||
is_negated != other.is_negated ||
is_absolute_value != other.is_absolute_value) {
return 0;
}
uint32_t identical_components = 0;
@ -229,16 +256,6 @@ struct InstructionOperand {
}
return identical_components;
}
// Returns which components of two operands will always be bitwise equal, but
// may have different signs (disregarding component_count for simplicity of
// usage with GetComponent, treating the rightmost component as replicated).
uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
if (is_negated != other.is_negated ||
is_absolute_value != other.is_absolute_value) {
return 0;
}
return GetAbsoluteIdenticalComponents(other);
}
};
struct ParsedExecInstruction {

View File

@ -25,6 +25,9 @@ namespace gpu {
// system page size granularity.
class SharedMemory {
public:
static constexpr uint32_t kBufferSizeLog2 = 29;
static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
virtual ~SharedMemory();
// Call in the implementation-specific ClearCache.
virtual void ClearCache();
@ -98,9 +101,6 @@ class SharedMemory {
// destructor.
void ShutdownCommon();
static constexpr uint32_t kBufferSizeLog2 = 29;
static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
// Sparse allocations are 4 MB, so not too many of them are allocated, but
// also not to waste too much memory for padding (with 16 MB there's too
// much).

View File

@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12);
// Both are valid only within the current ALU clause. They are not modified
// when the instruction that would write them fails its predication check.
// - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
// multiplication (0 or denormal * anything = 0) wherever it's present (mul,
// mad, dp, etc.) and for NaN in min/max. It's very important to respect this
// rule for multiplication, as games often rely on it in vector normalization
// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in
// games - causes white screen in Halo 3, white specular on characters in GTA
// IV.
// TODO(Triang3l): Investigate signed zero handling in multiplication.
// multiplication (+-0 or denormal * anything = +0) wherever it's present
// (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect
// this rule for multiplication, as games often rely on it in vector
// normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of
// things in games - causes white screen in Halo 3, white specular on
// characters in GTA IV. The result is always positive zero in this case, no
// matter what the signs of the other operands are, according to R5xx
// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on
// Adreno 200. This means that the following need to be taken into account
// (according to 8.7.2 "ALU Non-Transcendental Floating Point"):
// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3
// handling, it should result in +0.
// - +0 + -0 is +0, so multiply-add should not be replaced with conditional
// move of the third operand in case of zero multiplicands, because the term
// may be -0, while the result should be +0 in this case.
// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf
// Multiply-add also appears to be not fused (the SM3 behavior instruction on
// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators
// should not use instructions that may be interpreted by the host GPU as
// fused multiply-add.
enum class AluScalarOpcode : uint32_t {
// Floating-Point Add

View File

@ -30,17 +30,6 @@ project("xenia-gpu-vulkan-trace-viewer")
kind("WindowedApp")
language("C++")
links({
"aes_128",
"capstone",
"fmt",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xenia-apu",
"xenia-apu-nop",
"xenia-base",
@ -57,6 +46,19 @@ project("xenia-gpu-vulkan-trace-viewer")
"xenia-ui-vulkan",
"xenia-vfs",
"xenia-patcher",
})
links({
"aes_128",
"capstone",
"fmt",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xxhash",
})
defines({
@ -98,17 +100,6 @@ project("xenia-gpu-vulkan-trace-dump")
kind("ConsoleApp")
language("C++")
links({
"aes_128",
"capstone",
"fmt",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xenia-apu",
"xenia-apu-nop",
"xenia-base",
@ -125,6 +116,19 @@ project("xenia-gpu-vulkan-trace-dump")
"xenia-ui-vulkan",
"xenia-vfs",
"xenia-patcher",
})
links({
"aes_128",
"capstone",
"fmt",
"glslang-spirv",
"imgui",
"libavcodec",
"libavutil",
"mspack",
"snappy",
"spirv-tools",
"volk",
"xxhash",
})
defines({

View File

@ -41,11 +41,11 @@ project("xenia-hid-demo")
filter("platforms:Linux")
links({
"SDL2",
"vulkan",
"X11",
"xcb",
"X11-xcb",
"vulkan",
"SDL2",
})
filter("platforms:Windows")

View File

@ -359,7 +359,7 @@ void KernelState::SetExecutableModule(object_ref<UserModule> module) {
}
return 0;
}));
dispatch_thread_->set_name("Kernel Dispatch Thread");
dispatch_thread_->set_name("Kernel Dispatch");
dispatch_thread_->Create();
}
}

View File

@ -8,6 +8,7 @@
*/
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/util/shim_utils.h"
#include "xenia/kernel/xam/xam_private.h"
@ -235,7 +236,8 @@ dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type,
xe::store_and_swap(&dev->device_type, dummy_device_info_.device_type);
xe::store_and_swap(&dev->total_bytes, dummy_device_info_.total_bytes);
xe::store_and_swap(&dev->free_bytes, dummy_device_info_.free_bytes);
xe::copy_and_swap(dev->name, dummy_device_info_.name, 28);
xe::copy_and_swap(dev->name, dummy_device_info_.name,
xe::countof(dev->name));
}
*handle_out = e->handle();

View File

@ -9,6 +9,7 @@
#include "xenia/base/logging.h"
#include "xenia/base/cvar.h"
#include "xenia/base/string_util.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
#include "xenia/kernel/util/shim_utils.h"
@ -77,15 +78,15 @@ static SYSTEMTIME xeGetLocalSystemTime(uint64_t filetime) {
void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t output_buffer,
dword_t output_count) {
std::memset(output_buffer, 0, output_count * 2);
std::memset(output_buffer, 0, output_count * sizeof(char16_t));
// TODO: implement this for other platforms
#if XE_PLATFORM_WIN32
auto st = xeGetLocalSystemTime(filetime);
// TODO: format this depending on users locale?
auto str = fmt::format(u"{:02d}/{:02d}/{}", st.wMonth, st.wDay, st.wYear);
auto copy_length = std::min(size_t(output_count), str.size()) * 2;
xe::copy_and_swap(output_buffer.as<char16_t*>(), str.c_str(), copy_length);
xe::string_util::copy_and_swap_truncating(output_buffer.as<char16_t*>(), str,
output_count);
#else
assert_always();
#endif
@ -94,15 +95,15 @@ DECLARE_XAM_EXPORT1(XamFormatDateString, kNone, kImplemented);
void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t output_buffer,
dword_t output_count) {
std::memset(output_buffer, 0, output_count * 2);
std::memset(output_buffer, 0, output_count * sizeof(char16_t));
// TODO: implement this for other platforms
#if XE_PLATFORM_WIN32
auto st = xeGetLocalSystemTime(filetime);
// TODO: format this depending on users locale?
auto str = fmt::format(u"{:02d}:{:02d}", st.wHour, st.wMinute);
auto copy_count = std::min(size_t(output_count), str.size());
xe::copy_and_swap(output_buffer.as<char16_t*>(), str.c_str(), copy_count);
xe::string_util::copy_and_swap_truncating(output_buffer.as<char16_t*>(), str,
output_count);
#else
assert_always();
#endif
@ -124,9 +125,8 @@ dword_result_t keXamBuildResourceLocator(uint64_t module,
path = fmt::format(u"section://{:X},{}#{}", (uint32_t)module, container,
resource);
}
auto copy_count = std::min(size_t(buffer_count), path.size());
xe::copy_and_swap(buffer_ptr.as<char16_t*>(), path.c_str(), copy_count);
(buffer_ptr.as<char16_t*>())[copy_count] = 0;
xe::string_util::copy_and_swap_truncating(buffer_ptr.as<char16_t*>(), path,
buffer_count);
return 0;
}

View File

@ -984,8 +984,7 @@ dword_result_t NetDll___WSAFDIsSet(dword_t socket_handle,
DECLARE_XAM_EXPORT1(NetDll___WSAFDIsSet, kNetworking, kImplemented);
void RegisterNetExports(xe::cpu::ExportResolver* export_resolver,
KernelState* kernel_state) {
}
KernelState* kernel_state) {}
} // namespace xam
} // namespace kernel

View File

@ -9,6 +9,7 @@
#include "third_party/imgui/imgui.h"
#include "xenia/base/logging.h"
#include "xenia/base/string_util.h"
#include "xenia/emulator.h"
#include "xenia/kernel/kernel_flags.h"
#include "xenia/kernel/kernel_state.h"
@ -188,8 +189,8 @@ class KeyboardInputDialog : public xe::ui::ImGuiDialog {
*out_text_ = default_text;
}
text_buffer_.resize(max_length);
std::strncpy(text_buffer_.data(), default_text_.c_str(),
std::min(text_buffer_.size() - 1, default_text_.size()));
xe::string_util::copy_truncating(text_buffer_.data(), default_text_,
text_buffer_.size());
}
void OnDraw(ImGuiIO& io) override {

View File

@ -10,6 +10,8 @@
#include <cstring>
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/string_util.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/util/shim_utils.h"
#include "xenia/kernel/xam/xam_private.h"
@ -168,7 +170,8 @@ X_HRESULT_result_t XamUserGetSigninInfo(dword_t user_index, dword_t flags,
const auto& user_profile = kernel_state()->user_profile();
info->xuid = user_profile->xuid();
info->signin_state = user_profile->signin_state();
std::strncpy(info->name, user_profile->name().data(), 15);
xe::string_util::copy_truncating(info->name, user_profile->name(),
xe::countof(info->name));
return X_E_SUCCESS;
}
DECLARE_XAM_EXPORT1(XamUserGetSigninInfo, kUserProfiles, kImplemented);
@ -187,10 +190,8 @@ dword_result_t XamUserGetName(dword_t user_index, lpstring_t buffer,
const auto& user_name = user_profile->name();
// Real XAM will only copy a maximum of 15 characters out.
size_t copy_length = std::min(
{size_t(15), user_name.size(), static_cast<size_t>(buffer_len) - 1});
std::memcpy(buffer, user_name.data(), copy_length);
buffer[copy_length] = '\0';
xe::string_util::copy_truncating(buffer, user_name,
std::min(buffer_len.value(), uint32_t(15)));
return X_ERROR_SUCCESS;
}
DECLARE_XAM_EXPORT1(XamUserGetName, kUserProfiles, kImplemented);

View File

@ -226,19 +226,21 @@ DECLARE_XBOXKRNL_EXPORT1(KeSetCurrentStackPointers, kThreading, kImplemented);
dword_result_t KeSetAffinityThread(lpvoid_t thread_ptr, dword_t affinity,
lpdword_t previous_affinity_ptr) {
uint32_t previous_affinity = 0;
// The Xbox 360, according to disassembly of KeSetAffinityThread, unlike
// Windows NT, stores the previous affinity via the pointer provided as an
// argument, not in the return value - the return value is used for the
// result.
if (!affinity) {
return X_STATUS_INVALID_PARAMETER;
}
auto thread = XObject::GetNativeObject<XThread>(kernel_state(), thread_ptr);
if (thread) {
previous_affinity = thread->affinity();
if (previous_affinity_ptr) {
*previous_affinity_ptr = uint32_t(1) << thread->active_cpu();
}
thread->SetAffinity(affinity);
}
if (previous_affinity_ptr) {
*previous_affinity_ptr = previous_affinity;
}
return (uint32_t)affinity;
return X_STATUS_SUCCESS;
}
DECLARE_XBOXKRNL_EXPORT1(KeSetAffinityThread, kThreading, kImplemented);

View File

@ -157,11 +157,17 @@ void XThread::set_name(const std::string_view name) {
}
}
uint8_t next_cpu = 0;
uint8_t GetFakeCpuNumber(uint8_t proc_mask) {
static uint8_t next_cpu = 0;
static uint8_t GetFakeCpuNumber(uint8_t proc_mask) {
// NOTE: proc_mask is logical processors, not physical processors or cores.
if (!proc_mask) {
next_cpu = (next_cpu + 1) % 6;
return next_cpu; // is this reasonable?
// TODO(Triang3l): Does the following apply here?
// https://docs.microsoft.com/en-us/windows/win32/dxtecharts/coding-for-multiple-cores
// "On Xbox 360, you must explicitly assign software threads to a particular
// hardware thread by using XSetThreadProcessor. Otherwise, all child
// threads will stay on the same hardware thread as the parent."
}
assert_false(proc_mask & 0xC0);
@ -206,6 +212,7 @@ void XThread::InitializeGuestObject() {
// 0xA88 = APC
// 0x18 = timer
xe::store_and_swap<uint32_t>(p + 0x09C, 0xFDFFD7FF);
// current_cpu is expected to be initialized externally via SetActiveCpu.
xe::store_and_swap<uint32_t>(p + 0x0D0, stack_base_);
xe::store_and_swap<uint64_t>(p + 0x130, Clock::QueryGuestSystemTime());
xe::store_and_swap<uint32_t>(p + 0x144, guest_object() + 0x144);
@ -347,6 +354,12 @@ X_STATUS XThread::Create() {
// Exports use this to get the kernel.
thread_state_->context()->kernel_state = kernel_state_;
uint8_t cpu_index = GetFakeCpuNumber(
static_cast<uint8_t>(creation_params_.creation_flags >> 24));
// Initialize the KTHREAD object.
InitializeGuestObject();
X_KPCR* pcr = memory()->TranslateVirtual<X_KPCR*>(pcr_address_);
pcr->tls_ptr = tls_static_address_;
@ -356,14 +369,11 @@ X_STATUS XThread::Create() {
pcr->stack_base_ptr = stack_base_;
pcr->stack_end_ptr = stack_limit_;
uint8_t proc_mask =
static_cast<uint8_t>(creation_params_.creation_flags >> 24);
pcr->dpc_active = 0; // DPC active bool?
pcr->current_cpu = GetFakeCpuNumber(proc_mask); // Current CPU(?)
pcr->dpc_active = 0; // DPC active bool?
// Initialize the KTHREAD object.
InitializeGuestObject();
// Assign the thread to the logical processor, and also set up the current CPU
// in KPCR and KTHREAD.
SetActiveCpu(cpu_index);
// Always retain when starting - the thread owns itself until exited.
RetainHandle();
@ -416,10 +426,6 @@ X_STATUS XThread::Create() {
return X_STATUS_NO_MEMORY;
}
if (!cvars::ignore_thread_affinities) {
thread_->set_affinity_mask(proc_mask);
}
// Set the thread name based on host ID (for easier debugging).
if (thread_name_.empty()) {
set_name(fmt::format("XThread{:04X}", thread_->system_id()));
@ -712,37 +718,36 @@ void XThread::SetPriority(int32_t increment) {
}
void XThread::SetAffinity(uint32_t affinity) {
// Affinity mask, as in SetThreadAffinityMask.
// Xbox thread IDs:
// 0 - core 0, thread 0 - user
// 1 - core 0, thread 1 - user
// 2 - core 1, thread 0 - sometimes xcontent
// 3 - core 1, thread 1 - user
// 4 - core 2, thread 0 - xaudio
// 5 - core 2, thread 1 - user
// TODO(benvanik): implement better thread distribution.
// NOTE: these are logical processors, not physical processors or cores.
SetActiveCpu(GetFakeCpuNumber(affinity));
}
uint8_t XThread::active_cpu() const {
const X_KPCR& pcr = *memory()->TranslateVirtual<const X_KPCR*>(pcr_address_);
return pcr.current_cpu;
}
void XThread::SetActiveCpu(uint8_t cpu_index) {
// May be called during thread creation - don't skip if current == new.
assert_true(cpu_index < 6);
X_KPCR& pcr = *memory()->TranslateVirtual<X_KPCR*>(pcr_address_);
pcr.current_cpu = cpu_index;
if (is_guest_thread()) {
X_KTHREAD& thread_object =
*memory()->TranslateVirtual<X_KTHREAD*>(guest_object());
thread_object.current_cpu = cpu_index;
}
if (xe::threading::logical_processor_count() < 6) {
XELOGW("Too few processors - scheduling will be wonky");
}
SetActiveCpu(GetFakeCpuNumber(affinity));
affinity_ = affinity;
if (!cvars::ignore_thread_affinities) {
thread_->set_affinity_mask(affinity);
thread_->set_affinity_mask(uint64_t(1) << cpu_index);
}
}
uint32_t XThread::active_cpu() const {
uint8_t* pcr = memory()->TranslateVirtual(pcr_address_);
return xe::load_and_swap<uint8_t>(pcr + 0x10C);
}
void XThread::SetActiveCpu(uint32_t cpu_index) {
assert_true(cpu_index < 6);
uint8_t* pcr = memory()->TranslateVirtual(pcr_address_);
xe::store_and_swap<uint8_t>(pcr + 0x10C, cpu_index);
}
bool XThread::GetTLSValue(uint32_t slot, uint32_t* value_out) {
if (slot * 4 > tls_total_size_) {
return false;

View File

@ -88,7 +88,8 @@ struct X_KTHREAD {
char unk_10[0xAC]; // 0x10
uint8_t suspend_count; // 0xBC
uint8_t unk_BD; // 0xBD
uint16_t unk_BE; // 0xBE
uint8_t unk_BE; // 0xBE
uint8_t current_cpu; // 0xBF
char unk_C0[0x70]; // 0xC0
xe::be<uint64_t> create_time; // 0x130
xe::be<uint64_t> exit_time; // 0x138
@ -171,10 +172,17 @@ class XThread : public XObject, public cpu::Thread {
int32_t priority() const { return priority_; }
int32_t QueryPriority();
void SetPriority(int32_t increment);
uint32_t affinity() const { return affinity_; }
// Xbox thread IDs:
// 0 - core 0, thread 0 - user
// 1 - core 0, thread 1 - user
// 2 - core 1, thread 0 - sometimes xcontent
// 3 - core 1, thread 1 - user
// 4 - core 2, thread 0 - xaudio
// 5 - core 2, thread 1 - user
void SetAffinity(uint32_t affinity);
uint32_t active_cpu() const;
void SetActiveCpu(uint32_t cpu_index);
uint8_t active_cpu() const;
void SetActiveCpu(uint8_t cpu_index);
bool GetTLSValue(uint32_t slot, uint32_t* value_out);
bool SetTLSValue(uint32_t slot, uint32_t value);
@ -226,7 +234,6 @@ class XThread : public XObject, public cpu::Thread {
bool running_ = false;
int32_t priority_ = 0;
uint32_t affinity_ = 0;
xe::global_critical_region global_critical_region_;
std::atomic<uint32_t> irql_ = {0};

View File

@ -118,15 +118,15 @@ bool D3D12ImmediateDrawer::Initialize() {
return false;
}
// Create the pipeline states.
D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_state_desc = {};
pipeline_state_desc.pRootSignature = root_signature_;
pipeline_state_desc.VS.pShaderBytecode = immediate_vs;
pipeline_state_desc.VS.BytecodeLength = sizeof(immediate_vs);
pipeline_state_desc.PS.pShaderBytecode = immediate_ps;
pipeline_state_desc.PS.BytecodeLength = sizeof(immediate_ps);
// Create the pipelines.
D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_desc = {};
pipeline_desc.pRootSignature = root_signature_;
pipeline_desc.VS.pShaderBytecode = immediate_vs;
pipeline_desc.VS.BytecodeLength = sizeof(immediate_vs);
pipeline_desc.PS.pShaderBytecode = immediate_ps;
pipeline_desc.PS.BytecodeLength = sizeof(immediate_ps);
D3D12_RENDER_TARGET_BLEND_DESC& pipeline_blend_desc =
pipeline_state_desc.BlendState.RenderTarget[0];
pipeline_desc.BlendState.RenderTarget[0];
pipeline_blend_desc.BlendEnable = TRUE;
pipeline_blend_desc.SrcBlend = D3D12_BLEND_SRC_ALPHA;
pipeline_blend_desc.DestBlend = D3D12_BLEND_INV_SRC_ALPHA;
@ -138,11 +138,11 @@ bool D3D12ImmediateDrawer::Initialize() {
pipeline_blend_desc.RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED |
D3D12_COLOR_WRITE_ENABLE_GREEN |
D3D12_COLOR_WRITE_ENABLE_BLUE;
pipeline_state_desc.SampleMask = UINT_MAX;
pipeline_state_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
pipeline_state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
pipeline_state_desc.RasterizerState.FrontCounterClockwise = FALSE;
pipeline_state_desc.RasterizerState.DepthClipEnable = TRUE;
pipeline_desc.SampleMask = UINT_MAX;
pipeline_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
pipeline_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
pipeline_desc.RasterizerState.FrontCounterClockwise = FALSE;
pipeline_desc.RasterizerState.DepthClipEnable = TRUE;
D3D12_INPUT_ELEMENT_DESC pipeline_input_elements[3] = {};
pipeline_input_elements[0].SemanticName = "POSITION";
pipeline_input_elements[0].Format = DXGI_FORMAT_R32G32_FLOAT;
@ -154,26 +154,24 @@ bool D3D12ImmediateDrawer::Initialize() {
pipeline_input_elements[2].Format = DXGI_FORMAT_R8G8B8A8_UNORM;
pipeline_input_elements[2].AlignedByteOffset =
offsetof(ImmediateVertex, color);
pipeline_state_desc.InputLayout.pInputElementDescs = pipeline_input_elements;
pipeline_state_desc.InputLayout.NumElements =
pipeline_desc.InputLayout.pInputElementDescs = pipeline_input_elements;
pipeline_desc.InputLayout.NumElements =
UINT(xe::countof(pipeline_input_elements));
pipeline_state_desc.PrimitiveTopologyType =
D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
pipeline_state_desc.NumRenderTargets = 1;
pipeline_state_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat;
pipeline_state_desc.SampleDesc.Count = 1;
pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
pipeline_desc.NumRenderTargets = 1;
pipeline_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat;
pipeline_desc.SampleDesc.Count = 1;
if (FAILED(device->CreateGraphicsPipelineState(
&pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_triangle_)))) {
&pipeline_desc, IID_PPV_ARGS(&pipeline_triangle_)))) {
XELOGE(
"Failed to create the Direct3D 12 immediate drawer triangle pipeline "
"state");
Shutdown();
return false;
}
pipeline_state_desc.PrimitiveTopologyType =
D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
if (FAILED(device->CreateGraphicsPipelineState(
&pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_line_)))) {
&pipeline_desc, IID_PPV_ARGS(&pipeline_line_)))) {
XELOGE(
"Failed to create the Direct3D 12 immediate drawer line pipeline "
"state");
@ -267,8 +265,8 @@ void D3D12ImmediateDrawer::Shutdown() {
util::ReleaseAndNull(sampler_heap_);
util::ReleaseAndNull(pipeline_state_line_);
util::ReleaseAndNull(pipeline_state_triangle_);
util::ReleaseAndNull(pipeline_line_);
util::ReleaseAndNull(pipeline_triangle_);
util::ReleaseAndNull(root_signature_);
}
@ -611,17 +609,17 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) {
uint32_t(sampler_index)));
}
// Set the primitive type and the pipeline state for it.
// Set the primitive type and the pipeline for it.
D3D_PRIMITIVE_TOPOLOGY primitive_topology;
ID3D12PipelineState* pipeline_state;
ID3D12PipelineState* pipeline;
switch (draw.primitive_type) {
case ImmediatePrimitiveType::kLines:
primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST;
pipeline_state = pipeline_state_line_;
pipeline = pipeline_line_;
break;
case ImmediatePrimitiveType::kTriangles:
primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
pipeline_state = pipeline_state_triangle_;
pipeline = pipeline_triangle_;
break;
default:
assert_unhandled_case(draw.primitive_type);
@ -630,7 +628,7 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) {
if (current_primitive_topology_ != primitive_topology) {
current_primitive_topology_ = primitive_topology;
current_command_list_->IASetPrimitiveTopology(primitive_topology);
current_command_list_->SetPipelineState(pipeline_state);
current_command_list_->SetPipelineState(pipeline);
}
// Draw.

View File

@ -105,8 +105,8 @@ class D3D12ImmediateDrawer : public ImmediateDrawer {
kCount
};
ID3D12PipelineState* pipeline_state_triangle_ = nullptr;
ID3D12PipelineState* pipeline_state_line_ = nullptr;
ID3D12PipelineState* pipeline_triangle_ = nullptr;
ID3D12PipelineState* pipeline_line_ = nullptr;
ID3D12DescriptorHeap* sampler_heap_ = nullptr;
D3D12_CPU_DESCRIPTOR_HANDLE sampler_heap_cpu_start_;

View File

@ -46,22 +46,22 @@ class D3D12Provider : public GraphicsProvider {
uint32_t GetRTVDescriptorSize() const { return descriptor_size_rtv_; }
uint32_t GetDSVDescriptorSize() const { return descriptor_size_dsv_; }
template <typename T>
inline T OffsetViewDescriptor(T start, uint32_t index) const {
T OffsetViewDescriptor(T start, uint32_t index) const {
start.ptr += index * descriptor_size_view_;
return start;
}
template <typename T>
inline T OffsetSamplerDescriptor(T start, uint32_t index) const {
T OffsetSamplerDescriptor(T start, uint32_t index) const {
start.ptr += index * descriptor_size_sampler_;
return start;
}
template <typename T>
inline T OffsetRTVDescriptor(T start, uint32_t index) const {
T OffsetRTVDescriptor(T start, uint32_t index) const {
start.ptr += index * descriptor_size_rtv_;
return start;
}
template <typename T>
inline T OffsetDSVDescriptor(T start, uint32_t index) const {
T OffsetDSVDescriptor(T start, uint32_t index) const {
start.ptr += index * descriptor_size_dsv_;
return start;
}
@ -91,32 +91,30 @@ class D3D12Provider : public GraphicsProvider {
}
// Proxies for Direct3D 12 functions since they are loaded dynamically.
inline HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc,
D3D_ROOT_SIGNATURE_VERSION version,
ID3DBlob** blob_out,
ID3DBlob** error_blob_out) const {
HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc,
D3D_ROOT_SIGNATURE_VERSION version,
ID3DBlob** blob_out,
ID3DBlob** error_blob_out) const {
return pfn_d3d12_serialize_root_signature_(desc, version, blob_out,
error_blob_out);
}
inline HRESULT Disassemble(const void* src_data, size_t src_data_size,
UINT flags, const char* comments,
ID3DBlob** disassembly_out) const {
HRESULT Disassemble(const void* src_data, size_t src_data_size, UINT flags,
const char* comments, ID3DBlob** disassembly_out) const {
if (!pfn_d3d_disassemble_) {
return E_NOINTERFACE;
}
return pfn_d3d_disassemble_(src_data, src_data_size, flags, comments,
disassembly_out);
}
inline HRESULT DxbcConverterCreateInstance(const CLSID& rclsid,
const IID& riid,
void** ppv) const {
HRESULT DxbcConverterCreateInstance(const CLSID& rclsid, const IID& riid,
void** ppv) const {
if (!pfn_dxilconv_dxc_create_instance_) {
return E_NOINTERFACE;
}
return pfn_dxilconv_dxc_create_instance_(rclsid, riid, ppv);
}
inline HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid,
void** ppv) const {
HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid,
void** ppv) const {
if (!pfn_dxcompiler_dxc_create_instance_) {
return E_NOINTERFACE;
}

View File

@ -47,7 +47,7 @@ ID3D12RootSignature* CreateRootSignature(
return root_signature;
}
ID3D12PipelineState* CreateComputePipelineState(
ID3D12PipelineState* CreateComputePipeline(
ID3D12Device* device, const void* shader, size_t shader_size,
ID3D12RootSignature* root_signature) {
D3D12_COMPUTE_PIPELINE_STATE_DESC desc;

View File

@ -27,7 +27,7 @@ extern const D3D12_HEAP_PROPERTIES kHeapPropertiesUpload;
extern const D3D12_HEAP_PROPERTIES kHeapPropertiesReadback;
template <typename T>
inline bool ReleaseAndNull(T& object) {
bool ReleaseAndNull(T& object) {
if (object != nullptr) {
object->Release();
object = nullptr;
@ -39,9 +39,10 @@ inline bool ReleaseAndNull(T& object) {
ID3D12RootSignature* CreateRootSignature(const D3D12Provider& provider,
const D3D12_ROOT_SIGNATURE_DESC& desc);
ID3D12PipelineState* CreateComputePipelineState(
ID3D12Device* device, const void* shader, size_t shader_size,
ID3D12RootSignature* root_signature);
ID3D12PipelineState* CreateComputePipeline(ID3D12Device* device,
const void* shader,
size_t shader_size,
ID3D12RootSignature* root_signature);
constexpr DXGI_FORMAT GetUintPow2DXGIFormat(uint32_t element_size_bytes_log2) {
switch (element_size_bytes_log2) {

View File

@ -71,7 +71,7 @@ void GraphicsUploadBufferPool::FlushWrites() {
GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request(
uint64_t submission_index, size_t size, size_t alignment,
size_t& offset_out) {
assert_not_zero(alignment);
alignment = std::max(alignment, size_t(1));
assert_true(xe::is_pow2(alignment));
size = xe::align(size, alignment);
assert_true(size <= page_size_);
@ -126,7 +126,7 @@ GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request(
GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::RequestPartial(
uint64_t submission_index, size_t size, size_t alignment,
size_t& offset_out, size_t& size_out) {
assert_not_zero(alignment);
alignment = std::max(alignment, size_t(1));
assert_true(xe::is_pow2(alignment));
size = xe::align(size, alignment);
size = std::min(size, page_size_);

View File

@ -18,7 +18,7 @@ project("SDL2")
"SDL2/include",
})
buildoptions({
"/wd4828", -- illegal characters in file
"/wd4828", -- illegal characters in file https://bugzilla.libsdl.org/show_bug.cgi?id=5333
})
files({
-- 1:1 from SDL.vcxproj file

1
third_party/premake-cmake vendored Submodule

@ -0,0 +1 @@
Subproject commit 26fbbb9962aefcb1c24aff1e7952033ce1361190

View File

@ -73,4 +73,4 @@ project("spirv-tools")
buildoptions({
"/wd4800", -- Forcing value to bool 'true' or 'false'
"/wd4996", -- Call to 'std::equal' with parameters that may be unsafe
})
})

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/env python3
# Copyright 2015 Ben Vanik. All Rights Reserved.
@ -107,13 +107,14 @@ def has_bin(bin):
return None
def shell_call(command, throw_on_error=True, stdout_path=None):
def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False):
"""Executes a shell command.
Args:
command: Command to execute, as a list of parameters.
throw_on_error: Whether to throw an error or return the status code.
stdout_path: File path to write stdout output to.
stderr_path: File path to write stderr output to.
Returns:
If throw_on_error is False the status code of the call will be returned.
@ -121,17 +122,22 @@ def shell_call(command, throw_on_error=True, stdout_path=None):
stdout_file = None
if stdout_path:
stdout_file = open(stdout_path, 'w')
stderr_file = None
if stderr_path:
stderr_file = open(stderr_path, 'w')
result = 0
try:
if throw_on_error:
result = 1
subprocess.check_call(command, shell=False, stdout=stdout_file)
subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
result = 0
else:
result = subprocess.call(command, shell=False, stdout=stdout_file)
result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
finally:
if stdout_file:
stdout_file.close()
if stderr_file:
stderr_file.close()
return result
@ -196,42 +202,5 @@ def import_subprocess_environment(args):
os.environ[var.upper()] = setting
break
def git_submodule_update():
"""Runs a full recursive git submodule init and update.
Older versions of git do not support 'update --init --recursive'. We could
check and run it on versions that do support it and speed things up a bit.
"""
if True:
shell_call([
'git',
'submodule',
'update',
'--init',
'--recursive',
])
else:
shell_call([
'git',
'submodule',
'init',
])
shell_call([
'git',
'submodule',
'foreach',
'--recursive',
'git',
'submodule',
'init',
])
shell_call([
'git',
'submodule',
'update',
'--recursive',
])
if __name__ == '__main__':
main()

View File

@ -34,8 +34,11 @@ def main():
# Check git exists.
if not has_bin('git'):
print('ERROR: git must be installed and on PATH.')
sys.exit(1)
print('WARNING: Git should be installed and on PATH. Version info will be omitted from all binaries!')
print('')
elif not git_is_repository():
print('WARNING: The source tree is unversioned. Version info will be omitted from all binaries!')
print('')
# Check python version.
if not sys.version_info[:2] >= (3, 6):
@ -85,6 +88,16 @@ def main():
sys.exit(return_code)
def print_box(msg):
"""Prints an important message inside a box
"""
print(
'┌{0:─^{2}}╖\n'
'│{1: ^{2}}║\n'
'╘{0:═^{2}}╝\n'
.format('', msg, len(msg) + 2))
def import_vs_environment():
"""Finds the installed Visual Studio version and imports
interesting environment variables into os.environ.
@ -150,6 +163,7 @@ def import_subprocess_environment(args):
os.environ[var.upper()] = setting
break
def has_bin(binary):
"""Checks whether the given binary is present.
@ -185,13 +199,14 @@ def get_bin(binary):
return None
def shell_call(command, throw_on_error=True, stdout_path=None, shell=False):
def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False):
"""Executes a shell command.
Args:
command: Command to execute, as a list of parameters.
throw_on_error: Whether to throw an error or return the status code.
stdout_path: File path to write stdout output to.
stderr_path: File path to write stderr output to.
Returns:
If throw_on_error is False the status code of the call will be returned.
@ -199,21 +214,49 @@ def shell_call(command, throw_on_error=True, stdout_path=None, shell=False):
stdout_file = None
if stdout_path:
stdout_file = open(stdout_path, 'w')
stderr_file = None
if stderr_path:
stderr_file = open(stderr_path, 'w')
result = 0
try:
if throw_on_error:
result = 1
subprocess.check_call(command, shell=shell, stdout=stdout_file)
subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
result = 0
else:
result = subprocess.call(command, shell=shell, stdout=stdout_file)
result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file)
finally:
if stdout_file:
stdout_file.close()
if stderr_file:
stderr_file.close()
return result
def get_git_head_info():
def generate_version_h():
"""Generates a build/version.h file that contains current git info.
"""
if git_is_repository():
(branch_name, commit, commit_short) = git_get_head_info()
else:
branch_name = 'tarball'
commit = ':(-dont-do-this'
commit_short = ':('
contents = '''// Autogenerated by `xb premake`.
#ifndef GENERATED_VERSION_H_
#define GENERATED_VERSION_H_
#define XE_BUILD_BRANCH "%s"
#define XE_BUILD_COMMIT "%s"
#define XE_BUILD_COMMIT_SHORT "%s"
#define XE_BUILD_DATE __DATE__
#endif // GENERATED_VERSION_H_
''' % (branch_name, commit, commit_short)
with open('build/version.h', 'w') as f:
f.write(contents)
def git_get_head_info():
"""Queries the current branch and commit checksum from git.
Returns:
@ -247,58 +290,28 @@ def get_git_head_info():
return branch_name, commit, commit_short
def generate_version_h():
"""Generates a build/version.h file that contains current git info.
def git_is_repository():
"""Checks if git is available and this source tree is versioned.
"""
(branch_name, commit, commit_short) = get_git_head_info()
contents = '''// Autogenerated by `xb premake`.
#ifndef GENERATED_VERSION_H_
#define GENERATED_VERSION_H_
#define XE_BUILD_BRANCH "%s"
#define XE_BUILD_COMMIT "%s"
#define XE_BUILD_COMMIT_SHORT "%s"
#define XE_BUILD_DATE __DATE__
#endif // GENERATED_VERSION_H_
''' % (branch_name, commit, commit_short)
with open('build/version.h', 'w') as f:
f.write(contents)
if not has_bin('git'):
return False
return shell_call([
'git',
'rev-parse',
'--is-inside-work-tree',
], throw_on_error=False, stdout_path=os.devnull, stderr_path=os.devnull) == 0
def git_submodule_update():
"""Runs a full recursive git submodule init and update.
Older versions of git do not support 'update --init --recursive'. We could
check and run it on versions that do support it and speed things up a bit.
"""
if True:
shell_call([
'git',
'submodule',
'update',
'--init',
'--recursive',
])
else:
shell_call([
'git',
'submodule',
'init',
])
shell_call([
'git',
'submodule',
'foreach',
'--recursive',
'git',
'submodule',
'init',
])
shell_call([
'git',
'submodule',
'update',
'--recursive',
])
shell_call([
'git',
'submodule',
'update',
'--init',
'--recursive',
])
def get_clang_format_binary():
@ -370,9 +383,9 @@ def run_platform_premake(cc='clang', devenv=None):
if 'VSVERSION' in os.environ:
vs_version = os.environ['VSVERSION']
return run_premake('windows', 'vs' + vs_version)
return run_premake('windows', devenv or ('vs' + vs_version))
else:
return run_premake('linux', devenv == 'codelite' and devenv or 'gmake2', cc)
return run_premake('linux', devenv or 'gmake2', cc)
def run_premake_export_commands():
@ -406,6 +419,43 @@ def get_build_bin_path(args):
return os.path.join(self_path, 'build', 'bin', platform.capitalize(), args['config'].capitalize())
def create_clion_workspace():
"""Creates some basic workspace information inside the .idea directory for first start.
"""
if os.path.exists('.idea'):
# No first start
return False
print('Generating CLion workspace files...')
# Might become easier in the future: https://youtrack.jetbrains.com/issue/CPP-7911
# Set the location of the CMakeLists.txt
os.mkdir('.idea')
with open(os.path.join('.idea', 'misc.xml'), 'w') as f:
f.write("""<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/build">
<contentRoot DIR="$PROJECT_DIR$" />
</component>
</project>
""")
# Set available configurations
# TODO Find a way to trigger a cmake reload
with open(os.path.join('.idea', 'workspace.xml'), 'w') as f:
f.write("""<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CMakeSettings">
<configurations>
<configuration PROFILE_NAME="Checked" CONFIG_NAME="Checked" />
<configuration PROFILE_NAME="Debug" CONFIG_NAME="Debug" />
<configuration PROFILE_NAME="Release" CONFIG_NAME="Release" />
</configurations>
</component>
</project>""")
return True
def discover_commands(subparsers):
"""Looks for all commands and returns a dictionary of them.
In the future commands could be discovered on disk.
@ -491,7 +541,10 @@ class SetupCommand(Command):
# Setup submodules.
print('- git submodule init / update...')
git_submodule_update()
if git_is_repository():
git_submodule_update()
else:
print('WARNING: Git not available or not a repository. Dependencies may be missing.')
print('')
print('- running premake...')
@ -1445,8 +1498,13 @@ class DevenvCommand(Command):
def execute(self, args, pass_args, cwd):
devenv = None
show_reload_prompt = False
if sys.platform == 'win32':
print('Launching Visual Studio...')
elif has_bin('clion') or has_bin('clion.sh'):
print('Launching CLion...')
show_reload_prompt = create_clion_workspace()
devenv = 'cmake'
else:
print('Launching CodeLite...')
devenv = 'codelite'
@ -1457,11 +1515,23 @@ class DevenvCommand(Command):
print('')
print('- launching devenv...')
if show_reload_prompt:
print_box('Please run "File ⇒ ↺ Reload CMake Project" from inside the IDE!')
if sys.platform == 'win32':
shell_call([
'devenv',
'build\\xenia.sln',
])
elif has_bin('clion'):
shell_call([
'clion',
'.',
])
elif has_bin('clion.sh'):
shell_call([
'clion.sh',
'.',
])
else:
shell_call([
'codelite',