diff --git a/.gdbinit b/.gdbinit
new file mode 100644
index 000000000..09b4af30f
--- /dev/null
+++ b/.gdbinit
@@ -0,0 +1,10 @@
+# Ignore HighResolutionTimer custom event
+handle SIG34 nostop noprint
+# Ignore PosixTimer custom event
+handle SIG35 nostop noprint
+# Ignore PosixThread exit event
+handle SIG32 nostop noprint
+# Ignore PosixThread suspend event
+handle SIG36 nostop noprint
+# Ignore PosixThread user callback event
+handle SIG37 nostop noprint
diff --git a/.gitmodules b/.gitmodules
index 6c3ca7278..c8b4ef272 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -64,3 +64,6 @@
 [submodule "third_party/DirectXShaderCompiler"]
 	path = third_party/DirectXShaderCompiler
 	url = https://github.com/microsoft/DirectXShaderCompiler.git
+[submodule "third_party/premake-cmake"]
+	path = third_party/premake-cmake
+	url = https://github.com/Enhex/premake-cmake.git
diff --git a/.travis.yml b/.travis.yml
index 7536f47a3..188278034 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,9 +28,9 @@ addons:
 jobs:
   include:
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 LINT=true
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Debug
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Release
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 LINT=true
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Debug
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Release
 
 git:
   # We handle submodules ourselves in xenia-build setup.
@@ -40,8 +40,10 @@ before_script:
   - export LIBVULKAN_VERSION=1.1.70
   - export CXX=$CXX_COMPILER
   - export CC=$C_COMPILER
+  - export AR=$AR_COMPILER
   # Dump useful info.
   - $CXX --version
+  - $AR_COMPILER --version
   - python3 --version
   - clang-format-9 --version
   - clang-format-9 -style=file -dump-config
diff --git a/docs/building.md b/docs/building.md
index 6aafc521e..0a70fb206 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -91,12 +91,14 @@
 Linux support is extremely experimental and presently incomplete.
 
 The build script uses LLVM/Clang 9. GCC, while it should work in theory, is not easily interchangeable right now.
-[CodeLite](https://codelite.org) is the supported IDE and `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date so check their website.
-Normal building via `xb build` uses Make.
+* Normal building via `xb build` uses Make.
+* [CodeLite](https://codelite.org) is supported. `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date, so check the CodeLite website.
+* Experimental CMake generation is available to facilitate use of other IDEs such as [CLion](https://www.jetbrains.com/clion/). If `clion` is available inside `$PATH`, `xb devenv` will start it. Otherwise `build/CMakeLists.txt` needs to be generated by invoking `xb premake --devenv=cmake` manually, as shown in the example below.
 
 Clang-9 or newer should be available from system repositories on all up-to-date distributions. You will also need some development libraries.
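+
+For example, assuming the `xb` helper script is invoked from the repository root, the experimental CMake flow looks roughly like this:
+
+```bash
+# Generate build/CMakeLists.txt without opening an IDE:
+xb premake --devenv=cmake
+# Or let xb pick the IDE; if clion is found on $PATH it is launched directly:
+xb devenv
+```
+
+The required development libraries are installed through your distribution's package manager.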
To get them on an Ubuntu system: -``` + +```bash sudo apt-get install libgtk-3-dev libpthread-stubs0-dev liblz4-dev libx11-dev libvulkan-dev libsdl2-dev libiberty-dev libunwind-dev libc++-dev libc++abi-dev ``` diff --git a/premake5.lua b/premake5.lua index bf7a1b286..ffa3b78db 100644 --- a/premake5.lua +++ b/premake5.lua @@ -1,5 +1,6 @@ include("tools/build") require("third_party/premake-export-compile-commands/export-compile-commands") +require("third_party/premake-cmake/cmake") location(build_root) targetdir(build_bin) @@ -24,6 +25,9 @@ defines({ "UNICODE", }) +cppdialect("C++17") +symbols("On") + -- TODO(DrChat): Find a way to disable this on other architectures. if ARCH ~= "ppc64" then filter("architecture:x86_64") @@ -44,30 +48,29 @@ filter("kind:StaticLib") filter("configurations:Checked") runtime("Debug") + optimize("Off") defines({ "DEBUG", }) - runtime("Debug") filter({"configurations:Checked", "platforms:Windows"}) buildoptions({ - "/RTCsu", -- Full Run-Time Checks. + "/RTCsu", -- Full Run-Time Checks. + }) +filter({"configurations:Checked", "platforms:Linux"}) + defines({ + "_GLIBCXX_DEBUG", -- libstdc++ debug mode }) filter("configurations:Debug") - runtime("Debug") + runtime("Release") + optimize("Off") defines({ "DEBUG", "_NO_DEBUG_HEAP=1", }) - runtime("Release") -filter({"configurations:Debug", "platforms:Windows"}) - linkoptions({ - "/NODEFAULTLIB:MSVCRTD", - }) - filter({"configurations:Debug", "platforms:Linux"}) - buildoptions({ - "-g", + defines({ + "_GLIBCXX_DEBUG", -- make dbg symbols work on some distros }) filter("configurations:Release") @@ -76,26 +79,18 @@ filter("configurations:Release") "NDEBUG", "_NO_DEBUG_HEAP=1", }) - optimize("speed") + optimize("Speed") inlining("Auto") floatingpoint("Fast") flags({ "LinkTimeOptimization", }) - runtime("Release") -filter({"configurations:Release", "platforms:Windows"}) - linkoptions({ - "/NODEFAULTLIB:MSVCRTD", - }) - filter("platforms:Linux") system("linux") toolset("clang") - cppdialect("C++17") buildoptions({ -- "-mlzcnt", -- (don't) Assume lzcnt is supported. - "`pkg-config --cflags gtk+-x11-3.0`", - "-fno-lto", -- Premake doesn't support LTO on clang + ({os.outputof("pkg-config --cflags gtk+-x11-3.0")})[1], }) links({ "stdc++fs", @@ -105,14 +100,13 @@ filter("platforms:Linux") "rt", }) linkoptions({ - "`pkg-config --libs gtk+-3.0`", + ({os.outputof("pkg-config --libs gtk+-3.0")})[1], }) filter({"platforms:Linux", "kind:*App"}) linkgroups("On") filter({"platforms:Linux", "language:C++", "toolset:gcc"}) - cppdialect("C++17") links({ }) disablewarnings({ @@ -147,13 +141,11 @@ filter({"platforms:Linux", "language:C++", "toolset:clang", "files:*.cc or *.cpp filter("platforms:Windows") system("windows") toolset("msc") - cppdialect("C++17") buildoptions({ - "/MP", -- Multiprocessor compilation. "/utf-8", -- 'build correctly on systems with non-Latin codepages'. -- Mark warnings as severe - "/w14839", -- non-standard use of class 'type' as an argument to a variadic function - "/w14840", -- non-portable use of class 'type' as an argument to a variadic function + "/w14839", -- non-standard use of class 'type' as an argument to a variadic function + "/w14840", -- non-portable use of class 'type' as an argument to a variadic function -- Disable warnings "/wd4100", -- Unreferenced parameters are ok. "/wd4201", -- Nameless struct/unions are ok. @@ -163,10 +155,10 @@ filter("platforms:Windows") "/wd4189", -- 'local variable is initialized but not referenced'. }) flags({ - "NoMinimalRebuild", -- Required for /MP above. 
+ "MultiProcessorCompile", -- Multiprocessor compilation. + "NoMinimalRebuild", -- Required for /MP above. }) - symbols("On") defines({ "_CRT_NONSTDC_NO_DEPRECATE", "_CRT_SECURE_NO_WARNINGS", diff --git a/src/xenia/app/emulator_window.cc b/src/xenia/app/emulator_window.cc index 8c66fa880..576b1e4b6 100644 --- a/src/xenia/app/emulator_window.cc +++ b/src/xenia/app/emulator_window.cc @@ -71,8 +71,8 @@ std::unique_ptr EmulatorWindow::Create(Emulator* emulator) { std::unique_ptr emulator_window(new EmulatorWindow(emulator)); emulator_window->loop()->PostSynchronous([&emulator_window]() { - xe::threading::set_name("Win32 Loop"); - xe::Profiler::ThreadEnter("Win32 Loop"); + xe::threading::set_name("Windowing Loop"); + xe::Profiler::ThreadEnter("Windowing Loop"); if (!emulator_window->Initialize()) { xe::FatalError("Failed to initialize main window"); diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 2110fd0ab..639f79d94 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -8,19 +8,6 @@ project("xenia-app") targetname("xenia_canary") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "dxbc", - "discord-rpc", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-app-discord", "xenia-apu", "xenia-apu-nop", @@ -43,6 +30,21 @@ project("xenia-app") "xenia-ui-vulkan", "xenia-patcher", "xenia-vfs", + }) + links({ + "aes_128", + "capstone", + "fmt", + "dxbc", + "discord-rpc", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc index 16d6e66a8..e5cdb2561 100644 --- a/src/xenia/apu/xma_context.cc +++ b/src/xenia/apu/xma_context.cc @@ -302,6 +302,7 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) { // No available data. 
if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) { + data->output_buffer_valid = 0; return; } diff --git a/src/xenia/apu/xma_decoder.cc b/src/xenia/apu/xma_decoder.cc index dd7d30817..ee1c9aa45 100644 --- a/src/xenia/apu/xma_decoder.cc +++ b/src/xenia/apu/xma_decoder.cc @@ -144,7 +144,7 @@ X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) { WorkerThreadMain(); return 0; })); - worker_thread_->set_name("XMA Decoder Worker"); + worker_thread_->set_name("XMA Decoder"); worker_thread_->set_can_debugger_suspend(true); worker_thread_->Create(); diff --git a/src/xenia/base/debugging_posix.cc b/src/xenia/base/debugging_posix.cc index a9c08ed60..3b73ab12a 100644 --- a/src/xenia/base/debugging_posix.cc +++ b/src/xenia/base/debugging_posix.cc @@ -9,21 +9,51 @@ #include "xenia/base/debugging.h" -#include +#include #include +#include +#include +#include +#include #include "xenia/base/string_buffer.h" namespace xe { namespace debugging { -bool IsDebuggerAttached() { return false; } -void Break() { raise(SIGTRAP); } +bool IsDebuggerAttached() { + std::ifstream proc_status_stream("/proc/self/status"); + if (!proc_status_stream.is_open()) { + return false; + } + std::string line; + while (std::getline(proc_status_stream, line)) { + std::istringstream line_stream(line); + std::string key; + line_stream >> key; + if (key == "TracerPid:") { + uint32_t tracer_pid; + line_stream >> tracer_pid; + return tracer_pid != 0; + } + } + return false; +} + +void Break() { + static std::once_flag flag; + std::call_once(flag, []() { + // Install handler for sigtrap only once + std::signal(SIGTRAP, [](int) { + // Forward signal to default handler after being caught + std::signal(SIGTRAP, SIG_DFL); + }); + }); + std::raise(SIGTRAP); +} namespace internal { -void DebugPrint(const char* s) { - // TODO: proper implementation. 
-} +void DebugPrint(const char* s) { std::clog << s << std::endl; } } // namespace internal } // namespace debugging diff --git a/src/xenia/base/logging.cc b/src/xenia/base/logging.cc index aa688c87e..8584892d4 100644 --- a/src/xenia/base/logging.cc +++ b/src/xenia/base/logging.cc @@ -93,7 +93,7 @@ class Logger { write_thread_ = xe::threading::Thread::Create({}, [this]() { WriteThread(); }); - write_thread_->set_name("xe::FileLogSink Writer"); + write_thread_->set_name("Logging Writer"); } ~Logger() { diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 33083a831..9b98175c5 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -76,14 +76,12 @@ #endif // XE_PLATFORM_MAC #if XE_COMPILER_MSVC -#define XEPACKEDSTRUCT(name, value) \ - __pragma(pack(push, 1)) struct name##_s value __pragma(pack(pop)); \ - typedef struct name##_s name; +#define XEPACKEDSTRUCT(name, value) \ + __pragma(pack(push, 1)) struct name value __pragma(pack(pop)); #define XEPACKEDSTRUCTANONYMOUS(value) \ __pragma(pack(push, 1)) struct value __pragma(pack(pop)); -#define XEPACKEDUNION(name, value) \ - __pragma(pack(push, 1)) union name##_s value __pragma(pack(pop)); \ - typedef union name##_s name; +#define XEPACKEDUNION(name, value) \ + __pragma(pack(push, 1)) union name value __pragma(pack(pop)); #else #define XEPACKEDSTRUCT(name, value) struct __attribute__((packed)) name value; #define XEPACKEDSTRUCTANONYMOUS(value) struct __attribute__((packed)) value; diff --git a/src/xenia/base/string_util.h b/src/xenia/base/string_util.h index f1499bb5f..adb2012af 100644 --- a/src/xenia/base/string_util.h +++ b/src/xenia/base/string_util.h @@ -10,11 +10,15 @@ #ifndef XENIA_BASE_STRING_UTIL_H_ #define XENIA_BASE_STRING_UTIL_H_ +#include #include +#include +#include #include #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/assert.h" +#include "xenia/base/memory.h" #include "xenia/base/platform.h" #include "xenia/base/string.h" #include "xenia/base/vec128.h" @@ -30,6 +34,40 @@ namespace xe { namespace string_util { +inline size_t copy_truncating(char* dest, const std::string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + std::memcpy(dest, source.data(), chars_copied); + dest[chars_copied] = '\0'; + return chars_copied; +} + +inline size_t copy_truncating(char16_t* dest, const std::u16string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + std::memcpy(dest, source.data(), chars_copied * sizeof(char16_t)); + dest[chars_copied] = u'\0'; + return chars_copied; +} + +inline size_t copy_and_swap_truncating(char16_t* dest, + const std::u16string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + xe::copy_and_swap(dest, source.data(), chars_copied); + dest[chars_copied] = u'\0'; + return chars_copied; +} + inline std::string to_hex_string(uint32_t value) { return fmt::format("{:08X}", value); } diff --git a/src/xenia/base/testing/threading_test.cc b/src/xenia/base/testing/threading_test.cc new file mode 100644 index 000000000..8d5f74449 --- /dev/null +++ b/src/xenia/base/testing/threading_test.cc @@ -0,0 +1,967 @@ +/** +****************************************************************************** +* Xenia : Xbox 360 Emulator 
Research Project * +****************************************************************************** +* Copyright 2018 Ben Vanik. All rights reserved. * +* Released under the BSD license - see LICENSE in the root for more details. * +****************************************************************************** +*/ + +#include + +#include "xenia/base/threading.h" + +#include "third_party/catch/include/catch.hpp" + +namespace xe { +namespace base { +namespace test { +using namespace threading; +using namespace std::chrono_literals; + +TEST_CASE("Fence") { + std::unique_ptr pFence; + std::unique_ptr pTimer; + + // Signal without wait + pFence = std::make_unique(); + pFence->Signal(); + + // Signal once and wait + pFence = std::make_unique(); + pFence->Signal(); + pFence->Wait(); + + // Signal twice and wait + pFence = std::make_unique(); + pFence->Signal(); + pFence->Signal(); + pFence->Wait(); + + // Signal and wait two times + pFence = std::make_unique(); + pFence->Signal(); + pFence->Wait(); + pFence->Signal(); + pFence->Wait(); + + // Test to synchronize multiple threads + std::atomic started(0); + std::atomic finished(0); + pFence = std::make_unique(); + auto func = [&pFence, &started, &finished] { + started.fetch_add(1); + pFence->Wait(); + finished.fetch_add(1); + }; + + auto threads = std::array({ + std::thread(func), + std::thread(func), + std::thread(func), + std::thread(func), + std::thread(func), + }); + + Sleep(100ms); + REQUIRE(started.load() == threads.size()); + REQUIRE(finished.load() == 0); + + pFence->Signal(); + + for (auto& t : threads) t.join(); + REQUIRE(finished.load() == threads.size()); +} // namespace test + +TEST_CASE("Get number of logical processors") { + auto count = std::thread::hardware_concurrency(); + REQUIRE(logical_processor_count() == count); + REQUIRE(logical_processor_count() == count); + REQUIRE(logical_processor_count() == count); +} + +TEST_CASE("Enable process to set thread affinity") { + EnableAffinityConfiguration(); +} + +TEST_CASE("Yield Current Thread", "MaybeYield") { + // Run to see if there are any errors + MaybeYield(); +} + +TEST_CASE("Sync with Memory Barrier", "SyncMemory") { + // Run to see if there are any errors + SyncMemory(); +} + +TEST_CASE("Sleep Current Thread", "Sleep") { + auto wait_time = 50ms; + auto start = std::chrono::steady_clock::now(); + Sleep(wait_time); + auto duration = std::chrono::steady_clock::now() - start; + REQUIRE(duration >= wait_time); +} + +TEST_CASE("Sleep Current Thread in Alertable State", "Sleep") { + auto wait_time = 50ms; + auto start = std::chrono::steady_clock::now(); + auto result = threading::AlertableSleep(wait_time); + auto duration = std::chrono::steady_clock::now() - start; + REQUIRE(duration >= wait_time); + REQUIRE(result == threading::SleepResult::kSuccess); + + // TODO(bwrsandman): Test a Thread to return kAlerted. 
+ // Need callback to call extended I/O function (ReadFileEx or WriteFileEx) +} + +TEST_CASE("TlsHandle") { + // Test Allocate + auto handle = threading::AllocateTlsHandle(); + + // Test Free + REQUIRE(threading::FreeTlsHandle(handle)); + REQUIRE(!threading::FreeTlsHandle(handle)); + REQUIRE(!threading::FreeTlsHandle(threading::kInvalidTlsHandle)); + + // Test setting values + handle = threading::AllocateTlsHandle(); + REQUIRE(threading::GetTlsValue(handle) == 0); + uint32_t value = 0xDEADBEEF; + threading::SetTlsValue(handle, reinterpret_cast(&value)); + auto p_received_value = threading::GetTlsValue(handle); + REQUIRE(threading::GetTlsValue(handle) != 0); + auto received_value = *reinterpret_cast(p_received_value); + REQUIRE(received_value == value); + + uintptr_t non_thread_local_value = 0; + auto thread = Thread::Create({}, [&non_thread_local_value, &handle] { + non_thread_local_value = threading::GetTlsValue(handle); + }); + + auto result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(non_thread_local_value == 0); + + // Cleanup + REQUIRE(threading::FreeTlsHandle(handle)); +} + +TEST_CASE("HighResolutionTimer") { + // The wait time is 500ms with an interval of 50ms + // Smaller values are not as precise and fail the test + const auto wait_time = 500ms; + + // Time the actual sleep duration + { + const auto interval = 50ms; + std::atomic counter; + auto start = std::chrono::steady_clock::now(); + auto cb = [&counter] { ++counter; }; + auto pTimer = HighResolutionTimer::CreateRepeating(interval, cb); + Sleep(wait_time); + pTimer.reset(); + auto duration = std::chrono::steady_clock::now() - start; + + // Should have run as many times as wait_time / timer_interval plus or + // minus 1 due to imprecision of Sleep + REQUIRE(duration.count() >= wait_time.count()); + auto ratio = static_cast(duration / interval); + REQUIRE(counter >= ratio - 1); + REQUIRE(counter <= ratio + 1); + } + + // Test concurrent timers + { + const auto interval1 = 100ms; + const auto interval2 = 200ms; + std::atomic counter1; + std::atomic counter2; + auto start = std::chrono::steady_clock::now(); + auto cb1 = [&counter1] { ++counter1; }; + auto cb2 = [&counter2] { ++counter2; }; + auto pTimer1 = HighResolutionTimer::CreateRepeating(interval1, cb1); + auto pTimer2 = HighResolutionTimer::CreateRepeating(interval2, cb2); + Sleep(wait_time); + pTimer1.reset(); + pTimer2.reset(); + auto duration = std::chrono::steady_clock::now() - start; + + // Should have run as many times as wait_time / timer_interval plus or + // minus 1 due to imprecision of Sleep + REQUIRE(duration.count() >= wait_time.count()); + auto ratio1 = static_cast(duration / interval1); + auto ratio2 = static_cast(duration / interval2); + REQUIRE(counter1 >= ratio1 - 1); + REQUIRE(counter1 <= ratio1 + 1); + REQUIRE(counter2 >= ratio2 - 1); + REQUIRE(counter2 <= ratio2 + 1); + } + + // TODO(bwrsandman): Check on which thread callbacks are executed when + // spawned from differing threads +} + +TEST_CASE("Wait on Multiple Handles", "Wait") { + auto mutant = Mutant::Create(true); + auto semaphore = Semaphore::Create(10, 10); + auto event_ = Event::CreateManualResetEvent(false); + auto thread = Thread::Create({}, [&mutant, &semaphore, &event_] { + event_->Set(); + Wait(mutant.get(), false, 25ms); + semaphore->Release(1, nullptr); + Wait(mutant.get(), false, 25ms); + mutant->Release(); + }); + + std::vector handles = { + mutant.get(), + semaphore.get(), + event_.get(), + thread.get(), + }; + + auto any_result = 
WaitAny(handles, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + + auto all_result = WaitAll(handles, false, 100ms); + REQUIRE(all_result == WaitResult::kSuccess); +} + +TEST_CASE("Signal and Wait") { + WaitResult result; + auto mutant = Mutant::Create(true); + auto event_ = Event::CreateAutoResetEvent(false); + auto thread = Thread::Create({}, [&mutant, &event_] { + Wait(mutant.get(), false); + event_->Set(); + }); + result = Wait(event_.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + result = SignalAndWait(mutant.get(), event_.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); +} + +TEST_CASE("Wait on Event", "Event") { + auto evt = Event::CreateAutoResetEvent(false); + WaitResult result; + + // Call wait on unset Event + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Call wait on set Event + evt->Set(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call wait on now consumed Event + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); +} + +TEST_CASE("Reset Event", "Event") { + auto evt = Event::CreateAutoResetEvent(false); + WaitResult result; + + // Call wait on reset Event + evt->Set(); + evt->Reset(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Test resetting the unset event + evt->Reset(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Test setting the reset event + evt->Set(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); +} + +TEST_CASE("Wait on Multiple Events", "Event") { + auto events = std::array, 4>{ + Event::CreateAutoResetEvent(false), + Event::CreateAutoResetEvent(false), + Event::CreateAutoResetEvent(false), + Event::CreateManualResetEvent(false), + }; + + std::array order = {0}; + std::atomic_uint index(0); + auto sign_in = [&order, &index](uint32_t id) { + auto i = index.fetch_add(1, std::memory_order::memory_order_relaxed); + order[i] = static_cast('0' + id); + }; + + auto threads = std::array{ + std::thread([&events, &sign_in] { + auto res = WaitAll({events[1].get(), events[3].get()}, false, 100ms); + if (res == WaitResult::kSuccess) { + sign_in(1); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAny({events[0].get(), events[2].get()}, false, 100ms); + if (res.first == WaitResult::kSuccess) { + sign_in(2); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAll({events[0].get(), events[2].get(), events[3].get()}, + false, 100ms); + if (res == WaitResult::kSuccess) { + sign_in(3); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAny({events[1].get(), events[3].get()}, false, 100ms); + if (res.first == WaitResult::kSuccess) { + sign_in(4); + } + }), + }; + + Sleep(10ms); + events[3]->Set(); // Signals thread id=4 and stays on for 1 and 3 + Sleep(10ms); + events[1]->Set(); // Signals thread id=1 + Sleep(10ms); + events[0]->Set(); // Signals thread id=2 + Sleep(10ms); + events[2]->Set(); // Partial signals thread id=3 + events[0]->Set(); // Signals thread id=3 + + for (auto& t : threads) { + t.join(); + } + + INFO(order.data()); + REQUIRE(order[0] == '4'); + // TODO(bwrsandman): Order is not always maintained on linux + // REQUIRE(order[1] == '1'); + // REQUIRE(order[2] == '2'); + // REQUIRE(order[3] == '3'); +} + 
+TEST_CASE("Wait on Semaphore", "Semaphore") { + WaitResult result; + std::unique_ptr sem; + int previous_count = 0; + + // Wait on semaphore with no room + sem = Semaphore::Create(0, 5); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kTimeout); + + // Add room in semaphore + REQUIRE(sem->Release(2, &previous_count)); + REQUIRE(previous_count == 0); + REQUIRE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == 2); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == 2); + + // Set semaphore over maximum_count + sem = Semaphore::Create(5, 5); + previous_count = -1; + REQUIRE_FALSE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + sem = Semaphore::Create(0, 5); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + + // Test invalid Release parameters + REQUIRE_FALSE(sem->Release(0, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(-1, &previous_count)); + REQUIRE(previous_count == -1); + + // Wait on fully available semaphore + sem = Semaphore::Create(5, 5); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kTimeout); + + // Semaphore between threads + sem = Semaphore::Create(5, 5); + Sleep(10ms); + // Occupy the semaphore with 5 threads + auto func = [&sem] { + auto res = Wait(sem.get(), false, 100ms); + Sleep(500ms); + if (res == WaitResult::kSuccess) { + sem->Release(1, nullptr); + } + }; + auto threads = std::array{ + std::thread(func), std::thread(func), std::thread(func), + std::thread(func), std::thread(func), + }; + // Give threads time to acquire semaphore + Sleep(10ms); + // Attempt to acquire full semaphore with current (6th) thread + result = Wait(sem.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + // Give threads time to release semaphore + for (auto& t : threads) { + t.join(); + } + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + sem->Release(1, &previous_count); + REQUIRE(previous_count == 4); + + // Test invalid construction parameters + // These are invalid according to documentation + // TODO(bwrsandman): Many of these invalid invocations succeed + sem = Semaphore::Create(-1, 5); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(10, 5); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(0, 0); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(0, -1); + // REQUIRE(sem.get() == nullptr); +} + +TEST_CASE("Wait on Multiple Semaphores", "Semaphore") { + WaitResult all_result; + std::pair any_result; + int previous_count; + std::unique_ptr sem0, sem1; + + // Test Wait all which should fail + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(5, 5); + all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + 
previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE_FALSE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == -1); + + // Test Wait all again which should succeed + sem0 = Semaphore::Create(1, 5); + sem1 = Semaphore::Create(5, 5); + all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kSuccess); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 4); + + // Test Wait Any which should fail + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(0, 5); + any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + + // Test Wait Any which should succeed + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(5, 5); + any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 4); +} + +TEST_CASE("Wait on Mutant", "Mutant") { + WaitResult result; + std::unique_ptr mut; + + // Release on initially owned mutant + mut = Mutant::Create(true); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Release on initially not-owned mutant + mut = Mutant::Create(false); + REQUIRE_FALSE(mut->Release()); + + // Wait on initially owned mutant + mut = Mutant::Create(true); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Wait on initially not owned mutant + mut = Mutant::Create(false); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Multiple waits (or locks) + mut = Mutant::Create(false); + for (int i = 0; i < 10; ++i) { + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + } + for (int i = 0; i < 10; ++i) { + REQUIRE(mut->Release()); + } + REQUIRE_FALSE(mut->Release()); + + // Test mutants on other threads + auto thread1 = std::thread([&mut] { + Sleep(5ms); + mut = Mutant::Create(true); + Sleep(100ms); + mut->Release(); + }); + Sleep(10ms); + REQUIRE_FALSE(mut->Release()); + Sleep(10ms); + result = Wait(mut.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + thread1.join(); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); +} + +TEST_CASE("Wait on Multiple Mutants", "Mutant") { + WaitResult all_result; + std::pair any_result; + std::unique_ptr mut0, mut1; + + // Test which should fail for WaitAll and WaitAny + auto thread0 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(true); + mut1 = Mutant::Create(true); + Sleep(50ms); + mut0->Release(); + mut1->Release(); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + 
REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + thread0.join(); + + // Test which should fail for WaitAll but not WaitAny + auto thread1 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(true); + mut1 = Mutant::Create(false); + Sleep(50ms); + mut0->Release(); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + REQUIRE_FALSE(mut0->Release()); + REQUIRE(mut1->Release()); + thread1.join(); + + // Test which should pass for WaitAll and WaitAny + auto thread2 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(false); + mut1 = Mutant::Create(false); + Sleep(50ms); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kSuccess); + REQUIRE(mut0->Release()); + REQUIRE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + REQUIRE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + thread2.join(); +} + +TEST_CASE("Wait on Timer", "Timer") { + WaitResult result; + std::unique_ptr timer; + + // Test Manual Reset + timer = Timer::CreateManualResetTimer(); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(timer->SetOnce(1ms)); // Signals it + result = Wait(timer.get(), false, 2ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); // Did not reset + + // Test Synchronization + timer = Timer::CreateSynchronizationTimer(); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(timer->SetOnce(1ms)); // Signals it + result = Wait(timer.get(), false, 2ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); // Did reset + + // TODO(bwrsandman): This test unexpectedly fails under windows + // Test long due time + // timer = Timer::CreateSynchronizationTimer(); + // REQUIRE(timer->SetOnce(10s)); + // result = Wait(timer.get(), false, 10ms); // Still signals under windows + // REQUIRE(result == WaitResult::kTimeout); + + // Test Repeating + REQUIRE(timer->SetRepeating(1ms, 10ms)); + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + INFO(i); + REQUIRE(result == WaitResult::kSuccess); + } + MaybeYield(); + Sleep(10ms); // Skip a few events + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); + } + // Cancel it + timer->Cancel(); + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + MaybeYield(); + Sleep(10ms); // Skip a few events + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + // Cancel with SetOnce + REQUIRE(timer->SetRepeating(1ms, 10ms)); + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); + } + REQUIRE(timer->SetOnce(1ms)); + 
result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); // Signal from Set Once + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); // No more signals from repeating +} + +TEST_CASE("Wait on Multiple Timers", "Timer") { + WaitResult all_result; + std::pair any_result; + + auto timer0 = Timer::CreateSynchronizationTimer(); + auto timer1 = Timer::CreateManualResetTimer(); + + // None signaled + all_result = WaitAll({timer0.get(), timer1.get()}, false, 1ms); + REQUIRE(all_result == WaitResult::kTimeout); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 1ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + + // Some signaled + REQUIRE(timer1->SetOnce(1ms)); + all_result = WaitAll({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(all_result == WaitResult::kTimeout); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + + // All signaled + REQUIRE(timer0->SetOnce(1ms)); + all_result = WaitAll({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(all_result == WaitResult::kSuccess); + REQUIRE(timer0->SetOnce(1ms)); + Sleep(1ms); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + + // Check that timer0 reset + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); +} + +TEST_CASE("Create and Trigger Timer Callbacks", "Timer") { + // TODO(bwrsandman): Check which thread performs callback and timing of + // callback + REQUIRE(true); +} + +TEST_CASE("Set and Test Current Thread ID", "Thread") { + // System ID + auto system_id = current_thread_system_id(); + REQUIRE(system_id > 0); + + // Thread ID + auto thread_id = current_thread_id(); + REQUIRE(thread_id == system_id); + + // Set a new thread id + const uint32_t new_thread_id = 0xDEADBEEF; + set_current_thread_id(new_thread_id); + REQUIRE(current_thread_id() == new_thread_id); + + // Set back original thread id of system + set_current_thread_id(std::numeric_limits::max()); + REQUIRE(current_thread_id() == system_id); + + // TODO(bwrsandman): Test on Thread object +} + +TEST_CASE("Set and Test Current Thread Name", "Thread") { + auto current_thread = Thread::GetCurrentThread(); + REQUIRE(current_thread); + auto old_thread_name = current_thread->name(); + + std::string new_thread_name = "Threading Test"; + REQUIRE_NOTHROW(set_name(new_thread_name)); + + // Restore the old catch.hpp thread name + REQUIRE_NOTHROW(set_name(old_thread_name)); +} + +TEST_CASE("Create and Run Thread", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + auto func = [] { Sleep(20ms); }; + + // Create most basic case of thread + thread = Thread::Create(params, func); + REQUIRE(thread->native_handle() != nullptr); + REQUIRE_NOTHROW(thread->affinity_mask()); + REQUIRE(thread->name().empty()); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Add thread name + std::string new_name = "Test thread name"; + thread = Thread::Create(params, func); + auto name = thread->name(); + INFO(name.c_str()); + REQUIRE(name.empty()); + thread->set_name(new_name); + REQUIRE(thread->name() == new_name); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == 
WaitResult::kSuccess); + + // Use Terminate to end an infinitely looping thread + thread = Thread::Create(params, [] { + while (true) { + Sleep(1ms); + } + }); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + thread->Terminate(-1); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call Exit from inside an infinitely looping thread + thread = Thread::Create(params, [] { + while (true) { + Thread::Exit(-1); + } + }); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call timeout wait on self + result = Wait(Thread::GetCurrentThread(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + params.stack_size = 16 * 1024; + thread = Thread::Create(params, [] { + while (true) { + Thread::Exit(-1); + } + }); + REQUIRE(thread != nullptr); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // TODO(bwrsandman): Test with different priorities + // TODO(bwrsandman): Test setting and getting thread affinity +} + +TEST_CASE("Test Suspending Thread", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + auto func = [] { Sleep(20ms); }; + + // Create initially suspended + params.create_suspended = true; + thread = threading::Thread::Create(params, func); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + params.create_suspended = false; + + // Create and then suspend + thread = threading::Thread::Create(params, func); + thread->Suspend(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + + // Test recursive suspend + thread = threading::Thread::Create(params, func); + thread->Suspend(); + thread->Suspend(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + + // Test suspend count + uint32_t suspend_count = 0; + thread = threading::Thread::Create(params, func); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 0); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 1); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 2); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 3); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 2); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 1); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 0); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 1); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); +} + +TEST_CASE("Test Thread QueueUserCallback", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + std::atomic_int order; + int is_modified; + int has_finished; + auto callback = [&is_modified, &order] { + is_modified = std::atomic_fetch_add_explicit( + &order, 1, 
std::memory_order::memory_order_relaxed); + }; + + // Without alertable + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&has_finished, &order] { + // Not using Alertable so callback is not registered + Sleep(90ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 50ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(is_modified == -1); + thread->QueueUserCallback(callback); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == -1); + REQUIRE(has_finished == 0); + + // With alertable + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&has_finished, &order] { + // Using Alertable so callback is registered + AlertableSleep(90ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 50ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(is_modified == -1); + thread->QueueUserCallback(callback); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == 0); + REQUIRE(has_finished == 1); + + // Test Exit command with QueueUserCallback + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&is_modified, &has_finished, &order] { + is_modified = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + // Using Alertable so callback is registered + AlertableSleep(200ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kTimeout); + thread->QueueUserCallback([] { Thread::Exit(0); }); + result = Wait(thread.get(), true, 500ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == 0); + REQUIRE(has_finished == -1); + + // TODO(bwrsandman): Test alertable wait returning kUserCallback by using IO + // callbacks. +} + +} // namespace test +} // namespace base +} // namespace xe diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index fef37dd06..776a158e0 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -24,29 +24,56 @@ #include #include +#include "xenia/base/assert.h" + namespace xe { namespace threading { +// This is more like an Event with self-reset when returning from Wait() class Fence { public: - Fence() : signaled_(false) {} + Fence() : signal_state_(0) {} + void Signal() { std::unique_lock lock(mutex_); - signaled_.store(true); + signal_state_ |= SIGMASK_; cond_.notify_all(); } + + // Wait for the Fence to be signaled. Clears the signal on return. void Wait() { std::unique_lock lock(mutex_); - while (!signaled_.load()) { + assert_true((signal_state_ & ~SIGMASK_) < (SIGMASK_ - 1) && + "Too many threads?"); + + // keep local copy to minimize loads + auto signal_state = ++signal_state_; + for (; !(signal_state & SIGMASK_); signal_state = signal_state_) { cond_.wait(lock); } - signaled_.store(false); + + // We can't just clear the signal as other threads may not have read it yet + assert_true((signal_state & ~SIGMASK_) > 0); // wait_count > 0 + if (signal_state == (1 | SIGMASK_)) { // wait_count == 1 + // Last one out turn off the lights + signal_state_ = 0; + } else { + // Oops, another thread is still waiting, set the new count and keep the + // signal. 
+ signal_state_ = --signal_state; + } } private: + using state_t_ = uint_fast32_t; + static constexpr state_t_ SIGMASK_ = state_t_(1) + << (sizeof(state_t_) * 8 - 1); + std::mutex mutex_; std::condition_variable cond_; - std::atomic signaled_; + // Use the highest bit (sign bit) as the signal flag and the rest to count + // waiting threads. + volatile state_t_ signal_state_; }; // Returns the total number of logical processors in the host system. @@ -308,12 +335,12 @@ class Timer : public WaitHandle { std::chrono::milliseconds period, std::function opt_callback = nullptr) = 0; template - void SetRepeating(std::chrono::nanoseconds due_time, + bool SetRepeating(std::chrono::nanoseconds due_time, std::chrono::duration period, std::function opt_callback = nullptr) { - SetRepeating(due_time, - std::chrono::duration_cast(period), - std::move(opt_callback)); + return SetRepeating( + due_time, std::chrono::duration_cast(period), + std::move(opt_callback)); } // Stops the timer before it can be set to the signaled state and cancels @@ -391,7 +418,7 @@ class Thread : public WaitHandle { // Decrements a thread's suspend count. When the suspend count is decremented // to zero, the execution of the thread is resumed. - virtual bool Resume(uint32_t* out_new_suspend_count = nullptr) = 0; + virtual bool Resume(uint32_t* out_previous_suspend_count = nullptr) = 0; // Suspends the specified thread. virtual bool Suspend(uint32_t* out_previous_suspend_count = nullptr) = 0; diff --git a/src/xenia/base/threading_posix.cc b/src/xenia/base/threading_posix.cc index 28597e608..9e39b17a5 100644 --- a/src/xenia/base/threading_posix.cc +++ b/src/xenia/base/threading_posix.cc @@ -13,16 +13,64 @@ #include "xenia/base/logging.h" #include +#include #include #include #include #include -#include #include +#include +#include namespace xe { namespace threading { +template +inline timespec DurationToTimeSpec( + std::chrono::duration<_Rep, _Period> duration) { + auto nanoseconds = + std::chrono::duration_cast(duration); + auto div = ldiv(nanoseconds.count(), 1000000000L); + return timespec{div.quot, div.rem}; +} + +// Thread interruption is done using user-defined signals +// This implementation uses the SIGRTMAX - SIGRTMIN to signal to a thread +// gdb tip, for SIG = SIGRTMIN + SignalType : handle SIG nostop +// lldb tip, for SIG = SIGRTMIN + SignalType : process handle SIG -s false +enum class SignalType { + kHighResolutionTimer, + kTimer, + kThreadSuspend, + kThreadUserCallback, + k_Count +}; + +int GetSystemSignal(SignalType num) { + auto result = SIGRTMIN + static_cast(num); + assert_true(result < SIGRTMAX); + return result; +} + +SignalType GetSystemSignalType(int num) { + return static_cast(num - SIGRTMIN); +} + +thread_local std::array(SignalType::k_Count)> + signal_handler_installed = {}; + +static void signal_handler(int signal, siginfo_t* info, void* context); + +void install_signal_handler(SignalType type) { + if (signal_handler_installed[static_cast(type)]) return; + struct sigaction action {}; + action.sa_flags = SA_SIGINFO; + action.sa_sigaction = signal_handler; + sigemptyset(&action.sa_mask); + if (sigaction(GetSystemSignal(type), &action, nullptr) == -1) + signal_handler_installed[static_cast(type)] = true; +} + // TODO(dougvj) void EnableAffinityConfiguration() {} @@ -47,55 +95,81 @@ void MaybeYield() { void SyncMemory() { __sync_synchronize(); } void Sleep(std::chrono::microseconds duration) { - timespec rqtp = {time_t(duration.count() / 1000000), - time_t(duration.count() % 1000)}; - nanosleep(&rqtp, 
nullptr); - // TODO(benvanik): spin while rmtp >0? + timespec rqtp = DurationToTimeSpec(duration); + timespec rmtp = {}; + auto p_rqtp = &rqtp; + auto p_rmtp = &rmtp; + int ret = 0; + do { + ret = nanosleep(p_rqtp, p_rmtp); + // Swap requested for remaining in case of signal interruption + // in which case, we start sleeping again for the remainder + std::swap(p_rqtp, p_rmtp); + } while (ret == -1 && errno == EINTR); } -// TODO(dougvj) Not sure how to implement the equivalent of this on POSIX. +// TODO(bwrsandman) Implement by allowing alert interrupts from IO operations +thread_local bool alertable_state_ = false; SleepResult AlertableSleep(std::chrono::microseconds duration) { - sleep(duration.count() / 1000); + alertable_state_ = true; + Sleep(duration); + alertable_state_ = false; return SleepResult::kSuccess; } -// TODO(dougvj) We can probably wrap this with pthread_key_t but the type of -// TlsHandle probably needs to be refactored TlsHandle AllocateTlsHandle() { - assert_always(); - return 0; + auto key = static_cast(-1); + auto res = pthread_key_create(&key, nullptr); + assert_zero(res); + assert_true(key != static_cast(-1)); + return static_cast(key); } -bool FreeTlsHandle(TlsHandle handle) { return true; } +bool FreeTlsHandle(TlsHandle handle) { + return pthread_key_delete(static_cast(handle)) == 0; +} uintptr_t GetTlsValue(TlsHandle handle) { - assert_always(); - return 0; + return reinterpret_cast( + pthread_getspecific(static_cast(handle))); } bool SetTlsValue(TlsHandle handle, uintptr_t value) { - assert_always(); - return false; + return pthread_setspecific(static_cast(handle), + reinterpret_cast(value)) == 0; } -// TODO(dougvj) class PosixHighResolutionTimer : public HighResolutionTimer { public: - PosixHighResolutionTimer(std::function callback) - : callback_(callback) {} - ~PosixHighResolutionTimer() override {} + explicit PosixHighResolutionTimer(std::function callback) + : callback_(std::move(callback)), timer_(nullptr) {} + ~PosixHighResolutionTimer() override { + if (timer_) timer_delete(timer_); + } bool Initialize(std::chrono::milliseconds period) { - assert_always(); - return false; + // Create timer + sigevent sev{}; + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = GetSystemSignal(SignalType::kHighResolutionTimer); + sev.sigev_value.sival_ptr = (void*)&callback_; + if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false; + + // Start timer + itimerspec its{}; + its.it_value = DurationToTimeSpec(period); + its.it_interval = its.it_value; + return timer_settime(timer_, 0, &its, nullptr) != -1; } private: std::function callback_; + timer_t timer_; }; std::unique_ptr HighResolutionTimer::CreateRepeating( std::chrono::milliseconds period, std::function callback) { + install_signal_handler(SignalType::kHighResolutionTimer); auto timer = std::make_unique(std::move(callback)); if (!timer->Initialize(period)) { return nullptr; @@ -103,209 +177,669 @@ std::unique_ptr HighResolutionTimer::CreateRepeating( return std::unique_ptr(timer.release()); } -// TODO(dougvj) There really is no native POSIX handle for a single wait/signal -// construct pthreads is at a lower level with more handles for such a mechanism -// This simple wrapper class could function as our handle, but probably needs -// some more functionality -class PosixCondition { +class PosixConditionBase { public: - PosixCondition() : signal_(false) { - pthread_mutex_init(&mutex_, NULL); - pthread_cond_init(&cond_, NULL); + virtual bool Signal() = 0; + + WaitResult Wait(std::chrono::milliseconds 
timeout) { + bool executed; + auto predicate = [this] { return this->signaled(); }; + auto lock = std::unique_lock(mutex_); + if (predicate()) { + executed = true; + } else { + if (timeout == std::chrono::milliseconds::max()) { + cond_.wait(lock, predicate); + executed = true; // Did not time out; + } else { + executed = cond_.wait_for(lock, timeout, predicate); + } + } + if (executed) { + post_execution(); + return WaitResult::kSuccess; + } else { + return WaitResult::kTimeout; + } } - ~PosixCondition() { - pthread_mutex_destroy(&mutex_); - pthread_cond_destroy(&cond_); + static std::pair WaitMultiple( + std::vector&& handles, bool wait_all, + std::chrono::milliseconds timeout) { + using iter_t = std::vector::const_iterator; + bool executed; + auto predicate = [](auto h) { return h->signaled(); }; + + // Construct a condition for all or any depending on wait_all + auto operation = wait_all ? std::all_of + : std::any_of; + auto aggregate = [&handles, operation, predicate] { + return operation(handles.cbegin(), handles.cend(), predicate); + }; + + // TODO(bwrsandman, Triang3l) This is controversial, see issue #1677 + // This will probably cause a deadlock on the next thread doing any waiting + // if the thread is suspended between locking and waiting + std::unique_lock lock(PosixConditionBase::mutex_); + + // Check if the aggregate lambda (all or any) is already satisfied + if (aggregate()) { + executed = true; + } else { + // If the aggregate is not yet satisfied and the timeout is infinite, + // wait without timeout. + if (timeout == std::chrono::milliseconds::max()) { + PosixConditionBase::cond_.wait(lock, aggregate); + executed = true; + } else { + // Wait with timeout. + executed = PosixConditionBase::cond_.wait_for(lock, timeout, aggregate); + } + } + if (executed) { + auto first_signaled = std::numeric_limits::max(); + for (auto i = 0u; i < handles.size(); ++i) { + if (handles[i]->signaled()) { + if (first_signaled > i) { + first_signaled = i; + } + handles[i]->post_execution(); + if (!wait_all) break; + } + } + return std::make_pair(WaitResult::kSuccess, first_signaled); + } else { + return std::make_pair(WaitResult::kTimeout, 0); + } } - void Signal() { - pthread_mutex_lock(&mutex_); + virtual void* native_handle() const { return cond_.native_handle(); } + + protected: + inline virtual bool signaled() const = 0; + inline virtual void post_execution() = 0; + static std::condition_variable cond_; + static std::mutex mutex_; +}; + +std::condition_variable PosixConditionBase::cond_; +std::mutex PosixConditionBase::mutex_; + +// There really is no native POSIX handle for a single wait/signal construct +// pthreads is at a lower level with more handles for such a mechanism. +// This simple wrapper class functions as our handle and uses conditional +// variables for waits and signals. 
+template +class PosixCondition {}; + +template <> +class PosixCondition : public PosixConditionBase { + public: + PosixCondition(bool manual_reset, bool initial_state) + : signal_(initial_state), manual_reset_(manual_reset) {} + virtual ~PosixCondition() = default; + + bool Signal() override { + auto lock = std::unique_lock(mutex_); signal_ = true; - pthread_cond_broadcast(&cond_); - pthread_mutex_unlock(&mutex_); + if (manual_reset_) { + cond_.notify_all(); + } else { + // FIXME(bwrsandman): Potential cause for deadlock + // See issue #1678 for possible fix and discussion + cond_.notify_one(); + } + return true; } void Reset() { - pthread_mutex_lock(&mutex_); + auto lock = std::unique_lock(mutex_); signal_ = false; - pthread_mutex_unlock(&mutex_); - } - - bool Wait(unsigned int timeout_ms) { - // Assume 0 means no timeout, not instant timeout - if (timeout_ms == 0) { - Wait(); - } - struct timespec time_to_wait; - struct timeval now; - gettimeofday(&now, NULL); - - // Add the number of seconds we want to wait to the current time - time_to_wait.tv_sec = now.tv_sec + (timeout_ms / 1000); - // Add the number of nanoseconds we want to wait to the current nanosecond - // stride - long nsec = (now.tv_usec + (timeout_ms % 1000)) * 1000; - // If we overflowed the nanosecond count then we add a second - time_to_wait.tv_sec += nsec / 1000000000UL; - // We only add nanoseconds within the 1 second stride - time_to_wait.tv_nsec = nsec % 1000000000UL; - pthread_mutex_lock(&mutex_); - while (!signal_) { - int status = pthread_cond_timedwait(&cond_, &mutex_, &time_to_wait); - if (status == ETIMEDOUT) return false; // We timed out - } - pthread_mutex_unlock(&mutex_); - return true; // We didn't time out - } - - bool Wait() { - pthread_mutex_lock(&mutex_); - while (!signal_) { - pthread_cond_wait(&cond_, &mutex_); - } - pthread_mutex_unlock(&mutex_); - return true; // Did not time out; } private: + inline bool signaled() const override { return signal_; } + inline void post_execution() override { + if (!manual_reset_) { + signal_ = false; + } + } bool signal_; - pthread_cond_t cond_; - pthread_mutex_t mutex_; + const bool manual_reset_; }; -// Native posix thread handle -template -class PosixThreadHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - explicit PosixThreadHandle(pthread_t handle) : handle_(handle) {} - ~PosixThreadHandle() override {} + PosixCondition(uint32_t initial_count, uint32_t maximum_count) + : count_(initial_count), maximum_count_(maximum_count) {} - protected: - void* native_handle() const override { - return reinterpret_cast(handle_); + bool Signal() override { return Release(1, nullptr); } + + bool Release(uint32_t release_count, int* out_previous_count) { + if (maximum_count_ - count_ >= release_count) { + auto lock = std::unique_lock(mutex_); + if (out_previous_count) *out_previous_count = count_; + count_ += release_count; + cond_.notify_all(); + return true; + } + return false; } - pthread_t handle_; + private: + inline bool signaled() const override { return count_ > 0; } + inline void post_execution() override { + count_--; + cond_.notify_all(); + } + uint32_t count_; + const uint32_t maximum_count_; }; -// This is wraps a condition object as our handle because posix has no single -// native handle for higher level concurrency constructs such as semaphores -template -class PosixConditionHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - ~PosixConditionHandle() override {} - - protected: - 
void* native_handle() const override { - return reinterpret_cast(const_cast(&handle_)); + explicit PosixCondition(bool initial_owner) : count_(0) { + if (initial_owner) { + count_ = 1; + owner_ = std::this_thread::get_id(); + } } - PosixCondition handle_; + bool Signal() override { return Release(); } + + bool Release() { + if (owner_ == std::this_thread::get_id() && count_ > 0) { + auto lock = std::unique_lock(mutex_); + --count_; + // Free to be acquired by another thread + if (count_ == 0) { + cond_.notify_one(); + } + return true; + } + return false; + } + + void* native_handle() const override { return mutex_.native_handle(); } + + private: + inline bool signaled() const override { + return count_ == 0 || owner_ == std::this_thread::get_id(); + } + inline void post_execution() override { + count_++; + owner_ = std::this_thread::get_id(); + } + uint32_t count_; + std::thread::id owner_; }; -template -class PosixFdHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - explicit PosixFdHandle(intptr_t handle) : handle_(handle) {} - ~PosixFdHandle() override { - close(handle_); - handle_ = 0; + explicit PosixCondition(bool manual_reset) + : callback_(), + timer_(nullptr), + signal_(false), + manual_reset_(manual_reset) {} + + virtual ~PosixCondition() { Cancel(); } + + bool Signal() override { + CompletionRoutine(); + return true; } - protected: - void* native_handle() const override { - return reinterpret_cast(handle_); - } + // TODO(bwrsandman): due_times of under 1ms deadlock under travis + bool Set(std::chrono::nanoseconds due_time, std::chrono::milliseconds period, + std::function opt_callback = nullptr) { + std::lock_guard lock(mutex_); - intptr_t handle_; -}; + callback_ = std::move(opt_callback); + signal_ = false; -// TODO(dougvj) -WaitResult Wait(WaitHandle* wait_handle, bool is_alertable, - std::chrono::milliseconds timeout) { - intptr_t handle = reinterpret_cast(wait_handle->native_handle()); - - fd_set set; - struct timeval time_val; - int ret; - - FD_ZERO(&set); - FD_SET(handle, &set); - - time_val.tv_sec = timeout.count() / 1000; - time_val.tv_usec = timeout.count() * 1000; - ret = select(handle + 1, &set, NULL, NULL, &time_val); - if (ret == -1) { - return WaitResult::kFailed; - } else if (ret == 0) { - return WaitResult::kTimeout; - } else { - uint64_t buf = 0; - ret = read(handle, &buf, sizeof(buf)); - if (ret < 8) { - return WaitResult::kTimeout; + // Create timer + if (timer_ == nullptr) { + sigevent sev{}; + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = GetSystemSignal(SignalType::kTimer); + sev.sigev_value.sival_ptr = this; + if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false; } - return WaitResult::kSuccess; + // Start timer + itimerspec its{}; + its.it_value = DurationToTimeSpec(due_time); + its.it_interval = DurationToTimeSpec(period); + return timer_settime(timer_, 0, &its, nullptr) == 0; } + + void CompletionRoutine() { + // As the callback may reset the timer, store local. 
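
The timer path above arms a per-process POSIX timer whose expiry is delivered as a realtime signal carrying the object pointer in `sival_ptr`. A minimal standalone sketch of that mechanism follows; the signal number and names are illustrative only, not Xenia's (link with `-lrt` on older glibc):

```cpp
#include <csignal>
#include <cstdio>
#include <ctime>
#include <unistd.h>

namespace {
const int kTimerSignal = SIGRTMIN + 2;  // arbitrary realtime signal for the sketch

void TimerHandler(int /*sig*/, siginfo_t* info, void* /*ctx*/) {
  // sival_ptr round-trips whatever was stored in sigev_value.sival_ptr.
  auto* fired = static_cast<volatile sig_atomic_t*>(info->si_value.sival_ptr);
  *fired = 1;
}
}  // namespace

int main() {
  static volatile sig_atomic_t fired = 0;

  struct sigaction sa {};
  sa.sa_flags = SA_SIGINFO;
  sa.sa_sigaction = TimerHandler;
  sigemptyset(&sa.sa_mask);
  sigaction(kTimerSignal, &sa, nullptr);

  // Deliver expiry as a signal, with a pointer back to the interested object.
  sigevent sev{};
  sev.sigev_notify = SIGEV_SIGNAL;
  sev.sigev_signo = kTimerSignal;
  sev.sigev_value.sival_ptr = const_cast<sig_atomic_t*>(&fired);

  timer_t timer;
  timer_create(CLOCK_REALTIME, &sev, &timer);

  itimerspec its{};
  its.it_value.tv_nsec = 50 * 1000 * 1000;  // one-shot, 50 ms from now
  timer_settime(timer, 0, &its, nullptr);

  while (!fired) pause();  // wait for the signal handler to run
  std::puts("timer fired");
  timer_delete(timer);
  return 0;
}
```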
+ std::function callback; + { + std::lock_guard lock(mutex_); + // Store callback + if (callback_) callback = callback_; + signal_ = true; + if (manual_reset_) { + cond_.notify_all(); + } else { + cond_.notify_one(); + } + } + // Call callback + if (callback) callback(); + } + + bool Cancel() { + std::lock_guard lock(mutex_); + bool result = true; + if (timer_) { + result = timer_delete(timer_) == 0; + timer_ = nullptr; + } + return result; + } + + void* native_handle() const override { + return reinterpret_cast(timer_); + } + + private: + inline bool signaled() const override { return signal_; } + inline void post_execution() override { + if (!manual_reset_) { + signal_ = false; + } + } + std::function callback_; + timer_t timer_; + volatile bool signal_; + const bool manual_reset_; +}; + +struct ThreadStartData { + std::function start_routine; + bool create_suspended; + Thread* thread_obj; +}; + +template <> +class PosixCondition : public PosixConditionBase { + enum class State { + kUninitialized, + kRunning, + kSuspended, + kFinished, + }; + + public: + PosixCondition() + : thread_(0), + signaled_(false), + exit_code_(0), + state_(State::kUninitialized), + suspend_count_(0) {} + bool Initialize(Thread::CreationParameters params, + ThreadStartData* start_data) { + start_data->create_suspended = params.create_suspended; + pthread_attr_t attr; + if (pthread_attr_init(&attr) != 0) return false; + if (pthread_attr_setstacksize(&attr, params.stack_size) != 0) { + pthread_attr_destroy(&attr); + return false; + } + if (params.initial_priority != 0) { + sched_param sched{}; + sched.sched_priority = params.initial_priority + 1; + if (pthread_attr_setschedpolicy(&attr, SCHED_FIFO) != 0) { + pthread_attr_destroy(&attr); + return false; + } + if (pthread_attr_setschedparam(&attr, &sched) != 0) { + pthread_attr_destroy(&attr); + return false; + } + } + if (pthread_create(&thread_, &attr, ThreadStartRoutine, start_data) != 0) { + return false; + } + pthread_attr_destroy(&attr); + return true; + } + + /// Constructor for existing thread. 
This should only happen once called by + /// Thread::GetCurrentThread() on the main thread + explicit PosixCondition(pthread_t thread) + : thread_(thread), + signaled_(false), + exit_code_(0), + state_(State::kRunning) {} + + virtual ~PosixCondition() { + if (thread_ && !signaled_) { + if (pthread_cancel(thread_) != 0) { + assert_always(); + } + if (pthread_join(thread_, nullptr) != 0) { + assert_always(); + } + } + } + + bool Signal() override { return true; } + + std::string name() const { + WaitStarted(); + auto result = std::array{'\0'}; + std::unique_lock lock(state_mutex_); + if (state_ != State::kUninitialized && state_ != State::kFinished) { + if (pthread_getname_np(thread_, result.data(), result.size() - 1) != 0) + assert_always(); + } + return std::string(result.data()); + } + + void set_name(const std::string& name) { + WaitStarted(); + std::unique_lock lock(state_mutex_); + if (state_ != State::kUninitialized && state_ != State::kFinished) { + threading::set_name(static_cast(thread_), + name); + } + } + + uint32_t system_id() const { return static_cast(thread_); } + + uint64_t affinity_mask() { + WaitStarted(); + cpu_set_t cpu_set; + if (pthread_getaffinity_np(thread_, sizeof(cpu_set_t), &cpu_set) != 0) + assert_always(); + uint64_t result = 0; + auto cpu_count = std::min(CPU_SETSIZE, 64); + for (auto i = 0u; i < cpu_count; i++) { + auto set = CPU_ISSET(i, &cpu_set); + result |= set << i; + } + return result; + } + + void set_affinity_mask(uint64_t mask) { + WaitStarted(); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + for (auto i = 0u; i < 64; i++) { + if (mask & (1 << i)) { + CPU_SET(i, &cpu_set); + } + } + if (pthread_setaffinity_np(thread_, sizeof(cpu_set_t), &cpu_set) != 0) { + assert_always(); + } + } + + int priority() { + WaitStarted(); + int policy; + sched_param param{}; + int ret = pthread_getschedparam(thread_, &policy, ¶m); + if (ret != 0) { + return -1; + } + + return param.sched_priority; + } + + void set_priority(int new_priority) { + WaitStarted(); + sched_param param{}; + param.sched_priority = new_priority; + if (pthread_setschedparam(thread_, SCHED_FIFO, ¶m) != 0) + assert_always(); + } + + void QueueUserCallback(std::function callback) { + WaitStarted(); + std::unique_lock lock(callback_mutex_); + user_callback_ = std::move(callback); + sigval value{}; + value.sival_ptr = this; + pthread_sigqueue(thread_, GetSystemSignal(SignalType::kThreadUserCallback), + value); + } + + void CallUserCallback() { + std::unique_lock lock(callback_mutex_); + user_callback_(); + } + + bool Resume(uint32_t* out_previous_suspend_count = nullptr) { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; + } + WaitStarted(); + std::unique_lock lock(state_mutex_); + if (state_ != State::kSuspended) return false; + if (out_previous_suspend_count) { + *out_previous_suspend_count = suspend_count_; + } + --suspend_count_; + state_signal_.notify_all(); + return true; + } + + bool Suspend(uint32_t* out_previous_suspend_count = nullptr) { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; + } + WaitStarted(); + { + if (out_previous_suspend_count) { + *out_previous_suspend_count = suspend_count_; + } + state_ = State::kSuspended; + ++suspend_count_; + } + int result = + pthread_kill(thread_, GetSystemSignal(SignalType::kThreadSuspend)); + return result == 0; + } + + void Terminate(int exit_code) { + { + std::unique_lock lock(state_mutex_); + state_ = State::kFinished; + } + + std::lock_guard lock(mutex_); + + // Sometimes the thread can call terminate 
twice before stopping + if (thread_ == 0) return; + auto thread = thread_; + + exit_code_ = exit_code; + signaled_ = true; + cond_.notify_all(); + + if (pthread_cancel(thread) != 0) assert_always(); + } + + void WaitStarted() const { + std::unique_lock lock(state_mutex_); + state_signal_.wait(lock, + [this] { return state_ != State::kUninitialized; }); + } + + /// Set state to suspended and wait until it reset by another thread + void WaitSuspended() { + std::unique_lock lock(state_mutex_); + state_signal_.wait(lock, [this] { return suspend_count_ == 0; }); + state_ = State::kRunning; + } + + void* native_handle() const override { + return reinterpret_cast(thread_); + } + + private: + static void* ThreadStartRoutine(void* parameter); + inline bool signaled() const override { return signaled_; } + inline void post_execution() override { + if (thread_) { + pthread_join(thread_, nullptr); + thread_ = 0; + } + } + pthread_t thread_; + bool signaled_; + int exit_code_; + volatile State state_; + volatile uint32_t suspend_count_; + mutable std::mutex state_mutex_; + mutable std::mutex callback_mutex_; + mutable std::condition_variable state_signal_; + std::function user_callback_; +}; + +class PosixWaitHandle { + public: + virtual PosixConditionBase& condition() = 0; +}; + +// This wraps a condition object as our handle because posix has no single +// native handle for higher level concurrency constructs such as semaphores +template +class PosixConditionHandle : public T, public PosixWaitHandle { + public: + PosixConditionHandle() = default; + explicit PosixConditionHandle(bool); + explicit PosixConditionHandle(pthread_t thread); + PosixConditionHandle(bool manual_reset, bool initial_state); + PosixConditionHandle(uint32_t initial_count, uint32_t maximum_count); + ~PosixConditionHandle() override = default; + + PosixConditionBase& condition() override { return handle_; } + void* native_handle() const override { return handle_.native_handle(); } + + protected: + PosixCondition handle_; + friend PosixCondition; +}; + +template <> +PosixConditionHandle::PosixConditionHandle(uint32_t initial_count, + uint32_t maximum_count) + : handle_(initial_count, maximum_count) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool initial_owner) + : handle_(initial_owner) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool manual_reset) + : handle_(manual_reset) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool manual_reset, + bool initial_state) + : handle_(manual_reset, initial_state) {} + +template <> +PosixConditionHandle::PosixConditionHandle(pthread_t thread) + : handle_(thread) {} + +WaitResult Wait(WaitHandle* wait_handle, bool is_alertable, + std::chrono::milliseconds timeout) { + auto posix_wait_handle = dynamic_cast(wait_handle); + if (posix_wait_handle == nullptr) { + return WaitResult::kFailed; + } + if (is_alertable) alertable_state_ = true; + auto result = posix_wait_handle->condition().Wait(timeout); + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) WaitResult SignalAndWait(WaitHandle* wait_handle_to_signal, WaitHandle* wait_handle_to_wait_on, bool is_alertable, std::chrono::milliseconds timeout) { - assert_always(); - return WaitResult::kFailed; + auto result = WaitResult::kFailed; + auto posix_wait_handle_to_signal = + dynamic_cast(wait_handle_to_signal); + auto posix_wait_handle_to_wait_on = + dynamic_cast(wait_handle_to_wait_on); + if (posix_wait_handle_to_signal == nullptr || + posix_wait_handle_to_wait_on == 
nullptr) { + return WaitResult::kFailed; + } + if (is_alertable) alertable_state_ = true; + if (posix_wait_handle_to_signal->condition().Signal()) { + result = posix_wait_handle_to_wait_on->condition().Wait(timeout); + } + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) std::pair WaitMultiple(WaitHandle* wait_handles[], size_t wait_handle_count, bool wait_all, bool is_alertable, std::chrono::milliseconds timeout) { - assert_always(); - return std::pair(WaitResult::kFailed, 0); + std::vector conditions; + conditions.reserve(wait_handle_count); + for (size_t i = 0u; i < wait_handle_count; ++i) { + auto handle = dynamic_cast(wait_handles[i]); + if (handle == nullptr) { + return std::make_pair(WaitResult::kFailed, 0); + } + conditions.push_back(&handle->condition()); + } + if (is_alertable) alertable_state_ = true; + auto result = PosixConditionBase::WaitMultiple(std::move(conditions), + wait_all, timeout); + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) -class PosixEvent : public PosixFdHandle { +class PosixEvent : public PosixConditionHandle { public: - PosixEvent(intptr_t fd) : PosixFdHandle(fd) {} + PosixEvent(bool manual_reset, bool initial_state) + : PosixConditionHandle(manual_reset, initial_state) {} ~PosixEvent() override = default; - void Set() override { - uint64_t buf = 1; - write(handle_, &buf, sizeof(buf)); + void Set() override { handle_.Signal(); } + void Reset() override { handle_.Reset(); } + void Pulse() override { + using namespace std::chrono_literals; + handle_.Signal(); + MaybeYield(); + Sleep(10us); + handle_.Reset(); } - void Reset() override { assert_always(); } - void Pulse() override { assert_always(); } - - private: - PosixCondition condition_; }; std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { - // Linux's eventfd doesn't appear to support manual reset natively. - return nullptr; + return std::make_unique(true, initial_state); } std::unique_ptr Event::CreateAutoResetEvent(bool initial_state) { - int fd = eventfd(initial_state ? 
1 : 0, EFD_CLOEXEC); - if (fd == -1) { - return nullptr; - } - - return std::make_unique(PosixEvent(fd)); + return std::make_unique(false, initial_state); } -// TODO(dougvj) class PosixSemaphore : public PosixConditionHandle { public: - PosixSemaphore(int initial_count, int maximum_count) { assert_always(); } + PosixSemaphore(int initial_count, int maximum_count) + : PosixConditionHandle(static_cast(initial_count), + static_cast(maximum_count)) {} ~PosixSemaphore() override = default; bool Release(int release_count, int* out_previous_count) override { - assert_always(); - return false; + if (release_count < 1) { + return false; + } + return handle_.Release(static_cast(release_count), + out_previous_count); } }; @@ -314,149 +848,210 @@ std::unique_ptr Semaphore::Create(int initial_count, return std::make_unique(initial_count, maximum_count); } -// TODO(dougvj) class PosixMutant : public PosixConditionHandle { public: - PosixMutant(bool initial_owner) { assert_always(); } - ~PosixMutant() = default; - bool Release() override { - assert_always(); - return false; - } + explicit PosixMutant(bool initial_owner) + : PosixConditionHandle(initial_owner) {} + ~PosixMutant() override = default; + bool Release() override { return handle_.Release(); } }; std::unique_ptr Mutant::Create(bool initial_owner) { return std::make_unique(initial_owner); } -// TODO(dougvj) class PosixTimer : public PosixConditionHandle { public: - PosixTimer(bool manual_reset) { assert_always(); } - ~PosixTimer() = default; + explicit PosixTimer(bool manual_reset) : PosixConditionHandle(manual_reset) {} + ~PosixTimer() override = default; bool SetOnce(std::chrono::nanoseconds due_time, std::function opt_callback) override { - assert_always(); - return false; + return handle_.Set(due_time, std::chrono::milliseconds::zero(), + std::move(opt_callback)); } bool SetRepeating(std::chrono::nanoseconds due_time, std::chrono::milliseconds period, std::function opt_callback) override { - assert_always(); - return false; - } - bool Cancel() override { - assert_always(); - return false; + return handle_.Set(due_time, period, std::move(opt_callback)); } + bool Cancel() override { return handle_.Cancel(); } }; std::unique_ptr Timer::CreateManualResetTimer() { + install_signal_handler(SignalType::kTimer); return std::make_unique(true); } std::unique_ptr Timer::CreateSynchronizationTimer() { + install_signal_handler(SignalType::kTimer); return std::make_unique(false); } -class PosixThread : public PosixThreadHandle { +class PosixThread : public PosixConditionHandle { public: - explicit PosixThread(pthread_t handle) : PosixThreadHandle(handle) {} - ~PosixThread() = default; + PosixThread() = default; + explicit PosixThread(pthread_t thread) : PosixConditionHandle(thread) {} + ~PosixThread() override = default; + + bool Initialize(CreationParameters params, + std::function start_routine) { + auto start_data = + new ThreadStartData({std::move(start_routine), false, this}); + return handle_.Initialize(params, start_data); + } void set_name(std::string name) override { - pthread_setname_np(handle_, name.c_str()); - } - - uint32_t system_id() const override { return 0; } - - // TODO(DrChat) - uint64_t affinity_mask() override { return 0; } - void set_affinity_mask(uint64_t mask) override { assert_always(); } - - int priority() override { - int policy; - struct sched_param param; - int ret = pthread_getschedparam(handle_, &policy, ¶m); - if (ret != 0) { - return -1; + handle_.WaitStarted(); + Thread::set_name(name); + if (name.length() > 15) { + name 
= name.substr(0, 15); } - - return param.sched_priority; + handle_.set_name(name); } + uint32_t system_id() const override { return handle_.system_id(); } + + uint64_t affinity_mask() override { return handle_.affinity_mask(); } + void set_affinity_mask(uint64_t mask) override { + handle_.set_affinity_mask(mask); + } + + int priority() override { return handle_.priority(); } void set_priority(int new_priority) override { - struct sched_param param; - param.sched_priority = new_priority; - int ret = pthread_setschedparam(handle_, SCHED_FIFO, ¶m); + handle_.set_priority(new_priority); } - // TODO(DrChat) void QueueUserCallback(std::function callback) override { - assert_always(); + handle_.QueueUserCallback(std::move(callback)); } - bool Resume(uint32_t* out_new_suspend_count = nullptr) override { - assert_always(); - return false; + bool Resume(uint32_t* out_previous_suspend_count) override { + return handle_.Resume(out_previous_suspend_count); } - bool Suspend(uint32_t* out_previous_suspend_count = nullptr) override { - assert_always(); - return false; + bool Suspend(uint32_t* out_previous_suspend_count) override { + return handle_.Suspend(out_previous_suspend_count); } - void Terminate(int exit_code) override {} + void Terminate(int exit_code) override { handle_.Terminate(exit_code); } + + void WaitSuspended() { handle_.WaitSuspended(); } }; -thread_local std::unique_ptr current_thread_ = nullptr; +thread_local PosixThread* current_thread_ = nullptr; -struct ThreadStartData { - std::function start_routine; -}; -void* ThreadStartRoutine(void* parameter) { - current_thread_ = - std::unique_ptr(new PosixThread(::pthread_self())); +void* PosixCondition::ThreadStartRoutine(void* parameter) { + if (pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr) != 0) { + assert_always(); + } + threading::set_name(""); - auto start_data = reinterpret_cast(parameter); - start_data->start_routine(); + auto start_data = static_cast(parameter); + assert_not_null(start_data); + assert_not_null(start_data->thread_obj); + + auto thread = dynamic_cast(start_data->thread_obj); + auto start_routine = std::move(start_data->start_routine); + auto create_suspended = start_data->create_suspended; delete start_data; - return 0; + + current_thread_ = thread; + { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.state_ = + create_suspended ? State::kSuspended : State::kRunning; + thread->handle_.state_signal_.notify_all(); + } + + if (create_suspended) { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.suspend_count_ = 1; + thread->handle_.state_signal_.wait( + lock, [thread] { return thread->handle_.suspend_count_ == 0; }); + } + + start_routine(); + + { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.state_ = State::kFinished; + } + + std::unique_lock lock(mutex_); + thread->handle_.exit_code_ = 0; + thread->handle_.signaled_ = true; + cond_.notify_all(); + + current_thread_ = nullptr; + return nullptr; } std::unique_ptr Thread::Create(CreationParameters params, std::function start_routine) { - auto start_data = new ThreadStartData({std::move(start_routine)}); - - assert_false(params.create_suspended); - pthread_t handle; - pthread_attr_t attr; - pthread_attr_init(&attr); - int ret = pthread_create(&handle, &attr, ThreadStartRoutine, start_data); - if (ret != 0) { - // TODO(benvanik): pass back? 
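
The `ThreadStartRoutine` above implements "create suspended" as a handshake: the new thread parks on a condition variable until another thread drops its suspend count. A reduced sketch of that pattern using standard primitives (illustrative, not Xenia's API):

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

class SuspendedStart {
 public:
  void Park() {  // called first thing on the new thread
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return suspend_count_ == 0; });
  }
  void Resume() {  // called by the creator once the thread may run
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (suspend_count_ > 0) --suspend_count_;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  unsigned suspend_count_ = 1;  // starts suspended
};

int main() {
  SuspendedStart gate;
  std::thread worker([&gate] {
    gate.Park();  // blocks here until Resume()
    std::puts("worker running");
  });
  // ... creator finishes setting the thread object up here ...
  gate.Resume();
  worker.join();
  return 0;
}
```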
- auto last_error = errno; - XELOGE("Unable to pthread_create: {}", last_error); - delete start_data; - return nullptr; - } - - return std::unique_ptr(new PosixThread(handle)); + install_signal_handler(SignalType::kThreadSuspend); + install_signal_handler(SignalType::kThreadUserCallback); + auto thread = std::make_unique(); + if (!thread->Initialize(params, std::move(start_routine))) return nullptr; + assert_not_null(thread); + return thread; } Thread* Thread::GetCurrentThread() { if (current_thread_) { - return current_thread_.get(); + return current_thread_; } + // Should take this route only for threads not created by Thread::Create. + // The only thread not created by Thread::Create should be the main thread. pthread_t handle = pthread_self(); - current_thread_ = std::make_unique(handle); - return current_thread_.get(); + current_thread_ = new PosixThread(handle); + atexit([] { delete current_thread_; }); + + return current_thread_; } void Thread::Exit(int exit_code) { - pthread_exit(reinterpret_cast(exit_code)); + if (current_thread_) { + current_thread_->Terminate(exit_code); + // Sometimes the current thread keeps running after being cancelled. + // Prevent other calls from this thread from using current_thread_. + current_thread_ = nullptr; + } else { + // Should only happen with the main thread + pthread_exit(reinterpret_cast(exit_code)); + } +} + +static void signal_handler(int signal, siginfo_t* info, void* /*context*/) { + switch (GetSystemSignalType(signal)) { + case SignalType::kHighResolutionTimer: { + assert_not_null(info->si_value.sival_ptr); + auto callback = + *static_cast*>(info->si_value.sival_ptr); + callback(); + } break; + case SignalType::kTimer: { + assert_not_null(info->si_value.sival_ptr); + auto pTimer = + static_cast*>(info->si_value.sival_ptr); + pTimer->CompletionRoutine(); + } break; + case SignalType::kThreadSuspend: { + assert_not_null(current_thread_); + current_thread_->WaitSuspended(); + } break; + case SignalType::kThreadUserCallback: { + assert_not_null(info->si_value.sival_ptr); + auto p_thread = + static_cast*>(info->si_value.sival_ptr); + if (alertable_state_) { + p_thread->CallUserCallback(); + } + } break; + default: + assert_always(); + } } } // namespace threading diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 605c2ccbf..6b4e31a99 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -388,16 +388,16 @@ class Win32Thread : public Win32Handle { QueueUserAPC(DispatchApc, handle_, reinterpret_cast(apc_data)); } - bool Resume(uint32_t* out_new_suspend_count = nullptr) override { - if (out_new_suspend_count) { - *out_new_suspend_count = 0; + bool Resume(uint32_t* out_previous_suspend_count = nullptr) override { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; } DWORD result = ResumeThread(handle_); if (result == UINT_MAX) { return false; } - if (out_new_suspend_count) { - *out_new_suspend_count = result; + if (out_previous_suspend_count) { + *out_previous_suspend_count = result; } return true; } diff --git a/src/xenia/cpu/export_resolver.cc b/src/xenia/cpu/export_resolver.cc index ecc5d8246..b05df5d83 100644 --- a/src/xenia/cpu/export_resolver.cc +++ b/src/xenia/cpu/export_resolver.cc @@ -30,7 +30,7 @@ ExportResolver::Table::Table(const std::string_view module_name, } std::sort( exports_by_name_.begin(), exports_by_name_.end(), - [](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; }); + [](Export* a, Export* b) { return std::strcmp(a->name, 
b->name) < 0; }); } ExportResolver::ExportResolver() = default; @@ -51,7 +51,7 @@ void ExportResolver::RegisterTable( } std::sort( all_exports_by_name_.begin(), all_exports_by_name_.end(), - [](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; }); + [](Export* a, Export* b) { return std::strcmp(a->name, b->name) < 0; }); } Export* ExportResolver::GetExportByOrdinal(const std::string_view module_name, diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 4f5e7f96b..69a94c7f4 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -73,7 +73,7 @@ bool CommandProcessor::Initialize( WorkerThreadMain(); return 0; })); - worker_thread_->set_name("GraphicsSystem Command Processor"); + worker_thread_->set_name("GPU Commands"); worker_thread_->Create(); return true; @@ -731,12 +731,20 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { } break; case PM4_CONTEXT_UPDATE: { assert_true(count == 1); - uint64_t value = reader->ReadAndSwap(); + uint32_t value = reader->ReadAndSwap(); XELOGGPU("GPU context update = {:08X}", value); assert_true(value == 0); result = true; break; } + case PM4_WAIT_FOR_IDLE: { + // This opcode is used by "Duke Nukem Forever" while going/being ingame + assert_true(count == 1); + uint32_t value = reader->ReadAndSwap(); + XELOGGPU("GPU wait for idle = {:08X}", value); + result = true; + break; + } default: XELOGGPU("Unimplemented GPU OPCODE: 0x{:02X}\t\tCOUNT: {}\n", opcode, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index f0be8c50e..8db6f1626 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -21,6 +21,7 @@ #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_shader.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/d3d12/d3d12_util.h" @@ -387,7 +388,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( sampler_count_vertex); return nullptr; } - root_signatures_bindful_.insert({index, root_signature}); + root_signatures_bindful_.emplace(index, root_signature); return root_signature; } @@ -745,12 +746,11 @@ void D3D12CommandProcessor::SetSamplePositions( current_sample_positions_ = sample_positions; } -void D3D12CommandProcessor::SetComputePipelineState( - ID3D12PipelineState* pipeline_state) { - if (current_external_pipeline_state_ != pipeline_state) { - deferred_command_list_.D3DSetPipelineState(pipeline_state); - current_external_pipeline_state_ = pipeline_state; - current_cached_pipeline_state_ = nullptr; +void D3D12CommandProcessor::SetComputePipeline(ID3D12PipelineState* pipeline) { + if (current_external_pipeline_ != pipeline) { + deferred_command_list_.D3DSetPipelineState(pipeline); + current_external_pipeline_ = pipeline; + current_cached_pipeline_ = nullptr; } } @@ -773,8 +773,16 @@ std::string D3D12CommandProcessor::GetWindowTitleText() const { } // Currently scaling is only supported with ROV. if (texture_cache_ != nullptr && texture_cache_->IsResolutionScale2X()) { - return "Direct3D 12 - 2x"; + return "Direct3D 12 - ROV 2x"; } + // Rasterizer-ordered views are a feature very rarely used as of 2020 and + // that faces adoption complications (outside of Direct3D - on Vulkan - at + // least), but crucial to Xenia - raise awareness of its usage. 
+ // https://github.com/KhronosGroup/Vulkan-Ecosystem/issues/27#issuecomment-455712319 + // "In Xenia's title bar "D3D12 ROV" can be seen, which was a surprise, as I + // wasn't aware that Xenia D3D12 backend was using Raster Order Views + // feature" - oscarbg in that issue. + return "Direct3D 12 - ROV"; } return "Direct3D 12"; } @@ -1196,7 +1204,7 @@ bool D3D12CommandProcessor::SetupContext() { *this, *register_file_, bindless_resources_used_, edram_rov_used_, texture_cache_->IsResolutionScale2X() ? 2 : 1); if (!pipeline_cache_->Initialize()) { - XELOGE("Failed to initialize the graphics pipeline state cache"); + XELOGE("Failed to initialize the graphics pipeline cache"); return false; } @@ -1526,8 +1534,7 @@ void D3D12CommandProcessor::ShutdownContext() { // Shut down binding - bindless descriptors may be owned by subsystems like // the texture cache. - // Root signatured are used by pipeline states, thus freed after the pipeline - // states. + // Root signatures are used by pipelines, thus freed after the pipelines. ui::d3d12::util::ReleaseAndNull(root_signature_bindless_ds_); ui::d3d12::util::ReleaseAndNull(root_signature_bindless_vs_); for (auto it : root_signatures_bindful_) { @@ -1878,7 +1885,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, xenos::VertexShaderExportMode::kMultipass || (primitive_two_faced && pa_su_sc_mode_cntl.cull_front && pa_su_sc_mode_cntl.cull_back))) { - // All faces are culled - can't be expressed in the pipeline state. + // All faces are culled - can't be expressed in the pipeline. return true; } @@ -1954,7 +1961,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, line_loop_closing_index = 0; } - // Update the textures - this may bind pipeline state objects. + // Update the textures - this may bind pipelines. uint32_t used_texture_mask = vertex_shader->GetUsedTextureMask() | (pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); @@ -1972,21 +1979,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, early_z = true; } - // Create the pipeline state object if needed and bind it. - void* pipeline_state_handle; + // Create the pipeline if needed and bind it. + void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( vertex_shader, pixel_shader, primitive_type_converted, indexed ? index_buffer_info->format : xenos::IndexFormat::kInt16, - early_z, pipeline_render_targets, &pipeline_state_handle, + early_z, pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } - if (current_cached_pipeline_state_ != pipeline_state_handle) { + if (current_cached_pipeline_ != pipeline_handle) { deferred_command_list_.SetPipelineStateHandle( - reinterpret_cast(pipeline_state_handle)); - current_cached_pipeline_state_ = pipeline_state_handle; - current_external_pipeline_state_ = nullptr; + reinterpret_cast(pipeline_handle)); + current_cached_pipeline_ = pipeline_handle; + current_external_pipeline_ = nullptr; } // Update viewport, scissor, blend factor and stencil reference. @@ -2005,14 +2012,15 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } // Must not call anything that can change the descriptor heap from now on! - // Ensure vertex and index buffers are resident and draw. + // Ensure vertex buffers are resident. // TODO(Triang3l): Cache residency for ranges in a way similar to how texture - // validity will be tracked. + // validity is tracked. 
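
The `export_resolver.cc` comparator change above (`strcmp(...) <= 0` to `strcmp(...) < 0`) is a correctness fix, not a style one: `std::sort` requires a strict weak ordering, so `comp(x, x)` must be false, which `<= 0` violates for duplicate names and is undefined behaviour. A minimal illustration (sample data only, not Xenia code):

```cpp
#include <algorithm>
#include <cstring>
#include <vector>

struct Export {
  const char* name;
};

int main() {
  std::vector<Export> exports = {{"B"}, {"A"}, {"A"}};  // duplicate key
  // Valid: strict ordering, comp(x, x) is false even for equal names.
  std::sort(exports.begin(), exports.end(),
            [](const Export& a, const Export& b) {
              return std::strcmp(a.name, b.name) < 0;
            });
  // With `<= 0`, comp(x, x) would be true for the duplicates, breaking the
  // strict-weak-ordering contract std::sort relies on.
  return 0;
}
```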
uint64_t vertex_buffers_resident[2] = {}; - for (const auto& vertex_binding : vertex_shader->vertex_bindings()) { + for (const Shader::VertexBinding& vertex_binding : + vertex_shader->vertex_bindings()) { uint32_t vfetch_index = vertex_binding.fetch_constant; if (vertex_buffers_resident[vfetch_index >> 6] & - (1ull << (vfetch_index & 63))) { + (uint64_t(1) << (vfetch_index & 63))) { continue; } const auto& vfetch_constant = regs.Get( @@ -2045,7 +2053,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, vfetch_constant.address << 2, vfetch_constant.size << 2); return false; } - vertex_buffers_resident[vfetch_index >> 6] |= 1ull << (vfetch_index & 63); + vertex_buffers_resident[vfetch_index >> 6] |= uint64_t(1) + << (vfetch_index & 63); } // Gather memexport ranges and ensure the heaps for them are resident, and @@ -2517,8 +2526,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { submission_open_ = true; // Start a new deferred command list - will submit it to the real one in the - // end of the submission (when async pipeline state object creation requests - // are fulfilled). + // end of the submission (when async pipeline creation requests are + // fulfilled). deferred_command_list_.Reset(); // Reset cached state of the command list. @@ -2527,8 +2536,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { ff_blend_factor_update_needed_ = true; ff_stencil_ref_update_needed_ = true; current_sample_positions_ = xenos::MsaaSamples::k1X; - current_cached_pipeline_state_ = nullptr; - current_external_pipeline_state_ = nullptr; + current_cached_pipeline_ = nullptr; + current_external_pipeline_ = nullptr; current_graphics_root_signature_ = nullptr; current_graphics_root_up_to_date_ = 0; if (bindless_resources_used_) { @@ -2724,7 +2733,7 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) { } bool D3D12CommandProcessor::CanEndSubmissionImmediately() const { - return !submission_open_ || !pipeline_cache_->IsCreatingPipelineStates(); + return !submission_open_ || !pipeline_cache_->IsCreatingPipelines(); } void D3D12CommandProcessor::ClearCommandAllocatorCache() { @@ -2745,12 +2754,12 @@ void D3D12CommandProcessor::ClearCommandAllocatorCache() { } void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { - auto& regs = *register_file_; - #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES + const RegisterFile& regs = *register_file_; + // Window parameters. // http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h // See r200UpdateWindow: @@ -2838,34 +2847,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { } // Scissor. 
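
The vertex-fetch residency tracking above packs one bit per fetch constant into two 64-bit words: `index >> 6` selects the word, `index & 63` the bit, and the shifted constant must be 64-bit (`uint64_t(1)` or `1ull`) or the upper half of each word would be unreachable. A compact sketch of that bookkeeping (not Xenia code):

```cpp
#include <cassert>
#include <cstdint>

// One bit per vertex fetch constant; two 64-bit words cover indices 0..127.
struct ResidencyBits {
  uint64_t words[2] = {};
  // Returns true if the index was already marked; marks it otherwise.
  bool TestAndSet(uint32_t index) {
    // The shifted 1 must be 64-bit; a plain `1 << n` is a 32-bit shift and
    // cannot reach bits 32..63 of the word.
    uint64_t bit = uint64_t(1) << (index & 63);
    uint64_t& word = words[index >> 6];
    if (word & bit) return true;
    word |= bit;
    return false;
  }
};

int main() {
  ResidencyBits resident;
  assert(!resident.TestAndSet(95));  // first touch: do the residency request
  assert(resident.TestAndSet(95));   // second touch: already handled
  return 0;
}
```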
- auto pa_sc_window_scissor_tl = regs.Get(); - auto pa_sc_window_scissor_br = regs.Get(); - D3D12_RECT scissor; - scissor.left = pa_sc_window_scissor_tl.tl_x; - scissor.top = pa_sc_window_scissor_tl.tl_y; - scissor.right = pa_sc_window_scissor_br.br_x; - scissor.bottom = pa_sc_window_scissor_br.br_y; - if (!pa_sc_window_scissor_tl.window_offset_disable) { - scissor.left = - std::max(scissor.left + pa_sc_window_offset.window_x_offset, LONG(0)); - scissor.top = - std::max(scissor.top + pa_sc_window_offset.window_y_offset, LONG(0)); - scissor.right = - std::max(scissor.right + pa_sc_window_offset.window_x_offset, LONG(0)); - scissor.bottom = - std::max(scissor.bottom + pa_sc_window_offset.window_y_offset, LONG(0)); - } - scissor.left *= pixel_size_x; - scissor.top *= pixel_size_y; - scissor.right *= pixel_size_x; - scissor.bottom *= pixel_size_y; - ff_scissor_update_needed_ |= ff_scissor_.left != scissor.left; - ff_scissor_update_needed_ |= ff_scissor_.top != scissor.top; - ff_scissor_update_needed_ |= ff_scissor_.right != scissor.right; - ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor.bottom; + draw_util::Scissor scissor; + draw_util::GetScissor(regs, scissor); + D3D12_RECT scissor_rect; + scissor_rect.left = LONG(scissor.left * pixel_size_x); + scissor_rect.top = LONG(scissor.top * pixel_size_y); + scissor_rect.right = LONG((scissor.left + scissor.width) * pixel_size_x); + scissor_rect.bottom = LONG((scissor.top + scissor.height) * pixel_size_y); + ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left; + ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top; + ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right; + ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom; if (ff_scissor_update_needed_) { - ff_scissor_ = scissor; - deferred_command_list_.RSSetScissorRect(scissor); + ff_scissor_ = scissor_rect; + deferred_command_list_.RSSetScissorRect(scissor_rect); ff_scissor_update_needed_ = false; } @@ -2915,12 +2910,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t line_loop_closing_index, xenos::Endian index_endian, uint32_t used_texture_mask, bool early_z, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { - auto& regs = *register_file_; - #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES + const RegisterFile& regs = *register_file_; auto pa_cl_clip_cntl = regs.Get(); auto pa_cl_vte_cntl = regs.Get(); auto pa_su_point_minmax = regs.Get(); @@ -3103,14 +3097,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.line_loop_closing_index != line_loop_closing_index; system_constants_.line_loop_closing_index = line_loop_closing_index; - // Vertex index offset. - dirty |= system_constants_.vertex_base_index != vgt_indx_offset; - system_constants_.vertex_base_index = vgt_indx_offset; - // Index or tessellation edge factor buffer endianness. dirty |= system_constants_.vertex_index_endian != index_endian; system_constants_.vertex_index_endian = index_endian; + // Vertex index offset. + dirty |= system_constants_.vertex_base_index != vgt_indx_offset; + system_constants_.vertex_base_index = vgt_indx_offset; + // User clip planes (UCP_ENA_#), when not CLIP_DISABLE. 
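
The scissor hunk above moves the inline math into `draw_util::GetScissor`; the removed code shows the underlying rule: start from the window scissor registers, optionally add the window offset, clamp at zero, then scale by the resolution multiplier. A sketch of that arithmetic with plain integers in place of the register structs (an approximation of what the shared helper centralizes, not Xenia code):

```cpp
#include <algorithm>
#include <cstdint>

struct Rect {
  int32_t left, top, right, bottom;
};

Rect ComputeScissor(Rect window_scissor, bool window_offset_disable,
                    int32_t window_x_offset, int32_t window_y_offset,
                    int32_t pixel_size_x, int32_t pixel_size_y) {
  Rect r = window_scissor;
  if (!window_offset_disable) {
    // Apply the window offset, clamping so the rectangle never goes negative.
    r.left = std::max(r.left + window_x_offset, int32_t(0));
    r.top = std::max(r.top + window_y_offset, int32_t(0));
    r.right = std::max(r.right + window_x_offset, int32_t(0));
    r.bottom = std::max(r.bottom + window_y_offset, int32_t(0));
  }
  // Scale from guest pixels to host pixels (e.g. 2x with resolution scaling).
  r.left *= pixel_size_x;
  r.top *= pixel_size_y;
  r.right *= pixel_size_x;
  r.bottom *= pixel_size_y;
  return r;
}
```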
if (!pa_cl_clip_cntl.clip_disable) { for (uint32_t i = 0; i < 6; ++i) { @@ -3574,7 +3568,7 @@ bool D3D12CommandProcessor::UpdateBindings( float_constant_map_vertex.float_bitmap[i]; // If no float constants at all, we can reuse any buffer for them, so not // invalidating. - if (float_constant_map_vertex.float_count != 0) { + if (float_constant_count_vertex) { cbuffer_binding_float_vertex_.up_to_date = false; } } @@ -3589,7 +3583,7 @@ bool D3D12CommandProcessor::UpdateBindings( float_constant_map_pixel.float_bitmap[i]) { current_float_constant_map_pixel_[i] = float_constant_map_pixel.float_bitmap[i]; - if (float_constant_map_pixel.float_count != 0) { + if (float_constant_count_pixel) { cbuffer_binding_float_pixel_.up_to_date = false; } } @@ -3889,8 +3883,8 @@ bool D3D12CommandProcessor::UpdateBindings( sampler_parameters, provider.OffsetSamplerDescriptor( sampler_bindless_heap_cpu_start_, sampler_index)); - texture_cache_bindless_sampler_map_.insert( - {sampler_parameters.value, sampler_index}); + texture_cache_bindless_sampler_map_.emplace( + sampler_parameters.value, sampler_index); } current_sampler_bindless_indices_vertex_[j] = sampler_index; } @@ -3921,8 +3915,8 @@ bool D3D12CommandProcessor::UpdateBindings( sampler_parameters, provider.OffsetSamplerDescriptor( sampler_bindless_heap_cpu_start_, sampler_index)); - texture_cache_bindless_sampler_map_.insert( - {sampler_parameters.value, sampler_index}); + texture_cache_bindless_sampler_map_.emplace( + sampler_parameters.value, sampler_index); } current_sampler_bindless_indices_pixel_[j] = sampler_index; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 5caa6bb78..ef2aa2cc3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -190,19 +190,17 @@ class D3D12CommandProcessor : public CommandProcessor { // render targets or copying to depth render targets. void SetSamplePositions(xenos::MsaaSamples sample_positions); - // Returns a pipeline state object with deferred creation by its handle. May - // return nullptr if failed to create the pipeline state object. - inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( - void* handle) const { - return pipeline_cache_->GetD3D12PipelineStateByHandle(handle); + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const { + return pipeline_cache_->GetD3D12PipelineByHandle(handle); } - // Sets the current pipeline state to a compute one. This is for cache - // invalidation primarily. A submission must be open. - void SetComputePipelineState(ID3D12PipelineState* pipeline_state); + // Sets the current pipeline to a compute one. This is for cache invalidation + // primarily. A submission must be open. + void SetComputePipeline(ID3D12PipelineState* pipeline); - // For the pipeline state cache to call when binding layout UIDs may be - // reused. + // For the pipeline cache to call when binding layout UIDs may be reused. void NotifyShaderBindingsLayoutUIDsInvalidated(); // Returns the text to display in the GPU backend name in the window title. 
@@ -327,8 +325,8 @@ class D3D12CommandProcessor : public CommandProcessor { bool EndSubmission(bool is_swap); // Checks if ending a submission right now would not cause potentially more // delay than it would reduce by making the GPU start working earlier - such - // as when there are unfinished graphics pipeline state creation requests that - // would need to be fulfilled before actually submitting the command list. + // as when there are unfinished graphics pipeline creation requests that would + // need to be fulfilled before actually submitting the command list. bool CanEndSubmissionImmediately() const; bool AwaitAllQueueOperationsCompletion() { CheckSubmissionFence(submission_current_); @@ -512,7 +510,7 @@ class D3D12CommandProcessor : public CommandProcessor { return cvars::internal_tile_height; } - inline std::pair GetSwapTextureSize() const { + std::pair GetSwapTextureSize() const { if (texture_cache_->IsResolutionScale2X()) { return std::make_pair(kSwapTextureWidth() * 2, kSwapTextureHeight() * 2); } @@ -557,13 +555,12 @@ class D3D12CommandProcessor : public CommandProcessor { // Current SSAA sample positions (to be updated by the render target cache). xenos::MsaaSamples current_sample_positions_; - // Currently bound pipeline state, either a graphics pipeline state object - // from the pipeline state cache (with potentially deferred creation - - // current_external_pipeline_state_ is nullptr in this case) or a non-Xenos - // graphics or compute pipeline state object (current_cached_pipeline_state_ - // is nullptr in this case). - void* current_cached_pipeline_state_; - ID3D12PipelineState* current_external_pipeline_state_; + // Currently bound pipeline, either a graphics pipeline from the pipeline + // cache (with potentially deferred creation - current_external_pipeline_ is + // nullptr in this case) or a non-Xenos graphics or compute pipeline + // (current_cached_pipeline_ is nullptr in this case). + void* current_cached_pipeline_; + ID3D12PipelineState* current_external_pipeline_; // Currently bound graphics root signature. 
ID3D12RootSignature* current_graphics_root_signature_; diff --git a/src/xenia/gpu/d3d12/d3d12_graphics_system.cc b/src/xenia/gpu/d3d12/d3d12_graphics_system.cc index e50bbbaac..d32f223ce 100644 --- a/src/xenia/gpu/d3d12/d3d12_graphics_system.cc +++ b/src/xenia/gpu/d3d12/d3d12_graphics_system.cc @@ -157,7 +157,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor, stretch_pipeline_desc.SampleDesc.Count = 1; if (FAILED(device->CreateGraphicsPipelineState( &stretch_pipeline_desc, IID_PPV_ARGS(&stretch_pipeline_)))) { - XELOGE("Failed to create the front buffer stretch pipeline state"); + XELOGE("Failed to create the front buffer stretch pipeline"); stretch_gamma_root_signature_->Release(); stretch_gamma_root_signature_ = nullptr; stretch_root_signature_->Release(); @@ -170,8 +170,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor, if (FAILED(device->CreateGraphicsPipelineState( &stretch_pipeline_desc, IID_PPV_ARGS(&stretch_gamma_pipeline_)))) { XELOGE( - "Failed to create the gamma-correcting front buffer stretch " - "pipeline state"); + "Failed to create the gamma-correcting front buffer stretch pipeline"); stretch_pipeline_->Release(); stretch_pipeline_ = nullptr; stretch_gamma_root_signature_->Release(); diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index 7eb4ac6e0..c24d6a00a 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -85,7 +85,7 @@ class D3D12Shader : public Shader { return sampler_bindings_.data(); } - // For owning subsystems like the pipeline state cache, accessors for unique + // For owning subsystems like the pipeline cache, accessors for unique // identifiers (used instead of hashes to make sure collisions can't happen) // of binding layouts used by the shader, for invalidation if a shader with an // incompatible layout was bound. diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index 6620cecaa..dc918bb11 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -48,7 +48,7 @@ class D3D12SharedMemory : public SharedMemory { // UseForReading or UseForWriting. // Makes the buffer usable for vertices, indices and texture untiling. - inline void UseForReading() { + void UseForReading() { // Vertex fetch is also allowed in pixel shaders. CommitUAVWritesAndTransitionBuffer( D3D12_RESOURCE_STATE_INDEX_BUFFER | @@ -56,18 +56,18 @@ class D3D12SharedMemory : public SharedMemory { D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); } // Makes the buffer usable for texture tiling after a resolve. - inline void UseForWriting() { + void UseForWriting() { CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); } // Makes the buffer usable as a source for copy commands. - inline void UseAsCopySource() { + void UseAsCopySource() { CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); } // Must be called when doing draws/dispatches modifying data within the shared // memory buffer as a UAV, to make sure that when UseForWriting is called the // next time, a UAV barrier will be done, and subsequent overlapping UAV // writes and reads are ordered. 
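
The `D3D12SharedMemory` accessors above all funnel into `CommitUAVWritesAndTransitionBuffer`: flush any pending UAV writes with a UAV barrier, then transition the buffer to the requested state if it is not already there. A plausible shape for that state tracking, written as an assumption rather than the actual implementation:

```cpp
#include <d3d12.h>

// Sketch only: pending-UAV-write flush followed by a state transition.
class BufferStateTracker {
 public:
  BufferStateTracker(ID3D12Resource* buffer, D3D12_RESOURCE_STATES state)
      : buffer_(buffer), state_(state) {}

  void MarkUAVWritesCommitNeeded() {
    if (state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
      uav_writes_commit_needed_ = true;
    }
  }

  void CommitUAVWritesAndTransition(ID3D12GraphicsCommandList* list,
                                    D3D12_RESOURCE_STATES new_state) {
    if (state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS &&
        uav_writes_commit_needed_) {
      // Order earlier UAV writes before any later access to the buffer.
      D3D12_RESOURCE_BARRIER barrier = {};
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
      barrier.UAV.pResource = buffer_;
      list->ResourceBarrier(1, &barrier);
      uav_writes_commit_needed_ = false;
    }
    if (state_ != new_state) {
      D3D12_RESOURCE_BARRIER barrier = {};
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barrier.Transition.pResource = buffer_;
      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barrier.Transition.StateBefore = state_;
      barrier.Transition.StateAfter = new_state;
      list->ResourceBarrier(1, &barrier);
      state_ = new_state;
    }
  }

 private:
  ID3D12Resource* buffer_;
  D3D12_RESOURCE_STATES state_;
  bool uav_writes_commit_needed_ = false;
};
```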
- inline void MarkUAVWritesCommitNeeded() { + void MarkUAVWritesCommitNeeded() { if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { buffer_uav_writes_commit_needed_ = true; } diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index 2b013e8ad..eb8d8922e 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -209,9 +209,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, } } break; case Command::kSetPipelineStateHandle: { - current_pipeline_state = - command_processor_.GetD3D12PipelineStateByHandle( - *reinterpret_cast(stream)); + current_pipeline_state = command_processor_.GetD3D12PipelineByHandle( + *reinterpret_cast(stream)); if (current_pipeline_state) { command_list->SetPipelineState(current_pipeline_state); } diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index 9393798c3..e8060371c 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -33,7 +33,7 @@ class DeferredCommandList { void Execute(ID3D12GraphicsCommandList* command_list, ID3D12GraphicsCommandList1* command_list_1); - inline void D3DClearUnorderedAccessViewUint( + void D3DClearUnorderedAccessViewUint( D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap, D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle, ID3D12Resource* resource, const UINT values[4], UINT num_rects, const D3D12_RECT* rects) { @@ -51,9 +51,9 @@ class DeferredCommandList { } } - inline void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset, - ID3D12Resource* src_buffer, UINT64 src_offset, - UINT64 num_bytes) { + void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset, + ID3D12Resource* src_buffer, UINT64 src_offset, + UINT64 num_bytes) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DCopyBufferRegion, sizeof(D3DCopyBufferRegionArguments))); args.dst_buffer = dst_buffer; @@ -63,26 +63,26 @@ class DeferredCommandList { args.num_bytes = num_bytes; } - inline void D3DCopyResource(ID3D12Resource* dst_resource, - ID3D12Resource* src_resource) { + void D3DCopyResource(ID3D12Resource* dst_resource, + ID3D12Resource* src_resource) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DCopyResource, sizeof(D3DCopyResourceArguments))); args.dst_resource = dst_resource; args.src_resource = src_resource; } - inline void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst, - const D3D12_TEXTURE_COPY_LOCATION& src) { + void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst, + const D3D12_TEXTURE_COPY_LOCATION& src) { auto& args = *reinterpret_cast( WriteCommand(Command::kCopyTexture, sizeof(CopyTextureArguments))); std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION)); std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION)); } - inline void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, - UINT dst_x, UINT dst_y, UINT dst_z, - const D3D12_TEXTURE_COPY_LOCATION& src, - const D3D12_BOX& src_box) { + void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, UINT dst_x, + UINT dst_y, UINT dst_z, + const D3D12_TEXTURE_COPY_LOCATION& src, + const D3D12_BOX& src_box) { auto& args = *reinterpret_cast(WriteCommand( Command::kCopyTextureRegion, sizeof(CopyTextureRegionArguments))); std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION)); @@ -93,8 +93,8 @@ class DeferredCommandList { args.src_box = src_box; } - inline void 
D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y, - UINT thread_group_count_z) { + void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y, + UINT thread_group_count_z) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DDispatch, sizeof(D3DDispatchArguments))); args.thread_group_count_x = thread_group_count_x; @@ -102,11 +102,10 @@ class DeferredCommandList { args.thread_group_count_z = thread_group_count_z; } - inline void D3DDrawIndexedInstanced(UINT index_count_per_instance, - UINT instance_count, - UINT start_index_location, - INT base_vertex_location, - UINT start_instance_location) { + void D3DDrawIndexedInstanced(UINT index_count_per_instance, + UINT instance_count, UINT start_index_location, + INT base_vertex_location, + UINT start_instance_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DDrawIndexedInstanced, sizeof(D3DDrawIndexedInstancedArguments))); @@ -117,9 +116,9 @@ class DeferredCommandList { args.start_instance_location = start_instance_location; } - inline void D3DDrawInstanced(UINT vertex_count_per_instance, - UINT instance_count, UINT start_vertex_location, - UINT start_instance_location) { + void D3DDrawInstanced(UINT vertex_count_per_instance, UINT instance_count, + UINT start_vertex_location, + UINT start_instance_location) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DDrawInstanced, sizeof(D3DDrawInstancedArguments))); args.vertex_count_per_instance = vertex_count_per_instance; @@ -128,7 +127,7 @@ class DeferredCommandList { args.start_instance_location = start_instance_location; } - inline void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) { + void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DIASetIndexBuffer, sizeof(D3D12_INDEX_BUFFER_VIEW))); if (view != nullptr) { @@ -142,14 +141,13 @@ class DeferredCommandList { } } - inline void D3DIASetPrimitiveTopology( - D3D12_PRIMITIVE_TOPOLOGY primitive_topology) { + void D3DIASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitive_topology) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DIASetPrimitiveTopology, sizeof(D3D12_PRIMITIVE_TOPOLOGY))); arg = primitive_topology; } - inline void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) { + void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) { auto args = reinterpret_cast( WriteCommand(Command::kD3DOMSetBlendFactor, 4 * sizeof(FLOAT))); args[0] = blend_factor[0]; @@ -158,7 +156,7 @@ class DeferredCommandList { args[3] = blend_factor[3]; } - inline void D3DOMSetRenderTargets( + void D3DOMSetRenderTargets( UINT num_render_target_descriptors, const D3D12_CPU_DESCRIPTOR_HANDLE* render_target_descriptors, BOOL rts_single_handle_to_descriptor_range, @@ -185,14 +183,14 @@ class DeferredCommandList { } } - inline void D3DOMSetStencilRef(UINT stencil_ref) { + void D3DOMSetStencilRef(UINT stencil_ref) { auto& arg = *reinterpret_cast( WriteCommand(Command::kD3DOMSetStencilRef, sizeof(UINT))); arg = stencil_ref; } - inline void D3DResourceBarrier(UINT num_barriers, - const D3D12_RESOURCE_BARRIER* barriers) { + void D3DResourceBarrier(UINT num_barriers, + const D3D12_RESOURCE_BARRIER* barriers) { if (num_barriers == 0) { return; } @@ -207,21 +205,22 @@ class DeferredCommandList { num_barriers * sizeof(D3D12_RESOURCE_BARRIER)); } - inline void RSSetScissorRect(const D3D12_RECT& rect) { + void RSSetScissorRect(const D3D12_RECT& rect) { auto& arg = *reinterpret_cast( WriteCommand(Command::kRSSetScissorRect, 
sizeof(D3D12_RECT))); arg = rect; } - inline void RSSetViewport(const D3D12_VIEWPORT& viewport) { + void RSSetViewport(const D3D12_VIEWPORT& viewport) { auto& arg = *reinterpret_cast( WriteCommand(Command::kRSSetViewport, sizeof(D3D12_VIEWPORT))); arg = viewport; } - inline void D3DSetComputeRoot32BitConstants( - UINT root_parameter_index, UINT num_32bit_values_to_set, - const void* src_data, UINT dest_offset_in_32bit_values) { + void D3DSetComputeRoot32BitConstants(UINT root_parameter_index, + UINT num_32bit_values_to_set, + const void* src_data, + UINT dest_offset_in_32bit_values) { if (num_32bit_values_to_set == 0) { return; } @@ -235,9 +234,10 @@ class DeferredCommandList { std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t)); } - inline void D3DSetGraphicsRoot32BitConstants( - UINT root_parameter_index, UINT num_32bit_values_to_set, - const void* src_data, UINT dest_offset_in_32bit_values) { + void D3DSetGraphicsRoot32BitConstants(UINT root_parameter_index, + UINT num_32bit_values_to_set, + const void* src_data, + UINT dest_offset_in_32bit_values) { if (num_32bit_values_to_set == 0) { return; } @@ -251,7 +251,7 @@ class DeferredCommandList { std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t)); } - inline void D3DSetComputeRootConstantBufferView( + void D3DSetComputeRootConstantBufferView( UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetComputeRootConstantBufferView, @@ -260,7 +260,7 @@ class DeferredCommandList { args.buffer_location = buffer_location; } - inline void D3DSetGraphicsRootConstantBufferView( + void D3DSetGraphicsRootConstantBufferView( UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetGraphicsRootConstantBufferView, @@ -269,7 +269,7 @@ class DeferredCommandList { args.buffer_location = buffer_location; } - inline void D3DSetComputeRootDescriptorTable( + void D3DSetComputeRootDescriptorTable( UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetComputeRootDescriptorTable, @@ -278,7 +278,7 @@ class DeferredCommandList { args.base_descriptor.ptr = base_descriptor.ptr; } - inline void D3DSetGraphicsRootDescriptorTable( + void D3DSetGraphicsRootDescriptorTable( UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetGraphicsRootDescriptorTable, @@ -287,42 +287,40 @@ class DeferredCommandList { args.base_descriptor.ptr = base_descriptor.ptr; } - inline void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) { + void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetComputeRootSignature, sizeof(ID3D12RootSignature*))); arg = root_signature; } - inline void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) { + void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetGraphicsRootSignature, sizeof(ID3D12RootSignature*))); arg = root_signature; } - inline void SetDescriptorHeaps( - ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap, - ID3D12DescriptorHeap* sampler_descriptor_heap) { + void SetDescriptorHeaps(ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap, + ID3D12DescriptorHeap* sampler_descriptor_heap) { auto& args = 
*reinterpret_cast(WriteCommand( Command::kSetDescriptorHeaps, sizeof(SetDescriptorHeapsArguments))); args.cbv_srv_uav_descriptor_heap = cbv_srv_uav_descriptor_heap; args.sampler_descriptor_heap = sampler_descriptor_heap; } - inline void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) { + void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetPipelineState, sizeof(ID3D12PipelineState*))); arg = pipeline_state; } - inline void SetPipelineStateHandle(void* pipeline_state_handle) { + void SetPipelineStateHandle(void* pipeline_state_handle) { auto& arg = *reinterpret_cast( WriteCommand(Command::kSetPipelineStateHandle, sizeof(void*))); arg = pipeline_state_handle; } - inline void D3DSetSamplePositions( - UINT num_samples_per_pixel, UINT num_pixels, - const D3D12_SAMPLE_POSITION* sample_positions) { + void D3DSetSamplePositions(UINT num_samples_per_pixel, UINT num_pixels, + const D3D12_SAMPLE_POSITION* sample_positions) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetSamplePositions, sizeof(D3DSetSamplePositionsArguments))); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 3a9f609d3..b2db2654e 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -43,10 +43,10 @@ DEFINE_bool( "D3D12"); DEFINE_int32( d3d12_pipeline_creation_threads, -1, - "Number of threads used for graphics pipeline state object creation. -1 to " - "calculate automatically (75% of logical CPU cores), a positive number to " - "specify the number of threads explicitly (up to the number of logical CPU " - "cores), 0 to disable multithreaded pipeline state object creation.", + "Number of threads used for graphics pipeline creation. -1 to calculate " + "automatically (75% of logical CPU cores), a positive number to specify " + "the number of threads explicitly (up to the number of logical CPU cores), " + "0 to disable multithreaded pipeline creation.", "D3D12"); DEFINE_bool(d3d12_tessellation_wireframe, false, "Display tessellated surfaces as wireframe for debugging.", @@ -125,8 +125,8 @@ bool PipelineCache::Initialize() { logical_processor_count = 6; } // Initialize creation thread synchronization data even if not using creation - // threads because they may be used anyway to create pipeline state objects - // from the storage. + // threads because they may be used anyway to create pipelines from the + // storage. creation_threads_busy_ = 0; creation_completion_event_ = xe::threading::Event::CreateManualResetEvent(true); @@ -145,7 +145,7 @@ bool PipelineCache::Initialize() { for (size_t i = 0; i < creation_thread_count; ++i) { std::unique_ptr creation_thread = xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); }); - creation_thread->set_name("D3D12 Pipeline States"); + creation_thread->set_name("D3D12 Pipelines"); creation_threads_.push_back(std::move(creation_thread)); } } @@ -184,13 +184,12 @@ void PipelineCache::ClearCache(bool shutting_down) { } ShutdownShaderStorage(); - // Remove references to the current pipeline state object. - current_pipeline_state_ = nullptr; + // Remove references to the current pipeline. + current_pipeline_ = nullptr; if (!creation_threads_.empty()) { - // Empty the pipeline state object creation queue and make sure there are no - // threads currently creating pipeline state objects because pipeline states - // are going to be deleted. 
+ // Empty the pipeline creation queue and make sure there are no threads + // currently creating pipelines because pipelines are going to be deleted. bool await_creation_completion_event = false; { std::lock_guard lock(creation_request_lock_); @@ -207,13 +206,13 @@ void PipelineCache::ClearCache(bool shutting_down) { } } - // Destroy all pipeline state objects. - for (auto it : pipeline_states_) { + // Destroy all pipelines. + for (auto it : pipelines_) { it.second->state->Release(); delete it.second; } - pipeline_states_.clear(); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", 0); + pipelines_.clear(); + COUNT_profile_set("gpu/pipeline_cache/pipelines", 0); // Destroy all shaders. command_processor_.NotifyShaderBindingsLayoutUIDsInvalidated(); @@ -223,10 +222,10 @@ void PipelineCache::ClearCache(bool shutting_down) { } texture_binding_layout_map_.clear(); texture_binding_layouts_.clear(); - for (auto it : shader_map_) { + for (auto it : shaders_) { delete it.second; } - shader_map_.clear(); + shaders_.clear(); if (reinitialize_shader_storage) { InitializeShaderStorage(shader_storage_root, shader_storage_title_id, @@ -374,8 +373,7 @@ void PipelineCache::InitializeShaderStorage( } size_t ucode_byte_count = shader_header.ucode_dword_count * sizeof(uint32_t); - if (shader_map_.find(shader_header.ucode_data_hash) != - shader_map_.end()) { + if (shaders_.find(shader_header.ucode_data_hash) != shaders_.end()) { // Already added - usually shaders aren't added without the intention of // translating them imminently, so don't do additional checks to // actually ensure that translation happens right now (they would cause @@ -402,7 +400,7 @@ void PipelineCache::InitializeShaderStorage( D3D12Shader* shader = new D3D12Shader(shader_header.type, ucode_data_hash, ucode_dwords.data(), shader_header.ucode_dword_count); - shader_map_.insert({ucode_data_hash, shader}); + shaders_.emplace(ucode_data_hash, shader); // Create new threads if the currently existing threads can't keep up with // file reading, but not more than the number of logical processors minus // one. @@ -439,7 +437,7 @@ void PipelineCache::InitializeShaderStorage( } shader_translation_threads.clear(); for (D3D12Shader* shader : shaders_failed_to_translate) { - shader_map_.erase(shader->ucode_data_hash()); + shaders_.erase(shader->ucode_data_hash()); delete shader; } } @@ -460,72 +458,66 @@ void PipelineCache::InitializeShaderStorage( } // 'DXRO' or 'DXRT'. - const uint32_t pipeline_state_storage_magic_api = + const uint32_t pipeline_storage_magic_api = edram_rov_used_ ? 0x4F525844 : 0x54525844; - // Initialize the pipeline state storage stream. - uint64_t pipeline_state_storage_initialization_start_ = + // Initialize the pipeline storage stream. + uint64_t pipeline_storage_initialization_start_ = xe::Clock::QueryHostTickCount(); - auto pipeline_state_storage_file_path = + auto pipeline_storage_file_path = shader_storage_shareable_root / fmt::format("{:08X}.{}.d3d12.xpso", title_id, edram_rov_used_ ? 
"rov" : "rtv"); - pipeline_state_storage_file_ = - xe::filesystem::OpenFile(pipeline_state_storage_file_path, "a+b"); - if (!pipeline_state_storage_file_) { + pipeline_storage_file_ = + xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); + if (!pipeline_storage_file_) { XELOGE( - "Failed to open the Direct3D 12 pipeline state description storage " - "file for writing, persistent shader storage will be disabled: {}", - xe::path_to_utf8(pipeline_state_storage_file_path)); + "Failed to open the Direct3D 12 pipeline description storage file for " + "writing, persistent shader storage will be disabled: {}", + xe::path_to_utf8(pipeline_storage_file_path)); fclose(shader_storage_file_); shader_storage_file_ = nullptr; return; } - pipeline_state_storage_file_flush_needed_ = false; + pipeline_storage_file_flush_needed_ = false; // 'XEPS'. - const uint32_t pipeline_state_storage_magic = 0x53504558; + const uint32_t pipeline_storage_magic = 0x53504558; struct { uint32_t magic; uint32_t magic_api; uint32_t version_swapped; - } pipeline_state_storage_file_header; - if (fread(&pipeline_state_storage_file_header, - sizeof(pipeline_state_storage_file_header), 1, - pipeline_state_storage_file_) && - pipeline_state_storage_file_header.magic == - pipeline_state_storage_magic && - pipeline_state_storage_file_header.magic_api == - pipeline_state_storage_magic_api && - xe::byte_swap(pipeline_state_storage_file_header.version_swapped) == + } pipeline_storage_file_header; + if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_) && + pipeline_storage_file_header.magic == pipeline_storage_magic && + pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && + xe::byte_swap(pipeline_storage_file_header.version_swapped) == PipelineDescription::kVersion) { - uint64_t pipeline_state_storage_valid_bytes = - sizeof(pipeline_state_storage_file_header); - // Enqueue pipeline state descriptions written by previous Xenia executions - // until the end of the file or until a corrupted one is detected. - xe::filesystem::Seek(pipeline_state_storage_file_, 0, SEEK_END); - int64_t pipeline_state_storage_told_end = - xe::filesystem::Tell(pipeline_state_storage_file_); - size_t pipeline_state_storage_told_count = - size_t(pipeline_state_storage_told_end >= - int64_t(pipeline_state_storage_valid_bytes) - ? (uint64_t(pipeline_state_storage_told_end) - - pipeline_state_storage_valid_bytes) / - sizeof(PipelineStoredDescription) - : 0); - if (pipeline_state_storage_told_count && - xe::filesystem::Seek(pipeline_state_storage_file_, - int64_t(pipeline_state_storage_valid_bytes), - SEEK_SET)) { + uint64_t pipeline_storage_valid_bytes = + sizeof(pipeline_storage_file_header); + // Enqueue pipeline descriptions written by previous Xenia executions until + // the end of the file or until a corrupted one is detected. + xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); + int64_t pipeline_storage_told_end = + xe::filesystem::Tell(pipeline_storage_file_); + size_t pipeline_storage_told_count = size_t( + pipeline_storage_told_end >= int64_t(pipeline_storage_valid_bytes) + ? 
(uint64_t(pipeline_storage_told_end) - + pipeline_storage_valid_bytes) / + sizeof(PipelineStoredDescription) + : 0); + if (pipeline_storage_told_count && + xe::filesystem::Seek(pipeline_storage_file_, + int64_t(pipeline_storage_valid_bytes), SEEK_SET)) { std::vector pipeline_stored_descriptions; - pipeline_stored_descriptions.resize(pipeline_state_storage_told_count); - pipeline_stored_descriptions.resize(fread( - pipeline_stored_descriptions.data(), - sizeof(PipelineStoredDescription), pipeline_state_storage_told_count, - pipeline_state_storage_file_)); + pipeline_stored_descriptions.resize(pipeline_storage_told_count); + pipeline_stored_descriptions.resize( + fread(pipeline_stored_descriptions.data(), + sizeof(PipelineStoredDescription), pipeline_storage_told_count, + pipeline_storage_file_)); if (!pipeline_stored_descriptions.empty()) { // Launch additional creation threads to use all cores to create - // pipeline state objects faster. Will also be using the main thread, so - // minus 1. + // pipelines faster. Will also be using the main thread, so minus 1. size_t creation_thread_original_count = creation_threads_.size(); size_t creation_thread_needed_count = std::max(std::min(pipeline_stored_descriptions.size(), @@ -539,10 +531,10 @@ void PipelineCache::InitializeShaderStorage( {}, [this, creation_thread_index]() { CreationThread(creation_thread_index); }); - creation_thread->set_name("D3D12 Pipeline States Additional"); + creation_thread->set_name("D3D12 Pipelines"); creation_threads_.push_back(std::move(creation_thread)); } - size_t pipeline_states_created = 0; + size_t pipelines_created = 0; for (const PipelineStoredDescription& pipeline_stored_description : pipeline_stored_descriptions) { const PipelineDescription& pipeline_description = @@ -554,30 +546,28 @@ void PipelineCache::InitializeShaderStorage( 0) != pipeline_stored_description.description_hash) { break; } - pipeline_state_storage_valid_bytes += - sizeof(PipelineStoredDescription); - // Skip already known pipeline states - those have already been - // enqueued. - auto found_range = pipeline_states_.equal_range( + pipeline_storage_valid_bytes += sizeof(PipelineStoredDescription); + // Skip already known pipelines - those have already been enqueued. 
+ auto found_range = pipelines_.equal_range( pipeline_stored_description.description_hash); - bool pipeline_state_found = false; + bool pipeline_found = false; for (auto it = found_range.first; it != found_range.second; ++it) { - PipelineState* found_pipeline_state = it->second; - if (!std::memcmp(&found_pipeline_state->description.description, + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, &pipeline_description, sizeof(pipeline_description))) { - pipeline_state_found = true; + pipeline_found = true; break; } } - if (pipeline_state_found) { + if (pipeline_found) { continue; } PipelineRuntimeDescription pipeline_runtime_description; auto vertex_shader_it = - shader_map_.find(pipeline_description.vertex_shader_hash); - if (vertex_shader_it == shader_map_.end()) { + shaders_.find(pipeline_description.vertex_shader_hash); + if (vertex_shader_it == shaders_.end()) { continue; } pipeline_runtime_description.vertex_shader = vertex_shader_it->second; @@ -586,8 +576,8 @@ void PipelineCache::InitializeShaderStorage( } if (pipeline_description.pixel_shader_hash) { auto pixel_shader_it = - shader_map_.find(pipeline_description.pixel_shader_hash); - if (pixel_shader_it == shader_map_.end()) { + shaders_.find(pipeline_description.pixel_shader_hash); + if (pixel_shader_it == shaders_.end()) { continue; } pipeline_runtime_description.pixel_shader = pixel_shader_it->second; @@ -607,36 +597,33 @@ void PipelineCache::InitializeShaderStorage( std::memcpy(&pipeline_runtime_description.description, &pipeline_description, sizeof(pipeline_description)); - PipelineState* new_pipeline_state = new PipelineState; - new_pipeline_state->state = nullptr; - std::memcpy(&new_pipeline_state->description, - &pipeline_runtime_description, + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &pipeline_runtime_description, sizeof(pipeline_runtime_description)); - pipeline_states_.insert( - std::make_pair(pipeline_stored_description.description_hash, - new_pipeline_state)); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", - pipeline_states_.size()); + pipelines_.emplace(pipeline_stored_description.description_hash, + new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. { std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline_state); + creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); } else { - new_pipeline_state->state = - CreateD3D12PipelineState(pipeline_runtime_description); + new_pipeline->state = + CreateD3D12Pipeline(pipeline_runtime_description); } - ++pipeline_states_created; + ++pipelines_created; } - CreateQueuedPipelineStatesOnProcessorThread(); + CreateQueuedPipelinesOnProcessorThread(); if (creation_threads_.size() > creation_thread_original_count) { { std::lock_guard lock(creation_request_lock_); creation_threads_shutdown_from_ = creation_thread_original_count; // Assuming the queue is empty because of - // CreateQueuedPipelineStatesOnProcessorThread. + // CreateQueuedPipelinesOnProcessorThread. 
} creation_request_cond_.notify_all(); while (creation_threads_.size() > creation_thread_original_count) { @@ -664,26 +651,23 @@ void PipelineCache::InitializeShaderStorage( } } XELOGGPU( - "Created {} graphics pipeline state objects from the storage in {} " - "milliseconds", - pipeline_states_created, + "Created {} graphics pipelines from the storage in {} milliseconds", + pipelines_created, (xe::Clock::QueryHostTickCount() - - pipeline_state_storage_initialization_start_) * + pipeline_storage_initialization_start_) * 1000 / xe::Clock::QueryHostTickFrequency()); } } - xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, - pipeline_state_storage_valid_bytes); + xe::filesystem::TruncateStdioFile(pipeline_storage_file_, + pipeline_storage_valid_bytes); } else { - xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, 0); - pipeline_state_storage_file_header.magic = pipeline_state_storage_magic; - pipeline_state_storage_file_header.magic_api = - pipeline_state_storage_magic_api; - pipeline_state_storage_file_header.version_swapped = + xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0); + pipeline_storage_file_header.magic = pipeline_storage_magic; + pipeline_storage_file_header.magic_api = pipeline_storage_magic_api; + pipeline_storage_file_header.version_swapped = xe::byte_swap(PipelineDescription::kVersion); - fwrite(&pipeline_state_storage_file_header, - sizeof(pipeline_state_storage_file_header), 1, - pipeline_state_storage_file_); + fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_); } shader_storage_root_ = storage_root; @@ -691,7 +675,7 @@ void PipelineCache::InitializeShaderStorage( // Start the storage writing thread. storage_write_flush_shaders_ = false; - storage_write_flush_pipeline_states_ = false; + storage_write_flush_pipelines_ = false; storage_write_thread_shutdown_ = false; storage_write_thread_ = xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); }); @@ -708,12 +692,12 @@ void PipelineCache::ShutdownShaderStorage() { storage_write_thread_.reset(); } storage_write_shader_queue_.clear(); - storage_write_pipeline_state_queue_.clear(); + storage_write_pipeline_queue_.clear(); - if (pipeline_state_storage_file_) { - fclose(pipeline_state_storage_file_); - pipeline_state_storage_file_ = nullptr; - pipeline_state_storage_file_flush_needed_ = false; + if (pipeline_storage_file_) { + fclose(pipeline_storage_file_); + pipeline_storage_file_ = nullptr; + pipeline_storage_file_flush_needed_ = false; } if (shader_storage_file_) { @@ -728,30 +712,29 @@ void PipelineCache::ShutdownShaderStorage() { void PipelineCache::EndSubmission() { if (shader_storage_file_flush_needed_ || - pipeline_state_storage_file_flush_needed_) { + pipeline_storage_file_flush_needed_) { { std::lock_guard lock(storage_write_request_lock_); if (shader_storage_file_flush_needed_) { storage_write_flush_shaders_ = true; } - if (pipeline_state_storage_file_flush_needed_) { - storage_write_flush_pipeline_states_ = true; + if (pipeline_storage_file_flush_needed_) { + storage_write_flush_pipelines_ = true; } } storage_write_request_cond_.notify_one(); shader_storage_file_flush_needed_ = false; - pipeline_state_storage_file_flush_needed_ = false; + pipeline_storage_file_flush_needed_ = false; } if (!creation_threads_.empty()) { - CreateQueuedPipelineStatesOnProcessorThread(); - // Await creation of all queued pipeline state objects. + CreateQueuedPipelinesOnProcessorThread(); + // Await creation of all queued pipelines. 
bool await_creation_completion_event; { std::lock_guard lock(creation_request_lock_); // Assuming the creation queue is already empty (because the processor - // thread also worked on creating the leftover pipeline state objects), so - // only check if there are threads with pipeline state objects currently - // being created. + // thread also worked on creating the leftover pipelines), so only check + // if there are threads with pipelines currently being created. await_creation_completion_event = creation_threads_busy_ != 0; if (await_creation_completion_event) { creation_completion_event_->Reset(); @@ -765,7 +748,7 @@ void PipelineCache::EndSubmission() { } } -bool PipelineCache::IsCreatingPipelineStates() { +bool PipelineCache::IsCreatingPipelines() { if (creation_threads_.empty()) { return false; } @@ -779,8 +762,8 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, uint32_t dword_count) { // Hash the input memory and lookup the shader. uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0); - auto it = shader_map_.find(data_hash); - if (it != shader_map_.end()) { + auto it = shaders_.find(data_hash); + if (it != shaders_.end()) { // Shader has been previously loaded. return it->second; } @@ -790,7 +773,7 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, // again. D3D12Shader* shader = new D3D12Shader(shader_type, data_hash, host_address, dword_count); - shader_map_.insert({data_hash, shader}); + shaders_.emplace(data_hash, shader); return shader; } @@ -798,11 +781,11 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() const { // If the values this functions returns are changed, INVALIDATE THE SHADER - // STORAGE (increase kVersion for BOTH shaders and pipeline states)! The - // exception is when the function originally returned "unsupported", but - // started to return a valid value (in this case the shader wouldn't be cached - // in the first place). Otherwise games will not be able to locate shaders for - // draws for which the host vertex shader type has changed! + // STORAGE (increase kVersion for BOTH shaders and pipelines)! The exception + // is when the function originally returned "unsupported", but started to + // return a valid value (in this case the shader wouldn't be cached in the + // first place). Otherwise games will not be able to locate shaders for draws + // for which the host vertex shader type has changed! 
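The storage files touched above are guarded by a small fixed header (the 'XEPS' magic, an API-specific magic for the ROV/RTV paths, and a byte-swapped description version); when any field mismatches, the file is truncated and a fresh header is written, which is also why the comment above insists on bumping kVersion whenever serialized state changes meaning. Below is a minimal sketch of that validate-or-reset pattern; `StorageHeader` and `ValidateOrResetHeader` are hypothetical names, and plain C stdio stands in for the xe::filesystem helpers.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical header layout mirroring the magic/version check in the patch.
struct StorageHeader {
  uint32_t magic;      // e.g. 'XEPS'
  uint32_t magic_api;  // e.g. 'DXRO' (ROV path) or 'DXRT' (RTV path)
  uint32_t version;    // bumped whenever the serialized description changes
};

// Returns true if the stream already starts with a compatible header;
// otherwise rewinds and writes a fresh one so stale entries are discarded.
bool ValidateOrResetHeader(FILE* file, uint32_t expected_magic,
                           uint32_t expected_magic_api,
                           uint32_t expected_version) {
  StorageHeader header;
  if (std::fread(&header, sizeof(header), 1, file) == 1 &&
      header.magic == expected_magic &&
      header.magic_api == expected_magic_api &&
      header.version == expected_version) {
    return true;  // Entries after the header may be loaded.
  }
  // Incompatible or empty file: start over. (The real code truncates the
  // file; plain C I/O has no portable truncate, so this sketch just rewrites
  // from the beginning.)
  std::rewind(file);
  header = {expected_magic, expected_magic_api, expected_version};
  std::fwrite(&header, sizeof(header), 1, file);
  return false;
}
```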
const auto& regs = register_file_; auto vgt_draw_initiator = regs.Get(); if (!xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode, @@ -929,13 +912,12 @@ bool PipelineCache::ConfigurePipeline( xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_state_handle_out, - ID3D12RootSignature** root_signature_out) { + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - assert_not_null(pipeline_state_handle_out); + assert_not_null(pipeline_handle_out); assert_not_null(root_signature_out); PipelineRuntimeDescription runtime_description; @@ -946,24 +928,24 @@ bool PipelineCache::ConfigurePipeline( } PipelineDescription& description = runtime_description.description; - if (current_pipeline_state_ != nullptr && - !std::memcmp(¤t_pipeline_state_->description.description, - &description, sizeof(description))) { - *pipeline_state_handle_out = current_pipeline_state_; + if (current_pipeline_ != nullptr && + !std::memcmp(¤t_pipeline_->description.description, &description, + sizeof(description))) { + *pipeline_handle_out = current_pipeline_; *root_signature_out = runtime_description.root_signature; return true; } - // Find an existing pipeline state object in the cache. + // Find an existing pipeline in the cache. uint64_t hash = XXH64(&description, sizeof(description), 0); - auto found_range = pipeline_states_.equal_range(hash); + auto found_range = pipelines_.equal_range(hash); for (auto it = found_range.first; it != found_range.second; ++it) { - PipelineState* found_pipeline_state = it->second; - if (!std::memcmp(&found_pipeline_state->description.description, - &description, sizeof(description))) { - current_pipeline_state_ = found_pipeline_state; - *pipeline_state_handle_out = found_pipeline_state; - *root_signature_out = found_pipeline_state->description.root_signature; + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, &description, + sizeof(description))) { + current_pipeline_ = found_pipeline; + *pipeline_handle_out = found_pipeline; + *root_signature_out = found_pipeline->description.root_signature; return true; } } @@ -974,33 +956,32 @@ bool PipelineCache::ConfigurePipeline( return false; } - PipelineState* new_pipeline_state = new PipelineState; - new_pipeline_state->state = nullptr; - std::memcpy(&new_pipeline_state->description, &runtime_description, + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &runtime_description, sizeof(runtime_description)); - pipeline_states_.insert(std::make_pair(hash, new_pipeline_state)); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", - pipeline_states_.size()); + pipelines_.emplace(hash, new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); if (!creation_threads_.empty()) { - // Submit the pipeline state object for creation to any available thread. + // Submit the pipeline for creation to any available thread. 
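ConfigurePipeline above keys pipelines by an XXH64 hash of the description but still byte-compares the full description on every multimap hit, so a hash collision can never return the wrong pipeline. A minimal sketch of that lookup-then-verify pattern, with invented `Desc`/`Entry` types standing in for the real description and pipeline structs:

```cpp
#include <cstdint>
#include <cstring>
#include <unordered_map>

// Hypothetical stand-ins for the real description/pipeline types.
struct Desc { uint32_t words[8]; };            // trivially copyable state
struct Entry { Desc desc; void* pipeline; };   // plus the created pipeline

// Find an entry whose full description matches, not just its hash.
Entry* Find(std::unordered_multimap<uint64_t, Entry*>& cache, uint64_t hash,
            const Desc& desc) {
  auto range = cache.equal_range(hash);
  for (auto it = range.first; it != range.second; ++it) {
    // The hash is only a bucket key; confirm with a byte-wise comparison.
    if (!std::memcmp(&it->second->desc, &desc, sizeof(desc))) {
      return it->second;
    }
  }
  return nullptr;  // Caller creates a new entry and emplaces it under `hash`.
}
```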
{ std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline_state); + creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); } else { - new_pipeline_state->state = CreateD3D12PipelineState(runtime_description); + new_pipeline->state = CreateD3D12Pipeline(runtime_description); } - if (pipeline_state_storage_file_) { + if (pipeline_storage_file_) { assert_not_null(storage_write_thread_); - pipeline_state_storage_file_flush_needed_ = true; + pipeline_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); - storage_write_pipeline_state_queue_.emplace_back(); + storage_write_pipeline_queue_.emplace_back(); PipelineStoredDescription& stored_description = - storage_write_pipeline_state_queue_.back(); + storage_write_pipeline_queue_.back(); stored_description.description_hash = hash; std::memcpy(&stored_description.description, &description, sizeof(description)); @@ -1008,8 +989,8 @@ bool PipelineCache::ConfigurePipeline( storage_write_request_cond_.notify_all(); } - current_pipeline_state_ = new_pipeline_state; - *pipeline_state_handle_out = new_pipeline_state; + current_pipeline_ = new_pipeline; + *pipeline_handle_out = new_pipeline; *root_signature_out = runtime_description.root_signature; return true; } @@ -1136,8 +1117,8 @@ bool PipelineCache::TranslateShader( std::memcpy( texture_binding_layouts_.data() + new_uid.vector_span_offset, texture_bindings, texture_binding_layout_bytes); - texture_binding_layout_map_.insert( - {texture_binding_layout_hash, new_uid}); + texture_binding_layout_map_.emplace(texture_binding_layout_hash, + new_uid); } } if (bindless_sampler_count) { @@ -1179,8 +1160,8 @@ bool PipelineCache::TranslateShader( vector_bindless_sampler_layout[i] = sampler_bindings[i].bindless_descriptor_index; } - bindless_sampler_layout_map_.insert( - {bindless_sampler_layout_hash, new_uid}); + bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, + new_uid); } } } @@ -1508,8 +1489,7 @@ bool PipelineCache::GetCurrentStateDescription( /* 16 */ PipelineBlendFactor::kSrcAlphaSat, }; // Like kBlendFactorMap, but with color modes changed to alpha. Some - // pipeline state objects aren't created in Prey because a color mode is - // used for alpha. + // pipelines aren't created in Prey because a color mode is used for alpha. 
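The kBlendFactorAlphaMap mentioned in the comment above exists because the guest can select a color blend factor (such as source color) in an alpha slot, where Direct3D 12 only accepts alpha factors; remapping instead of rejecting keeps such pipelines creatable (the Prey case). A hedged sketch of that kind of remap, using the D3D12 enum directly rather than the internal PipelineBlendFactor encoding used by the actual table:

```cpp
#include <d3d12.h>

// Map color-flavored blend factors to their alpha equivalents so they are
// legal in D3D12_RENDER_TARGET_BLEND_DESC::SrcBlendAlpha/DestBlendAlpha.
// Simplified illustration; the real table is indexed by the guest encoding.
D3D12_BLEND ToAlphaBlendFactor(D3D12_BLEND factor) {
  switch (factor) {
    case D3D12_BLEND_SRC_COLOR:      return D3D12_BLEND_SRC_ALPHA;
    case D3D12_BLEND_INV_SRC_COLOR:  return D3D12_BLEND_INV_SRC_ALPHA;
    case D3D12_BLEND_DEST_COLOR:     return D3D12_BLEND_DEST_ALPHA;
    case D3D12_BLEND_INV_DEST_COLOR: return D3D12_BLEND_INV_DEST_ALPHA;
    default:                         return factor;  // Already alpha-safe.
  }
}
```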
static const PipelineBlendFactor kBlendFactorAlphaMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -1569,18 +1549,16 @@ bool PipelineCache::GetCurrentStateDescription( return true; } -ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState( +ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( const PipelineRuntimeDescription& runtime_description) { const PipelineDescription& description = runtime_description.description; if (runtime_description.pixel_shader != nullptr) { - XELOGGPU( - "Creating graphics pipeline state with VS {:016X}" - ", PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { - XELOGGPU("Creating graphics pipeline state with VS {:016X}", + XELOGGPU("Creating graphics pipeline with VS {:016X}", runtime_description.vertex_shader->ucode_data_hash()); } @@ -1893,20 +1871,18 @@ ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState( } } - // Create the pipeline state object. + // Create the D3D12 pipeline state object. auto device = command_processor_.GetD3D12Context().GetD3D12Provider().GetDevice(); ID3D12PipelineState* state; if (FAILED(device->CreateGraphicsPipelineState(&state_desc, IID_PPV_ARGS(&state)))) { if (runtime_description.pixel_shader != nullptr) { - XELOGE( - "Failed to create graphics pipeline state with VS {:016X}" - ", PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { - XELOGE("Failed to create graphics pipeline state with VS {:016X}", + XELOGE("Failed to create graphics pipeline with VS {:016X}", runtime_description.vertex_shader->ucode_data_hash()); } return nullptr; @@ -1933,7 +1909,7 @@ void PipelineCache::StorageWriteThread() { ucode_guest_endian.reserve(0xFFFF); bool flush_shaders = false; - bool flush_pipeline_states = false; + bool flush_pipelines = false; while (true) { if (flush_shaders) { @@ -1941,15 +1917,15 @@ void PipelineCache::StorageWriteThread() { assert_not_null(shader_storage_file_); fflush(shader_storage_file_); } - if (flush_pipeline_states) { - flush_pipeline_states = false; - assert_not_null(pipeline_state_storage_file_); - fflush(pipeline_state_storage_file_); + if (flush_pipelines) { + flush_pipelines = false; + assert_not_null(pipeline_storage_file_); + fflush(pipeline_storage_file_); } std::pair shader_pair = {}; PipelineStoredDescription pipeline_description; - bool write_pipeline_state = false; + bool write_pipeline = false; { std::unique_lock lock(storage_write_request_lock_); if (storage_write_thread_shutdown_) { @@ -1962,17 +1938,17 @@ void PipelineCache::StorageWriteThread() { storage_write_flush_shaders_ = false; flush_shaders = true; } - if (!storage_write_pipeline_state_queue_.empty()) { + if (!storage_write_pipeline_queue_.empty()) { std::memcpy(&pipeline_description, - &storage_write_pipeline_state_queue_.front(), + &storage_write_pipeline_queue_.front(), sizeof(pipeline_description)); - storage_write_pipeline_state_queue_.pop_front(); - write_pipeline_state = true; - } else if (storage_write_flush_pipeline_states_) { - 
storage_write_flush_pipeline_states_ = false; - flush_pipeline_states = true; + storage_write_pipeline_queue_.pop_front(); + write_pipeline = true; + } else if (storage_write_flush_pipelines_) { + storage_write_flush_pipelines_ = false; + flush_pipelines = true; } - if (!shader_pair.first && !write_pipeline_state) { + if (!shader_pair.first && !write_pipeline) { storage_write_request_cond_.wait(lock); continue; } @@ -1999,27 +1975,26 @@ void PipelineCache::StorageWriteThread() { } } - if (write_pipeline_state) { - assert_not_null(pipeline_state_storage_file_); + if (write_pipeline) { + assert_not_null(pipeline_storage_file_); fwrite(&pipeline_description, sizeof(pipeline_description), 1, - pipeline_state_storage_file_); + pipeline_storage_file_); } } } void PipelineCache::CreationThread(size_t thread_index) { while (true) { - PipelineState* pipeline_state_to_create = nullptr; + Pipeline* pipeline_to_create = nullptr; // Check if need to shut down or set the completion event and dequeue the - // pipeline state if there is any. + // pipeline if there is any. { std::unique_lock lock(creation_request_lock_); if (thread_index >= creation_threads_shutdown_from_ || creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { - // Last pipeline state object in the queue created - signal the event - // if requested. + // Last pipeline in the queue created - signal the event if requested. creation_completion_set_event_ = false; creation_completion_event_->Set(); } @@ -2029,23 +2004,22 @@ void PipelineCache::CreationThread(size_t thread_index) { creation_request_cond_.wait(lock); continue; } - // Take the pipeline state from the queue and increment the busy thread - // count until the pipeline state object is created - other threads must - // be able to dequeue requests, but can't set the completion event until - // the pipeline state objects are fully created (rather than just started - // creating). - pipeline_state_to_create = creation_queue_.front(); + // Take the pipeline from the queue and increment the busy thread count + // until the pipeline is created - other threads must be able to dequeue + // requests, but can't set the completion event until the pipelines are + // fully created (rather than just started creating). + pipeline_to_create = creation_queue_.front(); creation_queue_.pop_front(); ++creation_threads_busy_; } // Create the D3D12 pipeline state object. - pipeline_state_to_create->state = - CreateD3D12PipelineState(pipeline_state_to_create->description); + pipeline_to_create->state = + CreateD3D12Pipeline(pipeline_to_create->description); - // Pipeline state object created - the thread is not busy anymore, safe to - // set the completion event if needed (at the next iteration, or in some - // other thread). + // Pipeline created - the thread is not busy anymore, safe to set the + // completion event if needed (at the next iteration, or in some other + // thread). 
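The creation threads above share one queue, one busy counter and a manual-reset completion event: a thread dequeues under the lock, bumps the busy count, creates the pipeline outside the lock, then decrements and only signals completion when both the queue and the busy count are empty. Below is a compact, self-contained version of that pattern using standard threading primitives instead of xe::threading; all names are illustrative.

```cpp
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>

struct Job { /* description to create a pipeline from */ };

class CreationQueue {
 public:
  void Push(Job* job) {
    { std::lock_guard<std::mutex> lock(mutex_); queue_.push_back(job); }
    cond_.notify_one();
  }

  void Shutdown() {
    { std::lock_guard<std::mutex> lock(mutex_); shutdown_ = true; }
    cond_.notify_all();
  }

  // Worker loop body: returns when shutdown is requested.
  void Worker() {
    while (true) {
      Job* job = nullptr;
      {
        std::unique_lock<std::mutex> lock(mutex_);
        cond_.wait(lock, [&] { return shutdown_ || !queue_.empty(); });
        if (shutdown_) return;
        job = queue_.front();
        queue_.pop_front();
        ++busy_;  // Completion can't be reported while this job is in flight.
      }
      Create(job);  // Potentially slow; runs outside the lock.
      {
        std::lock_guard<std::mutex> lock(mutex_);
        if (--busy_ == 0 && queue_.empty()) {
          done_.notify_all();  // Stands in for the manual-reset event.
        }
      }
    }
  }

  // Block until every queued job has fully finished creating.
  void AwaitAll() {
    std::unique_lock<std::mutex> lock(mutex_);
    done_.wait(lock, [&] { return busy_ == 0 && queue_.empty(); });
  }

 private:
  static void Create(Job*) { /* build the D3D12 pipeline state here */ }
  std::mutex mutex_;
  std::condition_variable cond_, done_;
  std::deque<Job*> queue_;
  std::size_t busy_ = 0;
  bool shutdown_ = false;
};
```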
{ std::lock_guard lock(creation_request_lock_); --creation_threads_busy_; @@ -2053,20 +2027,20 @@ void PipelineCache::CreationThread(size_t thread_index) { } } -void PipelineCache::CreateQueuedPipelineStatesOnProcessorThread() { +void PipelineCache::CreateQueuedPipelinesOnProcessorThread() { assert_false(creation_threads_.empty()); while (true) { - PipelineState* pipeline_state_to_create; + Pipeline* pipeline_to_create; { std::lock_guard lock(creation_request_lock_); if (creation_queue_.empty()) { break; } - pipeline_state_to_create = creation_queue_.front(); + pipeline_to_create = creation_queue_.front(); creation_queue_.pop_front(); } - pipeline_state_to_create->state = - CreateD3D12PipelineState(pipeline_state_to_create->description); + pipeline_to_create->state = + CreateD3D12Pipeline(pipeline_to_create->description); } } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index cdc6ed5f3..8159416d0 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -29,6 +29,7 @@ #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" +#include "xenia/ui/d3d12/d3d12_api.h" namespace xe { namespace gpu { @@ -54,7 +55,7 @@ class PipelineCache { void ShutdownShaderStorage(); void EndSubmission(); - bool IsCreatingPipelineStates(); + bool IsCreatingPipelines(); D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count); @@ -73,14 +74,12 @@ class PipelineCache { xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_state_handle_out, - ID3D12RootSignature** root_signature_out); + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); - // Returns a pipeline state object with deferred creation by its handle. May - // return nullptr if failed to create the pipeline state object. - inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( - void* handle) const { - return reinterpret_cast(handle)->state; + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const { + return reinterpret_cast(handle)->state; } private: @@ -237,7 +236,7 @@ class PipelineCache { const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out); - ID3D12PipelineState* CreateD3D12PipelineState( + ID3D12PipelineState* CreateD3D12Pipeline( const PipelineRuntimeDescription& runtime_description); D3D12CommandProcessor& command_processor_; @@ -255,9 +254,9 @@ class PipelineCache { IDxcUtils* dxc_utils_ = nullptr; IDxcCompiler* dxc_compiler_ = nullptr; - // All loaded shaders mapped by their guest hash key. + // Ucode hash -> shader. std::unordered_map> - shader_map_; + shaders_; struct LayoutUID { size_t uid; @@ -285,21 +284,20 @@ class PipelineCache { // Xenos pixel shader provided. std::vector depth_only_pixel_shader_; - struct PipelineState { + struct Pipeline { // nullptr if creation has failed. ID3D12PipelineState* state; PipelineRuntimeDescription description; }; - // All previously generated pipeline state objects identified by hash and the - // description. - std::unordered_multimap> - pipeline_states_; + pipelines_; - // Previously used pipeline state object. 
This matches our current state - // settings and allows us to quickly(ish) reuse the pipeline state if no - // registers have changed. - PipelineState* current_pipeline_state_ = nullptr; + // Previously used pipeline. This matches our current state settings and + // allows us to quickly(ish) reuse the pipeline if no registers have been + // changed. + Pipeline* current_pipeline_ = nullptr; // Currently open shader storage path. std::filesystem::path shader_storage_root_; @@ -309,10 +307,9 @@ class PipelineCache { FILE* shader_storage_file_ = nullptr; bool shader_storage_file_flush_needed_ = false; - // Pipeline state storage output stream, for preload in the next emulator - // runs. - FILE* pipeline_state_storage_file_ = nullptr; - bool pipeline_state_storage_file_flush_needed_ = false; + // Pipeline storage output stream, for preload in the next emulator runs. + FILE* pipeline_storage_file_ = nullptr; + bool pipeline_storage_file_flush_needed_ = false; // Thread for asynchronous writing to the storage streams. void StorageWriteThread(); @@ -322,28 +319,27 @@ class PipelineCache { // thread is notified about its change via storage_write_request_cond_. std::deque> storage_write_shader_queue_; - std::deque storage_write_pipeline_state_queue_; + std::deque storage_write_pipeline_queue_; bool storage_write_flush_shaders_ = false; - bool storage_write_flush_pipeline_states_ = false; + bool storage_write_flush_pipelines_ = false; bool storage_write_thread_shutdown_ = false; std::unique_ptr storage_write_thread_; - // Pipeline state object creation threads. + // Pipeline creation threads. void CreationThread(size_t thread_index); - void CreateQueuedPipelineStatesOnProcessorThread(); + void CreateQueuedPipelinesOnProcessorThread(); std::mutex creation_request_lock_; std::condition_variable creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. - std::deque creation_queue_; - // Number of threads that are currently creating a pipeline state object - - // incremented when a pipeline state object is dequeued (the completion event - // can't be triggered before this is zero). Protected with - // creation_request_lock_. + std::deque creation_queue_; + // Number of threads that are currently creating a pipeline - incremented when + // a pipeline is dequeued (the completion event can't be triggered before this + // is zero). Protected with creation_request_lock_. size_t creation_threads_busy_ = 0; - // Manual-reset event set when the last queued pipeline state object is - // created and there are no more pipeline state objects to create. This is - // triggered by the thread creating the last pipeline state object. + // Manual-reset event set when the last queued pipeline is created and there + // are no more pipelines to create. This is triggered by the thread creating + // the last pipeline. std::unique_ptr creation_completion_event_; // Whether setting the event on completion is queued. Protected with // creation_request_lock_, notify_one creation_request_cond_ when set. 
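CreateD3D12Pipeline declared above ultimately comes down to filling a D3D12_GRAPHICS_PIPELINE_STATE_DESC and calling ID3D12Device::CreateGraphicsPipelineState, returning nullptr on failure so the handle stays usable but marked as failed. A hedged sketch of that final step under simple fixed state; the helper name is hypothetical and most guest-derived fields the real function translates are left at defaults here.

```cpp
#include <d3d12.h>

// Illustrative only: build one graphics PSO from already-translated bytecode.
ID3D12PipelineState* CreateGraphicsPipelineSketch(
    ID3D12Device* device, ID3D12RootSignature* root_signature,
    D3D12_SHADER_BYTECODE vs, D3D12_SHADER_BYTECODE ps, DXGI_FORMAT rtv_format,
    DXGI_FORMAT dsv_format) {
  D3D12_GRAPHICS_PIPELINE_STATE_DESC desc = {};
  desc.pRootSignature = root_signature;
  desc.VS = vs;
  desc.PS = ps;
  desc.BlendState.RenderTarget[0].RenderTargetWriteMask =
      D3D12_COLOR_WRITE_ENABLE_ALL;
  desc.SampleMask = 0xFFFFFFFFu;
  desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
  desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
  desc.RasterizerState.DepthClipEnable = TRUE;
  desc.DepthStencilState.DepthEnable = TRUE;
  desc.DepthStencilState.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
  desc.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL;
  desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
  desc.NumRenderTargets = 1;
  desc.RTVFormats[0] = rtv_format;
  desc.DSVFormat = dsv_format;
  desc.SampleDesc.Count = 1;
  ID3D12PipelineState* state = nullptr;
  if (FAILED(
          device->CreateGraphicsPipelineState(&desc, IID_PPV_ARGS(&state)))) {
    return nullptr;  // The caller logs and treats the pipeline as unusable.
  }
  return state;
}
```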
diff --git a/src/xenia/gpu/d3d12/premake5.lua b/src/xenia/gpu/d3d12/premake5.lua index b4b9f3ecb..fa82fdb6c 100644 --- a/src/xenia/gpu/d3d12/premake5.lua +++ b/src/xenia/gpu/d3d12/premake5.lua @@ -25,15 +25,6 @@ project("xenia-gpu-d3d12-trace-viewer") kind("WindowedApp") language("C++") links({ - "aes_128", - "capstone", - "dxbc", - "fmt", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -49,6 +40,17 @@ project("xenia-gpu-d3d12-trace-viewer") "xenia-ui-d3d12", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "dxbc", + "fmt", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", "xxhash", }) files({ @@ -71,15 +73,6 @@ project("xenia-gpu-d3d12-trace-dump") kind("ConsoleApp") language("C++") links({ - "aes_128", - "capstone", - "dxbc", - "fmt", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -95,6 +88,17 @@ project("xenia-gpu-d3d12-trace-dump") "xenia-ui-d3d12", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "dxbc", + "fmt", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", "xxhash", }) files({ @@ -109,4 +113,4 @@ project("xenia-gpu-d3d12-trace-dump") "2>&1", "1>scratch/stdout-trace-dump.txt", }) - end \ No newline at end of file + end diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index d4f989123..90ba11ac5 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -454,8 +454,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( // again and again and exit. if (!conversion_needed || converted_index_count == 0) { converted_indices.gpu_address = 0; - converted_indices_cache_.insert( - std::make_pair(converted_indices.key.value, converted_indices)); + converted_indices_cache_.emplace(converted_indices.key.value, + converted_indices); memory_regions_used_ |= memory_regions_used_bits; return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty : ConversionResult::kConversionNotNeeded; @@ -670,8 +670,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( // Cache and return the indices. converted_indices.gpu_address = gpu_address; - converted_indices_cache_.insert( - std::make_pair(converted_indices.key.value, converted_indices)); + converted_indices_cache_.emplace(converted_indices.key.value, + converted_indices); memory_regions_used_ |= memory_regions_used_bits; gpu_address_out = gpu_address; index_count_out = converted_index_count; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index b2c964a55..66ef2ba9f 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -277,20 +277,19 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } - // Create the EDRAM load/store pipeline state objects. + // Create the EDRAM load/store pipelines. 
for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) { const EdramLoadStoreModeInfo& mode_info = edram_load_store_mode_info_[i]; - edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState( + edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.load_shader, mode_info.load_shader_size, edram_load_store_root_signature_); - edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState( + edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.store_shader, mode_info.store_shader_size, edram_load_store_root_signature_); if (edram_load_pipelines_[i] == nullptr || edram_store_pipelines_[i] == nullptr) { - XELOGE( - "Failed to create the EDRAM load/store pipeline states for mode {}", - i); + XELOGE("Failed to create the EDRAM load/store pipelines for mode {}", + i); Shutdown(); return false; } @@ -299,7 +298,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { } } - // Create the resolve root signatures and pipeline state objects. + // Create the resolve root signatures and pipelines. D3D12_ROOT_PARAMETER resolve_root_parameters[3]; // Copying root signature. @@ -369,7 +368,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } - // Copying pipeline state objects. + // Copying pipelines. uint32_t resolution_scale = resolution_scale_2x_ ? 2 : 1; for (size_t i = 0; i < size_t(draw_util::ResolveCopyShaderIndex::kCount); ++i) { @@ -381,63 +380,61 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { continue; } const auto& resolve_copy_shader = resolve_copy_shaders_[i]; - ID3D12PipelineState* resolve_copy_pipeline_state = - ui::d3d12::util::CreateComputePipelineState( + ID3D12PipelineState* resolve_copy_pipeline = + ui::d3d12::util::CreateComputePipeline( device, resolve_copy_shader.first, resolve_copy_shader.second, resolve_copy_root_signature_); - if (resolve_copy_pipeline_state == nullptr) { - XELOGE("Failed to create {} resolve copy pipeline state", + if (resolve_copy_pipeline == nullptr) { + XELOGE("Failed to create {} resolve copy pipeline", resolve_copy_shader_info.debug_name); } - resolve_copy_pipeline_state->SetName(reinterpret_cast( + resolve_copy_pipeline->SetName(reinterpret_cast( xe::to_utf16(resolve_copy_shader_info.debug_name).c_str())); - resolve_copy_pipeline_states_[i] = resolve_copy_pipeline_state; + resolve_copy_pipelines_[i] = resolve_copy_pipeline; } - // Clearing pipeline state objects. - resolve_clear_32bpp_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( - device, - resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs - : resolve_clear_32bpp_cs, - resolution_scale_2x_ ? sizeof(resolve_clear_32bpp_2xres_cs) - : sizeof(resolve_clear_32bpp_cs), - resolve_clear_root_signature_); - if (resolve_clear_32bpp_pipeline_state_ == nullptr) { - XELOGE("Failed to create the 32bpp resolve clear pipeline state"); + // Clearing pipelines. + resolve_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, + resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs + : resolve_clear_32bpp_cs, + resolution_scale_2x_ ? 
sizeof(resolve_clear_32bpp_2xres_cs) + : sizeof(resolve_clear_32bpp_cs), + resolve_clear_root_signature_); + if (resolve_clear_32bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the 32bpp resolve clear pipeline"); Shutdown(); return false; } - resolve_clear_32bpp_pipeline_state_->SetName(L"Resolve Clear 32bpp"); - resolve_clear_64bpp_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( - device, - resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs - : resolve_clear_64bpp_cs, - resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs) - : sizeof(resolve_clear_64bpp_cs), - resolve_clear_root_signature_); - if (resolve_clear_64bpp_pipeline_state_ == nullptr) { - XELOGE("Failed to create the 64bpp resolve clear pipeline state"); + resolve_clear_32bpp_pipeline_->SetName(L"Resolve Clear 32bpp"); + resolve_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, + resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs + : resolve_clear_64bpp_cs, + resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs) + : sizeof(resolve_clear_64bpp_cs), + resolve_clear_root_signature_); + if (resolve_clear_64bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the 64bpp resolve clear pipeline"); Shutdown(); return false; } - resolve_clear_64bpp_pipeline_state_->SetName(L"Resolve Clear 64bpp"); + resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp"); if (!edram_rov_used_) { assert_false(resolution_scale_2x_); - resolve_clear_depth_24_32_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( + resolve_clear_depth_24_32_pipeline_ = + ui::d3d12::util::CreateComputePipeline( device, resolve_clear_depth_24_32_cs, sizeof(resolve_clear_depth_24_32_cs), resolve_clear_root_signature_); - if (resolve_clear_depth_24_32_pipeline_state_ == nullptr) { + if (resolve_clear_depth_24_32_pipeline_ == nullptr) { XELOGE( "Failed to create the 24-bit and 32-bit depth resolve clear pipeline " "state"); Shutdown(); return false; } - resolve_clear_64bpp_pipeline_state_->SetName( + resolve_clear_64bpp_pipeline_->SetName( L"Resolve Clear 24-bit & 32-bit Depth"); } @@ -451,12 +448,12 @@ void RenderTargetCache::Shutdown() { edram_snapshot_restore_pool_.reset(); ui::d3d12::util::ReleaseAndNull(edram_snapshot_download_buffer_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_state_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_state_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_state_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(resolve_clear_root_signature_); - for (size_t i = 0; i < xe::countof(resolve_copy_pipeline_states_); ++i) { - ui::d3d12::util::ReleaseAndNull(resolve_copy_pipeline_states_[i]); + for (size_t i = 0; i < xe::countof(resolve_copy_pipelines_); ++i) { + ui::d3d12::util::ReleaseAndNull(resolve_copy_pipelines_[i]); } ui::d3d12::util::ReleaseAndNull(resolve_copy_root_signature_); for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) { @@ -1209,8 +1206,8 @@ bool RenderTargetCache::Resolve(const Memory& memory, 0, sizeof(copy_shader_constants) / sizeof(uint32_t), ©_shader_constants, 0); } - command_processor_.SetComputePipelineState( - resolve_copy_pipeline_states_[size_t(copy_shader)]); + command_processor_.SetComputePipeline( + resolve_copy_pipelines_[size_t(copy_shader)]); 
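Each resolve pass above ends the same way: push the pass constants as root constants, bind the matching compute pipeline, submit pending barriers, then dispatch enough thread groups to cover the target region. A small sketch of the group-count arithmetic (round the region up to whole groups); the 8x8 group size is an assumption purely for illustration, not the shaders' actual layout.

```cpp
#include <cstdint>
#include <utility>

// Number of thread groups needed to cover `extent` items when each group
// processes `group_size` of them: round up so partial tiles are still covered.
constexpr uint32_t GroupCount(uint32_t extent, uint32_t group_size) {
  return (extent + group_size - 1) / group_size;
}

// Example: a 1280x720 clear region with an assumed 8x8 thread group layout.
constexpr std::pair<uint32_t, uint32_t> kClearGroups = {
    GroupCount(1280, 8),  // 160 groups along X
    GroupCount(720, 8),   // 90 groups along Y
};
static_assert(kClearGroups.first == 160 && kClearGroups.second == 90,
              "round-up group count");
```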
command_processor_.SubmitBarriers(); command_list.D3DDispatch(copy_group_count_x, copy_group_count_y, 1); @@ -1279,9 +1276,9 @@ bool RenderTargetCache::Resolve(const Memory& memory, command_list.D3DSetComputeRoot32BitConstants( 0, sizeof(depth_clear_constants) / sizeof(uint32_t), &depth_clear_constants, 0); - command_processor_.SetComputePipelineState( - clear_float32_depth ? resolve_clear_depth_24_32_pipeline_state_ - : resolve_clear_32bpp_pipeline_state_); + command_processor_.SetComputePipeline( + clear_float32_depth ? resolve_clear_depth_24_32_pipeline_ + : resolve_clear_32bpp_pipeline_); command_processor_.SubmitBarriers(); command_list.D3DDispatch(clear_group_count.first, clear_group_count.second, 1); @@ -1301,10 +1298,10 @@ bool RenderTargetCache::Resolve(const Memory& memory, 0, sizeof(color_clear_constants) / sizeof(uint32_t), &color_clear_constants, 0); } - command_processor_.SetComputePipelineState( + command_processor_.SetComputePipeline( resolve_info.color_edram_info.format_is_64bpp - ? resolve_clear_64bpp_pipeline_state_ - : resolve_clear_32bpp_pipeline_state_); + ? resolve_clear_64bpp_pipeline_ + : resolve_clear_32bpp_pipeline_); command_processor_.SubmitBarriers(); command_list.D3DDispatch(clear_group_count.first, clear_group_count.second, 1); @@ -1816,7 +1813,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( render_target->footprints, nullptr, nullptr, ©_buffer_size); render_target->copy_buffer_size = uint32_t(copy_buffer_size); - render_targets_.insert(std::make_pair(key.value, render_target)); + render_targets_.emplace(key.value, render_target); COUNT_profile_set("gpu/render_target_cache/render_targets", render_targets_.size()); #if 0 @@ -2015,8 +2012,7 @@ void RenderTargetCache::StoreRenderTargetsToEdram() { 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); - command_processor_.SetComputePipelineState( - edram_store_pipelines_[size_t(mode)]); + command_processor_.SetComputePipeline(edram_store_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list.D3DDispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1); @@ -2140,8 +2136,7 @@ void RenderTargetCache::LoadRenderTargetsFromEdram( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); - command_processor_.SetComputePipelineState( - edram_load_pipelines_[size_t(mode)]); + command_processor_.SetComputePipeline(edram_load_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list.D3DDispatch(render_target->key.width_ss_div_80, edram_rows, 1); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 0def0d25c..6d20e8d52 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -237,14 +237,13 @@ class D3D12CommandProcessor; // get each of the 4 host pixels for each sample. 
class RenderTargetCache { public: - // Direct3D 12 debug layer does some kaschenit-style trolling by giving errors - // that contradict each other when you use null RTV descriptors - if you set - // a valid format in RTVFormats in the pipeline state, it says that null - // descriptors can only be used if the format in the pipeline state is - // DXGI_FORMAT_UNKNOWN, however, if DXGI_FORMAT_UNKNOWN is set, it complains - // that the format in the pipeline doesn't match the RTV format. So we have to - // make render target bindings consecutive and remap the output indices in - // pixel shaders. + // Direct3D 12 debug layer is giving errors that contradict each other when + // you use null RTV descriptors - if you set a valid format in RTVFormats in + // the pipeline state, it says that null descriptors can only be used if the + // format in the pipeline state is DXGI_FORMAT_UNKNOWN, however, if + // DXGI_FORMAT_UNKNOWN is set, it complains that the format in the pipeline + // state doesn't match the RTV format. So we have to make render target + // bindings consecutive and remap the output indices in pixel shaders. struct PipelineRenderTarget { uint32_t guest_render_target; DXGI_FORMAT format; @@ -304,8 +303,7 @@ class RenderTargetCache { // performance difference, but with EDRAM loads/stores less conversion should // be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and // it's probably more accurate. - static inline DXGI_FORMAT GetDepthDXGIFormat( - xenos::DepthRenderTargetFormat format) { + static DXGI_FORMAT GetDepthDXGIFormat(xenos::DepthRenderTargetFormat format) { return format == xenos::DepthRenderTargetFormat::kD24FS8 ? DXGI_FORMAT_D32_FLOAT_S8X24_UINT : DXGI_FORMAT_D24_UNORM_S8_UINT; @@ -537,7 +535,7 @@ class RenderTargetCache { // 16: - EDRAM pitch in tiles. uint32_t base_samples_2x_depth_pitch; }; - // EDRAM pipeline states for the RTV/DSV path. + // EDRAM pipelines for the RTV/DSV path. static const EdramLoadStoreModeInfo edram_load_store_mode_info_[size_t(EdramLoadStoreMode::kCount)]; ID3D12PipelineState* @@ -546,20 +544,20 @@ class RenderTargetCache { ID3D12PipelineState* edram_store_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {}; - // Resolve root signatures and pipeline state objects. + // Resolve root signatures and pipelines. ID3D12RootSignature* resolve_copy_root_signature_ = nullptr; static const std::pair resolve_copy_shaders_[size_t(draw_util::ResolveCopyShaderIndex::kCount)]; - ID3D12PipelineState* resolve_copy_pipeline_states_[size_t( + ID3D12PipelineState* resolve_copy_pipelines_[size_t( draw_util::ResolveCopyShaderIndex::kCount)] = {}; ID3D12RootSignature* resolve_clear_root_signature_ = nullptr; // Clearing 32bpp color, depth with ROV, or unorm depth without ROV. - ID3D12PipelineState* resolve_clear_32bpp_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_32bpp_pipeline_ = nullptr; // Clearing 64bpp color. - ID3D12PipelineState* resolve_clear_64bpp_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_64bpp_pipeline_ = nullptr; // Clearing float depth without ROV, both the float24 and the host float32 // versions. - ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_ = nullptr; // FIXME(Triang3l): Investigate what's wrong with placed RTV/DSV aliasing on // Nvidia Maxwell 1st generation and older. 
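The comment reworded above is the reason PipelineRenderTarget carries a guest_render_target index: since null RTV descriptors cannot be mixed with typed formats in the pipeline state, only the render targets actually written are bound, packed into consecutive slots, and pixel shader outputs are remapped to the packed slots. A simplified sketch of that compaction under assumed types (`PackedRenderTarget`, `PackRenderTargets` are illustrative names, not the real API):

```cpp
#include <cstdint>
#include <dxgiformat.h>

// Assumed, simplified mirror of the PipelineRenderTarget idea: which guest
// RT an output slot maps to, and the format actually bound there.
struct PackedRenderTarget {
  uint32_t guest_render_target;  // 0..3 guest index this host slot represents
  DXGI_FORMAT format;
};

// Pack only the guest render targets that are actually used, so the PSO never
// needs a "null" RTV slot between two valid ones. Returns the packed slot
// count; remap_out[guest] gives the host slot for each guest RT.
uint32_t PackRenderTargets(const DXGI_FORMAT guest_formats[4],
                           uint32_t used_mask, PackedRenderTarget packed[4],
                           uint32_t remap_out[4]) {
  uint32_t count = 0;
  for (uint32_t guest = 0; guest < 4; ++guest) {
    remap_out[guest] = UINT32_MAX;  // Not bound.
    if (!(used_mask & (1u << guest))) {
      continue;
    }
    packed[count] = {guest, guest_formats[guest]};
    remap_out[guest] = count++;
  }
  return count;  // Becomes NumRenderTargets; RTVFormats[i] = packed[i].format.
}
```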
diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index c8b1e6297..44d76c9ed 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -918,27 +918,24 @@ bool TextureCache::Initialize(bool edram_rov_used) { return false; } - // Create the loading pipeline state objects. + // Create the loading pipelines. for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { const LoadModeInfo& mode_info = load_mode_info_[i]; - load_pipeline_states_[i] = ui::d3d12::util::CreateComputePipelineState( + load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.shader, mode_info.shader_size, load_root_signature_); - if (load_pipeline_states_[i] == nullptr) { - XELOGE( - "Failed to create the texture loading pipeline state object for mode " - "{}", - i); + if (load_pipelines_[i] == nullptr) { + XELOGE("Failed to create the texture loading pipeline for mode {}", i); Shutdown(); return false; } if (IsResolutionScale2X() && mode_info.shader_2x != nullptr) { - load_pipeline_states_2x_[i] = ui::d3d12::util::CreateComputePipelineState( + load_pipelines_2x_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.shader_2x, mode_info.shader_2x_size, load_root_signature_); - if (load_pipeline_states_2x_[i] == nullptr) { + if (load_pipelines_2x_[i] == nullptr) { XELOGE( - "Failed to create the 2x-scaled texture loading pipeline state " - "for mode {}", + "Failed to create the 2x-scaled texture loading pipeline for mode " + "{}", i); Shutdown(); return false; @@ -1024,8 +1021,8 @@ void TextureCache::Shutdown() { ui::d3d12::util::ReleaseAndNull(null_srv_descriptor_heap_); for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { - ui::d3d12::util::ReleaseAndNull(load_pipeline_states_2x_[i]); - ui::d3d12::util::ReleaseAndNull(load_pipeline_states_[i]); + ui::d3d12::util::ReleaseAndNull(load_pipelines_2x_[i]); + ui::d3d12::util::ReleaseAndNull(load_pipelines_[i]); } ui::d3d12::util::ReleaseAndNull(load_root_signature_); @@ -1892,7 +1889,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { if (IsResolutionScale2X() && key.tiled) { LoadMode load_mode = GetLoadMode(key); if (load_mode != LoadMode::kUnknown && - load_pipeline_states_2x_[uint32_t(load_mode)] != nullptr) { + load_pipelines_2x_[uint32_t(load_mode)] != nullptr) { uint32_t base_size = 0, mip_size = 0; texture_util::GetTextureTotalSize( key.dimension, key.width, key.height, key.depth, key.format, @@ -2047,7 +2044,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { } texture->base_watch_handle = nullptr; texture->mip_watch_handle = nullptr; - textures_.insert(std::make_pair(map_key, texture)); + textures_.emplace(map_key, texture); COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); textures_total_size_ += texture->resource_size; COUNT_profile_set("gpu/texture_cache/total_size_mb", @@ -2079,10 +2076,10 @@ bool TextureCache::LoadTextureData(Texture* texture) { return false; } bool scaled_resolve = texture->key.scaled_resolve ? true : false; - ID3D12PipelineState* pipeline_state = - scaled_resolve ? load_pipeline_states_2x_[uint32_t(load_mode)] - : load_pipeline_states_[uint32_t(load_mode)]; - if (pipeline_state == nullptr) { + ID3D12PipelineState* pipeline = scaled_resolve + ? 
load_pipelines_2x_[uint32_t(load_mode)] + : load_pipelines_[uint32_t(load_mode)]; + if (pipeline == nullptr) { return false; } const LoadModeInfo& load_mode_info = load_mode_info_[uint32_t(load_mode)]; @@ -2296,7 +2293,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { load_mode_info.srv_bpe_log2); } } - command_processor_.SetComputePipelineState(pipeline_state); + command_processor_.SetComputePipeline(pipeline); command_list.D3DSetComputeRootSignature(load_root_signature_); command_list.D3DSetComputeRootDescriptorTable(2, descriptor_dest.second); @@ -2597,7 +2594,7 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, } device->CreateShaderResourceView( texture.resource, &desc, GetTextureDescriptorCPUHandle(descriptor_index)); - texture.srv_descriptors.insert({descriptor_key, descriptor_index}); + texture.srv_descriptors.emplace(descriptor_key, descriptor_index); return descriptor_index; } diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 1345d8faf..85131f25d 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -106,18 +106,18 @@ class TextureCache { bool operator!=(const TextureKey& key) const { return GetMapKey() != key.GetMapKey() || bucket_key != key.bucket_key; } - inline uint64_t GetMapKey() const { + uint64_t GetMapKey() const { return uint64_t(map_key[0]) | (uint64_t(map_key[1]) << 32); } - inline void SetMapKey(uint64_t key) { + void SetMapKey(uint64_t key) { map_key[0] = uint32_t(key); map_key[1] = uint32_t(key >> 32); } - inline bool IsInvalid() const { + bool IsInvalid() const { // Zero base and zero width is enough for a binding to be invalid. return map_key[0] == 0; } - inline void MakeInvalid() { + void MakeInvalid() { // Reset all for a stable hash. SetMapKey(0); bucket_key = 0; @@ -222,9 +222,7 @@ class TextureCache { void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled); - inline bool IsResolutionScale2X() const { - return scaled_resolve_buffer_ != nullptr; - } + bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr; } ID3D12Resource* GetScaledResolveBuffer() const { return scaled_resolve_buffer_; } @@ -233,7 +231,7 @@ class TextureCache { uint32_t length_unscaled); void UseScaledResolveBufferForReading(); void UseScaledResolveBufferForWriting(); - inline void MarkScaledResolveBufferUAVWritesCommitNeeded() { + void MarkScaledResolveBufferUAVWritesCommitNeeded() { if (scaled_resolve_buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { scaled_resolve_buffer_uav_writes_commit_needed_ = true; } @@ -432,7 +430,7 @@ class TextureCache { // Whether the signed version of the texture has a different representation on // the host than its unsigned version (for example, if it's a fixed-point // texture emulated with a larger host pixel format). - static inline bool IsSignedVersionSeparate(xenos::TextureFormat format) { + static bool IsSignedVersionSeparate(xenos::TextureFormat format) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return host_format.load_mode_snorm != LoadMode::kUnknown && host_format.load_mode_snorm != host_format.load_mode; @@ -441,26 +439,24 @@ class TextureCache { // of block-compressed textures with 4x4-aligned dimensions on PC). 
static bool IsDecompressionNeeded(xenos::TextureFormat format, uint32_t width, uint32_t height); - static inline DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format, - uint32_t width, - uint32_t height) { + static DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format, + uint32_t width, uint32_t height) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return IsDecompressionNeeded(format, width, height) ? host_format.dxgi_format_uncompressed : host_format.dxgi_format_resource; } - static inline DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) { + static DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) { return GetDXGIResourceFormat(key.format, key.width, key.height); } - static inline DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format, - uint32_t width, - uint32_t height) { + static DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format, + uint32_t width, uint32_t height) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return IsDecompressionNeeded(format, width, height) ? host_format.dxgi_format_uncompressed : host_format.dxgi_format_unorm; } - static inline DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { + static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { return GetDXGIUnormFormat(key.format, key.width, key.height); } @@ -550,9 +546,9 @@ class TextureCache { static const LoadModeInfo load_mode_info_[]; ID3D12RootSignature* load_root_signature_ = nullptr; - ID3D12PipelineState* load_pipeline_states_[size_t(LoadMode::kCount)] = {}; - // Load pipeline state objects for 2x-scaled resolved targets. - ID3D12PipelineState* load_pipeline_states_2x_[size_t(LoadMode::kCount)] = {}; + ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {}; + // Load pipelines for 2x-scaled resolved targets. + ID3D12PipelineState* load_pipelines_2x_[size_t(LoadMode::kCount)] = {}; std::unordered_multimap textures_; uint64_t textures_total_size_ = 0; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 6aaa1b856..202d34965 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -111,6 +111,34 @@ int32_t FloatToD3D11Fixed16p8(float f32) { return result.s; } +void GetScissor(const RegisterFile& regs, Scissor& scissor_out) { + // FIXME(Triang3l): Screen scissor isn't applied here, but it seems to be + // unused on Xbox 360 Direct3D 9. 
+ auto pa_sc_window_scissor_tl = regs.Get(); + auto pa_sc_window_scissor_br = regs.Get(); + uint32_t tl_x = pa_sc_window_scissor_tl.tl_x; + uint32_t tl_y = pa_sc_window_scissor_tl.tl_y; + uint32_t br_x = pa_sc_window_scissor_br.br_x; + uint32_t br_y = pa_sc_window_scissor_br.br_y; + if (!pa_sc_window_scissor_tl.window_offset_disable) { + auto pa_sc_window_offset = regs.Get(); + tl_x = uint32_t(std::max( + int32_t(tl_x) + pa_sc_window_offset.window_x_offset, int32_t(0))); + tl_y = uint32_t(std::max( + int32_t(tl_y) + pa_sc_window_offset.window_y_offset, int32_t(0))); + br_x = uint32_t(std::max( + int32_t(br_x) + pa_sc_window_offset.window_x_offset, int32_t(0))); + br_y = uint32_t(std::max( + int32_t(br_y) + pa_sc_window_offset.window_y_offset, int32_t(0))); + } + br_x = std::max(br_x, tl_x); + br_y = std::max(br_y, tl_y); + scissor_out.left = tl_x; + scissor_out.top = tl_y; + scissor_out.width = br_x - tl_x; + scissor_out.height = br_y - tl_y; +} + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index edb880ab0..7ef3186a0 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -33,6 +33,14 @@ namespace draw_util { // for use with the top-left rasterization rule later. int32_t FloatToD3D11Fixed16p8(float f32); +struct Scissor { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; +}; +void GetScissor(const RegisterFile& regs, Scissor& scissor_out); + // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). xenos::CopySampleSelect SanitizeCopySampleSelect( diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 74faf6e13..b2d24f89b 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( break; case AluVectorOpcode::kMul: case AluVectorOpcode::kMad: { - bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; - if (is_mad) { - DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); - } else { - DxbcOpMul(per_component_dest, operands[0], operands[1]); - } - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t absolute_different = + // Not using DXBC mad to prevent fused multiply-add (mul followed by add + // may be optimized into non-fused mad by the driver in the identical + // operands case also). + DxbcOpMul(per_component_dest, operands[0], operands[1]); + uint32_t multiplicands_different = used_result_components & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]); - if (absolute_different) { + if (multiplicands_different) { + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different), operands[0].Abs(), operands[1].Abs()); // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different), DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), - DxbcSrc::R(is_zero_temp), - is_mad ? 
operands[2] : DxbcSrc::LF(0.0f), + // Not replacing true `0 + term` with movc of the term because +0 + -0 + // should result in +0, not -0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different), + DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f), DxbcSrc::R(system_temp_result_)); // Release is_zero_temp. PopSystemTemp(); } + if (instr.vector_opcode == AluVectorOpcode::kMad) { + DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[2]); + } } break; case AluVectorOpcode::kMax: @@ -179,69 +181,40 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( component_count = 4; } result_swizzle = DxbcSrc::kXXXX; - uint32_t absolute_different = - uint32_t((1 << component_count) - 1) & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]); - if (absolute_different) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - // Add component products only if non-zero. For dp4, 16 scalar - // operations in the worst case (as opposed to always 20 for - // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands - // for dp4). - DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001), - operands[0].SelectFromSwizzled(0), - operands[1].SelectFromSwizzled(0)); - if (absolute_different & 0b0001) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010), - operands[0].SelectFromSwizzled(0).Abs(), - operands[1].SelectFromSwizzled(0).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + uint32_t different = uint32_t((1 << component_count) - 1) & + ~instr.vector_operands[0].GetIdenticalComponents( + instr.vector_operands[1]); + for (uint32_t i = 0; i < component_count; ++i) { + DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), + operands[0].SelectFromSwizzled(i), + operands[1].SelectFromSwizzled(i)); + if ((different & (1 << i)) != 0) { + // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing + // true `0 + term` with movc of the term because +0 + -0 should result + // in +0, not -0). + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(i).Abs(), + operands[1].SelectFromSwizzled(i).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), - DxbcSrc::LF(0.0f), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - } - for (uint32_t i = 1; i < component_count; ++i) { - bool component_different = (absolute_different & (1 << i)) != 0; - DxbcOpMAd(DxbcDest::R(system_temp_result_, - component_different ? 0b0010 : 0b0001), - operands[0].SelectFromSwizzled(i), - operands[1].SelectFromSwizzled(i), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - if (component_different) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), - operands[0].SelectFromSwizzled(i).Abs(), - operands[1].SelectFromSwizzled(i).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); - } + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, + i ? 
DxbcSrc::kYYYY : DxbcSrc::kXXXX)); } - } else { - if (component_count == 2) { - DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else if (component_count == 3) { - DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else { - assert_true(component_count == 4); - DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); + if (i) { + // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar + // as of 2020 anyway, and not using mad for the same reason (mul + // followed by add may be optimized into non-fused mad by the driver + // in the identical operands case also). + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); } } if (component_count == 2) { - // Add the third operand. Since floating-point addition isn't - // associative, even though adding this in multiply-add for the first - // component would be faster, it's safer to add here, in the end. DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), operands[2].SelectFromSwizzled(0)); @@ -592,14 +565,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); } if (used_result_components & 0b0010) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), operands[0].SelectFromSwizzled(1), operands[1].SelectFromSwizzled(1)); - if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]) & 0b0010)) { + // Shader Model 3: +-0 or denormal * anything = +0. DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), operands[0].SelectFromSwizzled(1).Abs(), operands[1].SelectFromSwizzled(1).Abs()); @@ -700,8 +672,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( DxbcOpMul(ps_dest, operand_0_a, operand_0_b); if (instr.scalar_operands[0].components[0] != instr.scalar_operands[0].components[1]) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_0_b.Abs()); @@ -714,58 +685,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( PopSystemTemp(); } break; - case AluScalarOpcode::kMulsPrev: { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), - ps_src.Abs()); - // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001), - DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); - DxbcOpMul(ps_dest, operand_0_a, ps_src); - DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), - DxbcSrc::LF(0.0f), ps_src); - // Release is_zero_temp. - PopSystemTemp(); - } break; + case AluScalarOpcode::kMulsPrev: case AluScalarOpcode::kMulsPrev2: { uint32_t test_temp = PushSystemTemp(); - // Check if need to select the src0.a * ps case. - // ps != -FLT_MAX. 
- DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); - // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX - // is already loaded to an SGPR, this is also false if it's NaN. - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // isfinite(src0.b). - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked - // for NaN). - DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); - // Shader Model 3: 0 or denormal * anything = 0. - // ps is already known to be not NaN or Infinity, so multiplying it by 0 - // will result in 0. However, src0.a can be anything, so the result should - // be zero if ps is zero. - // FIXME(Triang3l): Signed zero needs research and handling. - DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f)); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + // Check if need to select the src0.a * ps case. + // ps != -FLT_MAX. + DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); + // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since + // -FLT_MAX is already loaded to an SGPR, this is also false if it's + // NaN. + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // isfinite(src0.b). + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked + // for NaN). + DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), + operand_0_b); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); + } + // Shader Model 3: +-0 or denormal * anything = +0. + DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(), + ps_src.Abs()); + // min isn't required to flush denormals, eq is. + DxbcOpEq(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); DxbcOpMul(ps_dest, operand_0_a, ps_src); DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f), ps_src); - DxbcOpElse(); - DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); - DxbcOpEndIf(); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + DxbcOpElse(); + DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); + DxbcOpEndIf(); + } // Release test_temp. 
PopSystemTemp(); } break; @@ -1023,11 +986,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc1: DxbcOpMul(ps_dest, operand_0_a, operand_1); - if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.scalar_operands[0].GetIdenticalComponents( instr.scalar_operands[1]) & 0b0001)) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_1.Abs()); diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 0a86f7ff6..76eed4d10 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -99,8 +99,8 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( DxbcOpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0), DxbcSrc::LU(~uint32_t(3))); } - // Add the word offset from the instruction, plus the offset of the first - // needed word within the element. + // Add the word offset from the instruction (signed), plus the offset of the + // first needed word within the element. uint32_t first_word_index; xe::bit_scan_forward(needed_words, &first_word_index); int32_t first_word_buffer_offset = @@ -1730,10 +1730,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( } uint32_t texture_binding_index_unsigned = FindOrAddTextureBinding(tfetch_index, srv_dimension, false); - const TextureBinding& texture_binding_unsigned = - texture_bindings_[texture_binding_index_unsigned]; uint32_t texture_binding_index_signed = FindOrAddTextureBinding(tfetch_index, srv_dimension, true); + const TextureBinding& texture_binding_unsigned = + texture_bindings_[texture_binding_index_unsigned]; const TextureBinding& texture_binding_signed = texture_bindings_[texture_binding_index_signed]; DxbcSrc srv_unsigned(DxbcSrc::LF(0.0f)), srv_signed(DxbcSrc::LF(0.0f)); diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index e54792a27..04bc8024b 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -135,7 +135,7 @@ X_STATUS GraphicsSystem::Setup(cpu::Processor* processor, })); // As we run vblank interrupts the debugger must be able to suspend us. vsync_worker_thread_->set_can_debugger_suspend(true); - vsync_worker_thread_->set_name("GraphicsSystem Vsync"); + vsync_worker_thread_->set_name("GPU VSync"); vsync_worker_thread_->Create(); if (cvars::trace_gpu_stream) { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 2c25e682d..23998c307 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -65,17 +65,17 @@ enum class InstructionStorageTarget { // disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both // skipped components and zeros, which cannot be encoded, and therefore it will // not). 
-constexpr uint32_t GetInstructionStorageTargetUsedComponents( +constexpr uint32_t GetInstructionStorageTargetUsedComponentCount( InstructionStorageTarget target) { switch (target) { case InstructionStorageTarget::kNone: - return 0b0000; + return 0; case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: - return 0b0111; + return 3; case InstructionStorageTarget::kDepth: - return 0b0001; + return 1; default: - return 0b1111; + return 4; } } @@ -136,8 +136,9 @@ struct InstructionResult { // Returns the write mask containing only components actually present in the // target. uint32_t GetUsedWriteMask() const { - return original_write_mask & - GetInstructionStorageTargetUsedComponents(storage_target); + uint32_t target_component_count = + GetInstructionStorageTargetUsedComponentCount(storage_target); + return original_write_mask & ((1 << target_component_count) - 1); } // True if the components are in their 'standard' swizzle arrangement (xyzw). bool IsStandardSwizzle() const { @@ -161,6 +162,28 @@ struct InstructionResult { } return used_components; } + // Returns which components of the used write mask are constant, and what + // values they have. + uint32_t GetUsedConstantComponents(uint32_t& constant_values_out) const { + uint32_t constant_components = 0; + uint32_t constant_values = 0; + uint32_t used_write_mask = GetUsedWriteMask(); + for (uint32_t i = 0; i < 4; ++i) { + if (!(used_write_mask & (1 << i))) { + continue; + } + SwizzleSource component = components[i]; + if (component >= SwizzleSource::kX && component <= SwizzleSource::kW) { + continue; + } + constant_components |= 1 << i; + if (component == SwizzleSource::k1) { + constant_values |= 1 << i; + } + } + constant_values_out = constant_values; + return constant_components; + } }; enum class InstructionStorageSource { @@ -212,14 +235,18 @@ struct InstructionOperand { return false; } - // Returns which components of two operands are identical, but may have - // different signs (for simplicity of usage with GetComponent, treating the - // rightmost component as replicated). - uint32_t GetAbsoluteIdenticalComponents( - const InstructionOperand& other) const { + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). This, strictly with all + // conditions, must be used when emulating Shader Model 3 +-0 * x = +0 + // multiplication behavior with IEEE-compliant multiplication (because + // -0 * |-0|, or -0 * +0, is -0, while the result must be +0). + uint32_t GetIdenticalComponents(const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode) { + storage_addressing_mode != other.storage_addressing_mode || + is_negated != other.is_negated || + is_absolute_value != other.is_absolute_value) { return 0; } uint32_t identical_components = 0; @@ -229,16 +256,6 @@ struct InstructionOperand { } return identical_components; } - // Returns which components of two operands will always be bitwise equal, but - // may have different signs (disregarding component_count for simplicity of - // usage with GetComponent, treating the rightmost component as replicated). 
- uint32_t GetIdenticalComponents(const InstructionOperand& other) const { - if (is_negated != other.is_negated || - is_absolute_value != other.is_absolute_value) { - return 0; - } - return GetAbsoluteIdenticalComponents(other); - } }; struct ParsedExecInstruction { diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 496836a38..98719b670 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -25,6 +25,9 @@ namespace gpu { // system page size granularity. class SharedMemory { public: + static constexpr uint32_t kBufferSizeLog2 = 29; + static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; + virtual ~SharedMemory(); // Call in the implementation-specific ClearCache. virtual void ClearCache(); @@ -98,9 +101,6 @@ class SharedMemory { // destructor. void ShutdownCommon(); - static constexpr uint32_t kBufferSizeLog2 = 29; - static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; - // Sparse allocations are 4 MB, so not too many of them are allocated, but // also not to waste too much memory for padding (with 16 MB there's too // much). diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index c0c035167..21ccbaff9 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 or denormal * anything = 0) wherever it's present (mul, -// mad, dp, etc.) and for NaN in min/max. It's very important to respect this -// rule for multiplication, as games often rely on it in vector normalization -// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in -// games - causes white screen in Halo 3, white specular on characters in GTA -// IV. -// TODO(Triang3l): Investigate signed zero handling in multiplication. +// multiplication (+-0 or denormal * anything = +0) wherever it's present +// (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect +// this rule for multiplication, as games often rely on it in vector +// normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of +// things in games - causes white screen in Halo 3, white specular on +// characters in GTA IV. The result is always positive zero in this case, no +// matter what the signs of the other operands are, according to R5xx +// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on +// Adreno 200. This means that the following need to be taken into account +// (according to 8.7.2 "ALU Non-Transcendental Floating Point"): +// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3 +// handling, it should result in +0. +// - +0 + -0 is +0, so multiply-add should not be replaced with conditional +// move of the third operand in case of zero multiplicands, because the term +// may be -0, while the result should be +0 in this case. +// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf +// Multiply-add also appears to be not fused (the SM3 behavior instruction on +// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators +// should not use instructions that may be interpreted by the host GPU as +// fused multiply-add. 
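To make the legacy multiplication rule described in the comment above concrete, here is a minimal scalar sketch in plain C++ (purely illustrative - the DXBC translator instead emits a regular mul guarded by min/eq/movc over the operands' absolute values, since, as noted in the translator code, min is not required to flush denormals while eq is):

```cpp
#include <cmath>

// Shader Model 3 "legacy" multiply: if either operand is +/-0 or a denormal,
// the result is +0 regardless of the other operand's value (even NaN or
// infinity) and regardless of the operands' signs.
float LegacyMul(float a, float b) {
  int class_a = std::fpclassify(a);
  int class_b = std::fpclassify(b);
  if (class_a == FP_ZERO || class_a == FP_SUBNORMAL || class_b == FP_ZERO ||
      class_b == FP_SUBNORMAL) {
    return 0.0f;  // Always +0, never -0.
  }
  return a * b;
}

// Legacy multiply-add: the multiply obeys the rule above, and the addition is
// a separate, non-fused IEEE add. Since +0 + -0 == +0, `0 * x + c` must not be
// collapsed into just `c` - c may be -0 while the correct result is +0.
float LegacyMad(float a, float b, float c) { return LegacyMul(a, b) + c; }
```

For instance, `LegacyMad(0.0f, INFINITY, -0.0f)` yields +0 under these rules, which is exactly the case that replacing the multiply-add with a conditional move of the third operand would get wrong.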
enum class AluScalarOpcode : uint32_t { // Floating-Point Add diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 14259b183..58512ec17 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -30,17 +30,6 @@ project("xenia-gpu-vulkan-trace-viewer") kind("WindowedApp") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -57,6 +46,19 @@ project("xenia-gpu-vulkan-trace-viewer") "xenia-ui-vulkan", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "fmt", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ @@ -98,17 +100,6 @@ project("xenia-gpu-vulkan-trace-dump") kind("ConsoleApp") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -125,6 +116,19 @@ project("xenia-gpu-vulkan-trace-dump") "xenia-ui-vulkan", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "fmt", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ diff --git a/src/xenia/hid/premake5.lua b/src/xenia/hid/premake5.lua index 152887e2b..348e12371 100644 --- a/src/xenia/hid/premake5.lua +++ b/src/xenia/hid/premake5.lua @@ -41,11 +41,11 @@ project("xenia-hid-demo") filter("platforms:Linux") links({ + "SDL2", + "vulkan", "X11", "xcb", "X11-xcb", - "vulkan", - "SDL2", }) filter("platforms:Windows") diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 01fed1e7f..dd0d7ec5f 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -359,7 +359,7 @@ void KernelState::SetExecutableModule(object_ref module) { } return 0; })); - dispatch_thread_->set_name("Kernel Dispatch Thread"); + dispatch_thread_->set_name("Kernel Dispatch"); dispatch_thread_->Create(); } } diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index f12612b10..cc42bfcb6 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -8,6 +8,7 @@ */ #include "xenia/base/logging.h" +#include "xenia/base/math.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xam/xam_private.h" @@ -235,7 +236,8 @@ dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type, xe::store_and_swap(&dev->device_type, dummy_device_info_.device_type); xe::store_and_swap(&dev->total_bytes, dummy_device_info_.total_bytes); xe::store_and_swap(&dev->free_bytes, dummy_device_info_.free_bytes); - xe::copy_and_swap(dev->name, dummy_device_info_.name, 28); + xe::copy_and_swap(dev->name, dummy_device_info_.name, + xe::countof(dev->name)); } *handle_out = e->handle(); diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 758dde9c2..f0c28c14a 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -9,6 +9,7 @@ #include "xenia/base/logging.h" #include "xenia/base/cvar.h" +#include "xenia/base/string_util.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" #include "xenia/kernel/util/shim_utils.h" @@ -77,15 +78,15 @@ static 
SYSTEMTIME xeGetLocalSystemTime(uint64_t filetime) { void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t output_buffer, dword_t output_count) { - std::memset(output_buffer, 0, output_count * 2); + std::memset(output_buffer, 0, output_count * sizeof(char16_t)); // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 auto st = xeGetLocalSystemTime(filetime); // TODO: format this depending on users locale? auto str = fmt::format(u"{:02d}/{:02d}/{}", st.wMonth, st.wDay, st.wYear); - auto copy_length = std::min(size_t(output_count), str.size()) * 2; - xe::copy_and_swap(output_buffer.as(), str.c_str(), copy_length); + xe::string_util::copy_and_swap_truncating(output_buffer.as(), str, + output_count); #else assert_always(); #endif @@ -94,15 +95,15 @@ DECLARE_XAM_EXPORT1(XamFormatDateString, kNone, kImplemented); void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t output_buffer, dword_t output_count) { - std::memset(output_buffer, 0, output_count * 2); + std::memset(output_buffer, 0, output_count * sizeof(char16_t)); // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 auto st = xeGetLocalSystemTime(filetime); // TODO: format this depending on users locale? auto str = fmt::format(u"{:02d}:{:02d}", st.wHour, st.wMinute); - auto copy_count = std::min(size_t(output_count), str.size()); - xe::copy_and_swap(output_buffer.as(), str.c_str(), copy_count); + xe::string_util::copy_and_swap_truncating(output_buffer.as(), str, + output_count); #else assert_always(); #endif @@ -124,9 +125,8 @@ dword_result_t keXamBuildResourceLocator(uint64_t module, path = fmt::format(u"section://{:X},{}#{}", (uint32_t)module, container, resource); } - auto copy_count = std::min(size_t(buffer_count), path.size()); - xe::copy_and_swap(buffer_ptr.as(), path.c_str(), copy_count); - (buffer_ptr.as())[copy_count] = 0; + xe::string_util::copy_and_swap_truncating(buffer_ptr.as(), path, + buffer_count); return 0; } diff --git a/src/xenia/kernel/xam/xam_net.cc b/src/xenia/kernel/xam/xam_net.cc index ddeccd9e9..ed7e4a023 100644 --- a/src/xenia/kernel/xam/xam_net.cc +++ b/src/xenia/kernel/xam/xam_net.cc @@ -984,8 +984,7 @@ dword_result_t NetDll___WSAFDIsSet(dword_t socket_handle, DECLARE_XAM_EXPORT1(NetDll___WSAFDIsSet, kNetworking, kImplemented); void RegisterNetExports(xe::cpu::ExportResolver* export_resolver, - KernelState* kernel_state) { -} + KernelState* kernel_state) {} } // namespace xam } // namespace kernel diff --git a/src/xenia/kernel/xam/xam_ui.cc b/src/xenia/kernel/xam/xam_ui.cc index 6d2fc7ea4..1723a0949 100644 --- a/src/xenia/kernel/xam/xam_ui.cc +++ b/src/xenia/kernel/xam/xam_ui.cc @@ -9,6 +9,7 @@ #include "third_party/imgui/imgui.h" #include "xenia/base/logging.h" +#include "xenia/base/string_util.h" #include "xenia/emulator.h" #include "xenia/kernel/kernel_flags.h" #include "xenia/kernel/kernel_state.h" @@ -188,8 +189,8 @@ class KeyboardInputDialog : public xe::ui::ImGuiDialog { *out_text_ = default_text; } text_buffer_.resize(max_length); - std::strncpy(text_buffer_.data(), default_text_.c_str(), - std::min(text_buffer_.size() - 1, default_text_.size())); + xe::string_util::copy_truncating(text_buffer_.data(), default_text_, + text_buffer_.size()); } void OnDraw(ImGuiIO& io) override { diff --git a/src/xenia/kernel/xam/xam_user.cc b/src/xenia/kernel/xam/xam_user.cc index 3e58639a9..9c746548c 100644 --- a/src/xenia/kernel/xam/xam_user.cc +++ b/src/xenia/kernel/xam/xam_user.cc @@ -10,6 +10,8 @@ #include #include "xenia/base/logging.h" +#include "xenia/base/math.h" 
+#include "xenia/base/string_util.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xam/xam_private.h" @@ -168,7 +170,8 @@ X_HRESULT_result_t XamUserGetSigninInfo(dword_t user_index, dword_t flags, const auto& user_profile = kernel_state()->user_profile(); info->xuid = user_profile->xuid(); info->signin_state = user_profile->signin_state(); - std::strncpy(info->name, user_profile->name().data(), 15); + xe::string_util::copy_truncating(info->name, user_profile->name(), + xe::countof(info->name)); return X_E_SUCCESS; } DECLARE_XAM_EXPORT1(XamUserGetSigninInfo, kUserProfiles, kImplemented); @@ -187,10 +190,8 @@ dword_result_t XamUserGetName(dword_t user_index, lpstring_t buffer, const auto& user_name = user_profile->name(); // Real XAM will only copy a maximum of 15 characters out. - size_t copy_length = std::min( - {size_t(15), user_name.size(), static_cast(buffer_len) - 1}); - std::memcpy(buffer, user_name.data(), copy_length); - buffer[copy_length] = '\0'; + xe::string_util::copy_truncating(buffer, user_name, + std::min(buffer_len.value(), uint32_t(15))); return X_ERROR_SUCCESS; } DECLARE_XAM_EXPORT1(XamUserGetName, kUserProfiles, kImplemented); diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 6ec46617e..f4ab5cec4 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -226,19 +226,21 @@ DECLARE_XBOXKRNL_EXPORT1(KeSetCurrentStackPointers, kThreading, kImplemented); dword_result_t KeSetAffinityThread(lpvoid_t thread_ptr, dword_t affinity, lpdword_t previous_affinity_ptr) { - uint32_t previous_affinity = 0; - + // The Xbox 360, according to disassembly of KeSetAffinityThread, unlike + // Windows NT, stores the previous affinity via the pointer provided as an + // argument, not in the return value - the return value is used for the + // result. + if (!affinity) { + return X_STATUS_INVALID_PARAMETER; + } auto thread = XObject::GetNativeObject(kernel_state(), thread_ptr); if (thread) { - previous_affinity = thread->affinity(); + if (previous_affinity_ptr) { + *previous_affinity_ptr = uint32_t(1) << thread->active_cpu(); + } thread->SetAffinity(affinity); } - - if (previous_affinity_ptr) { - *previous_affinity_ptr = previous_affinity; - } - - return (uint32_t)affinity; + return X_STATUS_SUCCESS; } DECLARE_XBOXKRNL_EXPORT1(KeSetAffinityThread, kThreading, kImplemented); diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index 118d0b0a2..b14462a75 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -157,11 +157,17 @@ void XThread::set_name(const std::string_view name) { } } -uint8_t next_cpu = 0; -uint8_t GetFakeCpuNumber(uint8_t proc_mask) { +static uint8_t next_cpu = 0; +static uint8_t GetFakeCpuNumber(uint8_t proc_mask) { + // NOTE: proc_mask is logical processors, not physical processors or cores. if (!proc_mask) { next_cpu = (next_cpu + 1) % 6; return next_cpu; // is this reasonable? + // TODO(Triang3l): Does the following apply here? + // https://docs.microsoft.com/en-us/windows/win32/dxtecharts/coding-for-multiple-cores + // "On Xbox 360, you must explicitly assign software threads to a particular + // hardware thread by using XSetThreadProcessor. Otherwise, all child + // threads will stay on the same hardware thread as the parent." 
} assert_false(proc_mask & 0xC0); @@ -206,6 +212,7 @@ void XThread::InitializeGuestObject() { // 0xA88 = APC // 0x18 = timer xe::store_and_swap(p + 0x09C, 0xFDFFD7FF); + // current_cpu is expected to be initialized externally via SetActiveCpu. xe::store_and_swap(p + 0x0D0, stack_base_); xe::store_and_swap(p + 0x130, Clock::QueryGuestSystemTime()); xe::store_and_swap(p + 0x144, guest_object() + 0x144); @@ -347,6 +354,12 @@ X_STATUS XThread::Create() { // Exports use this to get the kernel. thread_state_->context()->kernel_state = kernel_state_; + uint8_t cpu_index = GetFakeCpuNumber( + static_cast(creation_params_.creation_flags >> 24)); + + // Initialize the KTHREAD object. + InitializeGuestObject(); + X_KPCR* pcr = memory()->TranslateVirtual(pcr_address_); pcr->tls_ptr = tls_static_address_; @@ -356,14 +369,11 @@ X_STATUS XThread::Create() { pcr->stack_base_ptr = stack_base_; pcr->stack_end_ptr = stack_limit_; - uint8_t proc_mask = - static_cast(creation_params_.creation_flags >> 24); + pcr->dpc_active = 0; // DPC active bool? - pcr->current_cpu = GetFakeCpuNumber(proc_mask); // Current CPU(?) - pcr->dpc_active = 0; // DPC active bool? - - // Initialize the KTHREAD object. - InitializeGuestObject(); + // Assign the thread to the logical processor, and also set up the current CPU + // in KPCR and KTHREAD. + SetActiveCpu(cpu_index); // Always retain when starting - the thread owns itself until exited. RetainHandle(); @@ -416,10 +426,6 @@ X_STATUS XThread::Create() { return X_STATUS_NO_MEMORY; } - if (!cvars::ignore_thread_affinities) { - thread_->set_affinity_mask(proc_mask); - } - // Set the thread name based on host ID (for easier debugging). if (thread_name_.empty()) { set_name(fmt::format("XThread{:04X}", thread_->system_id())); @@ -712,37 +718,36 @@ void XThread::SetPriority(int32_t increment) { } void XThread::SetAffinity(uint32_t affinity) { - // Affinity mask, as in SetThreadAffinityMask. - // Xbox thread IDs: - // 0 - core 0, thread 0 - user - // 1 - core 0, thread 1 - user - // 2 - core 1, thread 0 - sometimes xcontent - // 3 - core 1, thread 1 - user - // 4 - core 2, thread 0 - xaudio - // 5 - core 2, thread 1 - user - // TODO(benvanik): implement better thread distribution. - // NOTE: these are logical processors, not physical processors or cores. + SetActiveCpu(GetFakeCpuNumber(affinity)); +} + +uint8_t XThread::active_cpu() const { + const X_KPCR& pcr = *memory()->TranslateVirtual(pcr_address_); + return pcr.current_cpu; +} + +void XThread::SetActiveCpu(uint8_t cpu_index) { + // May be called during thread creation - don't skip if current == new. 
+ + assert_true(cpu_index < 6); + + X_KPCR& pcr = *memory()->TranslateVirtual(pcr_address_); + pcr.current_cpu = cpu_index; + + if (is_guest_thread()) { + X_KTHREAD& thread_object = + *memory()->TranslateVirtual(guest_object()); + thread_object.current_cpu = cpu_index; + } + if (xe::threading::logical_processor_count() < 6) { XELOGW("Too few processors - scheduling will be wonky"); } - SetActiveCpu(GetFakeCpuNumber(affinity)); - affinity_ = affinity; if (!cvars::ignore_thread_affinities) { - thread_->set_affinity_mask(affinity); + thread_->set_affinity_mask(uint64_t(1) << cpu_index); } } -uint32_t XThread::active_cpu() const { - uint8_t* pcr = memory()->TranslateVirtual(pcr_address_); - return xe::load_and_swap(pcr + 0x10C); -} - -void XThread::SetActiveCpu(uint32_t cpu_index) { - assert_true(cpu_index < 6); - uint8_t* pcr = memory()->TranslateVirtual(pcr_address_); - xe::store_and_swap(pcr + 0x10C, cpu_index); -} - bool XThread::GetTLSValue(uint32_t slot, uint32_t* value_out) { if (slot * 4 > tls_total_size_) { return false; diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 84abfd027..3c8e4ecaa 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -88,7 +88,8 @@ struct X_KTHREAD { char unk_10[0xAC]; // 0x10 uint8_t suspend_count; // 0xBC uint8_t unk_BD; // 0xBD - uint16_t unk_BE; // 0xBE + uint8_t unk_BE; // 0xBE + uint8_t current_cpu; // 0xBF char unk_C0[0x70]; // 0xC0 xe::be create_time; // 0x130 xe::be exit_time; // 0x138 @@ -171,10 +172,17 @@ class XThread : public XObject, public cpu::Thread { int32_t priority() const { return priority_; } int32_t QueryPriority(); void SetPriority(int32_t increment); - uint32_t affinity() const { return affinity_; } + + // Xbox thread IDs: + // 0 - core 0, thread 0 - user + // 1 - core 0, thread 1 - user + // 2 - core 1, thread 0 - sometimes xcontent + // 3 - core 1, thread 1 - user + // 4 - core 2, thread 0 - xaudio + // 5 - core 2, thread 1 - user void SetAffinity(uint32_t affinity); - uint32_t active_cpu() const; - void SetActiveCpu(uint32_t cpu_index); + uint8_t active_cpu() const; + void SetActiveCpu(uint8_t cpu_index); bool GetTLSValue(uint32_t slot, uint32_t* value_out); bool SetTLSValue(uint32_t slot, uint32_t value); @@ -226,7 +234,6 @@ class XThread : public XObject, public cpu::Thread { bool running_ = false; int32_t priority_ = 0; - uint32_t affinity_ = 0; xe::global_critical_region global_critical_region_; std::atomic irql_ = {0}; diff --git a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc index b9e23dc93..5c0a104e5 100644 --- a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc +++ b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc @@ -118,15 +118,15 @@ bool D3D12ImmediateDrawer::Initialize() { return false; } - // Create the pipeline states. - D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_state_desc = {}; - pipeline_state_desc.pRootSignature = root_signature_; - pipeline_state_desc.VS.pShaderBytecode = immediate_vs; - pipeline_state_desc.VS.BytecodeLength = sizeof(immediate_vs); - pipeline_state_desc.PS.pShaderBytecode = immediate_ps; - pipeline_state_desc.PS.BytecodeLength = sizeof(immediate_ps); + // Create the pipelines. 
+ D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_desc = {}; + pipeline_desc.pRootSignature = root_signature_; + pipeline_desc.VS.pShaderBytecode = immediate_vs; + pipeline_desc.VS.BytecodeLength = sizeof(immediate_vs); + pipeline_desc.PS.pShaderBytecode = immediate_ps; + pipeline_desc.PS.BytecodeLength = sizeof(immediate_ps); D3D12_RENDER_TARGET_BLEND_DESC& pipeline_blend_desc = - pipeline_state_desc.BlendState.RenderTarget[0]; + pipeline_desc.BlendState.RenderTarget[0]; pipeline_blend_desc.BlendEnable = TRUE; pipeline_blend_desc.SrcBlend = D3D12_BLEND_SRC_ALPHA; pipeline_blend_desc.DestBlend = D3D12_BLEND_INV_SRC_ALPHA; @@ -138,11 +138,11 @@ bool D3D12ImmediateDrawer::Initialize() { pipeline_blend_desc.RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED | D3D12_COLOR_WRITE_ENABLE_GREEN | D3D12_COLOR_WRITE_ENABLE_BLUE; - pipeline_state_desc.SampleMask = UINT_MAX; - pipeline_state_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; - pipeline_state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; - pipeline_state_desc.RasterizerState.FrontCounterClockwise = FALSE; - pipeline_state_desc.RasterizerState.DepthClipEnable = TRUE; + pipeline_desc.SampleMask = UINT_MAX; + pipeline_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; + pipeline_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; + pipeline_desc.RasterizerState.FrontCounterClockwise = FALSE; + pipeline_desc.RasterizerState.DepthClipEnable = TRUE; D3D12_INPUT_ELEMENT_DESC pipeline_input_elements[3] = {}; pipeline_input_elements[0].SemanticName = "POSITION"; pipeline_input_elements[0].Format = DXGI_FORMAT_R32G32_FLOAT; @@ -154,26 +154,24 @@ bool D3D12ImmediateDrawer::Initialize() { pipeline_input_elements[2].Format = DXGI_FORMAT_R8G8B8A8_UNORM; pipeline_input_elements[2].AlignedByteOffset = offsetof(ImmediateVertex, color); - pipeline_state_desc.InputLayout.pInputElementDescs = pipeline_input_elements; - pipeline_state_desc.InputLayout.NumElements = + pipeline_desc.InputLayout.pInputElementDescs = pipeline_input_elements; + pipeline_desc.InputLayout.NumElements = UINT(xe::countof(pipeline_input_elements)); - pipeline_state_desc.PrimitiveTopologyType = - D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; - pipeline_state_desc.NumRenderTargets = 1; - pipeline_state_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat; - pipeline_state_desc.SampleDesc.Count = 1; + pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + pipeline_desc.NumRenderTargets = 1; + pipeline_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat; + pipeline_desc.SampleDesc.Count = 1; if (FAILED(device->CreateGraphicsPipelineState( - &pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_triangle_)))) { + &pipeline_desc, IID_PPV_ARGS(&pipeline_triangle_)))) { XELOGE( "Failed to create the Direct3D 12 immediate drawer triangle pipeline " "state"); Shutdown(); return false; } - pipeline_state_desc.PrimitiveTopologyType = - D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; + pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; if (FAILED(device->CreateGraphicsPipelineState( - &pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_line_)))) { + &pipeline_desc, IID_PPV_ARGS(&pipeline_line_)))) { XELOGE( "Failed to create the Direct3D 12 immediate drawer line pipeline " "state"); @@ -267,8 +265,8 @@ void D3D12ImmediateDrawer::Shutdown() { util::ReleaseAndNull(sampler_heap_); - util::ReleaseAndNull(pipeline_state_line_); - util::ReleaseAndNull(pipeline_state_triangle_); + util::ReleaseAndNull(pipeline_line_); + util::ReleaseAndNull(pipeline_triangle_); 
util::ReleaseAndNull(root_signature_); } @@ -611,17 +609,17 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) { uint32_t(sampler_index))); } - // Set the primitive type and the pipeline state for it. + // Set the primitive type and the pipeline for it. D3D_PRIMITIVE_TOPOLOGY primitive_topology; - ID3D12PipelineState* pipeline_state; + ID3D12PipelineState* pipeline; switch (draw.primitive_type) { case ImmediatePrimitiveType::kLines: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST; - pipeline_state = pipeline_state_line_; + pipeline = pipeline_line_; break; case ImmediatePrimitiveType::kTriangles: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; - pipeline_state = pipeline_state_triangle_; + pipeline = pipeline_triangle_; break; default: assert_unhandled_case(draw.primitive_type); @@ -630,7 +628,7 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) { if (current_primitive_topology_ != primitive_topology) { current_primitive_topology_ = primitive_topology; current_command_list_->IASetPrimitiveTopology(primitive_topology); - current_command_list_->SetPipelineState(pipeline_state); + current_command_list_->SetPipelineState(pipeline); } // Draw. diff --git a/src/xenia/ui/d3d12/d3d12_immediate_drawer.h b/src/xenia/ui/d3d12/d3d12_immediate_drawer.h index 4300af76e..fbc362f59 100644 --- a/src/xenia/ui/d3d12/d3d12_immediate_drawer.h +++ b/src/xenia/ui/d3d12/d3d12_immediate_drawer.h @@ -105,8 +105,8 @@ class D3D12ImmediateDrawer : public ImmediateDrawer { kCount }; - ID3D12PipelineState* pipeline_state_triangle_ = nullptr; - ID3D12PipelineState* pipeline_state_line_ = nullptr; + ID3D12PipelineState* pipeline_triangle_ = nullptr; + ID3D12PipelineState* pipeline_line_ = nullptr; ID3D12DescriptorHeap* sampler_heap_ = nullptr; D3D12_CPU_DESCRIPTOR_HANDLE sampler_heap_cpu_start_; diff --git a/src/xenia/ui/d3d12/d3d12_provider.h b/src/xenia/ui/d3d12/d3d12_provider.h index 0e70def17..255d42a3d 100644 --- a/src/xenia/ui/d3d12/d3d12_provider.h +++ b/src/xenia/ui/d3d12/d3d12_provider.h @@ -46,22 +46,22 @@ class D3D12Provider : public GraphicsProvider { uint32_t GetRTVDescriptorSize() const { return descriptor_size_rtv_; } uint32_t GetDSVDescriptorSize() const { return descriptor_size_dsv_; } template - inline T OffsetViewDescriptor(T start, uint32_t index) const { + T OffsetViewDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_view_; return start; } template - inline T OffsetSamplerDescriptor(T start, uint32_t index) const { + T OffsetSamplerDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_sampler_; return start; } template - inline T OffsetRTVDescriptor(T start, uint32_t index) const { + T OffsetRTVDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_rtv_; return start; } template - inline T OffsetDSVDescriptor(T start, uint32_t index) const { + T OffsetDSVDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_dsv_; return start; } @@ -91,32 +91,30 @@ class D3D12Provider : public GraphicsProvider { } // Proxies for Direct3D 12 functions since they are loaded dynamically. 
- inline HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc, - D3D_ROOT_SIGNATURE_VERSION version, - ID3DBlob** blob_out, - ID3DBlob** error_blob_out) const { + HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc, + D3D_ROOT_SIGNATURE_VERSION version, + ID3DBlob** blob_out, + ID3DBlob** error_blob_out) const { return pfn_d3d12_serialize_root_signature_(desc, version, blob_out, error_blob_out); } - inline HRESULT Disassemble(const void* src_data, size_t src_data_size, - UINT flags, const char* comments, - ID3DBlob** disassembly_out) const { + HRESULT Disassemble(const void* src_data, size_t src_data_size, UINT flags, + const char* comments, ID3DBlob** disassembly_out) const { if (!pfn_d3d_disassemble_) { return E_NOINTERFACE; } return pfn_d3d_disassemble_(src_data, src_data_size, flags, comments, disassembly_out); } - inline HRESULT DxbcConverterCreateInstance(const CLSID& rclsid, - const IID& riid, - void** ppv) const { + HRESULT DxbcConverterCreateInstance(const CLSID& rclsid, const IID& riid, + void** ppv) const { if (!pfn_dxilconv_dxc_create_instance_) { return E_NOINTERFACE; } return pfn_dxilconv_dxc_create_instance_(rclsid, riid, ppv); } - inline HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid, - void** ppv) const { + HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid, + void** ppv) const { if (!pfn_dxcompiler_dxc_create_instance_) { return E_NOINTERFACE; } diff --git a/src/xenia/ui/d3d12/d3d12_util.cc b/src/xenia/ui/d3d12/d3d12_util.cc index 710d3b6db..caea2b296 100644 --- a/src/xenia/ui/d3d12/d3d12_util.cc +++ b/src/xenia/ui/d3d12/d3d12_util.cc @@ -47,7 +47,7 @@ ID3D12RootSignature* CreateRootSignature( return root_signature; } -ID3D12PipelineState* CreateComputePipelineState( +ID3D12PipelineState* CreateComputePipeline( ID3D12Device* device, const void* shader, size_t shader_size, ID3D12RootSignature* root_signature) { D3D12_COMPUTE_PIPELINE_STATE_DESC desc; diff --git a/src/xenia/ui/d3d12/d3d12_util.h b/src/xenia/ui/d3d12/d3d12_util.h index 5bce23568..6798f4f1c 100644 --- a/src/xenia/ui/d3d12/d3d12_util.h +++ b/src/xenia/ui/d3d12/d3d12_util.h @@ -27,7 +27,7 @@ extern const D3D12_HEAP_PROPERTIES kHeapPropertiesUpload; extern const D3D12_HEAP_PROPERTIES kHeapPropertiesReadback; template -inline bool ReleaseAndNull(T& object) { +bool ReleaseAndNull(T& object) { if (object != nullptr) { object->Release(); object = nullptr; @@ -39,9 +39,10 @@ inline bool ReleaseAndNull(T& object) { ID3D12RootSignature* CreateRootSignature(const D3D12Provider& provider, const D3D12_ROOT_SIGNATURE_DESC& desc); -ID3D12PipelineState* CreateComputePipelineState( - ID3D12Device* device, const void* shader, size_t shader_size, - ID3D12RootSignature* root_signature); +ID3D12PipelineState* CreateComputePipeline(ID3D12Device* device, + const void* shader, + size_t shader_size, + ID3D12RootSignature* root_signature); constexpr DXGI_FORMAT GetUintPow2DXGIFormat(uint32_t element_size_bytes_log2) { switch (element_size_bytes_log2) { diff --git a/src/xenia/ui/graphics_upload_buffer_pool.cc b/src/xenia/ui/graphics_upload_buffer_pool.cc index 2a780b0c9..5eb04fba3 100644 --- a/src/xenia/ui/graphics_upload_buffer_pool.cc +++ b/src/xenia/ui/graphics_upload_buffer_pool.cc @@ -71,7 +71,7 @@ void GraphicsUploadBufferPool::FlushWrites() { GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request( uint64_t submission_index, size_t size, size_t alignment, size_t& offset_out) { - assert_not_zero(alignment); + alignment = std::max(alignment, size_t(1)); 
assert_true(xe::is_pow2(alignment)); size = xe::align(size, alignment); assert_true(size <= page_size_); @@ -126,7 +126,7 @@ GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request( GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::RequestPartial( uint64_t submission_index, size_t size, size_t alignment, size_t& offset_out, size_t& size_out) { - assert_not_zero(alignment); + alignment = std::max(alignment, size_t(1)); assert_true(xe::is_pow2(alignment)); size = xe::align(size, alignment); size = std::min(size, page_size_); diff --git a/third_party/SDL2-static.lua b/third_party/SDL2-static.lua index a9206e300..447ceb325 100644 --- a/third_party/SDL2-static.lua +++ b/third_party/SDL2-static.lua @@ -18,7 +18,7 @@ project("SDL2") "SDL2/include", }) buildoptions({ - "/wd4828", -- illegal characters in file + "/wd4828", -- illegal characters in file https://bugzilla.libsdl.org/show_bug.cgi?id=5333 }) files({ -- 1:1 from SDL.vcxproj file diff --git a/third_party/premake-cmake b/third_party/premake-cmake new file mode 160000 index 000000000..26fbbb996 --- /dev/null +++ b/third_party/premake-cmake @@ -0,0 +1 @@ +Subproject commit 26fbbb9962aefcb1c24aff1e7952033ce1361190 diff --git a/third_party/spirv-tools.lua b/third_party/spirv-tools.lua index bf900a6e9..0e6335b98 100644 --- a/third_party/spirv-tools.lua +++ b/third_party/spirv-tools.lua @@ -73,4 +73,4 @@ project("spirv-tools") buildoptions({ "/wd4800", -- Forcing value to bool 'true' or 'false' "/wd4996", -- Call to 'std::equal' with parameters that may be unsafe - }) \ No newline at end of file + }) diff --git a/tools/build/premake b/tools/build/premake index 14e3d5ebc..9113958a5 100644 --- a/tools/build/premake +++ b/tools/build/premake @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python3 # Copyright 2015 Ben Vanik. All Rights Reserved. @@ -107,13 +107,14 @@ def has_bin(bin): return None -def shell_call(command, throw_on_error=True, stdout_path=None): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -121,17 +122,22 @@ def shell_call(command, throw_on_error=True, stdout_path=None): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=False, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=False, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result @@ -196,42 +202,5 @@ def import_subprocess_environment(args): os.environ[var.upper()] = setting break -def git_submodule_update(): - """Runs a full recursive git submodule init and update. - - Older versions of git do not support 'update --init --recursive'. We could - check and run it on versions that do support it and speed things up a bit. 
- """ - if True: - shell_call([ - 'git', - 'submodule', - 'update', - '--init', - '--recursive', - ]) - else: - shell_call([ - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'foreach', - '--recursive', - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'update', - '--recursive', - ]) - - if __name__ == '__main__': main() diff --git a/xenia-build b/xenia-build index 19bcd0307..ec3c00345 100755 --- a/xenia-build +++ b/xenia-build @@ -34,8 +34,11 @@ def main(): # Check git exists. if not has_bin('git'): - print('ERROR: git must be installed and on PATH.') - sys.exit(1) + print('WARNING: Git should be installed and on PATH. Version info will be omitted from all binaries!') + print('') + elif not git_is_repository(): + print('WARNING: The source tree is unversioned. Version info will be omitted from all binaries!') + print('') # Check python version. if not sys.version_info[:2] >= (3, 6): @@ -85,6 +88,16 @@ def main(): sys.exit(return_code) +def print_box(msg): + """Prints an important message inside a box + """ + print( + '┌{0:─^{2}}╖\n' + '│{1: ^{2}}║\n' + '╘{0:═^{2}}╝\n' + .format('', msg, len(msg) + 2)) + + def import_vs_environment(): """Finds the installed Visual Studio version and imports interesting environment variables into os.environ. @@ -150,6 +163,7 @@ def import_subprocess_environment(args): os.environ[var.upper()] = setting break + def has_bin(binary): """Checks whether the given binary is present. @@ -185,13 +199,14 @@ def get_bin(binary): return None -def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -199,21 +214,49 @@ def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=shell, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=shell, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result -def get_git_head_info(): +def generate_version_h(): + """Generates a build/version.h file that contains current git info. + """ + if git_is_repository(): + (branch_name, commit, commit_short) = git_get_head_info() + else: + branch_name = 'tarball' + commit = ':(-dont-do-this' + commit_short = ':(' + + contents = '''// Autogenerated by `xb premake`. 
@@ -247,58 +290,28 @@
   return branch_name, commit, commit_short
 
 
-def generate_version_h():
-  """Generates a build/version.h file that contains current git info.
+def git_is_repository():
+  """Checks if git is available and this source tree is versioned.
   """
-  (branch_name, commit, commit_short) = get_git_head_info()
-  contents = '''// Autogenerated by `xb premake`.
-    #ifndef GENERATED_VERSION_H_
-    #define GENERATED_VERSION_H_
-    #define XE_BUILD_BRANCH "%s"
-    #define XE_BUILD_COMMIT "%s"
-    #define XE_BUILD_COMMIT_SHORT "%s"
-    #define XE_BUILD_DATE __DATE__
-    #endif  // GENERATED_VERSION_H_
-    ''' % (branch_name, commit, commit_short)
-  with open('build/version.h', 'w') as f:
-    f.write(contents)
+  if not has_bin('git'):
+    return False
+  return shell_call([
+      'git',
+      'rev-parse',
+      '--is-inside-work-tree',
+  ], throw_on_error=False, stdout_path=os.devnull, stderr_path=os.devnull) == 0
 
 
 def git_submodule_update():
   """Runs a full recursive git submodule init and update.
-
-  Older versions of git do not support 'update --init --recursive'. We could
-  check and run it on versions that do support it and speed things up a bit.
   """
-  if True:
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--init',
-        '--recursive',
-    ])
-  else:
-    shell_call([
-        'git',
-        'submodule',
-        'init',
-    ])
-    shell_call([
-        'git',
-        'submodule',
-        'foreach',
-        '--recursive',
-        'git',
-        'submodule',
-        'init',
-    ])
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--recursive',
-    ])
+  shell_call([
+      'git',
+      'submodule',
+      'update',
+      '--init',
+      '--recursive',
+  ])
 
 
 def get_clang_format_binary():
@@ -370,9 +383,9 @@ def run_platform_premake(cc='clang', devenv=None):
     if 'VSVERSION' in os.environ:
       vs_version = os.environ['VSVERSION']
-    return run_premake('windows', 'vs' + vs_version)
+    return run_premake('windows', devenv or ('vs' + vs_version))
   else:
-    return run_premake('linux', devenv == 'codelite' and devenv or 'gmake2', cc)
+    return run_premake('linux', devenv or 'gmake2', cc)
 
 
 def run_premake_export_commands():
@@ -406,6 +419,43 @@ def get_build_bin_path(args):
   return os.path.join(self_path, 'build', 'bin', platform.capitalize(),
                       args['config'].capitalize())
 
 
+def create_clion_workspace():
+  """Creates some basic workspace information inside the .idea directory for first start.
+  """
+  if os.path.exists('.idea'):
+    # No first start
+    return False
+  print('Generating CLion workspace files...')
+  # Might become easier in the future: https://youtrack.jetbrains.com/issue/CPP-7911
+
+  # Set the location of the CMakeLists.txt
+  os.mkdir('.idea')
+  with open(os.path.join('.idea', 'misc.xml'), 'w') as f:
+    f.write("""
+
+
+
+
+
+""")
+
+  # Set available configurations
+  # TODO Find a way to trigger a cmake reload
+  with open(os.path.join('.idea', 'workspace.xml'), 'w') as f:
+    f.write("""
+
+
+
+
+
+
+
+
+""")
+
+  return True
+
+
 def discover_commands(subparsers):
   """Looks for all commands and returns a dictionary of them.
   In the future commands could be discovered on disk.
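The DevenvCommand changes further below push their reload reminder through the `print_box()` helper added earlier in this patch. Restated as a small stand-alone sketch, with the rendered output shown in comments (the message here is shortened for the example):

```python
def print_box(msg):
  # Same formatting trick as the helper in this patch: '{0:─^{2}}' centers an
  # empty string in a field of len(msg) + 2 columns, padding it with '─';
  # the middle line centers msg itself with spaces.
  print(
      '┌{0:─^{2}}╖\n'
      '│{1: ^{2}}║\n'
      '╘{0:═^{2}}╝\n'
      .format('', msg, len(msg) + 2))

print_box('Reload CMake Project')
# ┌──────────────────────╖
# │ Reload CMake Project ║
# ╘══════════════════════╝
```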
@@ -491,7 +541,10 @@ class SetupCommand(Command):
     # Setup submodules.
     print('- git submodule init / update...')
-    git_submodule_update()
+    if git_is_repository():
+      git_submodule_update()
+    else:
+      print('WARNING: Git not available or not a repository. Dependencies may be missing.')
     print('')
 
     print('- running premake...')
@@ -1445,8 +1498,13 @@ class DevenvCommand(Command):
   def execute(self, args, pass_args, cwd):
     devenv = None
+    show_reload_prompt = False
     if sys.platform == 'win32':
       print('Launching Visual Studio...')
+    elif has_bin('clion') or has_bin('clion.sh'):
+      print('Launching CLion...')
+      show_reload_prompt = create_clion_workspace()
+      devenv = 'cmake'
     else:
       print('Launching CodeLite...')
       devenv = 'codelite'
@@ -1457,11 +1515,23 @@
     print('')
     print('- launching devenv...')
+    if show_reload_prompt:
+      print_box('Please run "File ⇒ ↺ Reload CMake Project" from inside the IDE!')
     if sys.platform == 'win32':
       shell_call([
           'devenv',
           'build\\xenia.sln',
       ])
+    elif has_bin('clion'):
+      shell_call([
+          'clion',
+          '.',
+      ])
+    elif has_bin('clion.sh'):
+      shell_call([
+          'clion.sh',
+          '.',
+      ])
     else:
       shell_call([
           'codelite',