diff --git a/src/common/intrin.h b/src/common/intrin.h
index 364a0cc38..b04236719 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -14,9 +14,9 @@
 #define CPU_ARCH_SIMD 1
 #define CPU_ARCH_SSE 1
 #include <emmintrin.h>
-#include <tmmintrin.h>
-#include <smmintrin.h>
 #include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
 
 #if defined(__AVX2__)
 #define CPU_ARCH_AVX 1
@@ -96,3 +96,45 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
   for (u32 i = 0; i < remaining_count; i++)
     *(dest++) = value;
 }
+
+/// Executes a short, fixed burst of architecture-specific spin-wait hint instructions.
+/// x86/x64 issue eight PAUSE instructions and ARM issues eight ISB barriers; RISC-V currently
+/// issues a single fence instead. NOTE(review): presumably meant for busy-wait loops - confirm
+/// against callers. On any other architecture this compiles to a no-op and prints a build message.
+ALWAYS_INLINE static void MultiPause()
+{
+#if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+  _mm_pause();
+#elif defined(CPU_ARCH_ARM64) && defined(_MSC_VER)
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+  __isb(_ARM64_BARRIER_SY);
+#elif defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_ARM32)
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+  __asm__ __volatile__("isb");
+#elif defined(CPU_ARCH_RISCV64)
+  // Probably wrong... pause is optional :/
+  asm volatile("fence" ::: "memory");
+#else
+  // Use #pragma message: it is accepted by MSVC, GCC and Clang, unlike a string passed to #pragma warning.
+#pragma message("Missing implementation")
+#endif
+}
diff --git a/src/common/log_channels.h b/src/common/log_channels.h
index aaa8b7842..c05367249 100644
--- a/src/common/log_channels.h
+++ b/src/common/log_channels.h
@@ -33,6 +33,7 @@
   X(GPUShaderCache)                                                                                                    \
   X(GPUTexture)                                                                                                        \
   X(GPUTextureCache)                                                                                                   \
+  X(GPUThread)                                                                                                         \
   X(GPU_HW)                                                                                                            \
   X(GPU_SW)                                                                                                            \
   X(GPU_SW_Rasterizer)                                                                                                 \
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index f6cc9dcc8..98dcadf4e 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -61,10 +61,10 @@ add_library(core
   gpu_shadergen.h
   gpu_sw.cpp
   gpu_sw.h
-  gpu_sw_backend.cpp
-  gpu_sw_backend.h
   gpu_sw_rasterizer.cpp
   gpu_sw_rasterizer.h
+  gpu_thread.cpp
+  gpu_thread.h
   gpu_types.h
   guncon.cpp
   guncon.h
diff --git a/src/core/achievements.cpp b/src/core/achievements.cpp
index 16bfd0ed7..c39453cc9 100644
--- a/src/core/achievements.cpp
+++ b/src/core/achievements.cpp
@@ -9,6 +9,7 @@
 #include "bus.h"
 #include "cpu_core.h"
 #include "fullscreen_ui.h"
+#include "gpu_thread.h"
 #include "host.h"
 #include "system.h"
 
@@ -1144,7 +1145,7 @@ void Achievements::ClientLoadGameCallback(int result, const char* error_message,
 
   // ensure fullscreen UI is ready for notifications
   if (display_summary)
-    FullscreenUI::Initialize();
+    GPUThread::RunOnThread(&FullscreenUI::Initialize);
 
   char url_buf[URL_BUFFER_SIZE];
   if (int err = rc_client_game_get_image_url(info, url_buf, std::size(url_buf)); err == RC_OK)
@@ -1199,7 +1200,7 @@ void Achievements::ClearGameHash()
 
 void Achievements::DisplayAchievementSummary()
 {
-  if (g_settings.achievements_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_notifications)
   {
     std::string title;
     if (IsHardcoreModeActive())
@@ -1224,8 +1225,13 @@ void Achievements::DisplayAchievementSummary()
       summary = TRANSLATE_STR("Achievements", "This game has no achievements.");
     }
 
-    ImGuiFullscreen::AddNotification("achievement_summary", ACHIEVEMENT_SUMMARY_NOTIFICATION_TIME, std::move(title),
-                                     std::move(summary), s_game_icon);
+    GPUThread::RunOnThread([title = std::move(title), summary = std::move(summary), icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+      // icon was copied from s_game_icon when this task was queued, so no shared achievement state is read here.
+      ImGuiFullscreen::AddNotification("achievement_summary", ACHIEVEMENT_SUMMARY_NOTIFICATION_TIME, std::move(title),
+                                       std::move(summary), std::move(icon));
+    });
   }
 
   // Technically not going through the resource API, but since we're passing this to something else, we can't.
@@ -1235,11 +1241,16 @@ void Achievements::DisplayAchievementSummary()
 
 void Achievements::DisplayHardcoreDeferredMessage()
 {
-  if (g_settings.achievements_hardcore_mode && !s_hardcore_mode && System::IsValid() && FullscreenUI::Initialize())
+  if (g_settings.achievements_hardcore_mode && !s_hardcore_mode && System::IsValid())
   {
-    ImGuiFullscreen::ShowToast(std::string(),
-                               TRANSLATE_STR("Achievements", "Hardcore mode will be enabled on system reset."),
-                               Host::OSD_WARNING_DURATION);
+    GPUThread::RunOnThread([]() {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::ShowToast(std::string(),
+                                 TRANSLATE_STR("Achievements", "Hardcore mode will be enabled on system reset."),
+                                 Host::OSD_WARNING_DURATION);
+    });
   }
 }
 
@@ -1261,7 +1272,7 @@ void Achievements::HandleUnlockEvent(const rc_client_event_t* event)
   INFO_LOG("Achievement {} ({}) for game {} unlocked", cheevo->title, cheevo->id, s_game_id);
   UpdateGameSummary();
 
-  if (g_settings.achievements_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_notifications)
   {
     std::string title;
     if (cheevo->category == RC_CLIENT_ACHIEVEMENT_CATEGORY_UNOFFICIAL)
@@ -1271,9 +1282,15 @@ void Achievements::HandleUnlockEvent(const rc_client_event_t* event)
 
     std::string badge_path = GetAchievementBadgePath(cheevo, cheevo->state);
 
-    ImGuiFullscreen::AddNotification(fmt::format("achievement_unlock_{}", cheevo->id),
-                                     static_cast<float>(g_settings.achievements_notification_duration),
-                                     std::move(title), cheevo->description, std::move(badge_path));
+    GPUThread::RunOnThread([id = cheevo->id, duration = g_settings.achievements_notification_duration,
+                            title = std::move(title), description = std::string(cheevo->description),
+                            badge_path = std::move(badge_path)]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::AddNotification(fmt::format("achievement_unlock_{}", id), static_cast<float>(duration),
+                                       std::move(title), std::move(description), std::move(badge_path));
+    });
   }
 
   if (g_settings.achievements_sound_effects)
@@ -1285,7 +1302,7 @@ void Achievements::HandleGameCompleteEvent(const rc_client_event_t* event)
   INFO_LOG("Game {} complete", s_game_id);
   UpdateGameSummary();
 
-  if (g_settings.achievements_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_notifications)
   {
     std::string title = fmt::format(TRANSLATE_FS("Achievements", "Mastered {}"), s_game_title);
     std::string message = fmt::format(
@@ -1294,8 +1311,13 @@ void Achievements::HandleGameCompleteEvent(const rc_client_event_t* event)
                            s_game_summary.num_unlocked_achievements),
       TRANSLATE_PLURAL_STR("Achievements", "%n points", "Achievement points", s_game_summary.points_unlocked));
 
-    ImGuiFullscreen::AddNotification("achievement_mastery", GAME_COMPLETE_NOTIFICATION_TIME, std::move(title),
-                                     std::move(message), s_game_icon);
+    GPUThread::RunOnThread([title = std::move(title), message = std::move(message), icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::AddNotification("achievement_mastery", GAME_COMPLETE_NOTIFICATION_TIME, std::move(title),
+                                       std::move(message), std::move(icon));
+    });
   }
 }
 
@@ -1303,14 +1325,19 @@ void Achievements::HandleLeaderboardStartedEvent(const rc_client_event_t* event)
 {
   DEV_LOG("Leaderboard {} ({}) started", event->leaderboard->id, event->leaderboard->title);
 
-  if (g_settings.achievements_leaderboard_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_leaderboard_notifications)
   {
     std::string title = event->leaderboard->title;
     std::string message = TRANSLATE_STR("Achievements", "Leaderboard attempt started.");
 
-    ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", event->leaderboard->id),
-                                     LEADERBOARD_STARTED_NOTIFICATION_TIME, std::move(title), std::move(message),
-                                     s_game_icon);
+    GPUThread::RunOnThread([id = event->leaderboard->id, title = std::move(title), message = std::move(message),
+                            icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", id), LEADERBOARD_STARTED_NOTIFICATION_TIME,
+                                       std::move(title), std::move(message), std::move(icon));
+    });
   }
 }
 
@@ -1318,14 +1345,19 @@ void Achievements::HandleLeaderboardFailedEvent(const rc_client_event_t* event)
 {
   DEV_LOG("Leaderboard {} ({}) failed", event->leaderboard->id, event->leaderboard->title);
 
-  if (g_settings.achievements_leaderboard_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_leaderboard_notifications)
   {
     std::string title = event->leaderboard->title;
     std::string message = TRANSLATE_STR("Achievements", "Leaderboard attempt failed.");
 
-    ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", event->leaderboard->id),
-                                     LEADERBOARD_FAILED_NOTIFICATION_TIME, std::move(title), std::move(message),
-                                     s_game_icon);
+    GPUThread::RunOnThread([id = event->leaderboard->id, title = std::move(title), message = std::move(message),
+                            icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", id), LEADERBOARD_FAILED_NOTIFICATION_TIME,
+                                       std::move(title), std::move(message), std::move(icon));
+    });
   }
 }
 
@@ -1333,7 +1365,7 @@ void Achievements::HandleLeaderboardSubmittedEvent(const rc_client_event_t* even
 {
   DEV_LOG("Leaderboard {} ({}) submitted", event->leaderboard->id, event->leaderboard->title);
 
-  if (g_settings.achievements_leaderboard_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_leaderboard_notifications)
   {
     static const char* value_strings[NUM_RC_CLIENT_LEADERBOARD_FORMATS] = {
       TRANSLATE_NOOP("Achievements", "Your Time: {}{}"),
@@ -1349,9 +1381,14 @@ void Achievements::HandleLeaderboardSubmittedEvent(const rc_client_event_t* even
       event->leaderboard->tracker_value ? event->leaderboard->tracker_value : "Unknown",
       g_settings.achievements_spectator_mode ? std::string_view() : TRANSLATE_SV("Achievements", " (Submitting)"));
 
-    ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", event->leaderboard->id),
-                                     static_cast<float>(g_settings.achievements_leaderboard_duration), std::move(title),
-                                     std::move(message), s_game_icon);
+    GPUThread::RunOnThread([id = event->leaderboard->id, duration = g_settings.achievements_leaderboard_duration,
+                            title = std::move(title), message = std::move(message), icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+      // duration and icon were snapshotted when this task was queued; g_settings is not read from here.
+      ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", id), static_cast<float>(duration),
+                                       std::move(title), std::move(message), std::move(icon));
+    });
   }
 
   if (g_settings.achievements_sound_effects)
@@ -1363,7 +1400,7 @@ void Achievements::HandleLeaderboardScoreboardEvent(const rc_client_event_t* eve
   DEV_LOG("Leaderboard {} scoreboard rank {} of {}", event->leaderboard_scoreboard->leaderboard_id,
           event->leaderboard_scoreboard->new_rank, event->leaderboard_scoreboard->num_entries);
 
-  if (g_settings.achievements_leaderboard_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_leaderboard_notifications)
   {
     static const char* value_strings[NUM_RC_CLIENT_LEADERBOARD_FORMATS] = {
       TRANSLATE_NOOP("Achievements", "Your Time: {} (Best: {})"),
@@ -1380,9 +1417,15 @@ void Achievements::HandleLeaderboardScoreboardEvent(const rc_client_event_t* eve
                   event->leaderboard_scoreboard->submitted_score, event->leaderboard_scoreboard->best_score),
       event->leaderboard_scoreboard->new_rank, event->leaderboard_scoreboard->num_entries);
 
-    ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", event->leaderboard->id),
-                                     static_cast<float>(g_settings.achievements_leaderboard_duration), std::move(title),
-                                     std::move(message), s_game_icon);
+    GPUThread::RunOnThread([id = event->leaderboard->id, duration = g_settings.achievements_leaderboard_duration,
+                            title = std::move(title), message = std::move(message), icon = s_game_icon]() mutable {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      // duration and icon were snapshotted when this task was queued; g_settings is not read from here.
+      ImGuiFullscreen::AddNotification(fmt::format("leaderboard_{}", id), static_cast<float>(duration),
+                                       std::move(title), std::move(message), std::move(icon));
+    });
   }
 }
 
@@ -1512,26 +1555,30 @@ void Achievements::HandleServerDisconnectedEvent(const rc_client_event_t* event)
 {
   WARNING_LOG("Server disconnected.");
 
-  if (FullscreenUI::Initialize())
-  {
+  GPUThread::RunOnThread([]() {
+    if (!FullscreenUI::Initialize())
+      return;
+
     ImGuiFullscreen::ShowToast(
       TRANSLATE_STR("Achievements", "Achievements Disconnected"),
       TRANSLATE_STR("Achievements",
                     "An unlock request could not be completed. We will keep retrying to submit this request."),
       Host::OSD_ERROR_DURATION);
-  }
+  });
 }
 
 void Achievements::HandleServerReconnectedEvent(const rc_client_event_t* event)
 {
   WARNING_LOG("Server reconnected.");
 
-  if (FullscreenUI::Initialize())
-  {
+  GPUThread::RunOnThread([]() {
+    if (!FullscreenUI::Initialize())
+      return;
+
     ImGuiFullscreen::ShowToast(TRANSLATE_STR("Achievements", "Achievements Reconnected"),
                                TRANSLATE_STR("Achievements", "All pending unlock requests have completed."),
                                Host::OSD_INFO_DURATION);
-  }
+  });
 }
 
 void Achievements::ResetClient()
@@ -1609,12 +1656,17 @@ void Achievements::SetHardcoreMode(bool enabled, bool force_display_message)
   // new mode
   s_hardcore_mode = enabled;
 
-  if (System::IsValid() && (HasActiveGame() || force_display_message) && FullscreenUI::Initialize())
+  if (System::IsValid() && (HasActiveGame() || force_display_message))
   {
-    ImGuiFullscreen::ShowToast(std::string(),
-                               enabled ? TRANSLATE_STR("Achievements", "Hardcore mode is now enabled.") :
-                                         TRANSLATE_STR("Achievements", "Hardcore mode is now disabled."),
-                               Host::OSD_INFO_DURATION);
+    GPUThread::RunOnThread([enabled]() {
+      if (!FullscreenUI::Initialize())
+        return;
+
+      ImGuiFullscreen::ShowToast(std::string(),
+                                 enabled ? TRANSLATE_STR("Achievements", "Hardcore mode is now enabled.") :
+                                           TRANSLATE_STR("Achievements", "Hardcore mode is now disabled."),
+                                 Host::OSD_INFO_DURATION);
+    });
   }
 
   rc_client_set_hardcore_enabled(s_client, enabled);
@@ -1925,7 +1977,7 @@ void Achievements::ShowLoginNotification()
   if (!user)
     return;
 
-  if (g_settings.achievements_notifications && FullscreenUI::Initialize())
+  if (g_settings.achievements_notifications)
   {
     std::string badge_path = GetLoggedInUserBadgePath();
     std::string title = user->display_name;
@@ -1934,8 +1986,14 @@ void Achievements::ShowLoginNotification()
     std::string summary = fmt::format(TRANSLATE_FS("Achievements", "Score: {} ({} softcore)\nUnread messages: {}"),
                                       user->score, user->score_softcore, user->num_unread_messages);
 
-    ImGuiFullscreen::AddNotification("achievements_login", LOGIN_NOTIFICATION_TIME, std::move(title),
-                                     std::move(summary), std::move(badge_path));
+    GPUThread::RunOnThread(
+      [title = std::move(title), summary = std::move(summary), badge_path = std::move(badge_path)]() mutable {
+        if (!FullscreenUI::Initialize())
+          return;
+
+        ImGuiFullscreen::AddNotification("achievements_login", LOGIN_NOTIFICATION_TIME, std::move(title),
+                                         std::move(summary), std::move(badge_path));
+      });
   }
 }
 
@@ -2035,14 +2093,6 @@ void Achievements::ConfirmHardcoreModeDisableAsync(const char* trigger, std::fun
   }
 #endif
 
-  if (!FullscreenUI::Initialize())
-  {
-    Host::AddOSDMessage(fmt::format(TRANSLATE_FS("Achievements", "Cannot {} while hardcode mode is active."), trigger),
-                        Host::OSD_WARNING_DURATION);
-    callback(false);
-    return;
-  }
-
   auto real_callback = [callback = std::move(callback)](bool res) mutable {
     // don't run the callback in the middle of rendering the UI
     Host::RunOnCPUThread([callback = std::move(callback), res]() {
@@ -2052,13 +2102,25 @@ void Achievements::ConfirmHardcoreModeDisableAsync(const char* trigger, std::fun
     });
   };
 
-  ImGuiFullscreen::OpenConfirmMessageDialog(
-    TRANSLATE_STR("Achievements", "Confirm Hardcore Mode"),
-    fmt::format(TRANSLATE_FS("Achievements", "{0} cannot be performed while hardcore mode is active. Do you "
-                                             "want to disable hardcore mode? {0} will be cancelled if you select No."),
-                trigger),
-    std::move(real_callback), fmt::format(ICON_FA_CHECK " {}", TRANSLATE_SV("Achievements", "Yes")),
-    fmt::format(ICON_FA_TIMES " {}", TRANSLATE_SV("Achievements", "No")));
+  GPUThread::RunOnThread([trigger = std::string(trigger), real_callback = std::move(real_callback)]() mutable {
+    if (!FullscreenUI::Initialize())
+    {
+      Host::AddOSDMessage(
+        fmt::format(TRANSLATE_FS("Achievements", "Cannot {} while hardcode mode is active."), trigger),
+        Host::OSD_WARNING_DURATION);
+      real_callback(false);
+      return;
+    }
+
+    ImGuiFullscreen::OpenConfirmMessageDialog(
+      TRANSLATE_STR("Achievements", "Confirm Hardcore Mode"),
+      fmt::format(TRANSLATE_FS("Achievements",
+                               "{0} cannot be performed while hardcore mode is active. Do you "
+                               "want to disable hardcore mode? {0} will be cancelled if you select No."),
+                  trigger),
+      std::move(real_callback), fmt::format(ICON_FA_CHECK " {}", TRANSLATE_SV("Achievements", "Yes")),
+      fmt::format(ICON_FA_TIMES " {}", TRANSLATE_SV("Achievements", "No")));
+  });
 #else
   Host::AddOSDMessage(fmt::format(TRANSLATE_FS("Achievements", "Cannot {} while hardcode mode is active."), trigger),
                       Host::OSD_WARNING_DURATION);
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index f3798eb86..0beaf2e2a 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -52,8 +52,8 @@
     <ClCompile Include="gpu_hw_texture_cache.cpp" />
     <ClCompile Include="gpu_shadergen.cpp" />
     <ClCompile Include="gpu_sw.cpp" />
-    <ClCompile Include="gpu_sw_backend.cpp" />
     <ClCompile Include="gpu_sw_rasterizer.cpp" />
+    <ClCompile Include="gpu_thread.cpp" />
     <ClCompile Include="gte.cpp" />
     <ClCompile Include="dma.cpp" />
     <ClCompile Include="gpu.cpp" />
@@ -133,8 +133,8 @@
     <ClInclude Include="gpu_hw_texture_cache.h" />
     <ClInclude Include="gpu_shadergen.h" />
     <ClInclude Include="gpu_sw.h" />
-    <ClInclude Include="gpu_sw_backend.h" />
     <ClInclude Include="gpu_sw_rasterizer.h" />
+    <ClInclude Include="gpu_thread.h" />
     <ClInclude Include="gpu_types.h" />
     <ClInclude Include="gte.h" />
     <ClInclude Include="cpu_types.h" />
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index 5a8d278a2..5e5e105f5 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -45,7 +45,6 @@
     <ClCompile Include="analog_joystick.cpp" />
     <ClCompile Include="cpu_recompiler_code_generator_aarch32.cpp" />
     <ClCompile Include="gpu_backend.cpp" />
-    <ClCompile Include="gpu_sw_backend.cpp" />
     <ClCompile Include="multitap.cpp" />
     <ClCompile Include="host.cpp" />
     <ClCompile Include="game_database.cpp" />
@@ -71,6 +70,7 @@
     <ClCompile Include="gpu_dump.cpp" />
     <ClCompile Include="cdrom_subq_replacement.cpp" />
     <ClCompile Include="performance_counters.cpp" />
+    <ClCompile Include="gpu_thread.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="types.h" />
@@ -119,7 +119,6 @@
     <ClInclude Include="analog_joystick.h" />
     <ClInclude Include="gpu_types.h" />
     <ClInclude Include="gpu_backend.h" />
-    <ClInclude Include="gpu_sw_backend.h" />
     <ClInclude Include="multitap.h" />
     <ClInclude Include="host.h" />
     <ClInclude Include="achievements.h" />
@@ -149,6 +148,7 @@
     <ClInclude Include="cdrom_subq_replacement.h" />
     <ClInclude Include="performance_counters.h" />
     <ClInclude Include="system_private.h" />
+    <ClInclude Include="gpu_thread.h" />
   </ItemGroup>
   <ItemGroup>
     <None Include="gpu_sw_rasterizer.inl" />
diff --git a/src/core/fullscreen_ui.cpp b/src/core/fullscreen_ui.cpp
index 244682c2b..c90d941ad 100644
--- a/src/core/fullscreen_ui.cpp
+++ b/src/core/fullscreen_ui.cpp
@@ -8,6 +8,7 @@
 #include "controller.h"
 #include "game_list.h"
 #include "gpu.h"
+#include "gpu_thread.h"
 #include "host.h"
 #include "settings.h"
 #include "system.h"
@@ -219,6 +220,7 @@ struct PostProcessingStageInfo
 //////////////////////////////////////////////////////////////////////////
 // Main
 //////////////////////////////////////////////////////////////////////////
+static void UpdateRunIdleState();
 static void PauseForMenuOpen(bool set_pause_menu_open);
 static bool AreAnyDialogsOpen();
 static void ClosePauseMenu();
@@ -600,12 +602,13 @@ bool FullscreenUI::Initialize()
   s_about_window_open = false;
   s_hotkey_list_cache = InputManager::GetHotkeyList();
 
+  if (s_initialized)
+    Host::RunOnCPUThread([]() { Host::OnFullscreenUIStartedOrStopped(true); });
+
   if (!System::IsValid())
     SwitchToLanding();
 
-  if (!System::IsRunning())
-    Host::OnIdleStateChanged();
-
+  UpdateRunIdleState();
   ForceKeyNavEnabled();
   return true;
 }
@@ -629,6 +632,7 @@ bool FullscreenUI::AreAnyDialogsOpen()
 
 void FullscreenUI::CheckForConfigChanges(const Settings& old_settings)
 {
+  // NOTE: Called on CPU thread.
   if (!IsInitialized())
     return;
 
@@ -636,54 +640,102 @@ void FullscreenUI::CheckForConfigChanges(const Settings& old_settings)
   // That means we're going to be reading achievement state.
   if (old_settings.achievements_enabled && !g_settings.achievements_enabled)
   {
-    if (s_current_main_window == MainWindowType::Achievements || s_current_main_window == MainWindowType::Leaderboards)
-      ReturnToPreviousWindow();
+    GPUThread::RunOnThread([]() {
+      // Re-check inside the task: the caller verified initialization, but state may change before this runs.
+      if (!IsInitialized())
+        return;
+      if (s_current_main_window == MainWindowType::Achievements ||
+          s_current_main_window == MainWindowType::Leaderboards)
+      {
+        ReturnToPreviousWindow();
+      }
+    });
   }
 }
 
+void FullscreenUI::UpdateRunIdleState()
+{
+  // Sync the GPU thread's run-idle flag with HasActiveWindow(); the host is notified only on a change.
+  if (const bool want_idle = HasActiveWindow(); GPUThread::GetRunIdleOnThread() != want_idle)
+  {
+    GPUThread::SetRunIdleOnThread(want_idle);
+    Host::RunOnCPUThread([want_idle]() { Host::OnFullscreenUIActiveChanged(want_idle); });
+  }
+}
+
 void FullscreenUI::OnSystemStarted()
 {
+  // NOTE: Called on CPU thread.
   if (!IsInitialized())
     return;
 
-  s_current_main_window = MainWindowType::None;
-  QueueResetFocus(FocusResetType::ViewChanged);
-}
+  GPUThread::RunOnThread([]() {
+    if (!IsInitialized())
+      return;
 
-void FullscreenUI::OnSystemPaused()
-{
-  // noop
+    s_current_main_window = MainWindowType::None;
+    QueueResetFocus(FocusResetType::ViewChanged);
+    UpdateRunIdleState();
+  });
 }
 
 void FullscreenUI::OnSystemResumed()
 {
-  // get rid of pause menu if we unpaused another way
-  if (s_current_main_window == MainWindowType::PauseMenu)
-    ClosePauseMenu();
+  // NOTE: Called on CPU thread.
+  if (!IsInitialized())
+    return;
+
+  GPUThread::RunOnThread([]() {
+    if (!IsInitialized())
+      return;
+
+    // get rid of pause menu if we unpaused another way
+    if (s_current_main_window == MainWindowType::PauseMenu)
+      ClosePauseMenu();
+
+    UpdateRunIdleState();
+  });
 }
 
 void FullscreenUI::OnSystemDestroyed()
 {
+  // NOTE: Called on CPU thread.
   if (!IsInitialized())
     return;
 
-  s_pause_menu_was_open = false;
-  s_was_paused_on_quick_menu_open = false;
-  s_current_pause_submenu = PauseSubMenu::None;
-  SwitchToLanding();
+  GPUThread::RunOnThread([]() {
+    if (!IsInitialized())
+      return;
+
+    s_pause_menu_was_open = false;
+    s_was_paused_on_quick_menu_open = false;
+    s_current_pause_submenu = PauseSubMenu::None;
+    SwitchToLanding();
+    UpdateRunIdleState();
+  });
 }
 
 void FullscreenUI::OnRunningGameChanged()
 {
+  // NOTE: Called on CPU thread.
   if (!IsInitialized())
     return;
 
   const std::string& path = System::GetDiscPath();
   const std::string& serial = System::GetGameSerial();
+
+  std::string subtitle;
   if (!serial.empty())
-    s_current_game_subtitle = fmt::format("{0} - {1}", serial, Path::GetFileName(path));
+    subtitle = fmt::format("{0} - {1}", serial, Path::GetFileName(path));
   else
-    s_current_game_subtitle = {};
+    subtitle = {};
+
+  GPUThread::RunOnThread([subtitle = std::move(subtitle)]() mutable {
+    if (!IsInitialized())
+      return;
+
+    s_current_game_subtitle = std::move(subtitle);
+  });
 }
 
 void FullscreenUI::PauseForMenuOpen(bool set_pause_menu_open)
@@ -700,15 +752,18 @@ void FullscreenUI::OpenPauseMenu()
   if (!System::IsValid())
     return;
 
-  if (!Initialize() || s_current_main_window != MainWindowType::None)
-    return;
+  GPUThread::RunOnThread([]() {
+    if (!Initialize() || s_current_main_window != MainWindowType::None)
+      return;
 
-  PauseForMenuOpen(true);
-  s_current_main_window = MainWindowType::PauseMenu;
-  s_current_pause_submenu = PauseSubMenu::None;
-  QueueResetFocus(FocusResetType::ViewChanged);
-  ForceKeyNavEnabled();
-  FixStateIfPaused();
+    PauseForMenuOpen(true);
+    s_current_main_window = MainWindowType::PauseMenu;
+    s_current_pause_submenu = PauseSubMenu::None;
+    QueueResetFocus(FocusResetType::ViewChanged);
+    ForceKeyNavEnabled();
+    UpdateRunIdleState();
+    FixStateIfPaused();
+  });
 }
 
 void FullscreenUI::OpenCheatsMenu()
@@ -722,6 +777,7 @@ void FullscreenUI::OpenCheatsMenu()
   s_settings_page = SettingsPage::Cheats;
   PauseForMenuOpen(true);
   ForceKeyNavEnabled();
+  UpdateRunIdleState();
   FixStateIfPaused();
 }
 
@@ -732,31 +788,27 @@ void FullscreenUI::FixStateIfPaused()
 
   // When we're paused, we won't have trickled the key up event for escape yet. Do it now.
   ImGui::UpdateInputEvents(false);
-
-  Host::OnIdleStateChanged();
-  Host::RunOnCPUThread([]() {
-    if (System::IsValid())
-    {
-      // Why twice? To clear the "wants keyboard input" flag.
-      System::InvalidateDisplay();
-      System::InvalidateDisplay();
-    }
-  });
 }
 
 void FullscreenUI::ClosePauseMenu()
 {
-  if (!IsInitialized() || !System::IsValid())
+  if (!System::IsValid())
     return;
 
-  if (System::GetState() == System::State::Paused && !s_was_paused_on_quick_menu_open)
-    Host::RunOnCPUThread([]() { System::PauseSystem(false); });
+  GPUThread::RunOnThread([]() {
+    if (!IsInitialized())
+      return;
 
-  s_current_main_window = MainWindowType::None;
-  s_current_pause_submenu = PauseSubMenu::None;
-  s_pause_menu_was_open = false;
-  QueueResetFocus(FocusResetType::ViewChanged);
-  FixStateIfPaused();
+    if (System::GetState() == System::State::Paused && !s_was_paused_on_quick_menu_open)
+      Host::RunOnCPUThread([]() { System::PauseSystem(false); });
+
+    s_current_main_window = MainWindowType::None;
+    s_current_pause_submenu = PauseSubMenu::None;
+    s_pause_menu_was_open = false;
+    QueueResetFocus(FocusResetType::ViewChanged);
+    UpdateRunIdleState();
+    FixStateIfPaused();
+  });
 }
 
 void FullscreenUI::OpenPauseSubMenu(PauseSubMenu submenu)
@@ -787,8 +839,12 @@ void FullscreenUI::Shutdown()
   s_current_game_subtitle = {};
   DestroyResources();
   ImGuiFullscreen::Shutdown();
+  if (s_initialized)
+    Host::RunOnCPUThread([]() { Host::OnFullscreenUIStartedOrStopped(false); });
+
   s_initialized = false;
   s_tried_to_initialize = false;
+  UpdateRunIdleState();
 }
 
 void FullscreenUI::Render()
@@ -920,6 +976,7 @@ void FullscreenUI::ReturnToMainWindow()
 {
   ClosePauseMenu();
   s_current_main_window = System::IsValid() ? MainWindowType::None : MainWindowType::Landing;
+  UpdateRunIdleState();
   FixStateIfPaused();
 }
 
@@ -958,6 +1015,11 @@ void FullscreenUI::DoStartPath(std::string path, std::string state, std::optiona
   if (System::IsValid())
     return;
 
+  // Switch to nothing, we'll get called back via OnSystemDestroyed() if startup fails.
+  s_current_main_window = MainWindowType::None;
+  QueueResetFocus(FocusResetType::ViewChanged);
+  UpdateRunIdleState();
+
   SystemBootParameters params;
   params.filename = std::move(path);
   params.save_state = std::move(state);
@@ -1152,108 +1214,116 @@ void FullscreenUI::DoChangeDiscFromFile()
 
 void FullscreenUI::DoChangeDisc()
 {
-  ImGuiFullscreen::ChoiceDialogOptions options;
+  Host::RunOnCPUThread([]() {
+    ImGuiFullscreen::ChoiceDialogOptions options;
 
-  if (System::HasMediaSubImages())
-  {
-    const u32 current_index = System::GetMediaSubImageIndex();
-    const u32 count = System::GetMediaSubImageCount();
-    options.reserve(count + 1);
-    options.emplace_back(FSUI_STR("From File..."), false);
-
-    for (u32 i = 0; i < count; i++)
-      options.emplace_back(System::GetMediaSubImageTitle(i), i == current_index);
-
-    auto callback = [](s32 index, const std::string& title, bool checked) {
-      if (index == 0)
-      {
-        CloseChoiceDialog();
-        DoChangeDiscFromFile();
-        return;
-      }
-      else if (index > 0)
-      {
-        System::SwitchMediaSubImage(static_cast<u32>(index - 1));
-      }
-
-      CloseChoiceDialog();
-      ReturnToPreviousWindow();
-    };
-
-    OpenChoiceDialog(FSUI_ICONSTR(ICON_FA_COMPACT_DISC, "Select Disc Image"), true, std::move(options),
-                     std::move(callback));
-
-    return;
-  }
-
-  if (const GameDatabase::Entry* entry = System::GetGameDatabaseEntry(); entry && !entry->disc_set_serials.empty())
-  {
-    const auto lock = GameList::GetLock();
-    auto matches = GameList::GetMatchingEntriesForSerial(entry->disc_set_serials);
-    if (matches.size() > 1)
+    if (System::HasMediaSubImages())
     {
-      options.reserve(matches.size() + 1);
+      const u32 current_index = System::GetMediaSubImageIndex();
+      const u32 count = System::GetMediaSubImageCount();
+      options.reserve(count + 1);
       options.emplace_back(FSUI_STR("From File..."), false);
 
-      std::vector<std::string> paths;
-      paths.reserve(matches.size());
+      for (u32 i = 0; i < count; i++)
+        options.emplace_back(System::GetMediaSubImageTitle(i), i == current_index);
 
-      const std::string& current_path = System::GetDiscPath();
-      for (auto& [title, glentry] : matches)
-      {
-        options.emplace_back(std::move(title), current_path == glentry->path);
-        paths.push_back(glentry->path);
-      }
+      GPUThread::RunOnThread([options = std::move(options)]() mutable {
+        auto callback = [](s32 index, const std::string& title, bool checked) {
+          if (index == 0)
+          {
+            CloseChoiceDialog();
+            DoChangeDiscFromFile();
+            return;
+          }
+          else if (index > 0)
+          {
+            System::SwitchMediaSubImage(static_cast<u32>(index - 1));
+          }
 
-      auto callback = [paths = std::move(paths)](s32 index, const std::string& title, bool checked) {
-        if (index == 0)
-        {
           CloseChoiceDialog();
-          DoChangeDiscFromFile();
-          return;
-        }
-        else if (index > 0)
-        {
-          System::InsertMedia(paths[index - 1].c_str());
-        }
+          ReturnToPreviousWindow();
+        };
 
-        CloseChoiceDialog();
-        ReturnToMainWindow();
-      };
-
-      OpenChoiceDialog(FSUI_ICONSTR(ICON_FA_COMPACT_DISC, "Select Disc Image"), true, std::move(options),
-                       std::move(callback));
+        OpenChoiceDialog(FSUI_ICONSTR(ICON_FA_COMPACT_DISC, "Select Disc Image"), true, std::move(options),
+                         std::move(callback));
+      });
 
       return;
     }
-  }
 
-  DoChangeDiscFromFile();
+    if (const GameDatabase::Entry* entry = System::GetGameDatabaseEntry(); entry && !entry->disc_set_serials.empty())
+    {
+      const auto lock = GameList::GetLock();
+      auto matches = GameList::GetMatchingEntriesForSerial(entry->disc_set_serials);
+      if (matches.size() > 1)
+      {
+        options.reserve(matches.size() + 1);
+        options.emplace_back(FSUI_STR("From File..."), false);
+
+        std::vector<std::string> paths;
+        paths.reserve(matches.size());
+
+        const std::string& current_path = System::GetDiscPath();
+        for (auto& [title, glentry] : matches)
+        {
+          options.emplace_back(std::move(title), current_path == glentry->path);
+          paths.push_back(glentry->path);
+        }
+
+        GPUThread::RunOnThread([options = std::move(options), paths = std::move(paths)]() mutable {
+          auto callback = [paths = std::move(paths)](s32 index, const std::string& title, bool checked) {
+            if (index == 0)
+            {
+              CloseChoiceDialog();
+              DoChangeDiscFromFile();
+              return;
+            }
+            else if (index > 0)
+            {
+              System::InsertMedia(paths[index - 1].c_str());
+            }
+
+            CloseChoiceDialog();
+            ReturnToMainWindow();
+          };
+
+          OpenChoiceDialog(FSUI_ICONSTR(ICON_FA_COMPACT_DISC, "Select Disc Image"), true, std::move(options),
+                           std::move(callback));
+        });
+
+        return;
+      }
+    }
+
+    GPUThread::RunOnThread([]() { DoChangeDiscFromFile(); });
+  });
 }
 
 void FullscreenUI::DoToggleAnalogMode()
 {
   // hacky way to toggle analog mode
-  for (u32 i = 0; i < NUM_CONTROLLER_AND_CARD_PORTS; i++)
-  {
-    Controller* ctrl = System::GetController(i);
-    if (!ctrl)
-      continue;
-
-    const Controller::ControllerInfo* cinfo = Controller::GetControllerInfo(ctrl->GetType());
-    if (!cinfo)
-      continue;
-
-    for (const Controller::ControllerBindingInfo& bi : cinfo->bindings)
+  Host::RunOnCPUThread([]() {
+    for (u32 i = 0; i < NUM_CONTROLLER_AND_CARD_PORTS; i++)
     {
-      if (std::strcmp(bi.name, "Analog") == 0)
+      Controller* ctrl = System::GetController(i);
+      if (!ctrl)
+        continue;
+
+      const Controller::ControllerInfo* cinfo = Controller::GetControllerInfo(ctrl->GetType());
+      if (!cinfo)
+        continue;
+
+      for (const Controller::ControllerBindingInfo& bi : cinfo->bindings)
       {
-        ctrl->SetBindState(bi.bind_index, 1.0f);
-        ctrl->SetBindState(bi.bind_index, 0.0f);
-        break;
+        if (std::strcmp(bi.name, "Analog") == 0)
+        {
+          ctrl->SetBindState(bi.bind_index, 1.0f);
+          ctrl->SetBindState(bi.bind_index, 0.0f);
+          break;
+        }
       }
     }
-  }
+  });
 }
 
 void FullscreenUI::DoRequestExit()
@@ -3760,12 +3830,9 @@ void FullscreenUI::DrawControllerSettingsPage()
                   &Settings::GetMultitapModeName, &Settings::GetMultitapModeDisplayName, MultitapMode::Count);
 
   // load mtap settings
-  MultitapMode mtap_mode = g_settings.multitap_mode;
-  if (IsEditingGameSettings(bsi))
-  {
-    mtap_mode = Settings::ParseMultitapModeName(bsi->GetTinyStringValue("ControllerPorts", "MultitapMode", "").c_str())
-                  .value_or(g_settings.multitap_mode);
-  }
+  const MultitapMode mtap_mode =
+    Settings::ParseMultitapModeName(bsi->GetTinyStringValue("ControllerPorts", "MultitapMode", "").c_str())
+      .value_or(Settings::DEFAULT_MULTITAP_MODE);
   const std::array<bool, 2> mtap_enabled = {
     {(mtap_mode == MultitapMode::Port1Only || mtap_mode == MultitapMode::BothPorts),
      (mtap_mode == MultitapMode::Port2Only || mtap_mode == MultitapMode::BothPorts)}};
@@ -7388,31 +7455,36 @@ void FullscreenUI::DrawAboutWindow()
 
 void FullscreenUI::OpenAchievementsWindow()
 {
+  if (!System::IsValid())
+    return;
+
   if (!Achievements::IsActive())
   {
     Host::AddKeyedOSDMessage("achievements_disabled", FSUI_STR("Achievements are not enabled."),
                              Host::OSD_INFO_DURATION);
     return;
   }
-
-  if (!System::IsValid() || !Initialize())
-    return;
-
-  if (!Achievements::HasAchievements() || !Achievements::PrepareAchievementsWindow())
+  else if (!Achievements::HasAchievements())
   {
     ShowToast(std::string(), FSUI_STR("This game has no achievements."));
     return;
   }
 
-  if (s_current_main_window != MainWindowType::PauseMenu)
-  {
-    PauseForMenuOpen(false);
-    ForceKeyNavEnabled();
-  }
+  GPUThread::RunOnThread([]() {
+    if (!Initialize() || !Achievements::PrepareAchievementsWindow())
+      return;
 
-  s_current_main_window = MainWindowType::Achievements;
-  QueueResetFocus(FocusResetType::ViewChanged);
-  FixStateIfPaused();
+    if (s_current_main_window != MainWindowType::PauseMenu)
+    {
+      PauseForMenuOpen(false);
+      ForceKeyNavEnabled();
+    }
+
+    s_current_main_window = MainWindowType::Achievements;
+    QueueResetFocus(FocusResetType::ViewChanged);
+    UpdateRunIdleState();
+    FixStateIfPaused();
+  });
 }
 
 bool FullscreenUI::IsAchievementsWindowOpen()
@@ -7422,31 +7494,36 @@ bool FullscreenUI::IsAchievementsWindowOpen()
 
 void FullscreenUI::OpenLeaderboardsWindow()
 {
+  if (!System::IsValid())
+    return;
+
   if (!Achievements::IsActive())
   {
     Host::AddKeyedOSDMessage("achievements_disabled", FSUI_STR("Leaderboards are not enabled."),
                              Host::OSD_INFO_DURATION);
     return;
   }
-
-  if (!System::IsValid() || !Initialize())
-    return;
-
-  if (!Achievements::HasLeaderboards() || !Achievements::PrepareLeaderboardsWindow())
+  else if (!Achievements::HasLeaderboards())
   {
     ShowToast(std::string(), FSUI_STR("This game has no leaderboards."));
     return;
   }
 
-  if (s_current_main_window != MainWindowType::PauseMenu)
-  {
-    PauseForMenuOpen(false);
-    ForceKeyNavEnabled();
-  }
+  GPUThread::RunOnThread([]() {
+    if (!Initialize() || !Achievements::PrepareLeaderboardsWindow())
+      return;
 
-  s_current_main_window = MainWindowType::Leaderboards;
-  QueueResetFocus(FocusResetType::ViewChanged);
-  FixStateIfPaused();
+    if (s_current_main_window != MainWindowType::PauseMenu)
+    {
+      PauseForMenuOpen(false);
+      ForceKeyNavEnabled();
+    }
+
+    s_current_main_window = MainWindowType::Leaderboards;
+    QueueResetFocus(FocusResetType::ViewChanged);
+    UpdateRunIdleState();
+    FixStateIfPaused();
+  });
 }
 
 bool FullscreenUI::IsLeaderboardsWindowOpen()
diff --git a/src/core/fullscreen_ui.h b/src/core/fullscreen_ui.h
index 9e8405442..f770955b3 100644
--- a/src/core/fullscreen_ui.h
+++ b/src/core/fullscreen_ui.h
@@ -21,7 +21,6 @@ bool IsInitialized();
 bool HasActiveWindow();
 void CheckForConfigChanges(const Settings& old_settings);
 void OnSystemStarted();
-void OnSystemPaused();
 void OnSystemResumed();
 void OnSystemDestroyed();
 void OnRunningGameChanged();
@@ -50,6 +49,12 @@ namespace Host {
 
 #ifndef __ANDROID__
 
+/// Called whenever fullscreen UI starts/stops.
+void OnFullscreenUIStartedOrStopped(bool started);
+
+/// Called when the pause state changes, or fullscreen UI opens.
+void OnFullscreenUIActiveChanged(bool is_active);
+
 /// Requests shut down and exit of the hosting application. This may not actually exit,
 /// if the user cancels the shutdown confirmation.
 void RequestExitApplication(bool allow_confirm);
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 9ec22af4a..56f080c3b 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -3,9 +3,12 @@
 
 #include "gpu.h"
 #include "dma.h"
+#include "gpu_backend.h"
 #include "gpu_dump.h"
+#include "gpu_hw_texture_cache.h"
 #include "gpu_shadergen.h"
 #include "gpu_sw_rasterizer.h"
+#include "gpu_thread.h"
 #include "host.h"
 #include "interrupt_controller.h"
 #include "performance_counters.h"
@@ -72,18 +75,7 @@ static u64 s_active_gpu_cycles = 0;
 static u32 s_active_gpu_cycles_frames = 0;
 #endif
 
-static constexpr GPUTexture::Format DISPLAY_INTERNAL_POSTFX_FORMAT = GPUTexture::Format::RGBA8;
-
-static bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string filename, FileSystem::ManagedCFilePtr fp,
-                                          u8 quality, bool clear_alpha, bool flip_y, std::vector<u32> texture_data,
-                                          u32 texture_data_stride, GPUTexture::Format texture_format,
-                                          std::string osd_key);
-
-GPU::GPU()
-{
-  GPU_SW_Rasterizer::SelectImplementation();
-  ResetStatistics();
-}
+GPU::GPU() = default;
 
 GPU::~GPU()
 {
@@ -92,11 +84,9 @@ GPU::~GPU()
   s_frame_done_event.Deactivate();
 
   StopRecordingGPUDump();
-  DestroyDeinterlaceTextures();
-  g_gpu_device->RecycleTexture(std::move(m_chroma_smoothing_texture));
 }
 
-bool GPU::Initialize(Error* error)
+void GPU::Initialize()
 {
   if (!System::IsReplayingGPUDump())
     s_crtc_tick_event.Activate();
@@ -108,21 +98,14 @@ bool GPU::Initialize(Error* error)
   m_console_is_pal = System::IsPALRegion();
   UpdateCRTCConfig();
 
-  if (!CompileDisplayPipelines(true, true, g_settings.display_24bit_chroma_smoothing, error))
-    return false;
-
 #ifdef PSX_GPU_STATS
   s_active_gpu_cycles = 0;
   s_active_gpu_cycles_frames = 0;
 #endif
-
-  return true;
 }
 
 void GPU::UpdateSettings(const Settings& old_settings)
 {
-  FlushRender();
-
   m_force_progressive_scan = (g_settings.display_deinterlacing_mode == DisplayDeinterlacingMode::Progressive);
   m_fifo_size = g_settings.gpu_fifo_size;
   m_max_run_ahead = g_settings.gpu_max_run_ahead;
@@ -138,23 +121,6 @@ void GPU::UpdateSettings(const Settings& old_settings)
     // Crop mode calls this, so recalculate the display area
     UpdateCRTCDisplayParameters();
   }
-
-  if (g_settings.display_scaling != old_settings.display_scaling ||
-      g_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode ||
-      g_settings.display_24bit_chroma_smoothing != old_settings.display_24bit_chroma_smoothing)
-  {
-    // Toss buffers on mode change.
-    if (g_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode)
-      DestroyDeinterlaceTextures();
-
-    if (!CompileDisplayPipelines(
-          g_settings.display_scaling != old_settings.display_scaling,
-          g_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode,
-          g_settings.display_24bit_chroma_smoothing != old_settings.display_24bit_chroma_smoothing, nullptr))
-    {
-      Panic("Failed to compile display pipeline on settings change.");
-    }
-  }
 }
 
 void GPU::CPUClockChanged()
@@ -162,20 +128,6 @@ void GPU::CPUClockChanged()
   UpdateCRTCConfig();
 }
 
-u32 GPU::GetResolutionScale() const
-{
-  return 1u;
-}
-
-void GPU::UpdateResolutionScale()
-{
-}
-
-std::tuple<u32, u32> GPU::GetFullDisplayResolution() const
-{
-  return std::tie(m_crtc_state.display_width, m_crtc_state.display_height);
-}
-
 void GPU::Reset(bool clear_vram)
 {
   m_GPUSTAT.bits = 0x14802000;
@@ -190,12 +142,6 @@ void GPU::Reset(bool clear_vram)
   m_crtc_state.interlaced_field = 0;
   m_crtc_state.interlaced_display_field = 0;
 
-  if (clear_vram)
-  {
-    std::memset(g_vram, 0, sizeof(g_vram));
-    std::memset(g_gpu_clut, 0, sizeof(g_gpu_clut));
-  }
-
   // Cancel VRAM writes.
   m_blitter_state = BlitterState::Idle;
 
@@ -204,12 +150,14 @@ void GPU::Reset(bool clear_vram)
   s_command_tick_event.Deactivate();
 
   SoftReset();
-  UpdateDisplay();
+
+  // Can skip the VRAM clear if it's not a hardware reset.
+  if (clear_vram)
+    GPUBackend::PushCommand(GPUBackend::NewClearVRAMCommand());
 }
 
 void GPU::SoftReset()
 {
-  FlushRender();
   if (m_blitter_state == BlitterState::WritingVRAM)
     FinishVRAMWrite();
 
@@ -255,14 +203,17 @@ void GPU::SoftReset()
   UpdateGPUIdle();
 }
 
-bool GPU::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
+bool GPU::DoState(StateWrapper& sw, bool update_display)
 {
-  FlushRender();
-
   if (sw.IsReading())
   {
     // perform a reset to discard all pending draws/fb state
-    Reset(host_texture == nullptr);
+    Reset(false);
+  }
+  else
+  {
+    // Need to ensure our copy of VRAM is good.
+    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
   }
 
   sw.Do(&m_GPUSTAT.bits);
@@ -333,16 +284,20 @@ bool GPU::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_displ
   sw.Do(&m_command_total_words);
   sw.Do(&m_GPUREAD_latch);
 
+  u16 load_clut_data[GPU_CLUT_SIZE];
   if (sw.GetVersion() < 64) [[unlikely]]
   {
     // Clear CLUT cache and let it populate later.
     InvalidateCLUT();
+    std::memset(load_clut_data, 0, sizeof(load_clut_data));
   }
   else
   {
     sw.Do(&m_current_clut_reg_bits);
     sw.Do(&m_current_clut_is_8bit);
-    sw.DoArray(g_gpu_clut, std::size(g_gpu_clut));
+
+    // Unfortunate extra copy: the CLUT is stored mid-state, so loads are staged here and copied to the command later.
+    sw.DoArray(sw.IsReading() ? load_clut_data : g_gpu_clut, std::size(g_gpu_clut));
   }
 
   sw.Do(&m_vram_transfer.x);
@@ -362,36 +317,52 @@ bool GPU::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_displ
 
   if (sw.IsReading())
   {
-    m_draw_mode.texture_page_changed = true;
+    if (!sw.DoMarker("GPU-VRAM"))
+      return false;
+
+    // Need to calculate the texture cache (TC) data size. But skip over VRAM first, we'll grab it later.
+    const size_t vram_start_pos = sw.GetPosition();
+    sw.SkipBytes(VRAM_SIZE);
+    u32 tc_data_size;
+    if (!GPUTextureCache::GetStateSize(sw, &tc_data_size)) [[unlikely]]
+      return false;
+
+    // Now we can actually allocate FIFO storage, and push it to the GPU thread.
+    GPUBackendLoadStateCommand* cmd = static_cast<GPUBackendLoadStateCommand*>(
+      GPUThread::AllocateCommand(GPUBackendCommandType::LoadState, sizeof(GPUBackendLoadStateCommand) + tc_data_size));
+    cmd->drawing_area = m_drawing_area;
+    std::memcpy(cmd->clut_data, load_clut_data, sizeof(cmd->clut_data));
+    std::memcpy(cmd->vram_data, sw.GetData() + vram_start_pos, VRAM_SIZE);
+    cmd->texture_cache_state_version = sw.GetVersion();
+    cmd->texture_cache_state_size = tc_data_size;
+    if (tc_data_size > 0)
+      std::memcpy(cmd->texture_cache_state, sw.GetData() + vram_start_pos + VRAM_SIZE, tc_data_size);
+    GPUThread::PushCommand(cmd);
+
     m_drawing_area_changed = true;
     SetClampedDrawingArea();
     UpdateDMARequest();
-  }
+    UpdateCRTCConfig();
+    if (update_display)
+      UpdateDisplay(false);
 
-  if (!host_texture)
+    UpdateCommandTickEvent();
+  }
+  else // writing (saving) state
   {
     if (!sw.DoMarker("GPU-VRAM"))
       return false;
 
-    sw.DoBytes(g_vram, VRAM_WIDTH * VRAM_HEIGHT * sizeof(u16));
-  }
+    // write vram
+    sw.DoBytes(g_vram, VRAM_SIZE);
 
-  if (sw.IsReading())
-  {
-    UpdateCRTCConfig();
-    if (update_display)
-      UpdateDisplay();
-
-    UpdateCommandTickEvent();
+    // Write texture cache data. We have to be super careful here, since we're reading GPU thread state...
+    GPUTextureCache::SaveState(sw);
   }
 
   return !sw.HasError();
 }
 
-void GPU::RestoreDeviceContext()
-{
-}
-
 void GPU::UpdateDMARequest()
 {
   switch (m_blitter_state)
@@ -874,8 +845,11 @@ void GPU::UpdateCRTCDisplayParameters()
       << height_shift;
   }
 
-  if (cs.display_vram_width != old_vram_width || cs.display_vram_height != old_vram_height)
-    UpdateResolutionScale();
+  if ((cs.display_vram_width != old_vram_width || cs.display_vram_height != old_vram_height) &&
+      g_settings.gpu_resolution_scale == 0)
+  {
+    GPUBackend::PushCommand(GPUBackend::NewUpdateResolutionScaleCommand());
+  }
 }
 
 TickCount GPU::GetPendingCRTCTicks() const
@@ -1061,9 +1035,8 @@ void GPU::CRTCTickEvent(TickCount ticks)
 
         // flush any pending draws and "scan out" the image
         // TODO: move present in here I guess
-        FlushRender();
-        UpdateDisplay();
         System::IncrementFrameNumber();
+        UpdateDisplay(true);
         frame_done = true;
 
         // switch fields early. this is needed so we draw to the correct one.
@@ -1175,16 +1148,21 @@ void GPU::UpdateCommandTickEvent()
 void GPU::ConvertScreenCoordinatesToDisplayCoordinates(float window_x, float window_y, float* display_x,
                                                        float* display_y) const
 {
-  if (!g_gpu_device->HasMainSwapChain()) [[unlikely]]
+  const WindowInfo& wi = GPUThread::GetRenderWindowInfo();
+  if (wi.IsSurfaceless())
   {
-    *display_x = 0.0f;
-    *display_y = 0.0f;
+    *display_x = *display_y = -1.0f;
     return;
   }
 
   GSVector4i display_rc, draw_rc;
-  CalculateDrawRect(g_gpu_device->GetMainSwapChain()->GetWidth(), g_gpu_device->GetMainSwapChain()->GetHeight(), true,
-                    true, &display_rc, &draw_rc);
+  CalculateDrawRect(wi.surface_width, wi.surface_height, m_crtc_state.display_width, m_crtc_state.display_height,
+                    m_crtc_state.display_origin_left, m_crtc_state.display_origin_top, m_crtc_state.display_vram_width,
+                    m_crtc_state.display_vram_height, g_settings.display_rotation, ComputeDisplayAspectRatio(),
+                    g_settings.display_stretch_vertically,
+                    (g_settings.display_scaling == DisplayScalingMode::NearestInteger ||
+                     g_settings.display_scaling == DisplayScalingMode::BilinearInteger),
+                    &display_rc, &draw_rc);
 
   // convert coordinates to active display region, then to full display region
   const float scaled_display_x =
@@ -1199,7 +1177,7 @@ void GPU::ConvertScreenCoordinatesToDisplayCoordinates(float window_x, float win
   // TODO: apply rotation matrix
 
   DEV_LOG("win {:.0f},{:.0f} -> local {:.0f},{:.0f}, disp {:.2f},{:.2f} (size {},{} frac {},{})", window_x, window_y,
-          window_x - draw_rc.left, window_y - draw_rc.top, *display_x, *display_y, m_crtc_state.display_width,
+          window_x - display_rc.left, window_y - display_rc.top, *display_x, *display_y, m_crtc_state.display_width,
           m_crtc_state.display_height, *display_x / static_cast<float>(m_crtc_state.display_width),
           *display_y / static_cast<float>(m_crtc_state.display_height));
 }
@@ -1378,7 +1356,7 @@ void GPU::WriteGP1(u32 value)
         SynchronizeCRTC();
         m_crtc_state.regs.display_address_start = new_value;
         UpdateCRTCDisplayParameters();
-        OnBufferSwapped();
+        GPUBackend::PushCommand(GPUBackend::NewBufferSwappedCommand());
       }
     }
     break;
@@ -1533,9 +1511,14 @@ void GPU::UpdateCLUTIfNeeded(GPUTextureMode texmode, GPUTexturePaletteReg clut)
   {
     DEBUG_LOG("Reloading CLUT from {},{}, {}", clut.GetXBase(), clut.GetYBase(), needs_8bit ? "8-bit" : "4-bit");
     AddCommandTicks(needs_8bit ? 256 : 16);
-    UpdateCLUT(clut, needs_8bit);
     m_current_clut_reg_bits = clut.bits;
     m_current_clut_is_8bit = needs_8bit;
+
+    GPUBackendUpdateCLUTCommand* cmd = GPUBackend::NewUpdateCLUTCommand();
+    FillBackendCommandParameters(cmd);
+    cmd->reg.bits = clut.bits;
+    cmd->clut_is_8bit = needs_8bit;
+    GPUBackend::PushCommand(cmd);
   }
 }
 
@@ -1550,27 +1533,21 @@ bool GPU::IsCLUTValid() const
   return (m_current_clut_reg_bits != std::numeric_limits<decltype(m_current_clut_reg_bits)>::max());
 }
 
-void GPU::ClearDisplay()
-{
-  ClearDisplayTexture();
-
-  // Just recycle the textures, it'll get re-fetched.
-  DestroyDeinterlaceTextures();
-}
-
 void GPU::SetClampedDrawingArea()
 {
-  if (m_drawing_area.left > m_drawing_area.right || m_drawing_area.top > m_drawing_area.bottom) [[unlikely]]
-  {
-    m_clamped_drawing_area = GSVector4i::zero();
-    return;
-  }
+  m_clamped_drawing_area = GetClampedDrawingArea(m_drawing_area);
+}
 
-  const u32 right = std::min(m_drawing_area.right + 1, static_cast<u32>(VRAM_WIDTH));
-  const u32 left = std::min(m_drawing_area.left, std::min(m_drawing_area.right, VRAM_WIDTH - 1));
-  const u32 bottom = std::min(m_drawing_area.bottom + 1, static_cast<u32>(VRAM_HEIGHT));
-  const u32 top = std::min(m_drawing_area.top, std::min(m_drawing_area.bottom, VRAM_HEIGHT - 1));
-  m_clamped_drawing_area = GSVector4i(left, top, right, bottom);
+GSVector4i GPU::GetClampedDrawingArea(const GPUDrawingArea& drawing_area)
+{
+  if (drawing_area.left > drawing_area.right || drawing_area.top > drawing_area.bottom) [[unlikely]]
+    return GSVector4i::zero();
+
+  const u32 right = std::min(drawing_area.right + 1, static_cast<u32>(VRAM_WIDTH));
+  const u32 left = std::min(drawing_area.left, std::min(drawing_area.right, VRAM_WIDTH - 1));
+  const u32 bottom = std::min(drawing_area.bottom + 1, static_cast<u32>(VRAM_HEIGHT));
+  const u32 top = std::min(drawing_area.top, std::min(drawing_area.bottom, VRAM_HEIGHT - 1));
+  return GSVector4i(left, top, right, bottom);
 }
 
 void GPU::SetDrawMode(u16 value)
@@ -1579,16 +1556,8 @@ void GPU::SetDrawMode(u16 value)
   if (!m_set_texture_disable_mask)
     new_mode_reg.texture_disable = false;
 
-  if (new_mode_reg.bits == m_draw_mode.mode_reg.bits)
-    return;
-
-  m_draw_mode.texture_page_changed |= ((new_mode_reg.bits & GPUDrawModeReg::TEXTURE_MODE_AND_PAGE_MASK) !=
-                                       (m_draw_mode.mode_reg.bits & GPUDrawModeReg::TEXTURE_MODE_AND_PAGE_MASK));
   m_draw_mode.mode_reg.bits = new_mode_reg.bits;
 
-  if (m_GPUSTAT.draw_to_displayed_field != new_mode_reg.draw_to_displayed_field)
-    FlushRender();
-
   // Bits 0..10 are returned in the GPU status register.
   m_GPUSTAT.bits = (m_GPUSTAT.bits & ~(GPUDrawModeReg::GPUSTAT_MASK)) |
                    (ZeroExtend32(new_mode_reg.bits) & GPUDrawModeReg::GPUSTAT_MASK);
@@ -1598,11 +1567,7 @@ void GPU::SetDrawMode(u16 value)
 void GPU::SetTexturePalette(u16 value)
 {
   value &= DrawMode::PALETTE_MASK;
-  if (m_draw_mode.palette_reg.bits == value)
-    return;
-
   m_draw_mode.palette_reg.bits = value;
-  m_draw_mode.texture_page_changed = true;
 }
 
 void GPU::SetTextureWindow(u32 value)
@@ -1624,708 +1589,21 @@ void GPU::SetTextureWindow(u32 value)
   m_draw_mode.texture_window_value = value;
 }
 
-void GPU::ReadCLUT(u16* dest, GPUTexturePaletteReg reg, bool clut_is_8bit)
+void GPU::CalculateDrawRect(u32 window_width, u32 window_height, u32 crtc_display_width, u32 crtc_display_height,
+                            s32 display_origin_left, s32 display_origin_top, u32 display_vram_width,
+                            u32 display_vram_height, DisplayRotation rotation, float aspect_ratio,
+                            bool stretch_vertically, bool integer_scale, GSVector4i* display_rect,
+                            GSVector4i* draw_rect)
 {
-  const u16* src_row = &g_vram[reg.GetYBase() * VRAM_WIDTH];
-  const u32 start_x = reg.GetXBase();
-  if (!clut_is_8bit)
-  {
-    // Wraparound can't happen in 4-bit mode.
-    std::memcpy(dest, &src_row[start_x], sizeof(u16) * 16);
-  }
-  else
-  {
-    if ((start_x + 256) > VRAM_WIDTH) [[unlikely]]
-    {
-      const u32 end = VRAM_WIDTH - start_x;
-      const u32 start = 256 - end;
-      std::memcpy(dest, &src_row[start_x], sizeof(u16) * end);
-      std::memcpy(dest + end, src_row, sizeof(u16) * start);
-    }
-    else
-    {
-      std::memcpy(dest, &src_row[start_x], sizeof(u16) * 256);
-    }
-  }
-}
-
-bool GPU::CompileDisplayPipelines(bool display, bool deinterlace, bool chroma_smoothing, Error* error)
-{
-  GPUShaderGen shadergen(g_gpu_device->GetRenderAPI(), g_gpu_device->GetFeatures().dual_source_blend,
-                         g_gpu_device->GetFeatures().framebuffer_fetch);
-
-  GPUPipeline::GraphicsConfig plconfig;
-  plconfig.input_layout.vertex_stride = 0;
-  plconfig.primitive = GPUPipeline::Primitive::Triangles;
-  plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
-  plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
-  plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
-  plconfig.geometry_shader = nullptr;
-  plconfig.depth_format = GPUTexture::Format::Unknown;
-  plconfig.samples = 1;
-  plconfig.per_sample_shading = false;
-  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
-
-  if (display)
-  {
-    plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
-    plconfig.SetTargetFormats(g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetFormat() :
-                                                                 GPUTexture::Format::RGBA8);
-
-    std::string vs = shadergen.GenerateDisplayVertexShader();
-    std::string fs;
-    switch (g_settings.display_scaling)
-    {
-      case DisplayScalingMode::BilinearSharp:
-        fs = shadergen.GenerateDisplaySharpBilinearFragmentShader();
-        break;
-
-      case DisplayScalingMode::BilinearSmooth:
-      case DisplayScalingMode::BilinearInteger:
-        fs = shadergen.GenerateDisplayFragmentShader(true, false);
-        break;
-
-      case DisplayScalingMode::Nearest:
-      case DisplayScalingMode::NearestInteger:
-      default:
-        fs = shadergen.GenerateDisplayFragmentShader(false, true);
-        break;
-    }
-
-    std::unique_ptr<GPUShader> vso =
-      g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs, error);
-    std::unique_ptr<GPUShader> fso =
-      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(), fs, error);
-    if (!vso || !fso)
-      return false;
-    GL_OBJECT_NAME(vso, "Display Vertex Shader");
-    GL_OBJECT_NAME_FMT(fso, "Display Fragment Shader [{}]",
-                       Settings::GetDisplayScalingName(g_settings.display_scaling));
-    plconfig.vertex_shader = vso.get();
-    plconfig.fragment_shader = fso.get();
-    if (!(m_display_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-      return false;
-    GL_OBJECT_NAME_FMT(m_display_pipeline, "Display Pipeline [{}]",
-                       Settings::GetDisplayScalingName(g_settings.display_scaling));
-  }
-
-  if (deinterlace)
-  {
-    plconfig.SetTargetFormats(GPUTexture::Format::RGBA8);
-
-    std::unique_ptr<GPUShader> vso = g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(),
-                                                                shadergen.GenerateScreenQuadVertexShader(), error);
-    if (!vso)
-      return false;
-    GL_OBJECT_NAME(vso, "Deinterlace Vertex Shader");
-
-    std::unique_ptr<GPUShader> fso;
-    if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
-                                           shadergen.GenerateInterleavedFieldExtractFragmentShader(), error)))
-    {
-      return false;
-    }
-
-    GL_OBJECT_NAME(fso, "Deinterlace Field Extract Fragment Shader");
-
-    plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
-    plconfig.vertex_shader = vso.get();
-    plconfig.fragment_shader = fso.get();
-    if (!(m_deinterlace_extract_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-      return false;
-
-    GL_OBJECT_NAME(m_deinterlace_extract_pipeline, "Deinterlace Field Extract Pipeline");
-
-    switch (g_settings.display_deinterlacing_mode)
-    {
-      case DisplayDeinterlacingMode::Disabled:
-      case DisplayDeinterlacingMode::Progressive:
-        break;
-
-      case DisplayDeinterlacingMode::Weave:
-      {
-        if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
-                                               shadergen.GenerateDeinterlaceWeaveFragmentShader(), error)))
-        {
-          return false;
-        }
-
-        GL_OBJECT_NAME(fso, "Weave Deinterlace Fragment Shader");
-
-        plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
-        plconfig.vertex_shader = vso.get();
-        plconfig.fragment_shader = fso.get();
-        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-          return false;
-
-        GL_OBJECT_NAME(m_deinterlace_pipeline, "Weave Deinterlace Pipeline");
-      }
-      break;
-
-      case DisplayDeinterlacingMode::Blend:
-      {
-        if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
-                                               shadergen.GenerateDeinterlaceBlendFragmentShader(), error)))
-        {
-          return false;
-        }
-
-        GL_OBJECT_NAME(fso, "Blend Deinterlace Fragment Shader");
-
-        plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
-        plconfig.vertex_shader = vso.get();
-        plconfig.fragment_shader = fso.get();
-        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-          return false;
-
-        GL_OBJECT_NAME(m_deinterlace_pipeline, "Blend Deinterlace Pipeline");
-      }
-      break;
-
-      case DisplayDeinterlacingMode::Adaptive:
-      {
-        fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
-                                         shadergen.GenerateFastMADReconstructFragmentShader(), error);
-        if (!fso)
-          return false;
-
-        GL_OBJECT_NAME(fso, "FastMAD Reconstruct Fragment Shader");
-
-        plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
-        plconfig.fragment_shader = fso.get();
-        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-          return false;
-
-        GL_OBJECT_NAME(m_deinterlace_pipeline, "FastMAD Reconstruct Pipeline");
-      }
-      break;
-
-      default:
-        UnreachableCode();
-    }
-  }
-
-  if (chroma_smoothing)
-  {
-    m_chroma_smoothing_pipeline.reset();
-    g_gpu_device->RecycleTexture(std::move(m_chroma_smoothing_texture));
-
-    if (g_settings.display_24bit_chroma_smoothing)
-    {
-      plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
-      plconfig.SetTargetFormats(GPUTexture::Format::RGBA8);
-
-      std::unique_ptr<GPUShader> vso = g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(),
-                                                                  shadergen.GenerateScreenQuadVertexShader(), error);
-      std::unique_ptr<GPUShader> fso = g_gpu_device->CreateShader(
-        GPUShaderStage::Fragment, shadergen.GetLanguage(), shadergen.GenerateChromaSmoothingFragmentShader(), error);
-      if (!vso || !fso)
-        return false;
-      GL_OBJECT_NAME(vso, "Chroma Smoothing Vertex Shader");
-      GL_OBJECT_NAME(fso, "Chroma Smoothing Fragment Shader");
-
-      plconfig.vertex_shader = vso.get();
-      plconfig.fragment_shader = fso.get();
-      if (!(m_chroma_smoothing_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
-        return false;
-      GL_OBJECT_NAME(m_chroma_smoothing_pipeline, "Chroma Smoothing Pipeline");
-    }
-  }
-
-  return true;
-}
-
-void GPU::ClearDisplayTexture()
-{
-  m_display_texture = nullptr;
-  m_display_texture_view_x = 0;
-  m_display_texture_view_y = 0;
-  m_display_texture_view_width = 0;
-  m_display_texture_view_height = 0;
-}
-
-void GPU::SetDisplayTexture(GPUTexture* texture, GPUTexture* depth_buffer, s32 view_x, s32 view_y, s32 view_width,
-                            s32 view_height)
-{
-  DebugAssert(texture);
-
-  if (g_settings.display_auto_resize_window &&
-      (view_width != m_display_texture_view_width || view_height != m_display_texture_view_height))
-  {
-    System::RequestDisplaySize();
-  }
-
-  m_display_texture = texture;
-  m_display_depth_buffer = depth_buffer;
-  m_display_texture_view_x = view_x;
-  m_display_texture_view_y = view_y;
-  m_display_texture_view_width = view_width;
-  m_display_texture_view_height = view_height;
-}
-
-GPUDevice::PresentResult GPU::PresentDisplay()
-{
-  FlushRender();
-
-  if (!g_gpu_device->HasMainSwapChain())
-    return GPUDevice::PresentResult::SkipPresent;
-
-  GSVector4i display_rect;
-  GSVector4i draw_rect;
-  CalculateDrawRect(g_gpu_device->GetMainSwapChain()->GetWidth(), g_gpu_device->GetMainSwapChain()->GetHeight(),
-                    !g_settings.debugging.show_vram, true, &display_rect, &draw_rect);
-  return RenderDisplay(nullptr, display_rect, draw_rect, !g_settings.debugging.show_vram);
-}
-
-GPUDevice::PresentResult GPU::RenderDisplay(GPUTexture* target, const GSVector4i display_rect,
-                                            const GSVector4i draw_rect, bool postfx)
-{
-  GL_SCOPE_FMT("RenderDisplay: {}", draw_rect);
-
-  if (m_display_texture)
-    m_display_texture->MakeReadyForSampling();
-
-  // Internal post-processing.
-  GPUTexture* display_texture = m_display_texture;
-  s32 display_texture_view_x = m_display_texture_view_x;
-  s32 display_texture_view_y = m_display_texture_view_y;
-  s32 display_texture_view_width = m_display_texture_view_width;
-  s32 display_texture_view_height = m_display_texture_view_height;
-  if (postfx && display_texture && PostProcessing::InternalChain.IsActive() &&
-      PostProcessing::InternalChain.CheckTargets(DISPLAY_INTERNAL_POSTFX_FORMAT, display_texture_view_width,
-                                                 display_texture_view_height))
-  {
-    DebugAssert(display_texture_view_x == 0 && display_texture_view_y == 0 &&
-                static_cast<s32>(display_texture->GetWidth()) == display_texture_view_width &&
-                static_cast<s32>(display_texture->GetHeight()) == display_texture_view_height);
-
-    // Now we can apply the post chain.
-    GPUTexture* post_output_texture = PostProcessing::InternalChain.GetOutputTexture();
-    if (const GPUDevice::PresentResult pres = PostProcessing::InternalChain.Apply(
-          display_texture, m_display_depth_buffer, post_output_texture,
-          GSVector4i(0, 0, display_texture_view_width, display_texture_view_height), display_texture_view_width,
-          display_texture_view_height, m_crtc_state.display_width, m_crtc_state.display_height);
-        pres != GPUDevice::PresentResult::OK)
-    {
-      return pres;
-    }
-    else
-    {
-      display_texture_view_x = 0;
-      display_texture_view_y = 0;
-      display_texture = post_output_texture;
-      display_texture->MakeReadyForSampling();
-    }
-  }
-
-  const GPUTexture::Format hdformat = target ? target->GetFormat() : g_gpu_device->GetMainSwapChain()->GetFormat();
-  const u32 target_width = target ? target->GetWidth() : g_gpu_device->GetMainSwapChain()->GetWidth();
-  const u32 target_height = target ? target->GetHeight() : g_gpu_device->GetMainSwapChain()->GetHeight();
-  const bool really_postfx = (postfx && PostProcessing::DisplayChain.IsActive() && g_gpu_device->HasMainSwapChain() &&
-                              hdformat != GPUTexture::Format::Unknown && target_width > 0 && target_height > 0 &&
-                              PostProcessing::DisplayChain.CheckTargets(hdformat, target_width, target_height));
-  const GSVector4i real_draw_rect =
-    g_gpu_device->UsesLowerLeftOrigin() ? GPUDevice::FlipToLowerLeft(draw_rect, target_height) : draw_rect;
-  if (really_postfx)
-  {
-    g_gpu_device->ClearRenderTarget(PostProcessing::DisplayChain.GetInputTexture(), GPUDevice::DEFAULT_CLEAR_COLOR);
-    g_gpu_device->SetRenderTarget(PostProcessing::DisplayChain.GetInputTexture());
-  }
-  else
-  {
-    if (target)
-    {
-      g_gpu_device->SetRenderTarget(target);
-    }
-    else
-    {
-      const GPUDevice::PresentResult pres = g_gpu_device->BeginPresent(g_gpu_device->GetMainSwapChain());
-      if (pres != GPUDevice::PresentResult::OK)
-        return pres;
-    }
-  }
-
-  if (display_texture)
-  {
-    bool texture_filter_linear = false;
-
-    struct Uniforms
-    {
-      float src_rect[4];
-      float src_size[4];
-      float clamp_rect[4];
-      float params[4];
-      float rotation_matrix[2][2];
-    } uniforms;
-    std::memset(uniforms.params, 0, sizeof(uniforms.params));
-
-    switch (g_settings.display_scaling)
-    {
-      case DisplayScalingMode::Nearest:
-      case DisplayScalingMode::NearestInteger:
-        break;
-
-      case DisplayScalingMode::BilinearSmooth:
-      case DisplayScalingMode::BilinearInteger:
-        texture_filter_linear = true;
-        break;
-
-      case DisplayScalingMode::BilinearSharp:
-      {
-        texture_filter_linear = true;
-        uniforms.params[0] = std::max(
-          std::floor(static_cast<float>(draw_rect.width()) / static_cast<float>(m_display_texture_view_width)), 1.0f);
-        uniforms.params[1] = std::max(
-          std::floor(static_cast<float>(draw_rect.height()) / static_cast<float>(m_display_texture_view_height)), 1.0f);
-        uniforms.params[2] = 0.5f - 0.5f / uniforms.params[0];
-        uniforms.params[3] = 0.5f - 0.5f / uniforms.params[1];
-      }
-      break;
-
-      default:
-        UnreachableCode();
-        break;
-    }
-
-    g_gpu_device->SetPipeline(m_display_pipeline.get());
-    g_gpu_device->SetTextureSampler(
-      0, display_texture, texture_filter_linear ? g_gpu_device->GetLinearSampler() : g_gpu_device->GetNearestSampler());
-
-    // For bilinear, clamp to 0.5/SIZE-0.5 to avoid bleeding from the adjacent texels in VRAM. This is because
-    // 1.0 in UV space is not the bottom-right texel, but a mix of the bottom-right and wrapped/next texel.
-    const float rcp_width = 1.0f / static_cast<float>(display_texture->GetWidth());
-    const float rcp_height = 1.0f / static_cast<float>(display_texture->GetHeight());
-    uniforms.src_rect[0] = static_cast<float>(display_texture_view_x) * rcp_width;
-    uniforms.src_rect[1] = static_cast<float>(display_texture_view_y) * rcp_height;
-    uniforms.src_rect[2] = static_cast<float>(display_texture_view_width) * rcp_width;
-    uniforms.src_rect[3] = static_cast<float>(display_texture_view_height) * rcp_height;
-    uniforms.clamp_rect[0] = (static_cast<float>(display_texture_view_x) + 0.5f) * rcp_width;
-    uniforms.clamp_rect[1] = (static_cast<float>(display_texture_view_y) + 0.5f) * rcp_height;
-    uniforms.clamp_rect[2] =
-      (static_cast<float>(display_texture_view_x + display_texture_view_width) - 0.5f) * rcp_width;
-    uniforms.clamp_rect[3] =
-      (static_cast<float>(display_texture_view_y + display_texture_view_height) - 0.5f) * rcp_height;
-    uniforms.src_size[0] = static_cast<float>(display_texture->GetWidth());
-    uniforms.src_size[1] = static_cast<float>(display_texture->GetHeight());
-    uniforms.src_size[2] = rcp_width;
-    uniforms.src_size[3] = rcp_height;
-
-    if (g_settings.display_rotation != DisplayRotation::Normal)
-    {
-      static constexpr const std::array<float, static_cast<size_t>(DisplayRotation::Count) - 1> rotation_radians = {{
-        static_cast<float>(std::numbers::pi * 1.5f), // Rotate90
-        static_cast<float>(std::numbers::pi),        // Rotate180
-        static_cast<float>(std::numbers::pi / 2.0),  // Rotate270
-      }};
-
-      GSMatrix2x2::Rotation(rotation_radians[static_cast<size_t>(g_settings.display_rotation) - 1])
-        .store(uniforms.rotation_matrix);
-    }
-    else
-    {
-      GSMatrix2x2::Identity().store(uniforms.rotation_matrix);
-    }
-
-    g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
-
-    g_gpu_device->SetViewportAndScissor(real_draw_rect);
-    g_gpu_device->Draw(3, 0);
-  }
-
-  if (really_postfx)
-  {
-    DebugAssert(!g_settings.debugging.show_vram);
-
-    // "original size" in postfx includes padding.
-    const float upscale_x = m_display_texture ? static_cast<float>(m_display_texture_view_width) /
-                                                  static_cast<float>(m_crtc_state.display_vram_width) :
-                                                1.0f;
-    const float upscale_y = m_display_texture ? static_cast<float>(m_display_texture_view_height) /
-                                                  static_cast<float>(m_crtc_state.display_vram_height) :
-                                                1.0f;
-    const s32 orig_width = static_cast<s32>(std::ceil(static_cast<float>(m_crtc_state.display_width) * upscale_x));
-    const s32 orig_height = static_cast<s32>(std::ceil(static_cast<float>(m_crtc_state.display_height) * upscale_y));
-
-    return PostProcessing::DisplayChain.Apply(PostProcessing::DisplayChain.GetInputTexture(), nullptr, target,
-                                              display_rect, orig_width, orig_height, m_crtc_state.display_width,
-                                              m_crtc_state.display_height);
-  }
-  else
-  {
-    return GPUDevice::PresentResult::OK;
-  }
-}
-
-bool GPU::SendDisplayToMediaCapture(MediaCapture* cap)
-{
-  GPUTexture* target = cap->GetRenderTexture();
-  if (!target) [[unlikely]]
-    return false;
-
-  const bool apply_aspect_ratio =
-    (g_settings.display_screenshot_mode != DisplayScreenshotMode::UncorrectedInternalResolution);
-  const bool postfx = (g_settings.display_screenshot_mode != DisplayScreenshotMode::InternalResolution);
-  GSVector4i display_rect, draw_rect;
-  CalculateDrawRect(target->GetWidth(), target->GetHeight(), !g_settings.debugging.show_vram, apply_aspect_ratio,
-                    &display_rect, &draw_rect);
-
-  // Not cleared by RenderDisplay().
-  g_gpu_device->ClearRenderTarget(target, GPUDevice::DEFAULT_CLEAR_COLOR);
-
-  if (RenderDisplay(target, display_rect, draw_rect, postfx) != GPUDevice::PresentResult::OK) [[unlikely]]
-    return false;
-
-  return cap->DeliverVideoFrame(target);
-}
-
-void GPU::DestroyDeinterlaceTextures()
-{
-  for (std::unique_ptr<GPUTexture>& tex : m_deinterlace_buffers)
-    g_gpu_device->RecycleTexture(std::move(tex));
-  g_gpu_device->RecycleTexture(std::move(m_deinterlace_texture));
-  m_current_deinterlace_buffer = 0;
-}
-
-bool GPU::Deinterlace(u32 field, u32 line_skip)
-{
-  GPUTexture* src = m_display_texture;
-  const u32 x = m_display_texture_view_x;
-  const u32 y = m_display_texture_view_y;
-  const u32 width = m_display_texture_view_width;
-  const u32 height = m_display_texture_view_height;
-
-  switch (g_settings.display_deinterlacing_mode)
-  {
-    case DisplayDeinterlacingMode::Disabled:
-    {
-      if (line_skip == 0)
-        return true;
-
-      // Still have to extract the field.
-      if (!DeinterlaceExtractField(0, src, x, y, width, height, line_skip)) [[unlikely]]
-        return false;
-
-      SetDisplayTexture(m_deinterlace_buffers[0].get(), m_display_depth_buffer, 0, 0, width, height);
-      return true;
-    }
-
-    case DisplayDeinterlacingMode::Weave:
-    {
-      GL_SCOPE_FMT("DeinterlaceWeave({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field, line_skip);
-
-      const u32 full_height = height * 2;
-      if (!DeinterlaceSetTargetSize(width, full_height, true)) [[unlikely]]
-      {
-        ClearDisplayTexture();
-        return false;
-      }
-
-      src->MakeReadyForSampling();
-
-      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
-      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
-      g_gpu_device->SetTextureSampler(0, src, g_gpu_device->GetNearestSampler());
-      const u32 uniforms[] = {x, y, field, line_skip};
-      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
-      g_gpu_device->SetViewportAndScissor(0, 0, width, full_height);
-      g_gpu_device->Draw(3, 0);
-
-      m_deinterlace_texture->MakeReadyForSampling();
-      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, full_height);
-      return true;
-    }
-
-    case DisplayDeinterlacingMode::Blend:
-    {
-      constexpr u32 NUM_BLEND_BUFFERS = 2;
-
-      GL_SCOPE_FMT("DeinterlaceBlend({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field, line_skip);
-
-      const u32 this_buffer = m_current_deinterlace_buffer;
-      m_current_deinterlace_buffer = (m_current_deinterlace_buffer + 1u) % NUM_BLEND_BUFFERS;
-      GL_INS_FMT("Current buffer: {}", this_buffer);
-      if (!DeinterlaceExtractField(this_buffer, src, x, y, width, height, line_skip) ||
-          !DeinterlaceSetTargetSize(width, height, false)) [[unlikely]]
-      {
-        ClearDisplayTexture();
-        return false;
-      }
-
-      // TODO: could be implemented with alpha blending instead..
-
-      g_gpu_device->InvalidateRenderTarget(m_deinterlace_texture.get());
-      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
-      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
-      g_gpu_device->SetTextureSampler(0, m_deinterlace_buffers[this_buffer].get(), g_gpu_device->GetNearestSampler());
-      g_gpu_device->SetTextureSampler(1, m_deinterlace_buffers[(this_buffer - 1) % NUM_BLEND_BUFFERS].get(),
-                                      g_gpu_device->GetNearestSampler());
-      g_gpu_device->SetViewportAndScissor(0, 0, width, height);
-      g_gpu_device->Draw(3, 0);
-
-      m_deinterlace_texture->MakeReadyForSampling();
-      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, height);
-      return true;
-    }
-
-    case DisplayDeinterlacingMode::Adaptive:
-    {
-      GL_SCOPE_FMT("DeinterlaceAdaptive({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field,
-                   line_skip);
-
-      const u32 full_height = height * 2;
-      const u32 this_buffer = m_current_deinterlace_buffer;
-      m_current_deinterlace_buffer = (m_current_deinterlace_buffer + 1u) % DEINTERLACE_BUFFER_COUNT;
-      GL_INS_FMT("Current buffer: {}", this_buffer);
-      if (!DeinterlaceExtractField(this_buffer, src, x, y, width, height, line_skip) ||
-          !DeinterlaceSetTargetSize(width, full_height, false)) [[unlikely]]
-      {
-        ClearDisplayTexture();
-        return false;
-      }
-
-      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
-      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
-      g_gpu_device->SetTextureSampler(0, m_deinterlace_buffers[this_buffer].get(), g_gpu_device->GetNearestSampler());
-      g_gpu_device->SetTextureSampler(1, m_deinterlace_buffers[(this_buffer - 1) % DEINTERLACE_BUFFER_COUNT].get(),
-                                      g_gpu_device->GetNearestSampler());
-      g_gpu_device->SetTextureSampler(2, m_deinterlace_buffers[(this_buffer - 2) % DEINTERLACE_BUFFER_COUNT].get(),
-                                      g_gpu_device->GetNearestSampler());
-      g_gpu_device->SetTextureSampler(3, m_deinterlace_buffers[(this_buffer - 3) % DEINTERLACE_BUFFER_COUNT].get(),
-                                      g_gpu_device->GetNearestSampler());
-      const u32 uniforms[] = {field, full_height};
-      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
-      g_gpu_device->SetViewportAndScissor(0, 0, width, full_height);
-      g_gpu_device->Draw(3, 0);
-
-      m_deinterlace_texture->MakeReadyForSampling();
-      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, full_height);
-      return true;
-    }
-
-    default:
-      UnreachableCode();
-  }
-}
-
-bool GPU::DeinterlaceExtractField(u32 dst_bufidx, GPUTexture* src, u32 x, u32 y, u32 width, u32 height, u32 line_skip)
-{
-  if (!m_deinterlace_buffers[dst_bufidx] || m_deinterlace_buffers[dst_bufidx]->GetWidth() != width ||
-      m_deinterlace_buffers[dst_bufidx]->GetHeight() != height)
-  {
-    if (!g_gpu_device->ResizeTexture(&m_deinterlace_buffers[dst_bufidx], width, height, GPUTexture::Type::RenderTarget,
-                                     GPUTexture::Format::RGBA8, false)) [[unlikely]]
-    {
-      return false;
-    }
-
-    GL_OBJECT_NAME_FMT(m_deinterlace_buffers[dst_bufidx], "Blend Deinterlace Buffer {}", dst_bufidx);
-  }
-
-  GPUTexture* dst = m_deinterlace_buffers[dst_bufidx].get();
-  g_gpu_device->InvalidateRenderTarget(dst);
-
-  // If we're not skipping lines, then we can simply copy the texture.
-  if (line_skip == 0 && src->GetFormat() == dst->GetFormat())
-  {
-    GL_INS_FMT("DeinterlaceExtractField({{{},{}}} {}x{} line_skip={}) => copy direct", x, y, width, height, line_skip);
-    g_gpu_device->CopyTextureRegion(dst, 0, 0, 0, 0, src, x, y, 0, 0, width, height);
-  }
-  else
-  {
-    GL_SCOPE_FMT("DeinterlaceExtractField({{{},{}}} {}x{} line_skip={}) => shader copy", x, y, width, height,
-                 line_skip);
-
-    // Otherwise, we need to extract every other line from the texture.
-    src->MakeReadyForSampling();
-    g_gpu_device->SetRenderTarget(dst);
-    g_gpu_device->SetPipeline(m_deinterlace_extract_pipeline.get());
-    g_gpu_device->SetTextureSampler(0, src, g_gpu_device->GetNearestSampler());
-    const u32 uniforms[] = {x, y, line_skip};
-    g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
-    g_gpu_device->SetViewportAndScissor(0, 0, width, height);
-    g_gpu_device->Draw(3, 0);
-
-    GL_POP();
-  }
-
-  dst->MakeReadyForSampling();
-  return true;
-}
-
-bool GPU::DeinterlaceSetTargetSize(u32 width, u32 height, bool preserve)
-{
-  if (!m_deinterlace_texture || m_deinterlace_texture->GetWidth() != width ||
-      m_deinterlace_texture->GetHeight() != height)
-  {
-    if (!g_gpu_device->ResizeTexture(&m_deinterlace_texture, width, height, GPUTexture::Type::RenderTarget,
-                                     GPUTexture::Format::RGBA8, preserve)) [[unlikely]]
-    {
-      return false;
-    }
-
-    GL_OBJECT_NAME(m_deinterlace_texture, "Deinterlace target texture");
-  }
-
-  return true;
-}
-
-bool GPU::ApplyChromaSmoothing()
-{
-  const u32 x = m_display_texture_view_x;
-  const u32 y = m_display_texture_view_y;
-  const u32 width = m_display_texture_view_width;
-  const u32 height = m_display_texture_view_height;
-  if (!m_chroma_smoothing_texture || m_chroma_smoothing_texture->GetWidth() != width ||
-      m_chroma_smoothing_texture->GetHeight() != height)
-  {
-    if (!g_gpu_device->ResizeTexture(&m_chroma_smoothing_texture, width, height, GPUTexture::Type::RenderTarget,
-                                     GPUTexture::Format::RGBA8, false))
-    {
-      ClearDisplayTexture();
-      return false;
-    }
-
-    GL_OBJECT_NAME(m_chroma_smoothing_texture, "Chroma smoothing texture");
-  }
-
-  GL_SCOPE_FMT("ApplyChromaSmoothing({{{},{}}}, {}x{})", x, y, width, height);
-
-  m_display_texture->MakeReadyForSampling();
-  g_gpu_device->InvalidateRenderTarget(m_chroma_smoothing_texture.get());
-  g_gpu_device->SetRenderTarget(m_chroma_smoothing_texture.get());
-  g_gpu_device->SetPipeline(m_chroma_smoothing_pipeline.get());
-  g_gpu_device->SetTextureSampler(0, m_display_texture, g_gpu_device->GetNearestSampler());
-  const u32 uniforms[] = {x, y, width - 1, height - 1};
-  g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
-  g_gpu_device->SetViewportAndScissor(0, 0, width, height);
-  g_gpu_device->Draw(3, 0);
-
-  m_chroma_smoothing_texture->MakeReadyForSampling();
-  SetDisplayTexture(m_chroma_smoothing_texture.get(), m_display_depth_buffer, 0, 0, width, height);
-  return true;
-}
-
-void GPU::CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio,
-                            GSVector4i* display_rect, GSVector4i* draw_rect) const
-{
-  const bool integer_scale = (g_settings.display_scaling == DisplayScalingMode::NearestInteger ||
-                              g_settings.display_scaling == DisplayScalingMode::BilinearInteger);
-  const bool show_vram = g_settings.debugging.show_vram;
-  const float display_aspect_ratio = ComputeDisplayAspectRatio();
   const float window_ratio = static_cast<float>(window_width) / static_cast<float>(window_height);
-  const float crtc_display_width = static_cast<float>(show_vram ? VRAM_WIDTH : m_crtc_state.display_width);
-  const float crtc_display_height = static_cast<float>(show_vram ? VRAM_HEIGHT : m_crtc_state.display_height);
-  const float x_scale =
-    apply_aspect_ratio ?
-      (display_aspect_ratio / (static_cast<float>(crtc_display_width) / static_cast<float>(crtc_display_height))) :
-      1.0f;
-  float display_width = crtc_display_width;
-  float display_height = crtc_display_height;
-  float active_left = static_cast<float>(show_vram ? 0 : m_crtc_state.display_origin_left);
-  float active_top = static_cast<float>(show_vram ? 0 : m_crtc_state.display_origin_top);
-  float active_width = static_cast<float>(show_vram ? VRAM_WIDTH : m_crtc_state.display_vram_width);
-  float active_height = static_cast<float>(show_vram ? VRAM_HEIGHT : m_crtc_state.display_vram_height);
-  if (!g_settings.display_stretch_vertically)
+  float display_width = static_cast<float>(crtc_display_width);
+  float display_height = static_cast<float>(crtc_display_height);
+  const float x_scale = (aspect_ratio != 0.0f) ? (aspect_ratio / (display_width / display_height)) : 1.0f;
+  float active_left = static_cast<float>(display_origin_left);
+  float active_top = static_cast<float>(display_origin_top);
+  float active_width = static_cast<float>(display_vram_width);
+  float active_height = static_cast<float>(display_vram_height);
+  if (!stretch_vertically)
   {
     display_width *= x_scale;
     active_left *= x_scale;
@@ -2339,8 +1617,7 @@ void GPU::CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rota
   }
 
   // swap width/height when rotated, the flipping of padding is taken care of in the shader with the rotation matrix
-  if (g_settings.display_rotation == DisplayRotation::Rotate90 ||
-      g_settings.display_rotation == DisplayRotation::Rotate270)
+  if (rotation == DisplayRotation::Rotate90 || rotation == DisplayRotation::Rotate270)
   {
     std::swap(display_width, display_height);
     std::swap(active_width, active_height);
@@ -2421,285 +1698,96 @@ void GPU::CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rota
     GSVector4(left_padding, top_padding, left_padding + display_width * scale, top_padding + display_height * scale));
 }
 
-bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string filename, FileSystem::ManagedCFilePtr fp,
-                                   u8 quality, bool clear_alpha, bool flip_y, std::vector<u32> texture_data,
-                                   u32 texture_data_stride, GPUTexture::Format texture_format, std::string osd_key)
+void GPU::ReadVRAM(u16 x, u16 y, u16 width, u16 height)
 {
-  bool result;
+  GPUBackendReadVRAMCommand* cmd = GPUBackend::NewReadVRAMCommand();
+  cmd->x = x;
+  cmd->y = y;
+  cmd->width = width;
+  cmd->height = height;
+  GPUBackend::PushCommandAndSync(cmd, true);
+}
 
-  const char* extension = std::strrchr(filename.c_str(), '.');
-  if (extension)
+void GPU::UpdateVRAM(u16 x, u16 y, u16 width, u16 height, const void* data, bool set_mask, bool check_mask)
+{
+  const u32 num_words = static_cast<u32>(width) * static_cast<u32>(height); // avoid signed int promotion overflow
+  GPUBackendUpdateVRAMCommand* cmd = GPUBackend::NewUpdateVRAMCommand(num_words);
+  cmd->params.bits = 0;
+  cmd->params.set_mask_while_drawing = set_mask;
+  cmd->params.check_mask_before_draw = check_mask;
+  cmd->x = x;
+  cmd->y = y;
+  cmd->width = width;
+  cmd->height = height;
+  std::memcpy(cmd->data, data, num_words * sizeof(u16)); // copies width * height u16 values into the command
+  GPUBackend::PushCommand(cmd);
+}
+
+void GPU::ClearDisplay()
+{
+  GPUBackend::PushCommand(GPUBackend::NewClearDisplayCommand());
+}
+
+void GPU::UpdateDisplay(bool is_frame)
+{
+  GPUBackendUpdateDisplayCommand* cmd = GPUBackend::NewUpdateDisplayCommand();
+  cmd->frame_number = System::GetFrameNumber();
+  cmd->internal_frame_number = System::GetInternalFrameNumber();
+  cmd->display_width = m_crtc_state.display_width;
+  cmd->display_height = m_crtc_state.display_height;
+  cmd->display_origin_left = m_crtc_state.display_origin_left;
+  cmd->display_origin_top = m_crtc_state.display_origin_top;
+  cmd->display_vram_left = m_crtc_state.display_vram_left;
+  cmd->display_vram_top = m_crtc_state.display_vram_top;
+  cmd->display_vram_width = m_crtc_state.display_vram_width;
+  cmd->display_vram_height = m_crtc_state.display_vram_height;
+  cmd->X = m_crtc_state.regs.X;
+  cmd->bits = 0;
+  cmd->interlaced_display_enabled = IsInterlacedDisplayEnabled();
+  cmd->interlaced_display_field = GetInterlacedDisplayField();
+  cmd->interlaced_display_interleaved = cmd->interlaced_display_enabled && m_GPUSTAT.vertical_resolution;
+  cmd->display_24bit = m_GPUSTAT.display_area_color_depth_24;
+  cmd->display_disabled = IsDisplayDisabled();
+  cmd->display_aspect_ratio = ComputeDisplayAspectRatio();
+  cmd->media_capture = nullptr;
+  if (is_frame)
   {
-    if (GPUTexture::ConvertTextureDataToRGBA8(width, height, texture_data, texture_data_stride, texture_format))
+    bool present_frame;
+    bool should_allow_present_skip;
+    System::GetFramePresentationDetails(&is_frame, &present_frame, &should_allow_present_skip, &cmd->present_time);
+    cmd->is_frame = is_frame;
+    cmd->present_frame = present_frame;
+    cmd->allow_present_skip = should_allow_present_skip;
+
+    // Video capture setup.
+    if (MediaCapture* cap = System::GetMediaCapture(); cap && cap->IsCapturingVideo())
     {
-      if (clear_alpha)
+      if (cap->GetVideoFPS() != System::GetVideoFrameRate())
       {
-        for (u32& pixel : texture_data)
-          pixel |= 0xFF000000u;
+        std::string next_capture_path = cap->GetNextCapturePath(); // non-const so the std::move below really moves
+        INFO_LOG("Video frame rate changed, switching to new capture file {}", Path::GetFileName(next_capture_path));
+
+        const bool was_capturing_audio = cap->IsCapturingAudio();
+        System::StopMediaCapture();
+        System::StartMediaCapture(std::move(next_capture_path), true, was_capturing_audio);
+        cap = System::GetMediaCapture();
       }
 
-      if (flip_y)
-        GPUTexture::FlipTextureDataRGBA8(width, height, reinterpret_cast<u8*>(texture_data.data()),
-                                         texture_data_stride);
-
-      Assert(texture_data_stride == sizeof(u32) * width);
-      RGBA8Image image(width, height, std::move(texture_data));
-      if (image.SaveToFile(filename.c_str(), fp.get(), quality))
-      {
-        result = true;
-      }
-      else
-      {
-        ERROR_LOG("Unknown extension in filename '{}' or save error: '{}'", filename, extension);
-        result = false;
-      }
-    }
-    else
-    {
-      result = false;
+      cmd->media_capture = cap;
     }
   }
   else
   {
-    ERROR_LOG("Unable to determine file extension for '{}'", filename);
-    result = false;
+    cmd->is_frame = false;
+    cmd->present_time = 0;
+    cmd->present_frame = false;
+    cmd->allow_present_skip = false;
   }
 
-  if (!osd_key.empty())
-  {
-    Host::AddIconOSDMessage(std::move(osd_key), ICON_EMOJI_CAMERA,
-                            fmt::format(result ? TRANSLATE_FS("GPU", "Saved screenshot to '{}'.") :
-                                                 TRANSLATE_FS("GPU", "Failed to save screenshot to '{}'."),
-                                        Path::GetFileName(filename),
-                                        result ? Host::OSD_INFO_DURATION : Host::OSD_ERROR_DURATION));
-  }
-
-  return result;
-}
-
-bool GPU::WriteDisplayTextureToFile(std::string filename)
-{
-  if (!m_display_texture)
-    return false;
-
-  const u32 read_x = static_cast<u32>(m_display_texture_view_x);
-  const u32 read_y = static_cast<u32>(m_display_texture_view_y);
-  const u32 read_width = static_cast<u32>(m_display_texture_view_width);
-  const u32 read_height = static_cast<u32>(m_display_texture_view_height);
-
-  const u32 texture_data_stride =
-    Common::AlignUpPow2(GPUTexture::GetPixelSize(m_display_texture->GetFormat()) * read_width, 4);
-  std::vector<u32> texture_data((texture_data_stride * read_height) / sizeof(u32));
-
-  std::unique_ptr<GPUDownloadTexture> dltex;
-  if (g_gpu_device->GetFeatures().memory_import)
-  {
-    dltex =
-      g_gpu_device->CreateDownloadTexture(read_width, read_height, m_display_texture->GetFormat(), texture_data.data(),
-                                          texture_data.size() * sizeof(u32), texture_data_stride);
-  }
-  if (!dltex)
-  {
-    if (!(dltex = g_gpu_device->CreateDownloadTexture(read_width, read_height, m_display_texture->GetFormat())))
-    {
-      ERROR_LOG("Failed to create {}x{} {} download texture", read_width, read_height,
-                GPUTexture::GetFormatName(m_display_texture->GetFormat()));
-      return false;
-    }
-  }
-
-  dltex->CopyFromTexture(0, 0, m_display_texture, read_x, read_y, read_width, read_height, 0, 0, !dltex->IsImported());
-  if (!dltex->ReadTexels(0, 0, read_width, read_height, texture_data.data(), texture_data_stride))
-  {
-    RestoreDeviceContext();
-    return false;
-  }
-
-  RestoreDeviceContext();
-
-  Error error;
-  auto fp = FileSystem::OpenManagedCFile(filename.c_str(), "wb", &error);
-  if (!fp)
-  {
-    ERROR_LOG("Can't open file '{}': {}", Path::GetFileName(filename), error.GetDescription());
-    return false;
-  }
-
-  constexpr bool clear_alpha = true;
-  const bool flip_y = g_gpu_device->UsesLowerLeftOrigin();
-
-  return CompressAndWriteTextureToFile(
-    read_width, read_height, std::move(filename), std::move(fp), g_settings.display_screenshot_quality, clear_alpha,
-    flip_y, std::move(texture_data), texture_data_stride, m_display_texture->GetFormat(), std::string());
-}
-
-bool GPU::RenderScreenshotToBuffer(u32 width, u32 height, const GSVector4i display_rect, const GSVector4i draw_rect,
-                                   bool postfx, std::vector<u32>* out_pixels, u32* out_stride,
-                                   GPUTexture::Format* out_format)
-{
-  const GPUTexture::Format hdformat =
-    g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetFormat() : GPUTexture::Format::RGBA8;
-
-  auto render_texture =
-    g_gpu_device->FetchAutoRecycleTexture(width, height, 1, 1, 1, GPUTexture::Type::RenderTarget, hdformat);
-  if (!render_texture)
-    return false;
-
-  g_gpu_device->ClearRenderTarget(render_texture.get(), GPUDevice::DEFAULT_CLEAR_COLOR);
-
-  // TODO: this should use copy shader instead.
-  RenderDisplay(render_texture.get(), display_rect, draw_rect, postfx);
-
-  const u32 stride = Common::AlignUpPow2(GPUTexture::GetPixelSize(hdformat) * width, sizeof(u32));
-  out_pixels->resize((height * stride) / sizeof(u32));
-
-  std::unique_ptr<GPUDownloadTexture> dltex;
-  if (g_gpu_device->GetFeatures().memory_import)
-  {
-    dltex = g_gpu_device->CreateDownloadTexture(width, height, hdformat, out_pixels->data(),
-                                                out_pixels->size() * sizeof(u32), stride);
-  }
-  if (!dltex)
-  {
-    if (!(dltex = g_gpu_device->CreateDownloadTexture(width, height, hdformat)))
-    {
-      ERROR_LOG("Failed to create {}x{} download texture", width, height);
-      return false;
-    }
-  }
-
-  dltex->CopyFromTexture(0, 0, render_texture.get(), 0, 0, width, height, 0, 0, false);
-  if (!dltex->ReadTexels(0, 0, width, height, out_pixels->data(), stride))
-  {
-    RestoreDeviceContext();
-    return false;
-  }
-
-  *out_stride = stride;
-  *out_format = hdformat;
-  RestoreDeviceContext();
-  return true;
-}
-
-void GPU::CalculateScreenshotSize(DisplayScreenshotMode mode, u32* width, u32* height, GSVector4i* display_rect,
-                                  GSVector4i* draw_rect) const
-{
-  *width = g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetWidth() : 1;
-  *height = g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetHeight() : 1;
-  CalculateDrawRect(*width, *height, true, !g_settings.debugging.show_vram, display_rect, draw_rect);
-
-  const bool internal_resolution = (mode != DisplayScreenshotMode::ScreenResolution || g_settings.debugging.show_vram);
-  if (internal_resolution && m_display_texture_view_width != 0 && m_display_texture_view_height != 0)
-  {
-    if (mode == DisplayScreenshotMode::InternalResolution)
-    {
-      const u32 draw_width = static_cast<u32>(display_rect->width());
-      const u32 draw_height = static_cast<u32>(display_rect->height());
-
-      // If internal res, scale the computed draw rectangle to the internal res.
-      // We re-use the draw rect because it's already been AR corrected.
-      const float sar =
-        static_cast<float>(m_display_texture_view_width) / static_cast<float>(m_display_texture_view_height);
-      const float dar = static_cast<float>(draw_width) / static_cast<float>(draw_height);
-      if (sar >= dar)
-      {
-        // stretch height, preserve width
-        const float scale = static_cast<float>(m_display_texture_view_width) / static_cast<float>(draw_width);
-        *width = m_display_texture_view_width;
-        *height = static_cast<u32>(std::round(static_cast<float>(draw_height) * scale));
-      }
-      else
-      {
-        // stretch width, preserve height
-        const float scale = static_cast<float>(m_display_texture_view_height) / static_cast<float>(draw_height);
-        *width = static_cast<u32>(std::round(static_cast<float>(draw_width) * scale));
-        *height = m_display_texture_view_height;
-      }
-
-      // DX11 won't go past 16K texture size.
-      const u32 max_texture_size = g_gpu_device->GetMaxTextureSize();
-      if (*width > max_texture_size)
-      {
-        *height = static_cast<u32>(static_cast<float>(*height) /
-                                   (static_cast<float>(*width) / static_cast<float>(max_texture_size)));
-        *width = max_texture_size;
-      }
-      if (*height > max_texture_size)
-      {
-        *height = max_texture_size;
-        *width = static_cast<u32>(static_cast<float>(*width) /
-                                  (static_cast<float>(*height) / static_cast<float>(max_texture_size)));
-      }
-    }
-    else // if (mode == DisplayScreenshotMode::UncorrectedInternalResolution)
-    {
-      *width = m_display_texture_view_width;
-      *height = m_display_texture_view_height;
-    }
-
-    // Remove padding, it's not part of the framebuffer.
-    *draw_rect = GSVector4i(0, 0, static_cast<s32>(*width), static_cast<s32>(*height));
-    *display_rect = *draw_rect;
-  }
-}
-
-bool GPU::RenderScreenshotToFile(std::string path, DisplayScreenshotMode mode, u8 quality, bool compress_on_thread,
-                                 bool show_osd_message)
-{
-  u32 width, height;
-  GSVector4i display_rect, draw_rect;
-  CalculateScreenshotSize(mode, &width, &height, &display_rect, &draw_rect);
-
-  const bool internal_resolution = (mode != DisplayScreenshotMode::ScreenResolution);
-  if (width == 0 || height == 0)
-    return false;
-
-  std::vector<u32> pixels;
-  u32 pixels_stride;
-  GPUTexture::Format pixels_format;
-  if (!RenderScreenshotToBuffer(width, height, display_rect, draw_rect, !internal_resolution, &pixels, &pixels_stride,
-                                &pixels_format))
-  {
-    ERROR_LOG("Failed to render {}x{} screenshot", width, height);
-    return false;
-  }
-
-  Error error;
-  auto fp = FileSystem::OpenManagedCFile(path.c_str(), "wb", &error);
-  if (!fp)
-  {
-    ERROR_LOG("Can't open file '{}': {}", Path::GetFileName(path), error.GetDescription());
-    return false;
-  }
-
-  std::string osd_key;
-  if (show_osd_message)
-  {
-    // Use a 60 second timeout to give it plenty of time to actually save.
-    osd_key = fmt::format("ScreenshotSaver_{}", path);
-    Host::AddIconOSDMessage(osd_key, ICON_EMOJI_CAMERA_WITH_FLASH,
-                            fmt::format(TRANSLATE_FS("GPU", "Saving screenshot to '{}'."), Path::GetFileName(path)),
-                            60.0f);
-  }
-
-  if (compress_on_thread)
-  {
-    System::QueueTaskOnThread([width, height, path = std::move(path), fp = fp.release(), quality,
-                               flip_y = g_gpu_device->UsesLowerLeftOrigin(), pixels = std::move(pixels), pixels_stride,
-                               pixels_format, osd_key = std::move(osd_key)]() mutable {
-      CompressAndWriteTextureToFile(width, height, std::move(path), FileSystem::ManagedCFilePtr(fp), quality, true,
-                                    flip_y, std::move(pixels), pixels_stride, pixels_format, std::move(osd_key));
-      System::RemoveSelfFromTaskThreads();
-    });
-
-    return true;
-  }
+  if (is_frame)
+    GPUThread::PushCommandAndFrame(cmd);
   else
-  {
-    return CompressAndWriteTextureToFile(width, height, std::move(path), std::move(fp), quality, true,
-                                         g_gpu_device->UsesLowerLeftOrigin(), std::move(pixels), pixels_stride,
-                                         pixels_format, std::move(osd_key));
-  }
+    GPUThread::PushCommand(cmd);
 }
 
 bool GPU::DumpVRAMToFile(const char* filename)
@@ -2748,8 +1836,6 @@ bool GPU::DumpVRAMToFile(const char* filename, u32 width, u32 height, u32 stride
 
 void GPU::DrawDebugStateWindow(float scale)
 {
-  DrawRendererStats();
-
   if (ImGui::CollapsingHeader("GPU", ImGuiTreeNodeFlags_DefaultOpen))
   {
     static constexpr std::array<const char*, 5> state_strings = {
@@ -2804,76 +1890,6 @@ void GPU::DrawDebugStateWindow(float scale)
   }
 }
 
-void GPU::DrawRendererStats()
-{
-}
-
-void GPU::OnBufferSwapped()
-{
-}
-
-void GPU::GetStatsString(SmallStringBase& str)
-{
-  if (IsHardwareRenderer())
-  {
-    str.format("{} HW | {} P | {} DC | {} B | {} RP | {} RB | {} C | {} W",
-               GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()), m_stats.num_primitives,
-               m_stats.host_num_draws, m_stats.host_num_barriers, m_stats.host_num_render_passes,
-               m_stats.host_num_downloads, m_stats.num_copies, m_stats.num_writes);
-  }
-  else
-  {
-    str.format("{} SW | {} P | {} R | {} C | {} W", GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()),
-               m_stats.num_primitives, m_stats.num_reads, m_stats.num_copies, m_stats.num_writes);
-  }
-}
-
-void GPU::GetMemoryStatsString(SmallStringBase& str)
-{
-  const u32 vram_usage_mb = static_cast<u32>((g_gpu_device->GetVRAMUsage() + (1048576 - 1)) / 1048576);
-  const u32 stream_kb = static_cast<u32>((m_stats.host_buffer_streamed + (1024 - 1)) / 1024);
-
-  str.format("{} MB VRAM | {} KB STR | {} TC | {} TU", vram_usage_mb, stream_kb, m_stats.host_num_copies,
-             m_stats.host_num_uploads);
-}
-
-void GPU::ResetStatistics()
-{
-  m_counters = {};
-  g_gpu_device->ResetStatistics();
-}
-
-void GPU::UpdateStatistics(u32 frame_count)
-{
-  const GPUDevice::Statistics& stats = g_gpu_device->GetStatistics();
-  const u32 round = (frame_count - 1);
-
-#define UPDATE_COUNTER(x) m_stats.x = (m_counters.x + round) / frame_count
-#define UPDATE_GPU_STAT(x) m_stats.host_##x = (stats.x + round) / frame_count
-
-  UPDATE_COUNTER(num_reads);
-  UPDATE_COUNTER(num_writes);
-  UPDATE_COUNTER(num_copies);
-  UPDATE_COUNTER(num_vertices);
-  UPDATE_COUNTER(num_primitives);
-
-  // UPDATE_COUNTER(num_read_texture_updates);
-  // UPDATE_COUNTER(num_ubo_updates);
-
-  UPDATE_GPU_STAT(buffer_streamed);
-  UPDATE_GPU_STAT(num_draws);
-  UPDATE_GPU_STAT(num_barriers);
-  UPDATE_GPU_STAT(num_render_passes);
-  UPDATE_GPU_STAT(num_copies);
-  UPDATE_GPU_STAT(num_downloads);
-  UPDATE_GPU_STAT(num_uploads);
-
-#undef UPDATE_GPU_STAT
-#undef UPDATE_COUNTER
-
-  ResetStatistics();
-}
-
 bool GPU::StartRecordingGPUDump(const char* path, u32 num_frames /* = 1 */)
 {
   if (m_gpu_dump)
@@ -2912,7 +1928,8 @@ bool GPU::StartRecordingGPUDump(const char* path, u32 num_frames /* = 1 */)
     Host::OSD_QUICK_DURATION);
 
   // save screenshot to same location to identify it
-  RenderScreenshotToFile(Path::ReplaceExtension(path, "png"), DisplayScreenshotMode::ScreenResolution, 85, true, false);
+  GPUBackend::RenderScreenshotToFile(Path::ReplaceExtension(path, "png"), DisplayScreenshotMode::ScreenResolution, 85,
+                                     true, false);
   return true;
 }
 
@@ -3085,10 +2102,8 @@ void GPU::ProcessGPUDumpPacket(GPUDump::PacketType type, const std::span<const u
       SystemTicksToCRTCTicks(system_ticks_per_frame, &m_crtc_state.fractional_ticks);
       TimingEvents::SetGlobalTickCounter(TimingEvents::GetGlobalTickCounter() +
                                          static_cast<GlobalTicks>(system_ticks_per_frame));
-
-      FlushRender();
-      UpdateDisplay();
       System::IncrementFrameNumber();
+      UpdateDisplay(true);
       System::FrameDone();
     }
     break;
diff --git a/src/core/gpu.h b/src/core/gpu.h
index a4f858794..9c15a0236 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -38,13 +38,11 @@ enum class PacketType : u8;
 class Recorder;
 class Player;
 } // namespace GPUDump
+
+class GPUBackend;
 struct Settings;
 
-namespace Threading {
-class Thread;
-}
-
-class GPU
+class GPU final
 {
 public:
   enum class BlitterState : u8
@@ -61,7 +59,6 @@ public:
     DOT_TIMER_INDEX = 0,
     HBLANK_TIMER_INDEX = 1,
     MAX_RESOLUTION_SCALE = 32,
-    DEINTERLACE_BUFFER_COUNT = 4,
     DRAWING_AREA_COORD_MASK = 1023,
   };
 
@@ -87,25 +84,14 @@ public:
 
   // Base class constructor.
   GPU();
-  virtual ~GPU();
+  ~GPU();
 
-  virtual const Threading::Thread* GetSWThread() const = 0;
-  virtual bool IsHardwareRenderer() const = 0;
-
-  virtual bool Initialize(Error* error);
-  virtual void Reset(bool clear_vram);
-  virtual bool DoState(StateWrapper& sw, GPUTexture** save_to_texture, bool update_display);
-
-  // Graphics API state reset/restore - call when drawing the UI etc.
-  // TODO: replace with "invalidate cached state"
-  virtual void RestoreDeviceContext();
+  void Initialize();
+  void Reset(bool clear_vram);
+  bool DoState(StateWrapper& sw, bool update_display);
 
   // Render statistics debug window.
   void DrawDebugStateWindow(float scale);
-  void GetStatsString(SmallStringBase& str);
-  void GetMemoryStatsString(SmallStringBase& str);
-  void ResetStatistics();
-  void UpdateStatistics(u32 frame_count);
 
   void CPUClockChanged();
 
@@ -169,24 +155,15 @@ public:
   void SynchronizeCRTC();
 
   /// Recompile shaders/recreate framebuffers when needed.
-  virtual void UpdateSettings(const Settings& old_settings);
+  void UpdateSettings(const Settings& old_settings);
 
-  /// Returns the current resolution scale.
-  virtual u32 GetResolutionScale() const;
-
-  /// Updates the resolution scale when it's set to automatic.
-  virtual void UpdateResolutionScale();
-
-  /// Returns the full display resolution of the GPU, including padding.
-  std::tuple<u32, u32> GetFullDisplayResolution() const;
+  /// Computes clamped drawing area.
+  static GSVector4i GetClampedDrawingArea(const GPUDrawingArea& drawing_area);
 
   float ComputeHorizontalFrequency() const;
   float ComputeVerticalFrequency() const;
   float ComputeDisplayAspectRatio() const;
 
-  static std::unique_ptr<GPU> CreateHardwareRenderer(Error* error);
-  static std::unique_ptr<GPU> CreateSoftwareRenderer(Error* error);
-
   // Converts window coordinates into horizontal ticks and scanlines. Returns false if out of range. Used for lightguns.
   void ConvertScreenCoordinatesToDisplayCoordinates(float window_x, float window_y, float* display_x,
                                                     float* display_y) const;
@@ -217,39 +194,14 @@ public:
   // Dumps raw VRAM to a file.
   bool DumpVRAMToFile(const char* filename);
 
-  // Ensures all buffered vertices are drawn.
-  virtual void FlushRender() = 0;
-
   /// Helper function for computing the draw rectangle in a larger window.
-  void CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio,
-                         GSVector4i* display_rect, GSVector4i* draw_rect) const;
+  static void CalculateDrawRect(u32 window_width, u32 window_height, u32 crtc_display_width, u32 crtc_display_height,
+                                s32 display_origin_left, s32 display_origin_top, u32 display_vram_width,
+                                u32 display_vram_height, DisplayRotation rotation, float aspect_ratio,
+                                bool stretch_vertically, bool integer_scale, GSVector4i* display_rect,
+                                GSVector4i* draw_rect);
 
-  /// Helper function for computing screenshot bounds.
-  void CalculateScreenshotSize(DisplayScreenshotMode mode, u32* width, u32* height, GSVector4i* display_rect,
-                               GSVector4i* draw_rect) const;
-
-  /// Helper function to save current display texture to PNG.
-  bool WriteDisplayTextureToFile(std::string path);
-
-  /// Renders the display, optionally with postprocessing to the specified image.
-  bool RenderScreenshotToBuffer(u32 width, u32 height, const GSVector4i display_rect, const GSVector4i draw_rect,
-                                bool postfx, std::vector<u32>* out_pixels, u32* out_stride,
-                                GPUTexture::Format* out_format);
-
-  /// Helper function to save screenshot to PNG.
-  bool RenderScreenshotToFile(std::string path, DisplayScreenshotMode mode, u8 quality, bool compress_on_thread,
-                              bool show_osd_message);
-
-  /// Draws the current display texture, with any post-processing.
-  GPUDevice::PresentResult PresentDisplay();
-
-  /// Sends the current frame to media capture.
-  bool SendDisplayToMediaCapture(MediaCapture* cap);
-
-  /// Reads the CLUT from the specified coordinates, accounting for wrap-around.
-  static void ReadCLUT(u16* dest, GPUTexturePaletteReg reg, bool clut_is_8bit);
-
-protected:
+private:
   TickCount CRTCTicksToSystemTicks(TickCount crtc_ticks, TickCount fractional_ticks) const;
   TickCount SystemTicksToCRTCTicks(TickCount sysclk_ticks, TickCount* fractional_ticks) const;
 
@@ -260,16 +212,6 @@ protected:
   }
   ALWAYS_INLINE static constexpr TickCount SystemTicksToGPUTicks(TickCount sysclk_ticks) { return sysclk_ticks << 1; }
 
-  static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
-  {
-    return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
-  }
-
-  static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
-  {
-    return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
-  }
-
   static bool DumpVRAMToFile(const char* filename, u32 width, u32 height, u32 stride, const void* buffer,
                              bool remove_alpha);
 
@@ -289,10 +231,10 @@ protected:
   void UpdateGPUIdle();
 
   /// Returns 0 if the currently-displayed field is on odd lines (1,3,5,...) or 1 if even (2,4,6,...).
-  ALWAYS_INLINE u32 GetInterlacedDisplayField() const { return ZeroExtend32(m_crtc_state.interlaced_field); }
+  ALWAYS_INLINE u8 GetInterlacedDisplayField() const { return m_crtc_state.interlaced_field; }
 
   /// Returns 0 if the currently-displayed field is on an even line in VRAM, otherwise 1.
-  ALWAYS_INLINE u32 GetActiveLineLSB() const { return ZeroExtend32(m_crtc_state.active_line_lsb); }
+  ALWAYS_INLINE u8 GetActiveLineLSB() const { return m_crtc_state.active_line_lsb; }
 
   /// Updates drawing area that's suitablef or clamping.
   void SetClampedDrawingArea();
@@ -327,16 +269,15 @@ protected:
   void InvalidateCLUT();
   bool IsCLUTValid() const;
 
-  // Rendering in the backend
-  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0;
-  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) = 0;
-  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) = 0;
-  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) = 0;
-  virtual void DispatchRenderCommand() = 0;
-  virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0;
-  virtual void UpdateDisplay() = 0;
-  virtual void DrawRendererStats();
-  virtual void OnBufferSwapped();
+  void ReadVRAM(u16 x, u16 y, u16 width, u16 height);
+  void UpdateVRAM(u16 x, u16 y, u16 width, u16 height, const void* data, bool set_mask, bool check_mask);
+
+  void UpdateDisplay(bool is_frame);
+
+  void PrepareForDraw();
+  void FinishPolyline();
+  void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
+  void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
 
   ALWAYS_INLINE_RELEASE void AddDrawTriangleTicks(GSVector2i v1, GSVector2i v2, GSVector2i v3, bool shaded,
                                                   bool textured, bool semitransparent)
@@ -433,14 +374,10 @@ protected:
     u32 texture_window_value;
 
     // decoded values
+    // TODO: Make this a command
     GPUTextureWindow texture_window;
     bool texture_x_flip;
     bool texture_y_flip;
-    bool texture_page_changed;
-
-    ALWAYS_INLINE bool IsTexturePageChanged() const { return texture_page_changed; }
-    ALWAYS_INLINE void SetTexturePageChanged() { texture_page_changed = true; }
-    ALWAYS_INLINE void ClearTexturePageChangedFlag() { texture_page_changed = false; }
   } m_draw_mode = {};
 
   GPUDrawingArea m_drawing_area = {};
@@ -574,65 +511,7 @@ protected:
   TickCount m_max_run_ahead = 128;
   u32 m_fifo_size = 128;
 
-  void ClearDisplayTexture();
-  void SetDisplayTexture(GPUTexture* texture, GPUTexture* depth_texture, s32 view_x, s32 view_y, s32 view_width,
-                         s32 view_height);
-
-  GPUDevice::PresentResult RenderDisplay(GPUTexture* target, const GSVector4i display_rect, const GSVector4i draw_rect,
-                                         bool postfx);
-
-  bool Deinterlace(u32 field, u32 line_skip);
-  bool DeinterlaceExtractField(u32 dst_bufidx, GPUTexture* src, u32 x, u32 y, u32 width, u32 height, u32 line_skip);
-  bool DeinterlaceSetTargetSize(u32 width, u32 height, bool preserve);
-  void DestroyDeinterlaceTextures();
-  bool ApplyChromaSmoothing();
-
-  u32 m_current_deinterlace_buffer = 0;
-  std::unique_ptr<GPUPipeline> m_deinterlace_pipeline;
-  std::unique_ptr<GPUPipeline> m_deinterlace_extract_pipeline;
-  std::array<std::unique_ptr<GPUTexture>, DEINTERLACE_BUFFER_COUNT> m_deinterlace_buffers;
-  std::unique_ptr<GPUTexture> m_deinterlace_texture;
-
-  std::unique_ptr<GPUPipeline> m_chroma_smoothing_pipeline;
-  std::unique_ptr<GPUTexture> m_chroma_smoothing_texture;
-
-  std::unique_ptr<GPUPipeline> m_display_pipeline;
-  GPUTexture* m_display_texture = nullptr;
-  GPUTexture* m_display_depth_buffer = nullptr;
-  s32 m_display_texture_view_x = 0;
-  s32 m_display_texture_view_y = 0;
-  s32 m_display_texture_view_width = 0;
-  s32 m_display_texture_view_height = 0;
-
-  struct Counters
-  {
-    u32 num_reads;
-    u32 num_writes;
-    u32 num_copies;
-    u32 num_vertices;
-    u32 num_primitives;
-
-    // u32 num_read_texture_updates;
-    // u32 num_ubo_updates;
-  };
-
-  struct Stats : Counters
-  {
-    size_t host_buffer_streamed;
-    u32 host_num_draws;
-    u32 host_num_barriers;
-    u32 host_num_render_passes;
-    u32 host_num_copies;
-    u32 host_num_downloads;
-    u32 host_num_uploads;
-  };
-
-  Counters m_counters = {};
-  Stats m_stats = {};
-
 private:
-  bool CompileDisplayPipelines(bool display, bool deinterlace, bool chroma_smoothing, Error* error);
-
   using GP0CommandHandler = bool (GPU::*)();
   using GP0CommandHandlerTable = std::array<GP0CommandHandler, 256>;
   static GP0CommandHandlerTable GenerateGP0CommandHandlerTable();
diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp
index a8b17818a..49faaad02 100644
--- a/src/core/gpu_backend.cpp
+++ b/src/core/gpu_backend.cpp
@@ -2,289 +2,398 @@
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
 #include "gpu_backend.h"
+#include "gpu.h"
+#include "gpu_shadergen.h"
+#include "gpu_sw_rasterizer.h"
+#include "gpu_thread.h"
+#include "host.h"
+#include "performance_counters.h"
+#include "settings.h"
+#include "system.h"
+#include "system_private.h"
 
+#include "util/gpu_device.h"
+#include "util/image.h"
+#include "util/imgui_manager.h"
+#include "util/media_capture.h"
+#include "util/postprocessing.h"
 #include "util/state_wrapper.h"
 
 #include "common/align.h"
+#include "common/error.h"
+#include "common/file_system.h"
+#include "common/gsvector_formatter.h"
 #include "common/log.h"
+#include "common/path.h"
+#include "common/small_string.h"
+#include "common/string_util.h"
 #include "common/timer.h"
 
+#include "IconsEmoji.h"
+#include "IconsFontAwesome5.h"
+#include "fmt/format.h"
+
+#include <numbers>
+#include <thread>
+
 LOG_CHANNEL(GPUBackend);
 
-std::unique_ptr<GPUBackend> g_gpu_backend;
-
-GPUBackend::GPUBackend() = default;
-
-GPUBackend::~GPUBackend() = default;
-
-bool GPUBackend::Initialize(bool use_thread)
+namespace {
+struct Counters
 {
-  if (use_thread)
-    StartGPUThread();
+  u32 num_reads;
+  u32 num_writes;
+  u32 num_copies;
+  u32 num_vertices;
+  u32 num_primitives;
+};
+
+struct Stats : Counters
+{
+  size_t host_buffer_streamed;
+  u32 host_num_draws;
+  u32 host_num_barriers;
+  u32 host_num_render_passes;
+  u32 host_num_copies;
+  u32 host_num_downloads;
+  u32 host_num_uploads;
+};
+} // namespace
+
+static bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string filename, FileSystem::ManagedCFilePtr fp,
+                                          u8 quality, bool clear_alpha, bool flip_y, std::vector<u32> texture_data,
+                                          u32 texture_data_stride, GPUTexture::Format texture_format,
+                                          std::string osd_key);
+
+static constexpr GPUTexture::Format DISPLAY_INTERNAL_POSTFX_FORMAT = GPUTexture::Format::RGBA8;
+
+static Counters s_counters = {};
+static Stats s_stats = {};
+
+GPUBackend::GPUBackend()
+{
+  GPU_SW_Rasterizer::SelectImplementation();
+  ResetStatistics();
+  // Start with no frames queued and the "CPU thread is waiting" flag cleared.
+  m_queued_frames.store(0, std::memory_order_release);
+  m_waiting_for_gpu_thread.store(false, std::memory_order_release);
+  // Display geometry stays zeroed until HandleUpdateDisplayCommand() copies it from a command.
+  m_display_width = 0;
+  m_display_height = 0;
+  m_display_origin_left = 0;
+  m_display_origin_top = 0;
+  m_display_vram_width = 0;
+  m_display_vram_height = 0;
+  m_display_aspect_ratio = 1.0f;
+}
+
+GPUBackend::~GPUBackend()
+{
+  DestroyDeinterlaceTextures();
+  g_gpu_device->RecycleTexture(std::move(m_chroma_smoothing_texture));
+}
+
+bool GPUBackend::Initialize(bool clear_vram, Error* error)
+{
+  if (!CompileDisplayPipelines(true, true, g_gpu_settings.display_24bit_chroma_smoothing, error))
+    return false;
 
   return true;
 }
 
-void GPUBackend::Reset()
+void GPUBackend::UpdateSettings(const Settings& old_settings)
 {
-  Sync(true);
-  DrawingAreaChanged(GPUDrawingArea{0, 0, 0, 0}, GSVector4i::zero());
-}
+  FlushRender();
 
-void GPUBackend::SetThreadEnabled(bool use_thread)
-{
-  Sync(true);
+  if (g_gpu_settings.display_show_gpu_stats != old_settings.display_show_gpu_stats)
+    GPUBackend::ResetStatistics();
 
-  if (m_use_gpu_thread != use_thread)
+  if (g_gpu_settings.display_scaling != old_settings.display_scaling ||
+      g_gpu_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode ||
+      g_gpu_settings.display_24bit_chroma_smoothing != old_settings.display_24bit_chroma_smoothing)
   {
-    if (!use_thread)
-      StopGPUThread();
-    else
-      StartGPUThread();
+    // Toss buffers on mode change.
+    if (g_gpu_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode)
+      DestroyDeinterlaceTextures();
+
+    if (!CompileDisplayPipelines(
+          g_gpu_settings.display_scaling != old_settings.display_scaling,
+          g_gpu_settings.display_deinterlacing_mode != old_settings.display_deinterlacing_mode,
+          g_gpu_settings.display_24bit_chroma_smoothing != old_settings.display_24bit_chroma_smoothing, nullptr))
+    {
+      Panic("Failed to compile display pipeline on settings change.");
+    }
   }
 }
 
-void GPUBackend::Shutdown()
+void GPUBackend::UpdateResolutionScale()
 {
-  StopGPUThread();
+}
+
+u32 GPUBackend::GetResolutionScale() const
+{
+  return 1u;
+}
+
+std::tuple<u32, u32> GPUBackend::GetFullDisplayResolution() const
+{
+  return std::make_tuple(m_display_width, m_display_height);
+}
+
+void GPUBackend::RestoreDeviceContext()
+{
+}
+
+GPUThreadCommand* GPUBackend::NewClearVRAMCommand()
+{
+  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearVRAM, sizeof(GPUThreadCommand));
+  return static_cast<GPUThreadCommand*>(raw);
+}
+
+GPUThreadCommand* GPUBackend::NewClearDisplayCommand()
+{
+  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearDisplay, sizeof(GPUThreadCommand));
+  return static_cast<GPUThreadCommand*>(raw);
+}
+
+GPUBackendUpdateDisplayCommand* GPUBackend::NewUpdateDisplayCommand()
+{
+  using Cmd = GPUBackendUpdateDisplayCommand;
+  return static_cast<Cmd*>(GPUThread::AllocateCommand(GPUBackendCommandType::UpdateDisplay, sizeof(Cmd)));
+}
+
+GPUThreadCommand* GPUBackend::NewClearCacheCommand()
+{
+  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearCache, sizeof(GPUThreadCommand));
+  return static_cast<GPUThreadCommand*>(raw);
+}
+
+GPUThreadCommand* GPUBackend::NewBufferSwappedCommand()
+{
+  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::BufferSwapped, sizeof(GPUThreadCommand));
+  return static_cast<GPUThreadCommand*>(raw);
+}
+
+GPUThreadCommand* GPUBackend::NewUpdateResolutionScaleCommand()
+{
+  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::UpdateResolutionScale, sizeof(GPUThreadCommand));
+  return static_cast<GPUThreadCommand*>(raw);
+}
+
+GPUBackendReadVRAMCommand* GPUBackend::NewReadVRAMCommand()
+{
+  return static_cast<GPUBackendReadVRAMCommand*>(
+    GPUThread::AllocateCommand(GPUBackendCommandType::ReadVRAM, sizeof(GPUBackendReadVRAMCommand)));
 }
 
 GPUBackendFillVRAMCommand* GPUBackend::NewFillVRAMCommand()
 {
   return static_cast<GPUBackendFillVRAMCommand*>(
-    AllocateCommand(GPUBackendCommandType::FillVRAM, sizeof(GPUBackendFillVRAMCommand)));
+    GPUThread::AllocateCommand(GPUBackendCommandType::FillVRAM, sizeof(GPUBackendFillVRAMCommand)));
 }
 
 GPUBackendUpdateVRAMCommand* GPUBackend::NewUpdateVRAMCommand(u32 num_words)
 {
   const u32 size = sizeof(GPUBackendUpdateVRAMCommand) + (num_words * sizeof(u16));
   GPUBackendUpdateVRAMCommand* cmd =
-    static_cast<GPUBackendUpdateVRAMCommand*>(AllocateCommand(GPUBackendCommandType::UpdateVRAM, size));
+    static_cast<GPUBackendUpdateVRAMCommand*>(GPUThread::AllocateCommand(GPUBackendCommandType::UpdateVRAM, size));
   return cmd;
 }
 
 GPUBackendCopyVRAMCommand* GPUBackend::NewCopyVRAMCommand()
 {
   return static_cast<GPUBackendCopyVRAMCommand*>(
-    AllocateCommand(GPUBackendCommandType::CopyVRAM, sizeof(GPUBackendCopyVRAMCommand)));
+    GPUThread::AllocateCommand(GPUBackendCommandType::CopyVRAM, sizeof(GPUBackendCopyVRAMCommand)));
 }
 
 GPUBackendSetDrawingAreaCommand* GPUBackend::NewSetDrawingAreaCommand()
 {
   return static_cast<GPUBackendSetDrawingAreaCommand*>(
-    AllocateCommand(GPUBackendCommandType::SetDrawingArea, sizeof(GPUBackendSetDrawingAreaCommand)));
+    GPUThread::AllocateCommand(GPUBackendCommandType::SetDrawingArea, sizeof(GPUBackendSetDrawingAreaCommand)));
 }
 
 GPUBackendUpdateCLUTCommand* GPUBackend::NewUpdateCLUTCommand()
 {
   return static_cast<GPUBackendUpdateCLUTCommand*>(
-    AllocateCommand(GPUBackendCommandType::UpdateCLUT, sizeof(GPUBackendUpdateCLUTCommand)));
+    GPUThread::AllocateCommand(GPUBackendCommandType::UpdateCLUT, sizeof(GPUBackendUpdateCLUTCommand)));
 }
 
 GPUBackendDrawPolygonCommand* GPUBackend::NewDrawPolygonCommand(u32 num_vertices)
 {
   const u32 size = sizeof(GPUBackendDrawPolygonCommand) + (num_vertices * sizeof(GPUBackendDrawPolygonCommand::Vertex));
   GPUBackendDrawPolygonCommand* cmd =
-    static_cast<GPUBackendDrawPolygonCommand*>(AllocateCommand(GPUBackendCommandType::DrawPolygon, size));
-  cmd->num_vertices = Truncate16(num_vertices);
+    static_cast<GPUBackendDrawPolygonCommand*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawPolygon, size));
+  cmd->num_vertices = Truncate8(num_vertices);
+  return cmd;
+}
+
+GPUBackendDrawPrecisePolygonCommand* GPUBackend::NewDrawPrecisePolygonCommand(u32 num_vertices)
+{
+  const u32 size =
+    sizeof(GPUBackendDrawPrecisePolygonCommand) + (num_vertices * sizeof(GPUBackendDrawPrecisePolygonCommand::Vertex));
+  GPUBackendDrawPrecisePolygonCommand* cmd = static_cast<GPUBackendDrawPrecisePolygonCommand*>(
+    GPUThread::AllocateCommand(GPUBackendCommandType::DrawPrecisePolygon, size));
+  cmd->num_vertices = Truncate8(num_vertices);
   return cmd;
 }
 
 GPUBackendDrawRectangleCommand* GPUBackend::NewDrawRectangleCommand()
 {
   return static_cast<GPUBackendDrawRectangleCommand*>(
-    AllocateCommand(GPUBackendCommandType::DrawRectangle, sizeof(GPUBackendDrawRectangleCommand)));
+    GPUThread::AllocateCommand(GPUBackendCommandType::DrawRectangle, sizeof(GPUBackendDrawRectangleCommand)));
 }
 
 GPUBackendDrawLineCommand* GPUBackend::NewDrawLineCommand(u32 num_vertices)
 {
   const u32 size = sizeof(GPUBackendDrawLineCommand) + (num_vertices * sizeof(GPUBackendDrawLineCommand::Vertex));
   GPUBackendDrawLineCommand* cmd =
-    static_cast<GPUBackendDrawLineCommand*>(AllocateCommand(GPUBackendCommandType::DrawLine, size));
+    static_cast<GPUBackendDrawLineCommand*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawLine, size));
   cmd->num_vertices = Truncate16(num_vertices);
   return cmd;
 }
 
-void* GPUBackend::AllocateCommand(GPUBackendCommandType command, u32 size)
+void GPUBackend::PushCommand(GPUThreadCommand* cmd)
 {
-  // Ensure size is a multiple of 4 so we don't end up with an unaligned command.
-  size = Common::AlignUpPow2(size, 4);
+  GPUThread::PushCommand(cmd);
+}
 
-  for (;;)
+void GPUBackend::PushCommandAndWakeThread(GPUThreadCommand* cmd)
+{
+  GPUThread::PushCommandAndWakeThread(cmd);
+}
+
+void GPUBackend::PushCommandAndSync(GPUThreadCommand* cmd, bool spin)
+{
+  GPUThread::PushCommandAndSync(cmd, spin);
+}
+
+bool GPUBackend::IsUsingHardwareBackend()
+{
+  return (GPUThread::GetRequestedRenderer().value_or(GPURenderer::Software) != GPURenderer::Software);
+}
+
+bool GPUBackend::BeginQueueFrame()
+{
+  const u32 queued_frames = m_queued_frames.fetch_add(1, std::memory_order_acq_rel) + 1;
+  if (queued_frames < g_settings.gpu_max_queued_frames)
+    return false;
+  // Limit reached: raise the flag that HandleUpdateDisplayCommand() checks before posting m_gpu_thread_wait.
+  DEV_LOG("<-- {} queued frames, {} max, blocking CPU thread", queued_frames, g_settings.gpu_max_queued_frames);
+  m_waiting_for_gpu_thread.store(true, std::memory_order_release);
+  return true;
+}
+
+void GPUBackend::WaitForOneQueuedFrame()
+{
+  // In between this and the post call, we may have finished the frame. Check.
+  if (m_queued_frames.load(std::memory_order_acquire) < g_settings.gpu_max_queued_frames)
   {
-    u32 read_ptr = m_command_fifo_read_ptr.load();
-    u32 write_ptr = m_command_fifo_write_ptr.load();
-    if (read_ptr > write_ptr)
+    // It's possible that the GPU thread has already signaled the semaphore.
+    // If so, then we still need to drain it, otherwise waits in the future will return prematurely.
+    bool expected = true;
+    if (m_waiting_for_gpu_thread.compare_exchange_strong(expected, false, std::memory_order_acq_rel,
+                                                         std::memory_order_relaxed))
     {
-      u32 available_size = read_ptr - write_ptr;
-      while (available_size < (size + sizeof(GPUBackendCommandType)))
-      {
-        WakeGPUThread();
-        read_ptr = m_command_fifo_read_ptr.load();
-        available_size = (read_ptr > write_ptr) ? (read_ptr - write_ptr) : (COMMAND_QUEUE_SIZE - write_ptr);
-      }
+      return;
     }
-    else
-    {
-      const u32 available_size = COMMAND_QUEUE_SIZE - write_ptr;
-      if ((size + sizeof(GPUBackendCommand)) > available_size)
-      {
-        // allocate a dummy command to wrap the buffer around
-        GPUBackendCommand* dummy_cmd = reinterpret_cast<GPUBackendCommand*>(&m_command_fifo_data[write_ptr]);
-        dummy_cmd->type = GPUBackendCommandType::Wraparound;
-        dummy_cmd->size = available_size;
-        dummy_cmd->params.bits = 0;
-        m_command_fifo_write_ptr.store(0);
-        continue;
-      }
-    }
-
-    GPUBackendCommand* cmd = reinterpret_cast<GPUBackendCommand*>(&m_command_fifo_data[write_ptr]);
-    cmd->type = command;
-    cmd->size = size;
-    return cmd;
   }
+
+  m_gpu_thread_wait.Wait();
+
+  // Sanity check: queued frames should be in range now. If they're not, the semaphore handling is broken.
+  Assert(m_queued_frames.load(std::memory_order_acquire) < g_settings.gpu_max_queued_frames);
 }
 
-u32 GPUBackend::GetPendingCommandSize() const
+bool GPUBackend::RenderScreenshotToBuffer(u32 width, u32 height, bool postfx, u32* out_width, u32* out_height,
+                                          std::vector<u32>* out_pixels, u32* out_stride, GPUTexture::Format* out_format)
 {
-  const u32 read_ptr = m_command_fifo_read_ptr.load();
-  const u32 write_ptr = m_command_fifo_write_ptr.load();
-  return (write_ptr >= read_ptr) ? (write_ptr - read_ptr) : (COMMAND_QUEUE_SIZE - read_ptr + write_ptr);
+  bool result;
+
+  GPUThreadRenderScreenshotToBufferCommand* cmd =
+    static_cast<GPUThreadRenderScreenshotToBufferCommand*>(GPUThread::AllocateCommand(
+      GPUBackendCommandType::RenderScreenshotToBuffer, sizeof(GPUThreadRenderScreenshotToBufferCommand)));
+  cmd->width = width;
+  cmd->height = height;
+  cmd->out_width = out_width;
+  cmd->out_height = out_height;
+  cmd->out_pixels = out_pixels;
+  cmd->out_stride = out_stride;
+  cmd->out_format = out_format;
+  cmd->out_result = &result;
+  cmd->postfx = postfx;
+  PushCommandAndSync(cmd, false);
+
+  return result;
 }
 
-void GPUBackend::PushCommand(GPUBackendCommand* cmd)
-{
-  if (!m_use_gpu_thread)
-  {
-    // single-thread mode
-    if (cmd->type != GPUBackendCommandType::Sync)
-      HandleCommand(cmd);
-  }
-  else
-  {
-    const u32 new_write_ptr = m_command_fifo_write_ptr.fetch_add(cmd->size) + cmd->size;
-    DebugAssert(new_write_ptr <= COMMAND_QUEUE_SIZE);
-    UNREFERENCED_VARIABLE(new_write_ptr);
-    if (GetPendingCommandSize() >= THRESHOLD_TO_WAKE_GPU)
-      WakeGPUThread();
-  }
-}
-
-void GPUBackend::WakeGPUThread()
-{
-  std::unique_lock<std::mutex> lock(m_sync_mutex);
-  if (!m_gpu_thread_sleeping.load())
-    return;
-
-  m_wake_gpu_thread_cv.notify_one();
-}
-
-void GPUBackend::StartGPUThread()
-{
-  m_gpu_loop_done.store(false);
-  m_use_gpu_thread = true;
-  m_gpu_thread.Start([this]() { RunGPULoop(); });
-  INFO_LOG("GPU thread started.");
-}
-
-void GPUBackend::StopGPUThread()
-{
-  if (!m_use_gpu_thread)
-    return;
-
-  m_gpu_loop_done.store(true);
-  WakeGPUThread();
-  m_gpu_thread.Join();
-  m_use_gpu_thread = false;
-  INFO_LOG("GPU thread stopped.");
-}
-
-void GPUBackend::Sync(bool allow_sleep)
-{
-  if (!m_use_gpu_thread)
-    return;
-
-  GPUBackendSyncCommand* cmd =
-    static_cast<GPUBackendSyncCommand*>(AllocateCommand(GPUBackendCommandType::Sync, sizeof(GPUBackendSyncCommand)));
-  cmd->allow_sleep = allow_sleep;
-  PushCommand(cmd);
-  WakeGPUThread();
-
-  m_sync_semaphore.Wait();
-}
-
-void GPUBackend::RunGPULoop()
-{
-  static constexpr double SPIN_TIME_NS = 1 * 1000000;
-  Common::Timer::Value last_command_time = 0;
-
-  for (;;)
-  {
-    u32 write_ptr = m_command_fifo_write_ptr.load();
-    u32 read_ptr = m_command_fifo_read_ptr.load();
-    if (read_ptr == write_ptr)
-    {
-      const Common::Timer::Value current_time = Common::Timer::GetCurrentValue();
-      if (Common::Timer::ConvertValueToNanoseconds(current_time - last_command_time) < SPIN_TIME_NS)
-        continue;
-
-      std::unique_lock<std::mutex> lock(m_sync_mutex);
-      m_gpu_thread_sleeping.store(true);
-      m_wake_gpu_thread_cv.wait(lock, [this]() { return m_gpu_loop_done.load() || GetPendingCommandSize() > 0; });
-      m_gpu_thread_sleeping.store(false);
-
-      if (m_gpu_loop_done.load())
-        break;
-      else
-        continue;
-    }
-
-    if (write_ptr < read_ptr)
-      write_ptr = COMMAND_QUEUE_SIZE;
-
-    bool allow_sleep = false;
-    while (read_ptr < write_ptr)
-    {
-      const GPUBackendCommand* cmd = reinterpret_cast<const GPUBackendCommand*>(&m_command_fifo_data[read_ptr]);
-      read_ptr += cmd->size;
-
-      switch (cmd->type)
-      {
-        case GPUBackendCommandType::Wraparound:
-        {
-          DebugAssert(read_ptr == COMMAND_QUEUE_SIZE);
-          write_ptr = m_command_fifo_write_ptr.load();
-          read_ptr = 0;
-        }
-        break;
-
-        case GPUBackendCommandType::Sync:
-        {
-          DebugAssert(read_ptr == write_ptr);
-          m_sync_semaphore.Post();
-          allow_sleep = static_cast<const GPUBackendSyncCommand*>(cmd)->allow_sleep;
-        }
-        break;
-
-        default:
-          HandleCommand(cmd);
-          break;
-      }
-    }
-
-    last_command_time = allow_sleep ? 0 : Common::Timer::GetCurrentValue();
-    m_command_fifo_read_ptr.store(read_ptr);
-  }
-}
-
-void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
+void GPUBackend::HandleCommand(const GPUThreadCommand* cmd)
 {
   switch (cmd->type)
   {
+    case GPUBackendCommandType::ClearVRAM:
+    {
+      ClearVRAM();
+    }
+    break;
+
+    case GPUBackendCommandType::LoadState:
+    {
+      LoadState(static_cast<const GPUBackendLoadStateCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::ClearDisplay:
+    {
+      ClearDisplay();
+    }
+    break;
+
+    case GPUBackendCommandType::UpdateDisplay:
+    {
+      HandleUpdateDisplayCommand(static_cast<const GPUBackendUpdateDisplayCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::ClearCache:
+    {
+      ClearCache();
+    }
+    break;
+
+    case GPUBackendCommandType::BufferSwapped:
+    {
+      OnBufferSwapped();
+    }
+    break;
+
+    case GPUBackendCommandType::UpdateResolutionScale:
+    {
+      UpdateResolutionScale();
+    }
+    break;
+
+    case GPUBackendCommandType::RenderScreenshotToBuffer:
+    {
+      HandleRenderScreenshotToBuffer(static_cast<const GPUThreadRenderScreenshotToBufferCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::RenderScreenshotToFile:
+    {
+      HandleRenderScreenshotToFile(static_cast<const GPUThreadRenderScreenshotToFileCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::ReadVRAM:
+    {
+      const GPUBackendReadVRAMCommand* ccmd = static_cast<const GPUBackendReadVRAMCommand*>(cmd);
+      s_counters.num_reads++;
+      ReadVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height));
+    }
+    break;
+
     case GPUBackendCommandType::FillVRAM:
     {
-      FlushRender();
       const GPUBackendFillVRAMCommand* ccmd = static_cast<const GPUBackendFillVRAMCommand*>(cmd);
       FillVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
                ccmd->color, ccmd->params);
@@ -293,8 +402,8 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
 
     case GPUBackendCommandType::UpdateVRAM:
     {
-      FlushRender();
       const GPUBackendUpdateVRAMCommand* ccmd = static_cast<const GPUBackendUpdateVRAMCommand*>(cmd);
+      s_counters.num_writes++;
       UpdateVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
                  ccmd->data, ccmd->params);
     }
@@ -302,8 +411,8 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
 
     case GPUBackendCommandType::CopyVRAM:
     {
-      FlushRender();
       const GPUBackendCopyVRAMCommand* ccmd = static_cast<const GPUBackendCopyVRAMCommand*>(cmd);
+      s_counters.num_copies++;
       CopyVRAM(ZeroExtend32(ccmd->src_x), ZeroExtend32(ccmd->src_y), ZeroExtend32(ccmd->dst_x),
                ZeroExtend32(ccmd->dst_y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height), ccmd->params);
     }
@@ -313,7 +422,8 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
     {
       FlushRender();
       const GPUBackendSetDrawingAreaCommand* ccmd = static_cast<const GPUBackendSetDrawingAreaCommand*>(cmd);
-      DrawingAreaChanged(ccmd->new_area, GSVector4i::load<false>(ccmd->new_clamped_area));
+      GPU_SW_Rasterizer::g_drawing_area = ccmd->new_area;
+      DrawingAreaChanged();
     }
     break;
 
@@ -326,23 +436,1155 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
 
     case GPUBackendCommandType::DrawPolygon:
     {
-      DrawPolygon(static_cast<const GPUBackendDrawPolygonCommand*>(cmd));
+      const GPUBackendDrawPolygonCommand* ccmd = static_cast<const GPUBackendDrawPolygonCommand*>(cmd);
+      s_counters.num_vertices += ccmd->num_vertices;
+      s_counters.num_primitives++;
+      DrawPolygon(ccmd);
+    }
+    break;
+
+    case GPUBackendCommandType::DrawPrecisePolygon:
+    {
+      const GPUBackendDrawPrecisePolygonCommand* ccmd = static_cast<const GPUBackendDrawPrecisePolygonCommand*>(cmd);
+      s_counters.num_vertices += ccmd->num_vertices;
+      s_counters.num_primitives++;
+      DrawPrecisePolygon(ccmd);
+    }
+    break;
 
     case GPUBackendCommandType::DrawRectangle:
     {
-      DrawRectangle(static_cast<const GPUBackendDrawRectangleCommand*>(cmd));
+      const GPUBackendDrawRectangleCommand* ccmd = static_cast<const GPUBackendDrawRectangleCommand*>(cmd);
+      s_counters.num_vertices++;
+      s_counters.num_primitives++;
+      DrawSprite(ccmd);
     }
     break;
 
     case GPUBackendCommandType::DrawLine:
     {
-      DrawLine(static_cast<const GPUBackendDrawLineCommand*>(cmd));
+      const GPUBackendDrawLineCommand* ccmd = static_cast<const GPUBackendDrawLineCommand*>(cmd);
+      s_counters.num_vertices += ccmd->num_vertices;
+      s_counters.num_primitives += ccmd->num_vertices / 2;
+      DrawLine(ccmd);
     }
     break;
 
+      DefaultCaseIsUnreachable();
+  }
+}
+
+bool GPUBackend::CompileDisplayPipelines(bool display, bool deinterlace, bool chroma_smoothing, Error* error)
+{
+  const GPUShaderGen shadergen(g_gpu_device->GetRenderAPI(), g_gpu_device->GetFeatures().dual_source_blend,
+                               g_gpu_device->GetFeatures().framebuffer_fetch);
+
+  GPUPipeline::GraphicsConfig plconfig;
+  plconfig.input_layout.vertex_stride = 0;
+  plconfig.primitive = GPUPipeline::Primitive::Triangles;
+  plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
+  plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
+  plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
+  plconfig.geometry_shader = nullptr;
+  plconfig.depth_format = GPUTexture::Format::Unknown;
+  plconfig.samples = 1;
+  plconfig.per_sample_shading = false;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
+
+  if (display)
+  {
+    plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
+    plconfig.SetTargetFormats(g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetFormat() :
+                                                                 GPUTexture::Format::RGBA8);
+    // Choose the shader from g_gpu_settings, the same settings object used for the object names below.
+    std::string vs = shadergen.GenerateDisplayVertexShader();
+    std::string fs;
+    switch (g_gpu_settings.display_scaling)
+    {
+      case DisplayScalingMode::BilinearSharp:
+        fs = shadergen.GenerateDisplaySharpBilinearFragmentShader();
+        break;
+
+      case DisplayScalingMode::BilinearSmooth:
+      case DisplayScalingMode::BilinearInteger:
+        fs = shadergen.GenerateDisplayFragmentShader(true, false);
+        break;
+
+      case DisplayScalingMode::Nearest:
+      case DisplayScalingMode::NearestInteger:
+      default:
+        fs = shadergen.GenerateDisplayFragmentShader(false, true);
+        break;
+    }
+
+    std::unique_ptr<GPUShader> vso =
+      g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs, error);
+    std::unique_ptr<GPUShader> fso =
+      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(), fs, error);
+    if (!vso || !fso)
+      return false;
+    GL_OBJECT_NAME(vso, "Display Vertex Shader");
+    GL_OBJECT_NAME_FMT(fso, "Display Fragment Shader [{}]",
+                       Settings::GetDisplayScalingName(g_gpu_settings.display_scaling));
+    plconfig.vertex_shader = vso.get();
+    plconfig.fragment_shader = fso.get();
+    if (!(m_display_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+      return false;
+    GL_OBJECT_NAME_FMT(m_display_pipeline, "Display Pipeline [{}]",
+                       Settings::GetDisplayScalingName(g_gpu_settings.display_scaling));
+  }
+
+  if (deinterlace)
+  {
+    plconfig.SetTargetFormats(GPUTexture::Format::RGBA8);
+
+    std::unique_ptr<GPUShader> vso = g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(),
+                                                                shadergen.GenerateScreenQuadVertexShader(), error);
+    if (!vso)
+      return false;
+    GL_OBJECT_NAME(vso, "Deinterlace Vertex Shader");
+
+    std::unique_ptr<GPUShader> fso;
+    if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
+                                           shadergen.GenerateInterleavedFieldExtractFragmentShader(), error)))
+    {
+      return false;
+    }
+
+    GL_OBJECT_NAME(fso, "Deinterlace Field Extract Fragment Shader");
+
+    plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
+    plconfig.vertex_shader = vso.get();
+    plconfig.fragment_shader = fso.get();
+    if (!(m_deinterlace_extract_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+      return false;
+
+    GL_OBJECT_NAME(m_deinterlace_extract_pipeline, "Deinterlace Field Extract Pipeline");
+
+    switch (g_gpu_settings.display_deinterlacing_mode)
+    {
+      case DisplayDeinterlacingMode::Disabled:
+      case DisplayDeinterlacingMode::Progressive:
+        break;
+
+      case DisplayDeinterlacingMode::Weave:
+      {
+        if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
+                                               shadergen.GenerateDeinterlaceWeaveFragmentShader(), error)))
+        {
+          return false;
+        }
+
+        GL_OBJECT_NAME(fso, "Weave Deinterlace Fragment Shader");
+
+        plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
+        plconfig.vertex_shader = vso.get();
+        plconfig.fragment_shader = fso.get();
+        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+          return false;
+
+        GL_OBJECT_NAME(m_deinterlace_pipeline, "Weave Deinterlace Pipeline");
+      }
+      break;
+
+      case DisplayDeinterlacingMode::Blend:
+      {
+        if (!(fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
+                                               shadergen.GenerateDeinterlaceBlendFragmentShader(), error)))
+        {
+          return false;
+        }
+
+        GL_OBJECT_NAME(fso, "Blend Deinterlace Fragment Shader");
+
+        plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
+        plconfig.vertex_shader = vso.get();
+        plconfig.fragment_shader = fso.get();
+        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+          return false;
+
+        GL_OBJECT_NAME(m_deinterlace_pipeline, "Blend Deinterlace Pipeline");
+      }
+      break;
+
+      case DisplayDeinterlacingMode::Adaptive:
+      {
+        fso = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
+                                         shadergen.GenerateFastMADReconstructFragmentShader(), error);
+        if (!fso)
+          return false;
+
+        GL_OBJECT_NAME(fso, "FastMAD Reconstruct Fragment Shader");
+
+        plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
+        plconfig.fragment_shader = fso.get();
+        if (!(m_deinterlace_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+          return false;
+
+        GL_OBJECT_NAME(m_deinterlace_pipeline, "FastMAD Reconstruct Pipeline");
+      }
+      break;
+
+      default:
+        UnreachableCode();
+    }
+  }
+
+  if (chroma_smoothing)
+  {
+    m_chroma_smoothing_pipeline.reset();
+    g_gpu_device->RecycleTexture(std::move(m_chroma_smoothing_texture));
+
+    if (g_gpu_settings.display_24bit_chroma_smoothing)
+    {
+      plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
+      plconfig.SetTargetFormats(GPUTexture::Format::RGBA8);
+
+      std::unique_ptr<GPUShader> vso = g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(),
+                                                                  shadergen.GenerateScreenQuadVertexShader(), error);
+      std::unique_ptr<GPUShader> fso = g_gpu_device->CreateShader(
+        GPUShaderStage::Fragment, shadergen.GetLanguage(), shadergen.GenerateChromaSmoothingFragmentShader(), error);
+      if (!vso || !fso)
+        return false;
+      GL_OBJECT_NAME(vso, "Chroma Smoothing Vertex Shader");
+      GL_OBJECT_NAME(fso, "Chroma Smoothing Fragment Shader");
+
+      plconfig.vertex_shader = vso.get();
+      plconfig.fragment_shader = fso.get();
+      if (!(m_chroma_smoothing_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
+        return false;
+      GL_OBJECT_NAME(m_chroma_smoothing_pipeline, "Chroma Smoothing Pipeline");
+    }
+  }
+
+  return true;
+}
+
+void GPUBackend::HandleUpdateDisplayCommand(const GPUBackendUpdateDisplayCommand* cmd)
+{
+  // Cache the display geometry carried by the command before handing it to UpdateDisplay().
+  m_display_width = cmd->display_width;
+  m_display_height = cmd->display_height;
+  m_display_origin_left = cmd->display_origin_left;
+  m_display_origin_top = cmd->display_origin_top;
+  m_display_vram_width = cmd->display_vram_width;
+  m_display_vram_height = cmd->display_vram_height;
+  m_display_aspect_ratio = cmd->display_aspect_ratio;
+
+  UpdateDisplay(cmd);
+
+  if (cmd->media_capture)
+    SendDisplayToMediaCapture(cmd->media_capture);
+
+  if (cmd->is_frame)
+    Host::FrameDoneOnGPUThread(this, cmd->frame_number);
+
+  if (cmd->present_frame)
+  {
+    GPUThread::Internal::PresentFrame(cmd->allow_present_skip, cmd->present_time);
+
+    m_queued_frames.fetch_sub(1, std::memory_order_acq_rel);
+
+    bool cpu_was_waiting = true;
+    if (m_waiting_for_gpu_thread.compare_exchange_strong(cpu_was_waiting, false, std::memory_order_acq_rel,
+                                                         std::memory_order_relaxed))
+    {
+      DEV_LOG("--> Unblocking CPU thread");
+      m_gpu_thread_wait.Post();
+    }
+  }
+
+  // Performance counters are updated only *after* throttling: the measurement should run
+  // from start-of-frame to start-of-frame rather than end-of-frame to end-of-frame, which
+  // would be noisy because each frame does a different amount of computation.
+  if (cmd->is_frame)
+    PerformanceCounters::Update(this, cmd->frame_number, cmd->internal_frame_number);
+}
+
+void GPUBackend::ClearDisplay()
+{
+  ClearDisplayTexture();
+
+  // Just recycle the textures, it'll get re-fetched.
+  DestroyDeinterlaceTextures();
+}
+
+void GPUBackend::ClearDisplayTexture()
+{
+  m_display_texture = nullptr;
+  m_display_texture_view_x = 0;
+  m_display_texture_view_y = 0;
+  m_display_texture_view_width = 0;
+  m_display_texture_view_height = 0;
+}
+
+void GPUBackend::SetDisplayTexture(GPUTexture* texture, GPUTexture* depth_buffer, s32 view_x, s32 view_y,
+                                   s32 view_width, s32 view_height)
+{
+  DebugAssert(texture);
+  // On a view-size change with auto-resize enabled, have the CPU thread re-request the display size.
+  if (g_gpu_settings.display_auto_resize_window &&
+      (view_width != m_display_texture_view_width || view_height != m_display_texture_view_height))
+  {
+    Host::RunOnCPUThread([]() { System::RequestDisplaySize(); });
+  }
+
+  m_display_texture = texture;
+  m_display_depth_buffer = depth_buffer;
+  m_display_texture_view_x = view_x;
+  m_display_texture_view_y = view_y;
+  m_display_texture_view_width = view_width;
+  m_display_texture_view_height = view_height;
+}
+
+// Flushes pending rendering and draws the display into the main swap chain.
+// Returns SkipPresent when the device has no main swap chain, otherwise the result of RenderDisplay().
+GPUDevice::PresentResult GPUBackend::PresentDisplay()
+{
+  FlushRender();
+
+  if (g_gpu_device->HasMainSwapChain())
+  {
+    // The VRAM debug view is drawn without rotation and without post-processing.
+    const bool not_showing_vram = !g_gpu_settings.debugging.show_vram;
+
+    GSVector4i display_rect, draw_rect;
+    CalculateDrawRect(g_gpu_device->GetMainSwapChain()->GetWidth(), g_gpu_device->GetMainSwapChain()->GetHeight(),
+                      not_showing_vram, true, &display_rect, &draw_rect);
+    return RenderDisplay(nullptr, display_rect, draw_rect, not_showing_vram);
+  }
+
+  return GPUDevice::PresentResult::SkipPresent;
+}
+
+// Draws the current display texture into `target`, or into the main swap chain when `target` is null.
+//
+//   target       - render target to draw into; nullptr presents to the main swap chain.
+//   display_rect - rectangle handed to the display post-processing chain.
+//   draw_rect    - rectangle used as viewport/scissor for the display texture draw.
+//   postfx       - allows the internal and display post-processing chains to run when they are active.
+//
+// Returns PresentResult::OK once the draw has been issued, otherwise the non-OK result reported by
+// BeginPresent() or by one of the post-processing chains.
+GPUDevice::PresentResult GPUBackend::RenderDisplay(GPUTexture* target, const GSVector4i display_rect,
+                                                   const GSVector4i draw_rect, bool postfx)
+{
+  GL_SCOPE_FMT("RenderDisplay: {}", draw_rect);
+
+  if (m_display_texture)
+    m_display_texture->MakeReadyForSampling();
+
+  // Internal post-processing.
+  GPUTexture* display_texture = m_display_texture;
+  s32 display_texture_view_x = m_display_texture_view_x;
+  s32 display_texture_view_y = m_display_texture_view_y;
+  s32 display_texture_view_width = m_display_texture_view_width;
+  s32 display_texture_view_height = m_display_texture_view_height;
+  if (postfx && display_texture && PostProcessing::InternalChain.IsActive() &&
+      PostProcessing::InternalChain.CheckTargets(DISPLAY_INTERNAL_POSTFX_FORMAT, display_texture_view_width,
+                                                 display_texture_view_height))
+  {
+    // The internal chain is applied to the whole texture, so the view must cover it exactly.
+    DebugAssert(display_texture_view_x == 0 && display_texture_view_y == 0 &&
+                static_cast<s32>(display_texture->GetWidth()) == display_texture_view_width &&
+                static_cast<s32>(display_texture->GetHeight()) == display_texture_view_height);
+
+    // Now we can apply the post chain.
+    GPUTexture* post_output_texture = PostProcessing::InternalChain.GetOutputTexture();
+    if (const GPUDevice::PresentResult pres = PostProcessing::InternalChain.Apply(
+          display_texture, m_display_depth_buffer, post_output_texture,
+          GSVector4i(0, 0, display_texture_view_width, display_texture_view_height), display_texture_view_width,
+          display_texture_view_height, m_display_width, m_display_height);
+        pres != GPUDevice::PresentResult::OK)
+    {
+      return pres;
+    }
+    else
+    {
+      // From here on, sample the post-processed output instead of the raw display texture.
+      display_texture_view_x = 0;
+      display_texture_view_y = 0;
+      display_texture = post_output_texture;
+      display_texture->MakeReadyForSampling();
+    }
+  }
+
+  const GPUTexture::Format hdformat = target ? target->GetFormat() : g_gpu_device->GetMainSwapChain()->GetFormat();
+  const u32 target_width = target ? target->GetWidth() : g_gpu_device->GetMainSwapChain()->GetWidth();
+  const u32 target_height = target ? target->GetHeight() : g_gpu_device->GetMainSwapChain()->GetHeight();
+
+  // The display chain is only used when a main swap chain exists. The previous negated check disabled it
+  // for every presentation, since a null target already implies that the swap chain is present.
+  const bool really_postfx = (postfx && PostProcessing::DisplayChain.IsActive() && g_gpu_device->HasMainSwapChain() &&
+                              hdformat != GPUTexture::Format::Unknown && target_width > 0 && target_height > 0 &&
+                              PostProcessing::DisplayChain.CheckTargets(hdformat, target_width, target_height));
+  const GSVector4i real_draw_rect =
+    g_gpu_device->UsesLowerLeftOrigin() ? GPUDevice::FlipToLowerLeft(draw_rect, target_height) : draw_rect;
+  if (really_postfx)
+  {
+    // Draw into the chain's input texture; the chain writes to the real target afterwards.
+    g_gpu_device->ClearRenderTarget(PostProcessing::DisplayChain.GetInputTexture(), GPUDevice::DEFAULT_CLEAR_COLOR);
+    g_gpu_device->SetRenderTarget(PostProcessing::DisplayChain.GetInputTexture());
+  }
+  else
+  {
+    if (target)
+    {
+      g_gpu_device->SetRenderTarget(target);
+    }
+    else
+    {
+      const GPUDevice::PresentResult pres = g_gpu_device->BeginPresent(g_gpu_device->GetMainSwapChain());
+      if (pres != GPUDevice::PresentResult::OK)
+        return pres;
+    }
+  }
+
+  if (display_texture)
+  {
+    bool texture_filter_linear = false;
+
+    // Uniform block pushed for the display pipeline.
+    struct Uniforms
+    {
+      float src_rect[4];
+      float src_size[4];
+      float clamp_rect[4];
+      float params[4];
+      float rotation_matrix[2][2];
+    } uniforms;
+    std::memset(uniforms.params, 0, sizeof(uniforms.params));
+
+    switch (g_gpu_settings.display_scaling)
+    {
+      case DisplayScalingMode::Nearest:
+      case DisplayScalingMode::NearestInteger:
+        break;
+
+      case DisplayScalingMode::BilinearSmooth:
+      case DisplayScalingMode::BilinearInteger:
+        texture_filter_linear = true;
+        break;
+
+      case DisplayScalingMode::BilinearSharp:
+      {
+        // params[0..1] hold the integer scale factor (at least 1), params[2..3] are derived from it.
+        texture_filter_linear = true;
+        uniforms.params[0] = std::max(
+          std::floor(static_cast<float>(draw_rect.width()) / static_cast<float>(m_display_texture_view_width)), 1.0f);
+        uniforms.params[1] = std::max(
+          std::floor(static_cast<float>(draw_rect.height()) / static_cast<float>(m_display_texture_view_height)), 1.0f);
+        uniforms.params[2] = 0.5f - 0.5f / uniforms.params[0];
+        uniforms.params[3] = 0.5f - 0.5f / uniforms.params[1];
+      }
+      break;
+
+      default:
+        UnreachableCode();
+        break;
+    }
+
+    g_gpu_device->SetPipeline(m_display_pipeline.get());
+    g_gpu_device->SetTextureSampler(
+      0, display_texture, texture_filter_linear ? g_gpu_device->GetLinearSampler() : g_gpu_device->GetNearestSampler());
+
+    // For bilinear, clamp to 0.5/SIZE-0.5 to avoid bleeding from the adjacent texels in VRAM. This is because
+    // 1.0 in UV space is not the bottom-right texel, but a mix of the bottom-right and wrapped/next texel.
+    const float rcp_width = 1.0f / static_cast<float>(display_texture->GetWidth());
+    const float rcp_height = 1.0f / static_cast<float>(display_texture->GetHeight());
+    uniforms.src_rect[0] = static_cast<float>(display_texture_view_x) * rcp_width;
+    uniforms.src_rect[1] = static_cast<float>(display_texture_view_y) * rcp_height;
+    uniforms.src_rect[2] = static_cast<float>(display_texture_view_width) * rcp_width;
+    uniforms.src_rect[3] = static_cast<float>(display_texture_view_height) * rcp_height;
+    uniforms.clamp_rect[0] = (static_cast<float>(display_texture_view_x) + 0.5f) * rcp_width;
+    uniforms.clamp_rect[1] = (static_cast<float>(display_texture_view_y) + 0.5f) * rcp_height;
+    uniforms.clamp_rect[2] =
+      (static_cast<float>(display_texture_view_x + display_texture_view_width) - 0.5f) * rcp_width;
+    uniforms.clamp_rect[3] =
+      (static_cast<float>(display_texture_view_y + display_texture_view_height) - 0.5f) * rcp_height;
+    uniforms.src_size[0] = static_cast<float>(display_texture->GetWidth());
+    uniforms.src_size[1] = static_cast<float>(display_texture->GetHeight());
+    uniforms.src_size[2] = rcp_width;
+    uniforms.src_size[3] = rcp_height;
+
+    if (g_gpu_settings.display_rotation != DisplayRotation::Normal)
+    {
+      // Indexed by (rotation - 1), since Normal has no table entry.
+      static constexpr const std::array<float, static_cast<size_t>(DisplayRotation::Count) - 1> rotation_radians = {{
+        static_cast<float>(std::numbers::pi * 1.5f), // Rotate90
+        static_cast<float>(std::numbers::pi),        // Rotate180
+        static_cast<float>(std::numbers::pi / 2.0),  // Rotate270
+      }};
+
+      GSMatrix2x2::Rotation(rotation_radians[static_cast<size_t>(g_gpu_settings.display_rotation) - 1])
+        .store(uniforms.rotation_matrix);
+    }
+    else
+    {
+      GSMatrix2x2::Identity().store(uniforms.rotation_matrix);
+    }
+
+    g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
+
+    g_gpu_device->SetViewportAndScissor(real_draw_rect);
+    g_gpu_device->Draw(3, 0);
+  }
+
+  if (really_postfx)
+  {
+    DebugAssert(!g_gpu_settings.debugging.show_vram);
+
+    // "original size" in postfx includes padding.
+    const float upscale_x =
+      m_display_texture ? static_cast<float>(m_display_texture_view_width) / static_cast<float>(m_display_vram_width) :
+                          1.0f;
+    const float upscale_y = m_display_texture ? static_cast<float>(m_display_texture_view_height) /
+                                                  static_cast<float>(m_display_vram_height) :
+                                                1.0f;
+    const s32 orig_width = static_cast<s32>(std::ceil(static_cast<float>(m_display_width) * upscale_x));
+    const s32 orig_height = static_cast<s32>(std::ceil(static_cast<float>(m_display_height) * upscale_y));
+
+    return PostProcessing::DisplayChain.Apply(PostProcessing::DisplayChain.GetInputTexture(), nullptr, target,
+                                              display_rect, orig_width, orig_height, m_display_width, m_display_height);
+  }
+  else
+  {
+    return GPUDevice::PresentResult::OK;
+  }
+}
+
+// Renders the display into the capture's render texture and hands the frame to the capture.
+// If the texture is unavailable, or rendering/delivery fails, a stop of the media capture is queued
+// on the CPU thread.
+void GPUBackend::SendDisplayToMediaCapture(MediaCapture* cap)
+{
+  GPUTexture* const capture_target = cap->GetRenderTexture();
+  if (!capture_target) [[unlikely]]
+  {
+    WARNING_LOG("Failed to get video capture render texture.");
+    Host::RunOnCPUThread(&System::StopMediaCapture);
+    return;
+  }
+
+  // The screenshot mode decides whether aspect ratio correction and post-processing are applied.
+  const auto screenshot_mode = g_settings.display_screenshot_mode;
+  const bool apply_aspect_ratio = (screenshot_mode != DisplayScreenshotMode::UncorrectedInternalResolution);
+  const bool postfx = (screenshot_mode != DisplayScreenshotMode::InternalResolution);
+
+  GSVector4i display_rect;
+  GSVector4i draw_rect;
+  CalculateDrawRect(capture_target->GetWidth(), capture_target->GetHeight(), !g_settings.debugging.show_vram,
+                    apply_aspect_ratio, &display_rect, &draw_rect);
+
+  // Not cleared by RenderDisplay().
+  g_gpu_device->ClearRenderTarget(capture_target, GPUDevice::DEFAULT_CLEAR_COLOR);
+
+  // Delivery is only attempted when rendering succeeded.
+  const bool frame_delivered =
+    (RenderDisplay(capture_target, display_rect, draw_rect, postfx) == GPUDevice::PresentResult::OK) &&
+    cap->DeliverVideoFrame(capture_target);
+  if (!frame_delivered) [[unlikely]]
+  {
+    WARNING_LOG("Failed to render/deliver video capture frame.");
+    Host::RunOnCPUThread(&System::StopMediaCapture);
+  }
+}
+
+void GPUBackend::DestroyDeinterlaceTextures() // Hands every deinterlacing texture back to the device for recycling.
+{
+  for (std::unique_ptr<GPUTexture>& tex : m_deinterlace_buffers) // Field buffers filled by DeinterlaceExtractField().
+    g_gpu_device->RecycleTexture(std::move(tex));
+  g_gpu_device->RecycleTexture(std::move(m_deinterlace_texture)); // Output target sized by DeinterlaceSetTargetSize().
+  m_current_deinterlace_buffer = 0; // Restart the buffer rotation from the first slot.
+}
+
+bool GPUBackend::Deinterlace(u32 field, u32 line_skip) // Applies the configured deinterlacing mode to the display texture; false on failure.
+{
+  GPUTexture* src = m_display_texture; // The current display texture and view are the input for every mode.
+  const u32 x = m_display_texture_view_x;
+  const u32 y = m_display_texture_view_y;
+  const u32 width = m_display_texture_view_width;
+  const u32 height = m_display_texture_view_height;
+
+  switch (g_settings.display_deinterlacing_mode)
+  {
+    case DisplayDeinterlacingMode::Disabled:
+    {
+      if (line_skip == 0) // Nothing to do, the display texture is left untouched.
+        return true;
+
+      // Still have to extract the field.
+      if (!DeinterlaceExtractField(0, src, x, y, width, height, line_skip)) [[unlikely]]
+        return false;
+
+      SetDisplayTexture(m_deinterlace_buffers[0].get(), m_display_depth_buffer, 0, 0, width, height);
+      return true;
+    }
+
+    case DisplayDeinterlacingMode::Weave:
+    {
+      GL_SCOPE_FMT("DeinterlaceWeave({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field, line_skip);
+
+      const u32 full_height = height * 2; // Output is twice the height of the input view.
+      if (!DeinterlaceSetTargetSize(width, full_height, true)) [[unlikely]] // preserve=true is forwarded to ResizeTexture().
+      {
+        ClearDisplayTexture();
+        return false;
+      }
+
+      src->MakeReadyForSampling();
+
+      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
+      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
+      g_gpu_device->SetTextureSampler(0, src, g_gpu_device->GetNearestSampler());
+      const u32 uniforms[] = {x, y, field, line_skip};
+      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
+      g_gpu_device->SetViewportAndScissor(0, 0, width, full_height);
+      g_gpu_device->Draw(3, 0);
+
+      m_deinterlace_texture->MakeReadyForSampling();
+      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, full_height);
+      return true;
+    }
+
+    case DisplayDeinterlacingMode::Blend:
+    {
+      constexpr u32 NUM_BLEND_BUFFERS = 2; // Blend only rotates through the first two field buffers.
+
+      GL_SCOPE_FMT("DeinterlaceBlend({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field, line_skip);
+
+      const u32 this_buffer = m_current_deinterlace_buffer;
+      m_current_deinterlace_buffer = (m_current_deinterlace_buffer + 1u) % NUM_BLEND_BUFFERS;
+      GL_INS_FMT("Current buffer: {}", this_buffer);
+      if (!DeinterlaceExtractField(this_buffer, src, x, y, width, height, line_skip) ||
+          !DeinterlaceSetTargetSize(width, height, false)) [[unlikely]]
+      {
+        ClearDisplayTexture();
+        return false;
+      }
+
+      // TODO: could be implemented with alpha blending instead..
+
+      g_gpu_device->InvalidateRenderTarget(m_deinterlace_texture.get());
+      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
+      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
+      g_gpu_device->SetTextureSampler(0, m_deinterlace_buffers[this_buffer].get(), g_gpu_device->GetNearestSampler());
+      g_gpu_device->SetTextureSampler(1, m_deinterlace_buffers[(this_buffer - 1) % NUM_BLEND_BUFFERS].get(), // Unsigned wrap-around picks the other buffer.
+                                      g_gpu_device->GetNearestSampler());
+      g_gpu_device->SetViewportAndScissor(0, 0, width, height);
+      g_gpu_device->Draw(3, 0);
+
+      m_deinterlace_texture->MakeReadyForSampling();
+      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, height);
+      return true;
+    }
+
+    case DisplayDeinterlacingMode::Adaptive:
+    {
+      GL_SCOPE_FMT("DeinterlaceAdaptive({{{},{}}}, {}x{}, field={}, line_skip={})", x, y, width, height, field,
+                   line_skip);
+
+      const u32 full_height = height * 2; // Output is twice the height of the input view.
+      const u32 this_buffer = m_current_deinterlace_buffer;
+      m_current_deinterlace_buffer = (m_current_deinterlace_buffer + 1u) % DEINTERLACE_BUFFER_COUNT;
+      GL_INS_FMT("Current buffer: {}", this_buffer);
+      if (!DeinterlaceExtractField(this_buffer, src, x, y, width, height, line_skip) ||
+          !DeinterlaceSetTargetSize(width, full_height, false)) [[unlikely]]
+      {
+        ClearDisplayTexture();
+        return false;
+      }
+
+      g_gpu_device->SetRenderTarget(m_deinterlace_texture.get());
+      g_gpu_device->SetPipeline(m_deinterlace_pipeline.get());
+      g_gpu_device->SetTextureSampler(0, m_deinterlace_buffers[this_buffer].get(), g_gpu_device->GetNearestSampler()); // Slot 0: field extracted just now.
+      g_gpu_device->SetTextureSampler(1, m_deinterlace_buffers[(this_buffer - 1) % DEINTERLACE_BUFFER_COUNT].get(), // Slots 1-3: the preceding buffer indices, via unsigned wrap-around.
+                                      g_gpu_device->GetNearestSampler());
+      g_gpu_device->SetTextureSampler(2, m_deinterlace_buffers[(this_buffer - 2) % DEINTERLACE_BUFFER_COUNT].get(),
+                                      g_gpu_device->GetNearestSampler());
+      g_gpu_device->SetTextureSampler(3, m_deinterlace_buffers[(this_buffer - 3) % DEINTERLACE_BUFFER_COUNT].get(),
+                                      g_gpu_device->GetNearestSampler());
+      const u32 uniforms[] = {field, full_height};
+      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
+      g_gpu_device->SetViewportAndScissor(0, 0, width, full_height);
+      g_gpu_device->Draw(3, 0);
+
+      m_deinterlace_texture->MakeReadyForSampling();
+      SetDisplayTexture(m_deinterlace_texture.get(), m_display_depth_buffer, 0, 0, width, full_height);
+      return true;
+    }
+
     default:
       UnreachableCode();
   }
 }
+
+bool GPUBackend::DeinterlaceExtractField(u32 dst_bufidx, GPUTexture* src, u32 x, u32 y, u32 width, u32 height, // Copies the {x,y,width,height} view of src into field buffer dst_bufidx.
+                                         u32 line_skip) // Returns false only when the destination buffer could not be (re)allocated.
+{
+  if (!m_deinterlace_buffers[dst_bufidx] || m_deinterlace_buffers[dst_bufidx]->GetWidth() != width || // (Re)create the buffer when missing or of a different size.
+      m_deinterlace_buffers[dst_bufidx]->GetHeight() != height)
+  {
+    if (!g_gpu_device->ResizeTexture(&m_deinterlace_buffers[dst_bufidx], width, height, GPUTexture::Type::RenderTarget,
+                                     GPUTexture::Format::RGBA8, false)) [[unlikely]]
+    {
+      return false;
+    }
+
+    GL_OBJECT_NAME_FMT(m_deinterlace_buffers[dst_bufidx], "Blend Deinterlace Buffer {}", dst_bufidx);
+  }
+
+  GPUTexture* dst = m_deinterlace_buffers[dst_bufidx].get();
+  g_gpu_device->InvalidateRenderTarget(dst); // Both paths below write the full width x height area of dst.
+
+  // If we're not skipping lines, then we can simply copy the texture.
+  if (line_skip == 0 && src->GetFormat() == dst->GetFormat()) // The direct copy is only taken when the formats match as well.
+  {
+    GL_INS_FMT("DeinterlaceExtractField({{{},{}}} {}x{} line_skip={}) => copy direct", x, y, width, height, line_skip);
+    g_gpu_device->CopyTextureRegion(dst, 0, 0, 0, 0, src, x, y, 0, 0, width, height);
+  }
+  else
+  {
+    GL_SCOPE_FMT("DeinterlaceExtractField({{{},{}}} {}x{} line_skip={}) => shader copy", x, y, width, height,
+                 line_skip);
+
+    // Otherwise, we need to extract every other line from the texture.
+    src->MakeReadyForSampling();
+    g_gpu_device->SetRenderTarget(dst);
+    g_gpu_device->SetPipeline(m_deinterlace_extract_pipeline.get());
+    g_gpu_device->SetTextureSampler(0, src, g_gpu_device->GetNearestSampler());
+    const u32 uniforms[] = {x, y, line_skip};
+    g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
+    g_gpu_device->SetViewportAndScissor(0, 0, width, height);
+    g_gpu_device->Draw(3, 0);
+
+    GL_POP();
+  }
+
+  dst->MakeReadyForSampling(); // Callers bind this buffer as a sampled texture or display it directly.
+  return true;
+}
+
+// Makes sure the deinterlace output texture exists with the requested size.
+// `preserve` is forwarded to ResizeTexture() when the texture has to be (re)created.
+// Returns false if the resize failed, true otherwise.
+bool GPUBackend::DeinterlaceSetTargetSize(u32 width, u32 height, bool preserve)
+{
+  // Fast path: the existing texture already has the right dimensions.
+  const bool size_matches = (m_deinterlace_texture && m_deinterlace_texture->GetWidth() == width &&
+                             m_deinterlace_texture->GetHeight() == height);
+  if (size_matches)
+    return true;
+
+  if (!g_gpu_device->ResizeTexture(&m_deinterlace_texture, width, height, GPUTexture::Type::RenderTarget,
+                                   GPUTexture::Format::RGBA8, preserve)) [[unlikely]]
+  {
+    return false;
+  }
+
+  GL_OBJECT_NAME(m_deinterlace_texture, "Deinterlace target texture");
+  return true;
+}
+
+bool GPUBackend::ApplyChromaSmoothing() // Runs the chroma smoothing pipeline over the display view and displays the result.
+{
+  const u32 x = m_display_texture_view_x;
+  const u32 y = m_display_texture_view_y;
+  const u32 width = m_display_texture_view_width;
+  const u32 height = m_display_texture_view_height;
+  if (!m_chroma_smoothing_texture || m_chroma_smoothing_texture->GetWidth() != width || // (Re)create the output when missing or of a different size.
+      m_chroma_smoothing_texture->GetHeight() != height)
+  {
+    if (!g_gpu_device->ResizeTexture(&m_chroma_smoothing_texture, width, height, GPUTexture::Type::RenderTarget,
+                                     GPUTexture::Format::RGBA8, false))
+    {
+      ClearDisplayTexture(); // On failure the display texture is dropped and false is returned.
+      return false;
+    }
+
+    GL_OBJECT_NAME(m_chroma_smoothing_texture, "Chroma smoothing texture");
+  }
+
+  GL_SCOPE_FMT("ApplyChromaSmoothing({{{},{}}}, {}x{})", x, y, width, height);
+
+  m_display_texture->MakeReadyForSampling(); // NOTE(review): m_display_texture is dereferenced unchecked; callers presumably guarantee it is set - confirm.
+  g_gpu_device->InvalidateRenderTarget(m_chroma_smoothing_texture.get());
+  g_gpu_device->SetRenderTarget(m_chroma_smoothing_texture.get());
+  g_gpu_device->SetPipeline(m_chroma_smoothing_pipeline.get());
+  g_gpu_device->SetTextureSampler(0, m_display_texture, g_gpu_device->GetNearestSampler());
+  const u32 uniforms[] = {x, y, width - 1, height - 1}; // View origin, plus the last column/row index of the view (presumably for clamping in the shader).
+  g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
+  g_gpu_device->SetViewportAndScissor(0, 0, width, height);
+  g_gpu_device->Draw(3, 0);
+
+  m_chroma_smoothing_texture->MakeReadyForSampling();
+  SetDisplayTexture(m_chroma_smoothing_texture.get(), m_display_depth_buffer, 0, 0, width, height); // The smoothed copy becomes the display texture, viewed from (0,0).
+  return true;
+}
+
+void GPUBackend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) // Intentionally empty here: neither argument is used.
+{
+}
+
+// Computes the display and draw rectangles for a window/target of the given size by forwarding the current
+// display state and scaling settings to GPU::CalculateDrawRect().
+//
+// When the VRAM debug view is enabled, the whole VRAM_WIDTH x VRAM_HEIGHT area is used as the display, with
+// a zero origin and the VRAM's own aspect ratio, instead of the latched display geometry.
+//
+// NOTE(review): apply_rotation and apply_aspect_ratio are accepted but not consulted in this function; the
+// rotation always comes from g_gpu_settings.display_rotation. Confirm whether that is intended.
+void GPUBackend::CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio,
+                                   GSVector4i* display_rect, GSVector4i* draw_rect) const
+{
+  const bool integer_scale = (g_gpu_settings.display_scaling == DisplayScalingMode::NearestInteger ||
+                              g_gpu_settings.display_scaling == DisplayScalingMode::BilinearInteger);
+  const bool show_vram = g_gpu_settings.debugging.show_vram;
+  const u32 display_width = show_vram ? VRAM_WIDTH : m_display_width;
+  // The height must use VRAM_HEIGHT so it agrees with display_vram_height and the aspect ratio below.
+  const u32 display_height = show_vram ? VRAM_HEIGHT : m_display_height;
+  const s32 display_origin_left = show_vram ? 0 : m_display_origin_left;
+  const s32 display_origin_top = show_vram ? 0 : m_display_origin_top;
+  const u32 display_vram_width = show_vram ? VRAM_WIDTH : m_display_vram_width;
+  const u32 display_vram_height = show_vram ? VRAM_HEIGHT : m_display_vram_height;
+  const float display_aspect_ratio =
+    show_vram ? (static_cast<float>(VRAM_WIDTH) / static_cast<float>(VRAM_HEIGHT)) : m_display_aspect_ratio;
+  GPU::CalculateDrawRect(window_width, window_height, display_width, display_height, display_origin_left,
+                         display_origin_top, display_vram_width, display_vram_height, g_gpu_settings.display_rotation,
+                         display_aspect_ratio, g_gpu_settings.display_stretch_vertically, integer_scale, display_rect,
+                         draw_rect);
+}
+
+// Converts downloaded texture data to RGBA8 and saves it as an image through the already-open file `fp`.
+//
+//   width/height        - image dimensions in pixels.
+//   filename            - destination path; must contain a '.' so an extension can be located.
+//   fp                  - open file handle passed to the image writer.
+//   quality             - forwarded to RGBA8Image::SaveToFile().
+//   clear_alpha         - when set, forces the top byte of every pixel to 0xFF.
+//   flip_y              - when set, flips the image vertically before saving.
+//   texture_data        - pixel data; consumed by this function.
+//   texture_data_stride - row pitch of texture_data in bytes; must equal width * 4 after conversion.
+//   texture_format      - source format of texture_data.
+//   osd_key             - when non-empty, an OSD message reporting success/failure is posted under this key.
+//
+// Returns true if the image was written successfully.
+bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string filename, FileSystem::ManagedCFilePtr fp,
+                                   u8 quality, bool clear_alpha, bool flip_y, std::vector<u32> texture_data,
+                                   u32 texture_data_stride, GPUTexture::Format texture_format, std::string osd_key)
+{
+  bool result = false;
+
+  // Points into filename's buffer; filename is not modified below, so it stays valid.
+  const char* extension = std::strrchr(filename.c_str(), '.');
+  if (!extension)
+  {
+    ERROR_LOG("Unable to determine file extension for '{}'", filename);
+  }
+  else if (GPUTexture::ConvertTextureDataToRGBA8(width, height, texture_data, texture_data_stride, texture_format))
+  {
+    if (clear_alpha)
+    {
+      for (u32& pixel : texture_data)
+        pixel |= 0xFF000000u;
+    }
+
+    if (flip_y)
+      GPUTexture::FlipTextureDataRGBA8(width, height, reinterpret_cast<u8*>(texture_data.data()),
+                                       texture_data_stride);
+
+    // The image takes ownership of the pixel vector, which requires tightly packed rows.
+    Assert(texture_data_stride == sizeof(u32) * width);
+    RGBA8Image image(width, height, std::move(texture_data));
+    if (image.SaveToFile(filename.c_str(), fp.get(), quality))
+      result = true;
+    else
+      ERROR_LOG("Unknown extension in filename '{}' or save error: '{}'", filename, extension);
+  }
+
+  if (!osd_key.empty())
+  {
+    // The duration is the fourth argument of AddIconOSDMessage(). It used to sit inside the fmt::format()
+    // call, where it was an unused format argument and never reached the OSD.
+    Host::AddIconOSDMessage(std::move(osd_key), ICON_EMOJI_CAMERA,
+                            fmt::format(result ? TRANSLATE_FS("GPU", "Saved screenshot to '{}'.") :
+                                                 TRANSLATE_FS("GPU", "Failed to save screenshot to '{}'."),
+                                        Path::GetFileName(filename)),
+                            result ? Host::OSD_INFO_DURATION : Host::OSD_ERROR_DURATION);
+  }
+
+  return result;
+}
+
+// Downloads the visible part of the display texture and writes it to `filename` as an image.
+// Returns false when there is no display texture, or when the download, the file open or the save fails.
+bool GPUBackend::WriteDisplayTextureToFile(std::string filename)
+{
+  if (!m_display_texture)
+    return false;
+
+  const GPUTexture::Format format = m_display_texture->GetFormat();
+  const u32 src_x = static_cast<u32>(m_display_texture_view_x);
+  const u32 src_y = static_cast<u32>(m_display_texture_view_y);
+  const u32 src_width = static_cast<u32>(m_display_texture_view_width);
+  const u32 src_height = static_cast<u32>(m_display_texture_view_height);
+
+  // Rows are padded to a multiple of four bytes so the data can be stored in a vector of u32.
+  const u32 stride = Common::AlignUpPow2(GPUTexture::GetPixelSize(format) * src_width, 4);
+  std::vector<u32> pixels((stride * src_height) / sizeof(u32));
+
+  // Prefer a download texture that writes straight into our buffer, when the device supports it.
+  std::unique_ptr<GPUDownloadTexture> dltex;
+  if (g_gpu_device->GetFeatures().memory_import)
+  {
+    dltex = g_gpu_device->CreateDownloadTexture(src_width, src_height, format, pixels.data(),
+                                                pixels.size() * sizeof(u32), stride);
+  }
+  if (!dltex)
+    dltex = g_gpu_device->CreateDownloadTexture(src_width, src_height, format);
+  if (!dltex)
+  {
+    ERROR_LOG("Failed to create {}x{} {} download texture", src_width, src_height, GPUTexture::GetFormatName(format));
+    return false;
+  }
+
+  dltex->CopyFromTexture(0, 0, m_display_texture, src_x, src_y, src_width, src_height, 0, 0, !dltex->IsImported());
+
+  // The device context is restored regardless of whether the readback succeeded.
+  const bool read_ok = dltex->ReadTexels(0, 0, src_width, src_height, pixels.data(), stride);
+  RestoreDeviceContext();
+  if (!read_ok)
+    return false;
+
+  Error error;
+  auto fp = FileSystem::OpenManagedCFile(filename.c_str(), "wb", &error);
+  if (!fp)
+  {
+    ERROR_LOG("Can't open file '{}': {}", Path::GetFileName(filename), error.GetDescription());
+    return false;
+  }
+
+  constexpr bool clear_alpha = true;
+  const bool flip_y = g_gpu_device->UsesLowerLeftOrigin();
+
+  // No OSD key is passed, so no OSD message is posted for this path.
+  return CompressAndWriteTextureToFile(src_width, src_height, std::move(filename), std::move(fp),
+                                       g_settings.display_screenshot_quality, clear_alpha, flip_y, std::move(pixels),
+                                       stride, format, std::string());
+}
+
+void GPUBackend::HandleRenderScreenshotToBuffer(const GPUThreadRenderScreenshotToBufferCommand* cmd) // Renders a screenshot sized to the display rect and reports it through the command's out_* pointers.
+{
+  GSVector4i draw_rect, display_rect;
+  CalculateDrawRect(static_cast<s32>(cmd->width), static_cast<s32>(cmd->height), true, true, &display_rect, &draw_rect);
+
+  // Crop it. Both rectangles are rebased by display_rect's origin (assuming xyxy() replicates left/top).
+  const u32 width = static_cast<u32>(display_rect.width());
+  const u32 height = static_cast<u32>(display_rect.height());
+  draw_rect = draw_rect.sub32(display_rect.xyxy());
+  display_rect = display_rect.sub32(display_rect.xyxy());
+  *cmd->out_width = width; // The reported size is the cropped size, not the requested cmd->width/height.
+  *cmd->out_height = height;
+  *cmd->out_result = RenderScreenshotToBuffer(width, height, display_rect, draw_rect, cmd->postfx, cmd->out_pixels,
+                                              cmd->out_stride, cmd->out_format);
+
+  RestoreDeviceContext();
+}
+
+bool GPUBackend::RenderScreenshotToBuffer(u32 width, u32 height, const GSVector4i display_rect, // Renders the display into a temporary width x height target and reads it back into *out_pixels.
+                                          const GSVector4i draw_rect, bool postfx, std::vector<u32>* out_pixels,
+                                          u32* out_stride, GPUTexture::Format* out_format) // out_stride/out_format are only written on success.
+{
+  const GPUTexture::Format hdformat = // Match the swap chain format when one exists, otherwise fall back to RGBA8.
+    g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetFormat() : GPUTexture::Format::RGBA8;
+
+  auto render_texture =
+    g_gpu_device->FetchAutoRecycleTexture(width, height, 1, 1, 1, GPUTexture::Type::RenderTarget, hdformat);
+  if (!render_texture)
+    return false;
+
+  g_gpu_device->ClearRenderTarget(render_texture.get(), GPUDevice::DEFAULT_CLEAR_COLOR);
+
+  // TODO: this should use copy shader instead. NOTE(review): the RenderDisplay() result is not checked here.
+  RenderDisplay(render_texture.get(), display_rect, draw_rect, postfx);
+
+  const u32 stride = Common::AlignUpPow2(GPUTexture::GetPixelSize(hdformat) * width, sizeof(u32)); // Row pitch in bytes, padded to a whole number of u32.
+  out_pixels->resize((height * stride) / sizeof(u32));
+
+  std::unique_ptr<GPUDownloadTexture> dltex;
+  if (g_gpu_device->GetFeatures().memory_import) // Try a download texture backed directly by out_pixels first.
+  {
+    dltex = g_gpu_device->CreateDownloadTexture(width, height, hdformat, out_pixels->data(),
+                                                out_pixels->size() * sizeof(u32), stride);
+  }
+  if (!dltex) // Fall back to a regular download texture.
+  {
+    if (!(dltex = g_gpu_device->CreateDownloadTexture(width, height, hdformat)))
+    {
+      ERROR_LOG("Failed to create {}x{} download texture", width, height);
+      RestoreDeviceContext();
+      return false;
+    }
+  }
+
+  dltex->CopyFromTexture(0, 0, render_texture.get(), 0, 0, width, height, 0, 0, false);
+  if (!dltex->ReadTexels(0, 0, width, height, out_pixels->data(), stride))
+  {
+    RestoreDeviceContext();
+    return false;
+  }
+
+  *out_stride = stride;
+  *out_format = hdformat;
+  RestoreDeviceContext();
+  return true;
+}
+
+// Determines the output size and the display/draw rectangles for a screenshot taken in `mode`.
+//
+// The size starts out as the main swap chain size (1x1 when there is no swap chain). For the internal
+// resolution modes (and whenever the VRAM debug view is shown) it is replaced by a size derived from the
+// display texture view, and both rectangles then cover the whole output with no padding.
+//
+//   mode         - requested screenshot mode.
+//   width/height - receive the screenshot size in pixels.
+//   display_rect - receives the display rectangle.
+//   draw_rect    - receives the draw rectangle.
+void GPUBackend::CalculateScreenshotSize(DisplayScreenshotMode mode, u32* width, u32* height, GSVector4i* display_rect,
+                                         GSVector4i* draw_rect) const
+{
+  *width = g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetWidth() : 1;
+  *height = g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetHeight() : 1;
+  CalculateDrawRect(*width, *height, true, !g_gpu_settings.debugging.show_vram, display_rect, draw_rect);
+
+  const bool internal_resolution =
+    (mode != DisplayScreenshotMode::ScreenResolution || g_gpu_settings.debugging.show_vram);
+  if (internal_resolution && m_display_texture_view_width != 0 && m_display_texture_view_height != 0)
+  {
+    if (mode == DisplayScreenshotMode::InternalResolution)
+    {
+      const u32 draw_width = static_cast<u32>(display_rect->width());
+      const u32 draw_height = static_cast<u32>(display_rect->height());
+
+      // If internal res, scale the computed draw rectangle to the internal res.
+      // We re-use the draw rect because it's already been AR corrected.
+      const float sar =
+        static_cast<float>(m_display_texture_view_width) / static_cast<float>(m_display_texture_view_height);
+      const float dar = static_cast<float>(draw_width) / static_cast<float>(draw_height);
+      if (sar >= dar)
+      {
+        // stretch height, preserve width
+        const float scale = static_cast<float>(m_display_texture_view_width) / static_cast<float>(draw_width);
+        *width = m_display_texture_view_width;
+        *height = static_cast<u32>(std::round(static_cast<float>(draw_height) * scale));
+      }
+      else
+      {
+        // stretch width, preserve height
+        const float scale = static_cast<float>(m_display_texture_view_height) / static_cast<float>(draw_height);
+        *width = static_cast<u32>(std::round(static_cast<float>(draw_width) * scale));
+        *height = m_display_texture_view_height;
+      }
+
+      // DX11 won't go past 16K texture size.
+      // In both clamps the other dimension is scaled using the *unclamped* value, so the clamped
+      // dimension must be overwritten last; otherwise the ratio is always 1 and nothing is scaled.
+      const u32 max_texture_size = g_gpu_device->GetMaxTextureSize();
+      if (*width > max_texture_size)
+      {
+        *height = static_cast<u32>(static_cast<float>(*height) /
+                                   (static_cast<float>(*width) / static_cast<float>(max_texture_size)));
+        *width = max_texture_size;
+      }
+      if (*height > max_texture_size)
+      {
+        *width = static_cast<u32>(static_cast<float>(*width) /
+                                  (static_cast<float>(*height) / static_cast<float>(max_texture_size)));
+        *height = max_texture_size;
+      }
+    }
+    else // if (mode == DisplayScreenshotMode::UncorrectedInternalResolution)
+    {
+      *width = m_display_texture_view_width;
+      *height = m_display_texture_view_height;
+    }
+
+    // Remove padding, it's not part of the framebuffer.
+    *draw_rect = GSVector4i(0, 0, static_cast<s32>(*width), static_cast<s32>(*height));
+    *display_rect = *draw_rect;
+  }
+}
+
+// Queues a "render screenshot to file" command for the GPU thread and wakes it.
+// The path is copied into the command, which is allocated with path.length() extra bytes to hold it.
+void GPUBackend::RenderScreenshotToFile(const std::string_view path, DisplayScreenshotMode mode, u8 quality,
+                                        bool compress_on_thread, bool show_osd_message)
+{
+  const u32 path_length = static_cast<u32>(path.length());
+
+  GPUThreadRenderScreenshotToFileCommand* const cmd =
+    static_cast<GPUThreadRenderScreenshotToFileCommand*>(GPUThread::AllocateCommand(
+      GPUBackendCommandType::RenderScreenshotToFile, sizeof(GPUThreadRenderScreenshotToFileCommand) + path_length));
+
+  cmd->mode = mode;
+  cmd->quality = quality;
+  cmd->compress_on_thread = compress_on_thread;
+  cmd->show_osd_message = show_osd_message;
+
+  // The path is stored as length + bytes; no terminator is written here.
+  cmd->path_length = path_length;
+  std::memcpy(cmd->path, path.data(), cmd->path_length);
+
+  GPUThread::PushCommandAndWakeThread(cmd);
+}
+
+/// Handles a RenderScreenshotToFile command: renders the display and writes it to the path stored in the command.
+/// The size comes from CalculateScreenshotSize() for cmd->mode, the pixels from RenderScreenshotToBuffer(), and the
+/// encode/write is done by CompressAndWriteTextureToFile(), either inline or queued via System::QueueTaskOnThread()
+/// when cmd->compress_on_thread is set. A zero-sized result, a failed render, or a failed file open abandons the
+/// request (the latter two log an error).
+void GPUBackend::HandleRenderScreenshotToFile(const GPUThreadRenderScreenshotToFileCommand* cmd)
+{
+  // Not const on purpose: ownership is handed off with std::move() at the end, and std::move() on a const
+  // std::string silently falls back to a copy.
+  std::string path(cmd->path, cmd->path_length);
+
+  u32 width, height;
+  GSVector4i display_rect, draw_rect;
+  CalculateScreenshotSize(cmd->mode, &width, &height, &display_rect, &draw_rect);
+
+  // Only screen-resolution screenshots are rendered with post-processing (the postfx argument below).
+  const bool internal_resolution = (cmd->mode != DisplayScreenshotMode::ScreenResolution);
+  if (width == 0 || height == 0)
+    return;
+
+  std::vector<u32> pixels;
+  u32 pixels_stride;
+  GPUTexture::Format pixels_format;
+  if (!RenderScreenshotToBuffer(width, height, display_rect, draw_rect, !internal_resolution, &pixels, &pixels_stride,
+                                &pixels_format))
+  {
+    ERROR_LOG("Failed to render {}x{} screenshot", width, height);
+    return;
+  }
+
+  Error error;
+  auto fp = FileSystem::OpenManagedCFile(path.c_str(), "wb", &error);
+  if (!fp)
+  {
+    ERROR_LOG("Can't open file '{}': {}", Path::GetFileName(path), error.GetDescription());
+    return;
+  }
+
+  std::string osd_key;
+  if (cmd->show_osd_message)
+  {
+    // Use a 60 second timeout to give it plenty of time to actually save.
+    osd_key = fmt::format("ScreenshotSaver_{}", path);
+    Host::AddIconOSDMessage(osd_key, ICON_EMOJI_CAMERA_WITH_FLASH,
+                            fmt::format(TRANSLATE_FS("GPU", "Saving screenshot to '{}'."), Path::GetFileName(path)),
+                            60.0f);
+  }
+
+  // path, pixels and osd_key are not touched again after this point, so they are moved rather than copied.
+  if (cmd->compress_on_thread)
+  {
+    // The file handle crosses into the task as a raw pointer (fp.release()) and is re-wrapped in a
+    // ManagedCFilePtr inside the task.
+    System::QueueTaskOnThread([width, height, path = std::move(path), fp = fp.release(), quality = cmd->quality,
+                               flip_y = g_gpu_device->UsesLowerLeftOrigin(), pixels = std::move(pixels), pixels_stride,
+                               pixels_format, osd_key = std::move(osd_key)]() mutable {
+      CompressAndWriteTextureToFile(width, height, std::move(path), FileSystem::ManagedCFilePtr(fp), quality, true,
+                                    flip_y, std::move(pixels), pixels_stride, pixels_format, std::move(osd_key));
+      System::RemoveSelfFromTaskThreads();
+    });
+  }
+  else
+  {
+    CompressAndWriteTextureToFile(width, height, std::move(path), std::move(fp), cmd->quality, true,
+                                  g_gpu_device->UsesLowerLeftOrigin(), std::move(pixels), pixels_stride, pixels_format,
+                                  std::move(osd_key));
+  }
+}
+
+/// Writes a one-line statistics summary into str.
+/// Both variants start with the render API name, followed by "-MT" when g_gpu_settings.gpu_use_thread is set.
+/// HW legend: P = primitives, DC = host draws, B = host barriers, RP = host render passes, RB = host downloads,
+/// C = copies, W = writes. SW legend: P = primitives, R = reads, C = copies, W = writes.
+void GPUBackend::GetStatsString(SmallStringBase& str) const
+{
+  const auto api_name = GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI());
+  const char* const thread_suffix = g_gpu_settings.gpu_use_thread ? "-MT" : "";
+
+  if (!IsUsingHardwareBackend())
+  {
+    str.format("{}{} SW | {} P | {} R | {} C | {} W", api_name, thread_suffix, s_stats.num_primitives,
+               s_stats.num_reads, s_stats.num_copies, s_stats.num_writes);
+    return;
+  }
+
+  str.format("{}{} HW | {} P | {} DC | {} B | {} RP | {} RB | {} C | {} W", api_name, thread_suffix,
+             s_stats.num_primitives, s_stats.host_num_draws, s_stats.host_num_barriers,
+             s_stats.host_num_render_passes, s_stats.host_num_downloads, s_stats.num_copies, s_stats.num_writes);
+}
+
+/// Writes a one-line memory summary into str: the device's VRAM usage divided by 1048576, the streamed buffer
+/// statistic divided by 1024 (both rounded up), then the host copy and upload counts from s_stats.
+void GPUBackend::GetMemoryStatsString(SmallStringBase& str) const
+{
+  constexpr int MB_DIVISOR = 1048576;
+  constexpr int KB_DIVISOR = 1024;
+
+  // Adding (divisor - 1) before dividing rounds up to the next whole unit.
+  const u32 vram_usage_mb = static_cast<u32>((g_gpu_device->GetVRAMUsage() + (MB_DIVISOR - 1)) / MB_DIVISOR);
+  const u32 stream_kb = static_cast<u32>((s_stats.host_buffer_streamed + (KB_DIVISOR - 1)) / KB_DIVISOR);
+
+  str.format("{} MB VRAM | {} KB STR | {} TC | {} TU", vram_usage_mb, stream_kb, s_stats.host_num_copies,
+             s_stats.host_num_uploads);
+}
+
+/// Clears the backend's accumulated counters (s_counters) and asks the GPU device to reset its own statistics.
+/// The published values in s_stats are left as they are.
+void GPUBackend::ResetStatistics()
+{
+  s_counters = {};
+  g_gpu_device->ResetStatistics();
+}
+
+/// Publishes per-frame averages into s_stats and then resets the accumulators.
+/// Each value is the accumulated total (from s_counters, or from the GPU device's statistics for the host_*
+/// entries) divided by the number of frames, rounded up.
+/// @param frame_count Number of frames the totals were accumulated over; zero is treated as one frame.
+void GPUBackend::UpdateStatistics(u32 frame_count)
+{
+  // Guard the divisor: with frame_count == 0 the rounding term below would wrap to 0xFFFFFFFF and every
+  // average would divide by zero.
+  const u32 num_frames = (frame_count > 0) ? frame_count : 1;
+
+  const GPUDevice::Statistics& stats = g_gpu_device->GetStatistics();
+
+  // Adding (num_frames - 1) before dividing makes the integer division round up.
+  const u32 round = (num_frames - 1);
+
+#define UPDATE_COUNTER(x) s_stats.x = (s_counters.x + round) / num_frames
+#define UPDATE_GPU_STAT(x) s_stats.host_##x = (stats.x + round) / num_frames
+
+  UPDATE_COUNTER(num_reads);
+  UPDATE_COUNTER(num_writes);
+  UPDATE_COUNTER(num_copies);
+  UPDATE_COUNTER(num_vertices);
+  UPDATE_COUNTER(num_primitives);
+
+  // UPDATE_COUNTER(num_read_texture_updates);
+  // UPDATE_COUNTER(num_ubo_updates);
+
+  UPDATE_GPU_STAT(buffer_streamed);
+  UPDATE_GPU_STAT(num_draws);
+  UPDATE_GPU_STAT(num_barriers);
+  UPDATE_GPU_STAT(num_render_passes);
+  UPDATE_GPU_STAT(num_copies);
+  UPDATE_GPU_STAT(num_downloads);
+  UPDATE_GPU_STAT(num_uploads);
+
+#undef UPDATE_GPU_STAT
+#undef UPDATE_COUNTER
+
+  // Start the next accumulation window from zero.
+  ResetStatistics();
+}
diff --git a/src/core/gpu_backend.h b/src/core/gpu_backend.h
index ea25a36a1..b8fc8664d 100644
--- a/src/core/gpu_backend.h
+++ b/src/core/gpu_backend.h
@@ -5,6 +5,8 @@
 
 #include "gpu_types.h"
 
+#include "util/gpu_device.h"
+
 #include "common/heap_array.h"
 #include "common/threading.h"
 
@@ -12,84 +14,196 @@
 #include <condition_variable>
 #include <memory>
 #include <mutex>
+#include <tuple>
 
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4324) // warning C4324: 'GPUBackend': structure was padded due to alignment specifier
-#endif
+class Error;
+class SmallStringBase;
+
+class GPUFramebuffer;
+class GPUPipeline;
+
+struct Settings;
+class StateWrapper;
+
+// DESIGN NOTE: Only static methods should be called on the CPU thread.
+// You specifically don't have a global pointer available for this reason.
 
 class GPUBackend
 {
+public:
+  // Static interface. Per the design note above the class, static methods are the only ones meant to be called
+  // on the CPU thread.
+  // The New*Command() functions each return a command object of the named type; gpu_commands.cpp fills the
+  // returned object in and then hands it to PushCommand().
+  static GPUThreadCommand* NewClearVRAMCommand();
+  static GPUThreadCommand* NewClearDisplayCommand();
+  static GPUBackendUpdateDisplayCommand* NewUpdateDisplayCommand();
+  static GPUThreadCommand* NewClearCacheCommand();
+  static GPUThreadCommand* NewBufferSwappedCommand();
+  static GPUThreadCommand* NewUpdateResolutionScaleCommand();
+  static GPUBackendReadVRAMCommand* NewReadVRAMCommand();
+  static GPUBackendFillVRAMCommand* NewFillVRAMCommand();
+  static GPUBackendUpdateVRAMCommand* NewUpdateVRAMCommand(u32 num_words);
+  static GPUBackendCopyVRAMCommand* NewCopyVRAMCommand();
+  static GPUBackendSetDrawingAreaCommand* NewSetDrawingAreaCommand();
+  static GPUBackendUpdateCLUTCommand* NewUpdateCLUTCommand();
+  static GPUBackendDrawPolygonCommand* NewDrawPolygonCommand(u32 num_vertices);
+  static GPUBackendDrawPrecisePolygonCommand* NewDrawPrecisePolygonCommand(u32 num_vertices);
+  static GPUBackendDrawRectangleCommand* NewDrawRectangleCommand();
+  static GPUBackendDrawLineCommand* NewDrawLineCommand(u32 num_vertices);
+  static void PushCommand(GPUThreadCommand* cmd);
+  static void PushCommandAndWakeThread(GPUThreadCommand* cmd);
+  static void PushCommandAndSync(GPUThreadCommand* cmd, bool spin);
+
+  static bool IsUsingHardwareBackend();
+
+  static std::unique_ptr<GPUBackend> CreateHardwareBackend();
+  static std::unique_ptr<GPUBackend> CreateSoftwareBackend();
+
+  static bool RenderScreenshotToBuffer(u32 width, u32 height, bool postfx, u32* out_width, u32* out_height,
+                                       std::vector<u32>* out_pixels, u32* out_stride, GPUTexture::Format* out_format);
+  static void RenderScreenshotToFile(const std::string_view path, DisplayScreenshotMode mode, u8 quality,
+                                     bool compress_on_thread, bool show_osd_message);
+
 public:
   GPUBackend();
   virtual ~GPUBackend();
 
-  ALWAYS_INLINE const Threading::Thread* GetThread() const { return m_use_gpu_thread ? &m_gpu_thread : nullptr; }
+  virtual bool IsHardwareRenderer() const = 0;
 
-  virtual bool Initialize(bool use_thread);
-  virtual void Reset();
-  virtual void Shutdown();
+  virtual bool Initialize(bool upload_vram, Error* error);
 
-  void SetThreadEnabled(bool use_thread);
+  virtual void UpdateSettings(const Settings& old_settings);
 
-  GPUBackendFillVRAMCommand* NewFillVRAMCommand();
-  GPUBackendUpdateVRAMCommand* NewUpdateVRAMCommand(u32 num_words);
-  GPUBackendCopyVRAMCommand* NewCopyVRAMCommand();
-  GPUBackendSetDrawingAreaCommand* NewSetDrawingAreaCommand();
-  GPUBackendUpdateCLUTCommand* NewUpdateCLUTCommand();
-  GPUBackendDrawPolygonCommand* NewDrawPolygonCommand(u32 num_vertices);
-  GPUBackendDrawRectangleCommand* NewDrawRectangleCommand();
-  GPUBackendDrawLineCommand* NewDrawLineCommand(u32 num_vertices);
+  /// Returns the current resolution scale.
+  virtual u32 GetResolutionScale() const = 0;
 
-  void PushCommand(GPUBackendCommand* cmd);
-  void Sync(bool allow_sleep);
+  /// Updates the resolution scale when it's set to automatic.
+  virtual void UpdateResolutionScale() = 0;
 
-  /// Processes all pending GPU commands.
-  void RunGPULoop();
+  /// Returns the full display resolution of the GPU, including padding.
+  std::tuple<u32, u32> GetFullDisplayResolution() const;
+
+  // Graphics API state reset/restore - call when drawing the UI etc.
+  // TODO: replace with "invalidate cached state"
+  virtual void RestoreDeviceContext() = 0;
+
+  /// Main command handler for GPU thread.
+  void HandleCommand(const GPUThreadCommand* cmd);
+
+  /// Draws the current display texture, with any post-processing.
+  GPUDevice::PresentResult PresentDisplay();
+
+  /// Helper function to save current display texture to PNG. Used for regtest.
+  bool WriteDisplayTextureToFile(std::string filename);
+
+  bool BeginQueueFrame();
+  void WaitForOneQueuedFrame();
+
+  /// Format a one-line summary of the current statistics into str.
+  void GetStatsString(SmallStringBase& str) const;
+  void GetMemoryStatsString(SmallStringBase& str) const;
+
+  /// ResetStatistics() clears the accumulated counters; UpdateStatistics() publishes per-frame averages over
+  /// frame_count frames and then resets.
+  void ResetStatistics();
+  void UpdateStatistics(u32 frame_count);
 
 protected:
-  void* AllocateCommand(GPUBackendCommandType command, u32 size);
-  u32 GetPendingCommandSize() const;
-  void WakeGPUThread();
-  void StartGPUThread();
-  void StopGPUThread();
+  enum : u32
+  {
+    // Number of entries in m_deinterlace_buffers.
+    DEINTERLACE_BUFFER_COUNT = 4,
+  };
 
+  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0;
   virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) = 0;
   virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
                           GPUBackendCommandParameters params) = 0;
   virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
                         GPUBackendCommandParameters params) = 0;
+
   virtual void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) = 0;
-  virtual void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) = 0;
+  virtual void DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd) = 0;
+  virtual void DrawSprite(const GPUBackendDrawRectangleCommand* cmd) = 0;
   virtual void DrawLine(const GPUBackendDrawLineCommand* cmd) = 0;
-  virtual void FlushRender() = 0;
-  virtual void DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) = 0;
+
+  virtual void DrawingAreaChanged() = 0;
   virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0;
+  virtual void ClearCache() = 0;
+  virtual void OnBufferSwapped() = 0;
+  virtual void ClearVRAM() = 0;
 
-  void HandleCommand(const GPUBackendCommand* cmd);
+  virtual void UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd) = 0;
 
-  Threading::KernelSemaphore m_sync_semaphore;
-  std::atomic_bool m_gpu_thread_sleeping{false};
-  std::atomic_bool m_gpu_loop_done{false};
-  Threading::Thread m_gpu_thread;
-  bool m_use_gpu_thread = false;
+  virtual void LoadState(const GPUBackendLoadStateCommand* cmd) = 0;
 
-  std::mutex m_sync_mutex;
-  std::condition_variable m_sync_cpu_thread_cv;
-  std::condition_variable m_wake_gpu_thread_cv;
-  bool m_sync_done = false;
+  /// Ensures all pending draws are flushed to the host GPU.
+  virtual void FlushRender() = 0;
 
-  enum : u32
-  {
-    COMMAND_QUEUE_SIZE = 4 * 1024 * 1024,
-    THRESHOLD_TO_WAKE_GPU = 256
-  };
+  /// Helper function for computing the draw rectangle in a larger window.
+  void CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio,
+                         GSVector4i* display_rect, GSVector4i* draw_rect) const;
 
-  FixedHeapArray<u8, COMMAND_QUEUE_SIZE> m_command_fifo_data;
-  alignas(HOST_CACHE_LINE_SIZE) std::atomic<u32> m_command_fifo_read_ptr{0};
-  alignas(HOST_CACHE_LINE_SIZE) std::atomic<u32> m_command_fifo_write_ptr{0};
+  /// Helper function for computing screenshot bounds.
+  void CalculateScreenshotSize(DisplayScreenshotMode mode, u32* width, u32* height, GSVector4i* display_rect,
+                               GSVector4i* draw_rect) const;
+
+  /// Renders the display, optionally with postprocessing to the specified image.
+  void HandleRenderScreenshotToBuffer(const GPUThreadRenderScreenshotToBufferCommand* cmd);
+  void HandleRenderScreenshotToFile(const GPUThreadRenderScreenshotToFileCommand* cmd);
+
+  /// Renders the display, optionally with postprocessing to the specified image.
+  bool RenderScreenshotToBuffer(u32 width, u32 height, const GSVector4i display_rect, const GSVector4i draw_rect,
+                                bool postfx, std::vector<u32>* out_pixels, u32* out_stride,
+                                GPUTexture::Format* out_format);
+
+  bool CompileDisplayPipelines(bool display, bool deinterlace, bool chroma_smoothing, Error* error);
+
+  void HandleUpdateDisplayCommand(const GPUBackendUpdateDisplayCommand* cmd);
+
+  void ClearDisplay();
+  void ClearDisplayTexture();
+  void SetDisplayTexture(GPUTexture* texture, GPUTexture* depth_buffer, s32 view_x, s32 view_y, s32 view_width,
+                         s32 view_height);
+
+  GPUDevice::PresentResult RenderDisplay(GPUTexture* target, const GSVector4i display_rect, const GSVector4i draw_rect,
+                                         bool postfx);
+
+  /// Sends the current frame to media capture.
+  void SendDisplayToMediaCapture(MediaCapture* cap);
+
+  bool Deinterlace(u32 field, u32 line_skip);
+  bool DeinterlaceExtractField(u32 dst_bufidx, GPUTexture* src, u32 x, u32 y, u32 width, u32 height, u32 line_skip);
+  bool DeinterlaceSetTargetSize(u32 width, u32 height, bool preserve);
+  void DestroyDeinterlaceTextures();
+  bool ApplyChromaSmoothing();
+
+  // Display geometry.
+  // NOTE(review): presumably filled in from GPUBackendUpdateDisplayCommand -- confirm in the implementation.
+  s32 m_display_width = 0;
+  s32 m_display_height = 0;
+  s32 m_display_origin_left = 0;
+  s32 m_display_origin_top = 0;
+  s32 m_display_vram_width = 0;
+  s32 m_display_vram_height = 0;
+  float m_display_aspect_ratio = 0.0f;
+
+  u32 m_current_deinterlace_buffer = 0;
+  std::unique_ptr<GPUPipeline> m_deinterlace_pipeline;
+  std::unique_ptr<GPUPipeline> m_deinterlace_extract_pipeline;
+  std::array<std::unique_ptr<GPUTexture>, DEINTERLACE_BUFFER_COUNT> m_deinterlace_buffers;
+  std::unique_ptr<GPUTexture> m_deinterlace_texture;
+
+  std::unique_ptr<GPUPipeline> m_chroma_smoothing_pipeline;
+  std::unique_ptr<GPUTexture> m_chroma_smoothing_texture;
+
+  // The display texture and depth buffer are raw, non-unique_ptr pointers, unlike the textures above.
+  // NOTE(review): presumably non-owning -- confirm who owns them.
+  std::unique_ptr<GPUPipeline> m_display_pipeline;
+  GPUTexture* m_display_texture = nullptr;
+  GPUTexture* m_display_depth_buffer = nullptr;
+  s32 m_display_texture_view_x = 0;
+  s32 m_display_texture_view_y = 0;
+  s32 m_display_texture_view_width = 0;
+  s32 m_display_texture_view_height = 0;
+
+  // NOTE(review): these atomics have no in-class initializer. A default-constructed std::atomic is only
+  // value-initialized from C++20 onwards -- confirm the language mode or the constructor covers this.
+  std::atomic<u32> m_queued_frames;
+  std::atomic_bool m_waiting_for_gpu_thread;
+  Threading::KernelSemaphore m_gpu_thread_wait;
 };
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
+namespace Host {
+
+/// Called at the end of the frame, before presentation.
+/// NOTE(review): presumably invoked on the GPU thread (per the name), with the backend instance and the number
+/// of the frame that just finished -- confirm against the implementation, which is not part of this header.
+void FrameDoneOnGPUThread(GPUBackend* gpu_backend, u32 frame_number);
+
+} // namespace Host
diff --git a/src/core/gpu_commands.cpp b/src/core/gpu_commands.cpp
index 73c4a9d21..673ce6433 100644
--- a/src/core/gpu_commands.cpp
+++ b/src/core/gpu_commands.cpp
@@ -1,13 +1,16 @@
 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
+#include "cpu_pgxp.h"
 #include "gpu.h"
+#include "gpu_backend.h"
 #include "gpu_dump.h"
 #include "gpu_hw_texture_cache.h"
 #include "interrupt_controller.h"
 #include "system.h"
 
 #include "common/assert.h"
+#include "common/gsvector_formatter.h"
 #include "common/log.h"
 #include "common/string_util.h"
 
@@ -93,7 +96,7 @@ void GPU::TryExecuteCommands()
           // drop terminator
           m_fifo.RemoveOne();
           DEBUG_LOG("Drawing poly-line with {} vertices", GetPolyLineVertexCount());
-          DispatchRenderCommand();
+          FinishPolyline();
           m_blit_buffer.clear();
           EndCommand();
           continue;
@@ -200,8 +203,8 @@ bool GPU::HandleNOPCommand()
 bool GPU::HandleClearCacheCommand()
 {
   DEBUG_LOG("GP0 clear cache");
-  m_draw_mode.SetTexturePageChanged();
   InvalidateCLUT();
+  GPUBackend::PushCommand(GPUBackend::NewClearCacheCommand());
   m_fifo.RemoveOne();
   AddCommandTicks(1);
   EndCommand();
@@ -248,8 +251,6 @@ bool GPU::HandleSetDrawingAreaTopLeftCommand()
   DEBUG_LOG("Set drawing area top-left: ({}, {})", left, top);
   if (m_drawing_area.left != left || m_drawing_area.top != top)
   {
-    FlushRender();
-
     m_drawing_area.left = left;
     m_drawing_area.top = top;
     m_drawing_area_changed = true;
@@ -270,8 +271,6 @@ bool GPU::HandleSetDrawingAreaBottomRightCommand()
   DEBUG_LOG("Set drawing area bottom-right: ({}, {})", m_drawing_area.right, m_drawing_area.bottom);
   if (m_drawing_area.right != right || m_drawing_area.bottom != bottom)
   {
-    FlushRender();
-
     m_drawing_area.right = right;
     m_drawing_area.bottom = bottom;
     m_drawing_area_changed = true;
@@ -291,8 +290,6 @@ bool GPU::HandleSetDrawingOffsetCommand()
   DEBUG_LOG("Set drawing offset ({}, {})", m_drawing_offset.x, m_drawing_offset.y);
   if (m_drawing_offset.x != x || m_drawing_offset.y != y)
   {
-    FlushRender();
-
     m_drawing_offset.x = x;
     m_drawing_offset.y = y;
   }
@@ -308,11 +305,7 @@ bool GPU::HandleSetMaskBitCommand()
 
   constexpr u32 gpustat_mask = (1 << 11) | (1 << 12);
   const u32 gpustat_bits = (param & 0x03) << 11;
-  if ((m_GPUSTAT.bits & gpustat_mask) != gpustat_bits)
-  {
-    FlushRender();
-    m_GPUSTAT.bits = (m_GPUSTAT.bits & ~gpustat_mask) | gpustat_bits;
-  }
+  m_GPUSTAT.bits = (m_GPUSTAT.bits & ~gpustat_mask) | gpustat_bits;
   DEBUG_LOG("Set mask bit {} {}", BoolToUInt32(m_GPUSTAT.set_mask_while_drawing),
             BoolToUInt32(m_GPUSTAT.check_mask_before_draw));
 
@@ -321,6 +314,36 @@ bool GPU::HandleSetMaskBitCommand()
   return true;
 }
 
+/// Forwards a pending drawing-area change to the backend.
+/// Does nothing unless m_drawing_area_changed is set; otherwise clears the flag and pushes a SetDrawingArea
+/// command carrying the current m_drawing_area.
+void GPU::PrepareForDraw()
+{
+  if (!m_drawing_area_changed)
+    return;
+
+  m_drawing_area_changed = false;
+
+  GPUBackendSetDrawingAreaCommand* const area_cmd = GPUBackend::NewSetDrawingAreaCommand();
+  area_cmd->new_area = m_drawing_area;
+  GPUBackend::PushCommand(area_cmd);
+}
+
+/// Fills cmd->params from the current GPU state: the two mask bits from GPUSTAT, the CRTC's active line LSB,
+/// and whether interlaced rendering is enabled. All bits are zeroed first.
+void GPU::FillBackendCommandParameters(GPUBackendCommand* cmd) const
+{
+  auto& params = cmd->params;
+
+  // Start from a clean slate so bits not set below are zero.
+  params.bits = 0;
+
+  params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
+  params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
+  params.active_line_lsb = m_crtc_state.active_line_lsb;
+  params.interlaced_rendering = IsInterlacedRenderingEnabled();
+}
+
+/// Fills the state shared by all draw commands: the common parameters, the render command bits, the draw mode
+/// register, the palette register and the texture window.
+/// Dithering stays enabled in the command only when both the mode register and rc enable it.
+void GPU::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
+{
+  FillBackendCommandParameters(cmd);
+
+  cmd->rc.bits = rc.bits;
+
+  // Take the mode register as-is, then drop the dither bit if this render command doesn't dither.
+  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
+  if (!rc.IsDitheringEnabled())
+    cmd->draw_mode.dither_enable = false;
+
+  cmd->palette.bits = m_draw_mode.palette_reg.bits;
+  cmd->window = m_draw_mode.texture_window;
+}
+
 bool GPU::HandleRenderPolygonCommand()
 {
   const GPURenderCommand rc{FifoPeek(0)};
@@ -346,6 +369,7 @@ bool GPU::HandleRenderPolygonCommand()
             words_per_vertex, setup_ticks);
 
   // set draw state up
+  // TODO: Get rid of SetTexturePalette() and just fill it as needed
   if (rc.texture_enable)
   {
     const u16 texpage_attribute = Truncate16((rc.shading_enable ? FifoPeek(5) : FifoPeek(4)) >> 16);
@@ -355,12 +379,218 @@ bool GPU::HandleRenderPolygonCommand()
     UpdateCLUTIfNeeded(m_draw_mode.mode_reg.texture_mode, m_draw_mode.palette_reg);
   }
 
-  m_counters.num_vertices += num_vertices;
-  m_counters.num_primitives++;
   m_render_command.bits = rc.bits;
   m_fifo.RemoveOne();
 
-  DispatchRenderCommand();
+  PrepareForDraw();
+
+  if (g_settings.gpu_pgxp_enable)
+  {
+    GPUBackendDrawPrecisePolygonCommand* cmd = GPUBackend::NewDrawPrecisePolygonCommand(num_vertices);
+    FillDrawCommand(cmd, rc);
+
+    const u32 first_color = rc.color_for_first_vertex;
+    const bool shaded = rc.shading_enable;
+    const bool textured = rc.texture_enable;
+    bool valid_w = g_settings.gpu_pgxp_texture_correction;
+    for (u32 i = 0; i < num_vertices; i++)
+    {
+      GPUBackendDrawPrecisePolygonCommand::Vertex* vert = &cmd->vertices[i];
+      vert->color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
+      const u64 maddr_and_pos = m_fifo.Pop();
+      const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
+      vert->native_x = m_drawing_offset.x + vp.x;
+      vert->native_y = m_drawing_offset.y + vp.y;
+      vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
+
+      valid_w &= CPU::PGXP::GetPreciseVertex(Truncate32(maddr_and_pos >> 32), vp.bits, vert->native_x, vert->native_y,
+                                             m_drawing_offset.x, m_drawing_offset.y, &vert->x, &vert->y, &vert->w);
+    }
+
+    cmd->valid_w = valid_w;
+    if (!valid_w)
+    {
+      if (g_settings.gpu_pgxp_disable_2d)
+      {
+        // NOTE: This reads uninitialized data, but it's okay, it doesn't get used.
+        for (u32 i = 0; i < num_vertices; i++)
+        {
+          GPUBackendDrawPrecisePolygonCommand::Vertex& v = cmd->vertices[i];
+          GSVector2::store(&v.x, GSVector2(GSVector2i::load(&v.native_x)));
+          v.w = 1.0f;
+        }
+      }
+      else
+      {
+        for (u32 i = 0; i < num_vertices; i++)
+          cmd->vertices[i].w = 1.0f;
+      }
+    }
+
+    // Cull polygons which are too large.
+    const GSVector2 v0f = GSVector2::load(&cmd->vertices[0].x);
+    const GSVector2 v1f = GSVector2::load(&cmd->vertices[1].x);
+    const GSVector2 v2f = GSVector2::load(&cmd->vertices[2].x);
+    const GSVector2 min_pos_12 = v1f.min(v2f);
+    const GSVector2 max_pos_12 = v1f.max(v2f);
+    const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
+                                       .add32(GSVector4i::cxpr(0, 0, 1, 1));
+    const bool first_tri_culled =
+      (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT ||
+       !draw_rect_012.rintersects(m_clamped_drawing_area));
+    if (first_tri_culled)
+    {
+      // TODO: GPU events... somehow.
+      DEBUG_LOG("Culling off-screen/too-large polygon: {},{} {},{} {},{}", cmd->vertices[0].native_x,
+                cmd->vertices[0].native_y, cmd->vertices[1].native_x, cmd->vertices[1].native_y,
+                cmd->vertices[2].native_x, cmd->vertices[2].native_y);
+
+      if (!rc.quad_polygon)
+      {
+        EndCommand();
+        return true;
+      }
+    }
+    else
+    {
+      AddDrawTriangleTicks(GSVector2i::load(&cmd->vertices[0].native_x), GSVector2i::load(&cmd->vertices[1].native_x),
+                           GSVector2i::load(&cmd->vertices[2].native_x), rc.shading_enable, rc.texture_enable,
+                           rc.transparency_enable);
+    }
+
+    // quads
+    if (rc.quad_polygon)
+    {
+      const GSVector2 v3f = GSVector2::load(&cmd->vertices[3].x);
+      const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
+                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
+
+      // Cull polygons which are too large.
+      const bool second_tri_culled =
+        (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
+         !draw_rect_123.rintersects(m_clamped_drawing_area));
+      if (second_tri_culled)
+      {
+        DEBUG_LOG("Culling off-screen/too-large polygon (quad second half): {},{} {},{} {},{}",
+                  cmd->vertices[2].native_x, cmd->vertices[2].native_y, cmd->vertices[1].native_x,
+                  cmd->vertices[1].native_y, cmd->vertices[0].native_x, cmd->vertices[0].native_y);
+
+        if (first_tri_culled)
+        {
+          EndCommand();
+          return true;
+        }
+
+        // Remove second part of quad.
+        cmd->num_vertices = 3;
+      }
+      else
+      {
+        AddDrawTriangleTicks(GSVector2i::load(&cmd->vertices[2].native_x), GSVector2i::load(&cmd->vertices[1].native_x),
+                             GSVector2i::load(&cmd->vertices[3].native_x), rc.shading_enable, rc.texture_enable,
+                             rc.transparency_enable);
+
+        // If first part was culled, move the second part to the first.
+        if (first_tri_culled)
+        {
+          std::memcpy(&cmd->vertices[0], &cmd->vertices[2], sizeof(GPUBackendDrawPrecisePolygonCommand::Vertex));
+          std::memcpy(&cmd->vertices[2], &cmd->vertices[3], sizeof(GPUBackendDrawPrecisePolygonCommand::Vertex));
+          cmd->num_vertices = 3;
+        }
+      }
+    }
+
+    GPUBackend::PushCommand(cmd);
+  }
+  else
+  {
+    GPUBackendDrawPolygonCommand* cmd = GPUBackend::NewDrawPolygonCommand(num_vertices);
+    FillDrawCommand(cmd, rc);
+
+    const u32 first_color = rc.color_for_first_vertex;
+    const bool shaded = rc.shading_enable;
+    const bool textured = rc.texture_enable;
+    for (u32 i = 0; i < num_vertices; i++)
+    {
+      GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
+      vert->color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
+      const u64 maddr_and_pos = m_fifo.Pop();
+      const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
+      vert->x = m_drawing_offset.x + vp.x;
+      vert->y = m_drawing_offset.y + vp.y;
+      vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
+    }
+
+    // Cull polygons which are too large.
+    const GSVector2i v0 = GSVector2i::load(&cmd->vertices[0].x);
+    const GSVector2i v1 = GSVector2i::load(&cmd->vertices[1].x);
+    const GSVector2i v2 = GSVector2i::load(&cmd->vertices[2].x);
+    const GSVector2i min_pos_12 = v1.min_s32(v2);
+    const GSVector2i max_pos_12 = v1.max_s32(v2);
+    const GSVector4i draw_rect_012 =
+      GSVector4i::xyxy(min_pos_12.min_s32(v0), max_pos_12.max_s32(v0)).add32(GSVector4i::cxpr(0, 0, 1, 1));
+    const bool first_tri_culled =
+      (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT ||
+       !draw_rect_012.rintersects(m_clamped_drawing_area));
+    if (first_tri_culled)
+    {
+      DEBUG_LOG("Culling off-screen/too-large polygon: {},{} {},{} {},{}", cmd->vertices[0].x, cmd->vertices[0].y,
+                cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[2].x, cmd->vertices[2].y);
+
+      if (!rc.quad_polygon)
+      {
+        EndCommand();
+        return true;
+      }
+    }
+    else
+    {
+      AddDrawTriangleTicks(v0, v1, v2, rc.shading_enable, rc.texture_enable, rc.transparency_enable);
+    }
+
+    // quads
+    if (rc.quad_polygon)
+    {
+      const GSVector2i v3 = GSVector2i::load(&cmd->vertices[3].x);
+      const GSVector4i draw_rect_123 = GSVector4i(min_pos_12.min_s32(v3))
+                                         .upl64(GSVector4i(max_pos_12.max_s32(v3)))
+                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
+
+      // Cull polygons which are too large.
+      const bool second_tri_culled =
+        (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
+         !draw_rect_123.rintersects(m_clamped_drawing_area));
+      if (second_tri_culled)
+      {
+        DEBUG_LOG("Culling too-large polygon (quad second half): {},{} {},{} {},{}", cmd->vertices[2].x,
+                  cmd->vertices[2].y, cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[0].x, cmd->vertices[0].y);
+
+        if (first_tri_culled)
+        {
+          EndCommand();
+          return true;
+        }
+
+        // Remove second part of quad.
+        cmd->num_vertices = 3;
+      }
+      else
+      {
+        AddDrawTriangleTicks(v2, v1, v3, rc.shading_enable, rc.texture_enable, rc.transparency_enable);
+
+        // If first part was culled, move the second part to the first.
+        if (first_tri_culled)
+        {
+          std::memcpy(&cmd->vertices[0], &cmd->vertices[2], sizeof(GPUBackendDrawPolygonCommand::Vertex));
+          std::memcpy(&cmd->vertices[2], &cmd->vertices[3], sizeof(GPUBackendDrawPolygonCommand::Vertex));
+          cmd->num_vertices = 3;
+        }
+      }
+    }
+
+    GPUBackend::PushCommand(cmd);
+  }
+
   EndCommand();
   return true;
 }
@@ -389,12 +619,65 @@ bool GPU::HandleRenderRectangleCommand()
             rc.transparency_enable ? "semi-transparent" : "opaque", rc.texture_enable ? "textured" : "non-textured",
             rc.shading_enable ? "shaded" : "monochrome", total_words, setup_ticks);
 
-  m_counters.num_vertices++;
-  m_counters.num_primitives++;
   m_render_command.bits = rc.bits;
   m_fifo.RemoveOne();
 
-  DispatchRenderCommand();
+  PrepareForDraw();
+  GPUBackendDrawRectangleCommand* cmd = GPUBackend::NewDrawRectangleCommand();
+  FillDrawCommand(cmd, rc);
+  cmd->color = rc.color_for_first_vertex;
+
+  const GPUVertexPosition vp{FifoPop()};
+  cmd->x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
+  cmd->y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
+
+  if (rc.texture_enable)
+  {
+    const u32 texcoord_and_palette = FifoPop();
+    cmd->palette.bits = Truncate16(texcoord_and_palette >> 16);
+    cmd->texcoord = Truncate16(texcoord_and_palette);
+  }
+  else
+  {
+    cmd->palette.bits = 0;
+    cmd->texcoord = 0;
+  }
+
+  switch (rc.rectangle_size)
+  {
+    case GPUDrawRectangleSize::R1x1:
+      cmd->width = 1;
+      cmd->height = 1;
+      break;
+    case GPUDrawRectangleSize::R8x8:
+      cmd->width = 8;
+      cmd->height = 8;
+      break;
+    case GPUDrawRectangleSize::R16x16:
+      cmd->width = 16;
+      cmd->height = 16;
+      break;
+    default:
+    {
+      const u32 width_and_height = FifoPop();
+      cmd->width = static_cast<u16>(width_and_height & VRAM_WIDTH_MASK);
+      cmd->height = static_cast<u16>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
+    }
+    break;
+  }
+
+  const GSVector4i rect = GSVector4i(cmd->x, cmd->y, cmd->x + cmd->width, cmd->y + cmd->height);
+  const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
+  if (clamped_rect.rempty()) [[unlikely]]
+  {
+    DEBUG_LOG("Culling off-screen rectangle {}", rect);
+    EndCommand();
+    return true;
+  }
+
+  AddDrawRectangleTicks(clamped_rect, rc.texture_enable, rc.transparency_enable);
+
+  GPUBackend::PushCommand(cmd);
   EndCommand();
   return true;
 }
@@ -411,12 +694,55 @@ bool GPU::HandleRenderLineCommand()
   TRACE_LOG("Render {} {} line ({} total words)", rc.transparency_enable ? "semi-transparent" : "opaque",
             rc.shading_enable ? "shaded" : "monochrome", total_words);
 
-  m_counters.num_vertices += 2;
-  m_counters.num_primitives++;
   m_render_command.bits = rc.bits;
   m_fifo.RemoveOne();
 
-  DispatchRenderCommand();
+  PrepareForDraw();
+  GPUBackendDrawLineCommand* cmd = GPUBackend::NewDrawLineCommand(2);
+  FillDrawCommand(cmd, rc);
+  cmd->palette.bits = 0;
+
+  if (rc.shading_enable)
+  {
+    cmd->vertices[0].color = rc.color_for_first_vertex;
+    const GPUVertexPosition start_pos{FifoPop()};
+    cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
+    cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
+
+    cmd->vertices[1].color = FifoPop() & UINT32_C(0x00FFFFFF);
+    const GPUVertexPosition end_pos{FifoPop()};
+    cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
+    cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
+  }
+  else
+  {
+    cmd->vertices[0].color = rc.color_for_first_vertex;
+    cmd->vertices[1].color = rc.color_for_first_vertex;
+
+    const GPUVertexPosition start_pos{FifoPop()};
+    cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
+    cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
+
+    const GPUVertexPosition end_pos{FifoPop()};
+    cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
+    cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
+  }
+
+  const GSVector2i v0 = GSVector2i::load(&cmd->vertices[0].x);
+  const GSVector2i v1 = GSVector2i::load(&cmd->vertices[1].x);
+  const GSVector4i rect = GSVector4i::xyxy(v0.min_s32(v1), v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
+  const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
+
+  if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
+  { // Culled: cmd is never pushed, but the GP0 command is still ended and reported as handled.
+    DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", cmd->vertices[0].x, cmd->vertices[0].y,
+              cmd->vertices[1].x, cmd->vertices[1].y); // logged as x0,y0 - x1,y1
+    EndCommand();
+    return true;
+  }
+
+  AddDrawLineTicks(clamped_rect, rc.shading_enable);
+  GPUBackend::PushCommand(cmd);
   EndCommand();
   return true;
 }
@@ -453,6 +779,64 @@ bool GPU::HandleRenderPolyLineCommand()
   return true;
 }
 
+void GPU::FinishPolyline() // Turns the polyline words held in m_blit_buffer into one backend line command.
+{
+  PrepareForDraw();
+
+  const u32 num_vertices = GetPolyLineVertexCount();
+  DebugAssert(num_vertices >= 2);
+
+  GPUBackendDrawLineCommand* cmd = GPUBackend::NewDrawLineCommand((num_vertices - 1) * 2); // 2 vertices per segment
+  FillDrawCommand(cmd, m_render_command);
+
+  u32 buffer_pos = 0; // read cursor into m_blit_buffer
+  const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
+  const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
+  GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
+  u32 start_color = m_render_command.color_for_first_vertex;
+
+  const bool shaded = m_render_command.shading_enable;
+  u32 out_vertex_count = 0; // vertices actually written to cmd; culled segments add nothing
+  for (u32 i = 1; i < num_vertices; i++)
+  {
+    const u32 end_color = // shaded: one extra buffer word (low 24 bits) per vertex; else the command colour
+      shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : m_render_command.color_for_first_vertex;
+    const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
+    const GSVector2i end_pos = GSVector2i(vp.x, vp.y).add32(draw_offset);
+
+    const GSVector4i rect = // min/max of both endpoints, with (1, 1) added to the max corner
+      GSVector4i::xyxy(start_pos.min_s32(end_pos), start_pos.max_s32(end_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1));
+    const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
+
+    if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
+    { // segment is skipped: no ticks charged, no vertices written
+      DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", start_pos.x, start_pos.y, end_pos.x, end_pos.y);
+    }
+    else
+    {
+      AddDrawLineTicks(clamped_rect, m_render_command.shading_enable);
+
+      GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
+      out_vertex_count += 2;
+
+      GSVector2i::store(&out_vertex[0].x, start_pos);
+      out_vertex[0].color = start_color;
+      GSVector2i::store(&out_vertex[1].x, end_pos);
+      out_vertex[1].color = end_color;
+    }
+
+    start_pos = end_pos; // the next segment starts at this end point whether or not this one was culled
+    start_color = end_color;
+  }
+
+  if (out_vertex_count > 0) // nothing is pushed when every segment was culled
+  {
+    DebugAssert(out_vertex_count <= cmd->num_vertices);
+    cmd->num_vertices = Truncate16(out_vertex_count); // shrink to the number of vertices actually emitted
+    GPUBackend::PushCommand(cmd);
+  }
+}
+
 bool GPU::HandleFillRectangleCommand()
 {
   CHECK_COMMAND_SIZE(3);
@@ -460,8 +844,6 @@ bool GPU::HandleFillRectangleCommand()
   if (IsInterlacedRenderingEnabled() && IsCRTCScanlinePending())
     SynchronizeCRTC();
 
-  FlushRender();
-
   const u32 color = FifoPop() & 0x00FFFFFF;
   const u32 dst_x = FifoPeek() & 0x3F0;
   const u32 dst_y = (FifoPop() >> 16) & VRAM_HEIGHT_MASK;
@@ -471,9 +853,17 @@ bool GPU::HandleFillRectangleCommand()
   DEBUG_LOG("Fill VRAM rectangle offset=({},{}), size=({},{})", dst_x, dst_y, width, height);
 
   if (width > 0 && height > 0)
-    FillVRAM(dst_x, dst_y, width, height, color);
+  {
+    GPUBackendFillVRAMCommand* cmd = GPUBackend::NewFillVRAMCommand();
+    FillBackendCommandParameters(cmd);
+    cmd->x = static_cast<u16>(dst_x);
+    cmd->y = static_cast<u16>(dst_y);
+    cmd->width = static_cast<u16>(width);
+    cmd->height = static_cast<u16>(height);
+    cmd->color = color;
+    GPUBackend::PushCommand(cmd);
+  }
 
-  m_counters.num_writes++;
   AddCommandTicks(46 + ((width / 8) + 9) * height);
   EndCommand();
   return true;
@@ -523,8 +913,6 @@ void GPU::FinishVRAMWrite()
   if (IsInterlacedRenderingEnabled() && IsCRTCScanlinePending())
     SynchronizeCRTC();
 
-  FlushRender();
-
   if (m_blit_remaining_words == 0)
   {
     if (g_settings.debugging.dump_cpu_to_vram_copies)
@@ -557,18 +945,18 @@ void GPU::FinishVRAMWrite()
     const u8* blit_ptr = reinterpret_cast<const u8*>(m_blit_buffer.data());
     if (transferred_full_rows > 0)
     {
-      UpdateVRAM(m_vram_transfer.x, m_vram_transfer.y, m_vram_transfer.width, transferred_full_rows, blit_ptr,
-                 m_GPUSTAT.set_mask_while_drawing, m_GPUSTAT.check_mask_before_draw);
+      UpdateVRAM(m_vram_transfer.x, m_vram_transfer.y, m_vram_transfer.width, static_cast<u16>(transferred_full_rows),
+                 blit_ptr, m_GPUSTAT.set_mask_while_drawing, m_GPUSTAT.check_mask_before_draw);
       blit_ptr += (ZeroExtend32(m_vram_transfer.width) * transferred_full_rows) * sizeof(u16);
     }
     if (transferred_width_last_row > 0)
     {
-      UpdateVRAM(m_vram_transfer.x, m_vram_transfer.y + transferred_full_rows, transferred_width_last_row, 1, blit_ptr,
-                 m_GPUSTAT.set_mask_while_drawing, m_GPUSTAT.check_mask_before_draw);
+      UpdateVRAM(m_vram_transfer.x, static_cast<u16>(m_vram_transfer.y + transferred_full_rows),
+                 static_cast<u16>(transferred_width_last_row), 1, blit_ptr, m_GPUSTAT.set_mask_while_drawing,
+                 m_GPUSTAT.check_mask_before_draw);
     }
   }
 
-  m_counters.num_writes++;
   m_blit_buffer.clear();
   m_vram_transfer = {};
   m_blitter_state = BlitterState::Idle;
@@ -588,9 +976,6 @@ bool GPU::HandleCopyRectangleVRAMToCPUCommand()
             m_vram_transfer.width, m_vram_transfer.height);
   DebugAssert(m_vram_transfer.col == 0 && m_vram_transfer.row == 0);
 
-  // all rendering should be done first...
-  FlushRender();
-
   // ensure VRAM shadow is up to date
   ReadVRAM(m_vram_transfer.x, m_vram_transfer.y, m_vram_transfer.width, m_vram_transfer.height);
 
@@ -602,7 +987,6 @@ bool GPU::HandleCopyRectangleVRAMToCPUCommand()
   }
 
   // switch to pixel-by-pixel read state
-  m_counters.num_reads++;
   m_blitter_state = BlitterState::ReadingVRAM;
   m_command_total_words = 0;
 
@@ -633,10 +1017,15 @@ bool GPU::HandleCopyRectangleVRAMToVRAMCommand()
     width == 0 || height == 0 || (src_x == dst_x && src_y == dst_y && !m_GPUSTAT.set_mask_while_drawing);
   if (!skip_copy)
   {
-    m_counters.num_copies++;
-
-    FlushRender();
-    CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
+    GPUBackendCopyVRAMCommand* cmd = GPUBackend::NewCopyVRAMCommand();
+    FillBackendCommandParameters(cmd);
+    cmd->src_x = static_cast<u16>(src_x);
+    cmd->src_y = static_cast<u16>(src_y);
+    cmd->dst_x = static_cast<u16>(dst_x);
+    cmd->dst_y = static_cast<u16>(dst_y);
+    cmd->width = static_cast<u16>(width);
+    cmd->height = static_cast<u16>(height);
+    GPUBackend::PushCommand(cmd);
   }
 
   AddCommandTicks(width * height * 2);
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 65b019c54..f9da24f4c 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -4,8 +4,8 @@
 #include "gpu_hw.h"
 #include "cpu_core.h"
 #include "cpu_pgxp.h"
+#include "gpu.h"
 #include "gpu_hw_shadergen.h"
-#include "gpu_sw_backend.h"
 #include "gpu_sw_rasterizer.h"
 #include "host.h"
 #include "settings.h"
@@ -26,6 +26,7 @@
 
 #include "IconsEmoji.h"
 #include "IconsFontAwesome5.h"
+#include "fmt/format.h"
 #include "imgui.h"
 
 #include <cmath>
@@ -87,7 +88,7 @@ ALWAYS_INLINE static u32 GetMaxResolutionScale()
 
 ALWAYS_INLINE_RELEASE static u32 GetBoxDownsampleScale(u32 resolution_scale)
 {
-  u32 scale = std::min<u32>(resolution_scale, g_settings.gpu_downsample_scale);
+  u32 scale = std::min<u32>(resolution_scale, g_gpu_settings.gpu_downsample_scale); // NOTE(review): 0 here hits % 0
   while ((resolution_scale % scale) != 0)
     scale--;
   return scale;
 }
@@ -96,19 +97,21 @@ ALWAYS_INLINE_RELEASE static u32 GetBoxDownsampleScale(u32 resolution_scale)
 ALWAYS_INLINE static bool ShouldClampUVs(GPUTextureFilter texture_filter)
 {
   // We only need UV limits if PGXP is enabled, or texture filtering is enabled.
-  return g_settings.gpu_pgxp_enable || texture_filter != GPUTextureFilter::Nearest;
+  return g_gpu_settings.gpu_pgxp_enable || texture_filter != GPUTextureFilter::Nearest; // any non-Nearest filter
 }
 
 ALWAYS_INLINE static bool ShouldAllowSpriteMode(u8 resolution_scale, GPUTextureFilter texture_filter,
                                                 GPUTextureFilter sprite_texture_filter)
 {
   // Use sprite shaders/mode when texcoord rounding is forced, or if the filters are different.
-  return (sprite_texture_filter != texture_filter || (resolution_scale > 1 && g_settings.gpu_force_round_texcoords));
+  return (sprite_texture_filter != texture_filter ||
+          (resolution_scale > 1 && g_gpu_settings.gpu_force_round_texcoords)); // rounding only counts above 1x
 }
 
 ALWAYS_INLINE static bool ShouldDisableColorPerspective()
 {
-  return g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction && !g_settings.gpu_pgxp_color_correction;
+  return g_gpu_settings.gpu_pgxp_enable && g_gpu_settings.gpu_pgxp_texture_correction &&
+         !g_gpu_settings.gpu_pgxp_color_correction; // PGXP + texture correction on, colour correction off
 }
 
 /// Returns true if the specified texture filtering mode requires dual-source blending.
@@ -187,7 +190,7 @@ private:
 };
 } // namespace
 
-GPU_HW::GPU_HW() : GPU()
+GPU_HW::GPU_HW() : GPUBackend()
 {
 #ifdef _DEBUG
   s_draw_number = 0;
@@ -197,12 +200,11 @@ GPU_HW::GPU_HW() : GPU()
 GPU_HW::~GPU_HW()
 {
   GPUTextureCache::Shutdown();
+}
 
-  if (m_sw_renderer)
-  {
-    m_sw_renderer->Shutdown();
-    m_sw_renderer.reset();
-  }
+bool GPU_HW::IsHardwareRenderer() const
+{
+  return true; // GPU_HW unconditionally reports itself as a hardware renderer
 }
 
 ALWAYS_INLINE void GPU_HW::BatchVertex::Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_,
@@ -235,34 +237,24 @@ ALWAYS_INLINE void GPU_HW::BatchVertex::SetUVLimits(u32 min_u, u32 max_u, u32 mi
   uv_limits = PackUVLimits(min_u, max_u, min_v, max_v);
 }
 
-const Threading::Thread* GPU_HW::GetSWThread() const
+bool GPU_HW::Initialize(bool upload_vram, Error* error)
 {
-  return m_sw_renderer ? m_sw_renderer->GetThread() : nullptr;
-}
-
-bool GPU_HW::IsHardwareRenderer() const
-{
-  return true;
-}
-
-bool GPU_HW::Initialize(Error* error)
-{
-  if (!GPU::Initialize(error))
+  if (!GPUBackend::Initialize(upload_vram, error))
     return false;
 
   const GPUDevice::Features features = g_gpu_device->GetFeatures();
 
   m_resolution_scale = Truncate8(CalculateResolutionScale());
-  m_multisamples = Truncate8(std::min<u32>(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
-  m_texture_filtering = g_settings.gpu_texture_filter;
-  m_sprite_texture_filtering = g_settings.gpu_sprite_texture_filter;
-  m_line_detect_mode = (m_resolution_scale > 1) ? g_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
+  m_multisamples = Truncate8(std::min<u32>(g_gpu_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
+  m_texture_filtering = g_gpu_settings.gpu_texture_filter;
+  m_sprite_texture_filtering = g_gpu_settings.gpu_sprite_texture_filter;
+  m_line_detect_mode = (m_resolution_scale > 1) ? g_gpu_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
   m_downsample_mode = GetDownsampleMode(m_resolution_scale);
-  m_wireframe_mode = g_settings.gpu_wireframe_mode;
+  m_wireframe_mode = g_gpu_settings.gpu_wireframe_mode;
   m_supports_dual_source_blend = features.dual_source_blend;
   m_supports_framebuffer_fetch = features.framebuffer_fetch;
-  m_true_color = g_settings.gpu_true_color;
-  m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer();
+  m_true_color = g_gpu_settings.gpu_true_color;
+  m_pgxp_depth_buffer = g_gpu_settings.UsingPGXPDepthBuffer();
   m_clamp_uvs = ShouldClampUVs(m_texture_filtering) || ShouldClampUVs(m_sprite_texture_filtering);
   m_compute_uv_range = m_clamp_uvs;
   m_allow_sprite_mode = ShouldAllowSpriteMode(m_resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
@@ -271,8 +263,6 @@ bool GPU_HW::Initialize(Error* error)
 
   CheckSettings();
 
-  UpdateSoftwareRenderer(false);
-
   PrintSettingsToLog();
 
   if (!CompileCommonShaders(error) || !CompilePipelines(error))
@@ -286,7 +276,7 @@ bool GPU_HW::Initialize(Error* error)
 
   if (m_use_texture_cache)
   {
-    if (!GPUTextureCache::Initialize())
+    if (!GPUTextureCache::Initialize(this))
     {
       ERROR_LOG("Failed to initialize texture cache, disabling.");
       m_use_texture_cache = false;
@@ -296,33 +286,50 @@ bool GPU_HW::Initialize(Error* error)
   UpdateDownsamplingLevels();
 
   RestoreDeviceContext();
+
+  // If we're not initializing VRAM, need to upload it here. Implies RestoreDeviceContext().
+  if (upload_vram)
+    UpdateVRAMOnGPU(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, VRAM_WIDTH * sizeof(u16), false, false, VRAM_SIZE_RECT);
+
+  DrawingAreaChanged();
   return true;
 }
 
-void GPU_HW::Reset(bool clear_vram)
+u32 GPU_HW::GetResolutionScale() const
+{
+  return m_resolution_scale; // cached value assigned in Initialize()/UpdateSettings(), not the raw setting
+}
+
+void GPU_HW::ClearVRAM() // Resets batch state and clears the framebuffer; also zeroes g_vram/g_gpu_clut for SW drawing.
 {
   // Texture cache needs to be invalidated before we load, otherwise we dump black.
   if (m_use_texture_cache)
     GPUTextureCache::Invalidate();
 
+  // Don't need to finish the current draw.
   if (m_batch_vertex_ptr)
     UnmapGPUBuffer(0, 0);
 
-  GPU::Reset(clear_vram);
+  m_texpage_dirty = false;
+  m_compute_uv_range = m_clamp_uvs; // same value Initialize() and UpdateSettings() assign
 
-  if (m_sw_renderer)
-    m_sw_renderer->Reset();
+  if (ShouldDrawWithSoftwareRenderer()) // the two arrays are only zeroed when this returns true
+  {
+    std::memset(g_vram, 0, sizeof(g_vram));
+    std::memset(g_gpu_clut, 0, sizeof(g_gpu_clut));
+  }
 
   m_batch = {};
   m_current_depth = 1;
-  SetClampedDrawingArea();
-
-  if (clear_vram)
-    ClearFramebuffer();
+  ClearFramebuffer();
 }
 
-bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
+#if 0
+
+bool GPU_HW::DoState(GPUTexture** host_texture, bool is_reading, bool update_display)
 {
+#if 0
+  // TODO: FIXME
   // Need to download local VRAM copy before calling the base class, because it serializes this.
   if (m_sw_renderer)
   {
@@ -339,11 +346,15 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
 
   if (!GPU::DoState(sw, host_texture, update_display))
     return false;
+#else
+  if (!is_reading && !host_texture)
+    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
+#endif
 
   if (host_texture)
   {
     GPUTexture* tex = *host_texture;
-    if (sw.IsReading())
+    if (is_reading)
     {
       if (tex->GetWidth() != m_vram_texture->GetWidth() || tex->GetHeight() != m_vram_texture->GetHeight() ||
           tex->GetSamples() != m_vram_texture->GetSamples())
@@ -377,7 +388,7 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
                                       tex->GetHeight());
     }
   }
-  else if (sw.IsReading())
+  else if (is_reading)
   {
     // Need to update the VRAM copy on the GPU with the state data.
     // Would invalidate the TC, but base DoState() calls Reset().
@@ -385,7 +396,7 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
   }
 
   // invalidate the whole VRAM read texture when loading state
-  if (sw.IsReading())
+  if (is_reading)
   {
     DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr);
     ClearVRAMDirtyRectangle();
@@ -395,7 +406,35 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
     ResetBatchVertexDepth();
   }
 
-  return GPUTextureCache::DoState(sw, !m_use_texture_cache);
+  // TODO:FIXME return GPUTextureCache::DoState(sw, !m_use_texture_cache);
+  return true;
+}
+
+#endif
+
+void GPU_HW::LoadState(const GPUBackendLoadStateCommand* cmd) // Restores VRAM, and CLUT/texture-cache state if used.
+{
+  DebugAssert((m_batch_vertex_ptr != nullptr) == (m_batch_index_ptr != nullptr)); // both set or both null
+  if (m_batch_vertex_ptr)
+    UnmapGPUBuffer(0, 0); // presumably drops the mapped batch without drawing it (zero counts) — confirm
+
+  std::memcpy(g_vram, cmd->vram_data, sizeof(g_vram)); // g_vram is overwritten first, then uploaded in full below
+  UpdateVRAMOnGPU(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, VRAM_WIDTH * sizeof(u16), false, false, VRAM_SIZE_RECT);
+
+  if (ShouldDrawWithSoftwareRenderer()) // the CLUT array is only restored when this returns true
+    std::memcpy(g_gpu_clut, cmd->clut_data, sizeof(g_gpu_clut));
+
+  if (m_use_texture_cache)
+  {
+    GPUTextureCache::LoadState(std::span<const u8>(cmd->texture_cache_state, cmd->texture_cache_state_size),
+                               cmd->texture_cache_state_version);
+  }
+
+  ClearVRAMDirtyRectangle();
+  SetFullVRAMDirtyRectangle(); // presumably forces the read-texture update below to cover all of VRAM — confirm
+  UpdateVRAMReadTexture(true, false);
+  ClearVRAMDirtyRectangle();
+  ResetBatchVertexDepth();
 }
 
 void GPU_HW::RestoreDeviceContext()
@@ -409,50 +448,51 @@ void GPU_HW::RestoreDeviceContext()
 
 void GPU_HW::UpdateSettings(const Settings& old_settings)
 {
-  const bool prev_force_progressive_scan = m_force_progressive_scan;
-
-  GPU::UpdateSettings(old_settings);
+  GPUBackend::UpdateSettings(old_settings);
 
   const GPUDevice::Features features = g_gpu_device->GetFeatures();
 
   const u8 resolution_scale = Truncate8(CalculateResolutionScale());
-  const u8 multisamples = Truncate8(std::min<u32>(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
+  const u8 multisamples = Truncate8(std::min<u32>(g_gpu_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
   const bool clamp_uvs = ShouldClampUVs(m_texture_filtering) || ShouldClampUVs(m_sprite_texture_filtering);
-  const bool framebuffer_changed = (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
-                                    g_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
-                                    m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer() ||
-                                    (!old_settings.gpu_texture_cache && g_settings.gpu_texture_cache));
+  const bool framebuffer_changed =
+    (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
+     g_gpu_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
+     m_pgxp_depth_buffer != g_gpu_settings.UsingPGXPDepthBuffer() ||
+     (!old_settings.gpu_texture_cache && g_gpu_settings.gpu_texture_cache));
   const bool shaders_changed =
     ((m_resolution_scale > 1) != (resolution_scale > 1) || (m_multisamples > 1) != (multisamples > 1) ||
-     m_true_color != g_settings.gpu_true_color || prev_force_progressive_scan != m_force_progressive_scan ||
-     (multisamples > 1 && g_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading) ||
-     (resolution_scale > 1 && g_settings.gpu_scaled_dithering != old_settings.gpu_scaled_dithering) ||
-     (resolution_scale > 1 && g_settings.gpu_texture_filter == GPUTextureFilter::Nearest &&
-      g_settings.gpu_force_round_texcoords != old_settings.gpu_force_round_texcoords) ||
-     g_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
-     m_texture_filtering != g_settings.gpu_texture_filter ||
-     m_sprite_texture_filtering != g_settings.gpu_sprite_texture_filter || m_clamp_uvs != clamp_uvs ||
-     (features.geometry_shaders && g_settings.gpu_wireframe_mode != old_settings.gpu_wireframe_mode) ||
-     m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer() ||
-     (features.noperspective_interpolation && g_settings.gpu_pgxp_enable &&
-      g_settings.gpu_pgxp_color_correction != old_settings.gpu_pgxp_color_correction) ||
-     m_allow_sprite_mode !=
-       ShouldAllowSpriteMode(m_resolution_scale, g_settings.gpu_texture_filter, g_settings.gpu_sprite_texture_filter));
+     m_true_color != g_gpu_settings.gpu_true_color ||
+     (old_settings.display_deinterlacing_mode == DisplayDeinterlacingMode::Progressive) !=
+       (g_gpu_settings.display_deinterlacing_mode == DisplayDeinterlacingMode::Progressive) ||
+     (multisamples > 1 && g_gpu_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading) ||
+     (resolution_scale > 1 && g_gpu_settings.gpu_scaled_dithering != old_settings.gpu_scaled_dithering) ||
+     (resolution_scale > 1 && g_gpu_settings.gpu_texture_filter == GPUTextureFilter::Nearest &&
+      g_gpu_settings.gpu_force_round_texcoords != old_settings.gpu_force_round_texcoords) ||
+     g_gpu_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
+     m_texture_filtering != g_gpu_settings.gpu_texture_filter ||
+     m_sprite_texture_filtering != g_gpu_settings.gpu_sprite_texture_filter || m_clamp_uvs != clamp_uvs ||
+     (features.geometry_shaders && g_gpu_settings.gpu_wireframe_mode != old_settings.gpu_wireframe_mode) ||
+     m_pgxp_depth_buffer != g_gpu_settings.UsingPGXPDepthBuffer() ||
+     (features.noperspective_interpolation && g_gpu_settings.gpu_pgxp_enable &&
+      g_gpu_settings.gpu_pgxp_color_correction != old_settings.gpu_pgxp_color_correction) ||
+     m_allow_sprite_mode != ShouldAllowSpriteMode(m_resolution_scale, g_gpu_settings.gpu_texture_filter,
+                                                  g_gpu_settings.gpu_sprite_texture_filter));
   const bool resolution_dependent_shaders_changed =
     (m_resolution_scale != resolution_scale || m_multisamples != multisamples);
   const bool downsampling_shaders_changed =
     ((m_resolution_scale > 1) != (resolution_scale > 1) ||
-     (resolution_scale > 1 && (g_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
+     (resolution_scale > 1 && (g_gpu_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
                                (m_downsample_mode == GPUDownsampleMode::Box &&
                                 (resolution_scale != m_resolution_scale ||
-                                 g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale)))));
+                                 g_gpu_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale)))));
 
   if (m_resolution_scale != resolution_scale)
   {
     Host::AddIconOSDMessage("ResolutionScaleChanged", ICON_FA_PAINT_BRUSH,
                             fmt::format(TRANSLATE_FS("GPU_HW", "Internal resolution set to {0}x ({1}x{2})."),
-                                        resolution_scale, m_crtc_state.display_width * resolution_scale,
-                                        resolution_scale * m_crtc_state.display_height),
+                                        resolution_scale, m_display_width * resolution_scale,
+                                        resolution_scale * m_display_height),
                             Host::OSD_INFO_DURATION);
   }
 
@@ -484,31 +524,29 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
 
   m_resolution_scale = resolution_scale;
   m_multisamples = multisamples;
-  m_texture_filtering = g_settings.gpu_texture_filter;
-  m_sprite_texture_filtering = g_settings.gpu_sprite_texture_filter;
-  m_line_detect_mode = (m_resolution_scale > 1) ? g_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
+  m_texture_filtering = g_gpu_settings.gpu_texture_filter;
+  m_sprite_texture_filtering = g_gpu_settings.gpu_sprite_texture_filter;
+  m_line_detect_mode = (m_resolution_scale > 1) ? g_gpu_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
   m_downsample_mode = GetDownsampleMode(resolution_scale);
-  m_wireframe_mode = g_settings.gpu_wireframe_mode;
-  m_true_color = g_settings.gpu_true_color;
+  m_wireframe_mode = g_gpu_settings.gpu_wireframe_mode;
+  m_true_color = g_gpu_settings.gpu_true_color;
   m_clamp_uvs = clamp_uvs;
   m_compute_uv_range = m_clamp_uvs;
   m_allow_sprite_mode = ShouldAllowSpriteMode(resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
-  m_use_texture_cache = g_settings.gpu_texture_cache;
-  m_texture_dumping = m_use_texture_cache && g_settings.texture_replacements.dump_textures;
+  m_use_texture_cache = g_gpu_settings.gpu_texture_cache;
+  m_texture_dumping = m_use_texture_cache && g_gpu_settings.texture_replacements.dump_textures;
   m_batch.sprite_mode = (m_allow_sprite_mode && m_batch.sprite_mode);
 
-  const bool depth_buffer_changed = (m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer());
+  const bool depth_buffer_changed = (m_pgxp_depth_buffer != g_gpu_settings.UsingPGXPDepthBuffer());
   if (depth_buffer_changed)
   {
-    m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer();
+    m_pgxp_depth_buffer = g_gpu_settings.UsingPGXPDepthBuffer();
     m_batch.use_depth_buffer = false;
     m_depth_was_copied = false;
   }
 
   CheckSettings();
 
-  UpdateSoftwareRenderer(true);
-
   PrintSettingsToLog();
 
   if (shaders_changed)
@@ -543,10 +581,9 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
 
     UpdateDownsamplingLevels();
     RestoreDeviceContext();
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, false, false);
+    UpdateVRAMOnGPU(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, VRAM_WIDTH * sizeof(u16), false, false, VRAM_SIZE_RECT);
     if (m_write_mask_as_depth)
       UpdateDepthBufferFromMaskBit();
-    UpdateDisplay();
   }
   else if (m_vram_depth_texture && depth_buffer_changed)
   {
@@ -558,7 +595,7 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
 
   if (m_use_texture_cache && !old_settings.gpu_texture_cache)
   {
-    if (!GPUTextureCache::Initialize())
+    if (!GPUTextureCache::Initialize(this))
     {
       ERROR_LOG("Failed to initialize texture cache, disabling.");
       m_use_texture_cache = false;
@@ -571,23 +608,33 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
 
   GPUTextureCache::UpdateSettings(m_use_texture_cache, old_settings);
 
-  if (g_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
-      (g_settings.gpu_downsample_mode == GPUDownsampleMode::Box &&
-       g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale))
+  if (g_gpu_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
+      (g_gpu_settings.gpu_downsample_mode == GPUDownsampleMode::Box &&
+       g_gpu_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale))
   {
     UpdateDownsamplingLevels();
   }
+
+  // Need to reload CLUT if we're enabling SW rendering.
+  if (g_gpu_settings.gpu_use_software_renderer_for_readbacks && !old_settings.gpu_use_software_renderer_for_readbacks)
+  {
+    if (m_draw_mode.mode_reg.texture_mode <= GPUTextureMode::Palette8Bit)
+    {
+      GPU_SW_Rasterizer::UpdateCLUT(m_draw_mode.palette_reg,
+                                    m_draw_mode.mode_reg.texture_mode == GPUTextureMode::Palette8Bit);
+    }
+  }
 }
 
 void GPU_HW::CheckSettings()
 {
   const GPUDevice::Features features = g_gpu_device->GetFeatures();
 
-  if (m_multisamples != g_settings.gpu_multisamples)
+  if (m_multisamples != g_gpu_settings.gpu_multisamples)
   {
     Host::AddIconOSDMessage("MSAAUnsupported", ICON_EMOJI_WARNING,
                             fmt::format(TRANSLATE_FS("GPU_HW", "{}x MSAA is not supported, using {}x instead."),
-                                        g_settings.gpu_multisamples, m_multisamples),
+                                        g_gpu_settings.gpu_multisamples, m_multisamples),
                             Host::OSD_CRITICAL_ERROR_DURATION);
   }
   else
@@ -595,7 +642,7 @@ void GPU_HW::CheckSettings()
     Host::RemoveKeyedOSDMessage("MSAAUnsupported");
   }
 
-  if (g_settings.gpu_per_sample_shading && !features.per_sample_shading)
+  if (g_gpu_settings.gpu_per_sample_shading && !features.per_sample_shading)
   {
     Host::AddIconOSDMessage("SSAAUnsupported", ICON_EMOJI_WARNING,
                             TRANSLATE_STR("GPU_HW", "SSAA is not supported, using MSAA instead."),
@@ -661,13 +708,13 @@ void GPU_HW::CheckSettings()
   {
     const u32 resolution_scale = CalculateResolutionScale();
     const u32 box_downscale = GetBoxDownsampleScale(resolution_scale);
-    if (box_downscale != g_settings.gpu_downsample_scale || box_downscale == resolution_scale)
+    if (box_downscale != g_gpu_settings.gpu_downsample_scale || box_downscale == resolution_scale)
     {
       Host::AddIconOSDMessage(
         "BoxDownsampleUnsupported", ICON_FA_PAINT_BRUSH,
         fmt::format(TRANSLATE_FS(
                       "GPU_HW", "Resolution scale {0}x is not divisible by downsample scale {1}x, using {2}x instead."),
-                    resolution_scale, g_settings.gpu_downsample_scale, box_downscale),
+                    resolution_scale, g_gpu_settings.gpu_downsample_scale, box_downscale),
         Host::OSD_WARNING_DURATION);
     }
     else
@@ -675,7 +722,7 @@ void GPU_HW::CheckSettings()
       Host::RemoveKeyedOSDMessage("BoxDownsampleUnsupported");
     }
 
-    if (box_downscale == g_settings.gpu_resolution_scale)
+    if (box_downscale == g_gpu_settings.gpu_resolution_scale)
       m_downsample_mode = GPUDownsampleMode::Disabled;
   }
 }
@@ -683,15 +730,15 @@ void GPU_HW::CheckSettings()
 u32 GPU_HW::CalculateResolutionScale() const
 {
   u32 scale;
-  if (g_settings.gpu_resolution_scale != 0)
+  if (g_gpu_settings.gpu_resolution_scale != 0)
   {
-    scale = g_settings.gpu_resolution_scale;
+    scale = g_gpu_settings.gpu_resolution_scale;
   }
   else
   {
     // Auto scaling.
-    if (m_crtc_state.display_width == 0 || m_crtc_state.display_height == 0 || m_crtc_state.display_vram_width == 0 ||
-        m_crtc_state.display_vram_height == 0 || m_GPUSTAT.display_disable || !g_gpu_device->HasMainSwapChain())
+    if (m_display_width == 0 || m_display_height == 0 || m_display_vram_width == 0 || m_display_vram_height == 0 ||
+        !m_display_texture || !g_gpu_device->HasMainSwapChain())
     {
       // When the system is starting and all borders crop is enabled, the registers are zero, and
       // display_height therefore is also zero. Keep the existing resolution until it updates.
@@ -708,19 +755,19 @@ u32 GPU_HW::CalculateResolutionScale() const
       const s32 draw_width = draw_rect.width();
       const s32 draw_height = draw_rect.height();
       scale = static_cast<u32>(
-        std::ceil(std::max(static_cast<float>(draw_width) / static_cast<float>(m_crtc_state.display_vram_width),
-                           static_cast<float>(draw_height) / static_cast<float>(m_crtc_state.display_vram_height))));
+        std::ceil(std::max(static_cast<float>(draw_width) / static_cast<float>(m_display_vram_width),
+                           static_cast<float>(draw_height) / static_cast<float>(m_display_vram_height))));
       VERBOSE_LOG("Draw Size = {}x{}, VRAM Size = {}x{}, Preferred Scale = {}", draw_width, draw_height,
-                  m_crtc_state.display_vram_width, m_crtc_state.display_vram_height, scale);
+                  m_display_vram_width, m_display_vram_height, scale);
     }
   }
 
-  if (g_settings.gpu_downsample_mode == GPUDownsampleMode::Adaptive && scale > 1 && !Common::IsPow2(scale))
+  if (g_gpu_settings.gpu_downsample_mode == GPUDownsampleMode::Adaptive && scale > 1 && !Common::IsPow2(scale))
   {
     const u32 new_scale = Common::PreviousPow2(scale);
     WARNING_LOG("Resolution scale {}x not supported for adaptive downsampling, using {}x", scale, new_scale);
 
-    if (g_settings.gpu_resolution_scale != 0)
+    if (g_gpu_settings.gpu_resolution_scale != 0)
     {
       Host::AddIconOSDMessage(
         "ResolutionNotPow2", ICON_FA_PAINT_BRUSH,
@@ -736,11 +783,6 @@ u32 GPU_HW::CalculateResolutionScale() const
   return std::clamp<u32>(scale, 1, GetMaxResolutionScale());
 }
 
-u32 GPU_HW::GetResolutionScale() const
-{
-  return m_resolution_scale;
-}
-
 void GPU_HW::UpdateResolutionScale()
 {
   if (CalculateResolutionScale() != m_resolution_scale)
@@ -749,7 +791,13 @@ void GPU_HW::UpdateResolutionScale()
 
 GPUDownsampleMode GPU_HW::GetDownsampleMode(u32 resolution_scale) const
 {
-  return (resolution_scale == 1) ? GPUDownsampleMode::Disabled : g_settings.gpu_downsample_mode;
+  return (resolution_scale == 1) ? GPUDownsampleMode::Disabled : g_gpu_settings.gpu_downsample_mode;
+}
+
+bool GPU_HW::ShouldDrawWithSoftwareRenderer() const
+{
+  // Reads the readback setting straight from g_gpu_settings on each call. TODO: FIXME: Move into class.
+  return g_gpu_settings.gpu_use_software_renderer_for_readbacks;
 }
 
 bool GPU_HW::IsUsingMultisampling() const
@@ -757,15 +805,15 @@ bool GPU_HW::IsUsingMultisampling() const
   return m_multisamples > 1;
 }
 
-bool GPU_HW::IsUsingDownsampling() const
+bool GPU_HW::IsUsingDownsampling(const GPUBackendUpdateDisplayCommand* cmd) const
 {
-  return (m_downsample_mode != GPUDownsampleMode::Disabled && !m_GPUSTAT.display_area_color_depth_24);
+  return (m_downsample_mode != GPUDownsampleMode::Disabled && !cmd->display_24bit);
 }
 
 void GPU_HW::SetFullVRAMDirtyRectangle()
 {
   m_vram_dirty_draw_rect = VRAM_SIZE_RECT;
-  m_draw_mode.SetTexturePageChanged();
+  m_draw_mode.bits = INVALID_DRAW_MODE_BITS; // sentinel checked by SetTexPageChangedOnOverlap()
 }
 
 void GPU_HW::ClearVRAMDirtyRectangle()
@@ -810,12 +858,12 @@ void GPU_HW::SetTexPageChangedOnOverlap(const GSVector4i update_rect)
 {
   // the vram area can include the texture page, but the game can leave it as-is. in this case, set it as dirty so the
   // shadow texture is updated
-  if (!m_draw_mode.IsTexturePageChanged() && m_batch.texture_mode != BatchTextureMode::Disabled &&
+  if (m_draw_mode.bits != INVALID_DRAW_MODE_BITS && m_batch.texture_mode != BatchTextureMode::Disabled &&
       (GetTextureRect(m_draw_mode.mode_reg.texture_page, m_draw_mode.mode_reg.texture_mode).rintersects(update_rect) ||
        (m_draw_mode.mode_reg.IsUsingPalette() &&
         GetPaletteRect(m_draw_mode.palette_reg, m_draw_mode.mode_reg.texture_mode).rintersects(update_rect))))
   {
-    m_draw_mode.SetTexturePageChanged();
+    m_draw_mode.bits = INVALID_DRAW_MODE_BITS;
   }
 }
 
@@ -824,12 +872,13 @@ void GPU_HW::PrintSettingsToLog()
   INFO_LOG("Resolution Scale: {} ({}x{}), maximum {}", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
            VRAM_HEIGHT * m_resolution_scale, GetMaxResolutionScale());
   INFO_LOG("Multisampling: {}x{}", m_multisamples,
-           (g_settings.gpu_per_sample_shading && g_gpu_device->GetFeatures().per_sample_shading) ?
+           (g_gpu_settings.gpu_per_sample_shading && g_gpu_device->GetFeatures().per_sample_shading) ?
              " (per sample shading)" :
              "");
-  INFO_LOG("Dithering: {}", m_true_color ? "Disabled" : "Enabled", (!m_true_color && g_settings.gpu_scaled_dithering));
+  INFO_LOG("Dithering: {}{}", m_true_color ? "Disabled" : "Enabled",
+           (!m_true_color && g_gpu_settings.gpu_scaled_dithering) ? " (Scaled)" : "");
   INFO_LOG("Force round texture coordinates: {}",
-           (m_resolution_scale > 1 && g_settings.gpu_force_round_texcoords) ? "Enabled" : "Disabled");
+           (m_resolution_scale > 1 && g_gpu_settings.gpu_force_round_texcoords) ? "Enabled" : "Disabled");
   INFO_LOG("Texture Filtering: {}/{}", Settings::GetTextureFilterDisplayName(m_texture_filtering),
            Settings::GetTextureFilterDisplayName(m_sprite_texture_filtering));
   INFO_LOG("Dual-source blending: {}", m_supports_dual_source_blend ? "Supported" : "Not supported");
@@ -838,7 +887,7 @@ void GPU_HW::PrintSettingsToLog()
   INFO_LOG("Downsampling: {}", Settings::GetDownsampleModeDisplayName(m_downsample_mode));
   INFO_LOG("Wireframe rendering: {}", Settings::GetGPUWireframeModeDisplayName(m_wireframe_mode));
   INFO_LOG("Line detection: {}", Settings::GetLineDetectModeDisplayName(m_line_detect_mode));
-  INFO_LOG("Using software renderer for readbacks: {}", m_sw_renderer ? "YES" : "NO");
+  INFO_LOG("Using software renderer for readbacks: {}", ShouldDrawWithSoftwareRenderer() ? "YES" : "NO");
   INFO_LOG("Separate sprite shaders: {}", m_allow_sprite_mode ? "YES" : "NO");
 }
 
@@ -947,6 +996,7 @@ void GPU_HW::ClearFramebuffer()
   if (m_use_texture_cache)
     GPUTextureCache::Invalidate();
   m_last_depth_z = 1.0f;
+  m_current_depth = 1;
 }
 
 void GPU_HW::SetVRAMRenderTarget()
@@ -1016,12 +1066,14 @@ bool GPU_HW::CompilePipelines(Error* error)
   const GPUDevice::Features features = g_gpu_device->GetFeatures();
   const bool upscaled = (m_resolution_scale > 1);
   const bool msaa = (m_multisamples > 1);
-  const bool per_sample_shading = (msaa && g_settings.gpu_per_sample_shading && features.per_sample_shading);
+  const bool per_sample_shading = (msaa && g_gpu_settings.gpu_per_sample_shading && features.per_sample_shading);
   const bool force_round_texcoords =
-    (upscaled && m_texture_filtering == GPUTextureFilter::Nearest && g_settings.gpu_force_round_texcoords);
-  const bool true_color = g_settings.gpu_true_color;
-  const bool scaled_dithering = (!m_true_color && upscaled && g_settings.gpu_scaled_dithering);
+    (upscaled && m_texture_filtering == GPUTextureFilter::Nearest && g_gpu_settings.gpu_force_round_texcoords);
+  const bool true_color = g_gpu_settings.gpu_true_color;
+  const bool scaled_dithering = (!m_true_color && upscaled && g_gpu_settings.gpu_scaled_dithering);
   const bool disable_color_perspective = ShouldDisableColorPerspective();
+  const bool force_progressive_scan =
+    (g_gpu_settings.display_deinterlacing_mode == DisplayDeinterlacingMode::Progressive);
 
   // Determine when to use shader blending.
   // FBFetch is free, we need it for filtering without DSB, or when accurate blending is forced.
@@ -1030,10 +1082,10 @@ bool GPU_HW::CompilePipelines(Error* error)
   // Abuse the depth buffer for the mask bit when it's free (FBFetch), or PGXP depth buffering is enabled.
   m_allow_shader_blend = features.framebuffer_fetch ||
                          ((features.feedback_loops || features.raster_order_views) &&
-                          (m_pgxp_depth_buffer || g_settings.IsUsingAccurateBlending() ||
+                          (m_pgxp_depth_buffer || g_gpu_settings.IsUsingAccurateBlending() ||
                            (!m_supports_dual_source_blend && (IsBlendedTextureFiltering(m_texture_filtering) ||
                                                               IsBlendedTextureFiltering(m_sprite_texture_filtering)))));
-  m_prefer_shader_blend = (m_allow_shader_blend && g_settings.IsUsingAccurateBlending());
+  m_prefer_shader_blend = (m_allow_shader_blend && g_gpu_settings.IsUsingAccurateBlending());
   m_use_rov_for_shader_blend = (m_allow_shader_blend && !features.framebuffer_fetch && features.raster_order_views &&
                                 (m_prefer_shader_blend || !features.feedback_loops));
   m_write_mask_as_depth = (!m_pgxp_depth_buffer && !features.framebuffer_fetch && !m_prefer_shader_blend);
@@ -1068,11 +1120,11 @@ bool GPU_HW::CompilePipelines(Error* error)
                           (NUM_TEXTURE_MODES - (NUM_TEXTURE_MODES - static_cast<u32>(BatchTextureMode::SpriteStart)));
   const u32 total_vertex_shaders = (m_allow_sprite_mode ? 7 : 4);
   const u32 total_fragment_shaders = ((1 + BoolToUInt32(needs_rov_depth)) * 5 * 5 * active_texture_modes * 2 *
-                                      (1 + BoolToUInt32(!true_color)) * (1 + BoolToUInt32(!m_force_progressive_scan)));
+                                      (1 + BoolToUInt32(!true_color)) * (1 + BoolToUInt32(!force_progressive_scan)));
   const u32 total_items =
     total_vertex_shaders + total_fragment_shaders +
     ((m_pgxp_depth_buffer ? 2 : 1) * 5 * 5 * active_texture_modes * 2 * (1 + BoolToUInt32(!true_color)) *
-     (1 + BoolToUInt32(!m_force_progressive_scan))) +            // batch pipelines
+     (1 + BoolToUInt32(!force_progressive_scan))) +              // batch pipelines
     ((m_wireframe_mode != GPUWireframeMode::Disabled) ? 1 : 0) + // wireframe
     (2 * 2) +                                                    // vram fill
     (1 + BoolToUInt32(m_write_mask_as_depth)) +                  // vram copy
@@ -1160,7 +1212,7 @@ bool GPU_HW::CompilePipelines(Error* error)
           (needs_rov_depth && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend)))
         {
           progress.Increment(active_texture_modes * 2 * (1 + BoolToUInt32(!true_color)) *
-                             (1 + BoolToUInt32(!m_force_progressive_scan)));
+                             (1 + BoolToUInt32(!force_progressive_scan)));
           continue;
         }
 
@@ -1171,7 +1223,7 @@ bool GPU_HW::CompilePipelines(Error* error)
             if (check_mask && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend))
             {
               // mask bit testing is only valid with shader blending.
-              progress.Increment((1 + BoolToUInt32(!true_color)) * (1 + BoolToUInt32(!m_force_progressive_scan)));
+              progress.Increment((1 + BoolToUInt32(!true_color)) * (1 + BoolToUInt32(!force_progressive_scan)));
               continue;
             }
 
@@ -1184,7 +1236,7 @@ bool GPU_HW::CompilePipelines(Error* error)
               for (u8 interlacing = 0; interlacing < 2; interlacing++)
               {
                 // Never going to draw with line skipping in force progressive.
-                if (interlacing && m_force_progressive_scan)
+                if (interlacing && force_progressive_scan)
                   continue;
 
                 const bool sprite = (static_cast<BatchTextureMode>(texture_mode) >= BatchTextureMode::SpriteStart);
@@ -1267,7 +1319,7 @@ bool GPU_HW::CompilePipelines(Error* error)
           (needs_rov_depth && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend)))
         {
           progress.Increment(active_texture_modes * 2 * (1 + BoolToUInt32(!true_color)) *
-                             (1 + BoolToUInt32(!m_force_progressive_scan)));
+                             (1 + BoolToUInt32(!force_progressive_scan)));
           continue;
         }
 
@@ -1282,7 +1334,7 @@ bool GPU_HW::CompilePipelines(Error* error)
             for (u8 interlacing = 0; interlacing < 2; interlacing++)
             {
               // Never going to draw with line skipping in force progressive.
-              if (interlacing && m_force_progressive_scan)
+              if (interlacing && force_progressive_scan)
                 continue;
 
               for (u8 check_mask = 0; check_mask < 2; check_mask++)
@@ -2007,7 +2059,8 @@ ALWAYS_INLINE_RELEASE void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode
   }
 }
 
-ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
+ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd,
+                                                                       BatchVertex* vertices)
 {
   // Taken from beetle-psx gpu_polygon.cpp
   // For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior. If U or V is decreasing in X
@@ -2113,7 +2166,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVert
 
   // 2D polygons should have zero change in V on the X axis, and vice versa.
   if (m_allow_sprite_mode)
-    SetBatchSpriteMode(zero_dudy && zero_dvdx);
+    SetBatchSpriteMode(cmd, zero_dudy && zero_dvdx);
 }
 
 bool GPU_HW::IsPossibleSpritePolygon(const BatchVertex* vertices) const
@@ -2293,7 +2346,7 @@ ALWAYS_INLINE_RELEASE bool GPU_HW::ExpandLineTriangles(BatchVertex* vertices)
   return true;
 }
 
-void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
+void GPU_HW::ComputePolygonUVLimits(const GPUBackendDrawCommand* cmd, BatchVertex* vertices, u32 num_vertices)
 {
   DebugAssert(num_vertices == 3 || num_vertices == 4);
 
@@ -2321,10 +2374,10 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
     vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);
 
   if (ShouldCheckForTexPageOverlap())
-    CheckForTexPageOverlap(GSVector4i(min).upl32(GSVector4i(max)).u16to32());
+    CheckForTexPageOverlap(cmd, GSVector4i(min).upl32(GSVector4i(max)).u16to32());
 }
 
-void GPU_HW::SetBatchDepthBuffer(bool enabled)
+void GPU_HW::SetBatchDepthBuffer(const GPUBackendDrawCommand* cmd, bool enabled)
 {
   if (m_batch.use_depth_buffer == enabled)
     return;
@@ -2332,13 +2385,13 @@ void GPU_HW::SetBatchDepthBuffer(bool enabled)
   if (m_batch_index_count > 0)
   {
     FlushRender();
-    EnsureVertexBufferSpaceForCurrentCommand();
+    EnsureVertexBufferSpaceForCommand(cmd);
   }
 
   m_batch.use_depth_buffer = enabled;
 }
 
-void GPU_HW::CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices)
+void GPU_HW::CheckForDepthClear(const GPUBackendDrawCommand* cmd, const BatchVertex* vertices, u32 num_vertices)
 {
   DebugAssert(num_vertices == 3 || num_vertices == 4);
   float average_z;
@@ -2347,17 +2400,17 @@ void GPU_HW::CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices)
   else
     average_z = std::min((vertices[0].w + vertices[1].w + vertices[2].w + vertices[3].w) / 4.0f, 1.0f);
 
-  if ((average_z - m_last_depth_z) >= g_settings.gpu_pgxp_depth_clear_threshold)
+  if ((average_z - m_last_depth_z) >= g_gpu_settings.gpu_pgxp_depth_clear_threshold)
   {
     FlushRender();
     CopyAndClearDepthBuffer();
-    EnsureVertexBufferSpaceForCurrentCommand();
+    EnsureVertexBufferSpaceForCommand(cmd);
   }
 
   m_last_depth_z = average_z;
 }
 
-void GPU_HW::SetBatchSpriteMode(bool enabled)
+void GPU_HW::SetBatchSpriteMode(const GPUBackendDrawCommand* cmd, bool enabled)
 {
   if (m_batch.sprite_mode == enabled)
     return;
@@ -2365,7 +2418,7 @@ void GPU_HW::SetBatchSpriteMode(bool enabled)
   if (m_batch_index_count > 0)
   {
     FlushRender();
-    EnsureVertexBufferSpaceForCurrentCommand();
+    EnsureVertexBufferSpaceForCommand(cmd);
   }
 
   GL_INS_FMT("Sprite mode is now {}", enabled ? "ON" : "OFF");
@@ -2373,6 +2426,43 @@ void GPU_HW::SetBatchSpriteMode(bool enabled)
   m_batch.sprite_mode = enabled;
 }
 
+void GPU_HW::DrawLine(const GPUBackendDrawLineCommand* cmd)
+{
+  PrepareDraw(cmd);
+  SetBatchDepthBuffer(cmd, false);
+  // Vertices come in pairs; each pair is one segment passed to the DrawLine(bounds, ...) overload.
+  const u32 num_vertices = cmd->num_vertices;
+  DebugAssert(m_batch_vertex_space >= (num_vertices * 4) && m_batch_index_space >= (num_vertices * 6));
+
+  const float depth = GetCurrentNormalizedVertexDepth();
+
+  for (u32 i = 0; i < num_vertices; i += 2)
+  {
+    const GSVector2i start_pos = GSVector2i::load(&cmd->vertices[i].x);
+    const u32 start_color = cmd->vertices[i].color;
+    const GSVector2i end_pos = GSVector2i::load(&cmd->vertices[i + 1].x);
+    const u32 end_color = cmd->vertices[i + 1].color;
+
+    const GSVector4i bounds = GSVector4i::xyxy(start_pos, end_pos);
+    const GSVector4i rect =
+      GSVector4i::xyxy(start_pos.min_s32(end_pos), start_pos.max_s32(end_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1));
+    const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
+    DebugAssert(rect.width() <= MAX_PRIMITIVE_WIDTH && rect.height() <= MAX_PRIMITIVE_HEIGHT && !clamped_rect.rempty());
+
+    AddDrawnRectangle(clamped_rect);
+    DrawLine(GSVector4(bounds), start_color, end_color, depth);
+  }
+
+  if (ShouldDrawWithSoftwareRenderer())
+  {
+    const GPU_SW_Rasterizer::DrawLineFunction DrawFunction =
+      GPU_SW_Rasterizer::GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable);
+    // Replay the same vertex pairs through the software rasterizer.
+    for (u32 i = 0; i < num_vertices; i += 2)
+      DrawFunction(cmd, &cmd->vertices[i], &cmd->vertices[i + 1]);
+  }
+}
+
 void GPU_HW::DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth)
 {
   DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
@@ -2471,454 +2561,241 @@ void GPU_HW::DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth)
   m_batch_index_space -= 6;
 }
 
-void GPU_HW::LoadVertices()
+void GPU_HW::DrawSprite(const GPUBackendDrawRectangleCommand* cmd)
 {
-  if (m_GPUSTAT.check_mask_before_draw)
-    m_current_depth++;
+  PrepareDraw(cmd);
+  SetBatchDepthBuffer(cmd, false);
+  SetBatchSpriteMode(cmd, m_allow_sprite_mode);
+  DebugAssert(m_batch_vertex_space >= MAX_VERTICES_FOR_RECTANGLE && m_batch_index_space >= MAX_VERTICES_FOR_RECTANGLE);
 
-  const GPURenderCommand rc{m_render_command.bits};
-  const u32 texpage = ZeroExtend32(m_draw_mode.mode_reg.bits) | (ZeroExtend32(m_draw_mode.palette_reg.bits) << 16);
+  const s32 pos_x = cmd->x;
+  const s32 pos_y = cmd->y;
+  const u32 texpage = m_draw_mode.bits;
+  const u32 color = (cmd->rc.texture_enable && cmd->rc.raw_texture_enable) ? UINT32_C(0x00808080) : cmd->color;
   const float depth = GetCurrentNormalizedVertexDepth();
+  const u32 orig_tex_left = ZeroExtend32(Truncate8(cmd->texcoord));
+  const u32 orig_tex_top = ZeroExtend32(cmd->texcoord) >> 8;
+  const u32 rectangle_width = cmd->width;
+  const u32 rectangle_height = cmd->height;
 
-  switch (rc.primitive)
+  const GSVector4i rect =
+    GSVector4i(pos_x, pos_y, pos_x + static_cast<s32>(rectangle_width), pos_y + static_cast<s32>(rectangle_height));
+  const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
+  DebugAssert(!clamped_rect.rempty());
+
+  // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat.
+  u32 tex_top = orig_tex_top;
+  for (u32 y_offset = 0; y_offset < rectangle_height;)
   {
-    case GPUPrimitive::Polygon:
+    const s32 quad_height = std::min(rectangle_height - y_offset, TEXTURE_PAGE_HEIGHT - tex_top);
+    const float quad_start_y = static_cast<float>(pos_y + static_cast<s32>(y_offset));
+    const float quad_end_y = quad_start_y + static_cast<float>(quad_height);
+    const u32 tex_bottom = tex_top + quad_height;
+    // Rows are bounded by the page height and columns by the page width, so each quad stays inside one page.
+    u32 tex_left = orig_tex_left;
+    for (u32 x_offset = 0; x_offset < rectangle_width;)
     {
-      const bool textured = rc.texture_enable;
-      const bool raw_texture = textured && rc.raw_texture_enable;
-      const bool shaded = rc.shading_enable;
-      const bool pgxp = g_settings.gpu_pgxp_enable;
+      const s32 quad_width = std::min(rectangle_width - x_offset, TEXTURE_PAGE_WIDTH - tex_left);
+      const float quad_start_x = static_cast<float>(pos_x + static_cast<s32>(x_offset));
+      const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
+      const u32 tex_right = tex_left + quad_width;
+      const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
 
-      const u32 first_color = rc.color_for_first_vertex;
-      u32 num_vertices = rc.quad_polygon ? 4 : 3;
-      std::array<BatchVertex, 4> vertices;
-      std::array<GSVector2i, 4> native_vertex_positions;
-      std::array<u16, 4> native_texcoords;
-      bool valid_w = g_settings.gpu_pgxp_texture_correction;
-      for (u32 i = 0; i < num_vertices; i++)
+      if (cmd->rc.texture_enable && ShouldCheckForTexPageOverlap())
       {
-        const u32 vert_color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
-        const u32 color = raw_texture ? UINT32_C(0x00808080) : vert_color;
-        const u64 maddr_and_pos = m_fifo.Pop();
-        const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
-        const u16 texcoord = textured ? Truncate16(FifoPop()) : 0;
-        const s32 native_x = native_vertex_positions[i].x = m_drawing_offset.x + vp.x;
-        const s32 native_y = native_vertex_positions[i].y = m_drawing_offset.y + vp.y;
-        native_texcoords[i] = texcoord;
-        vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
-                        texcoord, 0xFFFF0000u);
-
-        if (pgxp)
-        {
-          valid_w &= CPU::PGXP::GetPreciseVertex(Truncate32(maddr_and_pos >> 32), vp.bits, native_x, native_y,
-                                                 m_drawing_offset.x, m_drawing_offset.y, &vertices[i].x, &vertices[i].y,
-                                                 &vertices[i].w);
-        }
-      }
-      if (pgxp)
-      {
-        if (!valid_w)
-        {
-          SetBatchDepthBuffer(false);
-          if (g_settings.gpu_pgxp_disable_2d)
-          {
-            // NOTE: This reads uninitialized data, but it's okay, it doesn't get used.
-            for (size_t i = 0; i < vertices.size(); i++)
-            {
-              BatchVertex& v = vertices[i];
-              v.x = static_cast<float>(native_vertex_positions[i].x);
-              v.y = static_cast<float>(native_vertex_positions[i].y);
-              v.w = 1.0f;
-            }
-          }
-          else
-          {
-            for (BatchVertex& v : vertices)
-              v.w = 1.0f;
-          }
-        }
-        else if (m_pgxp_depth_buffer)
-        {
-          SetBatchDepthBuffer(true);
-          CheckForDepthClear(vertices.data(), num_vertices);
-        }
+        CheckForTexPageOverlap(cmd, GSVector4i(static_cast<s32>(tex_left), static_cast<s32>(tex_top),
+                                               static_cast<s32>(tex_right), static_cast<s32>(tex_bottom)));
       }
 
-      // Use PGXP to exclude primitives that are definitely 3D.
-      const bool is_3d = (vertices[0].w != vertices[1].w || vertices[0].w != vertices[2].w);
-      if (m_resolution_scale > 1 && !is_3d && rc.quad_polygon)
-        HandleFlippedQuadTextureCoordinates(vertices.data());
-      else if (m_allow_sprite_mode)
-        SetBatchSpriteMode((pgxp && !is_3d) || IsPossibleSpritePolygon(vertices.data()));
+      const u32 base_vertex = m_batch_vertex_count;
+      (m_batch_vertex_ptr++)
+        ->Set(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_top),
+              uv_limits);
+      (m_batch_vertex_ptr++)
+        ->Set(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_top),
+              uv_limits);
+      (m_batch_vertex_ptr++)
+        ->Set(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_bottom),
+              uv_limits);
+      (m_batch_vertex_ptr++)
+        ->Set(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_bottom),
+              uv_limits);
+      m_batch_vertex_count += 4;
+      m_batch_vertex_space -= 4;
 
-      if (m_sw_renderer)
-      {
-        GPUBackendDrawPolygonCommand* cmd = m_sw_renderer->NewDrawPolygonCommand(num_vertices);
-        FillDrawCommand(cmd, rc);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 0);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
+      *(m_batch_index_ptr++) = Truncate16(base_vertex + 3);
+      m_batch_index_count += 6;
+      m_batch_index_space -= 6;
 
-        const u32 sw_num_vertices = rc.quad_polygon ? 4 : 3;
-        for (u32 i = 0; i < sw_num_vertices; i++)
-        {
-          GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
-          vert->x = native_vertex_positions[i].x;
-          vert->y = native_vertex_positions[i].y;
-          vert->texcoord = native_texcoords[i];
-          vert->color = vertices[i].color;
-        }
-
-        m_sw_renderer->PushCommand(cmd);
-      }
-
-      // Cull polygons which are too large.
-      const GSVector2 v0f = GSVector2::load(&vertices[0].x);
-      const GSVector2 v1f = GSVector2::load(&vertices[1].x);
-      const GSVector2 v2f = GSVector2::load(&vertices[2].x);
-      const GSVector2 min_pos_12 = v1f.min(v2f);
-      const GSVector2 max_pos_12 = v1f.max(v2f);
-      const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
-                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
-      const GSVector4i clamped_draw_rect_012 = draw_rect_012.rintersect(m_clamped_drawing_area);
-      const bool first_tri_culled = (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH ||
-                                     draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT || clamped_draw_rect_012.rempty());
-      if (first_tri_culled)
-      {
-        GL_INS_FMT("Culling off-screen/too-large polygon: {},{} {},{} {},{}", native_vertex_positions[0].x,
-                   native_vertex_positions[0].y, native_vertex_positions[1].x, native_vertex_positions[1].y,
-                   native_vertex_positions[2].x, native_vertex_positions[2].y);
-
-        if (!rc.quad_polygon)
-          return;
-      }
-      else
-      {
-        if (textured && m_compute_uv_range)
-          ComputePolygonUVLimits(vertices.data(), num_vertices);
-
-        AddDrawnRectangle(clamped_draw_rect_012);
-        AddDrawTriangleTicks(native_vertex_positions[0], native_vertex_positions[1], native_vertex_positions[2],
-                             rc.shading_enable, rc.texture_enable, rc.transparency_enable);
-
-        // Expand lines to triangles (Doom, Soul Blade, etc.)
-        if (!rc.quad_polygon && m_line_detect_mode >= GPULineDetectMode::BasicTriangles && !is_3d &&
-            ExpandLineTriangles(vertices.data()))
-        {
-          return;
-        }
-
-        const u32 start_index = m_batch_vertex_count;
-        DebugAssert(m_batch_index_space >= 3);
-        *(m_batch_index_ptr++) = Truncate16(start_index);
-        *(m_batch_index_ptr++) = Truncate16(start_index + 1);
-        *(m_batch_index_ptr++) = Truncate16(start_index + 2);
-        m_batch_index_count += 3;
-        m_batch_index_space -= 3;
-      }
-
-      // quads
-      if (rc.quad_polygon)
-      {
-        const GSVector2 v3f = GSVector2::load(&vertices[3].x);
-        const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
-                                           .add32(GSVector4i::cxpr(0, 0, 1, 1));
-        const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
-
-        // Cull polygons which are too large.
-        const bool second_tri_culled =
-          (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
-           clamped_draw_rect_123.rempty());
-        if (second_tri_culled)
-        {
-          GL_INS_FMT("Culling off-screen/too-large polygon (quad second half): {},{} {},{} {},{}",
-                     native_vertex_positions[2].x, native_vertex_positions[2].y, native_vertex_positions[1].x,
-                     native_vertex_positions[1].y, native_vertex_positions[0].x, native_vertex_positions[0].y);
-
-          if (first_tri_culled)
-            return;
-        }
-        else
-        {
-          if (first_tri_culled && textured && m_compute_uv_range)
-            ComputePolygonUVLimits(vertices.data(), num_vertices);
-
-          AddDrawnRectangle(clamped_draw_rect_123);
-          AddDrawTriangleTicks(native_vertex_positions[2], native_vertex_positions[1], native_vertex_positions[3],
-                               rc.shading_enable, rc.texture_enable, rc.transparency_enable);
-
-          const u32 start_index = m_batch_vertex_count;
-          DebugAssert(m_batch_index_space >= 3);
-          *(m_batch_index_ptr++) = Truncate16(start_index + 2);
-          *(m_batch_index_ptr++) = Truncate16(start_index + 1);
-          *(m_batch_index_ptr++) = Truncate16(start_index + 3);
-          m_batch_index_count += 3;
-          m_batch_index_space -= 3;
-        }
-      }
-
-      if (num_vertices == 4)
-      {
-        DebugAssert(m_batch_vertex_space >= 4);
-        std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 4);
-        m_batch_vertex_ptr += 4;
-        m_batch_vertex_count += 4;
-        m_batch_vertex_space -= 4;
-      }
-      else
-      {
-        DebugAssert(m_batch_vertex_space >= 3);
-        std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
-        m_batch_vertex_ptr += 3;
-        m_batch_vertex_count += 3;
-        m_batch_vertex_space -= 3;
-      }
+      x_offset += quad_width;
+      tex_left = 0;
     }
-    break;
 
-    case GPUPrimitive::Rectangle:
+    y_offset += quad_height;
+    tex_top = 0;
+  }
+
+  AddDrawnRectangle(clamped_rect);
+
+  if (ShouldDrawWithSoftwareRenderer())
+  {
+    const GPU_SW_Rasterizer::DrawRectangleFunction DrawFunction = GPU_SW_Rasterizer::GetDrawRectangleFunction(
+      cmd->rc.texture_enable, cmd->rc.raw_texture_enable, cmd->rc.transparency_enable);
+    DrawFunction(cmd);
+  }
+}
+
+void GPU_HW::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
+{
+  PrepareDraw(cmd);
+  SetBatchDepthBuffer(cmd, false);
+
+  // TODO: Writing straight into the mapped GPU pointer is possible, but the vertices are read back below.
+  const float z = GetCurrentNormalizedVertexDepth();
+  const bool use_raw_texture = (cmd->rc.texture_enable && cmd->rc.raw_texture_enable);
+  const u32 vertex_count = cmd->num_vertices;
+  const u32 page_bits = m_draw_mode.bits;
+  std::array<BatchVertex, 4> batch_vertices;
+  for (u32 idx = 0; idx < vertex_count; idx++)
+  {
+    const GPUBackendDrawPolygonCommand::Vertex& src = cmd->vertices[idx];
+    const GSVector2 pos = GSVector2(GSVector2i::load(&src.x));
+    const u32 colour = use_raw_texture ? UINT32_C(0x00808080) : src.color;
+    batch_vertices[idx].Set(pos.x, pos.y, z, 1.0f, colour, page_bits, src.texcoord, 0xFFFF0000u);
+  }
+
+  FinishPolygonDraw(cmd, batch_vertices, vertex_count, false);
+
+  if (!ShouldDrawWithSoftwareRenderer())
+    return;
+  // Mirror the draw on the software rasterizer; a quad is submitted as two triangles.
+  const GPU_SW_Rasterizer::DrawTriangleFunction draw_triangle = GPU_SW_Rasterizer::GetDrawTriangleFunction(
+    cmd->rc.shading_enable, cmd->rc.texture_enable, cmd->rc.raw_texture_enable, cmd->rc.transparency_enable);
+  draw_triangle(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
+  if (vertex_count > 3)
+    draw_triangle(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
+}
+
+void GPU_HW::DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd)
+{
+  PrepareDraw(cmd);
+
+  // TODO: This could write directly to the mapped GPU pointer. But watch out for the reads below.
+  const float depth = GetCurrentNormalizedVertexDepth();
+  const bool raw_texture = (cmd->rc.texture_enable && cmd->rc.raw_texture_enable);
+  const u32 num_vertices = cmd->num_vertices;
+  const u32 texpage = m_draw_mode.bits;
+  std::array<BatchVertex, 4> vertices;
+  for (u32 i = 0; i < num_vertices; i++)
+  {
+    const GPUBackendDrawPrecisePolygonCommand::Vertex& vert = cmd->vertices[i];
+    vertices[i].Set(vert.x, vert.y, depth, vert.w, raw_texture ? UINT32_C(0x00808080) : vert.color, texpage,
+                    vert.texcoord, 0xFFFF0000u);
+  }
+
+  const bool use_depth = m_pgxp_depth_buffer && cmd->valid_w;
+  SetBatchDepthBuffer(cmd, use_depth);
+  if (use_depth)
+    CheckForDepthClear(cmd, vertices.data(), num_vertices);
+
+  // Use PGXP to exclude primitives that are definitely 3D.
+  const bool is_3d = (vertices[0].w != vertices[1].w || vertices[0].w != vertices[2].w);
+  FinishPolygonDraw(cmd, vertices, num_vertices, is_3d);
+
+  if (ShouldDrawWithSoftwareRenderer())
+  {
+    const GPU_SW_Rasterizer::DrawTriangleFunction DrawFunction = GPU_SW_Rasterizer::GetDrawTriangleFunction(
+      cmd->rc.shading_enable, cmd->rc.texture_enable, cmd->rc.raw_texture_enable, cmd->rc.transparency_enable);
+    GPUBackendDrawPolygonCommand::Vertex sw_vertices[4];
+    for (u32 i = 0; i < cmd->num_vertices; i++)
     {
-      const u32 color = (rc.texture_enable && rc.raw_texture_enable) ? UINT32_C(0x00808080) : rc.color_for_first_vertex;
-      const GPUVertexPosition vp{FifoPop()};
-      const s32 pos_x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
-      const s32 pos_y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
-
-      const auto [texcoord_x, texcoord_y] = UnpackTexcoord(rc.texture_enable ? Truncate16(FifoPop()) : 0);
-      u32 orig_tex_left = ZeroExtend16(texcoord_x);
-      u32 orig_tex_top = ZeroExtend16(texcoord_y);
-      u32 rectangle_width;
-      u32 rectangle_height;
-      switch (rc.rectangle_size)
-      {
-        case GPUDrawRectangleSize::R1x1:
-          rectangle_width = 1;
-          rectangle_height = 1;
-          break;
-        case GPUDrawRectangleSize::R8x8:
-          rectangle_width = 8;
-          rectangle_height = 8;
-          break;
-        case GPUDrawRectangleSize::R16x16:
-          rectangle_width = 16;
-          rectangle_height = 16;
-          break;
-        default:
-        {
-          const u32 width_and_height = FifoPop();
-          rectangle_width = (width_and_height & VRAM_WIDTH_MASK);
-          rectangle_height = ((width_and_height >> 16) & VRAM_HEIGHT_MASK);
-        }
-        break;
-      }
-
-      const GSVector4i rect =
-        GSVector4i(pos_x, pos_y, pos_x + static_cast<s32>(rectangle_width), pos_y + static_cast<s32>(rectangle_height));
-      const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
-      if (clamped_rect.rempty()) [[unlikely]]
-      {
-        GL_INS_FMT("Culling off-screen rectangle {}", rect);
-        return;
-      }
-
-      // we can split the rectangle up into potentially 8 quads
-      SetBatchDepthBuffer(false);
-      SetBatchSpriteMode(m_allow_sprite_mode);
-      DebugAssert(m_batch_vertex_space >= MAX_VERTICES_FOR_RECTANGLE &&
-                  m_batch_index_space >= MAX_VERTICES_FOR_RECTANGLE);
-
-      // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat.
-      u32 tex_top = orig_tex_top;
-      for (u32 y_offset = 0; y_offset < rectangle_height;)
-      {
-        const s32 quad_height = std::min(rectangle_height - y_offset, TEXTURE_PAGE_WIDTH - tex_top);
-        const float quad_start_y = static_cast<float>(pos_y + static_cast<s32>(y_offset));
-        const float quad_end_y = quad_start_y + static_cast<float>(quad_height);
-        const u32 tex_bottom = tex_top + quad_height;
-
-        u32 tex_left = orig_tex_left;
-        for (u32 x_offset = 0; x_offset < rectangle_width;)
-        {
-          const s32 quad_width = std::min(rectangle_width - x_offset, TEXTURE_PAGE_HEIGHT - tex_left);
-          const float quad_start_x = static_cast<float>(pos_x + static_cast<s32>(x_offset));
-          const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
-          const u32 tex_right = tex_left + quad_width;
-          const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
-
-          if (rc.texture_enable && ShouldCheckForTexPageOverlap())
-          {
-            CheckForTexPageOverlap(GSVector4i(static_cast<s32>(tex_left), static_cast<s32>(tex_top),
-                                              static_cast<s32>(tex_right), static_cast<s32>(tex_bottom)));
-          }
-
-          const u32 base_vertex = m_batch_vertex_count;
-          (m_batch_vertex_ptr++)
-            ->Set(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_top),
-                  uv_limits);
-          (m_batch_vertex_ptr++)
-            ->Set(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_top),
-                  uv_limits);
-          (m_batch_vertex_ptr++)
-            ->Set(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_bottom),
-                  uv_limits);
-          (m_batch_vertex_ptr++)
-            ->Set(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_bottom),
-                  uv_limits);
-          m_batch_vertex_count += 4;
-          m_batch_vertex_space -= 4;
-
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 0);
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
-          *(m_batch_index_ptr++) = Truncate16(base_vertex + 3);
-          m_batch_index_count += 6;
-          m_batch_index_space -= 6;
-
-          x_offset += quad_width;
-          tex_left = 0;
-        }
-
-        y_offset += quad_height;
-        tex_top = 0;
-      }
-
-      AddDrawnRectangle(clamped_rect);
-      AddDrawRectangleTicks(clamped_rect, rc.texture_enable, rc.transparency_enable);
-
-      if (m_sw_renderer)
-      {
-        GPUBackendDrawRectangleCommand* cmd = m_sw_renderer->NewDrawRectangleCommand();
-        FillDrawCommand(cmd, rc);
-        cmd->color = color;
-        cmd->x = pos_x;
-        cmd->y = pos_y;
-        cmd->width = static_cast<u16>(rectangle_width);
-        cmd->height = static_cast<u16>(rectangle_height);
-        cmd->texcoord = (static_cast<u16>(texcoord_y) << 8) | static_cast<u16>(texcoord_x);
-        m_sw_renderer->PushCommand(cmd);
-      }
+      const GPUBackendDrawPrecisePolygonCommand::Vertex& src = cmd->vertices[i];
+      sw_vertices[i] = GPUBackendDrawPolygonCommand::Vertex{
+        .x = src.native_x, .y = src.native_y, .color = src.color, .texcoord = src.texcoord};
     }
-    break;
 
-    case GPUPrimitive::Line:
-    {
-      SetBatchDepthBuffer(false);
+    DrawFunction(cmd, &sw_vertices[0], &sw_vertices[1], &sw_vertices[2]);
+    if (cmd->num_vertices > 3)
+      DrawFunction(cmd, &sw_vertices[2], &sw_vertices[1], &sw_vertices[3]);
+  }
+}
 
-      if (!rc.polyline)
-      {
-        DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
+ALWAYS_INLINE_RELEASE void GPU_HW::FinishPolygonDraw(const GPUBackendDrawCommand* cmd,
+                                                     std::array<BatchVertex, 4>& vertices, u32 num_vertices, bool is_3d)
+{
+  // Use PGXP to exclude primitives that are definitely 3D.
+  if (m_resolution_scale > 1 && !is_3d && cmd->rc.quad_polygon)
+    HandleFlippedQuadTextureCoordinates(cmd, vertices.data());
+  else if (m_allow_sprite_mode)
+    SetBatchSpriteMode(cmd, !is_3d || IsPossibleSpritePolygon(vertices.data()));
 
-        u32 start_color, end_color;
-        GPUVertexPosition start_pos, end_pos;
-        if (rc.shading_enable)
-        {
-          start_color = rc.color_for_first_vertex;
-          start_pos.bits = FifoPop();
-          end_color = FifoPop() & UINT32_C(0x00FFFFFF);
-          end_pos.bits = FifoPop();
-        }
-        else
-        {
-          start_color = end_color = rc.color_for_first_vertex;
-          start_pos.bits = FifoPop();
-          end_pos.bits = FifoPop();
-        }
+  const GSVector2 v0f = GSVector2::load(&vertices[0].x);
+  const GSVector2 v1f = GSVector2::load(&vertices[1].x);
+  const GSVector2 v2f = GSVector2::load(&vertices[2].x);
+  const GSVector2 min_pos_12 = v1f.min(v2f);
+  const GSVector2 max_pos_12 = v1f.max(v2f);
+  const GSVector4i draw_rect_012 =
+    GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f)))).add32(GSVector4i::cxpr(0, 0, 1, 1));
+  const GSVector4i clamped_draw_rect_012 = draw_rect_012.rintersect(m_clamped_drawing_area);
+  DebugAssert(draw_rect_012.width() <= MAX_PRIMITIVE_WIDTH && draw_rect_012.height() <= MAX_PRIMITIVE_HEIGHT &&
+              !clamped_draw_rect_012.rempty());
 
-        const GSVector2i vstart_pos = GSVector2i(start_pos.x + m_drawing_offset.x, start_pos.y + m_drawing_offset.y);
-        const GSVector2i vend_pos = GSVector2i(end_pos.x + m_drawing_offset.x, end_pos.y + m_drawing_offset.y);
-        const GSVector4i bounds = GSVector4i::xyxy(vstart_pos, vend_pos);
-        const GSVector4i rect = GSVector4i::xyxy(vstart_pos.min_s32(vend_pos), vstart_pos.max_s32(vend_pos))
-                                  .add32(GSVector4i::cxpr(0, 0, 1, 1));
-        const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
+  if (cmd->rc.texture_enable && m_compute_uv_range)
+    ComputePolygonUVLimits(cmd, vertices.data(), num_vertices);
 
-        if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
-        {
-          GL_INS_FMT("Culling too-large/off-screen line: {},{} - {},{}", bounds.x, bounds.y, bounds.z, bounds.w);
-          return;
-        }
+  AddDrawnRectangle(clamped_draw_rect_012);
 
-        AddDrawnRectangle(clamped_rect);
-        AddDrawLineTicks(clamped_rect, rc.shading_enable);
+  // Expand lines to triangles (Doom, Soul Blade, etc.)
+  if (!cmd->rc.quad_polygon && m_line_detect_mode >= GPULineDetectMode::BasicTriangles && !is_3d &&
+      ExpandLineTriangles(vertices.data()))
+  {
+    return;
+  }
 
-        // TODO: Should we do a PGXP lookup here? Most lines are 2D.
-        DrawLine(GSVector4(bounds), start_color, end_color, depth);
+  const u32 start_index = m_batch_vertex_count;
+  DebugAssert(m_batch_index_space >= 3);
+  *(m_batch_index_ptr++) = Truncate16(start_index);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+  m_batch_index_count += 3;
+  m_batch_index_space -= 3;
 
-        if (m_sw_renderer)
-        {
-          GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
-          FillDrawCommand(cmd, rc);
-          GSVector4i::storel(&cmd->vertices[0], bounds);
-          cmd->vertices[0].color = start_color;
-          GSVector4i::storeh(&cmd->vertices[1], bounds);
-          cmd->vertices[1].color = end_color;
-          m_sw_renderer->PushCommand(cmd);
-        }
-      }
-      else
-      {
-        // Multiply by two because we don't use line strips.
-        const u32 num_vertices = GetPolyLineVertexCount();
-        DebugAssert(m_batch_vertex_space >= (num_vertices * 4) && m_batch_index_space >= (num_vertices * 6));
+  // Quads: use the num_vertices argument here, because the first half might be culled.
+  if (num_vertices == 4)
+  {
+    const GSVector2 v3f = GSVector2::load(&vertices[3].x);
+    const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
+                                       .add32(GSVector4i::cxpr(0, 0, 1, 1));
+    const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
+    DebugAssert(draw_rect_123.width() <= MAX_PRIMITIVE_WIDTH && draw_rect_123.height() <= MAX_PRIMITIVE_HEIGHT &&
+                !clamped_draw_rect_123.rempty());
+    AddDrawnRectangle(clamped_draw_rect_123);
 
-        const bool shaded = rc.shading_enable;
+    DebugAssert(m_batch_index_space >= 3);
+    *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+    *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+    *(m_batch_index_ptr++) = Truncate16(start_index + 3);
+    m_batch_index_count += 3;
+    m_batch_index_space -= 3;
 
-        u32 buffer_pos = 0;
-        const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
-        GSVector2i start_pos = GSVector2i(start_vp.x + m_drawing_offset.x, start_vp.y + m_drawing_offset.y);
-        u32 start_color = rc.color_for_first_vertex;
-
-        GPUBackendDrawLineCommand* cmd;
-        if (m_sw_renderer)
-        {
-          cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
-          FillDrawCommand(cmd, rc);
-          GSVector2i::store(&cmd->vertices[0].x, start_pos);
-          cmd->vertices[0].color = start_color;
-        }
-        else
-        {
-          cmd = nullptr;
-        }
-
-        for (u32 i = 1; i < num_vertices; i++)
-        {
-          const u32 end_color = shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : start_color;
-          const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
-          const GSVector2i end_pos = GSVector2i(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y);
-          const GSVector4i bounds = GSVector4i::xyxy(start_pos, end_pos);
-          const GSVector4i rect = GSVector4i::xyxy(start_pos.min_s32(end_pos), start_pos.max_s32(end_pos))
-                                    .add32(GSVector4i::cxpr(0, 0, 1, 1));
-          const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
-          if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
-          {
-            GL_INS_FMT("Culling too-large line: {},{} - {},{}", start_pos.x, start_pos.y, end_pos.x, end_pos.y);
-          }
-          else
-          {
-            AddDrawnRectangle(clamped_rect);
-            AddDrawLineTicks(clamped_rect, rc.shading_enable);
-
-            // TODO: Should we do a PGXP lookup here? Most lines are 2D.
-            DrawLine(GSVector4(bounds), start_color, end_color, depth);
-          }
-
-          start_pos = end_pos;
-          start_color = end_color;
-
-          if (cmd)
-          {
-            GSVector2i::store(&cmd->vertices[i], end_pos);
-            cmd->vertices[i].color = end_color;
-          }
-        }
-
-        if (cmd)
-          m_sw_renderer->PushCommand(cmd);
-      }
-    }
-    break;
-
-    default:
-      UnreachableCode();
-      break;
+    DebugAssert(m_batch_vertex_space >= 4);
+    std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 4);
+    m_batch_vertex_ptr += 4;
+    m_batch_vertex_count += 4;
+    m_batch_vertex_space -= 4;
+  }
+  else
+  {
+    DebugAssert(m_batch_vertex_space >= 3);
+    std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
+    m_batch_vertex_ptr += 3;
+    m_batch_vertex_count += 3;
+    m_batch_vertex_space -= 3;
   }
 }
 
@@ -2963,7 +2840,7 @@ bool GPU_HW::BlitVRAMReplacementTexture(const GPUTextureCache::TextureReplacemen
   return true;
 }
 
-ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
+ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(const GPUBackendDrawCommand* cmd, GSVector4i uv_rect)
 {
   DebugAssert((m_texpage_dirty != 0 || m_texture_dumping) && m_batch.texture_mode != BatchTextureMode::Disabled);
 
@@ -3004,7 +2881,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
         if (m_batch_index_count > 0)
         {
           FlushRender();
-          EnsureVertexBufferSpaceForCurrentCommand();
+          EnsureVertexBufferSpaceForCommand(cmd);
         }
 
         // We need to swap the dirty tracking over to drawn/written.
@@ -3046,7 +2923,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
       if (m_batch_index_count > 0)
       {
         FlushRender();
-        EnsureVertexBufferSpaceForCurrentCommand();
+        EnsureVertexBufferSpaceForCommand(cmd);
       }
 
       UpdateVRAMReadTexture(update_drawn, update_written);
@@ -3099,26 +2976,27 @@ void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices
   MapGPUBuffer(required_vertices, required_indices);
 }
 
-void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand()
+void GPU_HW::EnsureVertexBufferSpaceForCommand(const GPUBackendDrawCommand* cmd)
 {
   u32 required_vertices;
   u32 required_indices;
-  switch (m_render_command.primitive)
+  switch (cmd->type)
   {
-    case GPUPrimitive::Polygon:
+    case GPUBackendCommandType::DrawPolygon:
+    case GPUBackendCommandType::DrawPrecisePolygon:
       required_vertices = 4; // assume quad, in case of expansion
       required_indices = 6;
       break;
-    case GPUPrimitive::Rectangle:
+    case GPUBackendCommandType::DrawRectangle:
       required_vertices = MAX_VERTICES_FOR_RECTANGLE; // TODO: WRong
       required_indices = MAX_VERTICES_FOR_RECTANGLE;
       break;
-    case GPUPrimitive::Line:
+    case GPUBackendCommandType::DrawLine:
     {
       // assume expansion
-      const u32 vert_count = m_render_command.polyline ? GetPolyLineVertexCount() : 2;
-      required_vertices = vert_count * 4;
-      required_indices = vert_count * 6;
+      const GPUBackendDrawLineCommand* lcmd = static_cast<const GPUBackendDrawLineCommand*>(cmd);
+      required_vertices = lcmd->num_vertices * 4;
+      required_indices = lcmd->num_vertices * 6;
     }
     break;
 
@@ -3153,95 +3031,31 @@ ALWAYS_INLINE float GPU_HW::GetCurrentNormalizedVertexDepth() const
   return 1.0f - (static_cast<float>(m_current_depth) / 65535.0f);
 }
 
-void GPU_HW::UpdateSoftwareRenderer(bool copy_vram_from_hw)
+void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
 {
-  const bool current_enabled = (m_sw_renderer != nullptr);
-  const bool new_enabled = g_settings.gpu_use_software_renderer_for_readbacks;
-  const bool use_thread = !g_settings.gpu_texture_cache;
-  if (current_enabled == new_enabled)
-  {
-    if (m_sw_renderer)
-      m_sw_renderer->SetThreadEnabled(use_thread);
-    return;
-  }
+  FlushRender();
 
-  if (!new_enabled)
-  {
-    if (m_sw_renderer)
-      m_sw_renderer->Shutdown();
-    m_sw_renderer.reset();
-    return;
-  }
-
-  std::unique_ptr<GPU_SW_Backend> sw_renderer = std::make_unique<GPU_SW_Backend>();
-  if (!sw_renderer->Initialize(use_thread))
-    return;
-
-  // We need to fill in the SW renderer's VRAM with the current state for hot toggles.
-  if (copy_vram_from_hw)
-  {
-    FlushRender();
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-
-    // Sync the drawing area and CLUT.
-    GPUBackendSetDrawingAreaCommand* clip_cmd = sw_renderer->NewSetDrawingAreaCommand();
-    clip_cmd->new_area = m_drawing_area;
-    sw_renderer->PushCommand(clip_cmd);
-
-    if (IsCLUTValid())
-    {
-      GPUBackendUpdateCLUTCommand* clut_cmd = sw_renderer->NewUpdateCLUTCommand();
-      FillBackendCommandParameters(clut_cmd);
-      clut_cmd->reg.bits = static_cast<u16>(m_current_clut_reg_bits);
-      clut_cmd->clut_is_8bit = m_current_clut_is_8bit;
-      sw_renderer->PushCommand(clut_cmd);
-    }
-  }
-
-  m_sw_renderer = std::move(sw_renderer);
-}
-
-void GPU_HW::FillBackendCommandParameters(GPUBackendCommand* cmd) const
-{
-  cmd->params.bits = 0;
-  cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
-  cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
-  cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
-  cmd->params.interlaced_rendering = m_GPUSTAT.SkipDrawingToActiveField();
-}
-
-void GPU_HW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
-{
-  FillBackendCommandParameters(cmd);
-  cmd->rc.bits = rc.bits;
-  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
-  cmd->palette.bits = m_draw_mode.palette_reg.bits;
-  cmd->window = m_draw_mode.texture_window;
-}
-
-void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
-{
   GL_SCOPE_FMT("FillVRAM({},{} => {},{} ({}x{}) with 0x{:08X}", x, y, x + width, y + height, width, height, color);
   DeactivateROV();
 
-  if (m_sw_renderer)
-  {
-    GPUBackendFillVRAMCommand* cmd = m_sw_renderer->NewFillVRAMCommand();
-    FillBackendCommandParameters(cmd);
-    cmd->x = static_cast<u16>(x);
-    cmd->y = static_cast<u16>(y);
-    cmd->width = static_cast<u16>(width);
-    cmd->height = static_cast<u16>(height);
-    cmd->color = color;
-    m_sw_renderer->PushCommand(cmd);
-  }
+  //   if (m_sw_renderer)
+  //   {
+  //     GPUBackendFillVRAMCommand* cmd = m_sw_renderer->NewFillVRAMCommand();
+  //     FillBackendCommandParameters(cmd);
+  //     cmd->x = static_cast<u16>(x);
+  //     cmd->y = static_cast<u16>(y);
+  //     cmd->width = static_cast<u16>(width);
+  //     cmd->height = static_cast<u16>(height);
+  //     cmd->color = color;
+  //     m_sw_renderer->PushCommand(cmd);
+  //   }
 
   GL_INS_FMT("Dirty draw area before: {}", m_vram_dirty_draw_rect);
 
   const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
 
   // If TC is enabled, we have to update local memory.
-  if (m_use_texture_cache && !IsInterlacedRenderingEnabled())
+  if (m_use_texture_cache && !params.interlaced_rendering)
   {
     AddWrittenRectangle(bounds);
     GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, false, 0);
@@ -3255,7 +3069,7 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
 
   const bool is_oversized = (((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT));
   g_gpu_device->SetPipeline(
-    m_vram_fill_pipelines[BoolToUInt8(is_oversized)][BoolToUInt8(IsInterlacedRenderingEnabled())].get());
+    m_vram_fill_pipelines[BoolToUInt8(is_oversized)][BoolToUInt8(params.interlaced_rendering)].get());
 
   const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale));
   g_gpu_device->SetViewportAndScissor(scaled_bounds);
@@ -3277,7 +3091,7 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
   // drop precision unless true colour is enabled
   uniforms.u_fill_color =
     GPUDevice::RGBA8ToFloat(m_true_color ? color : VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color)));
-  uniforms.u_interlaced_displayed_field = GetActiveLineLSB();
+  uniforms.u_interlaced_displayed_field = params.active_line_lsb;
   g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
   g_gpu_device->Draw(3, 0);
 
@@ -3286,11 +3100,13 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
 
 void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
 {
+  FlushRender();
+
   GL_PUSH_FMT("ReadVRAM({},{} => {},{} ({}x{})", x, y, x + width, y + height, width, height);
 
-  if (m_sw_renderer)
+  if (ShouldDrawWithSoftwareRenderer())
   {
-    m_sw_renderer->Sync(false);
+    GL_INS("VRAM is already up to date due to SW draws.");
     GL_POP();
     return;
   }
@@ -3344,8 +3160,10 @@ void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
   RestoreDeviceContext();
 }
 
-void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
+void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params)
 {
+  FlushRender();
+
   GL_SCOPE_FMT("UpdateVRAM({},{} => {},{} ({}x{})", x, y, x + width, y + height, width, height);
 
   // TODO: Handle wrapped transfers... break them up or something
@@ -3353,26 +3171,10 @@ void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, b
   DebugAssert(bounds.right <= static_cast<s32>(VRAM_WIDTH) && bounds.bottom <= static_cast<s32>(VRAM_HEIGHT));
   AddWrittenRectangle(bounds);
 
-  if (m_sw_renderer)
-  {
-    const u32 num_words = width * height;
-    GPUBackendUpdateVRAMCommand* cmd = m_sw_renderer->NewUpdateVRAMCommand(num_words);
-    FillBackendCommandParameters(cmd);
-    cmd->params.set_mask_while_drawing = set_mask;
-    cmd->params.check_mask_before_draw = check_mask;
-    cmd->x = static_cast<u16>(x);
-    cmd->y = static_cast<u16>(y);
-    cmd->width = static_cast<u16>(width);
-    cmd->height = static_cast<u16>(height);
-    std::memcpy(cmd->data, data, sizeof(u16) * num_words);
-    m_sw_renderer->PushCommand(cmd);
-  }
-  else
-  {
-    GPUTextureCache::WriteVRAM(x, y, width, height, data, set_mask, check_mask, bounds);
-  }
+  GPUTextureCache::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw,
+                             bounds);
 
-  if (check_mask)
+  if (params.check_mask_before_draw)
   {
     // set new vertex counter since we want this to take into consideration previous masked pixels
     m_current_depth++;
@@ -3387,7 +3189,8 @@ void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, b
     }
   }
 
-  UpdateVRAMOnGPU(x, y, width, height, data, sizeof(u16) * width, set_mask, check_mask, bounds);
+  UpdateVRAMOnGPU(x, y, width, height, data, sizeof(u16) * width, params.set_mask_while_drawing,
+                  params.check_mask_before_draw, bounds);
 }
 
 void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* data, u32 data_pitch, bool set_mask,
@@ -3463,8 +3266,11 @@ void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* da
   RestoreDeviceContext();
 }
 
-void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
+void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                      GPUBackendCommandParameters params)
 {
+  FlushRender();
+
   GL_SCOPE_FMT("CopyVRAM({}x{} @ {},{} => {},{}", width, height, src_x, src_y, dst_x, dst_y);
 
   // masking enabled, oversized, or overlapping
@@ -3473,7 +3279,7 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
   const bool intersect_with_draw = m_vram_dirty_draw_rect.rintersects(src_bounds);
   const bool intersect_with_write = m_vram_dirty_write_rect.rintersects(src_bounds);
   const bool use_shader =
-    (m_GPUSTAT.IsMaskingEnabled() || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
+    (params.set_mask_while_drawing || params.check_mask_before_draw || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
      ((src_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT || ((dst_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
      ((dst_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT) ||
     (!intersect_with_draw && !intersect_with_write);
@@ -3482,24 +3288,17 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
   if (m_use_texture_cache && !GPUTextureCache::IsRectDrawn(src_bounds))
   {
     GL_INS("Performed in local memory.");
-    GPUTextureCache::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, m_GPUSTAT.set_mask_while_drawing,
-                              m_GPUSTAT.check_mask_before_draw, src_bounds, dst_bounds);
+    GPUTextureCache::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
+                              params.check_mask_before_draw, src_bounds, dst_bounds);
     UpdateVRAMOnGPU(dst_bounds.left, dst_bounds.top, dst_bounds.width(), dst_bounds.height(),
                     &g_vram[dst_bounds.top * VRAM_WIDTH + dst_bounds.left], VRAM_WIDTH * sizeof(u16), false, false,
                     dst_bounds);
     return;
   }
-  else if (m_sw_renderer)
+  else if (ShouldDrawWithSoftwareRenderer())
   {
-    GPUBackendCopyVRAMCommand* cmd = m_sw_renderer->NewCopyVRAMCommand();
-    FillBackendCommandParameters(cmd);
-    cmd->src_x = static_cast<u16>(src_x);
-    cmd->src_y = static_cast<u16>(src_y);
-    cmd->dst_x = static_cast<u16>(dst_x);
-    cmd->dst_y = static_cast<u16>(dst_y);
-    cmd->width = static_cast<u16>(width);
-    cmd->height = static_cast<u16>(height);
-    m_sw_renderer->PushCommand(cmd);
+    GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
+                                params.check_mask_before_draw);
   }
 
   if (use_shader || IsUsingMultisampling())
@@ -3533,20 +3332,20 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
                                       static_cast<float>(m_vram_texture->GetWidth()),
                                       static_cast<float>(m_vram_texture->GetHeight()),
                                       static_cast<float>(m_resolution_scale),
-                                      m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
+                                      params.set_mask_while_drawing ? 1u : 0u,
                                       GetCurrentNormalizedVertexDepth()};
 
     // VRAM read texture should already be bound.
     const GSVector4i dst_bounds_scaled = dst_bounds.mul32l(GSVector4i(m_resolution_scale));
     g_gpu_device->SetViewportAndScissor(dst_bounds_scaled);
     g_gpu_device->SetPipeline(
-      m_vram_copy_pipelines[BoolToUInt8(m_GPUSTAT.check_mask_before_draw && m_write_mask_as_depth)].get());
+      m_vram_copy_pipelines[BoolToUInt8(params.check_mask_before_draw && m_write_mask_as_depth)].get());
     g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
     g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
     g_gpu_device->Draw(3, 0);
     RestoreDeviceContext();
 
-    if (m_GPUSTAT.check_mask_before_draw && !m_pgxp_depth_buffer)
+    if (params.check_mask_before_draw && !m_pgxp_depth_buffer)
       m_current_depth++;
 
     return;
@@ -3581,7 +3380,7 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
       AddUnclampedDrawnRectangle(dst_bounds);
   }
 
-  if (m_GPUSTAT.check_mask_before_draw)
+  if (params.check_mask_before_draw)
   {
     // set new vertex counter since we want this to take into consideration previous masked pixels
     m_current_depth++;
@@ -3594,19 +3393,29 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
     m_vram_read_texture->MakeReadyForSampling();
 }
 
-void GPU_HW::DispatchRenderCommand()
+void GPU_HW::ClearCache()
 {
-  const GPURenderCommand rc{m_render_command.bits};
+  FlushRender();
 
+  // Invalidate the cached draw mode so the texture page comparison in PrepareDraw() is forced to see a change.
+  m_draw_mode.bits = INVALID_DRAW_MODE_BITS;
+}
+
+void GPU_HW::PrepareDraw(const GPUBackendDrawCommand* cmd)
+{
   // TODO: avoid all this for vertex loading, only do when the type of draw changes
-  BatchTextureMode texture_mode = rc.IsTexturingEnabled() ? m_batch.texture_mode : BatchTextureMode::Disabled;
+  BatchTextureMode texture_mode = cmd->rc.IsTexturingEnabled() ? m_batch.texture_mode : BatchTextureMode::Disabled;
   GPUTextureCache::SourceKey texture_cache_key = m_batch.texture_cache_key;
-  if (rc.IsTexturingEnabled())
+  if (cmd->rc.IsTexturingEnabled())
   {
     // texture page changed - check that the new page doesn't intersect the drawing area
-    if (m_draw_mode.IsTexturePageChanged() || texture_mode == BatchTextureMode::Disabled)
+    if (((m_draw_mode.bits ^ cmd->draw_mode.bits) & GPUDrawModeReg::TEXTURE_MODE_AND_PAGE_MASK) != 0 ||
+        (cmd->draw_mode.IsUsingPalette() && m_draw_mode.palette_reg.bits != cmd->palette.bits) ||
+        texture_mode == BatchTextureMode::Disabled)
+
     {
-      m_draw_mode.ClearTexturePageChangedFlag();
+      m_draw_mode.mode_reg.bits = cmd->draw_mode.bits;
+      m_draw_mode.palette_reg.bits = cmd->palette.bits;
 
       // start by assuming we can use the TC
       bool use_texture_cache = m_use_texture_cache;
@@ -3682,39 +3491,41 @@ void GPU_HW::DispatchRenderCommand()
     }
   }
 
-  DebugAssert((rc.IsTexturingEnabled() && (texture_mode == BatchTextureMode::PageTexture &&
-                                           texture_cache_key.mode == m_draw_mode.mode_reg.texture_mode) ||
+  DebugAssert((cmd->rc.IsTexturingEnabled() && (texture_mode == BatchTextureMode::PageTexture &&
+                                                texture_cache_key.mode == m_draw_mode.mode_reg.texture_mode) ||
                texture_mode == static_cast<BatchTextureMode>(
                                  (m_draw_mode.mode_reg.texture_mode == GPUTextureMode::Reserved_Direct16Bit) ?
                                    GPUTextureMode::Direct16Bit :
                                    m_draw_mode.mode_reg.texture_mode)) ||
-              (!rc.IsTexturingEnabled() && texture_mode == BatchTextureMode::Disabled));
+              (!cmd->rc.IsTexturingEnabled() && texture_mode == BatchTextureMode::Disabled));
   DebugAssert(!(m_texpage_dirty & TEXPAGE_DIRTY_PAGE_RECT) || texture_mode == BatchTextureMode::PageTexture ||
-              !rc.IsTexturingEnabled());
+              !cmd->rc.IsTexturingEnabled());
 
   // has any state changed which requires a new batch?
   // Reverse blending breaks with mixed transparent and opaque pixels, so we have to do one draw per polygon.
   // If we have fbfetch, we don't need to draw it in two passes. Test case: Suikoden 2 shadows.
+  // TODO: Simplify this check, probably by arranging the relevant bits in a directly comparable pattern.
   const GPUTransparencyMode transparency_mode =
-    rc.transparency_enable ? m_draw_mode.mode_reg.transparency_mode : GPUTransparencyMode::Disabled;
-  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
+    cmd->rc.transparency_enable ? cmd->draw_mode.transparency_mode : GPUTransparencyMode::Disabled;
+  const bool dithering_enable = (!m_true_color && cmd->draw_mode.dither_enable);
   if (!IsFlushed())
   {
     if (texture_mode != m_batch.texture_mode || transparency_mode != m_batch.transparency_mode ||
         (transparency_mode == GPUTransparencyMode::BackgroundMinusForeground && !m_allow_shader_blend) ||
-        dithering_enable != m_batch.dithering || m_batch_ubo_data.u_texture_window_bits != m_draw_mode.texture_window ||
+        dithering_enable != m_batch.dithering || m_batch_ubo_data.u_texture_window_bits != cmd->window ||
+        m_batch_ubo_data.u_set_mask_while_drawing != BoolToUInt32(cmd->params.set_mask_while_drawing) ||
         (texture_mode == BatchTextureMode::PageTexture && m_batch.texture_cache_key != texture_cache_key))
     {
       FlushRender();
     }
   }
 
-  EnsureVertexBufferSpaceForCurrentCommand();
+  EnsureVertexBufferSpaceForCommand(cmd);
 
   if (m_batch_index_count == 0)
   {
     // transparency mode change
-    const bool check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
+    const bool check_mask_before_draw = cmd->params.check_mask_before_draw;
     if (transparency_mode != GPUTransparencyMode::Disabled && !m_rov_active && !m_prefer_shader_blend &&
         !NeedsShaderBlending(transparency_mode, texture_mode, check_mask_before_draw))
     {
@@ -3728,7 +3539,7 @@ void GPU_HW::DispatchRenderCommand()
       m_batch_ubo_data.u_dst_alpha_factor = dst_alpha_factor;
     }
 
-    const bool set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
+    const bool set_mask_while_drawing = cmd->params.set_mask_while_drawing;
     if (m_batch.check_mask_before_draw != check_mask_before_draw ||
         m_batch.set_mask_while_drawing != set_mask_while_drawing)
     {
@@ -3738,10 +3549,10 @@ void GPU_HW::DispatchRenderCommand()
       m_batch_ubo_data.u_set_mask_while_drawing = BoolToUInt32(set_mask_while_drawing);
     }
 
-    m_batch.interlacing = IsInterlacedRenderingEnabled();
+    m_batch.interlacing = cmd->params.interlaced_rendering;
     if (m_batch.interlacing)
     {
-      const u32 displayed_field = GetActiveLineLSB();
+      const u32 displayed_field = cmd->params.active_line_lsb;
       m_batch_ubo_dirty |= (m_batch_ubo_data.u_interlaced_displayed_field != displayed_field);
       m_batch_ubo_data.u_interlaced_displayed_field = displayed_field;
     }
@@ -3752,51 +3563,36 @@ void GPU_HW::DispatchRenderCommand()
     m_batch.dithering = dithering_enable;
     m_batch.texture_cache_key = texture_cache_key;
 
-    if (m_batch_ubo_data.u_texture_window_bits != m_draw_mode.texture_window)
+    if (m_batch_ubo_data.u_texture_window_bits != cmd->window)
     {
-      m_batch_ubo_data.u_texture_window_bits = m_draw_mode.texture_window;
-      m_texture_window_active = (m_draw_mode.texture_window != GPUTextureWindow{0xFF, 0xFF, 0x00, 0x00});
-      GSVector4i::store<true>(&m_batch_ubo_data.u_texture_window[0],
-                              GSVector4i::load32(&m_draw_mode.texture_window).u8to32());
+      m_batch_ubo_data.u_texture_window_bits = cmd->window;
+      m_texture_window_active = (cmd->window != GPUTextureWindow{{0xFF, 0xFF, 0x00, 0x00}});
+      GSVector4i::store<true>(&m_batch_ubo_data.u_texture_window[0], GSVector4i::load32(&cmd->window).u8to32());
       m_batch_ubo_dirty = true;
     }
 
     if (m_drawing_area_changed)
     {
       m_drawing_area_changed = false;
-      SetClampedDrawingArea();
       SetScissor();
 
       if (m_pgxp_depth_buffer && m_last_depth_z < 1.0f)
       {
         FlushRender();
         CopyAndClearDepthBuffer();
-        EnsureVertexBufferSpaceForCurrentCommand();
-      }
-
-      if (m_sw_renderer)
-      {
-        GPUBackendSetDrawingAreaCommand* cmd = m_sw_renderer->NewSetDrawingAreaCommand();
-        cmd->new_area = m_drawing_area;
-        m_sw_renderer->PushCommand(cmd);
+        EnsureVertexBufferSpaceForCommand(cmd);
       }
     }
   }
 
-  LoadVertices();
+  if (cmd->params.check_mask_before_draw)
+    m_current_depth++;
 }
 
 void GPU_HW::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
 {
-  // Not done in HW, but need to forward through to SW if using that for readbacks
-  if (m_sw_renderer)
-  {
-    GPUBackendUpdateCLUTCommand* cmd = m_sw_renderer->NewUpdateCLUTCommand();
-    FillBackendCommandParameters(cmd);
-    cmd->reg.bits = reg.bits;
-    cmd->clut_is_8bit = clut_is_8bit;
-    m_sw_renderer->PushCommand(cmd);
-  }
+  if (ShouldDrawWithSoftwareRenderer())
+    GPU_SW_Rasterizer::UpdateCLUT(reg, clut_is_8bit);
 }
 
 void GPU_HW::FlushRender()
@@ -3864,7 +3660,13 @@ void GPU_HW::FlushRender()
   }
 }
 
-void GPU_HW::UpdateDisplay()
+void GPU_HW::DrawingAreaChanged()
+{
+  m_clamped_drawing_area = GPU::GetClampedDrawingArea(GPU_SW_Rasterizer::g_drawing_area);
+  m_drawing_area_changed = true;
+}
+
+void GPU_HW::UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd)
 {
   FlushRender();
   DeactivateROV();
@@ -3873,7 +3675,7 @@ void GPU_HW::UpdateDisplay()
 
   GPUTextureCache::Compact();
 
-  if (g_settings.debugging.show_vram)
+  if (g_gpu_settings.debugging.show_vram)
   {
     if (IsUsingMultisampling())
     {
@@ -3889,30 +3691,30 @@ void GPU_HW::UpdateDisplay()
     return;
   }
 
-  const bool interlaced = IsInterlacedDisplayEnabled();
-  const u32 interlaced_field = GetInterlacedDisplayField();
-  const u32 resolution_scale = m_GPUSTAT.display_area_color_depth_24 ? 1 : m_resolution_scale;
-  const u32 scaled_vram_offset_x = m_crtc_state.display_vram_left * resolution_scale;
-  const u32 scaled_vram_offset_y = (m_crtc_state.display_vram_top * resolution_scale) +
-                                   ((interlaced && m_GPUSTAT.vertical_resolution) ? interlaced_field : 0);
-  const u32 scaled_display_width = m_crtc_state.display_vram_width * resolution_scale;
-  const u32 scaled_display_height = m_crtc_state.display_vram_height * resolution_scale;
+  const bool interlaced = cmd->interlaced_display_enabled;
+  const u32 interlaced_field = cmd->interlaced_display_field;
+  const u32 resolution_scale = cmd->display_24bit ? 1 : m_resolution_scale;
+  const u32 scaled_vram_offset_x = cmd->display_vram_left * resolution_scale;
+  const u32 scaled_vram_offset_y = (cmd->display_vram_top * resolution_scale) +
+                                   ((interlaced && cmd->interlaced_display_interleaved) ? interlaced_field : 0);
+  const u32 scaled_display_width = cmd->display_vram_width * resolution_scale;
+  const u32 scaled_display_height = cmd->display_vram_height * resolution_scale;
   const u32 read_height = interlaced ? (scaled_display_height / 2u) : scaled_display_height;
-  const u32 line_skip = BoolToUInt32(interlaced && m_GPUSTAT.vertical_resolution);
+  const u32 line_skip = cmd->interlaced_display_interleaved;
   bool drew_anything = false;
 
   // Don't bother grabbing depth if postfx doesn't need it.
-  GPUTexture* depth_source = (!m_GPUSTAT.display_area_color_depth_24 && m_pgxp_depth_buffer &&
-                              PostProcessing::InternalChain.NeedsDepthBuffer()) ?
-                               (m_depth_was_copied ? m_vram_depth_copy_texture.get() : m_vram_depth_texture.get()) :
-                               nullptr;
+  GPUTexture* depth_source =
+    (!cmd->display_24bit && m_pgxp_depth_buffer && PostProcessing::InternalChain.NeedsDepthBuffer()) ?
+      (m_depth_was_copied ? m_vram_depth_copy_texture.get() : m_vram_depth_texture.get()) :
+      nullptr;
 
-  if (IsDisplayDisabled())
+  if (cmd->display_disabled)
   {
     ClearDisplayTexture();
     return;
   }
-  else if (!m_GPUSTAT.display_area_color_depth_24 && !IsUsingMultisampling() &&
+  else if (!cmd->display_24bit && !IsUsingMultisampling() &&
            (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture->GetWidth() &&
            (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture->GetHeight() &&
            !PostProcessing::InternalChain.IsActive())
@@ -3967,14 +3769,14 @@ void GPU_HW::UpdateDisplay()
     else
     {
       g_gpu_device->SetRenderTarget(m_vram_extract_texture.get());
-      g_gpu_device->SetPipeline(m_vram_extract_pipeline[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)].get());
+      g_gpu_device->SetPipeline(m_vram_extract_pipeline[BoolToUInt8(cmd->display_24bit)].get());
       g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
     }
 
-    const u32 reinterpret_start_x = m_crtc_state.regs.X * resolution_scale;
-    const u32 skip_x = (m_crtc_state.display_vram_left - m_crtc_state.regs.X) * resolution_scale;
+    const u32 reinterpret_start_x = cmd->X * resolution_scale;
+    const u32 skip_x = (cmd->display_vram_left - cmd->X) * resolution_scale;
     GL_INS_FMT("VRAM extract, depth = {}, 24bpp = {}, skip_x = {}, line_skip = {}", depth_source ? "yes" : "no",
-               m_GPUSTAT.display_area_color_depth_24.GetValue(), skip_x, line_skip);
+               cmd->display_24bit.GetValue(), skip_x, line_skip);
     GL_INS_FMT("Source: {},{} => {},{} ({}x{})", reinterpret_start_x, scaled_vram_offset_y,
                reinterpret_start_x + scaled_display_width, scaled_vram_offset_y + read_height, scaled_display_width,
                read_height);
@@ -4020,7 +3822,7 @@ void GPU_HW::UpdateDisplay()
     }
   }
 
-  if (m_downsample_mode != GPUDownsampleMode::Disabled && !m_GPUSTAT.display_area_color_depth_24)
+  if (m_downsample_mode != GPUDownsampleMode::Disabled && !cmd->display_24bit)
   {
     DebugAssert(m_display_texture);
     DownsampleFramebuffer();
@@ -4231,68 +4033,7 @@ void GPU_HW::DownsampleFramebufferBoxFilter(GPUTexture* source, u32 left, u32 to
   SetDisplayTexture(m_downsample_texture.get(), m_display_depth_buffer, 0, 0, ds_width, ds_height);
 }
 
-void GPU_HW::DrawRendererStats()
+std::unique_ptr<GPUBackend> GPUBackend::CreateHardwareBackend()
 {
-  if (ImGui::CollapsingHeader("Renderer Statistics", ImGuiTreeNodeFlags_DefaultOpen))
-  {
-    static const ImVec4 active_color{1.0f, 1.0f, 1.0f, 1.0f};
-    static const ImVec4 inactive_color{0.4f, 0.4f, 0.4f, 1.0f};
-
-    ImGui::Columns(2);
-    ImGui::SetColumnWidth(0, 200.0f * ImGuiManager::GetGlobalScale());
-
-    ImGui::TextUnformatted("Resolution Scale:");
-    ImGui::NextColumn();
-    ImGui::Text("%u (VRAM %ux%u)", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
-                VRAM_HEIGHT * m_resolution_scale);
-    ImGui::NextColumn();
-
-    ImGui::TextUnformatted("Effective Display Resolution:");
-    ImGui::NextColumn();
-    ImGui::Text("%ux%u", m_crtc_state.display_vram_width * m_resolution_scale,
-                m_crtc_state.display_vram_height * m_resolution_scale);
-    ImGui::NextColumn();
-
-    ImGui::TextUnformatted("True Color:");
-    ImGui::NextColumn();
-    ImGui::TextColored(m_true_color ? active_color : inactive_color, m_true_color ? "Enabled" : "Disabled");
-    ImGui::NextColumn();
-
-    const bool scaled_dithering = (m_resolution_scale > 1 && g_settings.gpu_scaled_dithering);
-    ImGui::TextUnformatted("Scaled Dithering:");
-    ImGui::NextColumn();
-    ImGui::TextColored(scaled_dithering ? active_color : inactive_color, scaled_dithering ? "Enabled" : "Disabled");
-    ImGui::NextColumn();
-
-    ImGui::TextUnformatted("Texture Filtering:");
-    ImGui::NextColumn();
-    ImGui::TextColored((m_texture_filtering != GPUTextureFilter::Nearest) ? active_color : inactive_color, "%s",
-                       Settings::GetTextureFilterDisplayName(m_texture_filtering));
-    ImGui::NextColumn();
-
-    ImGui::TextUnformatted("PGXP:");
-    ImGui::NextColumn();
-    ImGui::TextColored(g_settings.gpu_pgxp_enable ? active_color : inactive_color, "Geom");
-    ImGui::SameLine();
-    ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) ? active_color : inactive_color,
-                       "Cull");
-    ImGui::SameLine();
-    ImGui::TextColored(
-      (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction) ? active_color : inactive_color, "Tex");
-    ImGui::SameLine();
-    ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_vertex_cache) ? active_color : inactive_color,
-                       "Cache");
-    ImGui::NextColumn();
-
-    ImGui::Columns(1);
-  }
-}
-
-std::unique_ptr<GPU> GPU::CreateHardwareRenderer(Error* error)
-{
-  std::unique_ptr<GPU_HW> gpu(std::make_unique<GPU_HW>());
-  if (!gpu->Initialize(error))
-    gpu.reset();
-
-  return gpu;
+  return std::make_unique<GPU_HW>();
 }
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index a35f88bad..280baf400 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gpu.h"
+#include "gpu_backend.h"
 #include "gpu_hw_texture_cache.h"
 
 #include "util/gpu_device.h"
@@ -21,7 +21,9 @@ class GPU_SW_Backend;
 struct GPUBackendCommand;
 struct GPUBackendDrawCommand;
 
-class GPU_HW final : public GPU
+// TODO: Move to cpp
+// TODO: Rename to GPUHWBackend; the current name is kept for now to avoid conflicts.
+class GPU_HW final : public GPUBackend
 {
 public:
   enum class BatchRenderMode : u8
@@ -63,21 +65,40 @@ public:
   GPU_HW();
   ~GPU_HW() override;
 
-  const Threading::Thread* GetSWThread() const override;
   bool IsHardwareRenderer() const override;
 
-  bool Initialize(Error* error) override;
-  void Reset(bool clear_vram) override;
-  bool DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display) override;
+  bool Initialize(bool upload_vram, Error* error) override;
+
+  u32 GetResolutionScale() const override;
 
   void RestoreDeviceContext() override;
 
+protected:
   void UpdateSettings(const Settings& old_settings) override;
 
-  u32 GetResolutionScale() const override;
   void UpdateResolutionScale() override;
 
-  void UpdateDisplay() override;
+  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override;
+  void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
+  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override;
+  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                GPUBackendCommandParameters params) override;
+  void ClearCache() override;
+  void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
+  void OnBufferSwapped() override;
+
+  void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
+  void DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd) override;
+  void DrawSprite(const GPUBackendDrawRectangleCommand* cmd) override;
+  void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
+
+  void FlushRender() override;
+  void DrawingAreaChanged() override;
+  void ClearVRAM() override;
+
+  void LoadState(const GPUBackendLoadStateCommand* cmd) override;
+
+  void UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd) override;
 
 private:
   enum : u32
@@ -86,6 +107,7 @@ private:
     MAX_VERTICES_FOR_RECTANGLE = 6 * (((MAX_PRIMITIVE_WIDTH + (TEXTURE_PAGE_WIDTH - 1)) / TEXTURE_PAGE_WIDTH) + 1u) *
                                  (((MAX_PRIMITIVE_HEIGHT + (TEXTURE_PAGE_HEIGHT - 1)) / TEXTURE_PAGE_HEIGHT) + 1u),
     NUM_TEXTURE_MODES = static_cast<u32>(BatchTextureMode::MaxCount),
+    INVALID_DRAW_MODE_BITS = 0xFFFFFFFFu,
   };
   enum : u8
   {
@@ -164,8 +186,6 @@ private:
   bool CompileResolutionDependentPipelines(Error* error);
   bool CompileDownsamplePipelines(Error* error);
 
-  void LoadVertices();
-
   void PrintSettingsToLog();
   void CheckSettings();
 
@@ -184,8 +204,10 @@ private:
   u32 CalculateResolutionScale() const;
   GPUDownsampleMode GetDownsampleMode(u32 resolution_scale) const;
 
+  bool ShouldDrawWithSoftwareRenderer() const;
+
   bool IsUsingMultisampling() const;
-  bool IsUsingDownsampling() const;
+  bool IsUsingDownsampling(const GPUBackendUpdateDisplayCommand* cmd) const;
 
   void SetFullVRAMDirtyRectangle();
   void ClearVRAMDirtyRectangle();
@@ -195,12 +217,15 @@ private:
   void AddUnclampedDrawnRectangle(const GSVector4i rect);
   void SetTexPageChangedOnOverlap(const GSVector4i update_rect);
 
-  void CheckForTexPageOverlap(GSVector4i uv_rect);
+  void CheckForTexPageOverlap(const GPUBackendDrawCommand* cmd, GSVector4i uv_rect);
   bool ShouldCheckForTexPageOverlap() const;
 
   bool IsFlushed() const;
   void EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices);
-  void EnsureVertexBufferSpaceForCurrentCommand();
+  void EnsureVertexBufferSpaceForCommand(const GPUBackendDrawCommand* cmd);
+  void PrepareDraw(const GPUBackendDrawCommand* cmd);
+  void FinishPolygonDraw(const GPUBackendDrawCommand* cmd, std::array<BatchVertex, 4>& vertices, u32 num_vertices,
+                         bool is_3d);
   void ResetBatchVertexDepth();
 
   /// Returns the value to be written to the depth buffer for the current operation for mask bit emulation.
@@ -212,20 +237,6 @@ private:
   /// Returns true if the draw is going to use shader blending/framebuffer fetch.
   bool NeedsShaderBlending(GPUTransparencyMode transparency, BatchTextureMode texture, bool check_mask) const;
 
-  void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
-  void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
-  void UpdateSoftwareRenderer(bool copy_vram_from_hw);
-
-  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) override;
-  void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
-  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) override;
-  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
-  void DispatchRenderCommand() override;
-  void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
-  void FlushRender() override;
-  void DrawRendererStats() override;
-  void OnBufferSwapped() override;
-
   void UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* data, u32 data_pitch, bool set_mask,
                        bool check_mask, const GSVector4i bounds);
   bool BlitVRAMReplacementTexture(const GPUTextureCache::TextureReplacementImage* tex, u32 dst_x, u32 dst_y, u32 width,
@@ -235,17 +246,17 @@ private:
   void DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth);
 
   /// Handles quads with flipped texture coordinate directions.
-  void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
+  void HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd, BatchVertex* vertices);
   bool IsPossibleSpritePolygon(const BatchVertex* vertices) const;
   bool ExpandLineTriangles(BatchVertex* vertices);
 
   /// Computes polygon U/V boundaries, and for overlap with the current texture page.
-  void ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices);
+  void ComputePolygonUVLimits(const GPUBackendDrawCommand* cmd, BatchVertex* vertices, u32 num_vertices);
 
   /// Sets the depth test flag for PGXP depth buffering.
-  void SetBatchDepthBuffer(bool enabled);
-  void CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices);
-  void SetBatchSpriteMode(bool enabled);
+  void SetBatchDepthBuffer(const GPUBackendDrawCommand* cmd, bool enabled);
+  void CheckForDepthClear(const GPUBackendDrawCommand* cmd, const BatchVertex* vertices, u32 num_vertices);
+  void SetBatchSpriteMode(const GPUBackendDrawCommand* cmd, bool enabled);
 
   void UpdateDownsamplingLevels();
 
@@ -264,8 +275,6 @@ private:
   std::unique_ptr<GPUTextureBuffer> m_vram_upload_buffer;
   std::unique_ptr<GPUTexture> m_vram_write_texture;
 
-  std::unique_ptr<GPU_SW_Backend> m_sw_renderer;
-
   BatchVertex* m_batch_vertex_ptr = nullptr;
   u16* m_batch_index_ptr = nullptr;
   u32 m_batch_base_vertex = 0;
@@ -307,18 +316,32 @@ private:
   u8 m_texpage_dirty = 0;
 
   bool m_batch_ubo_dirty = true;
+  bool m_drawing_area_changed = true;
   BatchConfig m_batch;
 
   // Changed state
   BatchUBOData m_batch_ubo_data = {};
 
   // Bounding box of VRAM area that the GPU has drawn into.
+  GSVector4i m_clamped_drawing_area = {};
   GSVector4i m_vram_dirty_draw_rect = INVALID_RECT;
   GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
   GSVector4i m_current_uv_rect = INVALID_RECT;
   GSVector4i m_current_draw_rect = INVALID_RECT;
   s32 m_current_texture_page_offset[2] = {};
 
+  union
+  {
+    struct
+    {
+      // NOTE: Only the texture-related bits should be used here; the others are not validated.
+      GPUDrawModeReg mode_reg;
+      GPUTexturePaletteReg palette_reg;
+    };
+
+    u32 bits = INVALID_DRAW_MODE_BITS;
+  } m_draw_mode = {};
+
   std::unique_ptr<GPUPipeline> m_wireframe_pipeline;
 
   // [wrapped][interlaced]
diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp
index 630456b78..5777b6add 100644
--- a/src/core/gpu_hw_texture_cache.cpp
+++ b/src/core/gpu_hw_texture_cache.cpp
@@ -49,6 +49,9 @@ static constexpr const GSVector4i& INVALID_RECT = GPU_HW::INVALID_RECT;
 static constexpr const GPUTexture::Format REPLACEMENT_TEXTURE_FORMAT = GPUTexture::Format::RGBA8;
 static constexpr const char LOCAL_CONFIG_FILENAME[] = "config.yaml";
 
+static constexpr u32 STATE_PALETTE_RECORD_SIZE =
+  sizeof(GSVector4i) + sizeof(SourceKey) + sizeof(PaletteRecordFlags) + sizeof(HashType) + sizeof(u16) * MAX_CLUT_SIZE;
+
 // Has to be public because it's referenced in Source.
 struct HashCacheEntry
 {
@@ -517,6 +520,7 @@ static std::unique_ptr<GPUTexture> s_replacement_texture_render_target;
 static std::unique_ptr<GPUPipeline> s_replacement_draw_pipeline;                 // copies alpha as-is
 static std::unique_ptr<GPUPipeline> s_replacement_semitransparent_draw_pipeline; // inverts alpha (i.e. semitransparent)
 
+static GPU_HW* s_hw_backend = nullptr; // TODO:FIXME: remove me
 static bool s_track_vram_writes = false;
 
 static std::string s_game_id;
@@ -551,8 +555,10 @@ bool GPUTextureCache::IsDumpingVRAMWriteTextures()
   return (g_settings.texture_replacements.dump_textures && !s_config.dump_texture_pages);
 }
 
-bool GPUTextureCache::Initialize()
+bool GPUTextureCache::Initialize(GPU_HW* backend)
 {
+  s_hw_backend = backend;
+
   LoadLocalConfiguration(false, false);
   UpdateVRAMTrackingState();
   if (!CompilePipelines())
@@ -599,134 +605,164 @@ void GPUTextureCache::UpdateSettings(bool use_texture_cache, const Settings& old
   }
 }
 
-bool GPUTextureCache::DoState(StateWrapper& sw, bool skip)
+bool GPUTextureCache::GetStateSize(StateWrapper& sw, u32* size)
 {
   if (sw.GetVersion() < 73)
   {
-    if (!skip)
-      WARNING_LOG("Texture cache not in save state due to old version.");
-
-    Invalidate();
+    *size = 0;
     return true;
   }
 
-  if (!sw.DoMarker("GPUTextureCache"))
+  const size_t start = sw.GetPosition();
+  if (!sw.DoMarker("GPUTextureCache")) [[unlikely]]
     return false;
 
-  if (sw.IsReading())
+  u32 num_vram_writes = 0;
+  sw.Do(&num_vram_writes);
+
+  for (u32 i = 0; i < num_vram_writes; i++)
   {
-    if (!skip)
-      Invalidate();
+    sw.SkipBytes(sizeof(GSVector4i) * 2 + sizeof(HashType));
 
-    u32 num_vram_writes = 0;
-    sw.Do(&num_vram_writes);
-
-    const bool skip_writes = (skip || !s_track_vram_writes);
-
-    for (u32 i = 0; i < num_vram_writes; i++)
-    {
-      static constexpr u32 PALETTE_RECORD_SIZE = sizeof(GSVector4i) + sizeof(SourceKey) + sizeof(PaletteRecordFlags) +
-                                                 sizeof(HashType) + sizeof(u16) * MAX_CLUT_SIZE;
-
-      if (skip_writes)
-      {
-        sw.SkipBytes(sizeof(GSVector4i) * 2 + sizeof(HashType));
-
-        u32 num_palette_records = 0;
-        sw.Do(&num_palette_records);
-        sw.SkipBytes(num_palette_records * PALETTE_RECORD_SIZE);
-      }
-      else
-      {
-        VRAMWrite* vrw = new VRAMWrite();
-        DoStateVector(sw, &vrw->active_rect);
-        DoStateVector(sw, &vrw->write_rect);
-        sw.Do(&vrw->hash);
-
-        u32 num_palette_records = 0;
-        sw.Do(&num_palette_records);
-
-        // Skip palette records if we're not dumping now.
-        if (g_settings.texture_replacements.dump_textures)
-        {
-          vrw->palette_records.reserve(num_palette_records);
-          for (u32 j = 0; j < num_palette_records; j++)
-          {
-            VRAMWrite::PaletteRecord& rec = vrw->palette_records.emplace_back();
-            DoStateVector(sw, &rec.rect);
-            sw.DoBytes(&rec.key, sizeof(rec.key));
-            sw.Do(&rec.flags);
-            sw.Do(&rec.palette_hash);
-            sw.DoBytes(rec.palette, sizeof(rec.palette));
-          }
-        }
-        else
-        {
-          sw.SkipBytes(num_palette_records * PALETTE_RECORD_SIZE);
-        }
-
-        if (sw.HasError())
-        {
-          delete vrw;
-          Invalidate();
-          return false;
-        }
-
-        vrw->num_page_refs = 0;
-        LoopRectPages(vrw->active_rect, [vrw](u32 pn) {
-          DebugAssert(vrw->num_page_refs < MAX_PAGE_REFS_PER_WRITE);
-          ListAppend(&s_pages[pn].writes, vrw, &vrw->page_refs[vrw->num_page_refs++]);
-          return true;
-        });
-      }
-    }
+    u32 num_palette_records = 0;
+    sw.Do(&num_palette_records);
+    sw.SkipBytes(num_palette_records * STATE_PALETTE_RECORD_SIZE);
   }
-  else
+
+  if (sw.HasError()) [[unlikely]]
+    return false;
+
+  *size = static_cast<u32>(sw.GetPosition() - start);
+  return true;
+}
+
+void GPUTextureCache::LoadState(std::span<const u8> data, u32 data_version)
+{
+  Invalidate();
+
+  if (data.empty())
   {
-    s_temp_vram_write_list.clear();
+    WARNING_LOG("Texture cache not in save state due to old version.");
+    return;
+  }
 
-    if (!skip && s_track_vram_writes)
+  // Don't need anything if we're not tracking VRAM writes.
+  if (!s_track_vram_writes)
+    return;
+
+  StateWrapper sw(data, StateWrapper::Mode::Read, data_version);
+
+  if (!sw.DoMarker("GPUTextureCache")) [[unlikely]]
+  {
+    WARNING_LOG("Invalid save state data.");
+    return;
+  }
+
+  u32 num_vram_writes = 0;
+  sw.Do(&num_vram_writes);
+
+  for (u32 i = 0; i < num_vram_writes; i++)
+  {
+    if (!s_track_vram_writes)
     {
-      for (PageEntry& page : s_pages)
-      {
-        ListIterate(page.writes, [](VRAMWrite* vrw) {
-          if (std::find(s_temp_vram_write_list.begin(), s_temp_vram_write_list.end(), vrw) !=
-              s_temp_vram_write_list.end())
-          {
-            return;
-          }
+      sw.SkipBytes(sizeof(GSVector4i) * 2 + sizeof(HashType));
 
-          // try not to lose data... pull it from the sources
-          if (g_settings.texture_replacements.dump_textures)
-            SyncVRAMWritePaletteRecords(vrw);
-
-          s_temp_vram_write_list.push_back(vrw);
-        });
-      }
+      u32 num_palette_records = 0;
+      sw.Do(&num_palette_records);
+      sw.SkipBytes(num_palette_records * STATE_PALETTE_RECORD_SIZE);
     }
-
-    u32 num_vram_writes = static_cast<u32>(s_temp_vram_write_list.size());
-    sw.Do(&num_vram_writes);
-    for (VRAMWrite* vrw : s_temp_vram_write_list)
+    else
     {
+      VRAMWrite* vrw = new VRAMWrite();
       DoStateVector(sw, &vrw->active_rect);
       DoStateVector(sw, &vrw->write_rect);
       sw.Do(&vrw->hash);
 
-      u32 num_palette_records = static_cast<u32>(vrw->palette_records.size());
+      u32 num_palette_records = 0;
       sw.Do(&num_palette_records);
-      for (VRAMWrite::PaletteRecord& rec : vrw->palette_records)
+
+      // Skip palette records if we're not dumping now.
+      if (g_settings.texture_replacements.dump_textures)
       {
-        DoStateVector(sw, &rec.rect);
-        sw.DoBytes(&rec.key, sizeof(rec.key));
-        sw.Do(&rec.flags);
-        sw.Do(&rec.palette_hash);
-        sw.DoBytes(rec.palette, sizeof(rec.palette));
+        vrw->palette_records.reserve(num_palette_records);
+        for (u32 j = 0; j < num_palette_records; j++)
+        {
+          VRAMWrite::PaletteRecord& rec = vrw->palette_records.emplace_back();
+          DoStateVector(sw, &rec.rect);
+          sw.DoBytes(&rec.key, sizeof(rec.key));
+          sw.Do(&rec.flags);
+          sw.Do(&rec.palette_hash);
+          sw.DoBytes(rec.palette, sizeof(rec.palette));
+        }
       }
+      else
+      {
+        sw.SkipBytes(num_palette_records * STATE_PALETTE_RECORD_SIZE);
+      }
+
+      if (sw.HasError())
+      {
+        WARNING_LOG("Invalid save state data.");
+        delete vrw;
+        Invalidate();
+        return;
+      }
+
+      vrw->num_page_refs = 0;
+      LoopRectPages(vrw->active_rect, [vrw](u32 pn) {
+        DebugAssert(vrw->num_page_refs < MAX_PAGE_REFS_PER_WRITE);
+        ListAppend(&s_pages[pn].writes, vrw, &vrw->page_refs[vrw->num_page_refs++]);
+        return true;
+      });
+    }
+  }
+}
+
+void GPUTextureCache::SaveState(StateWrapper& sw)
+{
+  sw.DoMarker("GPUTextureCache");
+
+  s_temp_vram_write_list.clear();
+
+  if (s_track_vram_writes)
+  {
+    for (PageEntry& page : s_pages)
+    {
+      ListIterate(page.writes, [](VRAMWrite* vrw) {
+        if (std::find(s_temp_vram_write_list.begin(), s_temp_vram_write_list.end(), vrw) !=
+            s_temp_vram_write_list.end())
+        {
+          return;
+        }
+
+        // try not to lose data... pull it from the sources
+        if (g_settings.texture_replacements.dump_textures)
+          SyncVRAMWritePaletteRecords(vrw);
+
+        s_temp_vram_write_list.push_back(vrw);
+      });
     }
   }
 
-  return !sw.HasError();
+  u32 num_vram_writes = static_cast<u32>(s_temp_vram_write_list.size());
+  sw.Do(&num_vram_writes);
+  for (VRAMWrite* vrw : s_temp_vram_write_list)
+  {
+    DoStateVector(sw, &vrw->active_rect);
+    DoStateVector(sw, &vrw->write_rect);
+    sw.Do(&vrw->hash);
+
+    u32 num_palette_records = static_cast<u32>(vrw->palette_records.size());
+    sw.Do(&num_palette_records);
+    for (VRAMWrite::PaletteRecord& rec : vrw->palette_records)
+    {
+      DoStateVector(sw, &rec.rect);
+      sw.DoBytes(&rec.key, sizeof(rec.key));
+      sw.Do(&rec.flags);
+      sw.Do(&rec.palette_hash);
+      sw.DoBytes(rec.palette, sizeof(rec.palette));
+    }
+  }
 }
 
 void GPUTextureCache::Shutdown()
@@ -737,6 +773,7 @@ void GPUTextureCache::Shutdown()
   s_replacement_texture_render_target.reset();
   s_hash_cache_purge_list = {};
   s_temp_vram_write_list = {};
+  s_hw_backend = nullptr;
   s_track_vram_writes = false;
 
   s_replacement_image_cache.clear();
@@ -3305,5 +3342,5 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
   g_gpu_device->RecycleTexture(std::move(entry->texture));
   entry->texture = std::move(replacement_tex);
 
-  g_gpu->RestoreDeviceContext();
+  s_hw_backend->RestoreDeviceContext();
 }
\ No newline at end of file
diff --git a/src/core/gpu_hw_texture_cache.h b/src/core/gpu_hw_texture_cache.h
index dd629a40f..a44e257b6 100644
--- a/src/core/gpu_hw_texture_cache.h
+++ b/src/core/gpu_hw_texture_cache.h
@@ -10,6 +10,7 @@ class RGBA8Image;
 class StateWrapper;
 
 struct Settings;
+class GPU_HW;
 
 //////////////////////////////////////////////////////////////////////////
 // Texture Cache
@@ -102,9 +103,13 @@ struct Source
   TListNode<Source> hash_cache_ref;
 };
 
-bool Initialize();
+bool Initialize(GPU_HW* backend);
 void UpdateSettings(bool use_texture_cache, const Settings& old_settings);
-bool DoState(StateWrapper& sw, bool skip);
+
+bool GetStateSize(StateWrapper& sw, u32* size);
+void LoadState(std::span<const u8> data, u32 data_version);
+void SaveState(StateWrapper& sw);
+
 void Shutdown();
 
 void Invalidate();
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index 10eb5a5bc..480f09745 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -2,7 +2,8 @@
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
 #include "gpu_sw.h"
-#include "gpu_hw_texture_cache.h"
+#include "gpu.h"
+#include "gpu_sw_rasterizer.h"
 #include "settings.h"
 #include "system.h"
 
@@ -10,8 +11,7 @@
 
 #include "common/align.h"
 #include "common/assert.h"
-#include "common/gsvector.h"
-#include "common/gsvector_formatter.h"
+#include "common/intrin.h"
 #include "common/log.h"
 
 #include <algorithm>
@@ -20,27 +20,149 @@ LOG_CHANNEL(GPU_SW);
 
 GPU_SW::GPU_SW() = default;
 
-GPU_SW::~GPU_SW()
-{
-  g_gpu_device->RecycleTexture(std::move(m_upload_texture));
-  m_backend.Shutdown();
-}
-
-const Threading::Thread* GPU_SW::GetSWThread() const
-{
-  return m_backend.GetThread();
-}
+GPU_SW::~GPU_SW() = default;
 
 bool GPU_SW::IsHardwareRenderer() const
 {
   return false;
 }
 
-bool GPU_SW::Initialize(Error* error)
+u32 GPU_SW::GetResolutionScale() const
 {
-  if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread))
+  return 1u; // The software backend always reports a 1x resolution scale.
+}
+
+bool GPU_SW::Initialize(bool upload_vram, Error* error)
+{
+  if (!GPUBackend::Initialize(upload_vram, error)) // base-class initialization must succeed first
     return false;
 
+  // When upload_vram is false we are using "new" VRAM, so clear it out here.
+  if (!upload_vram)
+    std::memset(g_vram, 0, sizeof(g_vram));
+
+  SetDisplayTextureFormat();
+  return true;
+}
+
+void GPU_SW::ClearVRAM()
+{
+  std::memset(g_vram, 0, sizeof(g_vram));         // zero the VRAM contents
+  std::memset(g_gpu_clut, 0, sizeof(g_gpu_clut)); // and the CLUT entries copied out of VRAM
+}
+
+void GPU_SW::UpdateResolutionScale()
+{ // Nothing to update: GetResolutionScale() is fixed at 1 for this backend.
+}
+
+void GPU_SW::LoadState(const GPUBackendLoadStateCommand* cmd)
+{
+  std::memcpy(g_vram, cmd->vram_data, sizeof(g_vram));         // overwrite VRAM with the command's copy
+  std::memcpy(g_gpu_clut, cmd->clut_data, sizeof(g_gpu_clut)); // and the CLUT likewise
+}
+
+void GPU_SW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
+{ // Empty here; presumably g_vram is already current for this backend, so no readback is needed - TODO confirm.
+}
+
+void GPU_SW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
+{ // Forwards to the rasterizer's FillVRAM, unpacking the interlacing fields from params.
+  GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, params.interlaced_rendering, params.active_line_lsb);
+}
+
+void GPU_SW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params)
+{ // Forwards to the rasterizer's WriteVRAM, unpacking the mask flags from params.
+  GPU_SW_Rasterizer::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw);
+}
+
+void GPU_SW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                      GPUBackendCommandParameters params)
+{ // Forwards to the rasterizer's CopyVRAM, unpacking the mask flags from params.
+  GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
+                              params.check_mask_before_draw);
+}
+
+void GPU_SW::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
+{
+  // Select the triangle rasterizer specialised for this command's render flags.
+  const GPURenderCommand flags{cmd->rc.bits};
+  const GPU_SW_Rasterizer::DrawTriangleFunction rasterize = GPU_SW_Rasterizer::GetDrawTriangleFunction(
+    flags.shading_enable, flags.texture_enable, flags.raw_texture_enable, flags.transparency_enable);
+  const GPUBackendDrawPolygonCommand::Vertex* const v = &cmd->vertices[0];
+  rasterize(cmd, &v[0], &v[1], &v[2]);
+  if (cmd->num_vertices > 3) // a fourth vertex is present: also draw the second triangle (v2, v1, v3)
+    rasterize(cmd, &v[2], &v[1], &v[3]);
+}
+
+void GPU_SW::DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd)
+{
+  // Select the triangle rasterizer specialised for this command's render flags.
+  const GPURenderCommand flags{cmd->rc.bits};
+  const GPU_SW_Rasterizer::DrawTriangleFunction rasterize = GPU_SW_Rasterizer::GetDrawTriangleFunction(
+    flags.shading_enable, flags.texture_enable, flags.raw_texture_enable, flags.transparency_enable);
+
+  // The rasterizer takes plain polygon vertices, so copy across only the native position, color and texcoord.
+  // TODO: In _theory_ we could use the fixed-point parts here.
+  GPUBackendDrawPolygonCommand::Vertex native_verts[4];
+  for (u32 idx = 0; idx < cmd->num_vertices; idx++)
+  {
+    const GPUBackendDrawPrecisePolygonCommand::Vertex& precise = cmd->vertices[idx];
+    native_verts[idx] = GPUBackendDrawPolygonCommand::Vertex{
+      .x = precise.native_x, .y = precise.native_y, .color = precise.color, .texcoord = precise.texcoord};
+  }
+
+  rasterize(cmd, &native_verts[0], &native_verts[1], &native_verts[2]);
+  if (cmd->num_vertices > 3)
+    rasterize(cmd, &native_verts[2], &native_verts[1], &native_verts[3]);
+}
+
+void GPU_SW::DrawSprite(const GPUBackendDrawRectangleCommand* cmd)
+{
+  // Select the rectangle rasterizer specialised for this command's render flags, then run it.
+  const GPURenderCommand flags{cmd->rc.bits};
+  const GPU_SW_Rasterizer::DrawRectangleFunction rasterize = GPU_SW_Rasterizer::GetDrawRectangleFunction(
+    flags.texture_enable, flags.raw_texture_enable, flags.transparency_enable);
+
+  rasterize(cmd);
+}
+
+void GPU_SW::DrawLine(const GPUBackendDrawLineCommand* cmd)
+{
+  const GPU_SW_Rasterizer::DrawLineFunction rasterize =
+    GPU_SW_Rasterizer::GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable);
+  // Vertices are consumed as independent (start, end) pairs; assumes num_vertices is even - TODO confirm.
+  for (u16 first = 0; first < cmd->num_vertices; first += 2)
+    rasterize(cmd, &cmd->vertices[first], &cmd->vertices[first + 1]);
+}
+
+void GPU_SW::DrawingAreaChanged()
+{
+  // Nothing to do here: GPU_SW_Rasterizer::g_drawing_area is set by the base class.
+}
+
+void GPU_SW::ClearCache()
+{ // No-op in the software backend.
+}
+
+void GPU_SW::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
+{
+  GPU_SW_Rasterizer::UpdateCLUT(reg, clut_is_8bit); // copies the addressed palette from g_vram into g_gpu_clut
+}
+
+void GPU_SW::OnBufferSwapped()
+{ // No-op in the software backend.
+}
+
+void GPU_SW::FlushRender()
+{ // No-op in the software backend.
+}
+
+void GPU_SW::RestoreDeviceContext()
+{ // No-op in the software backend.
+}
+
+void GPU_SW::SetDisplayTextureFormat()
+{
   static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGBA5551,
                                                          GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
   static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
@@ -61,35 +183,6 @@ bool GPU_SW::Initialize(Error* error)
       break;
     }
   }
-
-  return true;
-}
-
-bool GPU_SW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
-{
-  // need to ensure the worker thread is done
-  m_backend.Sync(true);
-
-  // ignore the host texture for software mode, since we want to save vram here
-  if (!GPU::DoState(sw, nullptr, update_display))
-    return false;
-
-  // need to still call the TC, to toss any data in the state
-  return GPUTextureCache::DoState(sw, true);
-}
-
-void GPU_SW::Reset(bool clear_vram)
-{
-  GPU::Reset(clear_vram);
-
-  m_backend.Reset();
-}
-
-void GPU_SW::UpdateSettings(const Settings& old_settings)
-{
-  GPU::UpdateSettings(old_settings);
-  if (g_settings.gpu_use_thread != old_settings.gpu_use_thread)
-    m_backend.SetThreadEnabled(g_settings.gpu_use_thread);
 }
 
 GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format)
@@ -427,32 +520,28 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
   }
 }
 
-void GPU_SW::UpdateDisplay()
+void GPU_SW::UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd)
 {
-  // fill display texture
-  m_backend.Sync(true);
-
   if (!g_settings.debugging.show_vram)
   {
-    if (IsDisplayDisabled())
+    if (cmd->display_disabled)
     {
       ClearDisplayTexture();
       return;
     }
 
-    const bool is_24bit = m_GPUSTAT.display_area_color_depth_24;
-    const bool interlaced = IsInterlacedDisplayEnabled();
-    const u32 field = GetInterlacedDisplayField();
-    const u32 vram_offset_x = is_24bit ? m_crtc_state.regs.X : m_crtc_state.display_vram_left;
-    const u32 vram_offset_y =
-      m_crtc_state.display_vram_top + ((interlaced && m_GPUSTAT.vertical_resolution) ? field : 0);
-    const u32 skip_x = is_24bit ? (m_crtc_state.display_vram_left - m_crtc_state.regs.X) : 0;
-    const u32 read_width = m_crtc_state.display_vram_width;
-    const u32 read_height = interlaced ? (m_crtc_state.display_vram_height / 2) : m_crtc_state.display_vram_height;
+    const bool is_24bit = cmd->display_24bit;
+    const bool interlaced = cmd->interlaced_display_enabled;
+    const u32 field = cmd->interlaced_display_field;
+    const u32 vram_offset_x = is_24bit ? cmd->X : cmd->display_vram_left;
+    const u32 vram_offset_y = cmd->display_vram_top + ((interlaced && cmd->interlaced_display_interleaved) ? field : 0);
+    const u32 skip_x = is_24bit ? (cmd->display_vram_left - cmd->X) : 0;
+    const u32 read_width = cmd->display_vram_width;
+    const u32 read_height = interlaced ? (cmd->display_vram_height / 2) : cmd->display_vram_height;
 
-    if (IsInterlacedDisplayEnabled())
+    if (cmd->interlaced_display_enabled)
     {
-      const u32 line_skip = m_GPUSTAT.vertical_resolution;
+      const u32 line_skip = cmd->interlaced_display_interleaved;
       if (CopyOut(vram_offset_x, vram_offset_y, skip_x, read_width, read_height, line_skip, is_24bit))
       {
         SetDisplayTexture(m_upload_texture.get(), nullptr, 0, 0, read_width, read_height);
@@ -484,351 +573,7 @@ void GPU_SW::UpdateDisplay()
   }
 }
 
-void GPU_SW::FillBackendCommandParameters(GPUBackendCommand* cmd) const
+std::unique_ptr<GPUBackend> GPUBackend::CreateSoftwareBackend()
 {
-  cmd->params.bits = 0;
-  cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
-  cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
-  cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
-  cmd->params.interlaced_rendering = IsInterlacedRenderingEnabled();
-}
-
-void GPU_SW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
-{
-  FillBackendCommandParameters(cmd);
-  cmd->rc.bits = rc.bits;
-  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
-  cmd->draw_mode.dither_enable = rc.IsDitheringEnabled() && cmd->draw_mode.dither_enable;
-  cmd->palette.bits = m_draw_mode.palette_reg.bits;
-  cmd->window = m_draw_mode.texture_window;
-}
-
-void GPU_SW::DispatchRenderCommand()
-{
-  if (m_drawing_area_changed)
-  {
-    GPUBackendSetDrawingAreaCommand* cmd = m_backend.NewSetDrawingAreaCommand();
-    cmd->new_area = m_drawing_area;
-    GSVector4i::store<false>(cmd->new_clamped_area, m_clamped_drawing_area);
-    m_backend.PushCommand(cmd);
-    m_drawing_area_changed = false;
-  }
-
-  const GPURenderCommand rc{m_render_command.bits};
-
-  switch (rc.primitive)
-  {
-    case GPUPrimitive::Polygon:
-    {
-      const u32 num_vertices = rc.quad_polygon ? 4 : 3;
-      GPUBackendDrawPolygonCommand* cmd = m_backend.NewDrawPolygonCommand(num_vertices);
-      FillDrawCommand(cmd, rc);
-
-      std::array<GSVector2i, 4> positions;
-      const u32 first_color = rc.color_for_first_vertex;
-      const bool shaded = rc.shading_enable;
-      const bool textured = rc.texture_enable;
-      for (u32 i = 0; i < num_vertices; i++)
-      {
-        GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
-        vert->color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
-        const u64 maddr_and_pos = m_fifo.Pop();
-        const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
-        vert->x = m_drawing_offset.x + vp.x;
-        vert->y = m_drawing_offset.y + vp.y;
-        vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
-        positions[i] = GSVector2i::load(&vert->x);
-      }
-
-      // Cull polygons which are too large.
-      const GSVector2i min_pos_12 = positions[1].min_s32(positions[2]);
-      const GSVector2i max_pos_12 = positions[1].max_s32(positions[2]);
-      const GSVector4i draw_rect_012 = GSVector4i(min_pos_12.min_s32(positions[0]))
-                                         .upl64(GSVector4i(max_pos_12.max_s32(positions[0])))
-                                         .add32(GSVector4i::cxpr(0, 0, 1, 1));
-      const bool first_tri_culled =
-        (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT ||
-         !m_clamped_drawing_area.rintersects(draw_rect_012));
-      if (first_tri_culled)
-      {
-        DEBUG_LOG("Culling off-screen/too-large polygon: {},{} {},{} {},{}", cmd->vertices[0].x, cmd->vertices[0].y,
-                  cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[2].x, cmd->vertices[2].y);
-
-        if (!rc.quad_polygon)
-          return;
-      }
-      else
-      {
-        AddDrawTriangleTicks(positions[0], positions[1], positions[2], rc.shading_enable, rc.texture_enable,
-                             rc.transparency_enable);
-      }
-
-      // quads
-      if (rc.quad_polygon)
-      {
-        const GSVector4i draw_rect_123 = GSVector4i(min_pos_12.min_s32(positions[3]))
-                                           .upl64(GSVector4i(max_pos_12.max_s32(positions[3])))
-                                           .add32(GSVector4i::cxpr(0, 0, 1, 1));
-
-        // Cull polygons which are too large.
-        const bool second_tri_culled =
-          (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
-           !m_clamped_drawing_area.rintersects(draw_rect_123));
-        if (second_tri_culled)
-        {
-          DEBUG_LOG("Culling too-large polygon (quad second half): {},{} {},{} {},{}", cmd->vertices[2].x,
-                    cmd->vertices[2].y, cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[0].x, cmd->vertices[0].y);
-
-          if (first_tri_culled)
-            return;
-        }
-        else
-        {
-          AddDrawTriangleTicks(positions[2], positions[1], positions[3], rc.shading_enable, rc.texture_enable,
-                               rc.transparency_enable);
-        }
-      }
-
-      m_backend.PushCommand(cmd);
-    }
-    break;
-
-    case GPUPrimitive::Rectangle:
-    {
-      GPUBackendDrawRectangleCommand* cmd = m_backend.NewDrawRectangleCommand();
-      FillDrawCommand(cmd, rc);
-      cmd->color = rc.color_for_first_vertex;
-
-      const GPUVertexPosition vp{FifoPop()};
-      cmd->x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
-      cmd->y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
-
-      if (rc.texture_enable)
-      {
-        const u32 texcoord_and_palette = FifoPop();
-        cmd->palette.bits = Truncate16(texcoord_and_palette >> 16);
-        cmd->texcoord = Truncate16(texcoord_and_palette);
-      }
-      else
-      {
-        cmd->palette.bits = 0;
-        cmd->texcoord = 0;
-      }
-
-      switch (rc.rectangle_size)
-      {
-        case GPUDrawRectangleSize::R1x1:
-          cmd->width = 1;
-          cmd->height = 1;
-          break;
-        case GPUDrawRectangleSize::R8x8:
-          cmd->width = 8;
-          cmd->height = 8;
-          break;
-        case GPUDrawRectangleSize::R16x16:
-          cmd->width = 16;
-          cmd->height = 16;
-          break;
-        default:
-        {
-          const u32 width_and_height = FifoPop();
-          cmd->width = static_cast<u16>(width_and_height & VRAM_WIDTH_MASK);
-          cmd->height = static_cast<u16>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
-        }
-        break;
-      }
-
-      const GSVector4i rect = GSVector4i(cmd->x, cmd->y, cmd->x + cmd->width, cmd->y + cmd->height);
-      const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
-      if (clamped_rect.rempty()) [[unlikely]]
-      {
-        DEBUG_LOG("Culling off-screen rectangle {}", rect);
-        return;
-      }
-
-      AddDrawRectangleTicks(clamped_rect, rc.texture_enable, rc.transparency_enable);
-
-      m_backend.PushCommand(cmd);
-    }
-    break;
-
-    case GPUPrimitive::Line:
-    {
-      if (!rc.polyline)
-      {
-        GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand(2);
-        FillDrawCommand(cmd, rc);
-        cmd->palette.bits = 0;
-
-        if (rc.shading_enable)
-        {
-          cmd->vertices[0].color = rc.color_for_first_vertex;
-          const GPUVertexPosition start_pos{FifoPop()};
-          cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
-          cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
-
-          cmd->vertices[1].color = FifoPop() & UINT32_C(0x00FFFFFF);
-          const GPUVertexPosition end_pos{FifoPop()};
-          cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
-          cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
-        }
-        else
-        {
-          cmd->vertices[0].color = rc.color_for_first_vertex;
-          cmd->vertices[1].color = rc.color_for_first_vertex;
-
-          const GPUVertexPosition start_pos{FifoPop()};
-          cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
-          cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
-
-          const GPUVertexPosition end_pos{FifoPop()};
-          cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
-          cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
-        }
-
-        const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
-        const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
-        const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
-        const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
-
-        if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
-        {
-          DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", cmd->vertices[0].y, cmd->vertices[0].y,
-                    cmd->vertices[1].x, cmd->vertices[1].y);
-          return;
-        }
-
-        AddDrawLineTicks(clamped_rect, rc.shading_enable);
-
-        m_backend.PushCommand(cmd);
-      }
-      else
-      {
-        const u32 num_vertices = GetPolyLineVertexCount();
-
-        GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand((num_vertices - 1) * 2);
-        FillDrawCommand(cmd, m_render_command);
-
-        u32 buffer_pos = 0;
-        const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
-        const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
-        GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
-        u32 start_color = m_render_command.color_for_first_vertex;
-
-        const bool shaded = m_render_command.shading_enable;
-        u32 out_vertex_count = 0;
-        for (u32 i = 1; i < num_vertices; i++)
-        {
-          const u32 end_color =
-            shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : m_render_command.color_for_first_vertex;
-          const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
-          const GSVector2i end_pos = GSVector2i(vp.x, vp.y).add32(draw_offset);
-
-          const GSVector4i rect = GSVector4i::xyxy(start_pos.min_s32(end_pos), start_pos.max_s32(end_pos))
-                                    .add32(GSVector4i::cxpr(0, 0, 1, 1));
-          const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
-
-          if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
-          {
-            DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", cmd->vertices[i - 1].x,
-                      cmd->vertices[i - 1].y, cmd->vertices[i].x, cmd->vertices[i].y);
-          }
-          else
-          {
-            AddDrawLineTicks(clamped_rect, rc.shading_enable);
-
-            GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
-            out_vertex_count += 2;
-
-            GSVector2i::store(&out_vertex[0].x, start_pos);
-            out_vertex[0].color = start_color;
-            GSVector2i::store(&out_vertex[1].x, end_pos);
-            out_vertex[1].color = end_color;
-          }
-
-          start_pos = end_pos;
-          start_color = end_color;
-        }
-
-        if (out_vertex_count > 0)
-        {
-          DebugAssert(out_vertex_count <= cmd->num_vertices);
-          cmd->num_vertices = Truncate16(out_vertex_count);
-          m_backend.PushCommand(cmd);
-        }
-      }
-    }
-    break;
-
-    default:
-      UnreachableCode();
-      break;
-  }
-}
-
-void GPU_SW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
-{
-  m_backend.Sync(false);
-}
-
-void GPU_SW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
-{
-  GPUBackendFillVRAMCommand* cmd = m_backend.NewFillVRAMCommand();
-  FillBackendCommandParameters(cmd);
-  cmd->x = static_cast<u16>(x);
-  cmd->y = static_cast<u16>(y);
-  cmd->width = static_cast<u16>(width);
-  cmd->height = static_cast<u16>(height);
-  cmd->color = color;
-  m_backend.PushCommand(cmd);
-}
-
-void GPU_SW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
-{
-  const u32 num_words = width * height;
-  GPUBackendUpdateVRAMCommand* cmd = m_backend.NewUpdateVRAMCommand(num_words);
-  FillBackendCommandParameters(cmd);
-  cmd->params.set_mask_while_drawing = set_mask;
-  cmd->params.check_mask_before_draw = check_mask;
-  cmd->x = static_cast<u16>(x);
-  cmd->y = static_cast<u16>(y);
-  cmd->width = static_cast<u16>(width);
-  cmd->height = static_cast<u16>(height);
-  std::memcpy(cmd->data, data, sizeof(u16) * num_words);
-  m_backend.PushCommand(cmd);
-}
-
-void GPU_SW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
-{
-  GPUBackendCopyVRAMCommand* cmd = m_backend.NewCopyVRAMCommand();
-  FillBackendCommandParameters(cmd);
-  cmd->src_x = static_cast<u16>(src_x);
-  cmd->src_y = static_cast<u16>(src_y);
-  cmd->dst_x = static_cast<u16>(dst_x);
-  cmd->dst_y = static_cast<u16>(dst_y);
-  cmd->width = static_cast<u16>(width);
-  cmd->height = static_cast<u16>(height);
-  m_backend.PushCommand(cmd);
-}
-
-void GPU_SW::FlushRender()
-{
-}
-
-void GPU_SW::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
-{
-  GPUBackendUpdateCLUTCommand* cmd = m_backend.NewUpdateCLUTCommand();
-  FillBackendCommandParameters(cmd);
-  cmd->reg.bits = reg.bits;
-  cmd->clut_is_8bit = clut_is_8bit;
-  m_backend.PushCommand(cmd);
-}
-
-std::unique_ptr<GPU> GPU::CreateSoftwareRenderer(Error* error)
-{
-  std::unique_ptr<GPU_SW> gpu(std::make_unique<GPU_SW>());
-  if (!gpu->Initialize(error))
-    gpu.reset();
-
-  return gpu;
+  return std::make_unique<GPU_SW>();
 }
diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h
index 2251843aa..9be0930e5 100644
--- a/src/core/gpu_sw.h
+++ b/src/core/gpu_sw.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "gpu.h"
-#include "gpu_sw_backend.h"
+#include "gpu_backend.h"
 
 #include "util/gpu_device.h"
 
@@ -12,36 +12,49 @@
 
 #include <memory>
 
-namespace Threading {
-class Thread;
-}
-
-class GPUTexture;
-
-class GPU_SW final : public GPU
+// TODO: Move to cpp
+// TODO: Rename to GPUSWBackend, preserved to avoid conflicts.
+class GPU_SW final : public GPUBackend
 {
 public:
   GPU_SW();
   ~GPU_SW() override;
 
-  ALWAYS_INLINE const GPU_SW_Backend& GetBackend() const { return m_backend; }
-
-  const Threading::Thread* GetSWThread() const override;
   bool IsHardwareRenderer() const override;
 
-  bool Initialize(Error* error) override;
-  bool DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display) override;
-  void Reset(bool clear_vram) override;
-  void UpdateSettings(const Settings& old_settings) override;
+  bool Initialize(bool upload_vram, Error* error) override;
+
+  void RestoreDeviceContext() override;
+
+  u32 GetResolutionScale() const override;
 
 protected:
   void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
-  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) override;
-  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) override;
-  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
-  void FlushRender() override;
-  void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
+  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override;
+  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override;
+  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                GPUBackendCommandParameters params) override;
 
+  void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
+  void DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd) override;
+  void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
+  void DrawSprite(const GPUBackendDrawRectangleCommand* cmd) override;
+  void DrawingAreaChanged() override;
+  void ClearCache() override;
+  void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
+  void OnBufferSwapped() override;
+
+  void UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd) override;
+
+  void ClearVRAM() override;
+
+  void FlushRender() override;
+
+  void UpdateResolutionScale() override;
+
+  void LoadState(const GPUBackendLoadStateCommand* cmd) override;
+
+private:
   template<GPUTexture::Format display_format>
   bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip);
 
@@ -50,19 +63,11 @@ protected:
 
   bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit);
 
-  void UpdateDisplay() override;
-
-  void DispatchRenderCommand() override;
-
-  void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
-  void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
-
+  void SetDisplayTextureFormat();
   GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format);
 
   FixedHeapArray<u8, GPU_MAX_DISPLAY_WIDTH * GPU_MAX_DISPLAY_HEIGHT * sizeof(u32)> m_upload_buffer;
   GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565;
   GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8;
   std::unique_ptr<GPUTexture> m_upload_texture;
-
-  GPU_SW_Backend m_backend;
 };
diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp
deleted file mode 100644
index 0ab2e68e8..000000000
--- a/src/core/gpu_sw_backend.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
-// SPDX-License-Identifier: CC-BY-NC-ND-4.0
-
-#include "gpu_sw_backend.h"
-#include "gpu.h"
-#include "gpu_sw_rasterizer.h"
-#include "system.h"
-
-#include "util/gpu_device.h"
-
-#include <algorithm>
-
-GPU_SW_Backend::GPU_SW_Backend() = default;
-
-GPU_SW_Backend::~GPU_SW_Backend() = default;
-
-bool GPU_SW_Backend::Initialize(bool use_thread)
-{
-  return GPUBackend::Initialize(use_thread);
-}
-
-void GPU_SW_Backend::Reset()
-{
-  GPUBackend::Reset();
-}
-
-void GPU_SW_Backend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
-{
-  const GPURenderCommand rc{cmd->rc.bits};
-
-  const GPU_SW_Rasterizer::DrawTriangleFunction DrawFunction = GPU_SW_Rasterizer::GetDrawTriangleFunction(
-    rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
-
-  DrawFunction(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
-  if (rc.quad_polygon)
-    DrawFunction(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
-}
-
-void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
-{
-  const GPURenderCommand rc{cmd->rc.bits};
-
-  const GPU_SW_Rasterizer::DrawRectangleFunction DrawFunction =
-    GPU_SW_Rasterizer::GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
-
-  DrawFunction(cmd);
-}
-
-void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)
-{
-  const GPU_SW_Rasterizer::DrawLineFunction DrawFunction =
-    GPU_SW_Rasterizer::GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable);
-
-  for (u16 i = 1; i < cmd->num_vertices; i++)
-    DrawFunction(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]);
-}
-
-void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
-{
-  GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, params.interlaced_rendering, params.active_line_lsb);
-}
-
-void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
-                                GPUBackendCommandParameters params)
-{
-  GPU_SW_Rasterizer::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw);
-}
-
-void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
-                              GPUBackendCommandParameters params)
-{
-  GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
-                              params.check_mask_before_draw);
-}
-
-void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
-{
-  GPU::ReadCLUT(g_gpu_clut, reg, clut_is_8bit);
-}
-
-void GPU_SW_Backend::DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area)
-{
-  GPU_SW_Rasterizer::g_drawing_area = new_drawing_area;
-}
-
-void GPU_SW_Backend::FlushRender()
-{
-}
diff --git a/src/core/gpu_sw_backend.h b/src/core/gpu_sw_backend.h
deleted file mode 100644
index 7f2c492ca..000000000
--- a/src/core/gpu_sw_backend.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
-// SPDX-License-Identifier: CC-BY-NC-ND-4.0
-
-#pragma once
-
-#include "gpu.h"
-#include "gpu_backend.h"
-
-#include <array>
-
-class GPU_SW_Backend final : public GPUBackend
-{
-public:
-  GPU_SW_Backend();
-  ~GPU_SW_Backend() override;
-
-  bool Initialize(bool use_thread) override;
-  void Reset() override;
-
-protected:
-  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override;
-  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override;
-  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
-                GPUBackendCommandParameters params) override;
-
-  void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
-  void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
-  void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) override;
-  void DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) override;
-  void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
-  void FlushRender() override;
-};
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index f2648dcd0..bcd1e4267 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -45,6 +45,31 @@ CopyVRAMFunction CopyVRAM = nullptr;
 GPUDrawingArea g_drawing_area = {};
 } // namespace GPU_SW_Rasterizer
 
+void GPU_SW_Rasterizer::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
+{
+  // Refresh g_gpu_clut from the VRAM row and column selected by the palette register.
+  const u32 clut_x = reg.GetXBase();
+  const u16* const vram_row = &g_vram[reg.GetYBase() * VRAM_WIDTH];
+  if (!clut_is_8bit)
+  {
+    // 16-entry palette; wraparound can't happen in 4-bit mode.
+    std::memcpy(g_gpu_clut, &vram_row[clut_x], sizeof(u16) * 16);
+    return;
+  }
+
+  if ((clut_x + 256) <= VRAM_WIDTH) [[likely]]
+  {
+    // All 256 entries are contiguous within the row.
+    std::memcpy(g_gpu_clut, &vram_row[clut_x], sizeof(u16) * 256);
+    return;
+  }
+
+  // The palette runs past the end of the row: copy the tail, then continue from the row's start.
+  const u32 tail_count = VRAM_WIDTH - clut_x;
+  std::memcpy(g_gpu_clut, &vram_row[clut_x], sizeof(u16) * tail_count);
+  std::memcpy(g_gpu_clut + tail_count, vram_row, sizeof(u16) * (256 - tail_count));
+}
+
 // Default scalar implementation definitions.
 namespace GPU_SW_Rasterizer::Scalar {
 namespace {
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
index cdc6e9d5e..69e89e65f 100644
--- a/src/core/gpu_sw_rasterizer.h
+++ b/src/core/gpu_sw_rasterizer.h
@@ -18,12 +18,15 @@ static constexpr u32 DITHER_LUT_SIZE = 512;
 using DitherLUT = std::array<std::array<std::array<u8, DITHER_LUT_SIZE>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
 extern const DitherLUT g_dither_lut;
 
+// TODO: Pack in struct
 extern GPUDrawingArea g_drawing_area;
 
+extern void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit);
+
 using DrawRectangleFunction = void (*)(const GPUBackendDrawRectangleCommand* cmd);
 typedef const DrawRectangleFunction DrawRectangleFunctionTable[2][2][2];
 
-using DrawTriangleFunction = void (*)(const GPUBackendDrawPolygonCommand* cmd,
+using DrawTriangleFunction = void (*)(const GPUBackendDrawCommand* cmd,
                                       const GPUBackendDrawPolygonCommand::Vertex* v0,
                                       const GPUBackendDrawPolygonCommand::Vertex* v1,
                                       const GPUBackendDrawPolygonCommand::Vertex* v2);
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 0a1ed95e3..64a3f18a8 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -966,7 +966,7 @@ struct TrianglePart
 #ifndef USE_VECTOR
 
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
                      const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
 {
   s32 width = x_bound - x_start;
@@ -1006,7 +1006,7 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start
 }
 
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
+ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawCommand* cmd, const TrianglePart& tp,
                                                    const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
                                                    const RGBSteps& rgbstep)
 {
@@ -1143,7 +1143,7 @@ struct TriangleVectors : PixelVectors<texture_enable>
 } // namespace
 
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
                      const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep,
                      const TriangleVectors<shading_enable, texture_enable>& tv)
 {
@@ -1248,7 +1248,7 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start
 }
 
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
+ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawCommand* cmd, const TrianglePart& tp,
                                                    const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
                                                    const RGBSteps& rgbstep)
 {
@@ -1347,7 +1347,7 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo
 #endif // USE_VECTOR
 
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
+static void DrawTriangle(const GPUBackendDrawCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
                          const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
 {
 #ifdef CHECK_VECTOR
diff --git a/src/core/gpu_thread.cpp b/src/core/gpu_thread.cpp
new file mode 100644
index 000000000..fba927789
--- /dev/null
+++ b/src/core/gpu_thread.cpp
@@ -0,0 +1,1173 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#include "gpu_thread.h"
+#include "fullscreen_ui.h"
+#include "gpu_backend.h"
+#include "gpu_types.h"
+#include "host.h"
+#include "imgui_overlays.h"
+#include "performance_counters.h"
+#include "settings.h"
+#include "shader_cache_version.h"
+#include "system.h"
+#include "system_private.h"
+
+#include "util/gpu_device.h"
+#include "util/imgui_manager.h"
+#include "util/input_manager.h"
+#include "util/postprocessing.h"
+#include "util/state_wrapper.h"
+
+#include "common/align.h"
+#include "common/error.h"
+#include "common/log.h"
+#include "common/threading.h"
+#include "common/timer.h"
+
+#include "IconsEmoji.h"
+#include "IconsFontAwesome5.h"
+#include "fmt/format.h"
+#include "imgui.h"
+
+#include <optional>
+
+LOG_CHANNEL(GPUThread);
+
+// TODO: Runahead/rewind textures.
+// TODO: SW renderer for readback flag in class.
+// TODO: Smaller settings struct.
+// TODO: Remove g_gpu pointer.
+// TODO: Auto size video capture.
+// TODO: Smooth loady bar for achievements.
+// TODO: Tidy up gpu_backend headers.
+// TODO: Test that loading new states in old version works.
+// TODO: Disable thread when debug windows are enabled.
+// TODO: Fullscreen UI without thread active, locks up.
+
+namespace GPUThread {
+enum : u32
+{
+  COMMAND_QUEUE_SIZE = 16 * 1024 * 1024,
+  THRESHOLD_TO_WAKE_GPU = 65536,
+  MAX_SKIPPED_PRESENT_COUNT = 50
+};
+
+static constexpr s32 THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING = 0x40000000; // CPU thread needs waking
+static constexpr s32 THREAD_WAKE_COUNT_SLEEPING = -1;
+
+// Use a slightly longer spin time on ARM64 due to power management.
+#ifndef _M_ARM64
+static constexpr u32 THREAD_SPIN_TIME_US = 50;
+#else
+static constexpr u32 THREAD_SPIN_TIME_US = 200;
+#endif
+
+static bool Reconfigure(std::optional<GPURenderer> renderer, bool upload_vram, std::optional<bool> fullscreen,
+                        std::optional<bool> start_fullscreen_ui, bool recreate_device, Error* error);
+
+static u32 GetPendingCommandSize();
+static void ResetCommandFIFO();
+static void WakeGPUThread();
+static void SyncGPUThread(bool spin);
+static bool SleepGPUThread(bool allow_sleep);
+
+static bool CreateDeviceOnThread(RenderAPI api, bool fullscreen, Error* error);
+static void DestroyDeviceOnThread();
+static void ResizeDisplayWindowOnThread(u32 width, u32 height, float scale);
+static void UpdateDisplayWindowOnThread(bool fullscreen);
+static void DisplayWindowResizedOnThread();
+static void HandleGPUDeviceLost();
+static void HandleExclusiveFullscreenLost();
+
+static void ReconfigureOnThread(GPUThreadReconfigureCommand* cmd);
+static bool CreateGPUBackendOnThread(GPURenderer renderer, bool upload_vram, Error* error);
+static void DestroyGPUBackendOnThread();
+
+static void UpdateSettingsOnThread(const Settings& old_settings);
+
+static void SleepUntilPresentTime(Common::Timer::Value present_time);
+
+namespace {
+
+struct ALIGN_TO_CACHE_LINE State
+{
+  // Owned by CPU thread.
+  ALIGN_TO_CACHE_LINE Common::Timer::Value thread_spin_time = 0; // spin budget for SyncGPUThread(), in timer units
+  Threading::ThreadHandle gpu_thread;
+  Common::unique_aligned_ptr<u8[]> command_fifo_data; // COMMAND_QUEUE_SIZE bytes, allocated in ProcessStartup()
+  WindowInfo render_window_info;
+  std::optional<GPURenderer> requested_renderer; // TODO: Non-thread-safe access of this.
+  bool use_gpu_thread = false; // when false, commands are handled inline by the pushing thread
+
+  // Hot variables between both threads.
+  ALIGN_TO_CACHE_LINE std::atomic<u32> command_fifo_write_ptr{0};
+  std::atomic<s32> thread_wake_count{0}; // <0 = sleeping, >= 0 = has work
+  Threading::KernelSemaphore thread_wake_semaphore;    // posted by WakeGPUThread() if the GPU thread was sleeping
+  Threading::KernelSemaphore thread_is_done_semaphore; // posted by SleepGPUThread() if the CPU thread is waiting
+
+  // Owned by GPU thread.
+  ALIGN_TO_CACHE_LINE std::unique_ptr<GPUBackend> gpu_backend;
+  std::atomic<u32> command_fifo_read_ptr{0};
+  u32 skipped_present_count = 0;
+  bool run_idle_flag = false; // when set, the GPU thread presents instead of sleeping on an empty FIFO
+  GPUVSyncMode requested_vsync = GPUVSyncMode::Disabled;
+  bool requested_allow_present_throttle = false;
+  bool requested_fullscreen_ui = false;
+};
+
+} // namespace
+
+static State s_state;
+
+} // namespace GPUThread
+
+const Threading::ThreadHandle& GPUThread::Internal::GetThreadHandle()
+{
+  return s_state.gpu_thread; // assigned at the top of GPUThreadEntryPoint()
+}
+
+void GPUThread::ResetCommandFIFO()
+{
+  Assert(!s_state.run_idle_flag && s_state.command_fifo_read_ptr.load(std::memory_order_acquire) ==
+                                     s_state.command_fifo_write_ptr.load(std::memory_order_relaxed));
+  s_state.command_fifo_write_ptr.store(0, std::memory_order_release); // FIFO is empty (asserted above), so both
+  s_state.command_fifo_read_ptr.store(0, std::memory_order_release);  // pointers can be rewound to the buffer start
+}
+
+void GPUThread::Internal::SetThreadEnabled(bool enabled)
+{
+  if (s_state.use_gpu_thread == enabled)
+    return; // already in the requested mode
+  // Switching modes: quiesce the GPU thread, tear down, flip the flag, then rebuild under the new mode.
+  if (s_state.use_gpu_thread)
+  {
+    SyncGPUThread(false); // block (no spinning) until the GPU thread is done/sleeping
+    std::atomic_thread_fence(std::memory_order_acquire);
+  }
+
+  // Was anything active?
+  if (!g_gpu_device)
+  {
+    // Thread should be idle. Just reset the FIFO.
+    s_state.use_gpu_thread = enabled;
+    ResetCommandFIFO();
+    return;
+  }
+
+  const bool fullscreen = Host::IsFullscreen();
+  const bool requested_fullscreen_ui = s_state.requested_fullscreen_ui; // snapshot before the teardown below
+  const std::optional<GPURenderer> requested_renderer = s_state.requested_renderer;
+
+  // Force VRAM download, we're recreating.
+  if (requested_renderer.has_value())
+  {
+    GPUBackendReadVRAMCommand* cmd = GPUBackend::NewReadVRAMCommand();
+    cmd->x = 0;
+    cmd->y = 0;
+    cmd->width = VRAM_WIDTH;
+    cmd->height = VRAM_HEIGHT;
+    PushCommand(cmd); // not synced here; the Reconfigure() below syncs when the thread is in use
+  }
+
+  // Shutdown reconfigure (no renderer requested), still issued under the old threading mode.
+  Reconfigure(std::nullopt, false, std::nullopt, std::nullopt, false, nullptr);
+
+  // Thread should be idle at this point. Reset the FIFO.
+  ResetCommandFIFO();
+
+  // Update state and reconfigure again.
+  s_state.use_gpu_thread = enabled;
+
+  Error error;
+  if (!Reconfigure(requested_renderer, requested_renderer.has_value(), fullscreen, requested_fullscreen_ui, true,
+                   &error))
+  {
+    ERROR_LOG("Reconfigure failed: {}", error.GetDescription());
+    Panic("Failed to reconfigure when changing thread state.");
+  }
+}
+
+void GPUThread::Internal::ProcessStartup()
+{
+  s_state.thread_spin_time = Common::Timer::ConvertNanosecondsToValue(THREAD_SPIN_TIME_US * 1000.0); // us -> ns
+  s_state.command_fifo_data = Common::make_unique_aligned_for_overwrite<u8[]>(HOST_CACHE_LINE_SIZE, COMMAND_QUEUE_SIZE);
+  s_state.use_gpu_thread = g_settings.gpu_use_thread; // initial threading mode comes from settings
+}
+
+void GPUThread::Internal::RequestShutdown()
+{
+  INFO_LOG("Shutting down GPU thread...");
+  if (GetPendingCommandSize() > 0)
+  {
+    WakeGPUThread();      // flush anything still queued...
+    SyncGPUThread(false); // ...and block until the GPU thread has finished it
+  }
+
+  // Thread must be enabled to shut it down.
+  SetThreadEnabled(true);
+  PushCommandAndWakeThread(AllocateCommand(GPUBackendCommandType::Shutdown, sizeof(GPUThreadCommand)));
+}
+
+bool GPUThread::Reconfigure(std::optional<GPURenderer> renderer, bool upload_vram, std::optional<bool> fullscreen,
+                            std::optional<bool> start_fullscreen_ui, bool recreate_device, Error* error)
+{
+  INFO_LOG("Reconfiguring GPU thread.");
+  // Fills a Reconfigure command, runs it (inline or on the GPU thread, synchronously) and returns its result.
+  GPUThreadReconfigureCommand* cmd = static_cast<GPUThreadReconfigureCommand*>(
+    AllocateCommand(GPUBackendCommandType::Reconfigure, sizeof(GPUThreadReconfigureCommand)));
+  cmd->renderer = renderer;
+  cmd->fullscreen = fullscreen;
+  cmd->start_fullscreen_ui = start_fullscreen_ui;
+  cmd->vsync_mode = System::GetEffectiveVSyncMode();
+  cmd->allow_present_throttle = System::ShouldAllowPresentThrottle();
+  cmd->force_recreate_device = recreate_device;
+  cmd->upload_vram = upload_vram;
+  cmd->error_ptr = error;
+  cmd->result = false; // FIFO memory is uninitialized and the shutdown path of ReconfigureOnThread() never sets this
+  if (!s_state.use_gpu_thread) [[unlikely]]
+    ReconfigureOnThread(cmd);
+  else
+    PushCommandAndSync(cmd, false); // blocks until the GPU thread is done, so cmd and *error are safe to read after
+
+  return cmd->result;
+}
+
+bool GPUThread::StartFullscreenUI(bool fullscreen, Error* error)
+{
+  // With a valid system no reconfigure is needed: just set the request flag on the GPU thread.
+  if (System::IsValid())
+  {
+    RunOnThread([]() { s_state.requested_fullscreen_ui = true; });
+    return true;
+  }
+  // No system: reconfigure with no renderer and start_fullscreen_ui = true.
+  return Reconfigure(std::nullopt, false, fullscreen, true, false, error);
+}
+
+void GPUThread::StopFullscreenUI()
+{
+  // With a valid system no reconfigure is needed: just clear the request flag on the GPU thread.
+  if (System::IsValid())
+  {
+    RunOnThread([]() { s_state.requested_fullscreen_ui = false; });
+    return;
+  }
+  // No system: reconfigure with no renderer and start_fullscreen_ui = false, which shuts everything down.
+  Reconfigure(std::nullopt, false, std::nullopt, false, false, nullptr);
+}
+
+std::optional<GPURenderer> GPUThread::GetRequestedRenderer()
+{
+  return s_state.requested_renderer; // NOTE(review): unsynchronized read; CreateGPUBackendOnThread() also writes this
+}
+
+bool GPUThread::CreateGPUBackend(GPURenderer renderer, bool upload_vram, bool fullscreen, bool force_recreate_device,
+                                 Error* error)
+{
+  s_state.requested_renderer = renderer; // remembered for later recreation (SetThreadEnabled, device loss)
+  return Reconfigure(renderer, upload_vram, fullscreen ? std::optional<bool>(true) : std::nullopt, std::nullopt,
+                     force_recreate_device, error); // fullscreen == false is passed as "unspecified" (nullopt)
+}
+
+void GPUThread::DestroyGPUBackend()
+{
+  Reconfigure(std::nullopt, false, std::nullopt, std::nullopt, false, nullptr); // no renderer => backend is destroyed
+  s_state.requested_renderer.reset(); // cleared only after the reconfigure has completed
+}
+
+GPUThreadCommand* GPUThread::AllocateCommand(GPUBackendCommandType command, u32 size)
+{
+  // Ensure size is a multiple of 4 so we don't end up with an unaligned command.
+  size = Common::AlignUpPow2(size, 4);
+  // Reserves space at the write pointer without advancing it; the Push*() functions advance it by cmd->size.
+  for (;;)
+  {
+    u32 read_ptr = s_state.command_fifo_read_ptr.load(std::memory_order_acquire);
+    u32 write_ptr = s_state.command_fifo_write_ptr.load(std::memory_order_relaxed);
+    if (read_ptr > write_ptr) // writer has wrapped and is behind the reader
+    {
+      u32 available_size = read_ptr - write_ptr;
+      while (available_size < (size + sizeof(GPUBackendCommandType)))
+      {
+        WakeGPUThread(); // busy-wait: keep the GPU thread draining until there is room
+        read_ptr = s_state.command_fifo_read_ptr.load(std::memory_order_acquire);
+        available_size = (read_ptr > write_ptr) ? (read_ptr - write_ptr) : (COMMAND_QUEUE_SIZE - write_ptr);
+      }
+    }
+    else
+    {
+      const u32 available_size = COMMAND_QUEUE_SIZE - write_ptr; // space left before the end of the buffer
+      if ((size + sizeof(GPUBackendCommand)) > available_size)
+      {
+        // Not enough room at the end: fill the remainder with a Wraparound command and retry from offset 0.
+        GPUBackendCommand* dummy_cmd = reinterpret_cast<GPUBackendCommand*>(&s_state.command_fifo_data[write_ptr]);
+        dummy_cmd->type = GPUBackendCommandType::Wraparound;
+        dummy_cmd->size = available_size;
+        dummy_cmd->params.bits = 0;
+        s_state.command_fifo_write_ptr.store(0, std::memory_order_release);
+        continue;
+      }
+    }
+    // NOTE(review): the two space checks above reserve different slack (CommandType vs. Command) - confirm intended.
+    GPUThreadCommand* cmd = reinterpret_cast<GPUThreadCommand*>(&s_state.command_fifo_data[write_ptr]);
+    cmd->type = command;
+    cmd->size = size;
+    return cmd;
+  }
+}
+
+u32 GPUThread::GetPendingCommandSize()
+{
+  const u32 rd = s_state.command_fifo_read_ptr.load();
+  const u32 wr = s_state.command_fifo_write_ptr.load();
+  return (wr < rd) ? (COMMAND_QUEUE_SIZE - rd + wr) : (wr - rd); // writer wrapped : writer ahead of reader
+}
+
+void GPUThread::PushCommand(GPUThreadCommand* cmd)
+{
+  if (!s_state.use_gpu_thread) [[unlikely]]
+  {
+    DebugAssert(s_state.gpu_backend);
+    s_state.gpu_backend->HandleCommand(cmd); // threading disabled: execute inline on the calling thread
+    return;
+  }
+  // Publish the command by advancing the write pointer past it; the GPU thread is only woken past a threshold.
+  const u32 new_write_ptr = s_state.command_fifo_write_ptr.fetch_add(cmd->size, std::memory_order_release) + cmd->size;
+  DebugAssert(new_write_ptr <= COMMAND_QUEUE_SIZE);
+  UNREFERENCED_VARIABLE(new_write_ptr);
+  if (GetPendingCommandSize() >= THRESHOLD_TO_WAKE_GPU) // TODO:FIXME: maybe purge this?
+    WakeGPUThread();
+}
+
+void GPUThread::PushCommandAndWakeThread(GPUThreadCommand* cmd)
+{
+  if (!s_state.use_gpu_thread) [[unlikely]]
+  {
+    DebugAssert(s_state.gpu_backend);
+    s_state.gpu_backend->HandleCommand(cmd); // threading disabled: execute inline on the calling thread
+    return;
+  }
+  // Publish the command by advancing the write pointer past it, then always wake the GPU thread.
+  const u32 new_write_ptr = s_state.command_fifo_write_ptr.fetch_add(cmd->size, std::memory_order_release) + cmd->size;
+  DebugAssert(new_write_ptr <= COMMAND_QUEUE_SIZE);
+  UNREFERENCED_VARIABLE(new_write_ptr);
+  WakeGPUThread();
+}
+
+void GPUThread::PushCommandAndSync(GPUThreadCommand* cmd, bool spin)
+{
+  if (!s_state.use_gpu_thread) [[unlikely]]
+  {
+    DebugAssert(s_state.gpu_backend);
+    s_state.gpu_backend->HandleCommand(cmd); // threading disabled: execute inline on the calling thread
+    return;
+  }
+  // Publish the command, wake the GPU thread, then wait (optionally spinning first) until it is done/sleeping.
+  const u32 new_write_ptr = s_state.command_fifo_write_ptr.fetch_add(cmd->size, std::memory_order_release) + cmd->size;
+  DebugAssert(new_write_ptr <= COMMAND_QUEUE_SIZE);
+  UNREFERENCED_VARIABLE(new_write_ptr);
+  WakeGPUThread();
+  SyncGPUThread(spin);
+}
+
+void GPUThread::PushCommandAndFrame(GPUBackendUpdateDisplayCommand* cmd)
+{
+  if (!s_state.use_gpu_thread) [[unlikely]]
+  {
+    DebugAssert(s_state.gpu_backend);
+    s_state.gpu_backend->HandleCommand(cmd); // threading disabled: execute inline on the calling thread
+    return;
+  }
+  // Evaluated before pushing. NOTE(review): presumably BeginQueueFrame() reports a full frame queue - confirm.
+  const bool drain_one = cmd->present_frame && s_state.gpu_backend->BeginQueueFrame();
+
+  PushCommandAndWakeThread(cmd);
+
+  if (drain_one)
+    s_state.gpu_backend->WaitForOneQueuedFrame();
+}
+
+ALWAYS_INLINE s32 GetThreadWakeCount(s32 state)
+{
+  return (state & ~GPUThread::THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING); // strip the flag; sleeping (-1) stays negative
+}
+
+void GPUThread::WakeGPUThread()
+{
+  // If sleeping, state will be <0, otherwise this will increment the pending work count.
+  // We add 2 so that a sleeping thread (-1) ends up with a positive work count, otherwise it would go back to sleep.
+  if (s_state.thread_wake_count.fetch_add(2, std::memory_order_release) < 0)
+    s_state.thread_wake_semaphore.Post(); // previous value was negative: release the Wait() in SleepGPUThread()
+}
+
+void GPUThread::SyncGPUThread(bool spin)
+{
+  DebugAssert(s_state.use_gpu_thread);
+  if (spin)
+  {
+    // Fast path: nothing to wait for if the GPU thread is already done/sleeping.
+    if (GetThreadWakeCount(s_state.thread_wake_count.load(std::memory_order_acquire)) < 0)
+      return;
+
+    Common::Timer::Value start_time = Common::Timer::GetCurrentValue();
+    Common::Timer::Value current_time;
+    do
+    {
+      // Check if the GPU thread is done/sleeping.
+      if (GetThreadWakeCount(s_state.thread_wake_count.load(std::memory_order_acquire)) < 0)
+        return;
+
+      // Hopefully ought to be enough.
+      MultiPause();
+
+      current_time = Common::Timer::GetCurrentValue();
+    } while ((current_time - start_time) < s_state.thread_spin_time); // give up spinning after thread_spin_time
+  }
+
+  // thread_wake_count |= THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING, unless the GPU thread is already sleeping.
+  s32 value;
+  do
+  {
+    // Check if the GPU thread is done/sleeping.
+    value = s_state.thread_wake_count.load(std::memory_order_acquire);
+    if (GetThreadWakeCount(value) < 0)
+      return;
+  } while (!s_state.thread_wake_count.compare_exchange_weak(value, value | THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING,
+                                                            std::memory_order_acq_rel, std::memory_order_relaxed));
+  s_state.thread_is_done_semaphore.Wait(); // posted by SleepGPUThread() once it sees the waiting flag with no work
+}
+
+bool GPUThread::SleepGPUThread(bool allow_sleep)
+{
+  DebugAssert(!allow_sleep || s_state.thread_wake_count.load(std::memory_order_relaxed) >= 0);
+  for (;;) // returns true once queued work has been acknowledged; false if idle and sleeping is not allowed
+  {
+    // Acknowledge any work that has been queued, but preserve the waiting flag if there is any, since we're not done
+    // yet. With no work queued, the state becomes SLEEPING (or 0 when sleeping is not allowed).
+    s32 old_state, new_state;
+    do
+    {
+      old_state = s_state.thread_wake_count.load(std::memory_order_relaxed);
+      new_state = (GetThreadWakeCount(old_state) > 0) ? (old_state & THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING) :
+                                                        (allow_sleep ? THREAD_WAKE_COUNT_SLEEPING : 0);
+    } while (!s_state.thread_wake_count.compare_exchange_weak(old_state, new_state, std::memory_order_acq_rel,
+                                                              std::memory_order_relaxed));
+
+    // Are we not done yet?
+    if (GetThreadWakeCount(old_state) > 0)
+      return true;
+
+    // We're done, so wake the CPU thread if it's waiting (the flag was cleared by the exchange above).
+    if (old_state & THREAD_WAKE_COUNT_CPU_THREAD_IS_WAITING)
+      s_state.thread_is_done_semaphore.Post();
+
+    // Sleep until more work is queued, then loop to acknowledge it.
+    if (allow_sleep)
+      s_state.thread_wake_semaphore.Wait();
+    else
+      return false;
+  }
+}
+
+void GPUThread::Internal::GPUThreadEntryPoint()
+{
+  s_state.gpu_thread = Threading::ThreadHandle::GetForCallingThread();
+  Threading::SetNameOfCurrentThread("GPU Thread");
+
+  // Take a local copy of the FIFO, that way it's not ping-ponging between the threads.
+  u8* const command_fifo_data = s_state.command_fifo_data.get();
+
+  for (;;) // consumer loop; only exits via the Shutdown command
+  {
+    u32 write_ptr = s_state.command_fifo_write_ptr.load(std::memory_order_acquire);
+    u32 read_ptr = s_state.command_fifo_read_ptr.load(std::memory_order_relaxed);
+    if (read_ptr == write_ptr)
+    {
+      if (SleepGPUThread(!s_state.run_idle_flag))
+      {
+        // sleep => wake, need to reload pointers
+        continue;
+      }
+      else
+      {
+        // Run-idle mode with nothing queued: present a frame instead of sleeping.
+        Internal::PresentFrame(false, 0);
+        if (!g_gpu_device->GetMainSwapChain()->IsVSyncModeBlocking())
+          g_gpu_device->GetMainSwapChain()->ThrottlePresentation();
+
+        continue;
+      }
+    }
+
+    write_ptr = (write_ptr < read_ptr) ? COMMAND_QUEUE_SIZE : write_ptr; // writer wrapped: drain to buffer end first
+    while (read_ptr < write_ptr)
+    {
+      GPUThreadCommand* cmd = reinterpret_cast<GPUThreadCommand*>(&command_fifo_data[read_ptr]);
+      DebugAssert((read_ptr + cmd->size) <= COMMAND_QUEUE_SIZE);
+      read_ptr += cmd->size;
+
+      if (cmd->type > GPUBackendCommandType::Shutdown) [[likely]] // not a thread-level command: forward to backend
+      {
+        DebugAssert(s_state.gpu_backend);
+        s_state.gpu_backend->HandleCommand(cmd);
+        continue;
+      }
+
+      switch (cmd->type)
+      {
+        case GPUBackendCommandType::Wraparound:
+        {
+          DebugAssert(read_ptr == COMMAND_QUEUE_SIZE);
+          write_ptr = s_state.command_fifo_write_ptr.load(std::memory_order_acquire);
+          read_ptr = 0;
+
+          // let the CPU thread know as early as possible that we're here
+          s_state.command_fifo_read_ptr.store(read_ptr, std::memory_order_release);
+        }
+        break;
+
+        case GPUBackendCommandType::AsyncCall:
+        {
+          GPUThreadAsyncCallCommand* acmd = static_cast<GPUThreadAsyncCallCommand*>(cmd);
+          acmd->func();
+          acmd->~GPUThreadAsyncCallCommand(); // explicit destructor call: the command lives in raw FIFO storage
+        }
+        break;
+
+        case GPUBackendCommandType::Reconfigure:
+        {
+          ReconfigureOnThread(static_cast<GPUThreadReconfigureCommand*>(cmd));
+        }
+        break;
+
+        case GPUBackendCommandType::Shutdown:
+        {
+          // Should have consumed everything, and be shutdown.
+          DebugAssert(read_ptr == write_ptr);
+          s_state.command_fifo_read_ptr.store(read_ptr, std::memory_order_release);
+          return;
+        }
+        break;
+
+          DefaultCaseIsUnreachable();
+      }
+    }
+
+    s_state.command_fifo_read_ptr.store(read_ptr, std::memory_order_release); // publish progress for this batch
+  }
+}
+
+bool GPUThread::CreateDeviceOnThread(RenderAPI api, bool fullscreen, Error* error)
+{
+  DebugAssert(!g_gpu_device); // any previous device must already have been destroyed
+
+  INFO_LOG("Trying to create a {} GPU device...", GPUDevice::RenderAPIToString(api));
+  g_gpu_device = GPUDevice::CreateDeviceForAPI(api);
+
+  std::optional<GPUDevice::ExclusiveFullscreenMode> fullscreen_mode;
+  if (fullscreen && g_gpu_device && g_gpu_device->SupportsExclusiveFullscreen())
+  {
+    fullscreen_mode =
+      GPUDevice::ExclusiveFullscreenMode::Parse(Host::GetTinyStringSettingValue("GPU", "FullscreenMode", ""));
+  }
+  std::optional<bool> exclusive_fullscreen_control; // left empty when the setting is Automatic
+  if (g_settings.display_exclusive_fullscreen_control != DisplayExclusiveFullscreenControl::Automatic)
+  {
+    exclusive_fullscreen_control =
+      (g_settings.display_exclusive_fullscreen_control == DisplayExclusiveFullscreenControl::Allowed);
+  }
+
+  u32 disabled_features = 0;
+  if (g_settings.gpu_disable_dual_source_blend)
+    disabled_features |= GPUDevice::FEATURE_MASK_DUAL_SOURCE_BLEND;
+  if (g_settings.gpu_disable_framebuffer_fetch)
+    disabled_features |= GPUDevice::FEATURE_MASK_FRAMEBUFFER_FETCH;
+  if (g_settings.gpu_disable_texture_buffers)
+    disabled_features |= GPUDevice::FEATURE_MASK_TEXTURE_BUFFERS;
+  if (g_settings.gpu_disable_memory_import)
+    disabled_features |= GPUDevice::FEATURE_MASK_MEMORY_IMPORT;
+  if (g_settings.gpu_disable_raster_order_views)
+    disabled_features |= GPUDevice::FEATURE_MASK_RASTER_ORDER_VIEWS;
+
+    // On Android, only dump shaders in debug builds; users will complain about storage...
+#if !defined(__ANDROID__) || defined(_DEBUG)
+  const std::string_view shader_dump_directory(EmuFolders::DataRoot);
+#else
+  const std::string_view shader_dump_directory;
+#endif
+
+  Error create_error;
+  std::optional<WindowInfo> wi;
+  if (!g_gpu_device ||
+      !(wi = Host::AcquireRenderWindow(api, fullscreen, fullscreen_mode.has_value(), &create_error)).has_value() ||
+      !g_gpu_device->Create(
+        g_settings.gpu_adapter, static_cast<GPUDevice::FeatureMask>(disabled_features), shader_dump_directory,
+        g_settings.gpu_disable_shader_cache ? std::string_view() : std::string_view(EmuFolders::Cache),
+        SHADER_CACHE_VERSION, g_settings.gpu_use_debug_device, wi.value(), s_state.requested_vsync,
+        s_state.requested_allow_present_throttle, fullscreen_mode.has_value() ? &fullscreen_mode.value() : nullptr,
+        exclusive_fullscreen_control, &create_error))
+  {
+    ERROR_LOG("Failed to create GPU device: {}", create_error.GetDescription());
+    if (g_gpu_device)
+      g_gpu_device->Destroy();
+    g_gpu_device.reset();
+    if (wi.has_value()) // only release the window if it was actually acquired
+      Host::ReleaseRenderWindow();
+
+    Error::SetStringFmt(
+      error,
+      TRANSLATE_FS("System", "Failed to create render device:\n\n{0}\n\nThis may be due to your GPU not supporting the "
+                             "chosen renderer ({1}), or because your graphics drivers need to be updated."),
+      create_error.GetDescription(), GPUDevice::RenderAPIToString(api));
+
+    return false;
+  }
+  // NOTE(review): the failure path below reports ImGuiManager even if it was FullscreenUI::Initialize() that failed.
+  if (!ImGuiManager::Initialize(g_settings.display_osd_scale / 100.0f, g_settings.display_osd_margin, &create_error) ||
+      (s_state.requested_fullscreen_ui && !FullscreenUI::Initialize()))
+  {
+    ERROR_LOG("Failed to initialize ImGuiManager: {}", create_error.GetDescription());
+    Error::SetStringFmt(error, "Failed to initialize ImGuiManager: {}", create_error.GetDescription());
+    FullscreenUI::Shutdown();
+    ImGuiManager::Shutdown();
+    g_gpu_device->Destroy();
+    g_gpu_device.reset();
+    if (wi.has_value())
+      Host::ReleaseRenderWindow();
+    return false;
+  }
+
+  InputManager::SetDisplayWindowSize(ImGuiManager::GetWindowWidth(), ImGuiManager::GetWindowHeight());
+
+  if (const GPUSwapChain* swap_chain = g_gpu_device->GetMainSwapChain())
+    s_state.render_window_info = swap_chain->GetWindowInfo();
+  else
+    s_state.render_window_info = WindowInfo(); // no main swap chain: store a default-constructed WindowInfo
+
+  std::atomic_thread_fence(std::memory_order_release);
+
+  return true;
+}
+
+void GPUThread::DestroyDeviceOnThread()
+{
+  if (!g_gpu_device)
+    return; // nothing to tear down
+
+  const bool has_window = g_gpu_device->HasMainSwapChain(); // queried before the device is destroyed
+
+  ImGuiManager::DestroyOverlayTextures();
+  FullscreenUI::Shutdown();
+  ImGuiManager::Shutdown();
+
+  INFO_LOG("Destroying {} GPU device...", GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()));
+  g_gpu_device->Destroy();
+  g_gpu_device.reset();
+  if (has_window)
+    Host::ReleaseRenderWindow();
+
+  s_state.render_window_info = WindowInfo();
+  std::atomic_thread_fence(std::memory_order_release);
+}
+
+void GPUThread::HandleGPUDeviceLost()
+{
+  static Common::Timer::Value s_last_gpu_reset_time = 0; // 0 means no reset has happened yet
+  static constexpr float MIN_TIME_BETWEEN_RESETS = 15.0f; // seconds
+
+  // If we're constantly crashing on something in particular, we don't want to end up in an
+  // endless reset loop.. that'd probably end up leaking memory and/or crashing us for other
+  // reasons. So just abort in such case.
+  const Common::Timer::Value current_time = Common::Timer::GetCurrentValue();
+  if (s_last_gpu_reset_time != 0 &&
+      Common::Timer::ConvertValueToSeconds(current_time - s_last_gpu_reset_time) < MIN_TIME_BETWEEN_RESETS)
+  {
+    Panic("Host GPU lost too many times, device is probably completely wedged.");
+  }
+  s_last_gpu_reset_time = current_time;
+
+  const bool is_fullscreen = Host::IsFullscreen(); // captured before the device is destroyed
+
+  // Device lost, something went really bad.
+  // Let's just toss out everything, and try to hobble on.
+  DestroyGPUBackendOnThread();
+  DestroyDeviceOnThread();
+
+  Error error;
+  if (!CreateDeviceOnThread(
+        Settings::GetRenderAPIForRenderer(s_state.requested_renderer.value_or(g_gpu_settings.gpu_renderer)),
+        is_fullscreen, &error) ||
+      (s_state.requested_renderer.has_value() &&
+       !CreateGPUBackendOnThread(s_state.requested_renderer.value(), true, &error)))
+  {
+    ERROR_LOG("Failed to recreate GPU device after loss: {}", error.GetDescription());
+    Panic("Failed to recreate GPU device after loss.");
+    return;
+  }
+
+  // Tell the user we recovered. NOTE(review): nothing here skips the first (likely garbage) frame after recreation.
+  Host::AddIconOSDWarning(
+    "HostGPUDeviceLost", ICON_EMOJI_WARNING,
+    TRANSLATE_STR("System", "Host GPU device encountered an error and has recovered. This may cause broken rendering."),
+    Host::OSD_CRITICAL_ERROR_DURATION);
+}
+
+void GPUThread::HandleExclusiveFullscreenLost()
+{
+  WARNING_LOG("Lost exclusive fullscreen.");
+  Host::SetFullscreen(false); // ask the host to leave fullscreen
+}
+
+bool GPUThread::CreateGPUBackendOnThread(GPURenderer renderer, bool upload_vram, Error* error)
+{
+  const bool is_hardware = (renderer != GPURenderer::Software);
+  // Create and initialize the backend for `renderer`; a failed hardware backend falls back to software once.
+  if (is_hardware)
+    s_state.gpu_backend = GPUBackend::CreateHardwareBackend();
+  else
+    s_state.gpu_backend = GPUBackend::CreateSoftwareBackend();
+
+  Error local_error;
+  bool okay = s_state.gpu_backend->Initialize(upload_vram, &local_error);
+  if (!okay)
+  {
+    ERROR_LOG("Failed to create {} renderer: {}", Settings::GetRendererName(renderer), local_error.GetDescription());
+
+    if (is_hardware)
+    {
+      Host::AddIconOSDMessage(
+        "GPUBackendCreationFailed", ICON_FA_PAINT_ROLLER,
+        fmt::format(TRANSLATE_FS("OSDMessage", "Failed to initialize {} renderer, falling back to software renderer."),
+                    Settings::GetRendererName(renderer)), // name the renderer that failed; no optional dereference
+        Host::OSD_CRITICAL_ERROR_DURATION);
+
+      s_state.requested_renderer = GPURenderer::Software;
+      s_state.gpu_backend = GPUBackend::CreateSoftwareBackend();
+      okay = s_state.gpu_backend->Initialize(upload_vram, &local_error);
+    }
+
+    if (!okay)
+    {
+      if (error) // error is optional; only report when the caller asked for it
+        *error = local_error;
+      return false;
+    }
+  }
+
+  g_gpu_device->SetGPUTimingEnabled(g_settings.display_show_gpu_usage);
+  ImGuiManager::UpdateDebugWindowConfig();
+  std::atomic_thread_fence(std::memory_order_release);
+  return true;
+}
+
+void GPUThread::ReconfigureOnThread(GPUThreadReconfigureCommand* cmd)
+{
+  // Store state. Failures further down are reported through cmd->result and cmd->error_ptr.
+  s_state.requested_vsync = cmd->vsync_mode;
+  s_state.requested_allow_present_throttle = cmd->allow_present_throttle;
+  s_state.requested_fullscreen_ui = cmd->start_fullscreen_ui.value_or(s_state.requested_fullscreen_ui);
+
+  // Are we shutting down everything? (No renderer requested and no fullscreen UI wanted.)
+  if (!cmd->renderer.has_value() && !s_state.requested_fullscreen_ui)
+  {
+    DestroyGPUBackendOnThread();
+    DestroyDeviceOnThread();
+    return;
+  }
+
+  // TODO: Make this suck less.
+  g_gpu_settings = g_settings;
+
+  // Read back the whole VRAM from the old backend before destroying it, when the new backend will upload it.
+  if (s_state.gpu_backend && cmd->renderer.has_value() && cmd->upload_vram)
+  {
+    GPUBackendReadVRAMCommand read_cmd;
+    read_cmd.type = GPUBackendCommandType::ReadVRAM;
+    read_cmd.size = sizeof(read_cmd); // Size of the command itself, not of the `cmd` pointer.
+    read_cmd.x = 0;
+    read_cmd.y = 0;
+    read_cmd.width = VRAM_WIDTH;
+    read_cmd.height = VRAM_HEIGHT;
+    s_state.gpu_backend->HandleCommand(&read_cmd);
+  }
+
+  if (s_state.gpu_backend)
+    DestroyGPUBackendOnThread();
+
+  // Device recreation? Switching to the software renderer keeps whatever API the current device uses.
+  const RenderAPI current_api = g_gpu_device ? g_gpu_device->GetRenderAPI() : RenderAPI::None;
+  const RenderAPI expected_api =
+    (cmd->renderer.has_value() && cmd->renderer.value() == GPURenderer::Software && current_api != RenderAPI::None) ?
+      current_api :
+      Settings::GetRenderAPIForRenderer(s_state.requested_renderer.value_or(g_gpu_settings.gpu_renderer));
+  if (cmd->force_recreate_device || !GPUDevice::IsSameRenderAPI(current_api, expected_api))
+  {
+    const bool fullscreen = cmd->fullscreen.value_or(Host::IsFullscreen());
+    DestroyDeviceOnThread();
+
+    Error local_error;
+    if (!CreateDeviceOnThread(expected_api, fullscreen, &local_error))
+    {
+      Host::AddIconOSDMessage(
+        "DeviceSwitchFailed", ICON_FA_PAINT_ROLLER,
+        fmt::format(TRANSLATE_FS("OSDMessage", "Failed to create {} GPU device, reverting to {}.\n{}"),
+                    GPUDevice::RenderAPIToString(expected_api), GPUDevice::RenderAPIToString(current_api),
+                    local_error.GetDescription()),
+        Host::OSD_CRITICAL_ERROR_DURATION);
+      // Try to go back to the previous API; give up if there was none or it cannot be recreated either.
+      Host::ReleaseRenderWindow();
+      if (current_api == RenderAPI::None || !CreateDeviceOnThread(current_api, fullscreen, &local_error))
+      {
+        if (cmd->error_ptr)
+          *cmd->error_ptr = local_error;
+
+        cmd->result = false;
+        return;
+      }
+    }
+  }
+
+  if (cmd->renderer.has_value())
+  {
+    // Do we want a renderer?
+    cmd->result = CreateGPUBackendOnThread(cmd->renderer.value(), cmd->upload_vram, cmd->error_ptr);
+  }
+  else if (s_state.requested_fullscreen_ui)
+  {
+    if (!g_gpu_device && !CreateDeviceOnThread(expected_api, cmd->fullscreen.value_or(false), cmd->error_ptr))
+    {
+      cmd->result = false;
+      return;
+    }
+
+    // Don't need timing to run FSUI.
+    g_gpu_device->SetGPUTimingEnabled(false);
+
+    cmd->result = FullscreenUI::IsInitialized() || FullscreenUI::Initialize();
+    if (!cmd->result)
+      Error::SetStringView(cmd->error_ptr, "Failed to initialize FullscreenUI.");
+  }
+  else
+  {
+    // Device is no longer needed. NOTE(review): the early return above appears to cover this case already.
+    DestroyDeviceOnThread();
+  }
+}
+
+void GPUThread::DestroyGPUBackendOnThread()
+{
+  if (!s_state.gpu_backend)
+    return; // Nothing to tear down.
+
+  VERBOSE_LOG("Shutting down GPU backend...");
+  // Debug windows and post-processing are shut down before the backend itself is released.
+  ImGuiManager::DestroyAllDebugWindows();
+  PostProcessing::Shutdown();
+  s_state.gpu_backend.reset();
+}
+
+void GPUThread::UpdateSettingsOnThread(const Settings& old_settings)
+{
+  DebugAssert(s_state.gpu_backend); // Callers must check for a backend first.
+  if (g_gpu_settings.display_show_gpu_usage != old_settings.display_show_gpu_usage)
+    g_gpu_device->SetGPUTimingEnabled(g_gpu_settings.display_show_gpu_usage);
+  // The backend gets the previous settings; a frame is presented when UpdateDebugWindowConfig() returns true.
+  s_state.gpu_backend->UpdateSettings(old_settings);
+  if (ImGuiManager::UpdateDebugWindowConfig())
+    Internal::PresentFrame(false, 0);
+}
+
+void GPUThread::RunOnThread(AsyncCallType func)
+{
+  if (!s_state.use_gpu_thread) [[unlikely]]
+  {
+    func(); // No GPU thread in use: run inline on the calling thread.
+    return;
+  }
+  // Otherwise queue the callable as an AsyncCall command and wake the GPU thread.
+  GPUThreadAsyncCallCommand* cmd = static_cast<GPUThreadAsyncCallCommand*>(
+    AllocateCommand(GPUBackendCommandType::AsyncCall, sizeof(GPUThreadAsyncCallCommand)));
+  new (cmd) GPUThreadAsyncCallCommand; // Construct in place so the std::function member is a live object.
+  cmd->func = std::move(func);
+  PushCommandAndWakeThread(cmd);
+}
+
+void GPUThread::UpdateSettings(bool gpu_settings_changed)
+{
+  if (gpu_settings_changed)
+  {
+    RunOnThread([settings = g_settings]() mutable {
+      VERBOSE_LOG("Updating GPU settings on thread...");
+      // Swap in the captured snapshot; the closure is mutable so this is a real move, not a silent copy.
+      Settings old_settings = std::move(g_gpu_settings);
+      g_gpu_settings = std::move(settings);
+
+      if (s_state.gpu_backend)
+        UpdateSettingsOnThread(old_settings);
+    });
+  }
+  else
+  {
+    RunOnThread([]() { // No GPU settings changed: only refresh the debug windows, presenting if that returns true.
+      if (s_state.gpu_backend && ImGuiManager::UpdateDebugWindowConfig())
+        Internal::PresentFrame(false, 0);
+    });
+  }
+}
+
+void GPUThread::ResizeDisplayWindow(s32 width, s32 height, float scale)
+{ // The signed dimensions are implicitly converted to the u32 parameters of ResizeDisplayWindowOnThread().
+  RunOnThread([width, height, scale]() { ResizeDisplayWindowOnThread(width, height, scale); });
+}
+
+void GPUThread::ResizeDisplayWindowOnThread(u32 width, u32 height, float scale)
+{
+  // We should _not_ be getting this without a device, since we should have shut down; ignore the event if we do.
+  if (!g_gpu_device || !g_gpu_device->HasMainSwapChain())
+    return;
+
+  DEV_LOG("Display window resized to {}x{}", width, height);
+
+  Error error;
+  if (!g_gpu_device->GetMainSwapChain()->ResizeBuffers(width, height, scale, &error))
+  {
+    ERROR_LOG("Failed to resize main swap chain: {}", error.GetDescription());
+    UpdateDisplayWindowOnThread(Host::IsFullscreen()); // Fall back to recreating the swap chain.
+    return;
+  }
+
+  DisplayWindowResizedOnThread();
+}
+
+void GPUThread::UpdateDisplayWindow(bool fullscreen)
+{
+  RunOnThread([fullscreen]() { UpdateDisplayWindowOnThread(fullscreen); }); // Inline if no GPU thread is in use.
+}
+
+void GPUThread::UpdateDisplayWindowOnThread(bool fullscreen)
+{
+  // In case we get the event late.
+  if (!g_gpu_device)
+    return;
+  // An exclusive fullscreen mode is only parsed when going fullscreen on a device that supports it.
+  std::optional<GPUDevice::ExclusiveFullscreenMode> fullscreen_mode;
+  if (fullscreen && g_gpu_device->SupportsExclusiveFullscreen())
+  {
+    fullscreen_mode =
+      GPUDevice::ExclusiveFullscreenMode::Parse(Host::GetTinyStringSettingValue("GPU", "FullscreenMode", ""));
+  }
+  std::optional<bool> exclusive_fullscreen_control;
+  if (g_settings.display_exclusive_fullscreen_control != DisplayExclusiveFullscreenControl::Automatic)
+  {
+    exclusive_fullscreen_control =
+      (g_settings.display_exclusive_fullscreen_control == DisplayExclusiveFullscreenControl::Allowed);
+  }
+  // The old swap chain is destroyed before the host is asked for the render window.
+  g_gpu_device->DestroyMainSwapChain();
+
+  Error error;
+  std::optional<WindowInfo> wi =
+    Host::AcquireRenderWindow(g_gpu_device->GetRenderAPI(), fullscreen, fullscreen_mode.has_value(), &error);
+  if (!wi.has_value())
+  {
+    Host::ReportFatalError("Failed to get render window after update", error.GetDescription());
+    return;
+  }
+
+  // If the window is surfaceless, leave the device without a main swap chain.
+  if (!wi->IsSurfaceless())
+  {
+    if (!g_gpu_device->RecreateMainSwapChain(
+          wi.value(), s_state.requested_vsync, s_state.requested_allow_present_throttle,
+          fullscreen_mode.has_value() ? &fullscreen_mode.value() : nullptr, exclusive_fullscreen_control, &error))
+    {
+      Host::ReportFatalError("Failed to change window after update", error.GetDescription());
+      return;
+    }
+  }
+
+  DisplayWindowResizedOnThread();
+}
+
+void GPUThread::DisplayWindowResizedOnThread()
+{
+  const GPUSwapChain* swap_chain = g_gpu_device->GetMainSwapChain();
+  if (swap_chain)
+    s_state.render_window_info = swap_chain->GetWindowInfo();
+  else
+    s_state.render_window_info = WindowInfo();
+  std::atomic_thread_fence(std::memory_order_release); // Matched by the acquire fence in GetRenderWindowInfo().
+
+  // Surfaceless is usually temporary, so just ignore it.
+  if (!swap_chain)
+    return;
+
+  const float f_width = static_cast<float>(swap_chain->GetWidth());
+  const float f_height = static_cast<float>(swap_chain->GetHeight());
+  ImGuiManager::WindowResized(f_width, f_height);
+  InputManager::SetDisplayWindowSize(f_width, f_height);
+  // The CPU thread and the resolution scale only need to hear about the resize while a backend exists.
+  if (s_state.gpu_backend)
+  {
+    Host::RunOnCPUThread([width = swap_chain->GetWidth(), height = swap_chain->GetHeight()]() {
+      System::DisplayWindowResized(width, height);
+    });
+
+    // If we're paused, re-present the current frame at the new window size.
+    if (System::IsPaused())
+    {
+      // Hackity hack, on some systems, presenting a single frame isn't enough to actually get it
+      // displayed. Two seems to be good enough. Maybe something to do with direct scanout.
+      Internal::PresentFrame(false, 0);
+      Internal::PresentFrame(false, 0);
+    }
+
+    if (g_gpu_settings.gpu_resolution_scale == 0) // NOTE(review): 0 presumably selects automatic scaling — confirm.
+      s_state.gpu_backend->UpdateResolutionScale();
+  }
+}
+
+const WindowInfo& GPUThread::GetRenderWindowInfo()
+{
+  // This is infrequently used, so we can get away with issuing an acquire fence on every call.
+  std::atomic_thread_fence(std::memory_order_acquire);
+  return s_state.render_window_info;
+}
+
+void GPUThread::SetVSync(GPUVSyncMode mode, bool allow_present_throttle)
+{
+  RunOnThread([mode, allow_present_throttle]() {
+    if (s_state.requested_vsync == mode && s_state.requested_allow_present_throttle == allow_present_throttle)
+      return; // Nothing changed.
+    // The request is stored even without a swap chain; swap chain recreation reads it back from s_state.
+    s_state.requested_vsync = mode;
+    s_state.requested_allow_present_throttle = allow_present_throttle;
+    // NOTE(review): g_gpu_device is dereferenced unchecked here — confirm a device always exists at this point.
+    if (!g_gpu_device->HasMainSwapChain())
+      return;
+
+    Error error;
+    if (!g_gpu_device->GetMainSwapChain()->SetVSyncMode(s_state.requested_vsync,
+                                                        s_state.requested_allow_present_throttle, &error))
+    {
+      ERROR_LOG("Failed to update vsync mode: {}", error.GetDescription());
+    }
+  });
+}
+
+void GPUThread::PresentCurrentFrame()
+{
+  RunOnThread([]() {
+    if (s_state.run_idle_flag)
+    {
+      // If we're running idle, we're going to re-present anyway.
+      return;
+    }
+    // Otherwise present once, without allowing a skip and without a scheduled present time.
+    Internal::PresentFrame(false, 0);
+  });
+}
+
+void GPUThread::SleepUntilPresentTime(Common::Timer::Value present_time)
+{
+  // Use a spinwait if we undersleep on all platforms except Android, where we don't want to burn battery.
+  // Linux also seems to do a much better job of waking up at the requested time, so it skips the spinwait too.
+
+#if !defined(__linux__) && !defined(__ANDROID__)
+  Common::Timer::SleepUntil(present_time, true);
+#else
+  Common::Timer::SleepUntil(present_time, false);
+#endif
+}
+
+void GPUThread::Internal::PresentFrame(bool allow_skip_present, u64 present_time)
+{
+  const bool skip_present = (!g_gpu_device->HasMainSwapChain() ||
+                             (allow_skip_present && g_gpu_device->GetMainSwapChain()->ShouldSkipPresentingFrame() &&
+                              s_state.skipped_present_count < MAX_SKIPPED_PRESENT_COUNT));
+  // The UI layers are only rendered when this frame is actually going to be presented.
+  if (!skip_present)
+  {
+    // acquire for IO.MousePos and system state.
+    std::atomic_thread_fence(std::memory_order_acquire);
+
+    FullscreenUI::Render();
+
+    if (s_state.gpu_backend && System::IsValid())
+      ImGuiManager::RenderTextOverlays(s_state.gpu_backend.get());
+
+    ImGuiManager::RenderOSDMessages();
+
+    if (s_state.gpu_backend && System::GetState() == System::State::Running)
+      ImGuiManager::RenderSoftwareCursors();
+
+    ImGuiManager::RenderOverlayWindows();
+    ImGuiManager::RenderDebugWindows();
+  }
+  // With a backend, it presents the display; without one, presenting on the main swap chain is begun directly.
+  const GPUDevice::PresentResult pres =
+    skip_present ? GPUDevice::PresentResult::SkipPresent :
+                   (s_state.gpu_backend ? s_state.gpu_backend->PresentDisplay() :
+                                          g_gpu_device->BeginPresent(g_gpu_device->GetMainSwapChain()));
+  if (pres == GPUDevice::PresentResult::OK)
+  {
+    s_state.skipped_present_count = 0;
+
+    g_gpu_device->RenderImGui(g_gpu_device->GetMainSwapChain());
+
+    const GPUDevice::Features features = g_gpu_device->GetFeatures();
+    const bool scheduled_present = (present_time != 0);
+    const bool explicit_present = (scheduled_present && (features.explicit_present && !features.timed_present));
+    const bool timed_present = (scheduled_present && features.timed_present);
+    // A scheduled present that is not explicit (including timed presents) flushes and sleeps before EndPresent().
+    if (scheduled_present && !explicit_present)
+    {
+      // No explicit present support, simulate it with Flush.
+      g_gpu_device->FlushCommands();
+      SleepUntilPresentTime(present_time);
+    }
+
+    g_gpu_device->EndPresent(g_gpu_device->GetMainSwapChain(), explicit_present, timed_present ? present_time : 0);
+
+    if (g_gpu_device->IsGPUTimingEnabled())
+      PerformanceCounters::AccumulateGPUTime();
+    // Explicit present: sleep until the target time after EndPresent(), then submit.
+    if (explicit_present)
+    {
+      SleepUntilPresentTime(present_time);
+      g_gpu_device->SubmitPresent(g_gpu_device->GetMainSwapChain());
+    }
+  }
+  else
+  {
+    s_state.skipped_present_count++; // Counts every non-OK result, not only deliberate skips.
+
+    if (pres == GPUDevice::PresentResult::DeviceLost) [[unlikely]]
+      HandleGPUDeviceLost();
+    else if (pres == GPUDevice::PresentResult::ExclusiveFullscreenLost)
+      HandleExclusiveFullscreenLost();
+    else if (!skip_present)
+      g_gpu_device->FlushCommands();
+
+    // Still need to kick ImGui or it gets cranky.
+    ImGui::EndFrame();
+  }
+
+  ImGuiManager::NewFrame();
+
+  if (s_state.gpu_backend)
+    s_state.gpu_backend->RestoreDeviceContext();
+}
+
+bool GPUThread::GetRunIdleOnThread()
+{
+  // Read from both threads. NOTE(review): confirm run_idle_flag is safe to read concurrently (e.g. atomic).
+  return s_state.run_idle_flag;
+}
+
+void GPUThread::SetRunIdleOnThread(bool enabled)
+{
+  // Should only be called on the GPU thread. PresentCurrentFrame() skips its present while this is set.
+  s_state.run_idle_flag = enabled;
+  DEV_LOG("GPU thread now {} idle", enabled ? "running" : "NOT running");
+}
diff --git a/src/core/gpu_thread.h b/src/core/gpu_thread.h
new file mode 100644
index 000000000..05e706d22
--- /dev/null
+++ b/src/core/gpu_thread.h
@@ -0,0 +1,76 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#pragma once
+
+#include "common/types.h"
+
+#include <functional>
+#include <optional>
+
+class Error;
+struct WindowInfo;
+
+namespace Threading {
+class ThreadHandle;
+}
+
+enum class RenderAPI : u8;
+enum class GPUVSyncMode : u8;
+
+enum class GPURenderer : u8;
+enum class GPUBackendCommandType : u8;
+
+struct GPUThreadCommand;
+struct GPUBackendUpdateDisplayCommand;
+
+namespace GPUThread {
+using AsyncCallType = std::function<void()>;
+
+/// Starts Big Picture UI.
+bool StartFullscreenUI(bool fullscreen, Error* error);
+void StopFullscreenUI();
+
+/// Backend control.
+std::optional<GPURenderer> GetRequestedRenderer();
+bool CreateGPUBackend(GPURenderer renderer, bool upload_vram, bool fullscreen, bool force_recreate_device,
+                      Error* error);
+void DestroyGPUBackend();
+
+/// Re-presents the current frame. Call when things like window resizes happen to re-display
+/// the current frame with the correct proportions. Should only be called from the CPU thread.
+void PresentCurrentFrame();
+
+/// Handles fullscreen transitions and such.
+void UpdateDisplayWindow(bool fullscreen);
+
+/// Called when the window is resized.
+void ResizeDisplayWindow(s32 width, s32 height, float scale);
+
+/// Access to main window size from CPU thread.
+const WindowInfo& GetRenderWindowInfo();
+/// Applies settings on the GPU thread. Pass true to also copy g_settings across; false only refreshes debug windows.
+void UpdateSettings(bool gpu_settings_changed);
+/// Runs func on the GPU thread, or immediately on the calling thread when the GPU thread is not in use.
+void RunOnThread(AsyncCallType func);
+void SetVSync(GPUVSyncMode mode, bool allow_present_throttle);
+/// Idle-run flag accessors. The setter should only be called on the GPU thread.
+bool GetRunIdleOnThread();
+void SetRunIdleOnThread(bool enabled);
+
+GPUThreadCommand* AllocateCommand(GPUBackendCommandType command, u32 size);
+void PushCommand(GPUThreadCommand* cmd);
+void PushCommandAndWakeThread(GPUThreadCommand* cmd);
+void PushCommandAndSync(GPUThreadCommand* cmd, bool spin);
+void PushCommandAndFrame(GPUBackendUpdateDisplayCommand* cmd);
+
+// NOTE: Only called by GPUBackend
+namespace Internal {
+const Threading::ThreadHandle& GetThreadHandle();
+void ProcessStartup();
+void SetThreadEnabled(bool enabled);
+void RequestShutdown();
+void GPUThreadEntryPoint();
+void PresentFrame(bool allow_skip_present, u64 present_time);
+} // namespace Internal
+} // namespace GPUThread
diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h
index 137264ec5..e2c93b6f5 100644
--- a/src/core/gpu_types.h
+++ b/src/core/gpu_types.h
@@ -5,12 +5,24 @@
 
 #include "types.h"
 
+#include "util/gpu_texture.h"
+
 #include "common/bitfield.h"
 #include "common/bitutils.h"
 #include "common/gsvector.h"
 
 #include <array>
 #include <string>
+#include <functional>
+#include <vector>
+
+class Error;
+
+class StateWrapper;
+
+class MediaCapture;
+
+enum class GPUVSyncMode : u8;
 
 enum : u32
 {
@@ -308,12 +320,17 @@ union GPUTexturePaletteReg
   ALWAYS_INLINE constexpr u32 GetYBase() const { return static_cast<u32>(y); }
 };
 
-struct GPUTextureWindow
+union GPUTextureWindow
 {
-  u8 and_x;
-  u8 and_y;
-  u8 or_x;
-  u8 or_y;
+  struct
+  {
+    u8 and_x;
+    u8 and_y;
+    u8 or_x;
+    u8 or_y;
+  };
+
+  u32 bits;
 
   ALWAYS_INLINE bool operator==(const GPUTextureWindow& rhs) const
   {
@@ -453,17 +470,149 @@ static constexpr s32 DITHER_MATRIX[DITHER_MATRIX_SIZE][DITHER_MATRIX_SIZE] = {{-
 enum class GPUBackendCommandType : u8
 {
   Wraparound,
-  Sync,
+  AsyncCall,
+  Reconfigure,
+  Shutdown,
+  ClearVRAM,
+  ClearDisplay,
+  UpdateDisplay,
+  BufferSwapped,
+  UpdateResolutionScale,
+  RenderScreenshotToBuffer,
+  RenderScreenshotToFile,
+  LoadState,
+  SaveState,
+  LoadMemoryState,
+  SaveMemoryState,
+  ReadVRAM,
   FillVRAM,
   UpdateVRAM,
   CopyVRAM,
   SetDrawingArea,
   UpdateCLUT,
+  ClearCache,
   DrawPolygon,
+  DrawPrecisePolygon,
   DrawRectangle,
   DrawLine,
 };
 
+struct GPUThreadCommand
+{
+  u32 size;
+  GPUBackendCommandType type;
+};
+
+struct GPUThreadReconfigureCommand : public GPUThreadCommand
+{
+  Error* error_ptr;                        // Optional; receives the failure description.
+  std::optional<GPURenderer> renderer;     // Backend to create; empty means no backend is requested.
+  std::optional<bool> fullscreen;          // Requested fullscreen state for device creation, if any.
+  std::optional<bool> start_fullscreen_ui; // Empty keeps the current fullscreen UI request.
+  GPUVSyncMode vsync_mode;                 // Stored as the requested vsync mode.
+  bool allow_present_throttle;             // Stored as the requested present throttle setting.
+  bool force_recreate_device;              // Recreate the device even if the render API is unchanged.
+  bool upload_vram;                        // Passed on to backend initialization.
+  bool result;                             // Outcome written by the GPU thread; false on failure.
+};
+
+struct GPUThreadAsyncCallCommand : public GPUThreadCommand
+{
+  std::function<void()> func;
+};
+
+struct GPUThreadRenderScreenshotToBufferCommand : public GPUThreadCommand
+{
+  u32 width;
+  u32 height;
+  u32* out_width;
+  u32* out_height;
+  std::vector<u32>* out_pixels;
+  u32* out_stride;
+  GPUTexture::Format* out_format;
+  bool* out_result;
+  bool postfx;
+};
+
+struct GPUThreadRenderScreenshotToFileCommand : public GPUThreadCommand
+{
+  DisplayScreenshotMode mode;
+  u8 quality;
+  bool compress_on_thread;
+  bool show_osd_message;
+  u32 path_length;
+  char path[0];
+};
+
+struct GPUBackendLoadStateCommand : public GPUThreadCommand
+{
+  GPUDrawingArea drawing_area;
+  u16 vram_data[VRAM_WIDTH * VRAM_HEIGHT];
+  u16 clut_data[GPU_CLUT_SIZE];
+  u32 texture_cache_state_version;
+  u32 texture_cache_state_size;
+  u8 texture_cache_state[0]; // texture_cache_state_size
+};
+
+struct GPUBackendSaveStateCommand : public GPUThreadCommand
+{
+  StateWrapper* sw;
+};
+
+struct GPUBackendLoadMemoryStateCommand : public GPUThreadCommand
+{
+};
+
+struct GPUBackendSaveMemoryStateCommand : public GPUThreadCommand
+{
+};
+
+struct GPUBackendUpdateDisplayCommand : public GPUThreadCommand
+{
+  u32 frame_number;
+  u32 internal_frame_number;
+
+  u16 display_width;
+  u16 display_height;
+  u16 display_origin_left;
+  u16 display_origin_top;
+  u16 display_vram_left;
+  u16 display_vram_top;
+  u16 display_vram_width;
+  u16 display_vram_height;
+
+  u16 X; // TODO: Can we get rid of this?
+
+  union
+  {
+    u16 bits;
+
+    BitField<u16, bool, 0, 1> interlaced_display_enabled;
+    BitField<u16, u8, 1, 1> interlaced_display_field;
+    BitField<u16, bool, 2, 1> interlaced_display_interleaved;
+    BitField<u16, bool, 3, 1> display_24bit;
+    BitField<u16, bool, 4, 1> display_disabled;
+
+    BitField<u16, bool, 6, 1> allow_present_skip;
+    BitField<u16, bool, 7, 1> present_frame;
+
+    BitField<u16, bool, 8, 1> is_frame;
+  };
+
+  float display_aspect_ratio;
+
+  u64 present_time;
+  MediaCapture* media_capture;
+};
+
+struct GPUBackendReadVRAMCommand : public GPUThreadCommand
+{
+  u16 x;      // Left edge of the VRAM rectangle to read.
+  u16 y;      // Top edge of the VRAM rectangle to read.
+  u16 width;  // Rectangle width; VRAM_WIDTH covers the full width.
+  u16 height; // Rectangle height; VRAM_HEIGHT covers the full height.
+};
+
 union GPUBackendCommandParameters
 {
   u8 bits;
@@ -489,18 +638,12 @@ union GPUBackendCommandParameters
   }
 };
 
-struct GPUBackendCommand
+// TODO: Merge this into the other structs, saves padding bytes
+struct GPUBackendCommand : public GPUThreadCommand
 {
-  u32 size;
-  GPUBackendCommandType type;
   GPUBackendCommandParameters params;
 };
 
-struct GPUBackendSyncCommand : public GPUBackendCommand
-{
-  bool allow_sleep;
-};
-
 struct GPUBackendFillVRAMCommand : public GPUBackendCommand
 {
   u16 x;
@@ -532,7 +675,6 @@ struct GPUBackendCopyVRAMCommand : public GPUBackendCommand
 struct GPUBackendSetDrawingAreaCommand : public GPUBackendCommand
 {
   GPUDrawingArea new_area;
-  s32 new_clamped_area[4];
 };
 
 struct GPUBackendUpdateCLUTCommand : public GPUBackendCommand
@@ -541,8 +683,10 @@ struct GPUBackendUpdateCLUTCommand : public GPUBackendCommand
   bool clut_is_8bit;
 };
 
+// TODO: Pack texpage
 struct GPUBackendDrawCommand : public GPUBackendCommand
 {
+  // TODO: Cut this down
   GPUDrawModeReg draw_mode;
   GPURenderCommand rc;
   GPUTexturePaletteReg palette;
@@ -551,7 +695,7 @@ struct GPUBackendDrawCommand : public GPUBackendCommand
 
 struct GPUBackendDrawPolygonCommand : public GPUBackendDrawCommand
 {
-  u16 num_vertices;
+  u8 num_vertices;
 
   struct Vertex
   {
@@ -572,14 +716,22 @@ struct GPUBackendDrawPolygonCommand : public GPUBackendDrawCommand
       };
       u16 texcoord;
     };
+  };
 
-    ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u16 texcoord_)
-    {
-      x = x_;
-      y = y_;
-      color = color_;
-      texcoord = texcoord_;
-    }
+  Vertex vertices[0];
+};
+
+struct GPUBackendDrawPrecisePolygonCommand : public GPUBackendDrawCommand
+{
+  u8 num_vertices;
+  bool valid_w;
+
+  struct Vertex
+  {
+    float x, y, w;
+    s32 native_x, native_y;
+    u32 color;
+    u16 texcoord;
   };
 
   Vertex vertices[0];
@@ -587,9 +739,9 @@ struct GPUBackendDrawPolygonCommand : public GPUBackendDrawCommand
 
 struct GPUBackendDrawRectangleCommand : public GPUBackendDrawCommand
 {
-  s32 x, y;
   u16 width, height;
   u16 texcoord;
+  s32 x, y;
   u32 color;
 };
 
diff --git a/src/core/host.cpp b/src/core/host.cpp
index 1d26548be..0c530fde4 100644
--- a/src/core/host.cpp
+++ b/src/core/host.cpp
@@ -2,19 +2,13 @@
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
 #include "host.h"
-#include "fullscreen_ui.h"
 #include "gpu.h"
-#include "imgui_overlays.h"
-#include "shader_cache_version.h"
 #include "system.h"
 #include "system_private.h"
 
 #include "scmversion/scmversion.h"
 
 #include "util/compress_helpers.h"
-#include "util/gpu_device.h"
-#include "util/imgui_manager.h"
-#include "util/input_manager.h"
 
 #include "common/assert.h"
 #include "common/error.h"
@@ -342,175 +336,3 @@ std::string Host::GetHTTPUserAgent()
 {
   return fmt::format("DuckStation for {} ({}) {}", TARGET_OS_STR, CPU_ARCH_STR, g_scm_tag_str);
 }
-
-bool Host::CreateGPUDevice(RenderAPI api, bool fullscreen, Error* error)
-{
-  DebugAssert(!g_gpu_device);
-
-  INFO_LOG("Trying to create a {} GPU device...", GPUDevice::RenderAPIToString(api));
-  g_gpu_device = GPUDevice::CreateDeviceForAPI(api);
-
-  std::optional<GPUDevice::ExclusiveFullscreenMode> fullscreen_mode;
-  if (fullscreen && g_gpu_device && g_gpu_device->SupportsExclusiveFullscreen())
-  {
-    fullscreen_mode =
-      GPUDevice::ExclusiveFullscreenMode::Parse(Host::GetTinyStringSettingValue("GPU", "FullscreenMode", ""));
-  }
-  std::optional<bool> exclusive_fullscreen_control;
-  if (g_settings.display_exclusive_fullscreen_control != DisplayExclusiveFullscreenControl::Automatic)
-  {
-    exclusive_fullscreen_control =
-      (g_settings.display_exclusive_fullscreen_control == DisplayExclusiveFullscreenControl::Allowed);
-  }
-
-  u32 disabled_features = 0;
-  if (g_settings.gpu_disable_dual_source_blend)
-    disabled_features |= GPUDevice::FEATURE_MASK_DUAL_SOURCE_BLEND;
-  if (g_settings.gpu_disable_framebuffer_fetch)
-    disabled_features |= GPUDevice::FEATURE_MASK_FRAMEBUFFER_FETCH;
-  if (g_settings.gpu_disable_texture_buffers)
-    disabled_features |= GPUDevice::FEATURE_MASK_TEXTURE_BUFFERS;
-  if (g_settings.gpu_disable_memory_import)
-    disabled_features |= GPUDevice::FEATURE_MASK_MEMORY_IMPORT;
-  if (g_settings.gpu_disable_raster_order_views)
-    disabled_features |= GPUDevice::FEATURE_MASK_RASTER_ORDER_VIEWS;
-
-    // Don't dump shaders on debug builds for Android, users will complain about storage...
-#if !defined(__ANDROID__) || defined(_DEBUG)
-  const std::string_view shader_dump_directory(EmuFolders::DataRoot);
-#else
-  const std::string_view shader_dump_directory;
-#endif
-
-  Error create_error;
-  std::optional<WindowInfo> wi;
-  if (!g_gpu_device ||
-      !(wi = Host::AcquireRenderWindow(api, fullscreen, fullscreen_mode.has_value(), &create_error)).has_value() ||
-      !g_gpu_device->Create(
-        g_settings.gpu_adapter, static_cast<GPUDevice::FeatureMask>(disabled_features), shader_dump_directory,
-        g_settings.gpu_disable_shader_cache ? std::string_view() : std::string_view(EmuFolders::Cache),
-        SHADER_CACHE_VERSION, g_settings.gpu_use_debug_device, wi.value(), System::GetEffectiveVSyncMode(),
-        System::ShouldAllowPresentThrottle(), fullscreen_mode.has_value() ? &fullscreen_mode.value() : nullptr,
-        exclusive_fullscreen_control, &create_error))
-  {
-    ERROR_LOG("Failed to create GPU device: {}", create_error.GetDescription());
-    if (g_gpu_device)
-      g_gpu_device->Destroy();
-    g_gpu_device.reset();
-    if (wi.has_value())
-      Host::ReleaseRenderWindow();
-
-    Error::SetStringFmt(
-      error,
-      TRANSLATE_FS("System", "Failed to create render device:\n\n{0}\n\nThis may be due to your GPU not supporting the "
-                             "chosen renderer ({1}), or because your graphics drivers need to be updated."),
-      create_error.GetDescription(), GPUDevice::RenderAPIToString(api));
-    return false;
-  }
-
-  if (!ImGuiManager::Initialize(g_settings.display_osd_scale / 100.0f, g_settings.display_osd_margin, &create_error))
-  {
-    ERROR_LOG("Failed to initialize ImGuiManager: {}", create_error.GetDescription());
-    Error::SetStringFmt(error, "Failed to initialize ImGuiManager: {}", create_error.GetDescription());
-    g_gpu_device->Destroy();
-    g_gpu_device.reset();
-    Host::ReleaseRenderWindow();
-    return false;
-  }
-
-  InputManager::SetDisplayWindowSize(ImGuiManager::GetWindowWidth(), ImGuiManager::GetWindowHeight());
-  return true;
-}
-
-void Host::UpdateDisplayWindow(bool fullscreen)
-{
-  if (!g_gpu_device)
-    return;
-
-  const GPUVSyncMode vsync_mode = System::GetEffectiveVSyncMode();
-  const bool allow_present_throttle = System::ShouldAllowPresentThrottle();
-  std::optional<GPUDevice::ExclusiveFullscreenMode> fullscreen_mode;
-  if (fullscreen && g_gpu_device->SupportsExclusiveFullscreen())
-  {
-    fullscreen_mode =
-      GPUDevice::ExclusiveFullscreenMode::Parse(Host::GetTinyStringSettingValue("GPU", "FullscreenMode", ""));
-  }
-  std::optional<bool> exclusive_fullscreen_control;
-  if (g_settings.display_exclusive_fullscreen_control != DisplayExclusiveFullscreenControl::Automatic)
-  {
-    exclusive_fullscreen_control =
-      (g_settings.display_exclusive_fullscreen_control == DisplayExclusiveFullscreenControl::Allowed);
-  }
-
-  g_gpu_device->DestroyMainSwapChain();
-
-  Error error;
-  std::optional<WindowInfo> wi =
-    Host::AcquireRenderWindow(g_gpu_device->GetRenderAPI(), fullscreen, fullscreen_mode.has_value(), &error);
-  if (!wi.has_value())
-  {
-    Host::ReportFatalError("Failed to get render window after update", error.GetDescription());
-    return;
-  }
-
-  // if surfaceless, just leave it
-  if (wi->IsSurfaceless())
-    return;
-
-  if (!g_gpu_device->RecreateMainSwapChain(wi.value(), vsync_mode, allow_present_throttle,
-                                           fullscreen_mode.has_value() ? &fullscreen_mode.value() : nullptr,
-                                           exclusive_fullscreen_control, &error))
-  {
-    Host::ReportFatalError("Failed to change window after update", error.GetDescription());
-    return;
-  }
-
-  const u32 new_width = g_gpu_device->GetMainSwapChain()->GetWidth();
-  const u32 new_height = g_gpu_device->GetMainSwapChain()->GetHeight();
-  const float f_width = static_cast<float>(new_width);
-  const float f_height = static_cast<float>(new_height);
-  ImGuiManager::WindowResized(f_width, f_height);
-  InputManager::SetDisplayWindowSize(f_width, f_height);
-  System::DisplayWindowResized(new_width, new_height);
-}
-
-void Host::ResizeDisplayWindow(s32 width, s32 height, float scale)
-{
-  if (!g_gpu_device || !g_gpu_device->HasMainSwapChain())
-    return;
-
-  DEV_LOG("Display window resized to {}x{}", width, height);
-
-  Error error;
-  if (!g_gpu_device->GetMainSwapChain()->ResizeBuffers(width, height, scale, &error))
-  {
-    ERROR_LOG("Failed to resize main swap chain: {}", error.GetDescription());
-    UpdateDisplayWindow(Host::IsFullscreen());
-    return;
-  }
-
-  const u32 new_width = g_gpu_device->GetMainSwapChain()->GetWidth();
-  const u32 new_height = g_gpu_device->GetMainSwapChain()->GetHeight();
-  const float f_width = static_cast<float>(new_width);
-  const float f_height = static_cast<float>(new_height);
-  ImGuiManager::WindowResized(f_width, f_height);
-  InputManager::SetDisplayWindowSize(f_width, f_height);
-  System::DisplayWindowResized(new_width, new_height);
-}
-
-void Host::ReleaseGPUDevice()
-{
-  if (!g_gpu_device)
-    return;
-
-  ImGuiManager::DestroyAllDebugWindows();
-  ImGuiManager::DestroyOverlayTextures();
-  FullscreenUI::Shutdown();
-  ImGuiManager::Shutdown();
-
-  INFO_LOG("Destroying {} GPU device...", GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()));
-  g_gpu_device->Destroy();
-  g_gpu_device.reset();
-
-  Host::ReleaseRenderWindow();
-}
diff --git a/src/core/host.h b/src/core/host.h
index 41896a522..498971245 100644
--- a/src/core/host.h
+++ b/src/core/host.h
@@ -96,21 +96,6 @@ bool IsFullscreen();
 /// Alters fullscreen state of hosting application.
 void SetFullscreen(bool enabled);
 
-/// Attempts to create the rendering device backend.
-bool CreateGPUDevice(RenderAPI api, bool fullscreen, Error* error);
-
-/// Handles fullscreen transitions and such.
-void UpdateDisplayWindow(bool fullscreen);
-
-/// Called when the window is resized.
-void ResizeDisplayWindow(s32 width, s32 height, float scale);
-
-/// Destroys any active rendering device.
-void ReleaseGPUDevice();
-
-/// Called at the end of the frame, before presentation.
-void FrameDone();
-
 namespace Internal {
 
 /// Returns true if the host should use portable mode.
diff --git a/src/core/hotkeys.cpp b/src/core/hotkeys.cpp
index 6fc333f4d..55ad2d5d2 100644
--- a/src/core/hotkeys.cpp
+++ b/src/core/hotkeys.cpp
@@ -8,6 +8,7 @@
 #include "fullscreen_ui.h"
 #include "gpu.h"
 #include "gpu_hw_texture_cache.h"
+#include "gpu_thread.h"
 #include "host.h"
 #include "imgui_overlays.h"
 #include "settings.h"
@@ -58,8 +59,7 @@ static void HotkeyModifyResolutionScale(s32 increment)
 
   if (System::IsValid())
   {
-    g_gpu->RestoreDeviceContext();
-    g_gpu->UpdateSettings(old_settings);
+    GPUThread::UpdateSettings(true);
     System::ClearMemorySaveStates();
   }
 }
@@ -386,11 +386,10 @@ DEFINE_HOTKEY("TogglePGXP", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_NOO
               [](s32 pressed) {
                 if (!pressed && System::IsValid())
                 {
-                  Settings old_settings = g_settings;
                   g_settings.gpu_pgxp_enable = !g_settings.gpu_pgxp_enable;
-                  g_gpu->RestoreDeviceContext();
-                  g_gpu->UpdateSettings(old_settings);
                   System::ClearMemorySaveStates();
+                  GPUThread::UpdateSettings(true);
+
                   Host::AddKeyedOSDMessage("TogglePGXP",
                                            g_settings.gpu_pgxp_enable ?
                                              TRANSLATE_STR("OSDMessage", "PGXP is now enabled.") :
@@ -459,12 +458,11 @@ DEFINE_HOTKEY("TogglePGXPDepth", TRANSLATE_NOOP("Hotkeys", "Graphics"),
                   if (!g_settings.gpu_pgxp_enable)
                     return;
 
-                  const Settings old_settings = g_settings;
                   g_settings.gpu_pgxp_depth_buffer = !g_settings.gpu_pgxp_depth_buffer;
 
-                  g_gpu->RestoreDeviceContext();
-                  g_gpu->UpdateSettings(old_settings);
                   System::ClearMemorySaveStates();
+                  GPUThread::UpdateSettings(true);
+
                   Host::AddKeyedOSDMessage("TogglePGXPDepth",
                                            g_settings.gpu_pgxp_depth_buffer ?
                                              TRANSLATE_STR("OSDMessage", "PGXP Depth Buffer is now enabled.") :
@@ -480,12 +478,11 @@ DEFINE_HOTKEY("TogglePGXPCPU", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_
                   if (!g_settings.gpu_pgxp_enable)
                     return;
 
-                  const Settings old_settings = g_settings;
                   g_settings.gpu_pgxp_cpu = !g_settings.gpu_pgxp_cpu;
 
-                  g_gpu->RestoreDeviceContext();
-                  g_gpu->UpdateSettings(old_settings);
+                  // GPU thread is unaffected by this setting, so no GPUThread::UpdateSettings() call here.
                   System::ClearMemorySaveStates();
+
                   Host::AddKeyedOSDMessage("TogglePGXPCPU",
                                            g_settings.gpu_pgxp_cpu ?
                                              TRANSLATE_STR("OSDMessage", "PGXP CPU mode is now enabled.") :
@@ -595,29 +592,29 @@ DEFINE_HOTKEY("AudioVolumeDown", TRANSLATE_NOOP("Hotkeys", "Audio"), TRANSLATE_N
 DEFINE_HOTKEY("LoadSelectedSaveState", TRANSLATE_NOOP("Hotkeys", "Save States"),
               TRANSLATE_NOOP("Hotkeys", "Load From Selected Slot"), [](s32 pressed) {
                 if (!pressed)
-                  Host::RunOnCPUThread(SaveStateSelectorUI::LoadCurrentSlot);
+                  GPUThread::RunOnThread(SaveStateSelectorUI::LoadCurrentSlot);
               })
 DEFINE_HOTKEY("SaveSelectedSaveState", TRANSLATE_NOOP("Hotkeys", "Save States"),
               TRANSLATE_NOOP("Hotkeys", "Save To Selected Slot"), [](s32 pressed) {
                 if (!pressed)
-                  Host::RunOnCPUThread(SaveStateSelectorUI::SaveCurrentSlot);
+                  GPUThread::RunOnThread(SaveStateSelectorUI::SaveCurrentSlot);
               })
 DEFINE_HOTKEY("SelectPreviousSaveStateSlot", TRANSLATE_NOOP("Hotkeys", "Save States"),
               TRANSLATE_NOOP("Hotkeys", "Select Previous Save Slot"), [](s32 pressed) {
                 if (!pressed)
-                  Host::RunOnCPUThread([]() { SaveStateSelectorUI::SelectPreviousSlot(true); });
+                  GPUThread::RunOnThread([]() { SaveStateSelectorUI::SelectPreviousSlot(true); });
               })
 DEFINE_HOTKEY("SelectNextSaveStateSlot", TRANSLATE_NOOP("Hotkeys", "Save States"),
               TRANSLATE_NOOP("Hotkeys", "Select Next Save Slot"), [](s32 pressed) {
                 if (!pressed)
-                  Host::RunOnCPUThread([]() { SaveStateSelectorUI::SelectNextSlot(true); });
+                  GPUThread::RunOnThread([]() { SaveStateSelectorUI::SelectNextSlot(true); });
               })
 DEFINE_HOTKEY("SaveStateAndSelectNextSlot", TRANSLATE_NOOP("Hotkeys", "Save States"),
               TRANSLATE_NOOP("Hotkeys", "Save State and Select Next Slot"), [](s32 pressed) {
                 if (!pressed && System::IsValid())
                 {
                   SaveStateSelectorUI::SaveCurrentSlot();
-                  SaveStateSelectorUI::SelectNextSlot(false);
+                  GPUThread::RunOnThread([]() { SaveStateSelectorUI::SelectNextSlot(false); });
                 }
               })
 
diff --git a/src/core/imgui_overlays.cpp b/src/core/imgui_overlays.cpp
index 47556c01d..11a50fdc4 100644
--- a/src/core/imgui_overlays.cpp
+++ b/src/core/imgui_overlays.cpp
@@ -9,6 +9,8 @@
 #include "dma.h"
 #include "fullscreen_ui.h"
 #include "gpu.h"
+#include "gpu_backend.h"
+#include "gpu_thread.h"
 #include "host.h"
 #include "mdec.h"
 #include "performance_counters.h"
@@ -70,10 +72,10 @@ struct DebugWindowInfo
 } // namespace
 
 static void FormatProcessorStat(SmallStringBase& text, double usage, double time);
-static void DrawPerformanceOverlay(float& position_y, float scale, float margin, float spacing);
+static void DrawPerformanceOverlay(const GPUBackend* gpu, float& position_y, float scale, float margin, float spacing);
 static void DrawMediaCaptureOverlay(float& position_y, float scale, float margin, float spacing);
 static void DrawFrameTimeOverlay(float& position_y, float scale, float margin, float spacing);
-static void DrawEnhancementsOverlay();
+static void DrawEnhancementsOverlay(const GPUBackend* gpu);
 static void DrawInputsOverlay();
 
 #ifndef __ANDROID__
@@ -284,26 +286,25 @@ void ImGuiManager::DestroyAllDebugWindows()
 #endif
 }
 
-void ImGuiManager::RenderTextOverlays()
+void ImGuiManager::RenderTextOverlays(const GPUBackend* gpu)
 {
+  // NOTE: Racy read of the system state.
   const System::State state = System::GetState();
-  if (state != System::State::Shutdown)
-  {
-    const float scale = ImGuiManager::GetGlobalScale();
-    const float f_margin = ImGuiManager::GetScreenMargin() * scale;
-    const float margin = ImCeil(ImGuiManager::GetScreenMargin() * scale);
-    const float spacing = ImCeil(5.0f * scale);
-    float position_y = ImFloor(f_margin);
-    DrawPerformanceOverlay(position_y, scale, margin, spacing);
-    DrawFrameTimeOverlay(position_y, scale, margin, spacing);
-    DrawMediaCaptureOverlay(position_y, scale, margin, spacing);
 
-    if (g_settings.display_show_enhancements && state != System::State::Paused)
-      DrawEnhancementsOverlay();
+  const float scale = ImGuiManager::GetGlobalScale();
+  const float f_margin = ImGuiManager::GetScreenMargin() * scale;
+  const float margin = ImCeil(ImGuiManager::GetScreenMargin() * scale);
+  const float spacing = ImCeil(5.0f * scale);
+  float position_y = ImFloor(f_margin);
+  DrawPerformanceOverlay(gpu, position_y, scale, margin, spacing);
+  DrawFrameTimeOverlay(position_y, scale, margin, spacing);
+  DrawMediaCaptureOverlay(position_y, scale, margin, spacing);
 
-    if (g_settings.display_show_inputs && state != System::State::Paused)
-      DrawInputsOverlay();
-  }
+  if (g_gpu_settings.display_show_enhancements && state != System::State::Paused)
+    DrawEnhancementsOverlay(gpu);
+
+  if (g_gpu_settings.display_show_inputs && state != System::State::Paused)
+    DrawInputsOverlay();
 }
 
 void ImGuiManager::FormatProcessorStat(SmallStringBase& text, double usage, double time)
@@ -317,11 +318,12 @@ void ImGuiManager::FormatProcessorStat(SmallStringBase& text, double usage, doub
     text.append_format("{:.1f}% ({:.2f}ms)", usage, time);
 }
 
-void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float margin, float spacing)
+void ImGuiManager::DrawPerformanceOverlay(const GPUBackend* gpu, float& position_y, float scale, float margin,
+                                          float spacing)
 {
-  if (!(g_settings.display_show_fps || g_settings.display_show_speed || g_settings.display_show_gpu_stats ||
-        g_settings.display_show_resolution || g_settings.display_show_cpu_usage ||
-        (g_settings.display_show_status_indicators &&
+  if (!(g_gpu_settings.display_show_fps || g_gpu_settings.display_show_speed || g_gpu_settings.display_show_gpu_stats ||
+        g_gpu_settings.display_show_resolution || g_gpu_settings.display_show_cpu_usage ||
+        (g_gpu_settings.display_show_status_indicators &&
          (System::IsPaused() || System::IsFastForwardEnabled() || System::IsTurboEnabled()))))
   {
     return;
@@ -352,9 +354,9 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
   if (state == System::State::Running)
   {
     const float speed = PerformanceCounters::GetEmulationSpeed();
-    if (g_settings.display_show_fps)
+    if (g_gpu_settings.display_show_fps)
       text.append_format("G: {:.2f} | V: {:.2f}", PerformanceCounters::GetFPS(), PerformanceCounters::GetVPS());
-    if (g_settings.display_show_speed)
+    if (g_gpu_settings.display_show_speed)
     {
       text.append_format("{}{}%", text.empty() ? "" : " | ", static_cast<u32>(std::round(speed)));
 
@@ -377,19 +379,19 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
       DRAW_LINE(fixed_font, text, color);
     }
 
-    if (g_settings.display_show_gpu_stats)
+    if (g_gpu_settings.display_show_gpu_stats)
     {
-      g_gpu->GetStatsString(text);
+      gpu->GetStatsString(text);
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
 
-      g_gpu->GetMemoryStatsString(text);
+      gpu->GetMemoryStatsString(text);
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
     }
 
-    if (g_settings.display_show_resolution)
+    if (g_gpu_settings.display_show_resolution)
     {
-      const u32 resolution_scale = g_gpu->GetResolutionScale();
-      const auto [display_width, display_height] = g_gpu->GetFullDisplayResolution();
+      const u32 resolution_scale = gpu->GetResolutionScale();
+      const auto [display_width, display_height] = gpu->GetFullDisplayResolution();
       const bool interlaced = g_gpu->IsInterlacedDisplayEnabled();
       const bool pal = g_gpu->IsInPALMode();
       text.format("{}x{} {} {} [{}x]", display_width * resolution_scale, display_height * resolution_scale,
@@ -397,13 +399,13 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
     }
 
-    if (g_settings.display_show_latency_stats)
+    if (g_gpu_settings.display_show_latency_stats)
     {
       System::FormatLatencyStats(text);
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
     }
 
-    if (g_settings.display_show_cpu_usage)
+    if (g_gpu_settings.display_show_cpu_usage)
     {
       text.format("{:.2f}ms | {:.2f}ms | {:.2f}ms", PerformanceCounters::GetMinimumFrameTime(),
                   PerformanceCounters::GetAverageFrameTime(), PerformanceCounters::GetMaximumFrameTime());
@@ -459,11 +461,11 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
                           PerformanceCounters::GetCPUThreadAverageTime());
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
 
-      if (g_gpu->GetSWThread())
+      if (g_gpu_settings.gpu_use_thread)
       {
-        text.assign("SW: ");
-        FormatProcessorStat(text, PerformanceCounters::GetSWThreadUsage(),
-                            PerformanceCounters::GetSWThreadAverageTime());
+        text.assign("RNDR: ");
+        FormatProcessorStat(text, PerformanceCounters::GetGPUThreadUsage(),
+                            PerformanceCounters::GetGPUThreadAverageTime());
         DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
       }
 
@@ -477,14 +479,14 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
 #endif
     }
 
-    if (g_settings.display_show_gpu_usage && g_gpu_device->IsGPUTimingEnabled())
+    if (g_gpu_settings.display_show_gpu_usage && g_gpu_device->IsGPUTimingEnabled())
     {
       text.assign("GPU: ");
       FormatProcessorStat(text, PerformanceCounters::GetGPUUsage(), PerformanceCounters::GetGPUAverageTime());
       DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
     }
 
-    if (g_settings.display_show_status_indicators)
+    if (g_gpu_settings.display_show_status_indicators)
     {
       const bool rewinding = System::IsRewinding();
       if (rewinding || System::IsFastForwardEnabled() || System::IsTurboEnabled())
@@ -494,7 +496,7 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
       }
     }
   }
-  else if (g_settings.display_show_status_indicators && state == System::State::Paused &&
+  else if (g_gpu_settings.display_show_status_indicators && state == System::State::Paused &&
            !FullscreenUI::HasActiveWindow())
   {
     text.assign(ICON_EMOJI_PAUSE);
@@ -504,12 +506,12 @@ void ImGuiManager::DrawPerformanceOverlay(float& position_y, float scale, float
 #undef DRAW_LINE
 }
 
-void ImGuiManager::DrawEnhancementsOverlay()
+void ImGuiManager::DrawEnhancementsOverlay(const GPUBackend* gpu)
 {
   LargeString text;
   text.append_format("{} {}-{}", Settings::GetConsoleRegionName(System::GetRegion()),
                      GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()),
-                     g_gpu->IsHardwareRenderer() ? "HW" : "SW");
+                     gpu->IsHardwareRenderer() ? "HW" : "SW");
 
   if (g_settings.rewind_enable)
     text.append_format(" RW={}/{}", g_settings.rewind_save_frequency, g_settings.rewind_save_slots);
@@ -953,7 +955,10 @@ void SaveStateSelectorUI::ClearList()
   for (ListEntry& li : s_slots)
   {
     if (li.preview_texture)
-      g_gpu_device->RecycleTexture(std::move(li.preview_texture));
+    {
+      GPUThread::RunOnThread(
+        [tex = li.preview_texture.release()]() { g_gpu_device->RecycleTexture(std::unique_ptr<GPUTexture>(tex)); });
+    }
   }
   s_slots.clear();
 }
@@ -1273,7 +1278,7 @@ void SaveStateSelectorUI::LoadCurrentSlot()
     }
   }
 
-  Close();
+  GPUThread::RunOnThread(&Close);
 }
 
 void SaveStateSelectorUI::SaveCurrentSlot()
@@ -1290,7 +1295,7 @@ void SaveStateSelectorUI::SaveCurrentSlot()
     }
   }
 
-  Close();
+  GPUThread::RunOnThread(&Close);
 }
 
 void SaveStateSelectorUI::ShowSlotOSDMessage()
@@ -1314,7 +1319,7 @@ void SaveStateSelectorUI::ShowSlotOSDMessage()
 void ImGuiManager::RenderOverlayWindows()
 {
   const System::State state = System::GetState();
-  if (state != System::State::Shutdown)
+  if (state == System::State::Paused || state == System::State::Running)
   {
     if (SaveStateSelectorUI::s_open)
       SaveStateSelectorUI::Draw();
diff --git a/src/core/imgui_overlays.h b/src/core/imgui_overlays.h
index 7c9f26540..dee60e426 100644
--- a/src/core/imgui_overlays.h
+++ b/src/core/imgui_overlays.h
@@ -7,8 +7,10 @@
 
 #include <string>
 
+class GPUBackend;
+
 namespace ImGuiManager {
-void RenderTextOverlays();
+void RenderTextOverlays(const GPUBackend* gpu);
 void RenderDebugWindows();
 bool UpdateDebugWindowConfig();
 void DestroyAllDebugWindows();
diff --git a/src/core/performance_counters.cpp b/src/core/performance_counters.cpp
index 5a6ad2a20..4c68f1935 100644
--- a/src/core/performance_counters.cpp
+++ b/src/core/performance_counters.cpp
@@ -3,6 +3,8 @@
 
 #include "performance_counters.h"
 #include "gpu.h"
+#include "gpu_backend.h"
+#include "gpu_thread.h"
 #include "system.h"
 #include "system_private.h"
 
@@ -45,9 +47,9 @@ struct State
   float cpu_thread_usage;
   float cpu_thread_time;
 
-  u64 last_sw_time;
-  float sw_thread_usage;
-  float sw_thread_time;
+  u64 last_gpu_thread_time;
+  float gpu_thread_usage;
+  float gpu_thread_time;
 
   float average_gpu_time;
   float accumulated_gpu_time;
@@ -105,14 +107,14 @@ float PerformanceCounters::GetCPUThreadAverageTime()
   return s_state.cpu_thread_time;
 }
 
-float PerformanceCounters::GetSWThreadUsage()
+float PerformanceCounters::GetGPUThreadUsage()
 {
-  return s_state.sw_thread_usage;
+  return s_state.gpu_thread_usage;
 }
 
-float PerformanceCounters::GetSWThreadAverageTime()
+float PerformanceCounters::GetGPUThreadAverageTime()
 {
-  return s_state.sw_thread_time;
+  return s_state.gpu_thread_time;
 }
 
 float PerformanceCounters::GetGPUUsage()
@@ -150,17 +152,16 @@ void PerformanceCounters::Reset()
   s_state.last_frame_number = System::GetFrameNumber();
   s_state.last_internal_frame_number = System::GetInternalFrameNumber();
   s_state.last_cpu_time = System::GetCPUThreadHandle().GetCPUTime();
-  if (const Threading::Thread* sw_thread = g_gpu->GetSWThread(); sw_thread)
-    s_state.last_sw_time = sw_thread->GetCPUTime();
-  else
-    s_state.last_sw_time = 0;
+  s_state.last_gpu_thread_time = GPUThread::Internal::GetThreadHandle().GetCPUTime();
 
   s_state.average_frame_time_accumulator = 0.0f;
   s_state.minimum_frame_time_accumulator = 0.0f;
   s_state.maximum_frame_time_accumulator = 0.0f;
+
+  std::atomic_thread_fence(std::memory_order_release);
 }
 
-void PerformanceCounters::Update(u32 frame_number, u32 internal_frame_number)
+void PerformanceCounters::Update(GPUBackend* gpu, u32 frame_number, u32 internal_frame_number)
 {
   const Common::Timer::Value now_ticks = Common::Timer::GetCurrentValue();
 
@@ -177,7 +178,7 @@ void PerformanceCounters::Update(u32 frame_number, u32 internal_frame_number)
   // update fps counter
   const Common::Timer::Value ticks_diff = now_ticks - s_state.last_update_time;
   const float time = static_cast<float>(Common::Timer::ConvertValueToSeconds(ticks_diff));
-  if (time < PERFORMANCE_COUNTER_UPDATE_INTERVAL)
+  if (time < PERFORMANCE_COUNTER_UPDATE_INTERVAL || s_state.last_frame_number == frame_number)
     return;
 
   s_state.last_update_time = now_ticks;
@@ -202,18 +203,17 @@ void PerformanceCounters::Update(u32 frame_number, u32 internal_frame_number)
   s_state.fps = static_cast<float>(internal_frames_run) / time;
   s_state.speed = (s_state.vps / System::GetVideoFrameRate()) * 100.0f;
 
-  const Threading::Thread* sw_thread = g_gpu->GetSWThread();
   const u64 cpu_time = System::GetCPUThreadHandle().GetCPUTime();
-  const u64 sw_time = sw_thread ? sw_thread->GetCPUTime() : 0;
+  const u64 gpu_thread_time = GPUThread::Internal::GetThreadHandle().GetCPUTime();
   const u64 cpu_delta = cpu_time - s_state.last_cpu_time;
-  const u64 sw_delta = sw_time - s_state.last_sw_time;
+  const u64 gpu_thread_delta = gpu_thread_time - s_state.last_gpu_thread_time;
   s_state.last_cpu_time = cpu_time;
-  s_state.last_sw_time = sw_time;
+  s_state.last_gpu_thread_time = gpu_thread_time;
 
   s_state.cpu_thread_usage = static_cast<float>(static_cast<double>(cpu_delta) * pct_divider);
   s_state.cpu_thread_time = static_cast<float>(static_cast<double>(cpu_delta) * time_divider);
-  s_state.sw_thread_usage = static_cast<float>(static_cast<double>(sw_delta) * pct_divider);
-  s_state.sw_thread_time = static_cast<float>(static_cast<double>(sw_delta) * time_divider);
+  s_state.gpu_thread_usage = static_cast<float>(static_cast<double>(gpu_thread_delta) * pct_divider);
+  s_state.gpu_thread_time = static_cast<float>(static_cast<double>(gpu_thread_delta) * time_divider);
 
   if (MediaCapture* cap = System::GetMediaCapture())
     cap->UpdateCaptureThreadUsage(pct_divider, time_divider);
@@ -228,13 +228,13 @@ void PerformanceCounters::Update(u32 frame_number, u32 internal_frame_number)
   s_state.presents_since_last_update = 0;
 
   if (g_settings.display_show_gpu_stats)
-    g_gpu->UpdateStatistics(frames_run);
+    gpu->UpdateStatistics(frames_run);
 
-  VERBOSE_LOG("FPS: {:.2f} VPS: {:.2f} CPU: {:.2f} GPU: {:.2f} Avg: {:.2f}ms Min: {:.2f}ms Max: {:.2f}ms",
-              s_state.fps, s_state.vps, s_state.cpu_thread_usage, s_state.gpu_usage, s_state.average_frame_time,
-              s_state.minimum_frame_time, s_state.maximum_frame_time);
+  VERBOSE_LOG("FPS: {:.2f} VPS: {:.2f} CPU: {:.2f} RNDR: {:.2f} GPU: {:.2f} Avg: {:.2f}ms Min: {:.2f}ms Max: {:.2f}ms",
+              s_state.fps, s_state.vps, s_state.cpu_thread_usage, s_state.gpu_thread_usage, s_state.gpu_usage,
+              s_state.average_frame_time, s_state.minimum_frame_time, s_state.maximum_frame_time);
 
-  Host::OnPerformanceCountersUpdated();
+  Host::OnPerformanceCountersUpdated(gpu);
 }
 
 void PerformanceCounters::AccumulateGPUTime()
diff --git a/src/core/performance_counters.h b/src/core/performance_counters.h
index db3827b5b..fbdfc86cc 100644
--- a/src/core/performance_counters.h
+++ b/src/core/performance_counters.h
@@ -5,6 +5,8 @@
 
 #include "common/types.h"
 
+class GPUBackend;
+
 namespace PerformanceCounters
 {
 static constexpr u32 NUM_FRAME_TIME_SAMPLES = 150;
@@ -18,8 +20,8 @@ float GetMinimumFrameTime();
 float GetMaximumFrameTime();
 float GetCPUThreadUsage();
 float GetCPUThreadAverageTime();
-float GetSWThreadUsage();
-float GetSWThreadAverageTime();
+float GetGPUThreadUsage();
+float GetGPUThreadAverageTime();
 float GetGPUUsage();
 float GetGPUAverageTime();
 const FrameTimeHistory& GetFrameTimeHistory();
@@ -27,7 +29,7 @@ u32 GetFrameTimeHistoryPos();
 
 void Clear();
 void Reset();
-void Update(u32 frame_number, u32 internal_frame_number);
+void Update(GPUBackend* gpu, u32 frame_number, u32 internal_frame_number);
 void AccumulateGPUTime();
 
 } // namespace Host
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index 401525463..1b2b9df02 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -28,7 +28,8 @@
 
 LOG_CHANNEL(Settings);
 
-Settings g_settings;
+ALIGN_TO_CACHE_LINE Settings g_settings;
+ALIGN_TO_CACHE_LINE Settings g_gpu_settings;
 
 const char* SettingInfo::StringDefaultValue() const
 {
@@ -204,6 +205,7 @@ void Settings::Load(const SettingsInterface& si, const SettingsInterface& contro
   gpu_disable_raster_order_views = si.GetBoolValue("GPU", "DisableRasterOrderViews", false);
   gpu_per_sample_shading = si.GetBoolValue("GPU", "PerSampleShading", false);
   gpu_use_thread = si.GetBoolValue("GPU", "UseThread", true);
+  gpu_max_queued_frames = static_cast<u8>(si.GetUIntValue("GPU", "MaxQueuedFrames", DEFAULT_GPU_MAX_QUEUED_FRAMES));
   gpu_use_software_renderer_for_readbacks = si.GetBoolValue("GPU", "UseSoftwareRendererForReadbacks", false);
   gpu_true_color = si.GetBoolValue("GPU", "TrueColor", true);
   gpu_scaled_dithering = si.GetBoolValue("GPU", "ScaledDithering", true);
@@ -533,6 +535,7 @@ void Settings::Save(SettingsInterface& si, bool ignore_base) const
   }
 
   si.SetBoolValue("GPU", "PerSampleShading", gpu_per_sample_shading);
+  si.SetUIntValue("GPU", "MaxQueuedFrames", gpu_max_queued_frames);
   si.SetBoolValue("GPU", "UseThread", gpu_use_thread);
   si.SetBoolValue("GPU", "UseSoftwareRendererForReadbacks", gpu_use_software_renderer_for_readbacks);
   si.SetBoolValue("GPU", "TrueColor", gpu_true_color);
diff --git a/src/core/settings.h b/src/core/settings.h
index d30b40767..08cabf14f 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -98,6 +98,7 @@ struct Settings
   std::string gpu_adapter;
   u8 gpu_resolution_scale = 1;
   u8 gpu_multisamples = 1;
+  u8 gpu_max_queued_frames = 2;
   bool gpu_use_thread : 1 = true;
   bool gpu_use_software_renderer_for_readbacks : 1 = false;
   bool gpu_use_debug_device : 1 = false;
@@ -486,6 +487,8 @@ struct Settings
   static constexpr ConsoleRegion DEFAULT_CONSOLE_REGION = ConsoleRegion::Auto;
   static constexpr float DEFAULT_GPU_PGXP_DEPTH_THRESHOLD = 300.0f;
   static constexpr float GPU_PGXP_DEPTH_THRESHOLD_SCALE = 4096.0f;
+  static constexpr u8 DEFAULT_GPU_MAX_QUEUED_FRAMES = 2; // TODO: Maybe lower? But that means fast CPU threads would
+                                                         // always stall, could be a problem for power management.
 
   // Prefer oldrec over newrec for now. Except on RISC-V, where there is no oldrec.
 #if defined(CPU_ARCH_RISCV64)
@@ -552,7 +555,9 @@ struct Settings
 #endif
 };
 
-extern Settings g_settings;
+// TODO: Use a smaller settings struct for the GPU thread copy.
+ALIGN_TO_CACHE_LINE extern Settings g_settings;     // CPU thread copy.
+ALIGN_TO_CACHE_LINE extern Settings g_gpu_settings; // GPU thread copy.
 
 namespace EmuFolders {
 extern std::string AppRoot;
diff --git a/src/core/system.cpp b/src/core/system.cpp
index f522add4e..c6f4319a2 100644
--- a/src/core/system.cpp
+++ b/src/core/system.cpp
@@ -16,8 +16,10 @@
 #include "game_database.h"
 #include "game_list.h"
 #include "gpu.h"
+#include "gpu_backend.h"
 #include "gpu_dump.h"
 #include "gpu_hw_texture_cache.h"
+#include "gpu_thread.h"
 #include "gte.h"
 #include "host.h"
 #include "host_interface_progress_callback.h"
@@ -167,11 +169,10 @@ static void ClearRunningGame();
 static void DestroySystem();
 static void JoinTaskThreads();
 
-static bool CreateGPU(GPURenderer renderer, bool is_switching, bool fullscreen, Error* error);
-static bool RecreateGPU(GPURenderer renderer, bool force_recreate_device = false, bool update_display = true);
-static void HandleHostGPUDeviceLost();
-static void HandleExclusiveFullscreenLost();
+static void RecreateGPU(GPURenderer renderer, bool force_recreate_device = false, bool update_display = true);
+static void SetGTEAspectRatioFromRenderWindow();
 static std::string GetScreenshotPath(const char* extension);
+static void StopMediaCapture(std::unique_ptr<MediaCapture> cap);
 
 /// Returns true if boot is being fast forwarded.
 static bool IsFastForwardingBoot();
@@ -181,7 +182,7 @@ static void UpdateThrottlePeriod();
 static void ResetThrottler();
 
 /// Throttles the system, i.e. sleeps until it's time to execute the next frame.
-static void Throttle(Common::Timer::Value current_time);
+static void Throttle(Common::Timer::Value current_time, Common::Timer::Value sleep_until);
 static void AccumulatePreFrameSleepTime(Common::Timer::Value current_time);
 static void UpdateDisplayVSync();
 
@@ -304,7 +305,6 @@ struct ALIGN_TO_CACHE_LINE StateVars
   GameHash running_game_hash;
   bool running_game_custom_title = false;
 
-  bool keep_gpu_device_on_shutdown = false;
   std::atomic_bool startup_cancelled{false};
 
   bool rewinding_first_save = false;
@@ -517,6 +517,8 @@ bool System::CPUThreadInitialize(Error* error)
 
   LogStartupInformation();
 
+  GPUThread::Internal::ProcessStartup();
+
   if (g_settings.achievements_enabled)
     Achievements::Initialize();
 
@@ -569,16 +571,6 @@ System::State System::GetState()
   return s_state.state;
 }
 
-void System::SetState(State new_state)
-{
-  if (s_state.state == new_state)
-    return;
-
-  Assert(s_state.state == State::Paused || s_state.state == State::Running);
-  Assert(new_state == State::Paused || new_state == State::Running);
-  s_state.state = new_state;
-}
-
 bool System::IsRunning()
 {
   return s_state.state == State::Running;
@@ -1186,120 +1178,17 @@ std::string System::GetInputProfilePath(std::string_view name)
   return Path::Combine(EmuFolders::InputProfiles, fmt::format("{}.ini", name));
 }
 
-bool System::RecreateGPU(GPURenderer renderer, bool force_recreate_device, bool update_display /* = true*/)
+void System::RecreateGPU(GPURenderer renderer, bool force_recreate_device, bool update_display /* = true*/)
 {
   ClearMemorySaveStates();
-  g_gpu->RestoreDeviceContext();
-
-  // save current state
-  DynamicHeapArray<u8> state_data(GetMaxSaveStateSize());
-  {
-    StateWrapper sw(state_data.span(), StateWrapper::Mode::Write, SAVE_STATE_VERSION);
-    if (!g_gpu->DoState(sw, nullptr, false) || !TimingEvents::DoState(sw))
-    {
-      ERROR_LOG("Failed to save old GPU state when switching renderers");
-      state_data.deallocate();
-    }
-  }
-
-  // create new renderer
-  g_gpu.reset();
-  if (force_recreate_device)
-  {
-    PostProcessing::Shutdown();
-    Host::ReleaseGPUDevice();
-    Host::ReleaseRenderWindow();
-  }
+  StopMediaCapture();
 
   Error error;
-  if (!CreateGPU(renderer, true, Host::IsFullscreen(), &error))
+  if (!GPUThread::CreateGPUBackend(renderer, true, false, force_recreate_device, &error))
   {
-    if (!IsStartupCancelled())
-      Host::ReportErrorAsync("Error", error.GetDescription());
-
-    DestroySystem();
-    return false;
+    ERROR_LOG("Failed to switch to {} renderer: {}", Settings::GetRendererName(renderer), error.GetDescription());
+    Panic("Failed to switch renderer.");
   }
-
-  if (!state_data.empty())
-  {
-    StateWrapper sw(state_data.span(), StateWrapper::Mode::Read, SAVE_STATE_VERSION);
-    g_gpu->RestoreDeviceContext();
-    g_gpu->DoState(sw, nullptr, update_display);
-    TimingEvents::DoState(sw);
-  }
-
-  if (force_recreate_device)
-  {
-    ImGuiManager::UpdateDebugWindowConfig();
-    InvalidateDisplay();
-  }
-
-  // fix up vsync etc
-  UpdateSpeedLimiterState();
-  return true;
-}
-
-void System::HandleHostGPUDeviceLost()
-{
-  static Common::Timer::Value s_last_gpu_reset_time = 0;
-  static constexpr float MIN_TIME_BETWEEN_RESETS = 15.0f;
-
-  // If we're constantly crashing on something in particular, we don't want to end up in an
-  // endless reset loop.. that'd probably end up leaking memory and/or crashing us for other
-  // reasons. So just abort in such case.
-  const Common::Timer::Value current_time = Common::Timer::GetCurrentValue();
-  if (s_last_gpu_reset_time != 0 &&
-      Common::Timer::ConvertValueToSeconds(current_time - s_last_gpu_reset_time) < MIN_TIME_BETWEEN_RESETS)
-  {
-    Panic("Host GPU lost too many times, device is probably completely wedged.");
-  }
-  s_last_gpu_reset_time = current_time;
-
-  if (g_gpu)
-  {
-    // Little bit janky, but because the device is lost, the VRAM readback is going to give us garbage.
-    // So back up what we have, it's probably missing bits, but whatever...
-    DynamicHeapArray<u8> vram_backup(VRAM_SIZE);
-    std::memcpy(vram_backup.data(), g_vram, VRAM_SIZE);
-
-    // Device lost, something went really bad.
-    // Let's just toss out everything, and try to hobble on.
-    if (!RecreateGPU(g_gpu->IsHardwareRenderer() ? g_settings.gpu_renderer : GPURenderer::Software, true, false))
-    {
-      Panic("Failed to recreate GPU device after loss.");
-      return;
-    }
-
-    // Restore backed-up VRAM.
-    std::memcpy(g_vram, vram_backup.data(), VRAM_SIZE);
-  }
-  else
-  {
-    // Only big picture mode was running.
-    const bool fsui_running = FullscreenUI::IsInitialized();
-    const bool fullscreen = Host::IsFullscreen();
-    const RenderAPI api = g_gpu_device->GetRenderAPI();
-    Host::ReleaseGPUDevice();
-    Host::ReleaseRenderWindow();
-    if (!Host::CreateGPUDevice(api, fullscreen, nullptr) || (fsui_running && !FullscreenUI::Initialize()))
-    {
-      Panic("Failed to recreate GPU device after loss.");
-      return;
-    }
-  }
-
-  // First frame after reopening is definitely going to be trash, so skip it.
-  Host::AddIconOSDWarning(
-    "HostGPUDeviceLost", ICON_EMOJI_WARNING,
-    TRANSLATE_STR("System", "Host GPU device encountered an error and has recovered. This may cause broken rendering."),
-    Host::OSD_CRITICAL_ERROR_DURATION);
-}
-
-void System::HandleExclusiveFullscreenLost()
-{
-  WARNING_LOG("Lost exclusive fullscreen.");
-  Host::SetFullscreen(false);
 }
 
 void System::LoadSettings(bool display_osd_messages)
@@ -1579,16 +1468,12 @@ void System::PauseSystem(bool paused)
   if (paused == IsPaused() || !IsValid())
     return;
 
-  SetState(paused ? State::Paused : State::Running);
+  s_state.state = (paused ? State::Paused : State::Running);
+  std::atomic_thread_fence(std::memory_order_release);
   SPU::GetOutputStream()->SetPaused(paused);
 
   if (paused)
   {
-    // Make sure the GPU is flushed, otherwise the VB might still be mapped.
-    g_gpu->FlushRender();
-
-    FullscreenUI::OnSystemPaused();
-
     InputManager::PauseVibration();
     InputManager::UpdateHostMouseMode();
 
@@ -1602,9 +1487,8 @@ void System::PauseSystem(bool paused)
 #endif
 
     Host::OnSystemPaused();
-    Host::OnIdleStateChanged();
     UpdateDisplayVSync();
-    InvalidateDisplay();
+    GPUThread::PresentCurrentFrame();
   }
   else
   {
@@ -1622,8 +1506,6 @@ void System::PauseSystem(bool paused)
 #endif
 
     Host::OnSystemResumed();
-    Host::OnIdleStateChanged();
-
     UpdateDisplayVSync();
     PerformanceCounters::Reset();
     ResetThrottler();
@@ -1660,8 +1542,8 @@ bool System::BootSystem(SystemBootParameters parameters, Error* error)
   Assert(s_state.state == State::Shutdown);
   s_state.state = State::Starting;
   s_state.startup_cancelled.store(false, std::memory_order_relaxed);
-  s_state.keep_gpu_device_on_shutdown = static_cast<bool>(g_gpu_device);
   s_state.region = g_settings.region;
+  std::atomic_thread_fence(std::memory_order_release);
   Host::OnSystemStarting();
 
   // Load CD image up and detect region.
@@ -1836,6 +1718,7 @@ bool System::BootSystem(SystemBootParameters parameters, Error* error)
 
   // Good to go.
   s_state.state = State::Running;
+  std::atomic_thread_fence(std::memory_order_release);
   SPU::GetOutputStream()->SetPaused(false);
 
   FullscreenUI::OnSystemStarted();
@@ -1851,7 +1734,6 @@ bool System::BootSystem(SystemBootParameters parameters, Error* error)
 #endif
 
   Host::OnSystemStarted();
-  Host::OnIdleStateChanged();
 
   // try to load the state, if it fails, bail out
   if (!parameters.save_state.empty() && !LoadState(parameters.save_state.c_str(), error, false))
@@ -1872,7 +1754,6 @@ bool System::BootSystem(SystemBootParameters parameters, Error* error)
     PauseSystem(true);
 
   UpdateSpeedLimiterState();
-  ImGuiManager::UpdateDebugWindowConfig();
   PerformanceCounters::Reset();
   ResetThrottler();
   return true;
@@ -1910,11 +1791,17 @@ bool System::Initialize(std::unique_ptr<CDImage> disc, DiscRegion disc_region, b
       !CDROM::InsertMedia(std::move(disc), disc_region, s_state.running_game_serial, s_state.running_game_title, error))
     return false;
 
-  if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer, false, fullscreen, error))
-    return false;
+  // TODO: Drop pointer
+  g_gpu = std::make_unique<GPU>();
+  g_gpu->Initialize();
 
-  if (GPUSwapChain* swap_chain = g_gpu_device->GetMainSwapChain())
-    GTE::UpdateAspectRatio(swap_chain->GetWidth(), swap_chain->GetHeight());
+  if (!GPUThread::CreateGPUBackend(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer, false,
+                                   fullscreen, false, error))
+  {
+    return false;
+  }
+
+  SetGTEAspectRatioFromRenderWindow();
 
   if (g_settings.gpu_pgxp_enable)
     CPU::PGXP::Initialize();
@@ -1956,8 +1843,6 @@ void System::DestroySystem()
   if (s_state.media_capture)
     StopMediaCapture();
 
-  ImGuiManager::DestroyAllDebugWindows();
-
   s_state.gpu_dump_player.reset();
 
   s_state.undo_load_state.reset();
@@ -1972,7 +1857,6 @@ void System::DestroySystem()
   PostProcessing::Shutdown();
 
   SaveStateSelectorUI::Clear();
-  FullscreenUI::OnSystemDestroyed();
 
   InputManager::PauseVibration();
   InputManager::UpdateHostMouseMode();
@@ -1999,17 +1883,7 @@ void System::DestroySystem()
   TimingEvents::Shutdown();
   GPUTextureCache::Shutdown();
   ClearRunningGame();
-
-  // Restore present-all-frames behavior.
-  if (s_state.keep_gpu_device_on_shutdown && g_gpu_device)
-  {
-    g_gpu_device->SetGPUTimingEnabled(false);
-    UpdateDisplayVSync();
-  }
-  else
-  {
-    Host::ReleaseGPUDevice();
-  }
+  GPUThread::DestroyGPUBackend();
 
   s_state.taints = 0;
   s_state.bios_hash = {};
@@ -2018,9 +1892,12 @@ void System::DestroySystem()
   s_state.boot_mode = BootMode::None;
 
   s_state.state = State::Shutdown;
+  std::atomic_thread_fence(std::memory_order_release);
+
+  // NOTE: Must come after DestroyGPUBackend(), otherwise the landing page will be displayed.
+  FullscreenUI::OnSystemDestroyed();
 
   Host::OnSystemDestroyed();
-  Host::OnIdleStateChanged();
 }
 
 void System::ClearRunningGame()
@@ -2050,8 +1927,6 @@ void System::Execute()
       {
         s_state.system_executing = true;
 
-        // TODO: Purge reset/restore
-        g_gpu->RestoreDeviceContext();
         TimingEvents::CommitLeftoverTicks();
 
         if (s_state.gpu_dump_player) [[unlikely]]
@@ -2080,9 +1955,6 @@ void System::Execute()
 
 void System::FrameDone()
 {
-  // Vertex buffer is shared, need to flush what we have.
-  g_gpu->FlushRender();
-
   // Generate any pending samples from the SPU before sleeping, this way we reduce the chances of underruns.
   // TODO: when running ahead, we can skip this (and the flush above)
   if (!IsReplayingGPUDump()) [[likely]]
@@ -2104,8 +1976,6 @@ void System::FrameDone()
     s_state.socket_multiplexer->PollEventsWithTimeout(0);
 #endif
 
-  Host::FrameDone();
-
   if (s_state.frame_step_request)
   {
     s_state.frame_step_request = false;
@@ -2135,7 +2005,6 @@ void System::FrameDone()
       // counter-acts that.
       Host::PumpMessagesOnCPUThread();
       InputManager::PollSources();
-      g_gpu->RestoreDeviceContext();
       CheckForAndExitExecution();
     }
 
@@ -2148,29 +2017,6 @@ void System::FrameDone()
     SaveRunaheadState();
   }
 
-  // Kick off media capture early, might take a while.
-  if (s_state.media_capture && s_state.media_capture->IsCapturingVideo()) [[unlikely]]
-  {
-    if (s_state.media_capture->GetVideoFPS() != s_state.video_frame_rate) [[unlikely]]
-    {
-      const std::string next_capture_path = s_state.media_capture->GetNextCapturePath();
-      INFO_LOG("Video frame rate changed, switching to new capture file {}", Path::GetFileName(next_capture_path));
-
-      const bool was_capturing_audio = s_state.media_capture->IsCapturingAudio();
-      StopMediaCapture();
-      if (StartMediaCapture(std::move(next_capture_path), true, was_capturing_audio) &&
-          !g_gpu->SendDisplayToMediaCapture(s_state.media_capture.get())) [[unlikely]]
-      {
-        StopMediaCapture();
-      }
-    }
-    else
-    {
-      if (!g_gpu->SendDisplayToMediaCapture(s_state.media_capture.get())) [[unlikely]]
-        StopMediaCapture();
-    }
-  }
-
   Common::Timer::Value current_time = Common::Timer::GetCurrentValue();
 
   // pre-frame sleep accounting (input lag reduction)
@@ -2179,55 +2025,6 @@ void System::FrameDone()
   if (s_state.pre_frame_sleep)
     AccumulatePreFrameSleepTime(current_time);
 
-  // explicit present (frame pacing)
-  const bool is_unique_frame = (s_state.last_presented_internal_frame_number != s_state.internal_frame_number);
-  s_state.last_presented_internal_frame_number = s_state.internal_frame_number;
-
-  const bool skip_this_frame =
-    (((s_state.skip_presenting_duplicate_frames && !is_unique_frame &&
-       s_state.skipped_frame_count < MAX_SKIPPED_DUPLICATE_FRAME_COUNT) ||
-      (!s_state.optimal_frame_pacing && current_time > s_state.next_frame_time &&
-       s_state.skipped_frame_count < MAX_SKIPPED_TIMEOUT_FRAME_COUNT) ||
-      (g_gpu_device->HasMainSwapChain() && g_gpu_device->GetMainSwapChain()->ShouldSkipPresentingFrame())) &&
-     !s_state.syncing_to_host_with_vsync && !IsExecutionInterrupted());
-  if (!skip_this_frame)
-  {
-    s_state.skipped_frame_count = 0;
-
-    const bool scheduled_present =
-      (s_state.optimal_frame_pacing && s_state.throttler_enabled && !IsExecutionInterrupted());
-    const GPUDevice::Features features = g_gpu_device->GetFeatures();
-    if (scheduled_present && features.timed_present)
-    {
-      PresentDisplay(false, s_state.next_frame_time);
-      Throttle(current_time);
-    }
-    else if (scheduled_present && features.explicit_present)
-    {
-      const bool do_present = PresentDisplay(true, 0);
-      Throttle(current_time);
-      if (do_present)
-        g_gpu_device->SubmitPresent(g_gpu_device->GetMainSwapChain());
-    }
-    else
-    {
-      if (scheduled_present)
-        Throttle(current_time);
-
-      PresentDisplay(false, 0);
-
-      if (!scheduled_present && s_state.throttler_enabled && !IsExecutionInterrupted())
-        Throttle(current_time);
-    }
-  }
-  else
-  {
-    DEBUG_LOG("Skipping displaying frame");
-    s_state.skipped_frame_count++;
-    if (s_state.throttler_enabled)
-      Throttle(current_time);
-  }
-
   // pre-frame sleep (input lag reduction)
   current_time = Common::Timer::GetCurrentValue();
   if (s_state.pre_frame_sleep)
@@ -2236,10 +2033,15 @@ void System::FrameDone()
     if (pre_frame_sleep_until > current_time &&
         Common::Timer::ConvertValueToMilliseconds(pre_frame_sleep_until - current_time) >= 1)
     {
-      Common::Timer::SleepUntil(pre_frame_sleep_until, true);
+      Throttle(current_time, pre_frame_sleep_until);
       current_time = Common::Timer::GetCurrentValue();
     }
   }
+  else
+  {
+    if (s_state.throttler_enabled)
+      Throttle(current_time, s_state.next_frame_time);
+  }
 
   s_state.frame_start_time = current_time;
 
@@ -2250,13 +2052,40 @@ void System::FrameDone()
     InputManager::PollSources();
     CheckForAndExitExecution();
   }
+}
 
-  g_gpu->RestoreDeviceContext();
+void System::GetFramePresentationDetails(bool* is_frame, bool* present_frame, bool* allow_present_skip,
+                                         Common::Timer::Value* present_time)
+{
+  const Common::Timer::Value current_time = Common::Timer::GetCurrentValue();
 
-  // Update perf counters *after* throttling, we want to measure from start-of-frame
-  // to start-of-frame, not end-of-frame to end-of-frame (will be noisy due to different
-  // amounts of computation happening in each frame).
-  PerformanceCounters::Update(s_state.frame_number, s_state.internal_frame_number);
+  // Frame pacing: work out whether this frame is presented or skipped; results are returned via the out-params.
+  const bool is_unique_frame = (s_state.last_presented_internal_frame_number != s_state.internal_frame_number);
+  s_state.last_presented_internal_frame_number = s_state.internal_frame_number;
+  // Duplicate and running-late skips are each bounded by a MAX_SKIPPED_* limit on consecutive skipped frames.
+  const bool is_duplicate_frame = (s_state.skip_presenting_duplicate_frames && !is_unique_frame &&
+                                   s_state.skipped_frame_count < MAX_SKIPPED_DUPLICATE_FRAME_COUNT);
+  const bool skip_this_frame =
+    ((is_duplicate_frame || (!s_state.optimal_frame_pacing && current_time > s_state.next_frame_time &&
+                             s_state.skipped_frame_count < MAX_SKIPPED_TIMEOUT_FRAME_COUNT)) &&
+     !s_state.syncing_to_host_with_vsync && !IsExecutionInterrupted());
+  const bool should_allow_present_skip = !s_state.syncing_to_host_with_vsync && !s_state.optimal_frame_pacing;
+  *is_frame = !is_duplicate_frame; // false only for a duplicate frame that is still within its skip limit
+  *present_frame = !skip_this_frame;
+  *allow_present_skip = should_allow_present_skip;
+  *present_time = (s_state.optimal_frame_pacing && s_state.throttler_enabled && !IsExecutionInterrupted()) ?
+                    s_state.next_frame_time :
+                    0; // otherwise 0 -- NOTE(review): presumably means "no timed present"; confirm with caller
+  // Maintain the consecutive-skip counter consulted by the MAX_SKIPPED_* checks above.
+  if (!skip_this_frame)
+  {
+    s_state.skipped_frame_count = 0;
+  }
+  else
+  {
+    DEBUG_LOG("Skipping displaying frame");
+    s_state.skipped_frame_count++;
+  }
 }
 
 float System::GetVideoFrameRate()
@@ -2296,12 +2125,12 @@ void System::ResetThrottler()
   s_state.pre_frame_sleep_time = 0;
 }
 
-void System::Throttle(Common::Timer::Value current_time)
+void System::Throttle(Common::Timer::Value current_time, Common::Timer::Value sleep_until)
 {
   // If we're running too slow, advance the next frame time based on the time we lost. Effectively skips
   // running those frames at the intended time, because otherwise if we pause in the debugger, we'll run
   // hundreds of frames when we resume.
-  if (current_time > s_state.next_frame_time)
+  if (current_time > sleep_until)
   {
     const Common::Timer::Value diff = static_cast<s64>(current_time) - static_cast<s64>(s_state.next_frame_time);
     s_state.next_frame_time += (diff / s_state.frame_period) * s_state.frame_period + s_state.frame_period;
@@ -2316,11 +2145,10 @@ void System::Throttle(Common::Timer::Value current_time)
     Common::Timer::Value poll_start_time = current_time;
     for (;;)
     {
-      const u32 sleep_ms =
-        static_cast<u32>(Common::Timer::ConvertValueToMilliseconds(s_state.next_frame_time - poll_start_time));
+      const u32 sleep_ms = static_cast<u32>(Common::Timer::ConvertValueToMilliseconds(sleep_until - poll_start_time));
       s_state.socket_multiplexer->PollEventsWithTimeout(sleep_ms);
       poll_start_time = Common::Timer::GetCurrentValue();
-      if (poll_start_time >= s_state.next_frame_time || (!g_settings.display_optimal_frame_pacing && sleep_ms == 0))
+      if (poll_start_time >= sleep_until || (!g_settings.display_optimal_frame_pacing && sleep_ms == 0))
         break;
     }
   }
@@ -2329,14 +2157,14 @@ void System::Throttle(Common::Timer::Value current_time)
     // Use a spinwait if we undersleep for all platforms except android.. don't want to burn battery.
     // Linux also seems to do a much better job of waking up at the requested time.
 #if !defined(__linux__)
-    Common::Timer::SleepUntil(s_state.next_frame_time, g_settings.display_optimal_frame_pacing);
+    Common::Timer::SleepUntil(sleep_until, g_settings.display_optimal_frame_pacing);
 #else
-    Common::Timer::SleepUntil(s_state.next_frame_time, false);
+    Common::Timer::SleepUntil(sleep_until, false);
 #endif
   }
 #else
   // No spinwait on Android, see above.
-  Common::Timer::SleepUntil(s_state.next_frame_time, false);
+  Common::Timer::SleepUntil(sleep_until, false);
 #endif
 
 #if 0
@@ -2380,65 +2208,6 @@ void System::IncrementInternalFrameNumber()
   s_state.internal_frame_number++;
 }
 
-bool System::CreateGPU(GPURenderer renderer, bool is_switching, bool fullscreen, Error* error)
-{
-  const RenderAPI api = Settings::GetRenderAPIForRenderer(renderer);
-
-  if (!g_gpu_device ||
-      (renderer != GPURenderer::Software && !GPUDevice::IsSameRenderAPI(g_gpu_device->GetRenderAPI(), api)))
-  {
-    if (g_gpu_device)
-    {
-      WARNING_LOG("Recreating GPU device, expecting {} got {}", GPUDevice::RenderAPIToString(api),
-                  GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()));
-      PostProcessing::Shutdown();
-    }
-
-    Host::ReleaseGPUDevice();
-    if (!Host::CreateGPUDevice(api, fullscreen, error))
-    {
-      Host::ReleaseRenderWindow();
-      return false;
-    }
-
-    if (is_switching)
-      PostProcessing::Initialize();
-  }
-
-  if (renderer == GPURenderer::Software)
-    g_gpu = GPU::CreateSoftwareRenderer(error);
-  else
-    g_gpu = GPU::CreateHardwareRenderer(error);
-
-  if (!g_gpu)
-  {
-    ERROR_LOG("Failed to initialize {} renderer, falling back to software renderer",
-              Settings::GetRendererName(renderer));
-    Host::AddOSDMessage(
-      fmt::format(TRANSLATE_FS("System", "Failed to initialize {} renderer, falling back to software renderer."),
-                  Settings::GetRendererName(renderer)),
-      Host::OSD_CRITICAL_ERROR_DURATION);
-    g_gpu.reset();
-    g_gpu = GPU::CreateSoftwareRenderer(error);
-    if (!g_gpu)
-    {
-      ERROR_LOG("Failed to create fallback software renderer.");
-      if (!s_state.keep_gpu_device_on_shutdown)
-      {
-        PostProcessing::Shutdown();
-        Host::ReleaseGPUDevice();
-        Host::ReleaseRenderWindow();
-      }
-      return false;
-    }
-  }
-
-  if (g_settings.display_show_gpu_usage)
-    g_gpu_device->SetGPUTimingEnabled(true);
-
-  return true;
-}
-
 bool System::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display, bool is_memory_state)
 {
   if (!sw.DoMarker("System"))
@@ -2504,8 +2273,7 @@ bool System::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
   if (!sw.DoMarker("InterruptController") || !InterruptController::DoState(sw))
     return false;
 
-  g_gpu->RestoreDeviceContext();
-  if (!sw.DoMarker("GPU") || !g_gpu->DoState(sw, host_texture, update_display))
+  if (!sw.DoMarker("GPU") || !g_gpu->DoState(sw, update_display))
     return false;
 
   if (!sw.DoMarker("CDROM") || !CDROM::DoState(sw))
@@ -2811,7 +2579,7 @@ bool System::LoadStateFromBuffer(const SaveStateBuffer& buffer, Error* error, bo
   ResetThrottler();
 
   if (update_display_if_paused && IsPaused())
-    InvalidateDisplay();
+    GPUThread::PresentCurrentFrame();
 
   return true;
 }
@@ -3065,23 +2833,14 @@ bool System::SaveStateToBuffer(SaveStateBuffer* buffer, Error* error, u32 screen
   // save screenshot
   if (screenshot_size > 0)
   {
-    // assume this size is the width
-    GSVector4i screenshot_display_rect, screenshot_draw_rect;
-    g_gpu->CalculateDrawRect(screenshot_size, screenshot_size, true, true, &screenshot_display_rect,
-                             &screenshot_draw_rect);
-
-    const u32 screenshot_width = static_cast<u32>(screenshot_display_rect.width());
-    const u32 screenshot_height = static_cast<u32>(screenshot_display_rect.height());
-    screenshot_draw_rect = screenshot_draw_rect.sub32(screenshot_display_rect.xyxy());
-    screenshot_display_rect = screenshot_display_rect.sub32(screenshot_display_rect.xyxy());
-    VERBOSE_LOG("Saving {}x{} screenshot for state", screenshot_width, screenshot_height);
-
+    u32 screenshot_width;
+    u32 screenshot_height;
     std::vector<u32> screenshot_buffer;
     u32 screenshot_stride;
     GPUTexture::Format screenshot_format;
-    if (g_gpu->RenderScreenshotToBuffer(screenshot_width, screenshot_height, screenshot_display_rect,
-                                        screenshot_draw_rect, false, &screenshot_buffer, &screenshot_stride,
-                                        &screenshot_format) &&
+    if (GPUBackend::RenderScreenshotToBuffer(screenshot_size, screenshot_size, false, &screenshot_width,
+                                             &screenshot_height, &screenshot_buffer, &screenshot_stride,
+                                             &screenshot_format) &&
         GPUTexture::ConvertTextureDataToRGBA8(screenshot_width, screenshot_height, screenshot_buffer, screenshot_stride,
                                               screenshot_format))
     {
@@ -3103,8 +2862,8 @@ bool System::SaveStateToBuffer(SaveStateBuffer* buffer, Error* error, u32 screen
     }
     else
     {
-      WARNING_LOG("Failed to save {}x{} screenshot for save state due to render/conversion failure", screenshot_width,
-                  screenshot_height);
+      WARNING_LOG("Failed to save {}x{} screenshot for save state due to render/conversion failure", screenshot_size,
+                  screenshot_size);
     }
   }
 
@@ -3112,7 +2871,6 @@ bool System::SaveStateToBuffer(SaveStateBuffer* buffer, Error* error, u32 screen
   if (buffer->state_data.empty())
     buffer->state_data.resize(GetMaxSaveStateSize());
 
-  g_gpu->RestoreDeviceContext();
   StateWrapper sw(buffer->state_data.span(), StateWrapper::Mode::Write, SAVE_STATE_VERSION);
   if (!DoState(sw, nullptr, false, false))
   {
@@ -3350,10 +3108,9 @@ void System::UpdateSpeedLimiterState()
   s_state.syncing_to_host = false;
   s_state.syncing_to_host_with_vsync = false;
 
-  if (g_settings.sync_to_host_refresh_rate && g_gpu_device->HasMainSwapChain())
+  if (g_settings.sync_to_host_refresh_rate)
   {
-    const float host_refresh_rate = g_gpu_device->GetMainSwapChain()->GetWindowInfo().surface_refresh_rate;
-    if (host_refresh_rate > 0.0f)
+    if (const float host_refresh_rate = GPUThread::GetRenderWindowInfo().surface_refresh_rate; host_refresh_rate > 0.0f)
     {
       const float ratio = host_refresh_rate / s_state.video_frame_rate;
       s_state.can_sync_to_host = (ratio >= 0.95f && ratio <= 1.05f);
@@ -3405,32 +3162,15 @@ void System::UpdateSpeedLimiterState()
 
 void System::UpdateDisplayVSync()
 {
-  static constexpr std::array<const char*, static_cast<size_t>(GPUVSyncMode::Count)> vsync_modes = {{
-    "Disabled",
-    "FIFO",
-    "Mailbox",
-  }};
-
   // Avoid flipping vsync on and off by manually throttling when vsync is on.
   const GPUVSyncMode vsync_mode = GetEffectiveVSyncMode();
   const bool allow_present_throttle = ShouldAllowPresentThrottle();
-  if (!g_gpu_device->HasMainSwapChain() ||
-      (g_gpu_device->GetMainSwapChain()->GetVSyncMode() == vsync_mode &&
-       g_gpu_device->GetMainSwapChain()->IsPresentThrottleAllowed() == allow_present_throttle))
-  {
-    return;
-  }
 
-  VERBOSE_LOG("VSync: {}{}{}", vsync_modes[static_cast<size_t>(vsync_mode)],
+  VERBOSE_LOG("VSync: {}{}{}", GPUDevice::VSyncModeToString(vsync_mode),
               s_state.syncing_to_host_with_vsync ? " (for throttling)" : "",
               allow_present_throttle ? " (present throttle allowed)" : "");
 
-  Error error;
-  if (!g_gpu_device->GetMainSwapChain()->SetVSyncMode(vsync_mode, allow_present_throttle, &error))
-  {
-    ERROR_LOG("Failed to update vsync mode to {}: {}", vsync_modes[static_cast<size_t>(vsync_mode)],
-              error.GetDescription());
-  }
+  GPUThread::SetVSync(vsync_mode, allow_present_throttle);
 }
 
 GPUVSyncMode System::GetEffectiveVSyncMode()
@@ -3881,7 +3621,6 @@ bool System::DumpVRAM(const char* filename)
   if (!IsValid())
     return false;
 
-  g_gpu->RestoreDeviceContext();
   return g_gpu->DumpVRAMToFile(filename);
 }
 
@@ -4057,10 +3796,11 @@ void System::UpdateRunningGame(const std::string_view path, CDImage* image, bool
   if (s_state.running_game_serial != prev_serial)
     UpdateSessionTime(prev_serial);
 
+  // TODO GPU-THREAD: Racy, IsOpen() is checked here but RefreshList() runs via GPUThread::RunOnThread().
   if (SaveStateSelectorUI::IsOpen())
-    SaveStateSelectorUI::RefreshList(s_state.running_game_serial);
-  else
-    SaveStateSelectorUI::ClearList();
+  {
+    GPUThread::RunOnThread([serial = s_state.running_game_serial]() { SaveStateSelectorUI::RefreshList(serial); });
+  }
 
   UpdateRichPresence(booting);
 
@@ -4316,7 +4056,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
     if (g_settings.gpu_resolution_scale != old_settings.gpu_resolution_scale ||
         g_settings.gpu_multisamples != old_settings.gpu_multisamples ||
         g_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading ||
-        g_settings.gpu_use_thread != old_settings.gpu_use_thread ||
+        g_settings.gpu_max_queued_frames != old_settings.gpu_max_queued_frames ||
         g_settings.gpu_use_software_renderer_for_readbacks != old_settings.gpu_use_software_renderer_for_readbacks ||
         g_settings.gpu_fifo_size != old_settings.gpu_fifo_size ||
         g_settings.gpu_max_run_ahead != old_settings.gpu_max_run_ahead ||
@@ -4355,9 +4095,14 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
         g_settings.texture_replacements.dump_textures != old_settings.texture_replacements.dump_textures ||
         g_settings.texture_replacements.config != old_settings.texture_replacements.config)
     {
-      g_gpu->UpdateSettings(old_settings);
+      GPUThread::UpdateSettings(true);
       if (IsPaused())
-        InvalidateDisplay();
+        GPUThread::PresentCurrentFrame();
+    }
+    else
+    {
+      // still need to update debug windows
+      GPUThread::UpdateSettings(false);
     }
 
     if (g_settings.gpu_widescreen_hack != old_settings.gpu_widescreen_hack ||
@@ -4366,8 +4111,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
          (g_settings.display_aspect_ratio_custom_numerator != old_settings.display_aspect_ratio_custom_numerator ||
           g_settings.display_aspect_ratio_custom_denominator != old_settings.display_aspect_ratio_custom_denominator)))
     {
-      if (GPUSwapChain* swap_chain = g_gpu_device->GetMainSwapChain())
-        GTE::UpdateAspectRatio(swap_chain->GetWidth(), swap_chain->GetHeight());
+      SetGTEAspectRatioFromRenderWindow();
     }
 
     if (g_settings.gpu_pgxp_enable != old_settings.gpu_pgxp_enable ||
@@ -4386,9 +4130,6 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
       InterruptExecution();
     }
 
-    if (g_settings.display_show_gpu_stats != old_settings.display_show_gpu_stats)
-      g_gpu->ResetStatistics();
-
     if (g_settings.cdrom_readahead_sectors != old_settings.cdrom_readahead_sectors)
       CDROM::SetReadaheadSectors(g_settings.cdrom_readahead_sectors);
 
@@ -4449,9 +4190,6 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
 
     PostProcessing::UpdateSettings();
 
-    if (ImGuiManager::UpdateDebugWindowConfig())
-      InvalidateDisplay();
-
 #ifdef ENABLE_GDB_SERVER
     if (g_settings.debugging.enable_gdb_server != old_settings.debugging.enable_gdb_server ||
         g_settings.debugging.gdb_server_port != old_settings.debugging.gdb_server_port)
@@ -4508,6 +4246,9 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
       Panic("Failed to reallocate memory map. The log may contain more information.");
     }
   }
+
+  if (g_settings.gpu_use_thread != old_settings.gpu_use_thread) [[unlikely]]
+    GPUThread::Internal::SetThreadEnabled(g_settings.gpu_use_thread);
 }
 
 void System::SetTaintsFromSettings()
@@ -4742,6 +4483,9 @@ void System::CalculateRewindMemoryUsage(u32 num_saves, u32 resolution_scale, u64
 
 void System::ClearMemorySaveStates()
 {
+  if (!s_state.rewind_states.empty() || !s_state.runahead_states.empty())
+    Panic("FIXME TEXTURE CLEAR");
+
   s_state.rewind_states.clear();
   s_state.runahead_states.clear();
 }
@@ -4924,11 +4668,12 @@ void System::DoRewind()
     s_state.rewind_load_counter--;
   }
 
-  InvalidateDisplay();
+  // TODO FIXME: Needs a replacement for the InvalidateDisplay() call that used to be here.
+
   Host::PumpMessagesOnCPUThread();
   IdlePollUpdate();
 
-  Throttle(Common::Timer::GetCurrentValue());
+  Throttle(Common::Timer::GetCurrentValue(), s_state.next_frame_time);
 }
 
 void System::SaveRunaheadState()
@@ -5047,6 +4792,7 @@ void System::ShutdownSystem(bool save_resume_state)
   }
 
   s_state.state = State::Stopping;
+  std::atomic_thread_fence(std::memory_order_release);
   if (!s_state.system_executing)
     DestroySystem();
 }
@@ -5163,17 +4909,17 @@ std::string System::GetScreenshotPath(const char* extension)
   return path;
 }
 
-bool System::SaveScreenshot(const char* path, DisplayScreenshotMode mode, DisplayScreenshotFormat format, u8 quality,
+void System::SaveScreenshot(const char* path, DisplayScreenshotMode mode, DisplayScreenshotFormat format, u8 quality,
                             bool compress_on_thread)
 {
   if (!IsValid())
-    return false;
+    return;
 
   std::string auto_path;
   if (!path)
     path = (auto_path = GetScreenshotPath(Settings::GetDisplayScreenshotFormatExtension(format))).c_str();
 
-  return g_gpu->RenderScreenshotToFile(path, mode, quality, compress_on_thread, true);
+  GPUBackend::RenderScreenshotToFile(path, mode, quality, compress_on_thread, true);
 }
 
 bool System::StartRecordingGPUDump(const char* path /*= nullptr*/, u32 num_frames /*= 0*/)
@@ -5245,17 +4991,20 @@ bool System::StartMediaCapture(std::string path, bool capture_video, bool captur
     Host::GetUIntSettingValue("MediaCapture", "VideoWidth", Settings::DEFAULT_MEDIA_CAPTURE_VIDEO_WIDTH);
   u32 capture_height =
     Host::GetUIntSettingValue("MediaCapture", "VideoHeight", Settings::DEFAULT_MEDIA_CAPTURE_VIDEO_HEIGHT);
+  const WindowInfo& main_window_info = GPUThread::GetRenderWindowInfo();
   const GPUTexture::Format capture_format =
-    g_gpu_device->HasMainSwapChain() ? g_gpu_device->GetMainSwapChain()->GetFormat() : GPUTexture::Format::RGBA8;
+    main_window_info.IsSurfaceless() ? GPUTexture::Format::RGBA8 : main_window_info.surface_format;
   if (capture_video)
   {
-    // TODO: This will be a mess with GPU thread.
+#if 0
+    // TODO/FIXME: This will be a mess with the GPU thread. Starting the capture will have to be moved there.
     if (Host::GetBoolSettingValue("MediaCapture", "VideoAutoSize", false))
     {
       GSVector4i unused_display_rect, unused_draw_rect;
       g_gpu->CalculateScreenshotSize(DisplayScreenshotMode::InternalResolution, &capture_width, &capture_height,
                                      &unused_display_rect, &unused_draw_rect);
     }
+#endif
 
     MediaCapture::AdjustVideoSize(&capture_width, &capture_height);
   }
@@ -5303,7 +5052,7 @@ bool System::StartMediaCapture(std::string path, bool capture_video, bool captur
     return false;
   }
 
-  Host::AddIconOSDMessage("MediaCapture", ICON_FA_CAMERA,
+  Host::AddIconOSDMessage(fmt::format("MediaCapture_{}", s_state.media_capture->GetPath()), ICON_FA_CAMERA,
                           fmt::format(TRANSLATE_FS("System", "Starting {0} to '{1}'."),
                                       GetCaptureTypeForMessage(s_state.media_capture->IsCapturingVideo(),
                                                                s_state.media_capture->IsCapturingAudio()),
@@ -5319,30 +5068,45 @@ void System::StopMediaCapture()
   if (!s_state.media_capture)
     return;
 
-  const bool was_capturing_audio = s_state.media_capture->IsCapturingAudio();
-  const bool was_capturing_video = s_state.media_capture->IsCapturingVideo();
+  if (s_state.media_capture->IsCapturingVideo())
+  {
+    // If we're capturing video, we need to finish the capture on the GPU thread.
+    // This is because it owns texture objects, and OpenGL is not thread-safe.
+    GPUThread::RunOnThread(
+      [cap = s_state.media_capture.release()]() mutable { StopMediaCapture(std::unique_ptr<MediaCapture>(cap)); });
+  }
+  else
+  {
+    // Otherwise, we can do it on the CPU thread.
+    StopMediaCapture(std::move(s_state.media_capture));
+  }
+
+  Host::OnMediaCaptureStopped();
+}
+
+void System::StopMediaCapture(std::unique_ptr<MediaCapture> cap)
+{
+  const bool was_capturing_audio = cap->IsCapturingAudio(); // sampled before EndCapture() is called
+  const bool was_capturing_video = cap->IsCapturingVideo();
 
   Error error;
-  if (s_state.media_capture->EndCapture(&error))
+  std::string osd_key = fmt::format("MediaCapture_{}", cap->GetPath()); // OSD key is derived from the path
+  if (cap->EndCapture(&error)) // on failure, the description in error becomes the warning text
   {
-    Host::AddIconOSDMessage("MediaCapture", ICON_FA_CAMERA,
+    Host::AddIconOSDMessage(std::move(osd_key), ICON_FA_CAMERA,
                             fmt::format(TRANSLATE_FS("System", "Stopped {0} to '{1}'."),
                                         GetCaptureTypeForMessage(was_capturing_video, was_capturing_audio),
-                                        Path::GetFileName(s_state.media_capture->GetPath())),
+                                        Path::GetFileName(cap->GetPath())),
                             Host::OSD_INFO_DURATION);
   }
   else
   {
-    Host::AddIconOSDWarning("MediaCapture", ICON_FA_EXCLAMATION_TRIANGLE,
+    Host::AddIconOSDWarning(std::move(osd_key), ICON_FA_EXCLAMATION_TRIANGLE,
                             fmt::format(TRANSLATE_FS("System", "Stopped {0}: {1}."),
-                                        GetCaptureTypeForMessage(s_state.media_capture->IsCapturingVideo(),
-                                                                 s_state.media_capture->IsCapturingAudio()),
+                                        GetCaptureTypeForMessage(was_capturing_video, was_capturing_audio),
                                         error.GetDescription()),
                             Host::OSD_INFO_DURATION);
   }
-  s_state.media_capture.reset();
-
-  Host::OnMediaCaptureStopped();
 }
 
 std::string System::GetGameSaveStateFileName(std::string_view serial, s32 slot)
@@ -5615,8 +5379,7 @@ void System::ToggleWidescreen()
                   Settings::GetDisplayAspectRatioDisplayName(g_settings.display_aspect_ratio), 5.0f));
   }
 
-  if (GPUSwapChain* swap_chain = g_gpu_device->GetMainSwapChain())
-    GTE::UpdateAspectRatio(swap_chain->GetWidth(), swap_chain->GetHeight());
+  SetGTEAspectRatioFromRenderWindow();
 }
 
 void System::ToggleSoftwareRendering()
@@ -5624,18 +5387,15 @@ void System::ToggleSoftwareRendering()
   if (IsShutdown() || g_settings.gpu_renderer == GPURenderer::Software)
     return;
 
-  const GPURenderer new_renderer = g_gpu->IsHardwareRenderer() ? GPURenderer::Software : g_settings.gpu_renderer;
+  const GPURenderer new_renderer =
+    GPUBackend::IsUsingHardwareBackend() ? GPURenderer::Software : g_settings.gpu_renderer;
 
   Host::AddIconOSDMessage("SoftwareRendering", ICON_FA_PAINT_ROLLER,
                           fmt::format(TRANSLATE_FS("OSDMessage", "Switching to {} renderer..."),
                                       Settings::GetRendererDisplayName(new_renderer)),
                           Host::OSD_QUICK_DURATION);
-  RecreateGPU(new_renderer);
 
-  // TODO: GPU-THREAD: Drop this
-  PerformanceCounters::Reset();
-
-  g_gpu->UpdateResolutionScale();
+  RecreateGPU(new_renderer, false, IsPaused());
 }
 
 void System::RequestDisplaySize(float scale /*= 0.0f*/)
@@ -5644,7 +5404,7 @@ void System::RequestDisplaySize(float scale /*= 0.0f*/)
     return;
 
   if (scale == 0.0f)
-    scale = g_gpu->IsHardwareRenderer() ? static_cast<float>(g_settings.gpu_resolution_scale) : 1.0f;
+    scale = GPUBackend::IsUsingHardwareBackend() ? static_cast<float>(g_settings.gpu_resolution_scale) : 1.0f;
 
   const float y_scale =
     (static_cast<float>(g_gpu->GetCRTCDisplayWidth()) / static_cast<float>(g_gpu->GetCRTCDisplayHeight())) /
@@ -5668,75 +5428,13 @@ void System::DisplayWindowResized(u32 width, u32 height)
     return;
 
   if (g_settings.gpu_widescreen_hack && g_settings.display_aspect_ratio == DisplayAspectRatio::MatchWindow)
-    GTE::UpdateAspectRatio(width, height);
-
-  g_gpu->RestoreDeviceContext();
-  g_gpu->UpdateResolutionScale();
-
-  // If we're paused, re-present the current frame at the new window size.
-  if (IsPaused())
-  {
-    // Hackity hack, on some systems, presenting a single frame isn't enough to actually get it
-    // displayed. Two seems to be good enough. Maybe something to do with direct scanout.
-    InvalidateDisplay();
-    InvalidateDisplay();
-  }
+    SetGTEAspectRatioFromRenderWindow();
 }
 
-bool System::PresentDisplay(bool explicit_present, u64 present_time)
+void System::SetGTEAspectRatioFromRenderWindow()
 {
-  // acquire for IO.MousePos.
-  std::atomic_thread_fence(std::memory_order_acquire);
-
-  FullscreenUI::Render();
-  ImGuiManager::RenderTextOverlays();
-  ImGuiManager::RenderOSDMessages();
-
-  if (s_state.state == State::Running)
-    ImGuiManager::RenderSoftwareCursors();
-
-  // Debug windows are always rendered, otherwise mouse input breaks on skip.
-  ImGuiManager::RenderOverlayWindows();
-
-  if (IsValid())
-    ImGuiManager::RenderDebugWindows();
-
-  const GPUDevice::PresentResult pres =
-    g_gpu_device->HasMainSwapChain() ?
-      (g_gpu ? g_gpu->PresentDisplay() : g_gpu_device->BeginPresent(g_gpu_device->GetMainSwapChain())) :
-      GPUDevice::PresentResult::SkipPresent;
-  if (pres == GPUDevice::PresentResult::OK)
-  {
-    g_gpu_device->RenderImGui(g_gpu_device->GetMainSwapChain());
-    g_gpu_device->EndPresent(g_gpu_device->GetMainSwapChain(), explicit_present, present_time);
-
-    if (g_gpu_device->IsGPUTimingEnabled())
-      PerformanceCounters::AccumulateGPUTime();
-  }
-  else
-  {
-    if (pres == GPUDevice::PresentResult::DeviceLost) [[unlikely]]
-      HandleHostGPUDeviceLost();
-    else if (pres == GPUDevice::PresentResult::ExclusiveFullscreenLost)
-      HandleExclusiveFullscreenLost();
-    else
-      g_gpu_device->FlushCommands();
-
-    // Still need to kick ImGui or it gets cranky.
-    ImGui::EndFrame();
-  }
-
-  ImGuiManager::NewFrame();
-
-  return (pres == GPUDevice::PresentResult::OK);
-}
-
-void System::InvalidateDisplay()
-{
-  PresentDisplay(false, 0);
-
-  if (g_gpu)
-    g_gpu->RestoreDeviceContext();
+  if (const WindowInfo& main_window_info = GPUThread::GetRenderWindowInfo(); !main_window_info.IsSurfaceless())
+    GTE::UpdateAspectRatio(main_window_info.surface_width, main_window_info.surface_height);
 }
 
 bool System::OpenGPUDump(std::string path, Error* error)
diff --git a/src/core/system.h b/src/core/system.h
index 7f8fadc5a..ff8fcaf49 100644
--- a/src/core/system.h
+++ b/src/core/system.h
@@ -159,7 +159,6 @@ std::string GetGameSettingsPath(std::string_view game_serial);
 std::string GetInputProfilePath(std::string_view name);
 
 State GetState();
-void SetState(State new_state);
 bool IsRunning();
 bool IsPaused();
 bool IsShutdown();
@@ -272,6 +271,8 @@ bool IsRunningAtNonStandardSpeed();
 float GetVideoFrameRate();
 void SetVideoFrameRate(float frequency);
 
+void GetFramePresentationDetails(bool* is_frame, bool* present_frame, bool* allow_present_skip, u64* present_time);
+
 // Access controllers for simulating input.
 Controller* GetController(u32 slot);
 void UpdateMemoryCardTypes();
@@ -375,7 +376,7 @@ s32 GetAudioOutputVolume();
 void UpdateVolume();
 
 /// Saves a screenshot to the specified file. If no file name is provided, one will be generated automatically.
-bool SaveScreenshot(const char* path = nullptr, DisplayScreenshotMode mode = g_settings.display_screenshot_mode,
+void SaveScreenshot(const char* path = nullptr, DisplayScreenshotMode mode = g_settings.display_screenshot_mode,
                     DisplayScreenshotFormat format = g_settings.display_screenshot_format,
                     u8 quality = g_settings.display_screenshot_quality, bool compress_on_thread = true);
 
@@ -404,10 +405,6 @@ void ToggleSoftwareRendering();
 /// If the scale is set to 0, the internal resolution will be used, otherwise it is treated as a multiplier to 1x.
 void RequestDisplaySize(float scale = 0.0f);
 
-/// Renders the display.
-bool PresentDisplay(bool explicit_present, u64 present_time);
-void InvalidateDisplay();
-
 //////////////////////////////////////////////////////////////////////////
 // Memory Save States (Rewind and Runahead)
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/system_private.h b/src/core/system_private.h
index e47baf4c0..5e41623b4 100644
--- a/src/core/system_private.h
+++ b/src/core/system_private.h
@@ -86,11 +86,8 @@ void OnSystemPaused();
 /// Called when the VM is resumed after being paused.
 void OnSystemResumed();
 
-/// Called when the pause state changes, or fullscreen UI opens.
-void OnIdleStateChanged();
-
 /// Called when performance metrics are updated, approximately once a second.
-void OnPerformanceCountersUpdated();
+void OnPerformanceCountersUpdated(const GPUBackend* gpu_backend);
 
 /// Provided by the host; called when the running executable changes.
 void OnGameChanged(const std::string& disc_path, const std::string& game_serial, const std::string& game_name);
diff --git a/src/duckstation-qt/graphicssettingswidget.cpp b/src/duckstation-qt/graphicssettingswidget.cpp
index c0b52fd92..4c0865a5a 100644
--- a/src/duckstation-qt/graphicssettingswidget.cpp
+++ b/src/duckstation-qt/graphicssettingswidget.cpp
@@ -604,8 +604,8 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsWindow* dialog, QWidget*
                              tr("Draws a wireframe outline of the triangles rendered by the console's GPU, either as a "
                                 "replacement or an overlay."));
   dialog->registerWidgetHelp(m_ui.gpuThread, tr("Threaded Rendering"), tr("Checked"),
-                             tr("Uses a second thread for drawing graphics. Currently only available for the software "
-                                "renderer, but can provide a significant speed improvement, and is safe to use."));
+                             tr("Uses a second thread for drawing graphics. Provides a significant speed improvement "
+                                "particularly with the software renderer, and is safe to use."));
 
   dialog->registerWidgetHelp(
     m_ui.useDebugDevice, tr("Use Debug Device"), tr("Unchecked"),
@@ -807,8 +807,6 @@ void GraphicsSettingsWidget::updateRendererDependentOptions()
   m_ui.blitSwapChain->setEnabled(render_api == RenderAPI::D3D11);
 #endif
 
-  m_ui.gpuThread->setEnabled(!is_hardware);
-
   m_ui.exclusiveFullscreenLabel->setEnabled(render_api == RenderAPI::D3D11 || render_api == RenderAPI::D3D12 ||
                                             render_api == RenderAPI::Vulkan);
   m_ui.exclusiveFullscreenControl->setEnabled(render_api == RenderAPI::Vulkan);
diff --git a/src/duckstation-qt/mainwindow.cpp b/src/duckstation-qt/mainwindow.cpp
index 8f04c8e8f..43b8f55fa 100644
--- a/src/duckstation-qt/mainwindow.cpp
+++ b/src/duckstation-qt/mainwindow.cpp
@@ -83,6 +83,7 @@ static bool s_use_central_widget = false;
 // UI thread VM validity.
 static bool s_system_valid = false;
 static bool s_system_paused = false;
+static bool s_fullscreen_ui_started = false;
 static std::atomic_uint32_t s_system_locked{false};
 static QString s_current_game_title;
 static QString s_current_game_serial;
@@ -762,7 +763,7 @@ void MainWindow::recreate()
   {
     g_emu_thread->setSurfaceless(false);
     g_main_window->updateEmulationActions(false, System::IsValid(), Achievements::IsHardcoreModeActive());
-    g_main_window->onFullscreenUIStateChange(g_emu_thread->isRunningFullscreenUI());
+    g_main_window->onFullscreenUIStartedOrStopped(s_fullscreen_ui_started);
   }
 
   if (controller_settings_window_pos.has_value())
@@ -1252,8 +1253,9 @@ void MainWindow::onStartFullscreenUITriggered()
     g_emu_thread->startFullscreenUI();
 }
 
-void MainWindow::onFullscreenUIStateChange(bool running)
+void MainWindow::onFullscreenUIStartedOrStopped(bool running)
 {
+  s_fullscreen_ui_started = running;
   m_ui.actionStartFullscreenUI->setText(running ? tr("Stop Big Picture Mode") : tr("Start Big Picture Mode"));
   m_ui.actionStartFullscreenUI2->setText(running ? tr("Exit Big Picture") : tr("Big Picture"));
 }
@@ -1999,7 +2001,7 @@ void MainWindow::connectSignals()
   connect(g_emu_thread, &EmuThread::mediaCaptureStarted, this, &MainWindow::onMediaCaptureStarted);
   connect(g_emu_thread, &EmuThread::mediaCaptureStopped, this, &MainWindow::onMediaCaptureStopped);
   connect(g_emu_thread, &EmuThread::mouseModeRequested, this, &MainWindow::onMouseModeRequested);
-  connect(g_emu_thread, &EmuThread::fullscreenUIStateChange, this, &MainWindow::onFullscreenUIStateChange);
+  connect(g_emu_thread, &EmuThread::fullscreenUIStartedOrStopped, this, &MainWindow::onFullscreenUIStartedOrStopped);
   connect(g_emu_thread, &EmuThread::achievementsLoginRequested, this, &MainWindow::onAchievementsLoginRequested);
   connect(g_emu_thread, &EmuThread::achievementsChallengeModeChanged, this,
           &MainWindow::onAchievementsChallengeModeChanged);
@@ -2452,7 +2454,7 @@ bool MainWindow::requestShutdown(bool allow_confirm /* = true */, bool allow_sav
   // reshow the main window during display updates, because otherwise fullscreen transitions and renderer switches
   // would briefly show and then hide the main window. So instead, we do it on shutdown, here. Except if we're in
   // batch mode, when we're going to exit anyway.
-  if (!isRenderingToMain() && isHidden() && !QtHost::InBatchMode() && !g_emu_thread->isRunningFullscreenUI())
+  if (!isRenderingToMain() && isHidden() && !QtHost::InBatchMode() && !s_fullscreen_ui_started)
     updateWindowState(true);
 
   // Now we can actually shut down the VM.
diff --git a/src/duckstation-qt/mainwindow.h b/src/duckstation-qt/mainwindow.h
index 32551795f..00b605b62 100644
--- a/src/duckstation-qt/mainwindow.h
+++ b/src/duckstation-qt/mainwindow.h
@@ -167,7 +167,7 @@ private Q_SLOTS:
   void onCheatsActionTriggered();
   void onCheatsMenuAboutToShow();
   void onStartFullscreenUITriggered();
-  void onFullscreenUIStateChange(bool running);
+  void onFullscreenUIStartedOrStopped(bool running);
   void onRemoveDiscActionTriggered();
   void onViewToolbarActionToggled(bool checked);
   void onViewLockToolbarActionToggled(bool checked);
diff --git a/src/duckstation-qt/qthost.cpp b/src/duckstation-qt/qthost.cpp
index e8a55a015..db1f661f2 100644
--- a/src/duckstation-qt/qthost.cpp
+++ b/src/duckstation-qt/qthost.cpp
@@ -19,7 +19,9 @@
 #include "core/game_list.h"
 #include "core/gdb_server.h"
 #include "core/gpu.h"
+#include "core/gpu_backend.h"
 #include "core/gpu_hw_texture_cache.h"
+#include "core/gpu_thread.h"
 #include "core/host.h"
 #include "core/imgui_overlays.h"
 #include "core/memory_card.h"
@@ -87,6 +89,20 @@ static constexpr u32 GDB_SERVER_POLLING_INTERVAL = 1;
 // Local function declarations
 //////////////////////////////////////////////////////////////////////////
 namespace QtHost {
+
+namespace {
+
+class GPUThread : public QThread // QThread wrapper whose run() executes the core GPU thread entry point
+{
+public:
+  explicit GPUThread(QObject* parent = nullptr); // explicit: no implicit QObject* -> GPUThread conversion
+  ~GPUThread() override;
+
+  void run() override; // thread body, invoked by QThread on the new thread after start()
+};
+
+} // namespace
+
 static bool PerformEarlyHardwareChecks();
 static bool EarlyProcessStartup();
 static void RegisterTypes();
@@ -565,13 +581,8 @@ void Host::LoadSettings(const SettingsInterface& si, std::unique_lock<std::mutex
 void EmuThread::checkForSettingsChanges(const Settings& old_settings)
 {
   if (g_main_window)
-  {
     QMetaObject::invokeMethod(g_main_window, &MainWindow::checkForSettingChanges, Qt::QueuedConnection);
 
-    if (System::IsValid())
-      updatePerformanceCounters();
-  }
-
   // don't mess with fullscreen while locked
   if (!QtHost::IsSystemLocked())
   {
@@ -580,7 +591,7 @@ void EmuThread::checkForSettingsChanges(const Settings& old_settings)
     {
       m_is_rendering_to_main = render_to_main;
       if (g_gpu_device)
-        Host::UpdateDisplayWindow(m_is_fullscreen);
+        GPUThread::UpdateDisplayWindow(m_is_fullscreen);
     }
   }
 }
@@ -708,33 +719,24 @@ void EmuThread::startFullscreenUI()
     return;
   }
 
-  if (System::IsValid())
+  if (System::IsValid() || m_is_fullscreen_ui_started)
     return;
 
   // we want settings loaded so we choose the correct renderer
   // this also sorts out input sources.
   System::LoadSettings(false);
   m_is_rendering_to_main = shouldRenderToMain();
-  m_run_fullscreen_ui = true;
 
   // borrow the game start fullscreen flag
   const bool start_fullscreen =
     (s_start_fullscreen_ui_fullscreen || Host::GetBaseBoolSettingValue("Main", "StartFullscreen", false));
 
   Error error;
-  if (!Host::CreateGPUDevice(Settings::GetRenderAPIForRenderer(g_settings.gpu_renderer), start_fullscreen, &error) ||
-      !FullscreenUI::Initialize())
+  if (!GPUThread::StartFullscreenUI(start_fullscreen, &error))
   {
     Host::ReportErrorAsync("Error", error.GetDescription());
-    m_run_fullscreen_ui = false;
     return;
   }
-
-  emit fullscreenUIStateChange(true);
-
-  // poll more frequently so we don't lose events
-  stopBackgroundControllerPollTimer();
-  startBackgroundControllerPollTimer();
 }
 
 void EmuThread::stopFullscreenUI()
@@ -749,18 +751,8 @@ void EmuThread::stopFullscreenUI()
     return;
   }
 
-  setFullscreen(false, true);
-
-  if (m_run_fullscreen_ui)
-  {
-    m_run_fullscreen_ui = false;
-    emit fullscreenUIStateChange(false);
-  }
-
-  if (!g_gpu_device)
-    return;
-
-  Host::ReleaseGPUDevice();
+  if (m_is_fullscreen_ui_started)
+    GPUThread::StopFullscreenUI();
 }
 
 void EmuThread::bootSystem(std::shared_ptr<SystemBootParameters> params)
@@ -867,7 +859,7 @@ void EmuThread::onDisplayWindowMouseWheelEvent(const QPoint& delta_angle)
 
 void EmuThread::onDisplayWindowResized(int width, int height, float scale)
 {
-  Host::ResizeDisplayWindow(width, height, scale);
+  GPUThread::ResizeDisplayWindow(width, height, scale);
 }
 
 void EmuThread::redrawDisplayWindow()
@@ -878,10 +870,10 @@ void EmuThread::redrawDisplayWindow()
     return;
   }
 
-  if (!g_gpu_device || System::IsShutdown())
+  if (System::IsShutdown())
     return;
 
-  System::InvalidateDisplay();
+  GPUThread::PresentCurrentFrame();
 }
 
 void EmuThread::toggleFullscreen()
@@ -909,7 +901,7 @@ void EmuThread::setFullscreen(bool fullscreen, bool allow_render_to_main)
 
   m_is_fullscreen = fullscreen;
   m_is_rendering_to_main = allow_render_to_main && shouldRenderToMain();
-  Host::UpdateDisplayWindow(fullscreen);
+  GPUThread::UpdateDisplayWindow(fullscreen);
 }
 
 bool Host::IsFullscreen()
@@ -938,7 +930,7 @@ void EmuThread::setSurfaceless(bool surfaceless)
     return;
 
   m_is_surfaceless = surfaceless;
-  Host::UpdateDisplayWindow(false);
+  GPUThread::UpdateDisplayWindow(false);
 }
 
 void EmuThread::requestDisplaySize(float scale)
@@ -995,6 +987,7 @@ void Host::OnSystemStarting()
 void Host::OnSystemStarted()
 {
   g_emu_thread->stopBackgroundControllerPollTimer();
+  g_emu_thread->wakeThread();
 
   emit g_emu_thread->systemStarted();
 }
@@ -1012,6 +1005,7 @@ void Host::OnSystemResumed()
     g_emu_thread->setSurfaceless(false);
 
   emit g_emu_thread->systemResumed();
+  g_emu_thread->wakeThread();
 
   g_emu_thread->stopBackgroundControllerPollTimer();
 }
@@ -1023,9 +1017,14 @@ void Host::OnSystemDestroyed()
   emit g_emu_thread->systemDestroyed();
 }
 
-void Host::OnIdleStateChanged()
+void Host::OnFullscreenUIStartedOrStopped(bool started)
 {
-  g_emu_thread->wakeThread();
+  g_emu_thread->setFullscreenUIStarted(started);
+}
+
+void Host::OnFullscreenUIActiveChanged(bool is_active)
+{
+  g_emu_thread->setFullscreenUIActive(is_active);
 }
 
 void EmuThread::reloadInputSources()
@@ -1679,7 +1678,8 @@ void Host::DestroyAuxiliaryRenderWindow(AuxiliaryRenderWindowHandle handle, s32*
     *height = size.height();
 
   // eat all pending events, to make sure we're not going to write input events back to a dead pointer
-  g_emu_thread->getEventLoop()->processEvents(QEventLoop::AllEvents);
+  if (g_emu_thread->isCurrentThread())
+    g_emu_thread->getEventLoop()->processEvents(QEventLoop::AllEvents);
 }
 
 void EmuThread::queueAuxiliaryRenderWindowInputEvent(Host::AuxiliaryRenderWindowUserData userdata,
@@ -1699,10 +1699,12 @@ void EmuThread::processAuxiliaryRenderWindowInputEvent(void* userdata, quint32 e
                                                        quint32 param3)
 {
   DebugAssert(isCurrentThread());
-  ImGuiManager::ProcessAuxiliaryRenderWindowInputEvent(userdata, static_cast<Host::AuxiliaryRenderWindowEvent>(event),
-                                                       Host::AuxiliaryRenderWindowEventParam{.uint_param = param1},
-                                                       Host::AuxiliaryRenderWindowEventParam{.uint_param = param2},
-                                                       Host::AuxiliaryRenderWindowEventParam{.uint_param = param3});
+  GPUThread::RunOnThread([userdata, event, param1, param2, param3]() {
+    ImGuiManager::ProcessAuxiliaryRenderWindowInputEvent(userdata, static_cast<Host::AuxiliaryRenderWindowEvent>(event),
+                                                         Host::AuxiliaryRenderWindowEventParam{.uint_param = param1},
+                                                         Host::AuxiliaryRenderWindowEventParam{.uint_param = param2},
+                                                         Host::AuxiliaryRenderWindowEventParam{.uint_param = param3});
+  });
 }
 
 void EmuThread::doBackgroundControllerPoll()
@@ -1731,7 +1733,7 @@ void EmuThread::startBackgroundControllerPollTimer()
     return;
 
   u32 poll_interval = BACKGROUND_CONTROLLER_POLLING_INTERVAL;
-  if (FullscreenUI::IsInitialized())
+  if (m_is_fullscreen_ui_active)
     poll_interval = FULLSCREEN_UI_CONTROLLER_POLLING_INTERVAL;
   if (GDBServer::HasAnyClients())
     poll_interval = GDB_SERVER_POLLING_INTERVAL;
@@ -1747,6 +1749,27 @@ void EmuThread::stopBackgroundControllerPollTimer()
   m_background_controller_polling_timer->stop();
 }
 
+void EmuThread::setFullscreenUIActive(bool active)
+{
+  m_is_fullscreen_ui_active = active;
+
+  // A running poll timer is restarted so that it picks up the interval matching the new state.
+  if (m_background_controller_polling_timer->isActive())
+  {
+    g_emu_thread->stopBackgroundControllerPollTimer();
+    g_emu_thread->startBackgroundControllerPollTimer();
+  }
+}
+
+void EmuThread::setFullscreenUIStarted(bool started)
+{
+  if (m_is_fullscreen_ui_started != started) // only record and signal actual transitions
+  {
+    m_is_fullscreen_ui_started = started;
+    emit fullscreenUIStartedOrStopped(started);
+  }
+}
+
 void EmuThread::start()
 {
   AssertMsg(!g_emu_thread, "Emu thread does not exist");
@@ -1790,49 +1813,52 @@ void EmuThread::run()
     }
   }
 
-  // bind buttons/axises
-  createBackgroundControllerPollTimer();
-  startBackgroundControllerPollTimer();
-
-  // main loop
-  while (!m_shutdown_flag)
   {
-    if (System::IsRunning())
+    // kick off GPU thread
+    QtHost::GPUThread gpu_thread;
+    gpu_thread.start();
+
+    // bind buttons/axises
+    createBackgroundControllerPollTimer();
+    startBackgroundControllerPollTimer();
+
+    // main loop
+    while (!m_shutdown_flag)
     {
-      System::Execute();
-    }
-    else
-    {
-      // we want to keep rendering the UI when paused and fullscreen UI is enabled
-      if (!FullscreenUI::HasActiveWindow() && !System::IsRunning())
-      {
-        // wait until we have a system before running
+      if (System::IsRunning())
+        System::Execute();
+      else
         m_event_loop->exec();
-        continue;
-      }
-
-      m_event_loop->processEvents(QEventLoop::AllEvents);
-      System::IdlePollUpdate();
-      if (g_gpu_device && g_gpu_device->HasMainSwapChain())
-      {
-        System::PresentDisplay(false, 0);
-        if (!g_gpu_device->GetMainSwapChain()->IsVSyncModeBlocking())
-          g_gpu_device->GetMainSwapChain()->ThrottlePresentation();
-      }
     }
+
+    if (System::IsValid())
+      System::ShutdownSystem(false);
+
+    destroyBackgroundControllerPollTimer();
+
+    // tell GPU thread to exit
+    GPUThread::Internal::RequestShutdown();
+
+    System::CPUThreadShutdown(); // and tidy up everything left
+    gpu_thread.wait();           // join before gpu_thread leaves scope; destroying a running QThread is fatal
   }
 
-  if (System::IsValid())
-    System::ShutdownSystem(false);
-
-  destroyBackgroundControllerPollTimer();
-  System::CPUThreadShutdown();
-
   // move back to UI thread
   moveToThread(m_ui_thread);
 }
 
-void Host::FrameDone()
+QtHost::GPUThread::GPUThread(QObject* parent) : QThread(parent)
+{
+}
+
+QtHost::GPUThread::~GPUThread() = default;
+
+void QtHost::GPUThread::run()
+{
+  ::GPUThread::Internal::GPUThreadEntryPoint();
+}
+
+void Host::FrameDoneOnGPUThread(GPUBackend* gpu_backend, u32 frame_number)
 {
 }
 
@@ -1921,7 +1947,7 @@ void Host::OnInputDeviceConnected(std::string_view identifier, std::string_view
 {
   emit g_emu_thread->onInputDeviceConnected(std::string(identifier), std::string(device_name));
 
-  if (System::IsValid() || g_emu_thread->isRunningFullscreenUI())
+  if (System::IsValid() || g_emu_thread->isFullscreenUIActive())
   {
     Host::AddIconOSDMessage(fmt::format("ControllerConnected{}", identifier), ICON_FA_GAMEPAD,
                             fmt::format(TRANSLATE_FS("QtHost", "Controller {} connected."), identifier),
@@ -1947,7 +1973,7 @@ void Host::OnInputDeviceDisconnected(InputBindingKey key, std::string_view ident
     Host::AddIconOSDMessage(fmt::format("ControllerConnected{}", identifier), ICON_FA_GAMEPAD, std::move(message),
                             Host::OSD_WARNING_DURATION);
   }
-  else if (System::IsValid() || g_emu_thread->isRunningFullscreenUI())
+  else if (System::IsValid() || g_emu_thread->isFullscreenUIActive())
   {
     Host::AddIconOSDMessage(fmt::format("ControllerConnected{}", identifier), ICON_FA_GAMEPAD,
                             fmt::format(TRANSLATE_FS("QtHost", "Controller {} disconnected."), identifier),
@@ -2012,17 +2038,17 @@ void Host::ReleaseRenderWindow()
   g_emu_thread->releaseRenderWindow();
 }
 
-void EmuThread::updatePerformanceCounters()
+void EmuThread::updatePerformanceCounters(const GPUBackend* gpu_backend)
 {
-  const RenderAPI render_api = g_gpu_device ? g_gpu_device->GetRenderAPI() : RenderAPI::None;
-  const bool hardware_renderer = g_gpu && g_gpu->IsHardwareRenderer();
+  const RenderAPI render_api = g_gpu_device->GetRenderAPI();
+  const bool hardware_renderer = (gpu_backend && gpu_backend->IsHardwareRenderer()); // backend may be null
   u32 render_width = 0;
   u32 render_height = 0;
 
-  if (g_gpu)
+  if (gpu_backend)
   {
-    const u32 render_scale = g_gpu->GetResolutionScale();
-    std::tie(render_width, render_height) = g_gpu->GetFullDisplayResolution();
+    const u32 render_scale = gpu_backend->GetResolutionScale();
+    std::tie(render_width, render_height) = gpu_backend->GetFullDisplayResolution();
     render_width *= render_scale;
     render_height *= render_scale;
   }
@@ -2085,9 +2111,9 @@ void EmuThread::resetPerformanceCounters()
                             Q_ARG(const QString&, blank));
 }
 
-void Host::OnPerformanceCountersUpdated()
+void Host::OnPerformanceCountersUpdated(const GPUBackend* gpu_backend)
 {
-  g_emu_thread->updatePerformanceCounters();
+  g_emu_thread->updatePerformanceCounters(gpu_backend);
 }
 
 void Host::OnGameChanged(const std::string& disc_path, const std::string& game_serial, const std::string& game_name)
diff --git a/src/duckstation-qt/qthost.h b/src/duckstation-qt/qthost.h
index aa4e71468..7eafe1835 100644
--- a/src/duckstation-qt/qthost.h
+++ b/src/duckstation-qt/qthost.h
@@ -44,6 +44,8 @@ class INISettingsInterface;
 enum class RenderAPI : u8;
 class GPUDevice;
 
+class GPUBackend;
+
 class MainWindow;
 class DisplayWidget;
 
@@ -91,9 +93,9 @@ public:
   ALWAYS_INLINE QEventLoop* getEventLoop() const { return m_event_loop; }
 
   ALWAYS_INLINE bool isFullscreen() const { return m_is_fullscreen; }
+  ALWAYS_INLINE bool isFullscreenUIActive() const { return m_is_fullscreen_ui_active; }
   ALWAYS_INLINE bool isRenderingToMain() const { return m_is_rendering_to_main; }
   ALWAYS_INLINE bool isSurfaceless() const { return m_is_surfaceless; }
-  ALWAYS_INLINE bool isRunningFullscreenUI() const { return m_run_fullscreen_ui; }
 
   std::optional<WindowInfo> acquireRenderWindow(RenderAPI render_api, bool fullscreen, bool exclusive_fullscreen,
                                                 Error* error);
@@ -102,6 +104,8 @@ public:
 
   void startBackgroundControllerPollTimer();
   void stopBackgroundControllerPollTimer();
+  void setFullscreenUIActive(bool active);
+  void setFullscreenUIStarted(bool started);
   void wakeThread();
 
   bool shouldRenderToMain() const;
@@ -109,7 +113,7 @@ public:
 
   void bootOrLoadState(std::string path);
 
-  void updatePerformanceCounters();
+  void updatePerformanceCounters(const GPUBackend* gpu_backend);
   void resetPerformanceCounters();
 
   /// Locks the system by pausing it, while a popup dialog is displayed.
@@ -147,7 +151,7 @@ Q_SIGNALS:
   void runningGameChanged(const QString& filename, const QString& game_serial, const QString& game_title);
   void inputProfileLoaded();
   void mouseModeRequested(bool relative, bool hide_cursor);
-  void fullscreenUIStateChange(bool running);
+  void fullscreenUIStartedOrStopped(bool running);
   void achievementsLoginRequested(Achievements::LoginRequestReason reason);
   void achievementsRefreshed(quint32 id, const QString& game_info_string);
   void achievementsChallengeModeChanged(bool enabled);
@@ -242,9 +246,10 @@ private:
   QTimer* m_background_controller_polling_timer = nullptr;
 
   bool m_shutdown_flag = false;
-  bool m_run_fullscreen_ui = false;
   bool m_is_rendering_to_main = false;
   bool m_is_fullscreen = false;
+  bool m_is_fullscreen_ui_started = false;
+  bool m_is_fullscreen_ui_active = false;
   bool m_is_surfaceless = false;
   bool m_save_state_on_shutdown = false;
 
diff --git a/src/duckstation-regtest/regtest_host.cpp b/src/duckstation-regtest/regtest_host.cpp
index 5f60d1a7e..50a0ace25 100644
--- a/src/duckstation-regtest/regtest_host.cpp
+++ b/src/duckstation-regtest/regtest_host.cpp
@@ -5,7 +5,7 @@
 #include "core/controller.h"
 #include "core/fullscreen_ui.h"
 #include "core/game_list.h"
-#include "core/gpu.h"
+#include "core/gpu_backend.h"
 #include "core/host.h"
 #include "core/system.h"
 #include "core/system_private.h"
@@ -276,7 +276,7 @@ void Host::OnIdleStateChanged()
   //
 }
 
-void Host::OnPerformanceCountersUpdated()
+void Host::OnPerformanceCountersUpdated(const GPUBackend* gpu_backend)
 {
   //
 }
@@ -365,14 +365,10 @@ void Host::DestroyAuxiliaryRenderWindow(AuxiliaryRenderWindowHandle handle, s32*
 {
 }
 
-void Host::FrameDone()
+void Host::FrameDoneOnGPUThread(GPUBackend* gpu_backend, u32 frame_number)
 {
-  const u32 frame = System::GetFrameNumber();
-  if (s_frame_dump_interval > 0 && (s_frame_dump_interval == 1 || (frame % s_frame_dump_interval) == 0))
-  {
-    std::string dump_filename(RegTestHost::GetFrameDumpFilename(frame));
-    g_gpu->WriteDisplayTextureToFile(std::move(dump_filename));
-  }
+  if (s_frame_dump_interval > 0 && (s_frame_dump_interval == 1 || (frame_number % s_frame_dump_interval) == 0))
+    gpu_backend->WriteDisplayTextureToFile(RegTestHost::GetFrameDumpFilename(frame_number));
 }
 
 void Host::OpenURL(std::string_view url)
diff --git a/src/util/gpu_device.cpp b/src/util/gpu_device.cpp
index b3b90af66..5bc052c8d 100644
--- a/src/util/gpu_device.cpp
+++ b/src/util/gpu_device.cpp
@@ -343,6 +343,17 @@ const char* GPUDevice::ShaderLanguageToString(GPUShaderLanguage language)
   }
 }
 
+const char* GPUDevice::VSyncModeToString(GPUVSyncMode mode)
+{
+  // Names are looked up by the enumerator's numeric value, so the order must match GPUVSyncMode.
+  static constexpr std::array<const char*, static_cast<size_t>(GPUVSyncMode::Count)> vsync_modes = {
+    {"Disabled", "FIFO", "Mailbox"}};
+
+  // Guard the lookup: Count itself, or any out-of-range value, would otherwise index past the table.
+  const size_t index = static_cast<size_t>(mode);
+  return (index < vsync_modes.size()) ? vsync_modes[index] : "Unknown";
+}
+
 bool GPUDevice::IsSameRenderAPI(RenderAPI lhs, RenderAPI rhs)
 {
   return (lhs == rhs || ((lhs == RenderAPI::OpenGL || lhs == RenderAPI::OpenGLES) &&
diff --git a/src/util/gpu_device.h b/src/util/gpu_device.h
index 309b4db39..0859ea6c4 100644
--- a/src/util/gpu_device.h
+++ b/src/util/gpu_device.h
@@ -601,6 +601,9 @@ public:
   /// Returns a string representing the specified language.
   static const char* ShaderLanguageToString(GPUShaderLanguage language);
 
+  /// Returns a string representing the specified vsync mode.
+  static const char* VSyncModeToString(GPUVSyncMode mode);
+
   /// Returns a new device for the specified API.
   static std::unique_ptr<GPUDevice> CreateDeviceForAPI(RenderAPI api);
 
diff --git a/src/util/state_wrapper.h b/src/util/state_wrapper.h
index 9ac2b86d8..bdf581eef 100644
--- a/src/util/state_wrapper.h
+++ b/src/util/state_wrapper.h
@@ -34,6 +34,8 @@ public:
   ALWAYS_INLINE bool IsReading() const { return (m_mode == Mode::Read); }
   ALWAYS_INLINE bool IsWriting() const { return (m_mode == Mode::Write); }
   ALWAYS_INLINE u32 GetVersion() const { return m_version; }
+  ALWAYS_INLINE const u8* GetData() const { return m_data; }
+  ALWAYS_INLINE size_t GetDataSize() const { return m_size; }
   ALWAYS_INLINE size_t GetPosition() const { return m_pos; }
 
   /// Overload for integral or floating-point types. Writes bytes as-is.