Merge pull request #9956 from Pokechu22/non-power-of-2-wrap-2

VideoCommon: Manually handle texture wrapping and sampling
2021-11-18 13:08:23 -05:00 · 2021-11-18 13:08:23 -05:00 · 6f4bbac528
parent 8b57aad8ed 95b9941044
commit 6f4bbac528
46 changed files with 752 additions and 329 deletions
--- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java
+++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java
@ -198,6 +198,8 @@ public enum BooleanSetting implements AbstractBooleanSetting
  GFX_HACK_EFB_EMULATE_FORMAT_CHANGES(Settings.FILE_GFX, Settings.SECTION_GFX_HACKS,
          "EFBEmulateFormatChanges", false),
  GFX_HACK_VERTEX_ROUDING(Settings.FILE_GFX, Settings.SECTION_GFX_HACKS, "VertexRounding", false),
+  GFX_HACK_FAST_TEXTURE_SAMPLING(Settings.FILE_GFX, Settings.SECTION_GFX_HACKS,
+          "FastTextureSampling", true),  // Default true; the settings UI binds this through InvertedCheckBoxSetting under the "Manual Texture Sampling" label.

  LOGGER_WRITE_TO_FILE(Settings.FILE_LOGGER, Settings.SECTION_LOGGER_OPTIONS, "WriteToFile", false),

--- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java
+++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java
@ -744,6 +744,8 @@ public final class SettingsFragmentPresenter
            R.string.backend_multithreading, R.string.backend_multithreading_description));
    sl.add(new CheckBoxSetting(mContext, BooleanSetting.GFX_HACK_EFB_DEFER_INVALIDATION,
            R.string.defer_efb_invalidation, R.string.defer_efb_invalidation_description));
+    sl.add(new InvertedCheckBoxSetting(mContext, BooleanSetting.GFX_HACK_FAST_TEXTURE_SAMPLING,
+            R.string.manual_texture_sampling, R.string.manual_texture_sampling_description));
    sl.add(new CheckBoxSetting(mContext, BooleanSetting.GFX_INTERNAL_RESOLUTION_FRAME_DUMPS,
            R.string.internal_resolution_dumps, R.string.internal_resolution_dumps_description));

--- a/Source/Android/app/src/main/res/values/strings.xml
+++ b/Source/Android/app/src/main/res/values/strings.xml
@ -303,6 +303,8 @@
    <string name="backend_multithreading_description">Enables graphics backend multithreading (Vulkan only). May affect performance. If unsure, leave this unchecked.</string>
    <string name="defer_efb_invalidation">Defer EFB Cache Invalidation</string>
    <string name="defer_efb_invalidation_description">Defers invalidation of the EFB access cache until a GPU synchronization command is executed. May improve performance in some games at the cost of stability. If unsure, leave this unchecked.</string>
+    <string name="manual_texture_sampling">Manual Texture Sampling</string>
+    <string name="manual_texture_sampling_description">Use a manual implementation of texture sampling instead of the graphics backend\'s built-in functionality.</string>
    <string name="internal_resolution_dumps">Dump Frames at Internal Resolution</string>
    <string name="internal_resolution_dumps_description">Creates frame dumps and screenshots at the internal resolution of the renderer, rather than the size of the window it is displayed within. If the aspect ratio is widescreen, the output image will be scaled horizontally to preserve the vertical resolution.</string>
    <string name="debugging">Debugging</string>
--- a/Source/Core/Common/BitField.h
+++ b/Source/Core/Common/BitField.h
@ -149,6 +149,7 @@ public:

  constexpr T Value() const { return Value(std::is_signed<T>()); }
  constexpr operator T() const { return Value(); }
+  static constexpr bool IsSigned() { return std::is_signed<T>(); }  // Compile-time query: true when the field's value type T is a signed type.
  static constexpr std::size_t StartBit() { return position; }
  static constexpr std::size_t NumBits() { return bits; }

@ -244,6 +245,7 @@ public:
  BitFieldArray& operator=(const BitFieldArray&) = delete;

 public:
+  constexpr bool IsSigned() const { return std::is_signed<T>(); }  // True when T is a signed type; non-static const, like the neighbouring StartBit()/NumBits()/Size() accessors.
  constexpr std::size_t StartBit() const { return position; }
  constexpr std::size_t NumBits() const { return bits; }
  constexpr std::size_t Size() const { return size; }
--- a/Source/Core/Core/Config/GraphicsSettings.cpp
+++ b/Source/Core/Core/Config/GraphicsSettings.cpp
@ -150,6 +150,8 @@ const Info<bool> GFX_HACK_EFB_EMULATE_FORMAT_CHANGES{
 const Info<bool> GFX_HACK_VERTEX_ROUDING{{System::GFX, "Hacks", "VertexRounding"}, false};
 const Info<u32> GFX_HACK_MISSING_COLOR_VALUE{{System::GFX, "Hacks", "MissingColorValue"},
                                             0xFFFFFFFF};
+const Info<bool> GFX_HACK_FAST_TEXTURE_SAMPLING{{System::GFX, "Hacks", "FastTextureSampling"},
+                                                true};  // Default true. The UI checkbox for this key is labelled "Manual Texture Sampling" (Android binds it via InvertedCheckBoxSetting).

 // Graphics.GameSpecific

--- a/Source/Core/Core/Config/GraphicsSettings.h
+++ b/Source/Core/Core/Config/GraphicsSettings.h
@ -123,6 +123,7 @@ extern const Info<bool> GFX_HACK_COPY_EFB_SCALED;
 extern const Info<bool> GFX_HACK_EFB_EMULATE_FORMAT_CHANGES;
 extern const Info<bool> GFX_HACK_VERTEX_ROUDING;
 extern const Info<u32> GFX_HACK_MISSING_COLOR_VALUE;
+extern const Info<bool> GFX_HACK_FAST_TEXTURE_SAMPLING;

 // Graphics.GameSpecific

--- a/Source/Core/DolphinLib.props
+++ b/Source/Core/DolphinLib.props
@ -644,7 +644,6 @@
    <ClInclude Include="VideoCommon\PostProcessing.h" />
    <ClInclude Include="VideoCommon\RenderBase.h" />
    <ClInclude Include="VideoCommon\RenderState.h" />
-    <ClInclude Include="VideoCommon\SamplerCommon.h" />
    <ClInclude Include="VideoCommon\ShaderCache.h" />
    <ClInclude Include="VideoCommon\ShaderGenCommon.h" />
    <ClInclude Include="VideoCommon\Statistics.h" />
--- a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
+++ b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
@ -138,8 +138,11 @@ void AdvancedWidget::CreateWidgets()

  m_defer_efb_access_invalidation =
      new GraphicsBool(tr("Defer EFB Cache Invalidation"), Config::GFX_HACK_EFB_DEFER_INVALIDATION);
+  m_manual_texture_sampling =
+      new GraphicsBool(tr("Manual Texture Sampling"), Config::GFX_HACK_FAST_TEXTURE_SAMPLING, true);  // NOTE(review): the label names the opposite of the config key; the trailing `true` presumably makes the checkbox inverted - confirm against GraphicsBool's constructor.

  experimental_layout->addWidget(m_defer_efb_access_invalidation, 0, 0);
+  experimental_layout->addWidget(m_manual_texture_sampling, 0, 1);

  main_layout->addWidget(debugging_box);
  main_layout->addWidget(utility_box);
@ -266,6 +269,17 @@ void AdvancedWidget::AddDescriptions()
      "<br><br>May improve performance in some games which rely on CPU EFB Access at the cost "
      "of stability.<br><br><dolphin_emphasis>If unsure, leave this "
      "unchecked.</dolphin_emphasis>");
+  static const char TR_MANUAL_TEXTURE_SAMPLING_DESCRIPTION[] = QT_TR_NOOP(
+      "Use a manual implementation of texture sampling instead of the graphics backend's built-in "
+      "functionality.<br><br>"
+      "This setting can fix graphical issues in some games on certain GPUs, most commonly vertical "
+      "lines on FMVs. In addition to this, enabling Manual Texture Sampling will allow for correct "
+      "emulation of texture wrapping special cases (at 1x IR or when scaled EFB is disabled, and "
+      "with custom textures disabled) and better emulates Level of Detail calculation.<br><br>"
+      "This comes at the cost of potentially worse performance, especially at higher internal "
+      "resolutions; additionally, Anisotropic Filtering is currently incompatible with Manual "
+      "Texture Sampling.<br><br>"
+      "<dolphin_emphasis>If unsure, leave this unchecked.</dolphin_emphasis>");

 #ifdef _WIN32
  static const char TR_BORDERLESS_FULLSCREEN_DESCRIPTION[] = QT_TR_NOOP(
@ -299,4 +313,5 @@ void AdvancedWidget::AddDescriptions()
  m_borderless_fullscreen->SetDescription(tr(TR_BORDERLESS_FULLSCREEN_DESCRIPTION));
 #endif
  m_defer_efb_access_invalidation->SetDescription(tr(TR_DEFER_EFB_ACCESS_INVALIDATION_DESCRIPTION));
+  m_manual_texture_sampling->SetDescription(tr(TR_MANUAL_TEXTURE_SAMPLING_DESCRIPTION));
 }
--- a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
+++ b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
@ -61,4 +61,5 @@ private:

  // Experimental
  GraphicsBool* m_defer_efb_access_invalidation;
+  GraphicsBool* m_manual_texture_sampling;
 };
--- a/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h
+++ b/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h
@ -26,6 +26,7 @@ private:
  GraphicsBool* m_skip_efb_cpu;
  GraphicsBool* m_ignore_format_changes;
  GraphicsBool* m_store_efb_copies;
+  GraphicsBool* m_defer_efb_copies;

  // Texture Cache
  QLabel* m_accuracy_label;
@ -42,7 +43,6 @@ private:
  GraphicsBool* m_disable_bounding_box;
  GraphicsBool* m_vertex_rounding;
  GraphicsBool* m_save_texture_cache_state;
-  GraphicsBool* m_defer_efb_copies;

  void CreateWidgets();
  void ConnectWidgets();
--- a/Source/Core/VideoBackends/D3D/D3DMain.cpp
+++ b/Source/Core/VideoBackends/D3D/D3DMain.cpp
@ -106,6 +106,8 @@ void VideoBackend::FillBackendInfo()
  g_Config.backend_info.bSupportsSSAA = true;
  g_Config.backend_info.bSupportsShaderBinaries = true;
  g_Config.backend_info.bSupportsPipelineCacheData = false;
+  g_Config.backend_info.bSupportsCoarseDerivatives = true;
+  g_Config.backend_info.bSupportsTextureQueryLevels = true;
  g_Config.backend_info.bSupportsLogicOp = D3D::SupportsLogicOp(g_Config.iAdapter);

  g_Config.backend_info.Adapters = D3DCommon::GetAdapterNames();
--- a/Source/Core/VideoBackends/D3D/D3DState.cpp
+++ b/Source/Core/VideoBackends/D3D/D3DState.cpp
@ -303,43 +303,43 @@ StateCache::~StateCache() = default;
 ID3D11SamplerState* StateCache::Get(SamplerState state)
 {
  std::lock_guard<std::mutex> guard(m_lock);
-  auto it = m_sampler.find(state.hex);
+  auto it = m_sampler.find(state);
  if (it != m_sampler.end())
    return it->second.Get();

  D3D11_SAMPLER_DESC sampdc = CD3D11_SAMPLER_DESC(CD3D11_DEFAULT());
-  if (state.mipmap_filter == SamplerState::Filter::Linear)
+  if (state.tm0.mipmap_filter == FilterMode::Linear)
  {
-    if (state.min_filter == SamplerState::Filter::Linear)
-      sampdc.Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+    if (state.tm0.min_filter == FilterMode::Linear)
+      sampdc.Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                          D3D11_FILTER_MIN_MAG_MIP_LINEAR :
                          D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR;
    else
-      sampdc.Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      sampdc.Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                          D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR :
                          D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR;
  }
  else
  {
-    if (state.min_filter == SamplerState::Filter::Linear)
-      sampdc.Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+    if (state.tm0.min_filter == FilterMode::Linear)
+      sampdc.Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                          D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT :
                          D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT;
    else
-      sampdc.Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      sampdc.Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                          D3D11_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT :
                          D3D11_FILTER_MIN_MAG_MIP_POINT;
  }

  static constexpr std::array<D3D11_TEXTURE_ADDRESS_MODE, 3> address_modes = {
      {D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_TEXTURE_ADDRESS_WRAP, D3D11_TEXTURE_ADDRESS_MIRROR}};
-  sampdc.AddressU = address_modes[static_cast<u32>(state.wrap_u.Value())];
-  sampdc.AddressV = address_modes[static_cast<u32>(state.wrap_v.Value())];
-  sampdc.MaxLOD = state.max_lod / 16.f;
-  sampdc.MinLOD = state.min_lod / 16.f;
-  sampdc.MipLODBias = (s32)state.lod_bias / 256.f;
+  sampdc.AddressU = address_modes[static_cast<u32>(state.tm0.wrap_u.Value())];
+  sampdc.AddressV = address_modes[static_cast<u32>(state.tm0.wrap_v.Value())];
+  sampdc.MaxLOD = state.tm1.max_lod / 16.f;
+  sampdc.MinLOD = state.tm1.min_lod / 16.f;
+  sampdc.MipLODBias = state.tm0.lod_bias / 256.f;

-  if (state.anisotropic_filtering)
+  if (state.tm0.anisotropic_filtering)
  {
    sampdc.Filter = D3D11_FILTER_ANISOTROPIC;
    sampdc.MaxAnisotropy = 1u << g_ActiveConfig.iMaxAnisotropy;
@ -348,7 +348,7 @@ ID3D11SamplerState* StateCache::Get(SamplerState state)
  ComPtr<ID3D11SamplerState> res;
  HRESULT hr = D3D::device->CreateSamplerState(&sampdc, res.GetAddressOf());
  CHECK(SUCCEEDED(hr), "Creating D3D sampler state failed");
-  return m_sampler.emplace(state.hex, std::move(res)).first->second.Get();
+  return m_sampler.emplace(state, std::move(res)).first->second.Get();
 }

 ID3D11BlendState* StateCache::Get(BlendingState state)
--- a/Source/Core/VideoBackends/D3D/D3DState.h
+++ b/Source/Core/VideoBackends/D3D/D3DState.h
@ -37,7 +37,7 @@ private:
  std::unordered_map<u32, ComPtr<ID3D11DepthStencilState>> m_depth;
  std::unordered_map<u32, ComPtr<ID3D11RasterizerState>> m_raster;
  std::unordered_map<u32, ComPtr<ID3D11BlendState>> m_blend;
-  std::unordered_map<SamplerState::StorageType, ComPtr<ID3D11SamplerState>> m_sampler;
+  std::unordered_map<SamplerState, ComPtr<ID3D11SamplerState>> m_sampler;
  std::mutex m_lock;
 };

--- a/Source/Core/VideoBackends/D3D12/DescriptorHeapManager.cpp
+++ b/Source/Core/VideoBackends/D3D12/DescriptorHeapManager.cpp
@ -85,32 +85,32 @@ SamplerHeapManager::~SamplerHeapManager() = default;

 static void GetD3DSamplerDesc(D3D12_SAMPLER_DESC* desc, const SamplerState& state)
 {
-  if (state.mipmap_filter == SamplerState::Filter::Linear)
+  if (state.tm0.mipmap_filter == FilterMode::Linear)
  {
-    if (state.min_filter == SamplerState::Filter::Linear)
+    if (state.tm0.min_filter == FilterMode::Linear)
    {
-      desc->Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      desc->Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                         D3D12_FILTER_MIN_MAG_MIP_LINEAR :
                         D3D12_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR;
    }
    else
    {
-      desc->Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      desc->Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                         D3D12_FILTER_MIN_POINT_MAG_MIP_LINEAR :
                         D3D12_FILTER_MIN_MAG_POINT_MIP_LINEAR;
    }
  }
  else
  {
-    if (state.min_filter == SamplerState::Filter::Linear)
+    if (state.tm0.min_filter == FilterMode::Linear)
    {
-      desc->Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      desc->Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                         D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT :
                         D3D12_FILTER_MIN_LINEAR_MAG_MIP_POINT;
    }
    else
    {
-      desc->Filter = (state.mag_filter == SamplerState::Filter::Linear) ?
+      desc->Filter = (state.tm0.mag_filter == FilterMode::Linear) ?
                         D3D12_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT :
                         D3D12_FILTER_MIN_MAG_MIP_POINT;
    }
@ -119,15 +119,15 @@ static void GetD3DSamplerDesc(D3D12_SAMPLER_DESC* desc, const SamplerState& stat
  static constexpr std::array<D3D12_TEXTURE_ADDRESS_MODE, 3> address_modes = {
      {D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_WRAP,
       D3D12_TEXTURE_ADDRESS_MODE_MIRROR}};
-  desc->AddressU = address_modes[static_cast<u32>(state.wrap_u.Value())];
-  desc->AddressV = address_modes[static_cast<u32>(state.wrap_v.Value())];
+  desc->AddressU = address_modes[static_cast<u32>(state.tm0.wrap_u.Value())];
+  desc->AddressV = address_modes[static_cast<u32>(state.tm0.wrap_v.Value())];
  desc->AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
-  desc->MaxLOD = state.max_lod / 16.f;
-  desc->MinLOD = state.min_lod / 16.f;
-  desc->MipLODBias = static_cast<s32>(state.lod_bias) / 256.f;
+  desc->MaxLOD = state.tm1.max_lod / 16.f;
+  desc->MinLOD = state.tm1.min_lod / 16.f;
+  desc->MipLODBias = static_cast<s32>(state.tm0.lod_bias) / 256.f;
  desc->ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER;

-  if (state.anisotropic_filtering)
+  if (state.tm0.anisotropic_filtering)
  {
    desc->Filter = D3D12_FILTER_ANISOTROPIC;
    desc->MaxAnisotropy = 1u << g_ActiveConfig.iMaxAnisotropy;
@ -136,7 +136,7 @@ static void GetD3DSamplerDesc(D3D12_SAMPLER_DESC* desc, const SamplerState& stat

 bool SamplerHeapManager::Lookup(const SamplerState& ss, D3D12_CPU_DESCRIPTOR_HANDLE* handle)
 {
-  const auto it = m_sampler_map.find(ss.hex);
+  const auto it = m_sampler_map.find(ss);
  if (it != m_sampler_map.end())
  {
    *handle = it->second;
@ -158,7 +158,7 @@ bool SamplerHeapManager::Lookup(const SamplerState& ss, D3D12_CPU_DESCRIPTOR_HAN
                                                  m_current_offset * m_descriptor_increment_size};
  g_dx_context->GetDevice()->CreateSampler(&desc, new_handle);

-  m_sampler_map.emplace(ss.hex, new_handle);
+  m_sampler_map.emplace(ss, new_handle);
  m_current_offset++;
  *handle = new_handle;
  return true;
--- a/Source/Core/VideoBackends/D3D12/DescriptorHeapManager.h
+++ b/Source/Core/VideoBackends/D3D12/DescriptorHeapManager.h
@ -68,6 +68,6 @@ private:

  D3D12_CPU_DESCRIPTOR_HANDLE m_heap_base_cpu{};

-  std::unordered_map<SamplerState::StorageType, D3D12_CPU_DESCRIPTOR_HANDLE> m_sampler_map;
+  std::unordered_map<SamplerState, D3D12_CPU_DESCRIPTOR_HANDLE> m_sampler_map;
 };
 }  // namespace DX12
--- a/Source/Core/VideoBackends/D3D12/VideoBackend.cpp
+++ b/Source/Core/VideoBackends/D3D12/VideoBackend.cpp
@ -82,6 +82,8 @@ void VideoBackend::FillBackendInfo()
  g_Config.backend_info.AAModes = DXContext::GetAAModes(g_Config.iAdapter);
  g_Config.backend_info.bSupportsShaderBinaries = true;
  g_Config.backend_info.bSupportsPipelineCacheData = true;
+  g_Config.backend_info.bSupportsCoarseDerivatives = true;
+  g_Config.backend_info.bSupportsTextureQueryLevels = true;

  // We can only check texture support once we have a device.
  if (g_dx_context)
--- a/Source/Core/VideoBackends/Null/NullBackend.cpp
+++ b/Source/Core/VideoBackends/Null/NullBackend.cpp
@ -55,6 +55,8 @@ void VideoBackend::InitBackendInfo()
  g_Config.backend_info.bSupportsPartialDepthCopies = false;
  g_Config.backend_info.bSupportsShaderBinaries = false;
  g_Config.backend_info.bSupportsPipelineCacheData = false;
+  g_Config.backend_info.bSupportsCoarseDerivatives = false;
+  g_Config.backend_info.bSupportsTextureQueryLevels = false;

  // aamodes: We only support 1 sample, so no MSAA
  g_Config.backend_info.Adapters.clear();
--- a/Source/Core/VideoBackends/OGL/OGLMain.cpp
+++ b/Source/Core/VideoBackends/OGL/OGLMain.cpp
@ -99,7 +99,7 @@ void VideoBackend::InitBackendInfo()
  g_Config.backend_info.bSupportsGPUTextureDecoding = true;
  g_Config.backend_info.bSupportsBBox = true;

-  // Overwritten in Render.cpp later
+  // Overwritten in OGLRender.cpp later
  g_Config.backend_info.bSupportsDualSourceBlend = true;
  g_Config.backend_info.bSupportsPrimitiveRestart = true;
  g_Config.backend_info.bSupportsPaletteConversion = true;
@ -107,6 +107,8 @@ void VideoBackend::InitBackendInfo()
  g_Config.backend_info.bSupportsDepthClamp = true;
  g_Config.backend_info.bSupportsST3CTextures = false;
  g_Config.backend_info.bSupportsBPTCTextures = false;
+  g_Config.backend_info.bSupportsCoarseDerivatives = false;
+  g_Config.backend_info.bSupportsTextureQueryLevels = false;

  g_Config.backend_info.Adapters.clear();

--- a/Source/Core/VideoBackends/OGL/OGLRender.cpp
+++ b/Source/Core/VideoBackends/OGL/OGLRender.cpp
@ -483,6 +483,10 @@ Renderer::Renderer(std::unique_ptr<GLContext> main_gl_context, float backbuffer_
      GLExtensions::Supports("GL_EXT_texture_compression_s3tc");
  g_Config.backend_info.bSupportsBPTCTextures =
      GLExtensions::Supports("GL_ARB_texture_compression_bptc");
+  g_Config.backend_info.bSupportsCoarseDerivatives =  // Set when the extension is reported or the GL version is >= 450; ProgramShaderCache::CreateHeader checks this flag before emitting "#extension GL_ARB_derivative_control : enable".
+      GLExtensions::Supports("GL_ARB_derivative_control") || GLExtensions::Version() >= 450;
+  g_Config.backend_info.bSupportsTextureQueryLevels =  // Set when the extension is reported or the GL version is >= 430; ProgramShaderCache::CreateHeader checks this flag before emitting "#extension GL_ARB_texture_query_levels : enable".
+      GLExtensions::Supports("GL_ARB_texture_query_levels") || GLExtensions::Version() >= 430;

  if (m_main_gl_context->IsGLES())
  {
--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
@ -747,6 +747,8 @@ void ProgramShaderCache::CreateHeader()
      "%s\n"  // shader image load store
      "%s\n"  // shader framebuffer fetch
      "%s\n"  // shader thread shuffle
+      "%s\n"  // derivative control
+      "%s\n"  // query levels

      // Precision defines for GLSL ES
      "%s\n"
@ -826,6 +828,12 @@ void ProgramShaderCache::CreateHeader()
          "#extension GL_ARB_shader_image_load_store : enable" :
          "",
      framebuffer_fetch_string.c_str(), shader_shuffle_string.c_str(),
+      g_ActiveConfig.backend_info.bSupportsCoarseDerivatives ?
+          "#extension GL_ARB_derivative_control : enable" :
+          "",
+      g_ActiveConfig.backend_info.bSupportsTextureQueryLevels ?
+          "#extension GL_ARB_texture_query_levels : enable" :
+          "",
      is_glsles ? "precision highp float;" : "", is_glsles ? "precision highp int;" : "",
      is_glsles ? "precision highp sampler2DArray;" : "",
      (is_glsles && g_ActiveConfig.backend_info.bSupportsPaletteConversion) ?
--- a/Source/Core/VideoBackends/OGL/SamplerCache.cpp
+++ b/Source/Core/VideoBackends/OGL/SamplerCache.cpp
@ -7,7 +7,6 @@
 #include <memory>

 #include "Common/CommonTypes.h"
-#include "VideoCommon/SamplerCommon.h"
 #include "VideoCommon/VideoConfig.h"

 namespace OGL
@ -72,16 +71,16 @@ void SamplerCache::InvalidateBinding(u32 stage)
 void SamplerCache::SetParameters(GLuint sampler_id, const SamplerState& params)
 {
  GLenum min_filter;
-  GLenum mag_filter = (params.mag_filter == SamplerState::Filter::Point) ? GL_NEAREST : GL_LINEAR;
-  if (params.mipmap_filter == SamplerState::Filter::Linear)
+  GLenum mag_filter = (params.tm0.mag_filter == FilterMode::Near) ? GL_NEAREST : GL_LINEAR;
+  if (params.tm0.mipmap_filter == FilterMode::Linear)
  {
-    min_filter = (params.min_filter == SamplerState::Filter::Point) ? GL_NEAREST_MIPMAP_LINEAR :
-                                                                      GL_LINEAR_MIPMAP_LINEAR;
+    min_filter = (params.tm0.min_filter == FilterMode::Near) ? GL_NEAREST_MIPMAP_LINEAR :
+                                                               GL_LINEAR_MIPMAP_LINEAR;
  }
  else
  {
-    min_filter = (params.min_filter == SamplerState::Filter::Point) ? GL_NEAREST_MIPMAP_NEAREST :
-                                                                      GL_LINEAR_MIPMAP_NEAREST;
+    min_filter = (params.tm0.min_filter == FilterMode::Near) ? GL_NEAREST_MIPMAP_NEAREST :
+                                                               GL_LINEAR_MIPMAP_NEAREST;
  }

  glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER, min_filter);
@ -91,17 +90,17 @@ void SamplerCache::SetParameters(GLuint sampler_id, const SamplerState& params)
      {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT}};

  glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S,
-                      address_modes[static_cast<u32>(params.wrap_u.Value())]);
+                      address_modes[static_cast<u32>(params.tm0.wrap_u.Value())]);
  glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T,
-                      address_modes[static_cast<u32>(params.wrap_v.Value())]);
+                      address_modes[static_cast<u32>(params.tm0.wrap_v.Value())]);

-  glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, params.min_lod / 16.f);
-  glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, params.max_lod / 16.f);
+  glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, params.tm1.min_lod / 16.f);
+  glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, params.tm1.max_lod / 16.f);

  if (!static_cast<Renderer*>(g_renderer.get())->IsGLES())
-    glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, params.lod_bias / 256.f);
+    glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, params.tm0.lod_bias / 256.f);

-  if (params.anisotropic_filtering && g_ogl_config.bSupportsAniso)
+  if (params.tm0.anisotropic_filtering && g_ogl_config.bSupportsAniso)
  {
    glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT,
                        static_cast<float>(1 << g_ActiveConfig.iMaxAnisotropy));
--- a/Source/Core/VideoBackends/Software/Rasterizer.cpp
+++ b/Source/Core/VideoBackends/Software/Rasterizer.cpp
@ -171,22 +171,25 @@ static inline void CalculateLOD(s32* lodp, bool* linear, u32 texmap, u32 texcoor
  const TexMode1& tm1 = texUnit.texMode1;

  float sDelta, tDelta;
+
+  float* uv00 = rasterBlock.Pixel[0][0].Uv[texcoord];
+  float* uv10 = rasterBlock.Pixel[1][0].Uv[texcoord];
+  float* uv01 = rasterBlock.Pixel[0][1].Uv[texcoord];
+
+  float dudx = fabsf(uv00[0] - uv10[0]);
+  float dvdx = fabsf(uv00[1] - uv10[1]);
+  float dudy = fabsf(uv00[0] - uv01[0]);
+  float dvdy = fabsf(uv00[1] - uv01[1]);
+
  if (tm0.diag_lod == LODType::Diagonal)
  {
-    float* uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
-    float* uv1 = rasterBlock.Pixel[1][1].Uv[texcoord];
-
-    sDelta = fabsf(uv0[0] - uv1[0]);
-    tDelta = fabsf(uv0[1] - uv1[1]);
+    sDelta = dudx + dudy;
+    tDelta = dvdx + dvdy;
  }
  else
  {
-    float* uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
-    float* uv1 = rasterBlock.Pixel[1][0].Uv[texcoord];
-    float* uv2 = rasterBlock.Pixel[0][1].Uv[texcoord];
-
-    sDelta = std::max(fabsf(uv0[0] - uv1[0]), fabsf(uv0[0] - uv2[0]));
-    tDelta = std::max(fabsf(uv0[1] - uv1[1]), fabsf(uv0[1] - uv2[1]));
+    sDelta = std::max(dudx, dudy);
+    tDelta = std::max(dvdx, dvdy);
  }

  // get LOD in s28.4
--- a/Source/Core/VideoBackends/Software/SWmain.cpp
+++ b/Source/Core/VideoBackends/Software/SWmain.cpp
@ -84,6 +84,8 @@ void VideoSoftware::InitBackendInfo()
  g_Config.backend_info.bSupportsShaderBinaries = false;
  g_Config.backend_info.bSupportsPipelineCacheData = false;
  g_Config.backend_info.bSupportsBBox = true;
+  g_Config.backend_info.bSupportsCoarseDerivatives = false;
+  g_Config.backend_info.bSupportsTextureQueryLevels = false;

  // aamodes
  g_Config.backend_info.AAModes = {1};
--- a/Source/Core/VideoBackends/Software/TextureSampler.cpp
+++ b/Source/Core/VideoBackends/Software/TextureSampler.cpp
@ -11,7 +11,6 @@
 #include "Core/HW/Memmap.h"

 #include "VideoCommon/BPMemory.h"
-#include "VideoCommon/SamplerCommon.h"
 #include "VideoCommon/TextureDecoder.h"

 #define ALLOW_MIPMAP 1
@ -79,7 +78,7 @@ void Sample(s32 s, s32 t, s32 lod, bool linear, u8 texmap, u8* sample)

  const s32 lodFract = lod & 0xf;

-  if (lod > 0 && SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0))
+  if (lod > 0 && tm0.mipmap_filter != MipMode::None)
  {
    // use mipmap
    baseMip = lod >> 4;
--- a/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp
+++ b/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp
@ -315,28 +315,28 @@ VkSampler ObjectCache::GetSampler(const SamplerState& info)
       VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT}};

  VkSamplerCreateInfo create_info = {
-      VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,               // VkStructureType         sType
-      nullptr,                                             // const void*             pNext
-      0,                                                   // VkSamplerCreateFlags    flags
-      filters[static_cast<u32>(info.mag_filter.Value())],  // VkFilter                magFilter
-      filters[static_cast<u32>(info.min_filter.Value())],  // VkFilter                minFilter
-      mipmap_modes[static_cast<u32>(info.mipmap_filter.Value())],  // VkSamplerMipmapMode mipmapMode
-      address_modes[static_cast<u32>(info.wrap_u.Value())],  // VkSamplerAddressMode    addressModeU
-      address_modes[static_cast<u32>(info.wrap_v.Value())],  // VkSamplerAddressMode    addressModeV
-      VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,                 // VkSamplerAddressMode    addressModeW
-      info.lod_bias / 256.0f,                                // float                   mipLodBias
+      VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,              // VkStructureType         sType
+      nullptr,                                            // const void*             pNext
+      0,                                                  // VkSamplerCreateFlags    flags
+      filters[u32(info.tm0.mag_filter.Value())],          // VkFilter                magFilter
+      filters[u32(info.tm0.min_filter.Value())],          // VkFilter                minFilter
+      mipmap_modes[u32(info.tm0.mipmap_filter.Value())],  // VkSamplerMipmapMode mipmapMode
+      address_modes[u32(info.tm0.wrap_u.Value())],        // VkSamplerAddressMode    addressModeU
+      address_modes[u32(info.tm0.wrap_v.Value())],        // VkSamplerAddressMode    addressModeV
+      VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,              // VkSamplerAddressMode    addressModeW
+      info.tm0.lod_bias / 256.0f,                         // float                   mipLodBias
      VK_FALSE,                                 // VkBool32                anisotropyEnable
      0.0f,                                     // float                   maxAnisotropy
      VK_FALSE,                                 // VkBool32                compareEnable
      VK_COMPARE_OP_ALWAYS,                     // VkCompareOp             compareOp
-      info.min_lod / 16.0f,                     // float                   minLod
-      info.max_lod / 16.0f,                     // float                   maxLod
+      info.tm1.min_lod / 16.0f,                 // float                   minLod
+      info.tm1.max_lod / 16.0f,                 // float                   maxLod
      VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,  // VkBorderColor           borderColor
      VK_FALSE                                  // VkBool32                unnormalizedCoordinates
  };

  // Can we use anisotropic filtering with this sampler?
-  if (info.anisotropic_filtering && g_vulkan_context->SupportsAnisotropicFiltering())
+  if (info.tm0.anisotropic_filtering && g_vulkan_context->SupportsAnisotropicFiltering())
  {
    // Cap anisotropy to device limits.
    create_info.anisotropyEnable = VK_TRUE;
--- a/Source/Core/VideoBackends/Vulkan/VKRenderer.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VKRenderer.cpp
@ -49,7 +49,7 @@ Renderer::Renderer(std::unique_ptr<SwapChain> swap_chain, float backbuffer_scale
 {
  UpdateActiveConfig();
  for (SamplerState& m_sampler_state : m_sampler_states)
-    m_sampler_state.hex = RenderState::GetPointSamplerState().hex;
+    m_sampler_state = RenderState::GetPointSamplerState();
 }

 Renderer::~Renderer() = default;
@ -545,7 +545,7 @@ void Renderer::SetTexture(u32 index, const AbstractTexture* texture)
 void Renderer::SetSamplerState(u32 index, const SamplerState& state)
 {
  // Skip lookup if the state hasn't changed.
-  if (m_sampler_states[index].hex == state.hex)
+  if (m_sampler_states[index] == state)
    return;

  // Look up new state and replace in state tracker.
@ -557,7 +557,7 @@ void Renderer::SetSamplerState(u32 index, const SamplerState& state)
  }

  StateTracker::GetInstance()->SetSampler(index, sampler);
-  m_sampler_states[index].hex = state.hex;
+  m_sampler_states[index] = state;
 }

 void Renderer::SetComputeImageTexture(AbstractTexture* texture, bool read, bool write)
@ -588,7 +588,7 @@ void Renderer::ResetSamplerStates()
  // Invalidate all sampler states, next draw will re-initialize them.
  for (u32 i = 0; i < m_sampler_states.size(); i++)
  {
-    m_sampler_states[i].hex = RenderState::GetPointSamplerState().hex;
+    m_sampler_states[i] = RenderState::GetPointSamplerState();
    StateTracker::GetInstance()->SetSampler(i, g_object_cache->GetPointSampler());
  }

--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@ -286,6 +286,8 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config)
  config->backend_info.bSupportsLogicOp = false;                   // Dependent on features.
  config->backend_info.bSupportsLargePoints = false;               // Dependent on features.
  config->backend_info.bSupportsFramebufferFetch = false;          // No support.
+  config->backend_info.bSupportsCoarseDerivatives = true;          // Assumed support.
+  config->backend_info.bSupportsTextureQueryLevels = true;         // Assumed support.
 }

 void VulkanContext::PopulateBackendInfoAdapters(VideoConfig* config, const GPUList& gpu_list)
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@ -70,7 +70,6 @@ add_library(videocommon
  RenderBase.h
  RenderState.cpp
  RenderState.h
-  SamplerCommon.h
  ShaderCache.cpp
  ShaderCache.h
  ShaderGenCommon.cpp
--- a/Source/Core/VideoCommon/ConstantManager.h
+++ b/Source/Core/VideoCommon/ConstantManager.h
@ -21,7 +21,7 @@ struct PixelShaderConstants
  std::array<int4, 4> colors;
  std::array<int4, 4> kcolors;
  int4 alpha;
-  std::array<float4, 8> texdims;
+  std::array<uint4, 8> texdims;
  std::array<int4, 2> zbias;
  std::array<int4, 2> indtexscale;
  std::array<int4, 6> indtexmtx;
@ -32,7 +32,7 @@ struct PixelShaderConstants
  float4 zslope;
  std::array<float, 2> efbscale;  // .xy

-  // Constants from here onwards are only used in ubershaders.
+  // Constants from here onwards are only used in ubershaders, other than pack2.
  u32 genmode;                  // .z
  u32 alphaTest;                // .w
  u32 fogParam3;                // .x
@ -44,7 +44,7 @@ struct PixelShaderConstants
  u32 dither;                   // .z (bool)
  u32 bounding_box;             // .w (bool)
  std::array<uint4, 16> pack1;  // .xy - combiners, .z - tevind, .w - iref
-  std::array<uint4, 8> pack2;   // .x - tevorder, .y - tevksel
+  std::array<uint4, 8> pack2;   // .x - tevorder, .y - tevksel, .z/.w - SamplerState tm0/tm1
  std::array<int4, 32> konst;   // .rgba
  // The following are used in ubershaders when using shader_framebuffer_fetch blending
  u32 blend_enable;
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@ -381,7 +381,7 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
    // Declare samplers
    out.Write("SamplerState samp[8] : register(s0);\n"
              "\n"
-              "Texture2DArray Tex[8] : register(t0);\n");
+              "Texture2DArray tex[8] : register(t0);\n");
  }
  out.Write("\n");

@ -393,7 +393,7 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
  out.Write("\tint4 " I_COLORS "[4];\n"
            "\tint4 " I_KCOLORS "[4];\n"
            "\tint4 " I_ALPHA ";\n"
-            "\tfloat4 " I_TEXDIMS "[8];\n"
+            "\tint4 " I_TEXDIMS "[8];\n"
            "\tint4 " I_ZBIAS "[2];\n"
            "\tint4 " I_INDTEXSCALE "[2];\n"
            "\tint4 " I_INDTEXMTX "[6];\n"
@ -414,7 +414,7 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
            "\tbool  bpmem_dither;\n"
            "\tbool  bpmem_bounding_box;\n"
            "\tuint4 bpmem_pack1[16];\n"  // .xy - combiners, .z - tevind, .w - iref
-            "\tuint4 bpmem_pack2[8];\n"   // .x - tevorder, .y - tevksel
+            "\tuint4 bpmem_pack2[8];\n"   // .x - tevorder, .y - tevksel, .zw - SamplerState tm0/tm1
            "\tint4  konstLookup[32];\n"
            "\tbool  blend_enable;\n"
            "\tuint  blend_src_factor;\n"
@ -428,7 +428,9 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
            "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n"
            "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n"
            "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n"
-            "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n");
+            "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n"
+            "#define samp_texmode0(i) (bpmem_pack2[(i)].z)\n"
+            "#define samp_texmode1(i) (bpmem_pack2[(i)].w)\n\n");

  if (host_config.per_pixel_lighting)
  {
@ -534,14 +536,304 @@ void UpdateBoundingBox(float2 rawpos) {{
 )",
              fmt::arg("efb_height", EFB_HEIGHT), fmt::arg("efb_scale", I_EFBSCALE));
  }
+
+  if (host_config.manual_texture_sampling)
+  {
+    if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+    {
+      out.Write(R"(
+int4 readTexture(in sampler2DArray tex, uint u, uint v, int layer, int lod) {{
+  return iround(texelFetch(tex, int3(u, v, layer), lod) * 255.0);
+}}
+
+int4 readTextureLinear(in sampler2DArray tex, uint2 uv1, uint2 uv2, int layer, int lod, int2 frac_uv) {{)");
+    }
+    else if (api_type == APIType::D3D)
+    {
+      out.Write(R"(
+int4 readTexture(in Texture2DArray tex, uint u, uint v, int layer, int lod) {{
+  return iround(tex.Load(int4(u, v, layer, lod)) * 255.0);
+}}
+
+int4 readTextureLinear(in Texture2DArray tex, uint2 uv1, uint2 uv2, int layer, int lod, int2 frac_uv) {{)");
+    }
+
+    out.Write(R"(
+  int4 result =
+    readTexture(tex, uv1.x, uv1.y, layer, lod) * (128 - frac_uv.x) * (128 - frac_uv.y) +
+    readTexture(tex, uv2.x, uv1.y, layer, lod) * (      frac_uv.x) * (128 - frac_uv.y) +
+    readTexture(tex, uv1.x, uv2.y, layer, lod) * (128 - frac_uv.x) * (      frac_uv.y) +
+    readTexture(tex, uv2.x, uv2.y, layer, lod) * (      frac_uv.x) * (      frac_uv.y);
+  return result >> 14;
+}}
+)");
+
+    if (host_config.manual_texture_sampling_custom_texture_sizes)
+    {
+      // This is slower, and doesn't result in the same odd behavior that happens on console when
+      // wrapping with non-power-of-2 sizes, but it's fine for custom textures to have non-console
+      // behavior.
+      out.Write(R"(
+// Both GLSL and HLSL produce undefined values when the modulo operator (%) is used with a negative
+// dividend and a positive divisor.  We want a positive value such that SafeModulo(-1, 3) is 2.
+int SafeModulo(int dividend, int divisor) {{
+  if (dividend >= 0) {{
+    return dividend % divisor;
+  }} else {{
+    // This works because ~x is the same as -x - 1.
+    // `~x % 5` over -5 to -1 gives 4, 3, 2, 1, 0.  `4 - (~x % 5)` gives 0, 1, 2, 3, 4.
+    return (divisor - 1) - (~dividend % divisor);
+  }}
+}}
+
+uint WrapCoord(int coord, uint wrap, int size) {{
+  switch (wrap) {{
+    case {:s}:
+    default: // confirmed that clamp is used for invalid (3) via hardware test
+      return uint(clamp(coord, 0, size - 1));
+    case {:s}:
+      return uint(SafeModulo(coord, size));  // coord % size
+    case {:s}:
+      if (SafeModulo(coord, 2 * size) >= size) {{  // coord % (2 * size)
+        coord = ~coord;
+      }}
+      return uint(SafeModulo(coord, size));  // coord % size
+  }}
+}}
+)",
+                WrapMode::Clamp, WrapMode::Repeat, WrapMode::Mirror);
+    }
+    else
+    {
+      out.Write(R"(
+uint WrapCoord(int coord, uint wrap, int size) {{
+  switch (wrap) {{
+    case {:s}:
+    default: // confirmed that clamp is used for invalid (3) via hardware test
+      return uint(clamp(coord, 0, size - 1));
+    case {:s}:
+      return uint(coord & (size - 1));
+    case {:s}:
+      if ((coord & size) != 0) {{
+        coord = ~coord;
+      }}
+      return uint(coord & (size - 1));
+  }}
+}}
+)",
+                WrapMode::Clamp, WrapMode::Repeat, WrapMode::Mirror);
+    }
+  }
+
+  if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+  {
+    out.Write("\nint4 sampleTexture(uint texmap, in sampler2DArray tex, int2 uv, int layer) {{\n");
+  }
+  else if (api_type == APIType::D3D)
+  {
+    out.Write("\nint4 sampleTexture(uint texmap, in Texture2DArray tex, in SamplerState tex_samp, "
+              "int2 uv, int layer) {{\n");
+  }
+
+  if (!host_config.manual_texture_sampling)
+  {
+    out.Write("  float size_s = float(" I_TEXDIMS "[texmap].x * 128);\n"
+              "  float size_t = float(" I_TEXDIMS "[texmap].y * 128);\n"
+              "  float3 coords = float3(float(uv.x) / size_s, float(uv.y) / size_t, layer);\n");
+    if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+    {
+      out.Write("  return iround(255.0 * texture(tex, coords));\n}}\n");
+    }
+    else if (api_type == APIType::D3D)
+    {
+      out.Write("  return iround(255.0 * tex.Sample(tex_samp, coords));\n}}\n");
+    }
+  }
+  else
+  {
+    out.Write(R"(
+  uint texmode0 = samp_texmode0(texmap);
+  uint texmode1 = samp_texmode1(texmap);
+
+  uint wrap_s = {};
+  uint wrap_t = {};
+  bool mag_linear = {} != 0u;
+  bool mipmap_linear = {} != 0u;
+  bool min_linear = {} != 0u;
+  bool diag_lod = {} != 0u;
+  int lod_bias = {};
+  // uint max_aniso = TODO;
+  bool lod_clamp = {} != 0u;
+  int min_lod = int({});
+  int max_lod = int({});
+)",
+              BitfieldExtract<&SamplerState::TM0::wrap_u>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::wrap_v>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::mag_filter>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::mipmap_filter>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::min_filter>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::diag_lod>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::lod_bias>("texmode0"),
+              // BitfieldExtract<&SamplerState::TM0::max_aniso>("texmode0"),
+              BitfieldExtract<&SamplerState::TM0::lod_clamp>("texmode0"),
+              BitfieldExtract<&SamplerState::TM1::min_lod>("texmode1"),
+              BitfieldExtract<&SamplerState::TM1::max_lod>("texmode1"));
+
+    if (host_config.manual_texture_sampling_custom_texture_sizes)
+    {
+      out.Write(R"(
+  int native_size_s = )" I_TEXDIMS R"([texmap].x;
+  int native_size_t = )" I_TEXDIMS R"([texmap].y;
+)");
+
+      if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+      {
+        out.Write(R"(
+  int3 size = textureSize(tex, 0);
+  int size_s = size.x;
+  int size_t = size.y;
+)");
+        if (g_ActiveConfig.backend_info.bSupportsTextureQueryLevels)
+        {
+          out.Write("  int number_of_levels = textureQueryLevels(tex);\n");
+        }
+        else
+        {
+          out.Write("  int number_of_levels = 256;  // textureQueryLevels is not supported\n");
+          ERROR_LOG_FMT(VIDEO, "textureQueryLevels is not supported!  Odd graphical results may "
+                               "occur if custom textures are in use!");
+        }
+      }
+      else if (api_type == APIType::D3D)
+      {
+        ASSERT(g_ActiveConfig.backend_info.bSupportsTextureQueryLevels);
+        out.Write(R"(
+  int size_s, size_t, layers, number_of_levels;
+  tex.GetDimensions(0, size_s, size_t, layers, number_of_levels);
+)");
+      }
+
+      out.Write(R"(
+  // Prevent out-of-bounds LOD values when using custom textures
+  max_lod = min(max_lod, (number_of_levels - 1) << 4);
+  // Rescale uv to account for the new texture size
+  uv.x = (uv.x * size_s) / native_size_s;
+  uv.y = (uv.y * size_t) / native_size_t;
+)");
+    }
+    else
+    {
+      out.Write(R"(
+  int size_s = )" I_TEXDIMS R"([texmap].x;
+  int size_t = )" I_TEXDIMS R"([texmap].y;
+)");
+    }
+
+    if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+    {
+      if (g_ActiveConfig.backend_info.bSupportsCoarseDerivatives)
+      {
+        // The software renderer uses the equivalent of coarse derivatives, so use them here for
+        // consistency.  This hasn't been hardware tested.
+        // Note that bSupportsCoarseDerivatives being false only means dFdxCoarse and dFdxFine don't
+        // exist.  The GPU may still implement dFdx using coarse derivatives; we just don't have the
+        // ability to specifically require it.
+        out.Write(R"(
+  float2 uv_delta_x = abs(dFdxCoarse(float2(uv)));
+  float2 uv_delta_y = abs(dFdyCoarse(float2(uv)));
+)");
+      }
+      else
+      {
+        out.Write(R"(
+  float2 uv_delta_x = abs(dFdx(float2(uv)));
+  float2 uv_delta_y = abs(dFdy(float2(uv)));
+)");
+      }
+    }
+    else if (api_type == APIType::D3D)
+    {
+      ASSERT(g_ActiveConfig.backend_info.bSupportsCoarseDerivatives);
+      out.Write(R"(
+  float2 uv_delta_x = abs(ddx_coarse(float2(uv)));
+  float2 uv_delta_y = abs(ddy_coarse(float2(uv)));
+)");
+    }
+
+    // TODO: LOD bias is normally S2.5 (Dolphin uses S7.8 for arbitrary mipmap detection and higher
+    // IRs), but (at least per the software renderer) actual LOD is S28.4.  How does this work?
+    // Also, note that we can make some assumptions due to use of a SamplerState version of the BP
+    // configuration, which tidies things compared to whatever nonsense games can put in.
+    out.Write(R"(
+  float2 uv_delta = diag_lod ? uv_delta_x + uv_delta_y : max(uv_delta_x, uv_delta_y);
+  float max_delta = max(uv_delta.x / 128.0, uv_delta.y / 128.0);
+  // log2(x) is undefined if x <= 0, but in practice it seems log2(0) is -infinity, which becomes INT_MIN.
+  // If lod_bias is negative, adding it to INT_MIN causes an underflow, resulting in a large positive value.
+  // Hardware testing indicates that min_lod should be used when the derivative is 0.
+  int lod = max_delta == 0.0 ? min_lod : int(floor(log2(max_delta) * 16.0)) + (lod_bias >> 4);
+
+  bool is_linear = (lod > 0) ? min_linear : mag_linear;
+  lod = clamp(lod, min_lod, max_lod);
+  int base_lod = lod >> 4;
+  int frac_lod = lod & 15;
+  if (!mipmap_linear && frac_lod >= 8) {{
+    // Round to nearest LOD in point mode
+    base_lod++;
+  }}
+
+  if (is_linear) {{
+    uint2 texuv1 = uint2(
+        WrapCoord(((uv.x >> base_lod) - 64) >> 7, wrap_s, size_s >> base_lod),
+        WrapCoord(((uv.y >> base_lod) - 64) >> 7, wrap_t, size_t >> base_lod));
+    uint2 texuv2 = uint2(
+        WrapCoord(((uv.x >> base_lod) + 64) >> 7, wrap_s, size_s >> base_lod),
+        WrapCoord(((uv.y >> base_lod) + 64) >> 7, wrap_t, size_t >> base_lod));
+    int2 frac_uv = int2(((uv.x >> base_lod) - 64) & 0x7f, ((uv.y >> base_lod) - 64) & 0x7f);
+
+    int4 result = readTextureLinear(tex, texuv1, texuv2, layer, base_lod, frac_uv);
+
+    if (frac_lod != 0 && mipmap_linear) {{
+      texuv1 = uint2(
+          WrapCoord(((uv.x >> (base_lod + 1)) - 64) >> 7, wrap_s, size_s >> (base_lod + 1)),
+          WrapCoord(((uv.y >> (base_lod + 1)) - 64) >> 7, wrap_t, size_t >> (base_lod + 1)));
+      texuv2 = uint2(
+          WrapCoord(((uv.x >> (base_lod + 1)) + 64) >> 7, wrap_s, size_s >> (base_lod + 1)),
+          WrapCoord(((uv.y >> (base_lod + 1)) + 64) >> 7, wrap_t, size_t >> (base_lod + 1)));
+      frac_uv = int2(((uv.x >> (base_lod + 1)) - 64) & 0x7f, ((uv.y >> (base_lod + 1)) - 64) & 0x7f);
+
+      result *= 16 - frac_lod;
+      result += readTextureLinear(tex, texuv1, texuv2, layer, base_lod + 1, frac_uv) * frac_lod;
+      result >>= 4;
+    }}
+
+    return result;
+  }} else {{
+    uint2 texuv = uint2(
+        WrapCoord(uv.x >> (7 + base_lod), wrap_s, size_s >> base_lod),
+        WrapCoord(uv.y >> (7 + base_lod), wrap_t, size_t >> base_lod));
+
+    int4 result = readTexture(tex, texuv.x, texuv.y, layer, base_lod);
+
+    if (frac_lod != 0 && mipmap_linear) {{
+      texuv = uint2(
+          WrapCoord(uv.x >> (7 + base_lod + 1), wrap_s, size_s >> (base_lod + 1)),
+          WrapCoord(uv.y >> (7 + base_lod + 1), wrap_t, size_t >> (base_lod + 1)));
+
+      result *= 16 - frac_lod;
+      result += readTexture(tex, texuv.x, texuv.y, layer, base_lod + 1) * frac_lod;
+      result >>= 4;
+    }}
+    return result;
+  }}
+}}
+)");
+  }
 }

 static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
                       APIType api_type, bool stereo);
 static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBias bias, TevOp op,
                            bool clamp, TevScale scale, bool alpha);
-static void SampleTexture(ShaderCode& out, std::string_view texcoords, std::string_view texswap,
-                          int texmap, bool stereo, APIType api_type);
 static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type,
                           bool per_pixel_depth, bool use_dual_source);
 static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
@ -565,8 +857,20 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
            uid_data->genMode_numtexgens, uid_data->genMode_numindstages);

  // Stuff that is shared between ubershaders and pixelgen.
+  WriteBitfieldExtractHeader(out, api_type, host_config);
  WritePixelShaderCommonHeader(out, api_type, host_config, uid_data->bounding_box);

+  if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
+  {
+    out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) "
+              "sampleTexture(texmap, samp[texmap], uv, layer)\n");
+  }
+  else if (api_type == APIType::D3D)
+  {
+    out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) "
+              "sampleTexture(texmap, tex[texmap], samp[texmap], uv, layer)\n");
+  }
+
  if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
  {
    // Zcomploc (aka early_ztest) is a way to control whether depth test is done before
@ -754,6 +1058,8 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
      out.Write(",\n  in uint layer : SV_RenderTargetArrayIndex\n");
    out.Write("        ) {{\n");
  }
+  if (!stereo)
+    out.Write("\tint layer = 0;\n");

  out.Write("\tint4 c0 = " I_COLORS "[1], c1 = " I_COLORS "[2], c2 = " I_COLORS
            "[3], prev = " I_COLORS "[0];\n"
@ -811,7 +1117,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
    {
      out.Write("\tint2 fixpoint_uv{} = int2(", i);
      out.Write("(tex{}.z == 0.0 ? tex{}.xy : tex{}.xy / tex{}.z)", i, i, i, i);
-      out.Write(" * " I_TEXDIMS "[{}].zw);\n", i);
+      out.Write(" * float2(" I_TEXDIMS "[{}].zw * 128));\n", i);
      // TODO: S24 overflows here?
    }
  }
@ -834,8 +1140,8 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
      out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
                (i & 1) ? "zw" : "xy");

-      out.Write("\tint3 iindtex{} = ", i);
-      SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, api_type);
+      out.Write("\tint3 iindtex{0} = sampleTextureWrapper({1}u, tempcoord, layer).abg;\n", i,
+                texmap);
    }
  }

@ -1243,8 +1549,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
        '\0',
    };

-    out.Write("\ttextemp = ");
-    SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, api_type);
+    out.Write("\ttextemp = sampleTextureWrapper({0}u, tevcoord.xy, layer).{1};\n",
+              stage.tevorders_texmap, texswap);
  }
  else if (uid_data->genMode_numtexgens == 0)
  {
@ -1428,24 +1734,6 @@ static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBia
  out.Write("){}", tev_scale_table_right[u32(scale)]);
 }

-static void SampleTexture(ShaderCode& out, std::string_view texcoords, std::string_view texswap,
-                          int texmap, bool stereo, APIType api_type)
-{
-  out.SetConstantsUsed(C_TEXDIMS + texmap, C_TEXDIMS + texmap);
-
-  if (api_type == APIType::D3D)
-  {
-    out.Write("iround(255.0 * Tex[{}].Sample(samp[{}], float3({}.xy * " I_TEXDIMS
-              "[{}].xy, {}))).{};\n",
-              texmap, texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
-  }
-  else
-  {
-    out.Write("iround(255.0 * texture(samp[{}], float3({}.xy * " I_TEXDIMS "[{}].xy, {}))).{};\n",
-              texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
-  }
-}
-
 constexpr std::array<const char*, 8> tev_alpha_funcs_table{
    "(false)",         // CompareMode::Never
    "(prev.a <  {})",  // CompareMode::Less
--- a/Source/Core/VideoCommon/PixelShaderManager.cpp
+++ b/Source/Core/VideoCommon/PixelShaderManager.cpp
@ -273,16 +273,22 @@ void PixelShaderManager::SetDestAlphaChanged()

 void PixelShaderManager::SetTexDims(int texmapid, u32 width, u32 height)
 {
-  float rwidth = 1.0f / (width * 128.0f);
-  float rheight = 1.0f / (height * 128.0f);
-
  // TODO: move this check out to caller. There we could just call this function on texture changes
  // or better, use textureSize() in glsl
-  if (constants.texdims[texmapid][0] != rwidth || constants.texdims[texmapid][1] != rheight)
+  if (constants.texdims[texmapid][0] != width || constants.texdims[texmapid][1] != height)
    dirty = true;

-  constants.texdims[texmapid][0] = rwidth;
-  constants.texdims[texmapid][1] = rheight;
+  constants.texdims[texmapid][0] = width;
+  constants.texdims[texmapid][1] = height;
+}
+
+void PixelShaderManager::SetSamplerState(int texmapid, u32 tm0, u32 tm1)
+{
+  if (constants.pack2[texmapid][2] != tm0 || constants.pack2[texmapid][3] != tm1)
+    dirty = true;
+
+  constants.pack2[texmapid][2] = tm0;
+  constants.pack2[texmapid][3] = tm1;
 }

 void PixelShaderManager::SetZTextureBias()
@ -382,8 +388,8 @@ void PixelShaderManager::SetZTextureOpChanged()
 void PixelShaderManager::SetTexCoordChanged(u8 texmapid)
 {
  TCoordInfo& tc = bpmem.texcoords[texmapid];
-  constants.texdims[texmapid][2] = (float)(tc.s.scale_minus_1 + 1) * 128.0f;
-  constants.texdims[texmapid][3] = (float)(tc.t.scale_minus_1 + 1) * 128.0f;
+  constants.texdims[texmapid][2] = tc.s.scale_minus_1 + 1;
+  constants.texdims[texmapid][3] = tc.t.scale_minus_1 + 1;
  dirty = true;
 }

--- a/Source/Core/VideoCommon/PixelShaderManager.h
+++ b/Source/Core/VideoCommon/PixelShaderManager.h
@ -30,6 +30,7 @@ public:
  static void SetAlphaTestChanged();
  static void SetDestAlphaChanged();
  static void SetTexDims(int texmapid, u32 width, u32 height);
+  static void SetSamplerState(int texmapid, u32 tm0, u32 tm1);
  static void SetZTextureBias();
  static void SetViewportChanged();
  static void SetEfbScaleChanged(float scalex, float scaley);
--- a/Source/Core/VideoCommon/RenderState.cpp
+++ b/Source/Core/VideoCommon/RenderState.cpp
@ -2,9 +2,10 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include "VideoCommon/RenderState.h"
+
 #include <algorithm>
 #include <array>
-#include "VideoCommon/SamplerCommon.h"
+
 #include "VideoCommon/TextureConfig.h"

 void RasterizationState::Generate(const BPMemory& bp, PrimitiveType primitive_type)
@ -17,18 +18,6 @@ void RasterizationState::Generate(const BPMemory& bp, PrimitiveType primitive_ty
    cullmode = CullMode::None;
 }

-RasterizationState& RasterizationState::operator=(const RasterizationState& rhs)
-{
-  hex = rhs.hex;
-  return *this;
-}
-
-FramebufferState& FramebufferState::operator=(const FramebufferState& rhs)
-{
-  hex = rhs.hex;
-  return *this;
-}
-
 void DepthState::Generate(const BPMemory& bp)
 {
  testenable = bp.zmode.testenable.Value();
@ -36,12 +25,6 @@ void DepthState::Generate(const BPMemory& bp)
  func = bp.zmode.func.Value();
 }

-DepthState& DepthState::operator=(const DepthState& rhs)
-{
-  hex = rhs.hex;
-  return *this;
-}
-
 // If the framebuffer format has no alpha channel, it is assumed to
 // ONE on blending. As the backends may emulate this framebuffer
 // configuration with an alpha channel, we just drop all references
@ -216,42 +199,45 @@ void BlendingState::ApproximateLogicOpWithBlending()
  dstfactor = approximations[u32(logicmode.Value())].dstfactor;
 }

-BlendingState& BlendingState::operator=(const BlendingState& rhs)
-{
-  hex = rhs.hex;
-  return *this;
-}
-
 void SamplerState::Generate(const BPMemory& bp, u32 index)
 {
  auto tex = bp.tex.GetUnit(index);
-  const TexMode0& tm0 = tex.texMode0;
-  const TexMode1& tm1 = tex.texMode1;
+  const TexMode0& bp_tm0 = tex.texMode0;
+  const TexMode1& bp_tm1 = tex.texMode1;

  // GX can configure the mip filter to none. However, D3D and Vulkan can't express this in their
  // sampler states. Therefore, we set the min/max LOD to zero if this option is used.
-  min_filter = tm0.min_filter == FilterMode::Linear ? Filter::Linear : Filter::Point;
-  mipmap_filter = tm0.mipmap_filter == MipMode::Linear ? Filter::Linear : Filter::Point;
-  mag_filter = tm0.mag_filter == FilterMode::Linear ? Filter::Linear : Filter::Point;
+  tm0.min_filter = bp_tm0.min_filter;
+  tm0.mipmap_filter =
+      bp_tm0.mipmap_filter == MipMode::Linear ? FilterMode::Linear : FilterMode::Near;
+  tm0.mag_filter = bp_tm0.mag_filter;

  // If mipmaps are disabled, clamp min/max lod
-  max_lod = SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0) ? tm1.max_lod.Value() : 0;
-  min_lod = std::min(max_lod.Value(), static_cast<u64>(tm1.min_lod));
-  lod_bias = SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0) ? tm0.lod_bias * (256 / 32) : 0;
+  if (bp_tm0.mipmap_filter == MipMode::None)
+  {
+    tm1.max_lod = 0;
+    tm1.min_lod = 0;
+    tm0.lod_bias = 0;
+  }
+  else
+  {
+    // NOTE: When comparing, max is checked first, then min; if max is less than min, max wins
+    tm1.max_lod = bp_tm1.max_lod.Value();
+    tm1.min_lod = std::min(tm1.max_lod.Value(), bp_tm1.min_lod.Value());
+    tm0.lod_bias = bp_tm0.lod_bias * (256 / 32);
+  }

-  // Address modes
+  // Wrap modes
  // Hardware testing indicates that wrap_mode set to 3 behaves the same as clamp.
-  static constexpr std::array<AddressMode, 4> address_modes = {
-      {AddressMode::Clamp, AddressMode::Repeat, AddressMode::MirroredRepeat, AddressMode::Clamp}};
-  wrap_u = address_modes[u32(tm0.wrap_s.Value())];
-  wrap_v = address_modes[u32(tm0.wrap_t.Value())];
-  anisotropic_filtering = 0;
-}
+  auto filter_invalid_wrap = [](WrapMode mode) {
+    return (mode <= WrapMode::Mirror) ? mode : WrapMode::Clamp;
+  };
+  tm0.wrap_u = filter_invalid_wrap(bp_tm0.wrap_s);
+  tm0.wrap_v = filter_invalid_wrap(bp_tm0.wrap_t);

-SamplerState& SamplerState::operator=(const SamplerState& rhs)
-{
-  hex = rhs.hex;
-  return *this;
+  tm0.diag_lod = bp_tm0.diag_lod;
+  tm0.anisotropic_filtering = false;  // TODO: Respect BP anisotropic filtering mode
+  tm0.lod_clamp = bp_tm0.lod_clamp;   // TODO: What does this do?
 }

 namespace RenderState
@ -344,37 +330,42 @@ BlendingState GetNoColorWriteBlendState()
 SamplerState GetInvalidSamplerState()
 {
  SamplerState state;
-  state.hex = UINT64_C(0xFFFFFFFFFFFFFFFF);
+  state.tm0.hex = 0xFFFFFFFF;
+  state.tm1.hex = 0xFFFFFFFF;
  return state;
 }

 SamplerState GetPointSamplerState()
 {
  SamplerState state = {};
-  state.min_filter = SamplerState::Filter::Point;
-  state.mag_filter = SamplerState::Filter::Point;
-  state.mipmap_filter = SamplerState::Filter::Point;
-  state.wrap_u = SamplerState::AddressMode::Clamp;
-  state.wrap_v = SamplerState::AddressMode::Clamp;
-  state.min_lod = 0;
-  state.max_lod = 255;
-  state.lod_bias = 0;
-  state.anisotropic_filtering = false;
+  state.tm0.min_filter = FilterMode::Near;
+  state.tm0.mag_filter = FilterMode::Near;
+  state.tm0.mipmap_filter = FilterMode::Near;
+  state.tm0.wrap_u = WrapMode::Clamp;
+  state.tm0.wrap_v = WrapMode::Clamp;
+  state.tm1.min_lod = 0;
+  state.tm1.max_lod = 255;
+  state.tm0.lod_bias = 0;
+  state.tm0.anisotropic_filtering = false;
+  state.tm0.diag_lod = LODType::Edge;
+  state.tm0.lod_clamp = false;
  return state;
 }

 SamplerState GetLinearSamplerState()
 {
  SamplerState state = {};
-  state.min_filter = SamplerState::Filter::Linear;
-  state.mag_filter = SamplerState::Filter::Linear;
-  state.mipmap_filter = SamplerState::Filter::Linear;
-  state.wrap_u = SamplerState::AddressMode::Clamp;
-  state.wrap_v = SamplerState::AddressMode::Clamp;
-  state.min_lod = 0;
-  state.max_lod = 255;
-  state.lod_bias = 0;
-  state.anisotropic_filtering = false;
+  state.tm0.min_filter = FilterMode::Linear;
+  state.tm0.mag_filter = FilterMode::Linear;
+  state.tm0.mipmap_filter = FilterMode::Linear;
+  state.tm0.wrap_u = WrapMode::Clamp;
+  state.tm0.wrap_v = WrapMode::Clamp;
+  state.tm1.min_lod = 0;
+  state.tm1.max_lod = 255;
+  state.tm0.lod_bias = 0;
+  state.tm0.anisotropic_filtering = false;
+  state.tm0.diag_lod = LODType::Edge;
+  state.tm0.lod_clamp = false;
  return state;
 }

--- a/Source/Core/VideoCommon/RenderState.h
+++ b/Source/Core/VideoCommon/RenderState.h
@ -22,11 +22,24 @@ union RasterizationState
 {
  void Generate(const BPMemory& bp, PrimitiveType primitive_type);

-  RasterizationState& operator=(const RasterizationState& rhs);
+  RasterizationState() = default;
+  RasterizationState(const RasterizationState&) = default;
+  RasterizationState& operator=(const RasterizationState& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }
+  RasterizationState(RasterizationState&&) = default;
+  RasterizationState& operator=(RasterizationState&& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }

  bool operator==(const RasterizationState& rhs) const { return hex == rhs.hex; }
-  bool operator!=(const RasterizationState& rhs) const { return hex != rhs.hex; }
+  bool operator!=(const RasterizationState& rhs) const { return !operator==(rhs); }
  bool operator<(const RasterizationState& rhs) const { return hex < rhs.hex; }
+
  BitField<0, 2, CullMode> cullmode;
  BitField<3, 2, PrimitiveType> primitive;

@ -35,15 +48,28 @@ union RasterizationState

 union FramebufferState
 {
+  FramebufferState() = default;
+  FramebufferState(const FramebufferState&) = default;
+  FramebufferState& operator=(const FramebufferState& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }
+  FramebufferState(FramebufferState&&) = default;
+  FramebufferState& operator=(FramebufferState&& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }
+
+  bool operator==(const FramebufferState& rhs) const { return hex == rhs.hex; }
+  bool operator!=(const FramebufferState& rhs) const { return !operator==(rhs); }
+
  BitField<0, 8, AbstractTextureFormat> color_texture_format;
  BitField<8, 8, AbstractTextureFormat> depth_texture_format;
  BitField<16, 8, u32> samples;
  BitField<24, 1, u32> per_sample_shading;

-  bool operator==(const FramebufferState& rhs) const { return hex == rhs.hex; }
-  bool operator!=(const FramebufferState& rhs) const { return hex != rhs.hex; }
-  FramebufferState& operator=(const FramebufferState& rhs);
-
  u32 hex;
 };

@ -51,11 +77,24 @@ union DepthState
 {
  void Generate(const BPMemory& bp);

-  DepthState& operator=(const DepthState& rhs);
+  DepthState() = default;
+  DepthState(const DepthState&) = default;
+  DepthState& operator=(const DepthState& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }
+  DepthState(DepthState&&) = default;
+  DepthState& operator=(DepthState&& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }

  bool operator==(const DepthState& rhs) const { return hex == rhs.hex; }
-  bool operator!=(const DepthState& rhs) const { return hex != rhs.hex; }
+  bool operator!=(const DepthState& rhs) const { return !operator==(rhs); }
  bool operator<(const DepthState& rhs) const { return hex < rhs.hex; }
+
  BitField<0, 1, u32> testenable;
  BitField<1, 1, u32> updateenable;
  BitField<2, 3, CompareMode> func;
@ -71,11 +110,24 @@ union BlendingState
  // Will not be bit-correct, and in some cases not even remotely in the same ballpark.
  void ApproximateLogicOpWithBlending();

-  BlendingState& operator=(const BlendingState& rhs);
+  BlendingState() = default;
+  BlendingState(const BlendingState&) = default;
+  BlendingState& operator=(const BlendingState& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }
+  BlendingState(BlendingState&&) = default;
+  BlendingState& operator=(BlendingState&& rhs)
+  {
+    hex = rhs.hex;
+    return *this;
+  }

  bool operator==(const BlendingState& rhs) const { return hex == rhs.hex; }
-  bool operator!=(const BlendingState& rhs) const { return hex != rhs.hex; }
+  bool operator!=(const BlendingState& rhs) const { return !operator==(rhs); }
  bool operator<(const BlendingState& rhs) const { return hex < rhs.hex; }
+
  BitField<0, 1, u32> blendenable;
  BitField<1, 1, u32> logicopenable;
  BitField<2, 1, u32> dstalpha;
@ -93,43 +145,74 @@ union BlendingState
  u32 hex;
 };

-union SamplerState
+struct SamplerState
 {
-  using StorageType = u64;
-
-  enum class Filter : StorageType
-  {
-    Point,
-    Linear
-  };
-
-  enum class AddressMode : StorageType
-  {
-    Clamp,
-    Repeat,
-    MirroredRepeat
-  };
-
  void Generate(const BPMemory& bp, u32 index);

-  SamplerState& operator=(const SamplerState& rhs);
+  SamplerState() = default;
+  SamplerState(const SamplerState&) = default;
+  SamplerState& operator=(const SamplerState& rhs)
+  {
+    tm0.hex = rhs.tm0.hex;
+    tm1.hex = rhs.tm1.hex;
+    return *this;
+  }
+  SamplerState(SamplerState&&) = default;
+  SamplerState& operator=(SamplerState&& rhs)
+  {
+    tm0.hex = rhs.tm0.hex;
+    tm1.hex = rhs.tm1.hex;
+    return *this;
+  }

-  bool operator==(const SamplerState& rhs) const { return hex == rhs.hex; }
-  bool operator!=(const SamplerState& rhs) const { return hex != rhs.hex; }
-  bool operator<(const SamplerState& rhs) const { return hex < rhs.hex; }
-  BitField<0, 1, Filter> min_filter;
-  BitField<1, 1, Filter> mag_filter;
-  BitField<2, 1, Filter> mipmap_filter;
-  BitField<3, 2, AddressMode> wrap_u;
-  BitField<5, 2, AddressMode> wrap_v;
-  BitField<7, 16, s64> lod_bias;  // multiplied by 256
-  BitField<23, 8, u64> min_lod;   // multiplied by 16
-  BitField<31, 8, u64> max_lod;   // multiplied by 16
-  BitField<39, 1, u64> anisotropic_filtering;
+  bool operator==(const SamplerState& rhs) const { return Hex() == rhs.Hex(); }
+  bool operator!=(const SamplerState& rhs) const { return !operator==(rhs); }
+  bool operator<(const SamplerState& rhs) const { return Hex() < rhs.Hex(); }

-  StorageType hex;
+  constexpr u64 Hex() const { return tm0.hex | (static_cast<u64>(tm1.hex) << 32); }
+
+  // Based on BPMemory TexMode0/TexMode1, but with slightly higher precision and some
+  // simplifications
+  union TM0
+  {
+    // BP's mipmap_filter can be None, but that is represented here by setting min_lod and max_lod
+    // to 0
+    BitField<0, 1, FilterMode> min_filter;
+    BitField<1, 1, FilterMode> mag_filter;
+    BitField<2, 1, FilterMode> mipmap_filter;
+    // Guaranteed to be valid values (i.e. not 3)
+    BitField<3, 2, WrapMode> wrap_u;
+    BitField<5, 2, WrapMode> wrap_v;
+    BitField<7, 1, LODType> diag_lod;
+    BitField<8, 16, s32> lod_bias;         // multiplied by 256, higher precision than normal
+    BitField<24, 1, bool, u32> lod_clamp;  // TODO: This isn't currently implemented
+    BitField<25, 1, bool, u32> anisotropic_filtering;  // TODO: This doesn't use the BP one yet
+    u32 hex;
+  };
+  union TM1
+  {
+    // Min is guaranteed to be less than or equal to max
+    BitField<0, 8, u32> min_lod;  // multiplied by 16
+    BitField<8, 8, u32> max_lod;  // multiplied by 16
+    u32 hex;
+  };
+
+  TM0 tm0;
+  TM1 tm1;
 };

+namespace std
+{
+template <>
+struct hash<SamplerState>
+{
+  std::size_t operator()(SamplerState const& state) const noexcept
+  {
+    return std::hash<u64>{}(state.Hex());
+  }
+};
+}  // namespace std
+
 namespace RenderState
 {
 RasterizationState GetInvalidRasterizationState();
--- a/Source/Core/VideoCommon/SamplerCommon.h
+++ b/Source/Core/VideoCommon/SamplerCommon.h
@ -1,27 +0,0 @@
-// Copyright 2016 Dolphin Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-namespace SamplerCommon
-{
-// Helper for checking if a BPMemory TexMode0 register is set to Point
-// Filtering modes. This is used to decide whether Anisotropic enhancements
-// are (mostly) safe in the VideoBackends.
-// If both the minification and magnification filters are set to POINT modes
-// then applying anisotropic filtering is equivalent to forced filtering. Point
-// mode textures are usually some sort of 2D UI billboard which will end up
-// misaligned from the correct pixels when filtered anisotropically.
-template <class T>
-constexpr bool IsBpTexMode0PointFiltering(const T& tm0)
-{
-  return tm0.min_filter == FilterMode::Near && tm0.mag_filter == FilterMode::Near;
-}
-
-// Check if the minification filter has mipmap based filtering modes enabled.
-template <class T>
-constexpr bool AreBpTexMode0MipmapsEnabled(const T& tm0)
-{
-  return tm0.mipmap_filter != MipMode::None;
-}
-}  // namespace SamplerCommon
--- a/Source/Core/VideoCommon/ShaderGenCommon.cpp
+++ b/Source/Core/VideoCommon/ShaderGenCommon.cpp
@ -39,6 +39,9 @@ ShaderHostConfig ShaderHostConfig::GetCurrent()
  bits.backend_logic_op = g_ActiveConfig.backend_info.bSupportsLogicOp;
  bits.backend_palette_conversion = g_ActiveConfig.backend_info.bSupportsPaletteConversion;
  bits.enable_validation_layer = g_ActiveConfig.bEnableValidationLayer;
+  bits.manual_texture_sampling = !g_ActiveConfig.bFastTextureSampling;
+  bits.manual_texture_sampling_custom_texture_sizes =
+      g_ActiveConfig.ManualTextureSamplingWithHiResTextures();
  return bits;
 }

@ -105,6 +108,30 @@ void WriteIsNanHeader(ShaderCode& out, APIType api_type)
  }
 }

+void WriteBitfieldExtractHeader(ShaderCode& out, APIType api_type,
+                                const ShaderHostConfig& host_config)
+{
+  // ==============================================
+  //  BitfieldExtract for APIs which don't have it
+  // ==============================================
+  if (!host_config.backend_bitfield)
+  {
+    out.Write("uint bitfieldExtract(uint val, int off, int size) {{\n"
+              "  // This built-in function is only supported in OpenGL 4.0+ and ES 3.1+\n"
+              "  // Microsoft's HLSL compiler automatically optimises this to a bitfield extract "
+              "instruction.\n"
+              "  uint mask = uint((1 << size) - 1);\n"
+              "  return uint(val >> off) & mask;\n"
+              "}}\n\n");
+    out.Write("int bitfieldExtract(int val, int off, int size) {{\n"
+              "  // This built-in function is only supported in OpenGL 4.0+ and ES 3.1+\n"
+              "  // Microsoft's HLSL compiler automatically optimises this to a bitfield extract "
+              "instruction.\n"
+              "  return ((val << (32 - size - off)) >> (32 - size));\n"
+              "}}\n\n");
+  }
+}
+
 static void DefineOutputMember(ShaderCode& object, APIType api_type, std::string_view qualifier,
                               std::string_view type, std::string_view name, int var_index,
                               std::string_view semantic = {}, int semantic_index = -1)
--- a/Source/Core/VideoCommon/ShaderGenCommon.h
+++ b/Source/Core/VideoCommon/ShaderGenCommon.h
@ -14,6 +14,7 @@
 #include "Common/BitField.h"
 #include "Common/CommonTypes.h"
 #include "Common/StringUtil.h"
+#include "Common/TypeUtils.h"

 enum class APIType;

@ -168,6 +169,8 @@ union ShaderHostConfig
  BitField<21, 1, bool, u32> backend_logic_op;
  BitField<22, 1, bool, u32> backend_palette_conversion;
  BitField<23, 1, bool, u32> enable_validation_layer;
+  BitField<24, 1, bool, u32> manual_texture_sampling;
+  BitField<25, 1, bool, u32> manual_texture_sampling_custom_texture_sizes;

  static ShaderHostConfig GetCurrent();
 };
@ -177,6 +180,8 @@ std::string GetDiskShaderCacheFileName(APIType api_type, const char* type, bool
                                       bool include_host_config, bool include_api = true);

 void WriteIsNanHeader(ShaderCode& out, APIType api_type);
+void WriteBitfieldExtractHeader(ShaderCode& out, APIType api_type,
+                                const ShaderHostConfig& host_config);

 void GenerateVSOutputMembers(ShaderCode& object, APIType api_type, u32 texgens,
                             const ShaderHostConfig& host_config, std::string_view qualifier);
@ -195,6 +200,16 @@ void AssignVSOutputMembers(ShaderCode& object, std::string_view a, std::string_v
 const char* GetInterpolationQualifier(bool msaa, bool ssaa, bool in_glsl_interface_block = false,
                                      bool in = false);

+// bitfieldExtract generator for BitField types; casts source to int or uint to match signedness
+template <auto ptr_to_bitfield_member>
+std::string BitfieldExtract(std::string_view source)
+{
+  using BitFieldT = Common::MemberType<ptr_to_bitfield_member>;
+  return fmt::format("bitfieldExtract({}({}), {}, {})", BitFieldT::IsSigned() ? "int" : "uint",
+                     source, static_cast<u32>(BitFieldT::StartBit()),
+                     static_cast<u32>(BitFieldT::NumBits()));
+}
+
 // Constant variable names
 #define I_COLORS "color"
 #define I_KCOLORS "k"
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@ -40,7 +40,6 @@
 #include "VideoCommon/OpcodeDecoding.h"
 #include "VideoCommon/PixelShaderManager.h"
 #include "VideoCommon/RenderBase.h"
-#include "VideoCommon/SamplerCommon.h"
 #include "VideoCommon/ShaderCache.h"
 #include "VideoCommon/Statistics.h"
 #include "VideoCommon/TMEM.h"
@ -966,6 +965,18 @@ void TextureCacheBase::DumpTexture(TCacheEntry* entry, std::string basename, uns
  entry->texture->Save(filename, level);
 }

+// Returns true unless the BPMemory TexMode0 register uses point (Near) filtering
+// for both minification and magnification. This is used to decide whether the
+// anisotropic filtering enhancement is (mostly) safe to apply.
+// If both filters are set to point modes, then applying anisotropic filtering
+// is equivalent to forced filtering. Point mode textures are usually some sort
+// of 2D UI billboard which will end up misaligned from the correct pixels when
+// filtered anisotropically.
+static bool IsAnisostropicEnhancementSafe(const TexMode0& tm0)
+{
+  return !(tm0.min_filter == FilterMode::Near && tm0.mag_filter == FilterMode::Near);
+}
+
 static void SetSamplerState(u32 index, float custom_tex_scale, bool custom_tex,
                            bool has_arbitrary_mips)
 {
@ -977,19 +988,18 @@ static void SetSamplerState(u32 index, float custom_tex_scale, bool custom_tex,
  // Force texture filtering config option.
  if (g_ActiveConfig.bForceFiltering)
  {
-    state.min_filter = SamplerState::Filter::Linear;
-    state.mag_filter = SamplerState::Filter::Linear;
-    state.mipmap_filter = SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0) ?
-                              SamplerState::Filter::Linear :
-                              SamplerState::Filter::Point;
+    state.tm0.min_filter = FilterMode::Linear;
+    state.tm0.mag_filter = FilterMode::Linear;
+    state.tm0.mipmap_filter =
+        tm0.mipmap_filter != MipMode::None ? FilterMode::Linear : FilterMode::Near;
  }

  // Custom textures may have a greater number of mips
  if (custom_tex)
-    state.max_lod = 255;
+    state.tm1.max_lod = 255;

  // Anisotropic filtering option.
-  if (g_ActiveConfig.iMaxAnisotropy != 0 && !SamplerCommon::IsBpTexMode0PointFiltering(tm0))
+  if (g_ActiveConfig.iMaxAnisotropy != 0 && IsAnisostropicEnhancementSafe(tm0))
  {
    // https://www.opengl.org/registry/specs/EXT/texture_filter_anisotropic.txt
    // For predictable results on all hardware/drivers, only use one of:
@ -998,31 +1008,32 @@ static void SetSamplerState(u32 index, float custom_tex_scale, bool custom_tex,
    // Letting the game set other combinations will have varying arbitrary results;
    // possibly being interpreted as equal to bilinear/trilinear, implicitly
    // disabling anisotropy, or changing the anisotropic algorithm employed.
-    state.min_filter = SamplerState::Filter::Linear;
-    state.mag_filter = SamplerState::Filter::Linear;
-    if (SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0))
-      state.mipmap_filter = SamplerState::Filter::Linear;
-    state.anisotropic_filtering = 1;
+    state.tm0.min_filter = FilterMode::Linear;
+    state.tm0.mag_filter = FilterMode::Linear;
+    if (tm0.mipmap_filter != MipMode::None)
+      state.tm0.mipmap_filter = FilterMode::Linear;
+    state.tm0.anisotropic_filtering = true;
  }
  else
  {
-    state.anisotropic_filtering = 0;
+    state.tm0.anisotropic_filtering = false;
  }

-  if (has_arbitrary_mips && SamplerCommon::AreBpTexMode0MipmapsEnabled(tm0))
+  if (has_arbitrary_mips && tm0.mipmap_filter != MipMode::None)
  {
    // Apply a secondary bias calculated from the IR scale to pull inwards mipmaps
    // that have arbitrary contents, eg. are used for fog effects where the
    // distance they kick in at is important to preserve at any resolution.
    // Correct this with the upscaling factor of custom textures.
-    s64 lod_offset = std::log2(g_renderer->GetEFBScale() / custom_tex_scale) * 256.f;
-    state.lod_bias = std::clamp<s64>(state.lod_bias + lod_offset, -32768, 32767);
+    s32 lod_offset = std::log2(g_renderer->GetEFBScale() / custom_tex_scale) * 256.f;
+    state.tm0.lod_bias = std::clamp<s32>(state.tm0.lod_bias + lod_offset, -32768, 32767);

    // Anisotropic also pushes mips farther away so it cannot be used either
-    state.anisotropic_filtering = 0;
+    state.tm0.anisotropic_filtering = false;
  }

  g_renderer->SetSamplerState(index, state);
+  PixelShaderManager::SetSamplerState(index, state.tm0.hex, state.tm1.hex);
 }

 void TextureCacheBase::BindTextures(BitSet32 used_textures)
--- a/Source/Core/VideoCommon/TextureInfo.cpp
+++ b/Source/Core/VideoCommon/TextureInfo.cpp
@ -9,7 +9,6 @@
 #include "Common/Align.h"
 #include "Core/HW/Memmap.h"
 #include "VideoCommon/BPMemory.h"
-#include "VideoCommon/SamplerCommon.h"
 #include "VideoCommon/TextureDecoder.h"

 TextureInfo TextureInfo::FromStage(u32 stage)
@ -28,7 +27,7 @@ TextureInfo TextureInfo::FromStage(u32 stage)
  const u8* tlut_ptr = &texMem[tlutaddr];

  std::optional<u32> mip_count;
-  const bool has_mipmaps = SamplerCommon::AreBpTexMode0MipmapsEnabled(tex.texMode0);
+  const bool has_mipmaps = tex.texMode0.mipmap_filter != MipMode::None;
  if (has_mipmaps)
  {
    mip_count = (tex.texMode1.max_lod + 0xf) / 0x10;
--- a/Source/Core/VideoCommon/UberShaderCommon.cpp
+++ b/Source/Core/VideoCommon/UberShaderCommon.cpp
@ -9,24 +9,6 @@

 namespace UberShader
 {
-void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type,
-                                 const ShaderHostConfig& host_config)
-{
-  // ==============================================
-  //  BitfieldExtract for APIs which don't have it
-  // ==============================================
-  if (!host_config.backend_bitfield)
-  {
-    out.Write("uint bitfieldExtract(uint val, int off, int size) {{\n"
-              "	// This built-in function is only support in OpenGL 4.0+ and ES 3.1+\n"
-              "	// Microsoft's HLSL compiler automatically optimises this to a bitfield extract "
-              "instruction.\n"
-              "	uint mask = uint((1 << size) - 1);\n"
-              "	return uint(val >> off) & mask;\n"
-              "}}\n\n");
-  }
-}
-
 void WriteLightingFunction(ShaderCode& out)
 {
  // ==============================================
--- a/Source/Core/VideoCommon/UberShaderCommon.h
+++ b/Source/Core/VideoCommon/UberShaderCommon.h
@ -3,37 +3,18 @@

 #pragma once

-#include <string>
 #include <string_view>

-#include <fmt/format.h>
-
-#include "Common/CommonTypes.h"
-#include "Common/TypeUtils.h"
-
 class ShaderCode;
 enum class APIType;
 union ShaderHostConfig;

 namespace UberShader
 {
-// Common functions across all ubershaders
-void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type,
-                                 const ShaderHostConfig& host_config);
-
 // Vertex lighting
 void WriteLightingFunction(ShaderCode& out);
 void WriteVertexLighting(ShaderCode& out, APIType api_type, std::string_view world_pos_var,
                         std::string_view normal_var, std::string_view in_color_0_var,
                         std::string_view in_color_1_var, std::string_view out_color_0_var,
                         std::string_view out_color_1_var);
-
-// bitfieldExtract generator for BitField types
-template <auto ptr_to_bitfield_member>
-std::string BitfieldExtract(std::string_view source)
-{
-  using BitFieldT = Common::MemberType<ptr_to_bitfield_member>;
-  return fmt::format("bitfieldExtract({}, {}, {})", source, static_cast<u32>(BitFieldT::StartBit()),
-                     static_cast<u32>(BitFieldT::NumBits()));
-}
 }  // namespace UberShader
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@ -63,8 +63,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,

  out.Write("// Pixel UberShader for {} texgens{}{}\n", numTexgen,
            early_depth ? ", early-depth" : "", per_pixel_depth ? ", per-pixel depth" : "");
+  WriteBitfieldExtractHeader(out, api_type, host_config);
  WritePixelShaderCommonHeader(out, api_type, host_config, bounding_box);
-  WriteUberShaderCommonHeader(out, api_type, host_config);
  if (per_pixel_lighting)
    WriteLightingFunction(out);

@ -226,17 +226,17 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
  {
    // Doesn't look like DirectX supports this. Oh well the code path is here just in case it
    // supports this in the future.
-    out.Write("int4 sampleTexture(uint sampler_num, float3 uv) {{\n");
+    out.Write("int4 sampleTextureWrapper(uint texmap, int2 uv, int layer) {{\n");
    if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
-      out.Write("  return iround(texture(samp[sampler_num], uv) * 255.0);\n");
+      out.Write("  return sampleTexture(texmap, samp[texmap], uv, layer);\n");
    else if (api_type == APIType::D3D)
-      out.Write("  return iround(Tex[sampler_num].Sample(samp[sampler_num], uv) * 255.0);\n");
+      out.Write("  return sampleTexture(texmap, tex[texmap], samp[texmap], uv, layer);\n");
    out.Write("}}\n\n");
  }
  else
  {
-    out.Write("int4 sampleTexture(uint sampler_num, float3 uv) {{\n"
-              "  // This is messy, but DirectX, OpenGL 3.3 and OpenGL ES 3.0 doesn't support "
+    out.Write("int4 sampleTextureWrapper(uint sampler_num, int2 uv, int layer) {{\n"
+              "  // This is messy, but DirectX, OpenGL 3.3, and OpenGL ES 3.0 don't support "
              "dynamic indexing of the sampler array\n"
              "  // With any luck the shader compiler will optimise this if the hardware supports "
              "dynamic indexing.\n"
@ -244,9 +244,14 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
    for (int i = 0; i < 8; i++)
    {
      if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
-        out.Write("  case {}u: return iround(texture(samp[{}], uv) * 255.0);\n", i, i);
+      {
+        out.Write("  case {0}u: return sampleTexture({0}u, samp[{0}u], uv, layer);\n", i);
+      }
      else if (api_type == APIType::D3D)
-        out.Write("  case {}u: return iround(Tex[{}].Sample(samp[{}], uv) * 255.0);\n", i, i, i);
+      {
+        out.Write("  case {0}u: return sampleTexture({0}u, tex[{0}u], samp[{0}u], uv, layer);\n",
+                  i);
+      }
    }
    out.Write("  }}\n"
              "}}\n\n");
@ -284,8 +289,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
  // ======================
  //    Indirect Lookup
  // ======================
-  const auto LookupIndirectTexture = [&out, stereo](std::string_view out_var_name,
-                                                    std::string_view in_index_name) {
+  const auto LookupIndirectTexture = [&out](std::string_view out_var_name,
+                                            std::string_view in_index_name) {
    // in_index_name is the indirect stage, not the tev stage
    // bpmem_iref is packed differently from RAS1_IREF
    // This function assumes bpmem_iref is nonzero (i.e. matrix is not off, and the
@ -301,11 +306,9 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
              "  else\n"
              "    fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[{} >> 1].zw;\n"
              "\n"
-              "  {} = sampleTexture(texmap, float3(float2(fixedPoint_uv) * " I_TEXDIMS
-              "[texmap].xy, {})).abg;\n"
-              "}}",
-              in_index_name, in_index_name, in_index_name, in_index_name, out_var_name,
-              stereo ? "float(layer)" : "0.0");
+              "  {} = sampleTextureWrapper(texmap, fixedPoint_uv, layer).abg;\n"
+              "}}\n",
+              in_index_name, in_index_name, in_index_name, in_index_name, out_var_name);
  };

  // ======================
@ -729,6 +732,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
      out.Write(",\n  in uint layer : SV_RenderTargetArrayIndex\n");
    out.Write("\n        ) {{\n");
  }
+  if (!stereo)
+    out.Write("  int layer = 0;\n");

  out.Write("  int3 tevcoord = int3(0, 0, 0);\n"
            "  State s;\n"
@ -786,7 +791,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
    {
      out.Write("    int2 fixpoint_uv{} = int2(", i);
      out.Write("(tex{}.z == 0.0 ? tex{}.xy : tex{}.xy / tex{}.z)", i, i, i, i);
-      out.Write(" * " I_TEXDIMS "[{}].zw);\n", i);
+      out.Write(" * float2(" I_TEXDIMS "[{}].zw * 128));\n", i);
      // TODO: S24 overflows here?
    }

@ -820,7 +825,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
    // For the undefined case, we just skip applying the indirect operation, which is close enough.
    // Viewtiful Joe hits the undefined case (bug 12525).
    // Wrapping and add to previous still apply in this case (and when the stage is disabled).
-    out.Write("      if (bpmem_iref(bt) != 0u) {{");
+    out.Write("      if (bpmem_iref(bt) != 0u) {{\n");
    out.Write("        int3 indcoord;\n");
    LookupIndirectTexture("indcoord", "bt");
    out.Write("        if (bs != 0u)\n"
@ -910,10 +915,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
              "      uint sampler_num = {};\n",
              BitfieldExtract<&TwoTevStageOrders::texmap0>("ss.order"));
    out.Write("\n"
-              "      float2 uv = (float2(tevcoord.xy)) * " I_TEXDIMS "[sampler_num].xy;\n");
-    out.Write("      int4 color = sampleTexture(sampler_num, float3(uv, {}));\n",
-              stereo ? "float(layer)" : "0.0");
-    out.Write("      uint swap = {};\n",
+              "      int4 color = sampleTextureWrapper(sampler_num, tevcoord.xy, layer);\n"
+              "      uint swap = {};\n",
              BitfieldExtract<&TevStageCombiner::AlphaCombiner::tswap>("ss.ac"));
    out.Write("      s.TexColor = Swizzle(swap, color);\n");
    out.Write("    }} else {{\n"
--- a/Source/Core/VideoCommon/UberShaderVertex.cpp
+++ b/Source/Core/VideoCommon/UberShaderVertex.cpp
@ -49,8 +49,8 @@ ShaderCode GenVertexShader(APIType api_type, const ShaderHostConfig& host_config
  GenerateVSOutputMembers(out, api_type, num_texgen, host_config, "");
  out.Write("}};\n\n");

-  WriteUberShaderCommonHeader(out, api_type, host_config);
  WriteIsNanHeader(out, api_type);
+  WriteBitfieldExtractHeader(out, api_type, host_config);
  WriteLightingFunction(out);

  if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
--- a/Source/Core/VideoCommon/VertexManagerBase.cpp
+++ b/Source/Core/VideoCommon/VertexManagerBase.cpp
@ -27,7 +27,6 @@
 #include "VideoCommon/PerfQueryBase.h"
 #include "VideoCommon/PixelShaderManager.h"
 #include "VideoCommon/RenderBase.h"
-#include "VideoCommon/SamplerCommon.h"
 #include "VideoCommon/Statistics.h"
 #include "VideoCommon/TextureCacheBase.h"
 #include "VideoCommon/VertexLoaderManager.h"
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@ -135,6 +135,7 @@ void VideoConfig::Refresh()
  bVertexRounding = Config::Get(Config::GFX_HACK_VERTEX_ROUDING);
  iEFBAccessTileSize = Config::Get(Config::GFX_HACK_EFB_ACCESS_TILE_SIZE);
  iMissingColorValue = Config::Get(Config::GFX_HACK_MISSING_COLOR_VALUE);
+  bFastTextureSampling = Config::Get(Config::GFX_HACK_FAST_TEXTURE_SAMPLING);

  bPerfQueriesEnable = Config::Get(Config::GFX_PERF_QUERIES_ENABLE);

--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@ -135,6 +135,7 @@ struct VideoConfig final
  int iLog = 0;           // CONF_ bits
  int iSaveTargetId = 0;  // TODO: Should be dropped
  u32 iMissingColorValue = 0;
+  bool bFastTextureSampling = false;

  // Stereoscopy
  StereoMode stereo_mode{};
@ -230,6 +231,8 @@ struct VideoConfig final
    bool bSupportsDepthReadback = false;
    bool bSupportsShaderBinaries = false;
    bool bSupportsPipelineCacheData = false;
+    bool bSupportsCoarseDerivatives = false;
+    bool bSupportsTextureQueryLevels = false;
  } backend_info;

  // Utility
@ -243,6 +246,16 @@ struct VideoConfig final
    return backend_info.bSupportsGPUTextureDecoding && bEnableGPUTextureDecoding;
  }
  bool UseVertexRounding() const { return bVertexRounding && iEFBScale != 1; }
+  bool ManualTextureSamplingWithHiResTextures() const
+  {
+    // Hi-res textures (including hi-res EFB copies, but not native-resolution EFB copies at higher
+    // internal resolutions) break the wrapping logic used by manual texture sampling.
+    if (bFastTextureSampling)
+      return false;
+    if (iEFBScale != 1 && bCopyEFBScaled)
+      return true;
+    return bHiresTextures;
+  }
  bool UsingUberShaders() const;
  u32 GetShaderCompilerThreads() const;
  u32 GetShaderPrecompilerThreads() const;