Merge pull request #11028 from tellowkrinkle/MetalFixes

Various Metal renderer improvements
2022-10-24 15:22:37 -04:00 · 2022-10-24 15:22:37 -04:00 · b66793194e
parent 4787b25a7f fd2680d8b4
commit b66793194e
16 changed files with 284 additions and 29 deletions
--- a/Source/Core/Core/Config/GraphicsSettings.cpp
+++ b/Source/Core/Core/Config/GraphicsSettings.cpp
@ -87,6 +87,11 @@ const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE{
 const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{
    {System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false};

+const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS{  // Metal only; defaults to Auto
+    {System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto};
+const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE{{System::GFX, "Settings", "MTLUsePresentDrawable"},
+                                              false};  // Metal only; off by default
+
 const Info<bool> GFX_SW_DUMP_OBJECTS{{System::GFX, "Settings", "SWDumpObjects"}, false};
 const Info<bool> GFX_SW_DUMP_TEV_STAGES{{System::GFX, "Settings", "SWDumpTevStages"}, false};
 const Info<bool> GFX_SW_DUMP_TEV_TEX_FETCHES{{System::GFX, "Settings", "SWDumpTevTexFetches"},
--- a/Source/Core/Core/Config/GraphicsSettings.h
+++ b/Source/Core/Core/Config/GraphicsSettings.h
@ -11,6 +11,7 @@ enum class AspectMode : int;
 enum class ShaderCompilationMode : int;
 enum class StereoMode : int;
 enum class FreelookControlType : int;
+enum class TriState : int;

 namespace Config
 {
@ -75,6 +76,9 @@ extern const Info<int> GFX_SHADER_PRECOMPILER_THREADS;
 extern const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE;
 extern const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION;

+extern const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS;
+extern const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE;
+
 extern const Info<bool> GFX_SW_DUMP_OBJECTS;
 extern const Info<bool> GFX_SW_DUMP_TEV_STAGES;
 extern const Info<bool> GFX_SW_DUMP_TEV_TEX_FETCHES;
--- a/Source/Core/VideoBackends/Metal/CMakeLists.txt
+++ b/Source/Core/VideoBackends/Metal/CMakeLists.txt
@ -39,3 +39,5 @@ PRIVATE
  ${METAL_LIBRARY}
  ${QUARTZCORE_LIBRARY}
 )
+
+target_compile_options(videometal PRIVATE -fno-objc-arc)
--- a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm
+++ b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm
@ -36,6 +36,7 @@ std::vector<BBoxType> Metal::BoundingBox::Read(u32 index, u32 length)
  {
    g_state_tracker->EndRenderPass();
    g_state_tracker->FlushEncoders();
+    g_state_tracker->NotifyOfCPUGPUSync();
    g_state_tracker->WaitForFlushedEncoders();
    return std::vector<BBoxType>(m_cpu_buffer_ptr + index, m_cpu_buffer_ptr + index + length);
  }
--- a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm
+++ b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm
@ -56,6 +56,7 @@ void Metal::PerfQuery::FlushResults()

  // There's a possibility that some active performance queries are unflushed
  g_state_tracker->FlushEncoders();
+  g_state_tracker->NotifyOfCPUGPUSync();

  std::unique_lock<std::mutex> lock(m_results_mtx);
  while (!IsFlushed())
--- a/Source/Core/VideoBackends/Metal/MTLRenderer.mm
+++ b/Source/Core/VideoBackends/Metal/MTLRenderer.mm
@ -20,6 +20,7 @@ Metal::Renderer::Renderer(MRCOwned<CAMetalLayer*> layer, int width, int height,
      m_layer(std::move(layer))
 {
  UpdateActiveConfig();
+  [m_layer setDisplaySyncEnabled:g_ActiveConfig.bVSyncActive];
 }

 Metal::Renderer::~Renderer() = default;
@ -454,8 +455,15 @@ void Metal::Renderer::PresentBackbuffer()
    g_state_tracker->EndRenderPass();
    if (m_drawable)
    {
-      [g_state_tracker->GetRenderCmdBuf()
-          addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; }];
+      // PresentDrawable refuses to allow Dolphin to present faster than the display's refresh rate
+      // when windowed (or fullscreen with vsync enabled, but that's more understandable).
+      // On the other hand, it helps Xcode's GPU captures start and stop on frame boundaries
+      // which is convenient.  Put it here as a default-off config, which we can override in Xcode.
+      if (g_ActiveConfig.bUsePresentDrawable)
+        [g_state_tracker->GetRenderCmdBuf() presentDrawable:m_drawable];
+      else
+        [g_state_tracker->GetRenderCmdBuf()
+            addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; }];
      m_bb_texture->SetMTLTexture(nullptr);
      m_drawable = nullptr;
    }
--- a/Source/Core/VideoBackends/Metal/MTLStateTracker.h
+++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.h
@ -34,7 +34,6 @@ public:
    Uniform,
    Vertex,
    Index,
-    TextureData,
    Texels,
    Last = Texels
  };
@ -75,6 +74,14 @@ public:
    return m_current_draw != 1 + m_last_finished_draw.load(std::memory_order_acquire);
  }
  void ReloadSamplers();
+  /// Called once the CPU is about to wait on GPU work.  Switches manual buffer upload off,
+  /// but only if the feature is supported and currently on, and only while no upload or
+  /// render command buffer is open.
+  void NotifyOfCPUGPUSync()
+  {
+    const bool manual_upload_active = g_features.manual_buffer_upload && m_manual_buffer_upload;
+    const bool has_open_cmdbuf = m_upload_cmdbuf || m_current_render_cmdbuf;
+    if (manual_upload_active && !has_open_cmdbuf)
+      SetManualBufferUpload(false);
+  }

  void SetPipeline(const Pipeline* pipe);
  void SetPipeline(const ComputePipeline* pipe);
@ -106,6 +113,7 @@ public:
  {
    return (amt + static_cast<size_t>(align)) & ~static_cast<size_t>(align);
  }
+  Map AllocateForTextureUpload(size_t amt);
  Map Allocate(UploadBuffer buffer_idx, size_t amt, AlignMask align)
  {
    Preallocate(buffer_idx, amt);
@ -119,6 +127,7 @@ public:
                  static_cast<size_t>(align)) == 0);
    return CommitPreallocation(buffer_idx, Align(amt, align));
  }
+  id<MTLBlitCommandEncoder> GetUploadEncoder();
  id<MTLBlitCommandEncoder> GetTextureUploadEncoder();
  id<MTLCommandBuffer> GetRenderCmdBuf();

@ -142,18 +151,28 @@ private:
    void Reset(size_t new_size);
  };

-  struct Buffer
+  struct CPUBuffer
  {
    UsageTracker usage;
    MRCOwned<id<MTLBuffer>> mtlbuffer;
    void* buffer = nullptr;
  };

+  struct BufferPair  // CPU-written upload buffer plus an optional GPU-private copy
+  {
+    UsageTracker usage;  // tracks the allocation position and size within the buffer
+    MRCOwned<id<MTLBuffer>> cpubuffer;  // shared, write-combined buffer the CPU writes into
+    MRCOwned<id<MTLBuffer>> gpubuffer;  // private-storage copy; only with manual_buffer_upload
+    void* buffer = nullptr;  // CPU pointer: [cpubuffer contents]
+    size_t last_upload = 0;  // offset up to which cpubuffer was copied into gpubuffer
+  };
+
  struct Backref;
  struct PerfQueryTracker;

  std::shared_ptr<Backref> m_backref;
  std::vector<std::shared_ptr<PerfQueryTracker>> m_perf_query_tracker_cache;
+  MRCOwned<id<MTLFence>> m_fence;
  MRCOwned<id<MTLCommandBuffer>> m_upload_cmdbuf;
  MRCOwned<id<MTLBlitCommandEncoder>> m_upload_encoder;
  MRCOwned<id<MTLCommandBuffer>> m_texture_upload_cmdbuf;
@ -165,7 +184,8 @@ private:
  MRCOwned<MTLRenderPassDescriptor*> m_render_pass_desc[3];
  MRCOwned<MTLRenderPassDescriptor*> m_resolve_pass_desc;
  Framebuffer* m_current_framebuffer;
-  Buffer m_upload_buffers[static_cast<int>(UploadBuffer::Last) + 1];
+  CPUBuffer m_texture_upload_buffer;
+  BufferPair m_upload_buffers[static_cast<int>(UploadBuffer::Last) + 1];
  u64 m_current_draw = 1;
  std::atomic<u64> m_last_finished_draw{0};

@ -250,9 +270,12 @@ private:
  } m_state;

  u32 m_perf_query_tracker_counter = 0;
+  bool m_manual_buffer_upload = false;

+  void SetManualBufferUpload(bool enable);
  std::shared_ptr<PerfQueryTracker> NewPerfQueryTracker();
  void SetSamplerForce(u32 idx, const SamplerState& sampler);
+  void Sync(BufferPair& buffer);
  Map CommitPreallocation(UploadBuffer buffer_idx, size_t actual_amt);
  void CheckViewport();
  void CheckScissor();
--- a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm
+++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm
@ -45,12 +45,11 @@ static NSString* GetName(Metal::StateTracker::UploadBuffer buffer)
  // clang-format off
  switch (buffer)
  {
-    case Metal::StateTracker::UploadBuffer::TextureData: return @"Texture Data";
-    case Metal::StateTracker::UploadBuffer::Texels:      return @"Texels";
-    case Metal::StateTracker::UploadBuffer::Vertex:      return @"Vertices";
-    case Metal::StateTracker::UploadBuffer::Index:       return @"Indices";
-    case Metal::StateTracker::UploadBuffer::Uniform:     return @"Uniforms";
-    case Metal::StateTracker::UploadBuffer::Other:       return @"Generic Upload";
+    case Metal::StateTracker::UploadBuffer::Texels:  return @"Texels";
+    case Metal::StateTracker::UploadBuffer::Vertex:  return @"Vertices";
+    case Metal::StateTracker::UploadBuffer::Index:   return @"Indices";
+    case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms";
+    case Metal::StateTracker::UploadBuffer::Other:   return @"Generic Upload";
  }
  // clang-format on
 }
@ -105,6 +104,7 @@ void Metal::StateTracker::UsageTracker::Reset(size_t new_size)
 Metal::StateTracker::StateTracker() : m_backref(std::make_shared<Backref>(this))
 {
  m_flags.should_apply_label = true;
+  m_fence = MRCTransfer([g_device newFence]);
  for (MRCOwned<MTLRenderPassDescriptor*>& rpdesc : m_render_pass_desc)
  {
    rpdesc = MRCTransfer([MTLRenderPassDescriptor new]);
@ -141,9 +141,10 @@ Metal::StateTracker::~StateTracker()

 // MARK: BufferPair Ops

-std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt)
+Metal::StateTracker::Map Metal::StateTracker::AllocateForTextureUpload(size_t amt)
 {
-  Buffer& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  amt = (amt + 15) & ~15ull;
+  CPUBuffer& buffer = m_texture_upload_buffer;
  u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire);
  bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt);
  if (__builtin_expect(needs_new, false))
@ -155,11 +156,61 @@ std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_id
    MTLResourceOptions options =
        MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
    buffer.mtlbuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
-    [buffer.mtlbuffer setLabel:GetName(buffer_idx)];
+    [buffer.mtlbuffer setLabel:@"Texture Upload Buffer"];
    ASSERT_MSG(VIDEO, buffer.mtlbuffer, "Failed to allocate MTLBuffer (out of memory?)");
    buffer.buffer = [buffer.mtlbuffer contents];
    buffer.usage.Reset(newsize);
  }
+
+  size_t pos = buffer.usage.Allocate(m_current_draw, amt);
+
+  Map ret = {buffer.mtlbuffer, pos, reinterpret_cast<char*>(buffer.buffer) + pos};
+  DEBUG_ASSERT(pos <= buffer.usage.Size() &&
+               "Previous code should have guaranteed there was enough space");
+  return ret;
+}
+
+std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt)
+{  // Makes room for amt bytes in the given upload buffer; returns {CPU write pointer, offset}.
+  BufferPair& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire);
+  size_t base_pos = buffer.usage.Pos();  // read before PrepareForAllocation, for the copy below
+  bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt);
+  bool needs_upload = needs_new || buffer.usage.Pos() == 0;  // buffer replaced, or position is 0
+  if (m_manual_buffer_upload && needs_upload)
+  {
+    if (base_pos != buffer.last_upload)  // [last_upload, base_pos) has not been copied yet
+    {
+      id<MTLBlitCommandEncoder> encoder = GetUploadEncoder();
+      [encoder copyFromBuffer:buffer.cpubuffer
+                 sourceOffset:buffer.last_upload
+                     toBuffer:buffer.gpubuffer
+            destinationOffset:buffer.last_upload
+                         size:base_pos - buffer.last_upload];
+    }
+    buffer.last_upload = 0;  // the next upload range starts from offset 0
+  }
+  if (__builtin_expect(needs_new, false))
+  {
+    // Orphan buffer: replace it with one at least twice as large (min 4096) that also fits amt
+    size_t newsize = std::max<size_t>(buffer.usage.Size() * 2, 4096);
+    while (newsize < amt)
+      newsize *= 2;
+    MTLResourceOptions options =
+        MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
+    buffer.cpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
+    [buffer.cpubuffer setLabel:GetName(buffer_idx)];
+    ASSERT_MSG(VIDEO, buffer.cpubuffer, "Failed to allocate MTLBuffer (out of memory?)");
+    buffer.buffer = [buffer.cpubuffer contents];
+    buffer.usage.Reset(newsize);
+    if (g_features.manual_buffer_upload)  // also allocate the same-sized GPU-private buffer
+    {
+      options = MTLResourceStorageModePrivate | MTLResourceHazardTrackingModeUntracked;
+      buffer.gpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
+      [buffer.gpubuffer setLabel:GetName(buffer_idx)];
+      ASSERT_MSG(VIDEO, buffer.gpubuffer, "Failed to allocate MTLBuffer (out of memory?)");
+    }
+  }
  size_t pos = buffer.usage.Pos();
  return std::make_pair(reinterpret_cast<char*>(buffer.buffer) + pos, pos);
 }
@ -167,17 +218,46 @@ std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_id
 Metal::StateTracker::Map Metal::StateTracker::CommitPreallocation(UploadBuffer buffer_idx,
                                                                  size_t amt)
 {
-  Buffer& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  BufferPair& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
  size_t pos = buffer.usage.Allocate(m_current_draw, amt);
  Map ret = {nil, pos, reinterpret_cast<char*>(buffer.buffer) + pos};
-  ret.gpu_buffer = buffer.mtlbuffer;
+  ret.gpu_buffer = m_manual_buffer_upload ? buffer.gpubuffer : buffer.cpubuffer;
  DEBUG_ASSERT(pos <= buffer.usage.Size() &&
               "Previous code should have guaranteed there was enough space");
  return ret;
 }

+// Queues a blit of everything written to the CPU buffer since the last upload into the GPU
+// buffer.  Does nothing unless manual buffer upload is active and there is new data.
+void Metal::StateTracker::Sync(BufferPair& buffer)
+{
+  if (!m_manual_buffer_upload)
+    return;
+
+  const size_t uploaded_to = buffer.last_upload;
+  const size_t written_to = buffer.usage.Pos();
+  if (written_to == uploaded_to)
+    return;
+
+  [GetUploadEncoder() copyFromBuffer:buffer.cpubuffer
+                        sourceOffset:uploaded_to
+                            toBuffer:buffer.gpubuffer
+                   destinationOffset:uploaded_to
+                                size:written_to - uploaded_to];
+  buffer.last_upload = written_to;
+}
+
 // MARK: Render Pass / Encoder Management

+// Returns the blit encoder used for manual buffer uploads, creating the upload command buffer
+// and its encoder the first time it is requested.
+id<MTLBlitCommandEncoder> Metal::StateTracker::GetUploadEncoder()
+{
+  if (m_upload_cmdbuf)
+    return m_upload_encoder;
+
+  @autoreleasepool
+  {
+    m_upload_cmdbuf = MRCRetain([g_queue commandBuffer]);
+    [m_upload_cmdbuf setLabel:@"Vertex Upload"];
+
+    m_upload_encoder = MRCRetain([m_upload_cmdbuf blitCommandEncoder]);
+    [m_upload_encoder setLabel:@"Vertex Upload"];
+  }
+  return m_upload_encoder;
+}
+
 id<MTLBlitCommandEncoder> Metal::StateTracker::GetTextureUploadEncoder()
 {
  if (!m_texture_upload_cmdbuf)
@ -270,6 +350,8 @@ void Metal::StateTracker::BeginRenderPass(MTLRenderPassDescriptor* descriptor)
      MRCRetain([GetRenderCmdBuf() renderCommandEncoderWithDescriptor:descriptor]);
  if (m_current_perf_query)
    [descriptor setVisibilityResultBuffer:nil];
+  if (m_manual_buffer_upload)
+    [m_current_render_encoder waitForFence:m_fence beforeStages:MTLRenderStageVertex];
  AbstractTexture* attachment = m_current_framebuffer->GetColorAttachment();
  if (!attachment)
    attachment = m_current_framebuffer->GetDepthAttachment();
@ -299,6 +381,8 @@ void Metal::StateTracker::BeginComputePass()
  EndRenderPass();
  m_current_compute_encoder = MRCRetain([GetRenderCmdBuf() computeCommandEncoder]);
  [m_current_compute_encoder setLabel:@"Compute"];
+  if (m_manual_buffer_upload)
+    [m_current_compute_encoder waitForFence:m_fence];
  m_flags.NewEncoder();
  m_dirty_samplers = 0xff;
  m_dirty_textures = 0xff;
@ -326,6 +410,20 @@ void Metal::StateTracker::FlushEncoders()
  if (!m_current_render_cmdbuf)
    return;
  EndRenderPass();
+  for (int i = 0; i <= static_cast<int>(UploadBuffer::Last); ++i)
+    Sync(m_upload_buffers[i]);
+  if (!m_manual_buffer_upload)
+  {
+    ASSERT(!m_upload_cmdbuf && "Should never be used!");
+  }
+  else if (m_upload_cmdbuf)
+  {
+    [m_upload_encoder updateFence:m_fence];
+    [m_upload_encoder endEncoding];
+    [m_upload_cmdbuf commit];
+    m_upload_encoder = nullptr;
+    m_upload_cmdbuf = nullptr;
+  }
  if (m_texture_upload_cmdbuf)
  {
    [m_texture_upload_encoder endEncoding];
@ -355,6 +453,8 @@ void Metal::StateTracker::FlushEncoders()
  m_last_render_cmdbuf = std::move(m_current_render_cmdbuf);
  m_current_render_cmdbuf = nullptr;
  m_current_draw++;
+  if (g_features.manual_buffer_upload && !m_manual_buffer_upload)
+    SetManualBufferUpload(true);
 }

 void Metal::StateTracker::WaitForFlushedEncoders()
@ -368,6 +468,23 @@ void Metal::StateTracker::ReloadSamplers()
    m_state.samplers[i] = g_object_cache->GetSampler(m_state.sampler_states[i]);
 }

+// Turns manual buffer upload on or off.
+// After something forces a CPU-GPU sync (bbox, texture download, etc), the next command buffer
+// runs with manual upload off, because the upload can no longer overlap the previous draw.
+// This greatly improves performance in heavy bbox games like Super Paper Mario.
+void Metal::StateTracker::SetManualBufferUpload(bool enabled)
+{
+  m_manual_buffer_upload = enabled;
+  if (!enabled)
+    return;
+
+  // Sync() does not advance last_upload while manual upload is off, so move every buffer's
+  // upload position to its current write position before uploads resume.
+  for (BufferPair& pair : m_upload_buffers)
+    pair.last_upload = pair.usage.Pos();
+}
+
 // MARK: State Setters

 void Metal::StateTracker::SetPipeline(const Pipeline* pipe)
--- a/Source/Core/VideoBackends/Metal/MTLTexture.mm
+++ b/Source/Core/VideoBackends/Metal/MTLTexture.mm
@ -6,6 +6,7 @@
 #include "Common/Align.h"
 #include "Common/Assert.h"

+#include "VideoBackends/Metal/MTLRenderer.h"
 #include "VideoBackends/Metal/MTLStateTracker.h"

 Metal::Texture::Texture(MRCOwned<id<MTLTexture>> tex, const TextureConfig& config)
@ -50,6 +51,10 @@ void Metal::Texture::ResolveFromTexture(const AbstractTexture* src,
  g_state_tracker->ResolveTexture(src_tex, m_tex, layer, level);
 }

+// Use a temporary buffer for large texture loads
+// (Since the main upload buffer doesn't shrink after it grows)
+static constexpr u32 STAGING_TEXTURE_UPLOAD_THRESHOLD = 1024 * 1024 * 4;  // bytes (4 MiB)
+
 void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length,  //
                          const u8* buffer, size_t buffer_size)
 {
@ -59,8 +64,23 @@ void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length,  //
    const u32 num_rows = Common::AlignUp(height, block_size) / block_size;
    const u32 source_pitch = CalculateStrideForFormat(m_config.format, row_length);
    const u32 upload_size = source_pitch * num_rows;
-    StateTracker::Map map = g_state_tracker->Allocate(StateTracker::UploadBuffer::TextureData,
-                                                      upload_size, StateTracker::AlignMask::Other);
+    MRCOwned<id<MTLBuffer>> tmp_buffer;
+    StateTracker::Map map;
+    if (upload_size > STAGING_TEXTURE_UPLOAD_THRESHOLD)
+    {
+      tmp_buffer = MRCTransfer([g_device
+          newBufferWithLength:upload_size
+                      options:MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined]);
+      [tmp_buffer setLabel:@"Temp Texture Upload"];
+      map.gpu_buffer = tmp_buffer;
+      map.gpu_offset = 0;
+      map.cpu_buffer = [tmp_buffer contents];
+    }
+    else
+    {
+      map = g_state_tracker->AllocateForTextureUpload(upload_size);
+    }
+
    memcpy(map.cpu_buffer, buffer, upload_size);
    id<MTLBlitCommandEncoder> encoder = g_state_tracker->GetTextureUploadEncoder();
    [encoder copyFromBuffer:map.gpu_buffer
@ -163,6 +183,7 @@ void Metal::StagingTexture::Flush()
  {
    // Flush while we wait, since who knows how long we'll be sitting here
    g_state_tracker->FlushEncoders();
+    g_state_tracker->NotifyOfCPUGPUSync();
    [m_wait_buffer waitUntilCompleted];
  }
  m_wait_buffer = nullptr;
--- a/Source/Core/VideoBackends/Metal/MTLUtil.h
+++ b/Source/Core/VideoBackends/Metal/MTLUtil.h
@ -16,6 +16,10 @@ namespace Metal
 {
 struct DeviceFeatures
 {
+  /// Manually copy buffer data to the GPU (instead of letting the GPU read from system memory)
+  /// On discrete GPUs, this tends to be faster if the copy is able to operate in parallel with a
+  /// previous render.  This is the case unless a game uses features like bbox or texture downloads.
+  bool manual_buffer_upload;
  bool subgroup_ops;
 };

--- a/Source/Core/VideoBackends/Metal/MTLUtil.mm
+++ b/Source/Core/VideoBackends/Metal/MTLUtil.mm
@ -217,6 +217,27 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id<MTLDevice>
      config->backend_info.AAModes.push_back(i);
  }

+  switch (config->iManuallyUploadBuffers)
+  {
+  case TriState::Off:
+    g_features.manual_buffer_upload = false;
+    break;
+  case TriState::On:
+    g_features.manual_buffer_upload = true;
+    break;
+  case TriState::Auto:
+#if TARGET_OS_OSX
+    g_features.manual_buffer_upload = false;
+    if (@available(macOS 10.15, *))
+      if (![device hasUnifiedMemory])
+        g_features.manual_buffer_upload = true;
+#else
+    // All iOS devices have unified memory
+    g_features.manual_buffer_upload = false;
+#endif
+    break;
+  }
+
  g_features.subgroup_ops = false;
  if (@available(macOS 10.15, iOS 13, *))
  {
@ -225,7 +246,7 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id<MTLDevice>
        [device supportsFamily:MTLGPUFamilyMac2] || [device supportsFamily:MTLGPUFamilyApple6];
    config->backend_info.bSupportsFramebufferFetch = [device supportsFamily:MTLGPUFamilyApple1];
  }
-  if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID))
+  if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS))
    g_features.subgroup_ops = false;
 #if TARGET_OS_OSX
  if (@available(macOS 11, *))
@ -378,6 +399,12 @@ static const std::string_view MSL_HEADER =
    // These are usually when the compiler doesn't think a switch is exhaustive
    "#pragma clang diagnostic ignored \"-Wreturn-type\"\n";

+static constexpr std::pair<std::string_view, std::string_view> MSL_FIXUPS[] = {
+    // Force-unroll the lighting loop in ubershaders, which greatly reduces register pressure on AMD
+    {"for (uint chan = 0u; chan < 2u; chan++)",
+     "_Pragma(\"unroll\") for (uint chan = 0u; chan < 2u; chan++)"},
+};  // each entry: {text to find in the compiled MSL, text emitted in its place}
+
 static constexpr spirv_cross::MSLResourceBinding
 MakeResourceBinding(spv::ExecutionModel stage, u32 set, u32 binding,  //
                    u32 msl_buffer, u32 msl_texture, u32 msl_sampler)
@ -474,7 +501,27 @@ std::optional<std::string> Metal::Util::TranslateShaderToMSL(ShaderStage stage,
  for (auto& binding : resource_bindings)
    compiler.add_msl_resource_binding(binding);

-  std::string msl(MSL_HEADER);
-  msl += compiler.compile();
-  return msl;
+  std::string output(MSL_HEADER);
+  std::string compiled = compiler.compile();
+  std::string_view remaining = compiled;
+  while (!remaining.empty())
+  {
+    // Apply fixups
+    std::string_view piece = remaining;
+    std::string_view fixup_piece = {};
+    size_t next = piece.size();
+    for (const auto& fixup : MSL_FIXUPS)
+    {
+      size_t found = piece.find(fixup.first);
+      if (found == std::string_view::npos)
+        continue;
+      piece = piece.substr(0, found);
+      fixup_piece = fixup.second;
+      next = found + fixup.first.size();
+    }
+    output += piece;
+    output += fixup_piece;
+    remaining = remaining.substr(next);
+  }
+  return output;
 }
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@ -918,7 +918,7 @@ void VulkanContext::PopulateShaderSubgroupSupport()
  m_supports_shader_subgroup_operations =
      (subgroup_properties.supportedOperations & required_operations) == required_operations &&
      subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT &&
-      !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID);
+      !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS);
 }

 bool VulkanContext::SupportsExclusiveFullscreen(const WindowSystemInfo& wsi, VkSurfaceKHR surface)
--- a/Source/Core/VideoCommon/DriverDetails.cpp
+++ b/Source/Core/VideoCommon/DriverDetails.cpp
@ -132,10 +132,14 @@ constexpr BugInfo m_known_bugs[] = {
     -1.0, -1.0, true},
    {API_VULKAN, OS_ALL, VENDOR_ARM, DRIVER_ARM, Family::UNKNOWN, BUG_BROKEN_VECTOR_BITWISE_AND,
     -1.0, -1.0, true},
-    {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN,
-     BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
-    {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN,
-     BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
+    {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
+     -1.0, -1.0, true},
+    {API_VULKAN, OS_OSX, VENDOR_INTEL, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
+     -1.0, -1.0, true},
+    {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
+     -1.0, true},
+    {API_METAL, OS_OSX, VENDOR_INTEL, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
+     -1.0, true},
    {API_OPENGL, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
     BUG_BROKEN_MULTITHREADED_SHADER_PRECOMPILATION, -1.0, -1.0, true},
    {API_VULKAN, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
--- a/Source/Core/VideoCommon/DriverDetails.h
+++ b/Source/Core/VideoCommon/DriverDetails.h
@ -306,10 +306,15 @@ enum Bug
  BUG_BROKEN_VECTOR_BITWISE_AND,

  // BUG: Accessing gl_SubgroupInvocationID causes the Metal shader compiler to crash.
-  // Affected devices: AMD (macOS)
+  //      Affected devices: AMD (older macOS)
+  // BUG: gl_HelperInvocation always returns true, even for non-helper invocations
+  //      Affected devices: AMD (newer macOS)
+  // BUG: Using subgroupMax in a shader that can discard results in garbage data
+  //      (For some reason, this only happens at 4x+ IR on Metal, but 2x+ IR on MoltenVK)
+  //      Affected devices: Intel (macOS)
  // Started version: -1
  // Ended version: -1
-  BUG_BROKEN_SUBGROUP_INVOCATION_ID,
+  BUG_BROKEN_SUBGROUP_OPS,

  // BUG: Multi-threaded shader pre-compilation sometimes crashes
  // Used primarily in Videoconfig.cpp's GetNumAutoShaderPreCompilerThreads()
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@ -55,6 +55,8 @@ void VideoConfig::Refresh()

  bVSync = Config::Get(Config::GFX_VSYNC);
  iAdapter = Config::Get(Config::GFX_ADAPTER);
+  iManuallyUploadBuffers = Config::Get(Config::GFX_MTL_MANUALLY_UPLOAD_BUFFERS);
+  bUsePresentDrawable = Config::Get(Config::GFX_MTL_USE_PRESENT_DRAWABLE);

  bWidescreenHack = Config::Get(Config::GFX_WIDESCREEN_HACK);
  aspect_mode = Config::Get(Config::GFX_ASPECT_RATIO);
--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@ -45,6 +45,13 @@ enum class ShaderCompilationMode : int
  AsynchronousSkipRendering
 };

+enum class TriState : int  // three-way setting used by Info<TriState> config entries
+{
+  Off,  // force the feature off
+  On,   // force the feature on
+  Auto  // let the backend decide (e.g. based on the device)
+};
+
 // NEVER inherit from this class.
 struct VideoConfig final
 {
@ -149,6 +156,10 @@ struct VideoConfig final
  // D3D only config, mostly to be merged into the above
  int iAdapter = 0;

+  // Metal only config
+  TriState iManuallyUploadBuffers = TriState::Auto;
+  bool bUsePresentDrawable = false;
+
  // Enable API validation layers, currently only supported with Vulkan.
  bool bEnableValidationLayer = false;