Metal Renderer

This commit is contained in:
Stenzek 2023-08-06 01:54:41 +10:00
parent 61fbf7a533
commit 7d97c539f3
15 changed files with 2725 additions and 10 deletions

View File

@ -171,6 +171,18 @@ if(WIN32)
target_link_libraries(core PRIVATE winmm.lib)
endif()
if(APPLE)
target_sources(core PRIVATE
gpu/metal_device.h
gpu/metal_device.mm
gpu/metal_stream_buffer.h
gpu/metal_stream_buffer.mm
)
find_library(METAL_LIBRARY Metal)
find_library(QUARTZCORE_LIBRARY QuartzCore)
target_link_libraries(core PRIVATE ${METAL_LIBRARY} ${QUARTZCORE_LIBRARY})
endif()
if(USE_X11)
target_sources(common PRIVATE
gl/x11_window.cpp

View File

@ -167,6 +167,10 @@ public:
static std::unique_ptr<GPU> CreateHardwareD3D12Renderer();
#endif
#ifdef __APPLE__
static std::unique_ptr<GPU> CreateHardwareMetalRenderer();
#endif
#ifdef WITH_OPENGL
// gpu_hw_opengl.cpp
static std::unique_ptr<GPU> CreateHardwareOpenGLRenderer();

View File

@ -34,6 +34,10 @@ Log_SetChannel(GPUDevice);
#include "d3d12_gpu_device.h"
#endif
#ifdef __APPLE__
extern std::unique_ptr<GPUDevice> WrapNewMetalDevice();
#endif
#ifdef WITH_OPENGL
#include "opengl_device.h"
#endif
@ -202,7 +206,7 @@ RenderAPI GPUDevice::GetPreferredAPI()
#ifdef _WIN32___ // TODO remove me
return RenderAPI::D3D11;
#else
return RenderAPI::OpenGL;
return RenderAPI::Metal;
#endif
}
@ -1548,13 +1552,18 @@ std::unique_ptr<GPUDevice> Host::CreateDisplayForAPI(RenderAPI api)
return std::make_unique<D3D11Device>();
#endif
#ifdef __APPLE__
case RenderAPI::Metal:
return WrapNewMetalDevice();
#endif
default:
#if defined(_WIN32) && defined(_M_ARM64)
return std::make_unique<D3D12GPUDevice>();
#elif defined(_WIN32)
return std::make_unique<D3D11Device>();
#elif defined(__APPLE__)
return WrapNewMetalDevice();
return WrapNewMetalDevice();
#elif defined(WITH_OPENGL)
return std::make_unique<OpenGLDevice>();
#elif defined(WITH_VULKAN)

View File

@ -26,7 +26,8 @@ enum class RenderAPI : u32
D3D12,
Vulkan,
OpenGL,
OpenGLES
OpenGLES,
Metal
};
class GPUFramebuffer

373
src/core/gpu/metal_device.h Normal file
View File

@ -0,0 +1,373 @@
// SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "gpu_device.h"
#include "metal_stream_buffer.h"
#include "postprocessing_chain.h"
#include "common/rectangle.h"
#include "common/timer.h"
#include "common/window_info.h"
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>
#ifndef __OBJC__
#error This file needs to be compiled with Objective C++.
#endif
#if __has_feature(objc_arc)
#error ARC should not be enabled.
#endif
class MetalDevice;
class MetalFramebuffer;
class MetalPipeline;
class MetalTexture;
// GPUSampler implementation backed by a Metal MTLSamplerState.
// Instances are created only by MetalDevice (friend); the wrapped sampler
// state is held for the object's lifetime and released in the destructor
// (defined in metal_device.mm).
class MetalSampler final : public GPUSampler
{
  friend MetalDevice;

public:
  ~MetalSampler() override;

  // Returns the underlying Metal sampler state object.
  ALWAYS_INLINE id<MTLSamplerState> GetSamplerState() const { return m_ss; }

  void SetDebugName(const std::string_view& name) override;

private:
  MetalSampler(id<MTLSamplerState> ss);

  id<MTLSamplerState> m_ss;
};
// GPUShader implementation wrapping a compiled Metal shader: the MTLLibrary
// it was built into plus the specific MTLFunction (entry point) within it.
// Created only by MetalDevice (friend).
class MetalShader final : public GPUShader
{
  friend MetalDevice;

public:
  ~MetalShader() override;

  ALWAYS_INLINE id<MTLLibrary> GetLibrary() const { return m_library; }
  ALWAYS_INLINE id<MTLFunction> GetFunction() const { return m_function; }

  void SetDebugName(const std::string_view& name) override;

private:
  MetalShader(GPUShaderStage stage, id<MTLLibrary> library, id<MTLFunction> function);

  id<MTLLibrary> m_library;
  id<MTLFunction> m_function;
};
// GPUPipeline implementation. Bundles the baked MTLRenderPipelineState with
// the pieces of state Metal does not bake into the PSO and which the device
// must therefore set on the render encoder separately: depth/stencil state,
// cull mode, and primitive type. Created only by MetalDevice (friend).
class MetalPipeline final : public GPUPipeline
{
  friend MetalDevice;

public:
  ~MetalPipeline() override;

  ALWAYS_INLINE id<MTLRenderPipelineState> GetPipelineState() const { return m_pipeline; }
  ALWAYS_INLINE id<MTLDepthStencilState> GetDepthState() const { return m_depth; }
  ALWAYS_INLINE MTLCullMode GetCullMode() const { return m_cull_mode; }
  ALWAYS_INLINE MTLPrimitiveType GetPrimitive() const { return m_primitive; }

  void SetDebugName(const std::string_view& name) override;

private:
  MetalPipeline(id<MTLRenderPipelineState> pipeline, id<MTLDepthStencilState> depth, MTLCullMode cull_mode,
                MTLPrimitiveType primitive);

  id<MTLRenderPipelineState> m_pipeline;
  id<MTLDepthStencilState> m_depth;
  MTLCullMode m_cull_mode;
  MTLPrimitiveType m_primitive;
};
// GPUTexture implementation wrapping an id<MTLTexture>.
// Supports direct updates (Update) and map/unmap-style streaming uploads
// (Map/Unmap); the m_map_* fields record the region requested by Map() so
// Unmap() can copy it back. Implementation lives in metal_device.mm.
class MetalTexture final : public GPUTexture
{
  friend MetalDevice;

public:
  // NOTE(review): base-class destructor is presumably virtual; `override`
  // is missing here (cosmetic) — confirm against GPUTexture.
  ~MetalTexture();

  ALWAYS_INLINE id<MTLTexture> GetMTLTexture() const { return m_texture; }

  // Two-phase construction: allocates the MTLTexture and optionally performs
  // the initial data upload.
  bool Create(id<MTLDevice> device, u32 width, u32 height, u32 layers, u32 levels, u32 samples, Type type,
              Format format, const void* initial_data = nullptr, u32 initial_data_stride = 0);
  void Destroy();

  bool IsValid() const override;
  bool Update(u32 x, u32 y, u32 width, u32 height, const void* data, u32 pitch, u32 layer = 0, u32 level = 0) override;
  bool Map(void** map, u32* map_stride, u32 x, u32 y, u32 width, u32 height, u32 layer = 0, u32 level = 0) override;
  void Unmap() override;

  void SetDebugName(const std::string_view& name) override;

private:
  MetalTexture(id<MTLTexture> texture, u16 width, u16 height, u8 layers, u8 levels, u8 samples, Type type,
               Format format);

  id<MTLTexture> m_texture;

  // Region of the currently outstanding Map() call, consumed by Unmap().
  u16 m_map_x = 0;
  u16 m_map_y = 0;
  u16 m_map_width = 0;
  u16 m_map_height = 0;
  u8 m_map_layer = 0;
  u8 m_map_level = 0;
};
// Disabled placeholder for GPUTextureBuffer support. The body still refers to
// D3D11-era constructs (GetD3DBuffer(), Microsoft::WRL::ComPtr, IMetal*
// interface names that do not exist in Metal) — presumably copied from the
// D3D11 backend as a template and awaiting a real Metal implementation.
// TODO(review): confirm intent and rewrite before enabling.
#if 0
class MetalTextureBuffer final : public GPUTextureBuffer
{
public:
  MetalTextureBuffer(Format format, u32 size_in_elements);
  ~MetalTextureBuffer() override;

  ALWAYS_INLINE IMetalBuffer* GetBuffer() const { return m_buffer.GetD3DBuffer(); }
  ALWAYS_INLINE IMetalShaderResourceView* GetSRV() const { return m_srv.Get(); }
  ALWAYS_INLINE IMetalShaderResourceView* const* GetSRVArray() const { return m_srv.GetAddressOf(); }

  bool CreateBuffer(IMetalDevice* device);

  // Inherited via GPUTextureBuffer
  virtual void* Map(u32 required_elements) override;
  virtual void Unmap(u32 used_elements) override;

private:
  MetalStreamBuffer m_buffer;
  Microsoft::WRL::ComPtr<IMetalShaderResourceView> m_srv;
};
#endif
// GPUFramebuffer implementation. Holds the Metal textures for the render
// target and depth/stencil attachments together with a pre-built render pass
// descriptor for them. Created only by MetalDevice (friend).
class MetalFramebuffer final : public GPUFramebuffer
{
  friend MetalDevice;

public:
  ~MetalFramebuffer() override;

  // Render pass descriptor configured for this framebuffer's attachments.
  MTLRenderPassDescriptor* GetDescriptor() const;

  void SetDebugName(const std::string_view& name) override;

private:
  MetalFramebuffer(GPUTexture* rt, GPUTexture* ds, u32 width, u32 height, id<MTLTexture> rt_tex, id<MTLTexture> ds_tex,
                   MTLRenderPassDescriptor* descriptor);

  // Attachment textures; presumably nil when the corresponding attachment is
  // absent (ctor accepts null rt/ds) — TODO confirm in metal_device.mm.
  id<MTLTexture> m_rt_tex;
  id<MTLTexture> m_ds_tex;
  MTLRenderPassDescriptor* m_descriptor;
};
// GPUDevice implementation on top of Apple Metal.
// Owns the MTLDevice and command queue, the CAMetalLayer used for
// presentation, the streaming vertex/index/uniform/texture-upload buffers,
// and all per-frame command buffer / encoder state.
class MetalDevice final : public GPUDevice
{
public:
  // Global accessor; assumes g_host_display currently holds a MetalDevice.
  ALWAYS_INLINE static MetalDevice& GetInstance() { return *static_cast<MetalDevice*>(g_host_display.get()); }
  ALWAYS_INLINE static id<MTLDevice> GetMTLDevice() { return GetInstance().m_device; }

  // Fence counters used by MetalStreamBuffer to reclaim space:
  // "current" is the counter of the command buffer being recorded,
  // "completed" the highest counter known to have finished on the GPU.
  ALWAYS_INLINE static u64 GetCurrentFenceCounter() { return GetInstance().m_current_fence_counter; }
  ALWAYS_INLINE static u64 GetCompletedFenceCounter() { return GetInstance().m_completed_fence_counter; }

  MetalDevice();
  ~MetalDevice();

  // --- GPUDevice interface: device/surface lifecycle ---
  RenderAPI GetRenderAPI() const override;
  bool HasSurface() const override;
  bool CreateDevice(const WindowInfo& wi, bool vsync) override;
  bool SetupDevice() override;
  bool MakeCurrent() override;
  bool DoneCurrent() override;
  bool ChangeWindow(const WindowInfo& new_wi) override;
  void ResizeWindow(s32 new_window_width, s32 new_window_height) override;
  bool SupportsFullscreen() const override;
  bool IsFullscreen() override;
  bool SetFullscreen(bool fullscreen, u32 width, u32 height, float refresh_rate) override;
  AdapterAndModeList GetAdapterAndModeList() override;
  void DestroySurface() override;

  std::string GetShaderCacheBaseName(const std::string_view& type, bool debug) const override;

  // --- GPUDevice interface: resource creation / transfer ---
  std::unique_ptr<GPUTexture> CreateTexture(u32 width, u32 height, u32 layers, u32 levels, u32 samples,
                                            GPUTexture::Type type, GPUTexture::Format format,
                                            const void* data = nullptr, u32 data_stride = 0,
                                            bool dynamic = false) override;
  std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
  std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;

  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
                       u32 out_data_stride) override;
  bool SupportsTextureFormat(GPUTexture::Format format) const override;
  void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                         u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
  void ResolveTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                            u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;

  std::unique_ptr<GPUFramebuffer> CreateFramebuffer(GPUTexture* rt = nullptr, u32 rt_layer = 0, u32 rt_level = 0,
                                                    GPUTexture* ds = nullptr, u32 ds_layer = 0,
                                                    u32 ds_level = 0) override;

  std::unique_ptr<GPUShader> CreateShaderFromBinary(GPUShaderStage stage, gsl::span<const u8> data) override;
  std::unique_ptr<GPUShader> CreateShaderFromSource(GPUShaderStage stage, const std::string_view& source,
                                                    std::vector<u8>* out_binary = nullptr) override;
  std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::GraphicsConfig& config) override;

  // --- GPUDevice interface: debug annotations ---
  void PushDebugGroup(const char* fmt, ...) override;
  void PopDebugGroup() override;
  void InsertDebugMessage(const char* fmt, ...) override;

  // --- GPUDevice interface: dynamic buffers, state, draws ---
  void MapVertexBuffer(u32 vertex_size, u32 vertex_count, void** map_ptr, u32* map_space,
                       u32* map_base_vertex) override;
  void UnmapVertexBuffer(u32 vertex_size, u32 vertex_count) override;
  void MapIndexBuffer(u32 index_count, DrawIndex** map_ptr, u32* map_space, u32* map_base_index) override;
  void UnmapIndexBuffer(u32 used_index_count) override;
  void PushUniformBuffer(const void* data, u32 data_size) override;
  void* MapUniformBuffer(u32 size) override;
  void UnmapUniformBuffer(u32 size) override;
  void SetFramebuffer(GPUFramebuffer* fb) override;
  void SetPipeline(GPUPipeline* pipeline) override;
  void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override;
  void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override;
  void SetViewport(s32 x, s32 y, s32 width, s32 height) override;
  void SetScissor(s32 x, s32 y, s32 width, s32 height) override;
  void Draw(u32 vertex_count, u32 base_vertex) override;
  void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;

  // --- GPUDevice interface: timing / presentation ---
  bool GetHostRefreshRate(float* refresh_rate) override;
  bool SetGPUTimingEnabled(bool enabled) override;
  float GetAndResetAccumulatedGPUTime() override;
  void SetVSync(bool enabled) override;
  bool BeginPresent(bool skip_present) override;
  void EndPresent() override;

  // Blocks until the command buffer tagged with `counter` has completed.
  void WaitForFenceCounter(u64 counter);

  ALWAYS_INLINE MetalStreamBuffer& GetTextureStreamBuffer() { return m_texture_upload_buffer; }
  id<MTLBlitCommandEncoder> GetTextureUploadEncoder(bool is_inline);

  void SubmitCommandBuffer();
  void SubmitCommandBufferAndRestartRenderPass(const char* reason);

  // Called when a resource is destroyed so stale bindings are dropped.
  void UnbindFramebuffer(MetalFramebuffer* fb);
  void UnbindPipeline(MetalPipeline* pl);
  void UnbindTexture(MetalTexture* tex);

  static AdapterAndModeList StaticGetAdapterAndModeList();

private:
  // Fixed sizes for the streaming ring buffers.
  static constexpr u32 VERTEX_BUFFER_SIZE = 8 * 1024 * 1024;
  static constexpr u32 INDEX_BUFFER_SIZE = 4 * 1024 * 1024;
  static constexpr u32 UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024;
  static constexpr u32 UNIFORM_BUFFER_ALIGNMENT = 256;
  static constexpr u32 TEXTURE_STREAM_BUFFER_SIZE = 32/*16*/ * 1024 * 1024; // TODO reduce after separate allocations

  static constexpr u8 NUM_TIMESTAMP_QUERIES = 3;

  // Depth/stencil states are cached by a packed key derived from DepthState.
  using DepthStateMap = std::unordered_map<u8, id<MTLDepthStencilState>>;

  ALWAYS_INLINE NSView* GetWindowView() const { return (__bridge NSView*)m_window_info.window_handle; }

  void SetFeatures();

  std::unique_ptr<GPUShader> CreateShaderFromMSL(GPUShaderStage stage, const std::string_view& source,
                                                 const std::string_view& entry_point);

  id<MTLDepthStencilState> GetDepthState(const GPUPipeline::DepthState& ds);

  void CreateCommandBuffer();
  void CommandBufferCompleted(u64 fence_counter);

  ALWAYS_INLINE bool InRenderPass() const { return (m_render_encoder != nil); }
  ALWAYS_INLINE bool IsInlineUploading() const { return (m_inline_upload_encoder != nil); }
  void BeginRenderPass();
  void EndRenderPass();
  void EndInlineUploading();
  void EndAnyEncoding();

  void PreDrawCheck();
  void SetInitialEncoderState();
  void SetUniformBufferInRenderEncoder();
  void SetViewportInRenderEncoder();
  void SetScissorInRenderEncoder();

  //bool CheckStagingBufferSize(u32 width, u32 height, DXGI_FORMAT format);
  //void DestroyStagingBuffer();

  bool CreateLayer();
  void DestroyLayer();

  bool CreateBuffers();
  void DestroyBuffers();

  bool CreateTimestampQueries();
  void DestroyTimestampQueries();
  void PopTimestampQuery();
  void KickTimestampQuery();

  id<MTLDevice> m_device;
  id<MTLCommandQueue> m_queue;

  // Presentation state: layer attached to the window view, plus the drawable
  // and render pass descriptor for the frame being presented.
  CAMetalLayer* m_layer = nil;
  id<MTLDrawable> m_layer_drawable = nil;
  MTLRenderPassDescriptor* m_layer_pass_desc = nil;

  // Protects fence counter bookkeeping (completion handlers run on Metal's
  // own threads — TODO confirm against metal_device.mm).
  std::mutex m_fence_mutex;
  u64 m_current_fence_counter = 0;
  std::atomic<u64> m_completed_fence_counter{0};

  DepthStateMap m_depth_states;

  // ComPtr<IMetalTexture2D> m_readback_staging_texture;
  // DXGI_FORMAT m_readback_staging_texture_format = DXGI_FORMAT_UNKNOWN;
  // u32 m_readback_staging_texture_width = 0;
  // u32 m_readback_staging_texture_height = 0;

  MetalStreamBuffer m_vertex_buffer;
  MetalStreamBuffer m_index_buffer;
  MetalStreamBuffer m_uniform_buffer;
  MetalStreamBuffer m_texture_upload_buffer;

  // Upload work can go on a dedicated command buffer/encoder, or "inline" on
  // the render command buffer (see GetTextureUploadEncoder()).
  id<MTLCommandBuffer> m_upload_cmdbuf = nil;
  id<MTLBlitCommandEncoder> m_upload_encoder = nil;
  id<MTLBlitCommandEncoder> m_inline_upload_encoder = nil;

  id<MTLCommandBuffer> m_render_cmdbuf = nil;
  id<MTLRenderCommandEncoder> m_render_encoder = nil;

  // Currently-bound state, mirrored so encoders can be re-created after a
  // render pass restart.
  MetalFramebuffer* m_current_framebuffer = nullptr;
  MetalPipeline* m_current_pipeline = nullptr;
  id<MTLDepthStencilState> m_current_depth_state = nil;
  MTLCullMode m_current_cull_mode = MTLCullModeNone;
  u32 m_current_uniform_buffer_position = 0;
  std::array<id<MTLTexture>, MAX_TEXTURE_SAMPLERS> m_current_textures = {};
  std::array<id<MTLSamplerState>, MAX_TEXTURE_SAMPLERS> m_current_samplers = {};
  Common::Rectangle<s32> m_current_viewport = {};
  Common::Rectangle<s32> m_current_scissor = {};
  bool m_vsync_enabled = false;

  // std::array<std::array<ComPtr<IMetalQuery>, 3>, NUM_TIMESTAMP_QUERIES> m_timestamp_queries = {};
  // u8 m_read_timestamp_query = 0;
  // u8 m_write_timestamp_query = 0;
  // u8 m_waiting_timestamp_queries = 0;
  // bool m_timestamp_query_started = false;
  // float m_accumulated_gpu_time = 0.0f;
};

1955
src/core/gpu/metal_device.mm Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,65 @@
// SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "common/types.h"
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>
#ifndef __OBJC__
#error This file needs to be compiled with Objective C++.
#endif
#if __has_feature(objc_arc)
#error ARC should not be enabled.
#endif
#include <deque>
#include <memory>
// CPU-to-GPU streaming ring buffer backed by a single MTLBuffer in shared
// storage. Space is reclaimed by tracking MetalDevice fence counters against
// buffer offsets: once a fence completes, everything up to the offset recorded
// with it may be reused. Non-copyable and non-movable (the device keeps raw
// references to instances).
class MetalStreamBuffer
{
public:
  MetalStreamBuffer();
  MetalStreamBuffer(MetalStreamBuffer&& move) = delete;
  MetalStreamBuffer(const MetalStreamBuffer&) = delete;
  ~MetalStreamBuffer();

  MetalStreamBuffer& operator=(MetalStreamBuffer&& move) = delete;
  MetalStreamBuffer& operator=(const MetalStreamBuffer&) = delete;

  ALWAYS_INLINE bool IsValid() const { return (m_buffer != nil); }
  ALWAYS_INLINE id<MTLBuffer> GetBuffer() const { return m_buffer; }
  // Persistent CPU mapping of the buffer contents.
  ALWAYS_INLINE u8* GetHostPointer() const { return m_host_pointer; }
  // CPU pointer at the current write offset (valid after ReserveMemory()).
  ALWAYS_INLINE u8* GetCurrentHostPointer() const { return m_host_pointer + m_current_offset; }
  // Total buffer capacity in bytes (not the remaining space).
  ALWAYS_INLINE u32 GetCurrentSize() const { return m_size; }
  // Bytes available at the current offset after the last ReserveMemory().
  ALWAYS_INLINE u32 GetCurrentSpace() const { return m_current_space; }
  ALWAYS_INLINE u32 GetCurrentOffset() const { return m_current_offset; }

  bool Create(id<MTLDevice> device, u32 size);
  void Destroy();

  // Reserves num_bytes of space aligned to `alignment`. Returns false when the
  // caller must submit the in-progress command buffer to free space and retry.
  bool ReserveMemory(u32 num_bytes, u32 alignment);
  // Commits the bytes actually written after a successful ReserveMemory().
  void CommitMemory(u32 final_num_bytes);

private:
  bool AllocateBuffer(u32 size);
  void UpdateCurrentFencePosition();
  void UpdateGPUPosition();

  // Waits for as many fences as needed to allocate num_bytes bytes from the buffer.
  bool WaitForClearSpace(u32 num_bytes);

  u32 m_size = 0;
  u32 m_current_offset = 0;
  u32 m_current_space = 0;
  // Last buffer offset the GPU is known to have consumed up to.
  u32 m_current_gpu_position = 0;

  id<MTLBuffer> m_buffer = nil;
  u8* m_host_pointer = nullptr;

  // List of fences and the corresponding positions in the buffer
  std::deque<std::pair<u64, u32>> m_tracked_fences;
};

View File

@ -0,0 +1,253 @@
// SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "metal_stream_buffer.h"
#include "metal_device.h"
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
Log_SetChannel(MetalDevice);
MetalStreamBuffer::MetalStreamBuffer() = default;

MetalStreamBuffer::~MetalStreamBuffer()
{
  // Release the backing MTLBuffer if Create() succeeded at some point.
  if (IsValid())
    Destroy();
}
// Allocates (or replaces) the backing MTLBuffer and resets all ring-buffer
// bookkeeping. Returns false (leaving any existing buffer intact) if the
// allocation fails.
bool MetalStreamBuffer::Create(id<MTLDevice> device, u32 size)
{
  @autoreleasepool
  {
    // Shared storage gives a persistent CPU mapping; write-combined CPU cache
    // mode since the CPU only writes into this buffer.
    const MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
    id<MTLBuffer> new_buffer = [device newBufferWithLength:size options:options];
    if (new_buffer == nil)
    {
      Log_ErrorPrintf("Failed to create buffer.");
      return false;
    }

    // Only tear down the old buffer once the new one is known good.
    if (IsValid())
      Destroy();

    // Replace with the new buffer
    m_size = size;
    m_current_offset = 0;
    m_current_gpu_position = 0;
    m_tracked_fences.clear();
    // NOTE(review): newBufferWithLength: already returns an owned (+1)
    // reference under MRC, so the extra retain here looks like it yields +2
    // against the single release in Destroy() — verify for a leak.
    m_buffer = [new_buffer retain];
    m_host_pointer = static_cast<u8*>([new_buffer contents]);
    return true;
  }
}
// Releases the backing MTLBuffer and returns the bookkeeping to the empty
// state, after which IsValid() reports false. Safe to call when already empty.
void MetalStreamBuffer::Destroy()
{
  // Clear the CPU-side view and fence history first; they are only meaningful
  // while the buffer exists.
  m_host_pointer = nullptr;
  m_tracked_fences.clear();

  // Reset the ring-buffer cursors.
  m_current_gpu_position = 0;
  m_current_offset = 0;
  m_size = 0;

  // Finally drop our reference to the Metal buffer itself.
  [m_buffer release];
  m_buffer = nil;
}
// Reserves num_bytes of space in the ring buffer, aligned to `alignment`.
// On success, m_current_offset is the aligned allocation start and
// m_current_space the usable bytes there. Returns false only when the space
// is held by the command buffer currently being recorded — the caller must
// submit it and retry.
bool MetalStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
{
  // Worst case: aligning the offset can consume up to `alignment` extra bytes.
  const u32 required_bytes = num_bytes + alignment;

  // Check for sane allocations
  if (required_bytes > m_size)
  {
    Log_ErrorPrintf("Attempting to allocate %u bytes from a %u byte stream buffer", static_cast<u32>(num_bytes),
                    static_cast<u32>(m_size));
    Panic("Stream buffer overflow");
    return false;
  }

  // Advance the GPU read position past any fences that have completed.
  UpdateGPUPosition();

  // Is the GPU behind or up to date with our current offset?
  if (m_current_offset >= m_current_gpu_position)
  {
    const u32 remaining_bytes = m_size - m_current_offset;
    if (required_bytes <= remaining_bytes)
    {
      // Place at the current position, after the GPU position.
      m_current_offset = Common::AlignUp(m_current_offset, alignment);
      m_current_space = m_size - m_current_offset;
      return true;
    }

    // Check for space at the start of the buffer
    // We use < here because we don't want to have the case of m_current_offset ==
    // m_current_gpu_position. That would mean the code above would assume the
    // GPU has caught up to us, which it hasn't.
    if (required_bytes < m_current_gpu_position)
    {
      // Reset offset to zero, since we're allocating behind the gpu now
      m_current_offset = 0;
      m_current_space = m_current_gpu_position - 1;
      return true;
    }
  }

  // Is the GPU ahead of our current offset?
  if (m_current_offset < m_current_gpu_position)
  {
    // We have from m_current_offset..m_current_gpu_position space to use.
    const u32 remaining_bytes = m_current_gpu_position - m_current_offset;
    if (required_bytes < remaining_bytes)
    {
      // Place at the current position, since this is still behind the GPU.
      m_current_offset = Common::AlignUp(m_current_offset, alignment);
      // -1 keeps the offset from ever equaling the GPU position (see above).
      m_current_space = m_current_gpu_position - m_current_offset - 1;
      return true;
    }
  }

  // Can we find a fence to wait on that will give us enough memory?
  if (WaitForClearSpace(required_bytes))
  {
    // WaitForClearSpace() placed the cursor; just apply the alignment here.
    const u32 align_diff = Common::AlignUp(m_current_offset, alignment) - m_current_offset;
    m_current_offset += align_diff;
    m_current_space -= align_diff;
    return true;
  }

  // We tried everything we could, and still couldn't get anything. This means that too much space
  // in the buffer is being used by the command buffer currently being recorded. Therefore, the
  // only option is to execute it, and wait until it's done.
  return false;
}
// Finalizes the allocation made by the last ReserveMemory(): advances the
// write cursor by the bytes actually written and records the new offset
// against the current fence counter so the space can be reclaimed later.
void MetalStreamBuffer::CommitMemory(u32 final_num_bytes)
{
  DebugAssert((m_current_offset + final_num_bytes) <= m_size);
  DebugAssert(final_num_bytes <= m_current_space);
  m_current_offset += final_num_bytes;
  m_current_space -= final_num_bytes;
  UpdateCurrentFencePosition();
}
// Records the current write offset against the device's current fence
// counter. At most one entry per counter is kept: if the last tracked fence
// is still the current one, its offset is updated in place.
void MetalStreamBuffer::UpdateCurrentFencePosition()
{
  // Has the offset changed since the last fence?
  const u64 counter = MetalDevice::GetCurrentFenceCounter();
  if (!m_tracked_fences.empty() && m_tracked_fences.back().first == counter)
  {
    // Still haven't executed a command buffer, so just update the offset.
    m_tracked_fences.back().second = m_current_offset;
    return;
  }

  // New buffer, so update the GPU position while we're at it.
  m_tracked_fences.emplace_back(counter, m_current_offset);
}
// Pops every tracked fence whose counter the GPU has completed, advancing
// m_current_gpu_position to the offset of the newest completed fence. If the
// GPU has consumed everything we wrote, the ring is reset to empty so future
// allocations start from offset zero.
void MetalStreamBuffer::UpdateGPUPosition()
{
  auto start = m_tracked_fences.begin();
  auto end = start;

  const u64 completed_counter = MetalDevice::GetCompletedFenceCounter();
  while (end != m_tracked_fences.end() && completed_counter >= end->first)
  {
    m_current_gpu_position = end->second;
    ++end;
  }

  if (start != end)
  {
    // Drop the completed fences from the tracking list.
    m_tracked_fences.erase(start, end);
    if (m_current_offset == m_current_gpu_position)
    {
      // GPU is all caught up now.
      m_current_offset = 0;
      m_current_gpu_position = 0;
      m_current_space = m_size;
    }
  }
}
// Scans the tracked fences for the earliest one whose completion would free
// at least num_bytes, blocks until it signals, then applies the precomputed
// cursor positions. Returns false when no already-submitted fence can satisfy
// the request (i.e. the current, unsubmitted command buffer holds the space),
// in which case the caller must submit it.
bool MetalStreamBuffer::WaitForClearSpace(u32 num_bytes)
{
  // Cursor values to apply once the chosen fence has been waited on.
  u32 new_offset = 0;
  u32 new_space = 0;
  u32 new_gpu_position = 0;

  auto iter = m_tracked_fences.begin();
  for (; iter != m_tracked_fences.end(); ++iter)
  {
    // Would this fence bring us in line with the GPU?
    // This is the "last resort" case, where a command buffer execution has been forced
    // after no additional data has been written to it, so we can assume that after the
    // fence has been signaled the entire buffer is now consumed.
    u32 gpu_position = iter->second;
    if (m_current_offset == gpu_position)
    {
      new_offset = 0;
      new_space = m_size;
      new_gpu_position = 0;
      break;
    }

    // Assuming that we wait for this fence, are we allocating in front of the GPU?
    if (m_current_offset > gpu_position)
    {
      // This would suggest the GPU has now followed us and wrapped around, so we have from
      // m_current_position..m_size free, as well as and 0..gpu_position.
      const u32 remaining_space_after_offset = m_size - m_current_offset;
      if (remaining_space_after_offset >= num_bytes)
      {
        // Switch to allocating in front of the GPU, using the remainder of the buffer.
        new_offset = m_current_offset;
        new_space = m_size - m_current_offset;
        new_gpu_position = gpu_position;
        break;
      }

      // We can wrap around to the start, behind the GPU, if there is enough space.
      // We use > here because otherwise we'd end up lining up with the GPU, and then the
      // allocator would assume that the GPU has consumed what we just wrote.
      if (gpu_position > num_bytes)
      {
        new_offset = 0;
        new_space = gpu_position - 1;
        new_gpu_position = gpu_position;
        break;
      }
    }
    else
    {
      // We're currently allocating behind the GPU. This would give us between the current
      // offset and the GPU position worth of space to work with. Again, > because we can't
      // align the GPU position with the buffer offset.
      u32 available_space_inbetween = gpu_position - m_current_offset;
      if (available_space_inbetween > num_bytes)
      {
        // Leave the offset as-is, but update the GPU position.
        new_offset = m_current_offset;
        new_space = available_space_inbetween - 1;
        new_gpu_position = gpu_position;
        break;
      }
    }
  }

  // Did any fences satisfy this condition?
  // Has the command buffer been executed yet? If not, the caller should execute it.
  if (iter == m_tracked_fences.end() || iter->first == MetalDevice::GetCurrentFenceCounter())
    return false;

  // Wait until this fence is signaled. This will fire the callback, updating the GPU position.
  MetalDevice::GetInstance().WaitForFenceCounter(iter->first);
  // Drop every fence up to the one waited on — inclusive unless we landed
  // exactly on its offset (the "caught up" case, which clears the whole list).
  m_tracked_fences.erase(
    m_tracked_fences.begin(), m_current_offset == iter->second ? m_tracked_fences.end() : ++iter);
  m_current_offset = new_offset;
  m_current_space = new_space;
  m_current_gpu_position = new_gpu_position;
  return true;
}

View File

@ -2654,6 +2654,24 @@ std::unique_ptr<GPU> GPU::CreateHardwareD3D11Renderer()
#endif
#ifdef __APPLE__
std::unique_ptr<GPU> GPU::CreateHardwareMetalRenderer()
{
if (!Host::AcquireHostDisplay(RenderAPI::Metal))
{
Log_ErrorPrintf("Host render API is incompatible");
return nullptr;
}
std::unique_ptr<GPU_HW> gpu(std::make_unique<GPU_HW>());
if (!gpu->Initialize())
return nullptr;
return gpu;
}
#endif
std::unique_ptr<GPU> GPU::CreateHardwareOpenGLRenderer()
{

View File

@ -1162,6 +1162,8 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_ssbo)
ss << "layout(std430";
if (IsVulkan())
ss << ", set = 0, binding = 0";
else if (IsMetal())
ss << ", set = 0, binding = 1";
else if (m_use_glsl_binding_layout)
ss << ", binding = 0";

View File

@ -883,6 +883,9 @@ static constexpr auto s_gpu_renderer_names = make_array(
#ifdef _WIN32
"D3D11", "D3D12",
#endif
#ifdef __APPLE__
"Metal",
#endif
#ifdef WITH_VULKAN
"Vulkan",
#endif
@ -894,6 +897,9 @@ static constexpr auto s_gpu_renderer_display_names = make_array(
#ifdef _WIN32
TRANSLATABLE("GPURenderer", "Hardware (D3D11)"), TRANSLATABLE("GPURenderer", "Hardware (D3D12)"),
#endif
#ifdef __APPLE__
TRANSLATABLE("GPURenderer", "Hardware (Metal)"),
#endif
#ifdef WITH_VULKAN
TRANSLATABLE("GPURenderer", "Hardware (Vulkan)"),
#endif

View File

@ -15,6 +15,7 @@ Log_SetChannel(ShaderGen);
ShaderGen::ShaderGen(RenderAPI render_api, bool supports_dual_source_blend)
: m_render_api(render_api), m_glsl(render_api != RenderAPI::D3D11 && render_api != RenderAPI::D3D12),
m_spirv(render_api == RenderAPI::Vulkan || render_api == RenderAPI::Metal),
m_supports_dual_source_blend(supports_dual_source_blend), m_use_glsl_interface_blocks(false)
{
#if defined(WITH_OPENGL) || defined(WITH_VULKAN)
@ -24,8 +25,8 @@ ShaderGen::ShaderGen(RenderAPI render_api, bool supports_dual_source_blend)
if (m_render_api == RenderAPI::OpenGL || m_render_api == RenderAPI::OpenGLES)
SetGLSLVersionString();
m_use_glsl_interface_blocks = (IsVulkan() || GLAD_GL_ES_VERSION_3_2 || GLAD_GL_VERSION_3_2);
m_use_glsl_binding_layout = (IsVulkan() || UseGLSLBindingLayout());
m_use_glsl_interface_blocks = (IsVulkan() || IsMetal() || GLAD_GL_ES_VERSION_3_2 || GLAD_GL_VERSION_3_2);
m_use_glsl_binding_layout = (IsVulkan() || IsMetal() || UseGLSLBindingLayout());
if (m_render_api == RenderAPI::OpenGL)
{
@ -109,7 +110,7 @@ void ShaderGen::WriteHeader(std::stringstream& ss)
{
if (m_render_api == RenderAPI::OpenGL || m_render_api == RenderAPI::OpenGLES)
ss << m_glsl_version_string << "\n\n";
else if (m_render_api == RenderAPI::Vulkan)
else if (m_spirv)
ss << "#version 450 core\n\n";
#ifdef WITH_OPENGL
@ -157,6 +158,7 @@ void ShaderGen::WriteHeader(std::stringstream& ss)
DefineMacro(ss, "API_D3D11", m_render_api == RenderAPI::D3D11);
DefineMacro(ss, "API_D3D12", m_render_api == RenderAPI::D3D12);
DefineMacro(ss, "API_VULKAN", m_render_api == RenderAPI::Vulkan);
DefineMacro(ss, "API_METAL", m_render_api == RenderAPI::Metal);
#ifdef WITH_OPENGL
if (m_render_api == RenderAPI::OpenGLES)
@ -275,6 +277,10 @@ void ShaderGen::WriteUniformBufferDeclaration(std::stringstream& ss, bool push_c
else
ss << "layout(std140, set = 0, binding = 0) uniform UBOBlock\n";
}
else if (IsMetal())
{
ss << "layout(std140, set = 0, binding = 0) uniform UBOBlock\n";
}
else if (m_glsl)
{
if (m_use_glsl_binding_layout)
@ -343,7 +349,7 @@ const char* ShaderGen::GetInterpolationQualifier(bool interface_block, bool cent
#else
const bool shading_language_420pack = false;
#endif
if (m_glsl && interface_block && (!IsVulkan() && !shading_language_420pack))
if (m_glsl && interface_block && (!m_spirv && !shading_language_420pack))
{
return (sample_interpolation ? (is_out ? "sample out " : "sample in ") :
(centroid_interpolation ? (is_out ? "centroid out " : "centroid in ") : ""));
@ -381,7 +387,7 @@ void ShaderGen::DeclareVertexEntryPoint(
{
const char* qualifier = GetInterpolationQualifier(true, msaa, ssaa, true);
if (IsVulkan())
if (m_spirv)
ss << "layout(location = 0) ";
ss << "out VertexData" << output_block_suffix << " {\n";
@ -418,7 +424,7 @@ void ShaderGen::DeclareVertexEntryPoint(
ss << "#define v_pos gl_Position\n\n";
if (declare_vertex_id)
{
if (IsVulkan())
if (m_spirv)
ss << "#define v_id uint(gl_VertexIndex)\n";
else
ss << "#define v_id uint(gl_VertexID)\n";
@ -475,7 +481,7 @@ void ShaderGen::DeclareFragmentEntryPoint(
{
const char* qualifier = GetInterpolationQualifier(true, msaa, ssaa, false);
if (IsVulkan())
if (m_spirv)
ss << "layout(location = 0) ";
ss << "in VertexData {\n";

View File

@ -28,6 +28,7 @@ public:
protected:
ALWAYS_INLINE bool IsVulkan() const { return (m_render_api == RenderAPI::Vulkan); }
ALWAYS_INLINE bool IsMetal() const { return (m_render_api == RenderAPI::Metal); }
const char* GetInterpolationQualifier(bool interface_block, bool centroid_interpolation, bool sample_interpolation,
bool is_out) const;
@ -56,6 +57,7 @@ protected:
RenderAPI m_render_api;
bool m_glsl;
bool m_spirv;
bool m_supports_dual_source_blend;
bool m_use_glsl_interface_blocks;
bool m_use_glsl_binding_layout;

View File

@ -1644,6 +1644,12 @@ bool System::CreateGPU(GPURenderer renderer)
break;
#endif
#ifdef __APPLE__
case GPURenderer::HardwareMetal:
g_gpu = GPU::CreateHardwareMetalRenderer();
break;
#endif
case GPURenderer::Software:
default:
g_gpu = GPU::CreateSoftwareRenderer();

View File

@ -62,6 +62,9 @@ enum class GPURenderer : u8
HardwareD3D11,
HardwareD3D12,
#endif
#ifdef __APPLE__
HardwareMetal,
#endif
#ifdef WITH_VULKAN
HardwareVulkan,
#endif