From c08de82e90b8d120b44e68f92cd58712f6360a37 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Sat, 23 Jul 2022 17:27:24 -0500 Subject: [PATCH] VideoBackends:Metal: Bring back unified memory config Turns out it was helpful. (Most improvement in ubershaders.) This time with much better auto mode. --- Source/Core/Core/Config/GraphicsSettings.cpp | 3 + Source/Core/Core/Config/GraphicsSettings.h | 3 + .../VideoBackends/Metal/MTLBoundingBox.mm | 1 + .../Core/VideoBackends/Metal/MTLPerfQuery.mm | 1 + .../VideoBackends/Metal/MTLStateTracker.h | 29 +++- .../VideoBackends/Metal/MTLStateTracker.mm | 139 ++++++++++++++++-- Source/Core/VideoBackends/Metal/MTLTexture.mm | 5 +- Source/Core/VideoBackends/Metal/MTLUtil.h | 4 + Source/Core/VideoBackends/Metal/MTLUtil.mm | 21 +++ Source/Core/VideoCommon/VideoConfig.cpp | 1 + Source/Core/VideoCommon/VideoConfig.h | 10 ++ 11 files changed, 201 insertions(+), 16 deletions(-) diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp index e5056678bd..204a7fc82a 100644 --- a/Source/Core/Core/Config/GraphicsSettings.cpp +++ b/Source/Core/Core/Config/GraphicsSettings.cpp @@ -92,6 +92,9 @@ const Info GFX_SHADER_PRECOMPILER_THREADS{ const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE{ {System::GFX, "Settings", "SaveTextureCacheToState"}, true}; +const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS{ + {System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto}; + const Info GFX_SW_DUMP_OBJECTS{{System::GFX, "Settings", "SWDumpObjects"}, false}; const Info GFX_SW_DUMP_TEV_STAGES{{System::GFX, "Settings", "SWDumpTevStages"}, false}; const Info GFX_SW_DUMP_TEV_TEX_FETCHES{{System::GFX, "Settings", "SWDumpTevTexFetches"}, diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h index 398a3dd5a0..b91f985289 100644 --- a/Source/Core/Core/Config/GraphicsSettings.h +++ b/Source/Core/Core/Config/GraphicsSettings.h @@ -11,6 +11,7 @@ enum class AspectMode : int; enum class ShaderCompilationMode : int; enum class StereoMode : int; enum class FreelookControlType : int; +enum class TriState : int; namespace Config { @@ -74,6 +75,8 @@ extern const Info GFX_SHADER_COMPILER_THREADS; extern const Info GFX_SHADER_PRECOMPILER_THREADS; extern const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE; +extern const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS; + extern const Info GFX_SW_DUMP_OBJECTS; extern const Info GFX_SW_DUMP_TEV_STAGES; extern const Info GFX_SW_DUMP_TEV_TEX_FETCHES; diff --git a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm index 4c1a6ebd48..3ff48a22c2 100644 --- a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm +++ b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm @@ -36,6 +36,7 @@ std::vector Metal::BoundingBox::Read(u32 index, u32 length) { g_state_tracker->EndRenderPass(); g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); g_state_tracker->WaitForFlushedEncoders(); return std::vector(m_cpu_buffer_ptr + index, m_cpu_buffer_ptr + index + length); } diff --git a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm index 42139e63bf..cd65b37b58 100644 --- a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm +++ b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm @@ -56,6 +56,7 @@ void Metal::PerfQuery::FlushResults() // There's a possibility that some active performance queries are unflushed g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); std::unique_lock lock(m_results_mtx); while (!IsFlushed()) diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.h b/Source/Core/VideoBackends/Metal/MTLStateTracker.h index 3e7bac832d..9e5257147e 100644 --- a/Source/Core/VideoBackends/Metal/MTLStateTracker.h +++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.h @@ -34,7 +34,6 @@ public: Uniform, Vertex, Index, - TextureData, Texels, Last = Texels }; @@ -75,6 +74,14 @@ public: return m_current_draw != 1 + m_last_finished_draw.load(std::memory_order_acquire); } void ReloadSamplers(); + void NotifyOfCPUGPUSync() + { + if (!g_features.manual_buffer_upload || !m_manual_buffer_upload) + return; + if (m_upload_cmdbuf || m_current_render_cmdbuf) + return; + SetManualBufferUpload(false); + } void SetPipeline(const Pipeline* pipe); void SetPipeline(const ComputePipeline* pipe); @@ -106,6 +113,7 @@ public: { return (amt + static_cast(align)) & ~static_cast(align); } + Map AllocateForTextureUpload(size_t amt); Map Allocate(UploadBuffer buffer_idx, size_t amt, AlignMask align) { Preallocate(buffer_idx, amt); @@ -119,6 +127,7 @@ public: static_cast(align)) == 0); return CommitPreallocation(buffer_idx, Align(amt, align)); } + id GetUploadEncoder(); id GetTextureUploadEncoder(); id GetRenderCmdBuf(); @@ -142,18 +151,28 @@ private: void Reset(size_t new_size); }; - struct Buffer + struct CPUBuffer { UsageTracker usage; MRCOwned> mtlbuffer; void* buffer = nullptr; }; + struct BufferPair + { + UsageTracker usage; + MRCOwned> cpubuffer; + MRCOwned> gpubuffer; + void* buffer = nullptr; + size_t last_upload = 0; + }; + struct Backref; struct PerfQueryTracker; std::shared_ptr m_backref; std::vector> m_perf_query_tracker_cache; + MRCOwned> m_fence; MRCOwned> m_upload_cmdbuf; MRCOwned> m_upload_encoder; MRCOwned> m_texture_upload_cmdbuf; @@ -165,7 +184,8 @@ private: MRCOwned m_render_pass_desc[3]; MRCOwned m_resolve_pass_desc; Framebuffer* m_current_framebuffer; - Buffer m_upload_buffers[static_cast(UploadBuffer::Last) + 1]; + CPUBuffer m_texture_upload_buffer; + BufferPair m_upload_buffers[static_cast(UploadBuffer::Last) + 1]; u64 m_current_draw = 1; std::atomic m_last_finished_draw{0}; @@ -249,9 +269,12 @@ private: } m_state; u32 m_perf_query_tracker_counter = 0; + bool m_manual_buffer_upload = false; + void SetManualBufferUpload(bool enable); std::shared_ptr NewPerfQueryTracker(); void SetSamplerForce(u32 idx, const SamplerState& sampler); + void Sync(BufferPair& buffer); Map CommitPreallocation(UploadBuffer buffer_idx, size_t actual_amt); void CheckViewport(); void CheckScissor(); diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm index 0004c81089..d1664d28a5 100644 --- a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm +++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm @@ -44,12 +44,11 @@ static NSString* GetName(Metal::StateTracker::UploadBuffer buffer) // clang-format off switch (buffer) { - case Metal::StateTracker::UploadBuffer::TextureData: return @"Texture Data"; - case Metal::StateTracker::UploadBuffer::Texels: return @"Texels"; - case Metal::StateTracker::UploadBuffer::Vertex: return @"Vertices"; - case Metal::StateTracker::UploadBuffer::Index: return @"Indices"; - case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms"; - case Metal::StateTracker::UploadBuffer::Other: return @"Generic Upload"; + case Metal::StateTracker::UploadBuffer::Texels: return @"Texels"; + case Metal::StateTracker::UploadBuffer::Vertex: return @"Vertices"; + case Metal::StateTracker::UploadBuffer::Index: return @"Indices"; + case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms"; + case Metal::StateTracker::UploadBuffer::Other: return @"Generic Upload"; } // clang-format on } @@ -104,6 +103,7 @@ void Metal::StateTracker::UsageTracker::Reset(size_t new_size) Metal::StateTracker::StateTracker() : m_backref(std::make_shared(this)) { m_flags.should_apply_label = true; + m_fence = MRCTransfer([g_device newFence]); for (MRCOwned& rpdesc : m_render_pass_desc) { rpdesc = MRCTransfer([MTLRenderPassDescriptor new]); @@ -140,9 +140,10 @@ Metal::StateTracker::~StateTracker() // MARK: BufferPair Ops -std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt) +Metal::StateTracker::Map Metal::StateTracker::AllocateForTextureUpload(size_t amt) { - Buffer& buffer = m_upload_buffers[static_cast(buffer_idx)]; + amt = (amt + 15) & ~15ull; + CPUBuffer& buffer = m_texture_upload_buffer; u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire); bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt); if (__builtin_expect(needs_new, false)) @@ -154,11 +155,61 @@ std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_id MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined; buffer.mtlbuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); - [buffer.mtlbuffer setLabel:GetName(buffer_idx)]; + [buffer.mtlbuffer setLabel:@"Texture Upload Buffer"]; ASSERT_MSG(VIDEO, buffer.mtlbuffer, "Failed to allocate MTLBuffer (out of memory?)"); buffer.buffer = [buffer.mtlbuffer contents]; buffer.usage.Reset(newsize); } + + size_t pos = buffer.usage.Allocate(m_current_draw, amt); + + Map ret = {buffer.mtlbuffer, pos, reinterpret_cast(buffer.buffer) + pos}; + DEBUG_ASSERT(pos <= buffer.usage.Size() && + "Previous code should have guaranteed there was enough space"); + return ret; +} + +std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt) +{ + BufferPair& buffer = m_upload_buffers[static_cast(buffer_idx)]; + u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire); + size_t base_pos = buffer.usage.Pos(); + bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt); + bool needs_upload = needs_new || buffer.usage.Pos() == 0; + if (m_manual_buffer_upload && needs_upload) + { + if (base_pos != buffer.last_upload) + { + id encoder = GetUploadEncoder(); + [encoder copyFromBuffer:buffer.cpubuffer + sourceOffset:buffer.last_upload + toBuffer:buffer.gpubuffer + destinationOffset:buffer.last_upload + size:base_pos - buffer.last_upload]; + } + buffer.last_upload = 0; + } + if (__builtin_expect(needs_new, false)) + { + // Orphan buffer + size_t newsize = std::max(buffer.usage.Size() * 2, 4096); + while (newsize < amt) + newsize *= 2; + MTLResourceOptions options = + MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined; + buffer.cpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); + [buffer.cpubuffer setLabel:GetName(buffer_idx)]; + ASSERT_MSG(VIDEO, buffer.cpubuffer, "Failed to allocate MTLBuffer (out of memory?)"); + buffer.buffer = [buffer.cpubuffer contents]; + buffer.usage.Reset(newsize); + if (g_features.manual_buffer_upload) + { + options = MTLResourceStorageModePrivate | MTLResourceHazardTrackingModeUntracked; + buffer.gpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); + [buffer.gpubuffer setLabel:GetName(buffer_idx)]; + ASSERT_MSG(VIDEO, buffer.gpubuffer, "Failed to allocate MTLBuffer (out of memory?)"); + } + } size_t pos = buffer.usage.Pos(); return std::make_pair(reinterpret_cast(buffer.buffer) + pos, pos); } @@ -166,17 +217,46 @@ std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_id Metal::StateTracker::Map Metal::StateTracker::CommitPreallocation(UploadBuffer buffer_idx, size_t amt) { - Buffer& buffer = m_upload_buffers[static_cast(buffer_idx)]; + BufferPair& buffer = m_upload_buffers[static_cast(buffer_idx)]; size_t pos = buffer.usage.Allocate(m_current_draw, amt); Map ret = {nil, pos, reinterpret_cast(buffer.buffer) + pos}; - ret.gpu_buffer = buffer.mtlbuffer; + ret.gpu_buffer = m_manual_buffer_upload ? buffer.gpubuffer : buffer.cpubuffer; DEBUG_ASSERT(pos <= buffer.usage.Size() && "Previous code should have guaranteed there was enough space"); return ret; } +void Metal::StateTracker::Sync(BufferPair& buffer) +{ + if (!m_manual_buffer_upload || buffer.usage.Pos() == buffer.last_upload) + return; + + id encoder = GetUploadEncoder(); + [encoder copyFromBuffer:buffer.cpubuffer + sourceOffset:buffer.last_upload + toBuffer:buffer.gpubuffer + destinationOffset:buffer.last_upload + size:buffer.usage.Pos() - buffer.last_upload]; + buffer.last_upload = buffer.usage.Pos(); +} + // MARK: Render Pass / Encoder Management +id Metal::StateTracker::GetUploadEncoder() +{ + if (!m_upload_cmdbuf) + { + @autoreleasepool + { + m_upload_cmdbuf = MRCRetain([g_queue commandBuffer]); + [m_upload_cmdbuf setLabel:@"Vertex Upload"]; + m_upload_encoder = MRCRetain([m_upload_cmdbuf blitCommandEncoder]); + [m_upload_encoder setLabel:@"Vertex Upload"]; + } + } + return m_upload_encoder; +} + id Metal::StateTracker::GetTextureUploadEncoder() { if (!m_texture_upload_cmdbuf) @@ -269,6 +349,8 @@ void Metal::StateTracker::BeginRenderPass(MTLRenderPassDescriptor* descriptor) MRCRetain([GetRenderCmdBuf() renderCommandEncoderWithDescriptor:descriptor]); if (m_current_perf_query) [descriptor setVisibilityResultBuffer:nil]; + if (m_manual_buffer_upload) + [m_current_render_encoder waitForFence:m_fence beforeStages:MTLRenderStageVertex]; AbstractTexture* attachment = m_current_framebuffer->GetColorAttachment(); if (!attachment) attachment = m_current_framebuffer->GetDepthAttachment(); @@ -298,6 +380,8 @@ void Metal::StateTracker::BeginComputePass() EndRenderPass(); m_current_compute_encoder = MRCRetain([GetRenderCmdBuf() computeCommandEncoder]); [m_current_compute_encoder setLabel:@"Compute"]; + if (m_manual_buffer_upload) + [m_current_compute_encoder waitForFence:m_fence]; m_flags.NewEncoder(); m_dirty_samplers = 0xff; m_dirty_textures = 0xff; @@ -325,6 +409,20 @@ void Metal::StateTracker::FlushEncoders() if (!m_current_render_cmdbuf) return; EndRenderPass(); + for (int i = 0; i <= static_cast(UploadBuffer::Last); ++i) + Sync(m_upload_buffers[i]); + if (!m_manual_buffer_upload) + { + ASSERT(!m_upload_cmdbuf && "Should never be used!"); + } + else if (m_upload_cmdbuf) + { + [m_upload_encoder updateFence:m_fence]; + [m_upload_encoder endEncoding]; + [m_upload_cmdbuf commit]; + m_upload_encoder = nullptr; + m_upload_cmdbuf = nullptr; + } if (m_texture_upload_cmdbuf) { [m_texture_upload_encoder endEncoding]; @@ -354,6 +452,8 @@ void Metal::StateTracker::FlushEncoders() m_last_render_cmdbuf = std::move(m_current_render_cmdbuf); m_current_render_cmdbuf = nullptr; m_current_draw++; + if (g_features.manual_buffer_upload && !m_manual_buffer_upload) + SetManualBufferUpload(true); } void Metal::StateTracker::WaitForFlushedEncoders() @@ -367,6 +467,23 @@ void Metal::StateTracker::ReloadSamplers() m_state.samplers[i] = g_object_cache->GetSampler(m_state.sampler_states[i]); } +void Metal::StateTracker::SetManualBufferUpload(bool enabled) +{ + // When a game does something that needs CPU-GPU sync (e.g. bbox, texture download, etc), + // the next command buffer will be done with manual buffer upload disabled, + // since overlapping the upload with the previous draw won't be possible (due to sync). + // This greatly improves performance in heavy bbox games like Super Paper Mario. + m_manual_buffer_upload = enabled; + if (enabled) + { + for (BufferPair& buffer : m_upload_buffers) + { + // Update sync positions, since Sync doesn't do it when manual buffer upload is off + buffer.last_upload = buffer.usage.Pos(); + } + } +} + // MARK: State Setters void Metal::StateTracker::SetPipeline(const Pipeline* pipe) diff --git a/Source/Core/VideoBackends/Metal/MTLTexture.mm b/Source/Core/VideoBackends/Metal/MTLTexture.mm index fd0358e10e..7e1ec73f78 100644 --- a/Source/Core/VideoBackends/Metal/MTLTexture.mm +++ b/Source/Core/VideoBackends/Metal/MTLTexture.mm @@ -6,6 +6,7 @@ #include "Common/Align.h" #include "Common/Assert.h" +#include "VideoBackends/Metal/MTLRenderer.h" #include "VideoBackends/Metal/MTLStateTracker.h" Metal::Texture::Texture(MRCOwned> tex, const TextureConfig& config) @@ -59,8 +60,7 @@ void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length, // const u32 num_rows = Common::AlignUp(height, block_size) / block_size; const u32 source_pitch = CalculateStrideForFormat(m_config.format, row_length); const u32 upload_size = source_pitch * num_rows; - StateTracker::Map map = g_state_tracker->Allocate(StateTracker::UploadBuffer::TextureData, - upload_size, StateTracker::AlignMask::Other); + StateTracker::Map map = g_state_tracker->AllocateForTextureUpload(upload_size); memcpy(map.cpu_buffer, buffer, upload_size); id encoder = g_state_tracker->GetTextureUploadEncoder(); [encoder copyFromBuffer:map.gpu_buffer @@ -163,6 +163,7 @@ void Metal::StagingTexture::Flush() { // Flush while we wait, since who knows how long we'll be sitting here g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); [m_wait_buffer waitUntilCompleted]; } m_wait_buffer = nullptr; diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.h b/Source/Core/VideoBackends/Metal/MTLUtil.h index dfedecd7c6..28fb57cb0b 100644 --- a/Source/Core/VideoBackends/Metal/MTLUtil.h +++ b/Source/Core/VideoBackends/Metal/MTLUtil.h @@ -16,6 +16,10 @@ namespace Metal { struct DeviceFeatures { + /// Manually copy buffer data to the GPU (instead of letting the GPU read from system memory) + /// On discrete GPUs, this tends to be faster if the copy is able to operate in parallel with a + /// previous render. This is the case unless a game uses features like bbox or texture downloads. + bool manual_buffer_upload; bool subgroup_ops; }; diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.mm b/Source/Core/VideoBackends/Metal/MTLUtil.mm index 09bdc79f77..1e357a4ae2 100644 --- a/Source/Core/VideoBackends/Metal/MTLUtil.mm +++ b/Source/Core/VideoBackends/Metal/MTLUtil.mm @@ -216,6 +216,27 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id config->backend_info.AAModes.push_back(i); } + switch (config->iManuallyUploadBuffers) + { + case TriState::Off: + g_features.manual_buffer_upload = false; + break; + case TriState::On: + g_features.manual_buffer_upload = true; + break; + case TriState::Auto: +#if TARGET_OS_OSX + g_features.manual_buffer_upload = false; + if (@available(macOS 10.15, *)) + if (![device hasUnifiedMemory]) + g_features.manual_buffer_upload = true; +#else + // All iOS devices have unified memory + g_features.manual_buffer_upload = false; +#endif + break; + } + g_features.subgroup_ops = false; if (@available(macOS 10.15, iOS 13, *)) { diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index 91e0550715..845a59f05c 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -55,6 +55,7 @@ void VideoConfig::Refresh() bVSync = Config::Get(Config::GFX_VSYNC); iAdapter = Config::Get(Config::GFX_ADAPTER); + iManuallyUploadBuffers = Config::Get(Config::GFX_MTL_MANUALLY_UPLOAD_BUFFERS); bWidescreenHack = Config::Get(Config::GFX_WIDESCREEN_HACK); aspect_mode = Config::Get(Config::GFX_ASPECT_RATIO); diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index ff52e96bde..f7477d6d3d 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -45,6 +45,13 @@ enum class ShaderCompilationMode : int AsynchronousSkipRendering }; +enum class TriState : int +{ + Off, + On, + Auto +}; + // NEVER inherit from this class. struct VideoConfig final { @@ -148,6 +155,9 @@ struct VideoConfig final // D3D only config, mostly to be merged into the above int iAdapter = 0; + // Metal only config + TriState iManuallyUploadBuffers = TriState::Auto; + // Enable API validation layers, currently only supported with Vulkan. bool bEnableValidationLayer = false;