From a2f19143da29d63e8b605370714e6200f4ae5c0f Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Sun, 10 Sep 2023 16:26:58 +1000
Subject: [PATCH] MetalDevice: Support multisampling

---
 CMakeModules/AddMetalSources.cmake |  47 +++++++++
 src/core/gpu_hw.cpp                |   9 +-
 src/util/CMakeLists.txt            |   9 ++
 src/util/metal_device.h            |   8 ++
 src/util/metal_device.mm           | 164 +++++++++++++++++++++++++----
 src/util/metal_shaders.metal       |  42 ++++++++
 6 files changed, 254 insertions(+), 25 deletions(-)
 create mode 100644 CMakeModules/AddMetalSources.cmake
 create mode 100644 src/util/metal_shaders.metal

diff --git a/CMakeModules/AddMetalSources.cmake b/CMakeModules/AddMetalSources.cmake
new file mode 100644
index 000000000..570eab1a9
--- /dev/null
+++ b/CMakeModules/AddMetalSources.cmake
@@ -0,0 +1,47 @@
+# Borrowed from PCSX2.
+# add_metal_sources(<target> <sources>): builds the .metal files into <target>. <sources> is ONE list argument, so quote it.
+if(APPLE)
+	function(add_metal_sources target sources)
+		if(CMAKE_GENERATOR MATCHES "Xcode")
+			# With the Xcode generator the shaders can simply be added to the target and Xcode will deal with them properly
+			# This will make sure Xcode supplies code completion, etc (if you use a custom command, it won't)
+			set_target_properties(${target} PROPERTIES
+				XCODE_ATTRIBUTE_MTL_ENABLE_DEBUG_INFO INCLUDE_SOURCE
+			)
+			foreach(shader IN LISTS sources)
+				target_sources(${target} PRIVATE ${shader})
+				set_source_files_properties(${shader} PROPERTIES LANGUAGE METAL)
+			endforeach()
+		else()
+			function(generateMetallib std triple outputName)
+				set(MetalShaderOut)
+				set(metallib ${CMAKE_CURRENT_BINARY_DIR}/${outputName}.metallib)
+				set(flags
+					-ffast-math
+					$<$<NOT:$<CONFIG:Release,MinSizeRel>>:-gline-tables-only>
+					$<$<NOT:$<CONFIG:Release,MinSizeRel>>:-MO>
+				)
+				foreach(shader IN LISTS sources)
+					file(RELATIVE_PATH relativeShader "${CMAKE_SOURCE_DIR}" "${shader}")
+					set(shaderOut ${CMAKE_CURRENT_BINARY_DIR}/${outputName}/${relativeShader}.air)
+					list(APPEND MetalShaderOut ${shaderOut})
+					get_filename_component(shaderDir ${shaderOut} DIRECTORY)
+					add_custom_command(OUTPUT ${shaderOut}
+						COMMAND ${CMAKE_COMMAND} -E make_directory ${shaderDir}
+						COMMAND xcrun metal ${flags} -std=${std} -target ${triple} -o ${shaderOut} -c ${shader}
+						DEPENDS ${shader}
+					)
+				endforeach()
+				add_custom_command(OUTPUT ${metallib}
+					COMMAND xcrun metallib -o ${metallib} ${MetalShaderOut}
+					DEPENDS ${MetalShaderOut}
+				)
+				target_sources(${target} PRIVATE ${metallib})
+				set_source_files_properties(${metallib} PROPERTIES MACOSX_PACKAGE_LOCATION Resources)
+			endfunction()
+			generateMetallib(macos-metal2.0 air64-apple-macos10.13 default)
+			generateMetallib(macos-metal2.2 air64-apple-macos10.15 Metal22)
+			generateMetallib(macos-metal2.3 air64-apple-macos11.0 Metal23)
+		endif()
+	endfunction()
+endif()
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 5f5a3354f..286c4cbbe 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -565,12 +565,17 @@ bool GPU_HW::CreateBuffers()
   const u32 texture_height = VRAM_HEIGHT * m_resolution_scale;
   const u8 samples = static_cast<u8>(m_multisamples);
 
+  // Metal resolves MSAA with a compute kernel that writes the read texture, so it must be writable there.
+  const GPUTexture::Type read_texture_type = (g_gpu_device->GetRenderAPI() == RenderAPI::Metal && m_multisamples > 1) ?
+                                               GPUTexture::Type::RWTexture :
+                                               GPUTexture::Type::Texture;
+
   if (!(m_vram_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, samples,
                                                      GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)) ||
       !(m_vram_depth_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, samples,
                                                            GPUTexture::Type::DepthStencil, VRAM_DS_FORMAT)) ||
-      !(m_vram_read_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, 1,
-                                                          GPUTexture::Type::Texture, VRAM_RT_FORMAT)) ||
+      !(m_vram_read_texture =
+          g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, 1, read_texture_type, VRAM_RT_FORMAT)) ||
       !(m_display_private_texture = g_gpu_device->CreateTexture(
           ((m_downsample_mode == GPUDownsampleMode::Adaptive) ? VRAM_WIDTH : GPU_MAX_DISPLAY_WIDTH) *
             m_resolution_scale,
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index cf61315f7..e81632829 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -264,6 +264,8 @@ if(WIN32)
     target_link_libraries(util PRIVATE WinPixEventRuntime::WinPixEventRuntime)
   endif()
 elseif(APPLE)
+  include(AddMetalSources)
+
   set(MAC_SOURCES
     cocoa_tools.h
     metal_device.h
@@ -272,6 +274,10 @@ elseif(APPLE)
     metal_stream_buffer.mm
     platform_misc_mac.mm
   )
+  set(METAL_SOURCES
+    "${CMAKE_CURRENT_SOURCE_DIR}/metal_shaders.metal"
+  )
+  set_property(GLOBAL PROPERTY UTIL_METAL_SOURCES ${METAL_SOURCES})
   target_sources(util PRIVATE ${MAC_SOURCES})
   find_library(IOK_LIBRARY IOKit REQUIRED)
   find_library(METAL_LIBRARY Metal)
@@ -286,6 +292,9 @@ endif()
 
 function(add_util_resources target)
   if(APPLE)
+    get_property(UTIL_METAL_SOURCES GLOBAL PROPERTY UTIL_METAL_SOURCES)
+    add_metal_sources(${target} "${UTIL_METAL_SOURCES}")
+
     # Copy MoltenVK into the bundle
     unset(MOLTENVK_PATH CACHE)
     find_file(MOLTENVK_PATH NAMES
diff --git a/src/util/metal_device.h b/src/util/metal_device.h
index 85a73ff42..174cc23fa 100644
--- a/src/util/metal_device.h
+++ b/src/util/metal_device.h
@@ -299,6 +299,10 @@ private:
   ALWAYS_INLINE NSView* GetWindowView() const { return (__bridge NSView*)m_window_info.window_handle; }
 
   void SetFeatures();
+  bool LoadShaders();
+
+  id<MTLFunction> GetFunctionFromLibrary(id<MTLLibrary> library, NSString* name);
+  id<MTLComputePipelineState> CreateComputePipeline(id<MTLFunction> function, NSString* name);
 
   std::unique_ptr<GPUShader> CreateShaderFromMSL(GPUShaderStage stage, const std::string_view& source,
                                                  const std::string_view& entry_point);
@@ -354,6 +358,10 @@ private:
   MetalStreamBuffer m_uniform_buffer;
   MetalStreamBuffer m_texture_upload_buffer;
 
+  id<MTLLibrary> m_shaders = nil;
+  std::vector<std::pair<std::pair<GPUTexture::Format, GPUTexture::Format>, id<MTLComputePipelineState>>>
+    m_resolve_pipelines;
+
   id<MTLCommandBuffer> m_upload_cmdbuf = nil;
   id<MTLBlitCommandEncoder> m_upload_encoder = nil;
   id<MTLBlitCommandEncoder> m_inline_upload_encoder = nil;
diff --git a/src/util/metal_device.mm b/src/util/metal_device.mm
index 9e47784ff..959d37da8 100644
--- 
a/src/util/metal_device.mm
+++ b/src/util/metal_device.mm
@@ -174,6 +174,12 @@ bool MetalDevice::CreateDevice(const std::string_view& adapter, bool threaded_pr
   CreateCommandBuffer();
   RenderBlankFrame();
 
+  if (!LoadShaders())
+  {
+    Log_ErrorPrint("Failed to load shaders.");
+    return false;
+  }
+
   if (!CreateBuffers())
   {
     Log_ErrorPrintf("Failed to create buffers.");
@@ -198,7 +204,7 @@ void MetalDevice::SetFeatures()
   }
 
   m_max_multisamples = 0;
-  for (u32 multisamples = 1; multisamples < 16; multisamples++)
+  for (u32 multisamples = 1; multisamples < 16; multisamples *= 2)
   {
     if (![m_device supportsTextureSampleCount:multisamples])
       break;
@@ -211,11 +217,71 @@ void MetalDevice::SetFeatures()
   m_features.supports_texture_buffers = true;
   m_features.texture_buffers_emulated_with_ssbo = true;
   m_features.geometry_shaders = false;
-  m_features.partial_msaa_resolve = true;
+  m_features.partial_msaa_resolve = false;
   m_features.shader_cache = true;
   m_features.pipeline_cache = false;
 }
 
+bool MetalDevice::LoadShaders()
+{
+  @autoreleasepool
+  {
+    auto try_lib = [this](NSString* name) -> id<MTLLibrary> {
+      NSBundle* bundle = [NSBundle mainBundle];
+      NSString* path = [bundle pathForResource:name ofType:@"metallib"];
+      if (path == nil)
+      {
+        // Xcode places it alongside the binary.
+        path = [NSString stringWithFormat:@"%@/%@.metallib", [bundle bundlePath], name];
+        if (![[NSFileManager defaultManager] fileExistsAtPath:path])
+          return nil;
+      }
+
+      // newLibraryWithFile: returns an owned (+1) library, or nil on failure. It is handed to the
+      // caller as-is: m_shaders is released exactly once in DestroyDevice(), so retaining it
+      // again here would leak the library.
+      id<MTLLibrary> lib = [m_device newLibraryWithFile:path error:nil];
+      return lib;
+    };
+
+    if (!(m_shaders = try_lib(@"Metal23")) && !(m_shaders = try_lib(@"Metal22")) &&
+        !(m_shaders = try_lib(@"Metal21")) && !(m_shaders = try_lib(@"default")))
+    {
+      return false;
+    }
+
+    return true;
+  }
+}
+
+id<MTLFunction> MetalDevice::GetFunctionFromLibrary(id<MTLLibrary> library, NSString* name)
+{
+  // Returns an owned (+1) function, or nil when the library has no function with this name.
+  return [library newFunctionWithName:name];
+}
+
+id<MTLComputePipelineState> MetalDevice::CreateComputePipeline(id<MTLFunction> function, NSString* name)
+{
+  MTLComputePipelineDescriptor* desc = [MTLComputePipelineDescriptor new];
+  if (name != nil)
+    [desc setLabel:name];
+  [desc setComputeFunction:function];
+
+  NSError* err = nil;
+  id<MTLComputePipelineState> pipeline = [m_device newComputePipelineStateWithDescriptor:desc
+                                                                                 options:MTLPipelineOptionNone
+                                                                              reflection:nil
+                                                                                   error:&err];
+  [desc release];
+  if (pipeline == nil)
+  {
+    LogNSError(err, "Create compute pipeline failed:");
+    return nil;
+  }
+
+  return pipeline;
+}
+
 void MetalDevice::DestroyDevice()
 {
   WaitForPreviousCommandBuffers();
@@ -243,6 +309,17 @@ void MetalDevice::DestroyDevice()
     [it.second release];
   m_cleanup_objects.clear();
 
+  for (auto& it : m_resolve_pipelines)
+  {
+    if (it.second != nil)
+      [it.second release];
+  }
+  m_resolve_pipelines.clear();
+  if (m_shaders != nil)
+  {
+    [m_shaders release];
+    m_shaders = nil;
+  }
   if (m_queue != nil)
   {
     [m_queue release];
@@ -736,7 +813,7 @@ std::unique_ptr<GPUPipeline> MetalDevice::CreatePipeline(const GPUPipeline::Grap
 
   // General
   const MTLPrimitiveType primitive = primitives[static_cast<u8>(config.primitive)];
-  desc.rasterSampleCount = config.per_sample_shading ? config.samples : 1;
+  desc.rasterSampleCount = config.samples;
 
   // Metal-specific stuff
   desc.vertexBuffers[0].mutability = MTLMutabilityImmutable;
@@ -959,6 +1036,15 @@ std::unique_ptr<GPUTexture> MetalDevice::CreateTexture(u32 width, u32 height, u3
   desc.depth = levels;
   desc.pixelFormat = pixel_format;
   desc.mipmapLevelCount = levels;
+  if (samples > 1)
+  {
+    desc.textureType = (layers > 1) ? MTLTextureType2DMultisampleArray : MTLTextureType2DMultisample;
+    desc.sampleCount = samples;
+  }
+  else if (layers > 1)
+  {
+    desc.textureType = MTLTextureType2DArray;
+  }
 
   switch (type)
   {
@@ -1339,30 +1425,62 @@ void MetalDevice::CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 d
 void MetalDevice::ResolveTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level,
                                        GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height)
 {
-#if 0
-  DebugAssert(src_level < src->GetLevels() && src_layer < src->GetLayers());
-  DebugAssert((src_x + width) <= src->GetMipWidth(src_level));
-  DebugAssert((src_y + height) <= src->GetMipHeight(src_level));
-  DebugAssert(dst_level < dst->GetLevels() && dst_layer < dst->GetLayers());
-  DebugAssert((dst_x + width) <= dst->GetMipWidth(dst_level));
-  DebugAssert((dst_y + height) <= dst->GetMipHeight(dst_level));
-  DebugAssert(!dst->IsMultisampled() && src->IsMultisampled());
+  DebugAssert((src_x + width) <= src->GetWidth());
+  DebugAssert((src_y + height) <= src->GetHeight());
+  DebugAssert(dst_level < dst->GetLevels() && dst_layer < dst->GetLayers());
+  DebugAssert((dst_x + width) <= dst->GetMipWidth(dst_level));
+  DebugAssert((dst_y + height) <= dst->GetMipHeight(dst_level));
+  DebugAssert(!dst->IsMultisampled() && src->IsMultisampled());
 
-  // DX11 can't resolve partial rects.
-  Assert(src_x == dst_x && src_y == dst_y);
+  // Only does first level for now..
+  DebugAssert(dst_level == 0 && dst_layer == 0);
 
-  MetalTexture* dst11 = static_cast<MetalTexture*>(dst);
-  MetalTexture* src11 = static_cast<MetalTexture*>(src);
+  const GPUTexture::Format src_format = src->GetFormat();
+  const GPUTexture::Format dst_format = dst->GetFormat();
+  id<MTLComputePipelineState> resolve_pipeline = nil;
+  if (auto iter = std::find_if(m_resolve_pipelines.begin(), m_resolve_pipelines.end(),
+                               [src_format, dst_format](const auto& it) {
+                                 return it.first.first == src_format && it.first.second == dst_format;
+                               });
+      iter != m_resolve_pipelines.end())
+  {
+    resolve_pipeline = iter->second;
+  }
+  else
+  {
+    // First resolve for this (source, destination) format pair: build the pipeline and cache it.
+    @autoreleasepool
+    {
+      const bool is_depth = GPUTexture::IsDepthFormat(src_format);
+      id<MTLFunction> function =
+        [GetFunctionFromLibrary(m_shaders, is_depth ? @"depthResolveKernel" : @"colorResolveKernel") autorelease];
+      if (function == nil)
+        Panic("Failed to get resolve kernel");
 
-  src11->CommitClear(m_context.Get());
-  dst11->CommitClear(m_context.Get());
+      // CreateComputePipeline() returns an owned (+1) pipeline; the cache keeps that reference
+      // until DestroyDevice() releases it. A nil result is cached too and hits the Panic below.
+      resolve_pipeline = CreateComputePipeline(function, is_depth ? @"Depth Resolve" : @"Color Resolve");
+      m_resolve_pipelines.emplace_back(std::make_pair(src_format, dst_format), resolve_pipeline);
+    }
+  }
+  if (resolve_pipeline == nil)
+    Panic("Failed to get resolve pipeline");
 
-  m_context->ResolveSubresource(dst11->GetD3DTexture(), MetalCalcSubresource(dst_level, dst_layer, dst->GetLevels()),
-                                src11->GetD3DTexture(), MetalCalcSubresource(src_level, src_layer, src->GetLevels()),
-                                dst11->GetDXGIFormat());
-#else
-  Panic("Fixme");
-#endif
+  if (InRenderPass())
+    EndRenderPass();
+
+  const u32 threadgroupHeight = resolve_pipeline.maxTotalThreadsPerThreadgroup / resolve_pipeline.threadExecutionWidth;
+  const MTLSize intrinsicThreadgroupSize = MTLSizeMake(resolve_pipeline.threadExecutionWidth, threadgroupHeight, 1);
+  const MTLSize threadgroupsInGrid =
+    MTLSizeMake((src->GetWidth() + intrinsicThreadgroupSize.width - 1) / intrinsicThreadgroupSize.width,
+                (src->GetHeight() + intrinsicThreadgroupSize.height - 1) / intrinsicThreadgroupSize.height, 1);
+
+  id<MTLComputeCommandEncoder> computeEncoder = [m_render_cmdbuf computeCommandEncoder];
+  [computeEncoder setComputePipelineState:resolve_pipeline];
+  [computeEncoder setTexture:static_cast<MetalTexture*>(src)->GetMTLTexture() atIndex:0];
+  [computeEncoder setTexture:static_cast<MetalTexture*>(dst)->GetMTLTexture() atIndex:1];
+  [computeEncoder dispatchThreadgroups:threadgroupsInGrid threadsPerThreadgroup:intrinsicThreadgroupSize];
+  [computeEncoder endEncoding];
 }
 
 void MetalDevice::ClearRenderTarget(GPUTexture* t, u32 c)
diff --git a/src/util/metal_shaders.metal b/src/util/metal_shaders.metal
new file mode 100644
index 000000000..917033883
--- /dev/null
+++ b/src/util/metal_shaders.metal
@@ -0,0 +1,42 @@
+/// A custom resolve kernel that averages color at all sample points.
+#include <metal_stdlib>
+using namespace metal;
+
+// https://developer.apple.com/documentation/metal/metal_sample_code_library/improving_edge-rendering_quality_with_multisample_antialiasing_msaa?language=objc
+kernel void
+colorResolveKernel(texture2d_ms<float, access::read> multisampledTexture [[texture(0)]],
+                   texture2d<float, access::write> resolvedTexture [[texture(1)]],
+                   uint2 gid [[thread_position_in_grid]])
+{
+  // The grid is rounded up to whole threadgroups; threads outside either texture do nothing.
+  if (gid.x >= min(multisampledTexture.get_width(), resolvedTexture.get_width()) ||
+      gid.y >= min(multisampledTexture.get_height(), resolvedTexture.get_height()))
+    return;
+
+  const uint count = multisampledTexture.get_num_samples();
+  float4 resolved_color = 0;
+  for (uint i = 0; i < count; ++i)
+    resolved_color += multisampledTexture.read(gid, i);
+
+  resolved_color /= count;
+  resolvedTexture.write(resolved_color, gid);
+}
+
+kernel void
+depthResolveKernel(texture2d_ms<float, access::read> multisampledTexture [[texture(0)]],
+                   texture2d<float, access::write> resolvedTexture [[texture(1)]],
+                   uint2 gid [[thread_position_in_grid]])
+{
+  // Same guard as the color kernel: skip threads the rounded-up dispatch placed out of bounds.
+  if (gid.x >= min(multisampledTexture.get_width(), resolvedTexture.get_width()) ||
+      gid.y >= min(multisampledTexture.get_height(), resolvedTexture.get_height()))
+    return;
+
+  const uint count = multisampledTexture.get_num_samples();
+  float resolved_depth = 0;
+  for (uint i = 0; i < count; ++i)
+    resolved_depth += multisampledTexture.read(gid, i).r;
+
+  resolved_depth /= count;
+  resolvedTexture.write(float4(resolved_depth, 0, 0, 0), gid);
+}