From 0c0bd5144ec7811f22f006329fc2261fd54c4ea5 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 28 Dec 2018 15:39:09 -0800 Subject: [PATCH] Cocoa Port: Do a small optimization when doing video output framebuffer fetches for Metal display views. --- .../cocoa/MetalRendererCommonShaders.h | 9 ++- .../cocoa/MetalRendererCommonShaders.metal | 20 ++++-- .../cocoa/userinterface/MacMetalDisplayView.h | 3 +- .../userinterface/MacMetalDisplayView.mm | 68 ++++++++++++------- .../MacMetalDisplayViewShaders.metal | 45 +++--------- 5 files changed, 78 insertions(+), 67 deletions(-) diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h index c1cd9ff57..bc50b81a9 100644 --- a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.h @@ -18,9 +18,12 @@ #ifndef _METAL_RENDERER_COMMON_H_ #define _METAL_RENDERER_COMMON_H_ -float4 unpack_unorm1555_to_unorm8888(const ushort color16); +float4 unpack_rgba5551_to_unorm8888(const ushort color16); -ushort pack_color_to_unorm5551(const float4 inColor); -float4 pack_color_to_unorm6665(const float4 inColor); +ushort pack_unorm8888_to_rgba5551(const float4 inColor); +uchar4 pack_unorm8888_to_rgba6665(const float4 inColor); +uchar4 pack_unorm8888_to_rgba8888(const float4 inColor); + +float4 convert_unorm666X_to_unorm8888(const float4 inColor); #endif // _METAL_RENDERER_COMMON_H_ diff --git a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal index 622639308..c6f8f83be 100644 --- a/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal +++ b/desmume/src/frontend/cocoa/MetalRendererCommonShaders.metal @@ -21,7 +21,7 @@ using namespace metal; #include "MetalRendererCommonShaders.h" -float4 unpack_unorm1555_to_unorm8888(const ushort color16) +float4 unpack_rgba5551_to_unorm8888(const ushort color16) { return float4((float)((color16 >> 0) & 0x1F) / 31.0f, (float)((color16 >> 5) & 0x1F) / 31.0f, @@ -29,9 +29,9 @@ float4 unpack_unorm1555_to_unorm8888(const ushort color16) (float)(color16 >> 15)); } -ushort pack_color_to_unorm5551(const float4 inColor) +ushort pack_unorm8888_to_rgba5551(const float4 inColor) { - ushort4 color16 = (ushort4)((inColor * 31.0f) + 0.35f); + ushort4 color16 = ushort4( (inColor * 31.0f) + 0.1f ); color16.g <<= 5; color16.b <<= 10; @@ -40,7 +40,17 @@ ushort pack_color_to_unorm5551(const float4 inColor) return (color16.r | color16.g | color16.b | color16.a); } -float4 pack_color_to_unorm6665(const float4 inColor) +uchar4 pack_unorm8888_to_rgba6665(const float4 inColor) { - return inColor * float4(63.0f/255.0f, 63.0f/255.0f, 63.0f/255.0f, 31.0f/255.0f); + return uchar4( (inColor * float4(63.0f, 63.0f, 63.0f, 31.0f)) + 0.1f ); +} + +uchar4 pack_unorm8888_to_rgba8888(const float4 inColor) +{ + return uchar4( (inColor * 255.0f) + 0.1f ); +} + +float4 convert_unorm666X_to_unorm8888(const float4 inColor) +{ + return float4( inColor.rgb * (255.0f/63.0f), 1.0f ); } diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h index 5599145b1..69de5ff8d 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.h @@ -129,8 +129,9 @@ typedef DisplayViewShaderProperties DisplayViewShaderProperties; id texHQ4xLUT; id texCurrentHQnxLUT; - MTLSize _fetchThreadsPerGroup; + MTLSize _fetchThreadsPerGroupNative; MTLSize _fetchThreadGroupsPerGridNative; + MTLSize _fetchThreadsPerGroupCustom; MTLSize _fetchThreadGroupsPerGridCustom; MTLSize deposterizeThreadsPerGroup; MTLSize deposterizeThreadGroupsPerGrid; diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm index 5ecdf777f..8d173348a 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayView.mm @@ -71,10 +71,10 @@ MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; [computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES]; - [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"]]; + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_rgb555_to_unorm8888"]]; _fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; - [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"]]; + [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_unorm666X_to_unorm8888"]]; _fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]]; @@ -99,22 +99,27 @@ [computePipelineDesc release]; - size_t tw = GetNearestPositivePOT((uint32_t)[_fetch555Pipeline threadExecutionWidth]); - while ( (tw > [_fetch555Pipeline threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) + NSUInteger tw = [_fetch555Pipeline threadExecutionWidth]; + while ( ((GPU_FRAMEBUFFER_NATIVE_WIDTH % tw) != 0) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) { tw >>= 1; } - size_t th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + while ( ((GPU_FRAMEBUFFER_NATIVE_HEIGHT % th) != 0) || (th > GPU_FRAMEBUFFER_NATIVE_HEIGHT) ) + { + th >>= 1; + } - _fetchThreadsPerGroup = MTLSizeMake(tw, th, 1); + _fetchThreadsPerGroupNative = MTLSizeMake(tw, th, 1); _fetchThreadGroupsPerGridNative = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, 1); + _fetchThreadsPerGroupCustom = _fetchThreadsPerGroupNative; _fetchThreadGroupsPerGridCustom = _fetchThreadGroupsPerGridNative; - deposterizeThreadsPerGroup = _fetchThreadsPerGroup; + deposterizeThreadsPerGroup = _fetchThreadsPerGroupNative; deposterizeThreadGroupsPerGrid = _fetchThreadGroupsPerGridNative; MTLRenderPipelineDescriptor *hudPipelineDesc = [[MTLRenderPipelineDescriptor alloc] init]; @@ -413,9 +418,22 @@ _fetchPixelBytes = dispInfo.pixelBytes; - const size_t tw = _fetchThreadsPerGroup.width; - const size_t th = _fetchThreadsPerGroup.height; - _fetchThreadGroupsPerGridCustom = MTLSizeMake((w + tw - 1) / tw, (h + th - 1) / th, 1); + NSUInteger tw = [_fetch555Pipeline threadExecutionWidth]; + while ( ((w % tw) != 0) || (tw > w) ) + { + tw >>= 1; + } + + NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; + while ( ((h % th) != 0) || (th > h) ) + { + th >>= 1; + } + + _fetchThreadsPerGroupCustom = MTLSizeMake(tw, th, 1); + _fetchThreadGroupsPerGridCustom = MTLSizeMake(w / tw, + h / th, + 1); id cb = [_fetchCommandQueue commandBufferWithUnretainedReferences]; MetalTexturePair newTexPair = [self setFetchTextureBindingsAtIndex:dispInfo.bufferIndex commandBuffer:cb]; @@ -498,7 +516,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; } @@ -507,7 +525,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; } @@ -528,7 +546,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index]; } @@ -537,7 +555,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; } @@ -572,7 +590,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; } @@ -581,7 +599,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; } @@ -594,7 +612,7 @@ [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupNative]; targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index]; } @@ -603,7 +621,7 @@ [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom - threadsPerThreadgroup:_fetchThreadsPerGroup]; + threadsPerThreadgroup:_fetchThreadsPerGroupCustom]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; } @@ -1023,17 +1041,21 @@ _texDisplayPixelScaler[NDSDisplayID_Main] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; _texDisplayPixelScaler[NDSDisplayID_Touch] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; - size_t tw = GetNearestPositivePOT((uint32_t)[[self pixelScalePipeline] threadExecutionWidth]); - while ( (tw > [[self pixelScalePipeline] threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) + NSUInteger tw = [[self pixelScalePipeline] threadExecutionWidth]; + while ( ((newScalerWidth % tw) != 0) || (tw > newScalerWidth) ) { tw >>= 1; } - const size_t th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw; + NSUInteger th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw; + while ( ((newScalerHeight % th) != 0) || (th > newScalerHeight) ) + { + th >>= 1; + } _pixelScalerThreadsPerGroup = MTLSizeMake(tw, th, 1); - _pixelScalerThreadGroupsPerGrid = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, - GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, + _pixelScalerThreadGroupsPerGrid = MTLSizeMake(newScalerWidth / tw, + newScalerHeight / th, 1); } else diff --git a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal index 783554cac..5cf9fda06 100644 --- a/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal +++ b/desmume/src/frontend/cocoa/userinterface/MacMetalDisplayViewShaders.metal @@ -432,12 +432,7 @@ kernel void nds_fetch555(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - - const float4 inColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r ); + const float4 inColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r ); float3 outColor = inColor.rgb; const uint line = uint( (float)position.y / ((float)h / 192.0f) ); @@ -454,11 +449,6 @@ kernel void nds_fetch666(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - const float4 inColor = inTexture.read(position); float3 outColor = inColor.rgb * float3(255.0f/63.0f); @@ -476,11 +466,6 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]], { const uint h = inTexture.get_height(); - if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) - { - return; - } - const float4 inColor = inTexture.read(position); float3 outColor = inColor.rgb; @@ -490,30 +475,20 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]], outTexture.write(float4(outColor, 1.0f), position); } -kernel void nds_fetch555ConvertOnly(const uint2 position [[thread_position_in_grid]], - const texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]]) +kernel void convert_texture_rgb555_to_unorm8888(const uint2 position [[thread_position_in_grid]], + const texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]]) { - if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) - { - return; - } - - const float4 outColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r ); + const float4 outColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r ); outTexture.write(float4(outColor.rgb, 1.0f), position); } -kernel void nds_fetch666ConvertOnly(const uint2 position [[thread_position_in_grid]], - const texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]]) +kernel void convert_texture_unorm666X_to_unorm8888(const uint2 position [[thread_position_in_grid]], + const texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]]) { - if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) - { - return; - } - - const float3 outColor = inTexture.read(position).rgb * float3(255.0f/63.0f); - outTexture.write(float4(outColor, 1.0f), position); + const float4 outColor = convert_unorm666X_to_unorm8888( inTexture.read(position) ); + outTexture.write(outColor, position); } float3 nds_apply_master_brightness(const float3 inColor, const uchar mode, const float intensity)