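Cocoa Port: Do a small optimization when doing video output framebuffer fetches for Metal display views: choose threadgroup sizes that divide the framebuffer width and height evenly, so the fetch and convert kernels no longer need a per-thread bounds check.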

This commit is contained in:
rogerman 2018-12-28 15:39:09 -08:00
parent aeea0ea46a
commit 0c0bd5144e
5 changed files with 78 additions and 67 deletions

View File

@ -18,9 +18,12 @@
#ifndef _METAL_RENDERER_COMMON_H_ #ifndef _METAL_RENDERER_COMMON_H_
#define _METAL_RENDERER_COMMON_H_ #define _METAL_RENDERER_COMMON_H_
float4 unpack_unorm1555_to_unorm8888(const ushort color16); float4 unpack_rgba5551_to_unorm8888(const ushort color16);
ushort pack_color_to_unorm5551(const float4 inColor); ushort pack_unorm8888_to_rgba5551(const float4 inColor);
float4 pack_color_to_unorm6665(const float4 inColor); uchar4 pack_unorm8888_to_rgba6665(const float4 inColor);
uchar4 pack_unorm8888_to_rgba8888(const float4 inColor);
float4 convert_unorm666X_to_unorm8888(const float4 inColor);
#endif // _METAL_RENDERER_COMMON_H_ #endif // _METAL_RENDERER_COMMON_H_

View File

@ -21,7 +21,7 @@ using namespace metal;
#include "MetalRendererCommonShaders.h" #include "MetalRendererCommonShaders.h"
float4 unpack_unorm1555_to_unorm8888(const ushort color16) float4 unpack_rgba5551_to_unorm8888(const ushort color16)
{ {
return float4((float)((color16 >> 0) & 0x1F) / 31.0f, return float4((float)((color16 >> 0) & 0x1F) / 31.0f,
(float)((color16 >> 5) & 0x1F) / 31.0f, (float)((color16 >> 5) & 0x1F) / 31.0f,
@ -29,9 +29,9 @@ float4 unpack_unorm1555_to_unorm8888(const ushort color16)
(float)(color16 >> 15)); (float)(color16 >> 15));
} }
ushort pack_color_to_unorm5551(const float4 inColor) ushort pack_unorm8888_to_rgba5551(const float4 inColor)
{ {
ushort4 color16 = (ushort4)((inColor * 31.0f) + 0.35f); ushort4 color16 = ushort4( (inColor * 31.0f) + 0.1f );
color16.g <<= 5; color16.g <<= 5;
color16.b <<= 10; color16.b <<= 10;
@ -40,7 +40,17 @@ ushort pack_color_to_unorm5551(const float4 inColor)
return (color16.r | color16.g | color16.b | color16.a); return (color16.r | color16.g | color16.b | color16.a);
} }
float4 pack_color_to_unorm6665(const float4 inColor) uchar4 pack_unorm8888_to_rgba6665(const float4 inColor)
{ {
return inColor * float4(63.0f/255.0f, 63.0f/255.0f, 63.0f/255.0f, 31.0f/255.0f); return uchar4( (inColor * float4(63.0f, 63.0f, 63.0f, 31.0f)) + 0.1f );
}
uchar4 pack_unorm8888_to_rgba8888(const float4 inColor)
{
return uchar4( (inColor * 255.0f) + 0.1f );
}
float4 convert_unorm666X_to_unorm8888(const float4 inColor)
{
return float4( inColor.rgb * (255.0f/63.0f), 1.0f );
} }

View File

@ -129,8 +129,9 @@ typedef DisplayViewShaderProperties DisplayViewShaderProperties;
id<MTLTexture> texHQ4xLUT; id<MTLTexture> texHQ4xLUT;
id<MTLTexture> texCurrentHQnxLUT; id<MTLTexture> texCurrentHQnxLUT;
MTLSize _fetchThreadsPerGroup; MTLSize _fetchThreadsPerGroupNative;
MTLSize _fetchThreadGroupsPerGridNative; MTLSize _fetchThreadGroupsPerGridNative;
MTLSize _fetchThreadsPerGroupCustom;
MTLSize _fetchThreadGroupsPerGridCustom; MTLSize _fetchThreadGroupsPerGridCustom;
MTLSize deposterizeThreadsPerGroup; MTLSize deposterizeThreadsPerGroup;
MTLSize deposterizeThreadGroupsPerGrid; MTLSize deposterizeThreadGroupsPerGrid;

View File

@ -71,10 +71,10 @@
MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
[computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES]; [computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"]]; [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_rgb555_to_unorm8888"]];
_fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; _fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"]]; [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_unorm666X_to_unorm8888"]];
_fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain]; _fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]]; [computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]];
@ -99,22 +99,27 @@
[computePipelineDesc release]; [computePipelineDesc release];
size_t tw = GetNearestPositivePOT((uint32_t)[_fetch555Pipeline threadExecutionWidth]); NSUInteger tw = [_fetch555Pipeline threadExecutionWidth];
while ( (tw > [_fetch555Pipeline threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) while ( ((GPU_FRAMEBUFFER_NATIVE_WIDTH % tw) != 0) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) )
{ {
tw >>= 1; tw >>= 1;
} }
size_t th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw; NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw;
while ( ((GPU_FRAMEBUFFER_NATIVE_HEIGHT % th) != 0) || (th > GPU_FRAMEBUFFER_NATIVE_HEIGHT) )
{
th >>= 1;
}
_fetchThreadsPerGroup = MTLSizeMake(tw, th, 1); _fetchThreadsPerGroupNative = MTLSizeMake(tw, th, 1);
_fetchThreadGroupsPerGridNative = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, _fetchThreadGroupsPerGridNative = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw,
GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, GPU_FRAMEBUFFER_NATIVE_HEIGHT / th,
1); 1);
_fetchThreadsPerGroupCustom = _fetchThreadsPerGroupNative;
_fetchThreadGroupsPerGridCustom = _fetchThreadGroupsPerGridNative; _fetchThreadGroupsPerGridCustom = _fetchThreadGroupsPerGridNative;
deposterizeThreadsPerGroup = _fetchThreadsPerGroup; deposterizeThreadsPerGroup = _fetchThreadsPerGroupNative;
deposterizeThreadGroupsPerGrid = _fetchThreadGroupsPerGridNative; deposterizeThreadGroupsPerGrid = _fetchThreadGroupsPerGridNative;
MTLRenderPipelineDescriptor *hudPipelineDesc = [[MTLRenderPipelineDescriptor alloc] init]; MTLRenderPipelineDescriptor *hudPipelineDesc = [[MTLRenderPipelineDescriptor alloc] init];
@ -413,9 +418,22 @@
_fetchPixelBytes = dispInfo.pixelBytes; _fetchPixelBytes = dispInfo.pixelBytes;
const size_t tw = _fetchThreadsPerGroup.width; NSUInteger tw = [_fetch555Pipeline threadExecutionWidth];
const size_t th = _fetchThreadsPerGroup.height; while ( ((w % tw) != 0) || (tw > w) )
_fetchThreadGroupsPerGridCustom = MTLSizeMake((w + tw - 1) / tw, (h + th - 1) / th, 1); {
tw >>= 1;
}
NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw;
while ( ((h % th) != 0) || (th > h) )
{
th >>= 1;
}
_fetchThreadsPerGroupCustom = MTLSizeMake(tw, th, 1);
_fetchThreadGroupsPerGridCustom = MTLSizeMake(w / tw,
h / th,
1);
id<MTLCommandBuffer> cb = [_fetchCommandQueue commandBufferWithUnretainedReferences]; id<MTLCommandBuffer> cb = [_fetchCommandQueue commandBufferWithUnretainedReferences];
MetalTexturePair newTexPair = [self setFetchTextureBindingsAtIndex:dispInfo.bufferIndex commandBuffer:cb]; MetalTexturePair newTexPair = [self setFetchTextureBindingsAtIndex:dispInfo.bufferIndex commandBuffer:cb];
@ -498,7 +516,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index];
} }
@ -507,7 +525,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index];
} }
@ -528,7 +546,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index]; targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index];
} }
@ -537,7 +555,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index];
} }
@ -572,7 +590,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index]; targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index];
} }
@ -581,7 +599,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0]; [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index]; targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index];
} }
@ -594,7 +612,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative [cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index]; targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index];
} }
@ -603,7 +621,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0]; [cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1]; [cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom [cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup]; threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index]; targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index];
} }
@ -1023,17 +1041,21 @@
_texDisplayPixelScaler[NDSDisplayID_Main] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; _texDisplayPixelScaler[NDSDisplayID_Main] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc];
_texDisplayPixelScaler[NDSDisplayID_Touch] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc]; _texDisplayPixelScaler[NDSDisplayID_Touch] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc];
size_t tw = GetNearestPositivePOT((uint32_t)[[self pixelScalePipeline] threadExecutionWidth]); NSUInteger tw = [[self pixelScalePipeline] threadExecutionWidth];
while ( (tw > [[self pixelScalePipeline] threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) ) while ( ((newScalerWidth % tw) != 0) || (tw > newScalerWidth) )
{ {
tw >>= 1; tw >>= 1;
} }
const size_t th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw; NSUInteger th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw;
while ( ((newScalerHeight % th) != 0) || (th > newScalerHeight) )
{
th >>= 1;
}
_pixelScalerThreadsPerGroup = MTLSizeMake(tw, th, 1); _pixelScalerThreadsPerGroup = MTLSizeMake(tw, th, 1);
_pixelScalerThreadGroupsPerGrid = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw, _pixelScalerThreadGroupsPerGrid = MTLSizeMake(newScalerWidth / tw,
GPU_FRAMEBUFFER_NATIVE_HEIGHT / th, newScalerHeight / th,
1); 1);
} }
else else

View File

@ -432,12 +432,7 @@ kernel void nds_fetch555(const uint2 position [[thread_position_in_grid]],
{ {
const uint h = inTexture.get_height(); const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) ) const float4 inColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r );
{
return;
}
const float4 inColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r );
float3 outColor = inColor.rgb; float3 outColor = inColor.rgb;
const uint line = uint( (float)position.y / ((float)h / 192.0f) ); const uint line = uint( (float)position.y / ((float)h / 192.0f) );
@ -454,11 +449,6 @@ kernel void nds_fetch666(const uint2 position [[thread_position_in_grid]],
{ {
const uint h = inTexture.get_height(); const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) )
{
return;
}
const float4 inColor = inTexture.read(position); const float4 inColor = inTexture.read(position);
float3 outColor = inColor.rgb * float3(255.0f/63.0f); float3 outColor = inColor.rgb * float3(255.0f/63.0f);
@ -476,11 +466,6 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]],
{ {
const uint h = inTexture.get_height(); const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) )
{
return;
}
const float4 inColor = inTexture.read(position); const float4 inColor = inTexture.read(position);
float3 outColor = inColor.rgb; float3 outColor = inColor.rgb;
@ -490,30 +475,20 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]],
outTexture.write(float4(outColor, 1.0f), position); outTexture.write(float4(outColor, 1.0f), position);
} }
kernel void nds_fetch555ConvertOnly(const uint2 position [[thread_position_in_grid]], kernel void convert_texture_rgb555_to_unorm8888(const uint2 position [[thread_position_in_grid]],
const texture2d<ushort, access::read> inTexture [[texture(0)]], const texture2d<ushort, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]]) texture2d<float, access::write> outTexture [[texture(1)]])
{ {
if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) const float4 outColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r );
{
return;
}
const float4 outColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r );
outTexture.write(float4(outColor.rgb, 1.0f), position); outTexture.write(float4(outColor.rgb, 1.0f), position);
} }
kernel void nds_fetch666ConvertOnly(const uint2 position [[thread_position_in_grid]], kernel void convert_texture_unorm666X_to_unorm8888(const uint2 position [[thread_position_in_grid]],
const texture2d<float, access::read> inTexture [[texture(0)]], const texture2d<float, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]]) texture2d<float, access::write> outTexture [[texture(1)]])
{ {
if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) ) const float4 outColor = convert_unorm666X_to_unorm8888( inTexture.read(position) );
{ outTexture.write(outColor, position);
return;
}
const float3 outColor = inTexture.read(position).rgb * float3(255.0f/63.0f);
outTexture.write(float4(outColor, 1.0f), position);
} }
float3 nds_apply_master_brightness(const float3 inColor, const uchar mode, const float intensity) float3 nds_apply_master_brightness(const float3 inColor, const uchar mode, const float intensity)