Cocoa Port: Do a small optimization when doing video output framebuffer fetches for Metal display views.

This commit is contained in:
rogerman 2018-12-28 15:39:09 -08:00
parent aeea0ea46a
commit 0c0bd5144e
5 changed files with 78 additions and 67 deletions

View File

@ -18,9 +18,12 @@
#ifndef _METAL_RENDERER_COMMON_H_
#define _METAL_RENDERER_COMMON_H_
float4 unpack_unorm1555_to_unorm8888(const ushort color16);
float4 unpack_rgba5551_to_unorm8888(const ushort color16);
ushort pack_color_to_unorm5551(const float4 inColor);
float4 pack_color_to_unorm6665(const float4 inColor);
ushort pack_unorm8888_to_rgba5551(const float4 inColor);
uchar4 pack_unorm8888_to_rgba6665(const float4 inColor);
uchar4 pack_unorm8888_to_rgba8888(const float4 inColor);
float4 convert_unorm666X_to_unorm8888(const float4 inColor);
#endif // _METAL_RENDERER_COMMON_H_

View File

@ -21,7 +21,7 @@ using namespace metal;
#include "MetalRendererCommonShaders.h"
float4 unpack_unorm1555_to_unorm8888(const ushort color16)
float4 unpack_rgba5551_to_unorm8888(const ushort color16)
{
return float4((float)((color16 >> 0) & 0x1F) / 31.0f,
(float)((color16 >> 5) & 0x1F) / 31.0f,
@ -29,9 +29,9 @@ float4 unpack_unorm1555_to_unorm8888(const ushort color16)
(float)(color16 >> 15));
}
ushort pack_color_to_unorm5551(const float4 inColor)
ushort pack_unorm8888_to_rgba5551(const float4 inColor)
{
ushort4 color16 = (ushort4)((inColor * 31.0f) + 0.35f);
ushort4 color16 = ushort4( (inColor * 31.0f) + 0.1f );
color16.g <<= 5;
color16.b <<= 10;
@ -40,7 +40,17 @@ ushort pack_color_to_unorm5551(const float4 inColor)
return (color16.r | color16.g | color16.b | color16.a);
}
// Rescales a float4 color so that its first three channels cover a 6-bit range (0..63)
// and its fourth channel covers a 5-bit range (0..31).
//
// NOTE(review): this span is diff residue with the +/- markers stripped, so two signature
// lines and two return statements appear together. Presumably the float4-returning
// pack_color_to_unorm6665 lines are the superseded ones -- confirm against the real file.
float4 pack_color_to_unorm6665(const float4 inColor)
uchar4 pack_unorm8888_to_rgba6665(const float4 inColor)
{
// Float form: multiplies by 63/255 (31/255 for the fourth channel); the result stays a float4.
return inColor * float4(63.0f/255.0f, 63.0f/255.0f, 63.0f/255.0f, 31.0f/255.0f);
// Integer form: multiplies by 63 (31 for the fourth channel), adds a 0.1 bias to every
// channel, then converts the result to uchar4.
return uchar4( (inColor * float4(63.0f, 63.0f, 63.0f, 31.0f)) + 0.1f );
}
// Converts a float4 color to four 8-bit channel values.
//
// Each channel is multiplied by 255, nudged upward by a 0.1 bias, and then converted
// to uchar as a single uchar4 vector conversion.
//
// inColor: the float4 color to pack.
// Returns: the packed channels as a uchar4, in the same channel order as inColor.
uchar4 pack_unorm8888_to_rgba8888(const float4 inColor)
{
	const float4 biasedChannels = (inColor * 255.0f) + 0.1f;
	return uchar4(biasedChannels);
}
// Expands a color whose RGB channels top out at 63/255 so that they top out at 1.0.
//
// The RGB channels are multiplied by 255/63. The fourth channel of the input is
// ignored, and the returned alpha is always 1.0.
//
// inColor: the float4 color to rescale; only .rgb is read.
// Returns: float4(rescaled RGB, 1.0).
float4 convert_unorm666X_to_unorm8888(const float4 inColor)
{
	const float3 rescaledRGB = inColor.rgb * (255.0f/63.0f);
	return float4(rescaledRGB, 1.0f);
}

View File

@ -129,8 +129,9 @@ typedef DisplayViewShaderProperties DisplayViewShaderProperties;
id<MTLTexture> texHQ4xLUT;
id<MTLTexture> texCurrentHQnxLUT;
MTLSize _fetchThreadsPerGroup;
MTLSize _fetchThreadsPerGroupNative;
MTLSize _fetchThreadGroupsPerGridNative;
MTLSize _fetchThreadsPerGroupCustom;
MTLSize _fetchThreadGroupsPerGridCustom;
MTLSize deposterizeThreadsPerGroup;
MTLSize deposterizeThreadGroupsPerGrid;

View File

@ -71,10 +71,10 @@
MTLComputePipelineDescriptor *computePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
[computePipelineDesc setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch555ConvertOnly"]];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_rgb555_to_unorm8888"]];
_fetch555ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"nds_fetch666ConvertOnly"]];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"convert_texture_unorm666X_to_unorm8888"]];
_fetch666ConvertOnlyPipeline = [[device newComputePipelineStateWithDescriptor:computePipelineDesc options:MTLPipelineOptionNone reflection:nil error:nil] retain];
[computePipelineDesc setComputeFunction:[defaultLibrary newFunctionWithName:@"src_filter_deposterize"]];
@ -99,22 +99,27 @@
[computePipelineDesc release];
size_t tw = GetNearestPositivePOT((uint32_t)[_fetch555Pipeline threadExecutionWidth]);
while ( (tw > [_fetch555Pipeline threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) )
NSUInteger tw = [_fetch555Pipeline threadExecutionWidth];
while ( ((GPU_FRAMEBUFFER_NATIVE_WIDTH % tw) != 0) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) )
{
tw >>= 1;
}
size_t th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw;
NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw;
while ( ((GPU_FRAMEBUFFER_NATIVE_HEIGHT % th) != 0) || (th > GPU_FRAMEBUFFER_NATIVE_HEIGHT) )
{
th >>= 1;
}
_fetchThreadsPerGroup = MTLSizeMake(tw, th, 1);
_fetchThreadsPerGroupNative = MTLSizeMake(tw, th, 1);
_fetchThreadGroupsPerGridNative = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw,
GPU_FRAMEBUFFER_NATIVE_HEIGHT / th,
1);
_fetchThreadsPerGroupCustom = _fetchThreadsPerGroupNative;
_fetchThreadGroupsPerGridCustom = _fetchThreadGroupsPerGridNative;
deposterizeThreadsPerGroup = _fetchThreadsPerGroup;
deposterizeThreadsPerGroup = _fetchThreadsPerGroupNative;
deposterizeThreadGroupsPerGrid = _fetchThreadGroupsPerGridNative;
MTLRenderPipelineDescriptor *hudPipelineDesc = [[MTLRenderPipelineDescriptor alloc] init];
@ -413,9 +418,22 @@
_fetchPixelBytes = dispInfo.pixelBytes;
const size_t tw = _fetchThreadsPerGroup.width;
const size_t th = _fetchThreadsPerGroup.height;
_fetchThreadGroupsPerGridCustom = MTLSizeMake((w + tw - 1) / tw, (h + th - 1) / th, 1);
NSUInteger tw = [_fetch555Pipeline threadExecutionWidth];
while ( ((w % tw) != 0) || (tw > w) )
{
tw >>= 1;
}
NSUInteger th = [_fetch555Pipeline maxTotalThreadsPerThreadgroup] / tw;
while ( ((h % th) != 0) || (th > h) )
{
th >>= 1;
}
_fetchThreadsPerGroupCustom = MTLSizeMake(tw, th, 1);
_fetchThreadGroupsPerGridCustom = MTLSizeMake(w / tw,
h / th,
1);
id<MTLCommandBuffer> cb = [_fetchCommandQueue commandBufferWithUnretainedReferences];
MetalTexturePair newTexPair = [self setFetchTextureBindingsAtIndex:dispInfo.bufferIndex commandBuffer:cb];
@ -498,7 +516,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index];
}
@ -507,7 +525,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index];
}
@ -528,7 +546,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index];
}
@ -537,7 +555,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index];
}
@ -572,7 +590,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.main = _texDisplayPostprocessNative[NDSDisplayID_Main][index];
}
@ -581,7 +599,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Main][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Main][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.main = _texDisplayPostprocessCustom[NDSDisplayID_Main][index];
}
@ -594,7 +612,7 @@
[cce setTexture:_texDisplayFetchNative[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessNative[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridNative
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupNative];
targetTexPair.touch = _texDisplayPostprocessNative[NDSDisplayID_Touch][index];
}
@ -603,7 +621,7 @@
[cce setTexture:_texDisplayFetchCustom[NDSDisplayID_Touch][index] atIndex:0];
[cce setTexture:_texDisplayPostprocessCustom[NDSDisplayID_Touch][index] atIndex:1];
[cce dispatchThreadgroups:_fetchThreadGroupsPerGridCustom
threadsPerThreadgroup:_fetchThreadsPerGroup];
threadsPerThreadgroup:_fetchThreadsPerGroupCustom];
targetTexPair.touch = _texDisplayPostprocessCustom[NDSDisplayID_Touch][index];
}
@ -1023,17 +1041,21 @@
_texDisplayPixelScaler[NDSDisplayID_Main] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc];
_texDisplayPixelScaler[NDSDisplayID_Touch] = [[sharedData device] newTextureWithDescriptor:texDisplayPixelScaleDesc];
size_t tw = GetNearestPositivePOT((uint32_t)[[self pixelScalePipeline] threadExecutionWidth]);
while ( (tw > [[self pixelScalePipeline] threadExecutionWidth]) || (tw > GPU_FRAMEBUFFER_NATIVE_WIDTH) )
NSUInteger tw = [[self pixelScalePipeline] threadExecutionWidth];
while ( ((newScalerWidth % tw) != 0) || (tw > newScalerWidth) )
{
tw >>= 1;
}
const size_t th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw;
NSUInteger th = [[self pixelScalePipeline] maxTotalThreadsPerThreadgroup] / tw;
while ( ((newScalerHeight % th) != 0) || (th > newScalerHeight) )
{
th >>= 1;
}
_pixelScalerThreadsPerGroup = MTLSizeMake(tw, th, 1);
_pixelScalerThreadGroupsPerGrid = MTLSizeMake(GPU_FRAMEBUFFER_NATIVE_WIDTH / tw,
GPU_FRAMEBUFFER_NATIVE_HEIGHT / th,
_pixelScalerThreadGroupsPerGrid = MTLSizeMake(newScalerWidth / tw,
newScalerHeight / th,
1);
}
else

View File

@ -432,12 +432,7 @@ kernel void nds_fetch555(const uint2 position [[thread_position_in_grid]],
{
const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) )
{
return;
}
const float4 inColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r );
const float4 inColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r );
float3 outColor = inColor.rgb;
const uint line = uint( (float)position.y / ((float)h / 192.0f) );
@ -454,11 +449,6 @@ kernel void nds_fetch666(const uint2 position [[thread_position_in_grid]],
{
const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) )
{
return;
}
const float4 inColor = inTexture.read(position);
float3 outColor = inColor.rgb * float3(255.0f/63.0f);
@ -476,11 +466,6 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]],
{
const uint h = inTexture.get_height();
if ( (position.x > inTexture.get_width() - 1) || (position.y > h - 1) )
{
return;
}
const float4 inColor = inTexture.read(position);
float3 outColor = inColor.rgb;
@ -490,30 +475,20 @@ kernel void nds_fetch888(const uint2 position [[thread_position_in_grid]],
outTexture.write(float4(outColor, 1.0f), position);
}
// Compute kernel that converts a 16-bit packed-color texture to a float color texture.
// Each thread reads the texel at its grid position from inTexture (ushort, texture slot 0),
// unpacks that value into a float4 color, and writes the RGB result with alpha set to 1.0
// to outTexture (float, texture slot 1) at the same position.
//
// NOTE(review): this span is diff residue with the +/- markers stripped, so two signature
// lines and two unpack lines appear together. Presumably the nds_fetch555ConvertOnly /
// unpack_unorm1555_to_unorm8888 lines and the bounds check are the superseded version --
// confirm against the real file.
kernel void nds_fetch555ConvertOnly(const uint2 position [[thread_position_in_grid]],
kernel void convert_texture_rgb555_to_unorm8888(const uint2 position [[thread_position_in_grid]],
const texture2d<ushort, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]])
{
// Bounds guard: a thread positioned past the input texture's last column or row does nothing.
if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) )
{
return;
}
// Only the .r component of the ushort texel is passed to the unpack helper.
const float4 outColor = unpack_unorm1555_to_unorm8888( (ushort)inTexture.read(position).r );
const float4 outColor = unpack_rgba5551_to_unorm8888( (ushort)inTexture.read(position).r );
// The unpacked alpha is discarded; the written texel always has alpha 1.0.
outTexture.write(float4(outColor.rgb, 1.0f), position);
}
// Compute kernel that rescales a float color texture whose RGB channels are in 6-bit range.
// Each thread reads the texel at its grid position from inTexture (float, texture slot 0),
// multiplies RGB by 255/63, and writes the result with alpha set to 1.0 to outTexture
// (float, texture slot 1) at the same position.
//
// NOTE(review): this span is diff residue with the +/- markers stripped, so two signature
// lines and two read/write pairs appear together. Presumably the nds_fetch666ConvertOnly
// lines, the bounds check, and the inline float3 math are the superseded version --
// confirm against the real file.
kernel void nds_fetch666ConvertOnly(const uint2 position [[thread_position_in_grid]],
kernel void convert_texture_unorm666X_to_unorm8888(const uint2 position [[thread_position_in_grid]],
const texture2d<float, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]])
{
// Bounds guard: a thread positioned past the input texture's last column or row does nothing.
if ( (position.x > inTexture.get_width() - 1) || (position.y > inTexture.get_height() - 1) )
{
return;
}
// Inline form: scale RGB by 255/63 and write it with alpha 1.0.
const float3 outColor = inTexture.read(position).rgb * float3(255.0f/63.0f);
outTexture.write(float4(outColor, 1.0f), position);
// Helper form: convert_unorm666X_to_unorm8888 returns the rescaled color as a float4,
// which is written as-is.
const float4 outColor = convert_unorm666X_to_unorm8888( inTexture.read(position) );
outTexture.write(outColor, position);
}
float3 nds_apply_master_brightness(const float3 inColor, const uchar mode, const float intensity)