GPU: Add NEON-accelerated functions for 2D layer compositing. (For 64-bit ARM CPUs only.)

- This improves GPU performance by up to 20% on the Raspberry Pi 5, and up to 50% on Apple Silicon CPUs.
This commit is contained in:
rogerman 2025-03-16 16:23:21 -07:00
parent e2379a66d6
commit c73a7ffe53
6 changed files with 3041 additions and 2 deletions

View File

@ -2,7 +2,7 @@
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 Theo Berkau
Copyright (C) 2007 shash
Copyright (C) 2008-2024 DeSmuME team
Copyright (C) 2008-2025 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -56,6 +56,9 @@
#elif defined(ENABLE_SSE2)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#elif defined(ENABLE_NEON_A64)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#endif
#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2021-2023 DeSmuME team
Copyright (C) 2021-2025 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -758,6 +758,8 @@ static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead,
#include "GPU_Operations_AVX2.cpp"
#elif defined(ENABLE_SSE2)
#include "GPU_Operations_SSE2.cpp"
#elif defined(ENABLE_NEON_A64)
#include "GPU_Operations_NEON.cpp"
#else
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,122 @@
/*
Copyright (C) 2025 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GPU_OPERATIONS_NEON_H
#define GPU_OPERATIONS_NEON_H
#include "GPU_Operations.h"
#ifndef ENABLE_NEON_A64
#warning This header requires ARM64 NEON support.
#else
// Color-math primitives for the ARM64 NEON code path (this class is only
// compiled when ENABLE_NEON_A64 is defined).
//
// Four operations are declared: blend, blend3D, increase and decrease. Each
// has two overloads: a non-template one that returns v128u16 and a template
// one, parameterized on NDSColorFormat, that takes and returns v128u32.
//
// Only declarations appear in this header.
// NOTE(review): the definitions are presumably in GPU_Operations_NEON.cpp -- confirm.
class ColorOperation_NEON
{
public:
// Stateless: the class declares no data members and the constructor body is empty.
ColorOperation_NEON() {};
// Combines colA and colB using the factors blendEVA and blendEVB.
// NOTE(review): presumably colA is weighted by blendEVA and colB by blendEVB -- confirm against the definition.
FORCEINLINE v128u16 blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
// v128u32 overload of blend(); the blend factors are still passed as v128u16.
// NOTE(review): the exact effect of USECONSTANTBLENDVALUESHINT is not visible here -- check the definition.
template<NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE v128u32 blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
// Blend variant that takes no explicit blend factors. Here colA arrives as
// two v128u32 halves (colA_Lo, colA_Hi) while colB and the result are v128u16.
FORCEINLINE v128u16 blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const;
// v128u32 overload of blend3D(); both inputs and the result are v128u32.
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 blend3D(const v128u32 &colA, const v128u32 &colB) const;
// Applies the factor blendEVY to col.
// NOTE(review): presumably a brightness increase -- confirm against the definition.
FORCEINLINE v128u16 increase(const v128u16 &col, const v128u16 &blendEVY) const;
// v128u32 overload of increase(); the factor is still passed as v128u16.
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 increase(const v128u32 &col, const v128u16 &blendEVY) const;
// Applies the factor blendEVY to col.
// NOTE(review): presumably a brightness decrease -- confirm against the definition.
FORCEINLINE v128u16 decrease(const v128u16 &col, const v128u16 &blendEVY) const;
// v128u32 overload of decrease(); the factor is still passed as v128u16.
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 decrease(const v128u32 &col, const v128u16 &blendEVY) const;
};
// Per-pixel compositing operations for the ARM64 NEON code path (this class
// is only compiled when ENABLE_NEON_A64 is defined).
//
// The public entry points are Composite16() and Composite32(). The protected
// helpers come in matching "16" and "32" flavors: the "16" forms take the
// source colors as two v128u16 values (src1, src0) and the "32" forms take
// them as four v128u32 values (src3..src0). Helpers with "Mask" in the name
// take an additional passMask8 argument.
//
// Every method returns void and receives a non-const GPUEngineCompositorInfo
// reference.
// NOTE(review): results are presumably written through compInfo -- confirm against the definitions.
//
// Only declarations appear in this header.
// NOTE(review): the definitions are presumably in GPU_Operations_NEON.cpp -- confirm.
class PixelOperation_NEON
{
protected:
// Copy helpers. The ISDEBUGRENDER template flag exists only on the copy family.
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// Copy helpers that additionally take passMask8.
// NOTE(review): passMask8 presumably selects which pixels are written -- confirm.
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// Brightness-up helpers; evy16 carries the brightness factor as v128u16.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// Brightness-up helpers that additionally take passMask8.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// Brightness-down helpers; same parameter shape as the brightness-up family.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// Brightness-down helpers that additionally take passMask8.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
// "Unknown effect" helper, 16-bit source form. Besides the mask, factor,
// layer ID and source colors, it takes per-pixel vectors for the source
// effect enable mask, the destination blend enable LUT, the color-effect
// enable mask, and the sprite alpha and sprite mode values.
// NOTE(review): presumably chooses the color effect per pixel when it is not known up front -- confirm against the definition.
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
// 32-bit source form of _unknownEffectMask16(): identical parameters except
// that the source colors are four v128u32 values (src3..src0).
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
public:
// Stateless: the class declares no data members and the constructor body is empty.
PixelOperation_NEON() {};
// Public compositing entry point for 16-bit source colors (src1, src0).
// The compositor mode, output format, layer type and window-test flag are
// all template parameters. Unlike the protected _unknownEffectMask16()
// helper, the color-effect enable, sprite alpha and sprite mode inputs are
// passed here as __restrict byte pointers rather than as vectors.
// NOTE(review): presumably dispatches to the protected helpers based on COMPOSITORMODE and didAllPixelsPass -- confirm against the definition.
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
// Public compositing entry point for 32-bit source colors (src3..src0).
// The parameters otherwise match Composite16().
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
};
#endif // ENABLE_NEON_A64
#endif // GPU_OPERATIONS_NEON_H

View File

@ -4416,6 +4416,8 @@
ABD2CE4426E05CB000FB15F7 /* DeSmuME (x86_64h).app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "DeSmuME (x86_64h).app"; sourceTree = BUILT_PRODUCTS_DIR; };
ABD42045172319D1006A9B46 /* FileMigrationDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FileMigrationDelegate.h; sourceTree = "<group>"; };
ABD42046172319D1006A9B46 /* FileMigrationDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = FileMigrationDelegate.mm; sourceTree = "<group>"; };
ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GPU_Operations_NEON.h; sourceTree = "<group>"; };
ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = GPU_Operations_NEON.cpp; sourceTree = "<group>"; };
ABDD89EF2C30BE97003482B7 /* OGLRender_ES3.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = OGLRender_ES3.h; sourceTree = "<group>"; };
ABDD89F02C30BE97003482B7 /* OGLRender_ES3.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = OGLRender_ES3.cpp; sourceTree = "<group>"; };
ABDDF7C41898F024007583C1 /* Icon_DisplayToggle_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_DisplayToggle_420x420.png; path = images/Icon_DisplayToggle_420x420.png; sourceTree = "<group>"; };
@ -5685,6 +5687,7 @@
AB1D4BB126E6F8D700A9AE42 /* GPU_Operations.cpp */,
AB1D4BB426E6F8D700A9AE42 /* GPU_Operations_SSE2.cpp */,
AB1D4BAF26E6F8D700A9AE42 /* GPU_Operations_AVX2.cpp */,
ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */,
ABD1FEB81345AC8400AF11D1 /* lua-engine.cpp */,
ABD1FEB91345AC8400AF11D1 /* matrix.cpp */,
ABD1FEBA1345AC8400AF11D1 /* mc.cpp */,
@ -5729,6 +5732,7 @@
AB1D4BB326E6F8D700A9AE42 /* GPU_Operations.h */,
AB1D4BB226E6F8D700A9AE42 /* GPU_Operations_SSE2.h */,
AB1D4BB026E6F8D700A9AE42 /* GPU_Operations_AVX2.h */,
ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */,
AB796CA215CDCB6B00C59155 /* instruction_attributes.h */,
AB796CA315CDCB6B00C59155 /* instructions.h */,
ABD1FE841345AC8400AF11D1 /* lua-engine.h */,

View File

@ -347,6 +347,12 @@
<Unit filename="../../../GPU_Operations_AVX2.h">
<Option target="&lt;{~None~}&gt;" />
</Unit>
<Unit filename="../../../GPU_Operations_NEON.cpp">
<Option target="&lt;{~None~}&gt;" />
</Unit>
<Unit filename="../../../GPU_Operations_NEON.h">
<Option target="&lt;{~None~}&gt;" />
</Unit>
<Unit filename="../../../GPU_Operations_SSE2.cpp">
<Option target="&lt;{~None~}&gt;" />
</Unit>