GPU: Add NEON-accelerated functions for 2D layer compositing. (For 64-bit ARM CPUs only.)
- This improves GPU performance by up to 20% on the Raspberry Pi 5, and up to 50% on Apple Silicon CPUs.
This commit is contained in:
parent
e2379a66d6
commit
c73a7ffe53
|
@ -2,7 +2,7 @@
|
|||
Copyright (C) 2006 yopyop
|
||||
Copyright (C) 2006-2007 Theo Berkau
|
||||
Copyright (C) 2007 shash
|
||||
Copyright (C) 2008-2024 DeSmuME team
|
||||
Copyright (C) 2008-2025 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -56,6 +56,9 @@
|
|||
#elif defined(ENABLE_SSE2)
|
||||
#define USEVECTORSIZE_128
|
||||
#define VECTORSIZE 16
|
||||
#elif defined(ENABLE_NEON_A64)
|
||||
#define USEVECTORSIZE_128
|
||||
#define VECTORSIZE 16
|
||||
#endif
|
||||
|
||||
#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2021-2023 DeSmuME team
|
||||
Copyright (C) 2021-2025 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -758,6 +758,8 @@ static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead,
|
|||
#include "GPU_Operations_AVX2.cpp"
|
||||
#elif defined(ENABLE_SSE2)
|
||||
#include "GPU_Operations_SSE2.cpp"
|
||||
#elif defined(ENABLE_NEON_A64)
|
||||
#include "GPU_Operations_NEON.cpp"
|
||||
#else
|
||||
|
||||
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,122 @@
|
|||
/*
|
||||
Copyright (C) 2025 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this software. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef GPU_OPERATIONS_NEON_H
|
||||
#define GPU_OPERATIONS_NEON_H
|
||||
|
||||
#include "GPU_Operations.h"
|
||||
|
||||
#ifndef ENABLE_NEON_A64
|
||||
#warning This header requires ARM64 NEON support.
|
||||
#else
|
||||
|
||||
// Color-math interface for the ARM64 NEON code path. This class is only declared
// when ENABLE_NEON_A64 is defined (see the surrounding #ifndef/#else guard).
//
// Every operation is declared twice: once taking/returning v128u16 vectors and
// once as a template on NDSColorFormat taking/returning v128u32 vectors. Only the
// declarations are visible here; the definitions presumably live in
// GPU_Operations_NEON.cpp -- TODO confirm.
class ColorOperation_NEON
|
||||
{
|
||||
public:
|
||||
ColorOperation_NEON() {};
|
||||
|
||||
// blend: takes two colors (colA, colB) and two v128u16 coefficient vectors
// (blendEVA, blendEVB). NOTE(review): presumably EVA weights colA and EVB
// weights colB -- confirm against the definition.
FORCEINLINE v128u16 blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
|
||||
template<NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE v128u32 blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
|
||||
|
||||
// blend3D: takes no explicit coefficient arguments. The v128u16 overload receives
// colA as two v128u32 halves (colA_Lo, colA_Hi) and colB as a single v128u16.
FORCEINLINE v128u16 blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const;
|
||||
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 blend3D(const v128u32 &colA, const v128u32 &colB) const;
|
||||
|
||||
// increase: takes one color vector and a v128u16 blendEVY vector.
// NOTE(review): presumably the brightness-up counterpart of decrease -- confirm.
FORCEINLINE v128u16 increase(const v128u16 &col, const v128u16 &blendEVY) const;
|
||||
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 increase(const v128u32 &col, const v128u16 &blendEVY) const;
|
||||
|
||||
// decrease: same parameter shape as increase (one color vector plus blendEVY).
FORCEINLINE v128u16 decrease(const v128u16 &col, const v128u16 &blendEVY) const;
|
||||
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 decrease(const v128u32 &col, const v128u16 &blendEVY) const;
|
||||
};
|
||||
|
||||
// Pixel-compositing interface for the ARM64 NEON code path. This class is only
// declared when ENABLE_NEON_A64 is defined (see the surrounding #ifndef/#else guard).
//
// Each operation comes in a "16" variant whose sources are two v128u16 vectors
// (src1, src0) and a "32" variant whose sources are four v128u32 vectors
// (src3..src0). All methods are const, return void, and receive a mutable
// GPUEngineCompositorInfo reference. Only declarations are visible here; the
// definitions presumably live in GPU_Operations_NEON.cpp -- TODO confirm.
class PixelOperation_NEON
|
||||
{
|
||||
protected:
|
||||
// Copy helpers, templated on the output format and an ISDEBUGRENDER flag.
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
// "Mask" variants take an additional v128u8 passMask8 argument.
// NOTE(review): presumably a per-pixel pass/fail mask -- confirm.
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
// Brightness-up helpers: same shape as the copy helpers plus a v128u16 evy16 vector.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
// Brightness-down helpers: parameter lists mirror the brightness-up helpers.
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
|
||||
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
|
||||
|
||||
// "Unknown effect" helpers: additionally templated on the layer type, and take
// per-pixel v128u8 vectors for the source-effect enable mask, the destination
// blend enable LUT, the color-effect enable mask, sprite alpha and sprite mode.
// NOTE(review): presumably used when the color effect must be selected per
// pixel rather than per call -- confirm against the definition.
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
|
||||
FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
|
||||
const v128u8 &passMask8,
|
||||
const v128u16 &evy16,
|
||||
const v128u8 &srcLayerID,
|
||||
const v128u16 &src1, const v128u16 &src0,
|
||||
const v128u8 &srcEffectEnableMask,
|
||||
const v128u8 &dstBlendEnableMaskLUT,
|
||||
const v128u8 &enableColorEffectMask,
|
||||
const v128u8 &spriteAlpha,
|
||||
const v128u8 &spriteMode) const;
|
||||
|
||||
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
|
||||
FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
|
||||
const v128u8 &passMask8,
|
||||
const v128u16 &evy16,
|
||||
const v128u8 &srcLayerID,
|
||||
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
|
||||
const v128u8 &srcEffectEnableMask,
|
||||
const v128u8 &dstBlendEnableMaskLUT,
|
||||
const v128u8 &enableColorEffectMask,
|
||||
const v128u8 &spriteAlpha,
|
||||
const v128u8 &spriteMode) const;
|
||||
|
||||
public:
|
||||
PixelOperation_NEON() {};
|
||||
|
||||
// Public entry points, templated on compositor mode, output format, layer type
// and whether a window test is performed. Unlike the protected helpers, the
// color-effect enable, sprite alpha and sprite mode inputs arrive as
// `const u8 *__restrict` pointers rather than as vectors, and a scalar
// didAllPixelsPass flag accompanies passMask8.
// NOTE(review): presumably these dispatch to the protected helpers above based
// on COMPOSITORMODE -- the definitions are not visible here; confirm.
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
|
||||
FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
|
||||
const bool didAllPixelsPass,
|
||||
const v128u8 &passMask8,
|
||||
const v128u16 &evy16,
|
||||
const v128u8 &srcLayerID,
|
||||
const v128u16 &src1, const v128u16 &src0,
|
||||
const v128u8 &srcEffectEnableMask,
|
||||
const v128u8 &dstBlendEnableMaskLUT,
|
||||
const u8 *__restrict enableColorEffectPtr,
|
||||
const u8 *__restrict sprAlphaPtr,
|
||||
const u8 *__restrict sprModePtr) const;
|
||||
|
||||
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
|
||||
FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
|
||||
const bool didAllPixelsPass,
|
||||
const v128u8 &passMask8,
|
||||
const v128u16 &evy16,
|
||||
const v128u8 &srcLayerID,
|
||||
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
|
||||
const v128u8 &srcEffectEnableMask,
|
||||
const v128u8 &dstBlendEnableMaskLUT,
|
||||
const u8 *__restrict enableColorEffectPtr,
|
||||
const u8 *__restrict sprAlphaPtr,
|
||||
const u8 *__restrict sprModePtr) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_NEON_A64
|
||||
|
||||
#endif // GPU_OPERATIONS_NEON_H
|
|
@ -4416,6 +4416,8 @@
|
|||
ABD2CE4426E05CB000FB15F7 /* DeSmuME (x86_64h).app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "DeSmuME (x86_64h).app"; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
ABD42045172319D1006A9B46 /* FileMigrationDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FileMigrationDelegate.h; sourceTree = "<group>"; };
|
||||
ABD42046172319D1006A9B46 /* FileMigrationDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = FileMigrationDelegate.mm; sourceTree = "<group>"; };
|
||||
ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GPU_Operations_NEON.h; sourceTree = "<group>"; };
|
||||
ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = GPU_Operations_NEON.cpp; sourceTree = "<group>"; };
|
||||
ABDD89EF2C30BE97003482B7 /* OGLRender_ES3.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = OGLRender_ES3.h; sourceTree = "<group>"; };
|
||||
ABDD89F02C30BE97003482B7 /* OGLRender_ES3.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = OGLRender_ES3.cpp; sourceTree = "<group>"; };
|
||||
ABDDF7C41898F024007583C1 /* Icon_DisplayToggle_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_DisplayToggle_420x420.png; path = images/Icon_DisplayToggle_420x420.png; sourceTree = "<group>"; };
|
||||
|
@ -5685,6 +5687,7 @@
|
|||
AB1D4BB126E6F8D700A9AE42 /* GPU_Operations.cpp */,
|
||||
AB1D4BB426E6F8D700A9AE42 /* GPU_Operations_SSE2.cpp */,
|
||||
AB1D4BAF26E6F8D700A9AE42 /* GPU_Operations_AVX2.cpp */,
|
||||
ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */,
|
||||
ABD1FEB81345AC8400AF11D1 /* lua-engine.cpp */,
|
||||
ABD1FEB91345AC8400AF11D1 /* matrix.cpp */,
|
||||
ABD1FEBA1345AC8400AF11D1 /* mc.cpp */,
|
||||
|
@ -5729,6 +5732,7 @@
|
|||
AB1D4BB326E6F8D700A9AE42 /* GPU_Operations.h */,
|
||||
AB1D4BB226E6F8D700A9AE42 /* GPU_Operations_SSE2.h */,
|
||||
AB1D4BB026E6F8D700A9AE42 /* GPU_Operations_AVX2.h */,
|
||||
ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */,
|
||||
AB796CA215CDCB6B00C59155 /* instruction_attributes.h */,
|
||||
AB796CA315CDCB6B00C59155 /* instructions.h */,
|
||||
ABD1FE841345AC8400AF11D1 /* lua-engine.h */,
|
||||
|
|
|
@ -347,6 +347,12 @@
|
|||
<Unit filename="../../../GPU_Operations_AVX2.h">
|
||||
<Option target="<{~None~}>" />
|
||||
</Unit>
|
||||
<Unit filename="../../../GPU_Operations_NEON.cpp">
|
||||
<Option target="<{~None~}>" />
|
||||
</Unit>
|
||||
<Unit filename="../../../GPU_Operations_NEON.h">
|
||||
<Option target="<{~None~}>" />
|
||||
</Unit>
|
||||
<Unit filename="../../../GPU_Operations_SSE2.cpp">
|
||||
<Option target="<{~None~}>" />
|
||||
</Unit>
|
||||
|
|
Loading…
Reference in New Issue