diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp index cde1350ee..03d11ef5f 100755 --- a/desmume/src/FIFO.cpp +++ b/desmume/src/FIFO.cpp @@ -38,6 +38,9 @@ #elif defined(ENABLE_SSE2) #define USEVECTORSIZE_128 #define VECTORSIZE 16 +#elif defined(ENABLE_NEON_A64) + #define USEVECTORSIZE_128 + #define VECTORSIZE 16 #elif defined(ENABLE_ALTIVEC) #define USEVECTORSIZE_128 #define VECTORSIZE 16 diff --git a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index 0be6acb2a..5394f2ac6 100755 --- a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -3636,6 +3636,8 @@ AB96EE861F990E4700B7AA67 /* lzio.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = lzio.c; sourceTree = ""; }; AB96EE871F990E4700B7AA67 /* lzio.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lzio.h; sourceTree = ""; }; AB9971CE134EDA0800531BA7 /* cocoa_globals.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_globals.h; sourceTree = ""; }; + ABA48DF527F95C2E00D961FB /* colorspacehandler_NEON.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_NEON.h; sourceTree = ""; }; + ABA48DF627F95C2E00D961FB /* colorspacehandler_NEON.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_NEON.cpp; sourceTree = ""; }; ABA6574914511EC90077E5E9 /* cocoa_cheat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_cheat.h; sourceTree = ""; }; ABA6574A14511EC90077E5E9 /* cocoa_cheat.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = cocoa_cheat.mm; sourceTree = ""; }; ABA731251BB5104200B26147 /* SIL Open Font License.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "SIL Open Font License.txt"; sourceTree = ""; }; @@ -4968,11 +4970,13 @@ ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */, ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */, ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */, + ABA48DF627F95C2E00D961FB /* colorspacehandler_NEON.cpp */, ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */, ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */, ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */, ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */, ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */, + ABA48DF527F95C2E00D961FB /* colorspacehandler_NEON.h */, ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */, ); path = colorspacehandler; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index f725f376c..b8c87e0e3 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2021 DeSmuME team + Copyright (C) 2016-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +30,10 @@ #include "colorspacehandler_SSE2.cpp" #endif +#if defined(ENABLE_NEON_A64) + #include "colorspacehandler_NEON.cpp" +#endif + #if defined(ENABLE_ALTIVEC) #include "colorspacehandler_AltiVec.cpp" #endif @@ -40,7 +44,7 @@ #elif defined(ENABLE_AVX2) #define USEVECTORSIZE_256 #define VECTORSIZE 32 -#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) +#elif defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC) #define USEVECTORSIZE_128 #define VECTORSIZE 16 #endif @@ -60,6 +64,8 @@ static const ColorspaceHandler_AVX2 csh; #elif defined(ENABLE_SSE2) static const ColorspaceHandler_SSE2 csh; + #elif defined(ENABLE_NEON_A64) + static const ColorspaceHandler_NEON csh; #elif defined(ENABLE_ALTIVEC) static const ColorspaceHandler_AltiVec csh; #else diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.cpp new file mode 100644 index 000000000..52c75ae16 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.cpp @@ -0,0 +1,1025 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_NEON.h" + +#ifndef ENABLE_NEON_A64 + #error This code requires ARM64 NEON support. +#else + +#include + +#define COLOR16_SWAPRB_NEON(src) vorrq_u16( vshlq_n_u16(vandq_u16(src,vdupq_n_u16(0x001F)),10), vorrq_u16( vandq_u16(src,vdupq_n_u16(0x03E0)), vorrq_u16(vshrq_n_u16(vandq_u16(src,vdupq_n_u16(0x7C00)),10), vandq_u16(src,vdupq_n_u16(0x8000))) ) ) + +#define COLOR32_SWAPRB_NEON(src) vcopyq_laneq_u32( vcopyq_laneq_u32(src, 2, src, 0), 0, src, 2 ) + +template +FORCEINLINE void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = vorrq_u16( vshlq_n_u16(srcColor,11), vandq_u16(vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8)) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0707))); + + v128u16 ga = vandq_u16(vshrq_n_u16(srcColor, 2), vdupq_n_u16(0x00F8) ); + ga = vorrq_u16(ga, vshrq_n_u16(ga, 5)); + ga = vorrq_u16(ga, srcAlphaBits); + + dstLo = vzip1q_u8(rb, ga); + dstHi = vzip2q_u8(rb, ga); + } + else + { + const v128u16 r = vandq_u16( vshlq_n_u16(srcColor, 3), vdupq_n_u16(0x00F8) ); + v128u16 rg = vorrq_u16( r, vandq_u16(vshlq_n_u16(srcColor, 6), vdupq_n_u16(0xF800)) ); + rg = vorrq_u16( rg, vandq_u16(vshrq_n_u16(rg, 5), vdupq_n_u16(0x0707)) ); + + v128u16 ba = vandq_u16( vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8) ); + ba = vorrq_u16(ba, vshrq_n_u16(ba, 5)); + ba = vorrq_u16(ba, srcAlphaBits); + + dstLo = vzip1q_u16(rg, ba); + dstHi = vzip2q_u16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = vorrq_u16( vshlq_n_u16(srcColor,11), vandq_u16(vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8)) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0707))); + + v128u16 g = vandq_u16(vshrq_n_u16(srcColor, 2), vdupq_n_u16(0x00F8) ); + g = vorrq_u16(g, vshrq_n_u16(g, 5)); + + dstLo = vzip1q_u8(rb, g); + dstHi = vzip2q_u8(rb, g); + } + else + { + const v128u16 r = vandq_u16( vshlq_n_u16(srcColor, 3), vdupq_n_u16(0x00F8) ); + v128u16 rg = vorrq_u16( r, vandq_u16(vshlq_n_u16(srcColor, 6), vdupq_n_u16(0xF800)) ); + rg = vorrq_u16( rg, vandq_u16(vshrq_n_u16(rg, 5), vdupq_n_u16(0x0707)) ); + + v128u16 b = vandq_u16( vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8) ); + b = vorrq_u16(b, vshrq_n_u16(b, 5)); + + dstLo = vzip1q_u16(rg, b); + dstHi = vzip2q_u16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = vandq_u16( vorrq_u16( vshlq_n_u16(srcColor,9), vshrq_n_u16(srcColor, 9)), vdupq_n_u16(0x3E3E) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0101))); + + v128u16 ga = vandq_u16(vshrq_n_u16(srcColor, 4), vdupq_n_u16(0x003E) ); + ga = vorrq_u16(ga, vshrq_n_u16(ga, 5)); + ga = vorrq_u16(ga, srcAlphaBits); + + dstLo = vzip1q_u8(rb, ga); + dstHi = vzip2q_u8(rb, ga); + } + else + { + const v128u16 r = vandq_u16( vshlq_n_u16(srcColor, 1), vdupq_n_u16(0x003E) ); + const v128u16 b = vandq_u16( vshrq_n_u16(srcColor, 9), vdupq_n_u16(0x003E) ); + + v128u16 rg = vorrq_u16( r, vandq_u16(vshlq_n_u16(srcColor, 4), vdupq_n_u16(0x3E00)) ); + rg = vorrq_u16( rg, vandq_u16(vshrq_n_u16(rg, 5), vdupq_n_u16(0x0101)) ); + + v128u16 ba = vorrq_u16(b, vshrq_n_u16(b, 5)); + ba = vorrq_u16(ba, srcAlphaBits); + + dstLo = vzip1q_u16(rg, ba); + dstHi = vzip2q_u16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = vandq_u16( vorrq_u16( vshlq_n_u16(srcColor,9), vshrq_n_u16(srcColor, 9)), vdupq_n_u16(0x3E3E) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0101))); + + v128u16 g = vandq_u16(vshrq_n_u16(srcColor, 4), vdupq_n_u16(0x003E) ); + g = vorrq_u16(g, vshrq_n_u16(g, 5)); + + dstLo = vzip1q_u8(rb, g); + dstHi = vzip2q_u8(rb, g); + } + else + { + const v128u16 r = vandq_u16( vshlq_n_u16(srcColor, 1), vdupq_n_u16(0x003E) ); + v128u16 rg = vorrq_u16( r, vandq_u16(vshlq_n_u16(srcColor, 4), vdupq_n_u16(0x3E00)) ); + rg = vorrq_u16( rg, vandq_u16(vshrq_n_u16(rg, 5), vdupq_n_u16(0x0101)) ); + + v128u16 b = vandq_u16( vshrq_n_u16(srcColor, 9), vdupq_n_u16(0x003E) ); + b = vorrq_u16(b, vshrq_n_u16(b, 5)); + + dstLo = vzip1q_u16(rg, b); + dstHi = vzip2q_u16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = vdupq_n_u16(0xFF00); + ColorspaceConvert555To8888_NEON(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = vdupq_n_u16(0x1F00); + ColorspaceConvert555To6665_NEON(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v128u32 rgb = vandq_u32( vshrq_n_u32(src, 2), vdupq_n_u32(0x003F3F3F) ); + const v128u32 a = vandq_u32( vshrq_n_u32(src, 3), vdupq_n_u32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = COLOR32_SWAPRB_NEON(rgb); + } + + return vorrq_u32(rgb, a); +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u32 rgb = vorrq_u32( vandq_u32(vshlq_n_u32(src, 2), vdupq_n_u32(0x00FCFCFC)), vandq_u32(vshrq_n_u32(src, 4), vdupq_n_u32(0x00030303)) ); + const v128u32 a = vorrq_u32( vandq_u32(vshlq_n_u32(src, 3), vdupq_n_u32(0xF8000000)), vandq_u32(vshrq_n_u32(src, 2), vdupq_n_u32(0x07000000)) ); + + if (SWAP_RB) + { + rgb = COLOR32_SWAPRB_NEON(rgb); + } + + return vorrq_u32(rgb, a); +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + v128u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 17), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 4), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshlq_n_u32(srcLo, 9), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 17), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 4), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshlq_n_u32(srcHi, 9), vdupq_n_u32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 1), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 4), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 7), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 1), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 4), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 7), vdupq_n_u32(0x00007C00)) ); + } + + // Convert alpha + alpha = vuzp1q_u16( vreinterpretq_u32_u16(vandq_u32(vshrq_n_u32(srcLo, 24), vdupq_n_u32(0x0000001F))), vreinterpretq_u32_u16(vandq_u32(vshrq_n_u32(srcHi, 24), vdupq_n_u32(0x0000001F))) ); + alpha = vcgtq_u16(alpha, vdupq_n_u16(0)); + alpha = vandq_u16(alpha, vdupq_n_u16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 19), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 6), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshlq_n_u32(srcLo, 7), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 19), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 6), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshlq_n_u32(srcHi, 7), vdupq_n_u32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 3), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 6), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 9), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 3), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 6), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 9), vdupq_n_u32(0x00007C00)) ); + } + + // Convert alpha + alpha = vuzp1q_u16( vreinterpretq_u32_u16(vshrq_n_u32(srcLo, 24)), vreinterpretq_u32_u16(vshrq_n_u32(srcHi, 24)) ); + alpha = vcgtq_u16(alpha, vdupq_n_u16(0)); + alpha = vandq_u16(alpha, vdupq_n_u16(0x8000)); + } + + return vorrq_u16( vuzp1q_u16(vreinterpretq_u32_u16(rgbLo), vreinterpretq_u32_u16(rgbHi)), alpha ); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_NEON(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_NEON(srcLo, srcHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src) +{ + if (SWAP_RB) + { + return vorrq_u32( COLOR32_SWAPRB_NEON(src), vdupq_n_u32(0xFF000000) ); + } + + return vorrq_u32( src, vdupq_n_u32(0xFF000000) ); +} + +template +FORCEINLINE v128u16 ColorspaceCopy16_NEON(const v128u16 &src) +{ + if (SWAP_RB) + { + return COLOR16_SWAPRB_NEON(src); + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_NEON(const v128u32 &src) +{ + if (SWAP_RB) + { + return COLOR32_SWAPRB_NEON(src); + } + + return src; +} + +template +FORCEINLINE v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity) +{ + v128u16 tempSrc = ColorspaceCopy16_NEON(src); + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return vandq_u16(tempSrc, vdupq_n_u16(0x8000)); + } + + v128u16 r = vandq_u16( tempSrc, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(tempSrc, 5), vdupq_n_u16(0x001F) ); + v128u16 b = vandq_u16( vshrq_n_u16(tempSrc, 10), vdupq_n_u16(0x001F) ); + v128u16 a = vandq_u16( tempSrc, vdupq_n_u16(0x8000) ); + + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + r = vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(r), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(r), intensityVec)) ); + g = vshlq_n_u32( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(g), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(g), intensityVec)) ), 5 ); + b = vshlq_n_u32( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(b), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(b), intensityVec)) ), 10 ); + + return vorrq_u16( vorrq_u16( vorrq_u16(r, g), b), a); +} + +template +FORCEINLINE v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity) +{ + v128u32 tempSrc = ColorspaceCopy32_NEON(src); + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return vandq_u32(tempSrc, vdupq_n_u32(0xFF000000)); + } + + v128u32 rb = vandq_u32( tempSrc, vdupq_n_u32(0x00FF00FF) ); + v128u32 g = vandq_u32( vshrq_n_u32(tempSrc, 8), vdupq_n_u32(0x000000FF) ); + v128u32 a = vandq_u32( tempSrc, vdupq_n_u32(0xFF000000) ); + + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(rb)), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(rb)), intensityVec) ); + g = vshlq_n_u32( vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(g) ), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(g) ), intensityVec) ), 8 ); + + return vorrq_u32( vorrq_u32(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_NEON(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + v128u16 srcVec; + uint32x4x2_t dstVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u16(src+i); + ColorspaceConvert555To8888Opaque_NEON(srcVec, dstVec.val[0], dstVec.val[1]); + vst1q_u32_x2(dst+i, dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_NEON(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 srcVec; + uint32x4x2_t dstVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u16(src+i); + ColorspaceConvert555To6665Opaque_NEON(srcVec, dstVec.val[0], dstVec.val[1]); + vst1q_u32_x2(dst+i, dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + vst1q_u32( dst+i, ColorspaceConvert8888To6665_NEON(vld1q_u32(src+i)) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + vst1q_u32( dst+i, ColorspaceConvert6665To8888_NEON(vld1q_u32(src+i)) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_NEON(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint32x4x2_t srcVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u32_x2(src+i); + vst1q_u16( dst+i, ColorspaceConvert8888To5551_NEON(srcVec.val[0], srcVec.val[1]) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_NEON(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint32x4x2_t srcVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u32_x2(src+i); + vst1q_u16( dst+i, ColorspaceConvert6665To5551_NEON(srcVec.val[0], srcVec.val[1]) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + uint8x16x4_t srcVec_x4; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) + { + srcVec_x4 = vld4q_u8((u8 *)(src+i)); + + if (SWAP_RB) + { + srcVec_x4.val[3] = srcVec_x4.val[0]; // Use the alpha channel as temp storage since we're overwriting it anyways. + srcVec_x4.val[0] = srcVec_x4.val[2]; + srcVec_x4.val[2] = srcVec_x4.val[3]; + } + + srcVec_x4.val[3] = vdupq_n_u8(0xFF); + vst4q_u8((u8 *)(dst+i), *((uint8x16x4_t *)&srcVec_x4)); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_NEON(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint16x8x2_t srcVec; + uint8x16x3_t dstVec; + uint16x8_t tempRBLo; + uint16x8_t tempRBHi; + + for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2)) + { + srcVec = vld1q_u16_x2(src+i); + tempRBLo = vorrq_u16( vshlq_n_u16(srcVec.val[0], 11), vshrq_n_u16(srcVec.val[0], 7) ); + tempRBHi = vorrq_u16( vshlq_n_u16(srcVec.val[1], 11), vshrq_n_u16(srcVec.val[1], 7) ); + + if (SWAP_RB) + { + dstVec.val[2] = vandq_u8( vuzp1q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + dstVec.val[0] = vandq_u8( vuzp2q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + } + else + { + dstVec.val[0] = vandq_u8( vuzp1q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + dstVec.val[2] = vandq_u8( vuzp2q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + } + + dstVec.val[1] = vandq_u8( vuzp1q_u8( vreinterpretq_u8_u16(vshrq_n_u16(srcVec.val[0], 2)), vreinterpretq_u8_u16(vshrq_n_u16(srcVec.val[1], 2)) ), vdupq_n_u8(0xF8) ); + + dstVec.val[0] = vorrq_u8(dstVec.val[0], vshrq_n_u32(dstVec.val[0], 5)); + dstVec.val[1] = vorrq_u8(dstVec.val[1], vshrq_n_u32(dstVec.val[1], 5)); + dstVec.val[2] = vorrq_u8(dstVec.val[2], vshrq_n_u32(dstVec.val[2], 5)); + + vst3q_u8(dst+(i*3), dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_NEON(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint8x16x4_t srcVec_x4; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) + { + srcVec_x4 = vld4q_u8((u8 *)(src+i)); + + if (SWAP_RB) + { + srcVec_x4.val[3] = srcVec_x4.val[0]; // Use the alpha channel as temp storage since we're dropping it anyways. + srcVec_x4.val[0] = srcVec_x4.val[2]; + srcVec_x4.val[2] = srcVec_x4.val[3]; + } + + vst3q_u8(dst+(i*3), *((uint8x16x3_t *)&srcVec_x4)); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_NEON(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 src_vec128 = vld1q_u16(src+i); + vst1q_u16(dst+i, ColorspaceCopy16_NEON(src_vec128)); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 src_vec128 = vld1q_u32(src+i); + vst1q_u32(dst+i, ColorspaceCopy32_NEON(src_vec128)); + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_NEON(u16 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + const v128u16 dstVec = vld1q_u16(dst+i); + const v128u16 tempDst = COLOR16_SWAPRB_NEON(dstVec); + vst1q_u16(dst+i, tempDst); + } + } + else + { + return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + const uint16x8_t alphaMask = vdupq_n_u16(0x8000); + uint16x8x4_t src; + + for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16))*4)) + { + src = vld1q_u16_x4(dst+i); + src.val[0] = vandq_u16(src.val[0], alphaMask); + src.val[1] = vandq_u16(src.val[1], alphaMask); + src.val[2] = vandq_u16(src.val[2], alphaMask); + src.val[3] = vandq_u16(src.val[3], alphaMask); + + vst1q_u16_x4(dst+i, src); + } + } + else + { + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + const v128u16 dstVec = vld1q_u16(dst+i); + v128u16 tempDst = (SWAP_RB) ? COLOR16_SWAPRB_NEON(dstVec) : dstVec; + + v128u16 r = vandq_u16( tempDst, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(tempDst, 5), vdupq_n_u16(0x001F) ); + v128u16 b = vandq_u16( vshrq_n_u16(tempDst, 10), vdupq_n_u16(0x001F) ); + v128u16 a = vandq_u16( tempDst, vdupq_n_u16(0x8000) ); + + r = vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(r), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(r), intensityVec)) ); + g = vshlq_n_u32( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(g), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(g), intensityVec)) ), 5 ); + b = vshlq_n_u32( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(b), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(b), intensityVec)) ), 10 ); + + tempDst = vorrq_u32( vorrq_u32( vorrq_u32(r, g), b), a); + + vst1q_u16(dst+i, tempDst); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_NEON(u32 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + uint32x4x4_t src; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32))*4)) + { + src = vld1q_u32_x4(dst+i); + src.val[0] = COLOR32_SWAPRB_NEON(src.val[0]); + src.val[1] = COLOR32_SWAPRB_NEON(src.val[1]); + src.val[2] = COLOR32_SWAPRB_NEON(src.val[2]); + src.val[3] = COLOR32_SWAPRB_NEON(src.val[3]); + + vst1q_u32_x4(dst+i, src); + } + } + else + { + return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + const uint32x4_t alphaMask = vdupq_n_u32(0xFF000000); + uint32x4x4_t src; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32))*4)) + { + src = vld1q_u32_x4(dst+i); + src.val[0] = vandq_u32(src.val[0], alphaMask); + src.val[1] = vandq_u32(src.val[1], alphaMask); + src.val[2] = vandq_u32(src.val[2], alphaMask); + src.val[3] = vandq_u32(src.val[3], alphaMask); + + vst1q_u32_x4(dst+i, src); + } + } + else + { + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 dstVec = vld1q_u32(dst+i); + v128u32 tempDst = (SWAP_RB) ? COLOR32_SWAPRB_NEON(dstVec) : dstVec; + + v128u32 rb = vandq_u32( tempDst, vdupq_n_u32(0x00FF00FF) ); + v128u32 g = vandq_u32( vshrq_n_u32(tempDst, 8), vdupq_n_u32(0x000000FF) ); + v128u32 a = vandq_u32( tempDst, vdupq_n_u32(0xFF000000) ); + + rb = vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(rb)), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(rb)), intensityVec) ); + g = vshlq_n_u32( vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(g) ), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(g) ), intensityVec) ), 8 ); + + tempDst = vorrq_u32( vorrq_u32(rb, g), a); + vst1q_u32(dst+i, tempDst); + } + } + + return i; +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); + +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); + +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); +template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); + +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); + +#endif // ENABLE_NEON_A64 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.h b/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.h new file mode 100644 index 000000000..0669fb659 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_NEON.h @@ -0,0 +1,114 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_NEON_H +#define COLORSPACEHANDLER_NEON_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_NEON_A64 + #warning This header requires ARM64 NEON support. +#else + +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 C6olorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); + +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); + +class ColorspaceHandler_NEON : public ColorspaceHandler +{ +public: + ColorspaceHandler_NEON() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_NEON_A64 + +#endif // COLORSPACEHANDLER_NEON_H