diff --git a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index b40938d82..d53c0e26e 100644 --- a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -2685,6 +2685,8 @@ ABC570D0134431CE00E7B0B1 /* AudioUnit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AudioUnit.framework; path = System/Library/Frameworks/AudioUnit.framework; sourceTree = SDKROOT; }; ABC570D4134431DA00E7B0B1 /* OpenGL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenGL.framework; path = System/Library/Frameworks/OpenGL.framework; sourceTree = SDKROOT; }; ABC719E1138CB25E002827A9 /* DefaultKeyMappings.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = DefaultKeyMappings.plist; sourceTree = ""; }; + ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX512.cpp; sourceTree = ""; }; + ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX512.h; sourceTree = ""; }; ABCFA9F2178BDE920030C8BA /* encrypt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = encrypt.h; sourceTree = ""; }; ABCFA9F3178BDE920030C8BA /* encrypt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = encrypt.cpp; sourceTree = ""; }; ABD103FE1346652500AF11D1 /* cocoa_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_core.h; sourceTree = ""; }; @@ -3842,10 +3844,12 @@ children = ( ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */, ABBFFF7B1D610457003CD598 /* 
colorspacehandler_AVX2.cpp */, + ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */, ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */, ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */, ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */, ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */, + ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */, ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */, ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */, ); @@ -7073,6 +7077,7 @@ GDB_STUB, ); MACOSX_DEPLOYMENT_TARGET = 10.7; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; PRODUCT_NAME = "DeSmuME (Debug, dev+)"; }; name = Debug; @@ -7087,6 +7092,7 @@ GDB_STUB, ); MACOSX_DEPLOYMENT_TARGET = 10.7; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; PRODUCT_NAME = "DeSmuME (dev+)"; }; name = Release; @@ -7257,6 +7263,9 @@ INFOPLIST_FILE = "Info (Debug).plist"; LD_NO_PIE = YES; MACOSX_DEPLOYMENT_TARGET = 10.5; + MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal11; + MTL_OPTIMIZATION_LEVEL = 3; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = "-I./../../"; PRODUCT_NAME = "DeSmuME (Debug)"; @@ -7309,6 +7318,9 @@ INFOPLIST_FILE = Info.plist; LD_NO_PIE = YES; MACOSX_DEPLOYMENT_TARGET = 10.5; + MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal11; + MTL_OPTIMIZATION_LEVEL = 3; OTHER_CFLAGS = "-I./../../"; PRODUCT_NAME = DeSmuME; SDKROOT = macosx; diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index d400e2481..0fc3ac4cb 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -37,10 +37,6 @@ #include #endif -#ifdef ENABLE_AVX -#include -#endif - enum MatrixMode { MATRIXMODE_PROJECTION = 0, @@ -159,7 +155,47 @@ FORCEINLINE s32 sfx32_shiftdown(const s64 a) // SIMD Functions //------------- -#if defined(ENABLE_AVX) +#if defined(ENABLE_AVX512_0) + +static void memset_u16(void *dst, const u16 val, const size_t elementCount) +{ + v512u16 *dst_vec512 = (v512u16 *)dst; + const size_t length_vec512 = elementCount / (sizeof(v512u16) / sizeof(u16)); + + 
const v512u16 val_vec512 = _mm512_set1_epi16(val); + for (size_t i = 0; i < length_vec512; i++) + _mm512_stream_si512(dst_vec512 + i, val_vec512); +} + +template +static void memset_u16_fast(void *dst, const u16 val) +{ + v512u16 *dst_vec512 = (v512u16 *)dst; + + const v512u16 val_vec512 = _mm512_set1_epi16(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v512u16) / sizeof(u16)), _mm512_store_si512(dst_vec512 + (X), val_vec512)); +} + +static void memset_u32(void *dst, const u32 val, const size_t elementCount) +{ + v512u32 *dst_vec512 = (v512u32 *)dst; + const size_t length_vec512 = elementCount / (sizeof(v512u32) / sizeof(u32)); + + const v512u32 val_vec512 = _mm512_set1_epi32(val); + for (size_t i = 0; i < length_vec512; i++) + _mm512_stream_si512(dst_vec512 + i, val_vec512); +} + +template +static void memset_u32_fast(void *dst, const u32 val) +{ + v512u32 *dst_vec512 = (v512u32 *)dst; + + const v512u32 val_vec512 = _mm512_set1_epi32(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v512u32) / sizeof(u32)), _mm512_store_si512(dst_vec512 + (X), val_vec512)); +} + +#elif defined(ENABLE_AVX) static void memset_u16(void *dst, const u16 val, const size_t elementCount) { diff --git a/desmume/src/types.h b/desmume/src/types.h index 689608166..d3bf54f42 100755 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -49,6 +49,10 @@ #endif #ifdef __GNUC__ + #ifdef __ALTIVEC__ + #define ENABLE_ALTIVEC + #endif + #ifdef __SSE__ #define ENABLE_SSE #endif @@ -81,8 +85,27 @@ #define ENABLE_AVX2 #endif - #ifdef __ALTIVEC__ - #define ENABLE_ALTIVEC + // AVX-512 is special because it has multiple tiers of support. + // + // For our case, Tier-0 will be the baseline AVX-512 tier that includes the basic Foundation and + // Conflict Detection extensions, which should be supported on all AVX-512 CPUs. Higher tiers + // include more extensions, where each higher tier also assumes support for all lower tiers. + // + // For typical use cases in DeSmuME, the most practical AVX-512 tier will be Tier-1. 
+ #if defined(__AVX512F__) && defined(__AVX512CD__) + #define ENABLE_AVX512_0 + #endif + + #if defined(ENABLE_AVX512_0) && defined(__AVX512BW__) && defined(__AVX512DQ__) + #define ENABLE_AVX512_1 + #endif + + #if defined(ENABLE_AVX512_1) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) + #define ENABLE_AVX512_2 + #endif + + #if defined(ENABLE_AVX512_2) && defined(__AVX512VNNI__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) + #define ENABLE_AVX512_3 #endif #endif @@ -245,7 +268,8 @@ typedef __m128i v128u32; typedef __m128i v128s32; #endif -#ifdef ENABLE_AVX +#if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0) + #include typedef __m256i v256u8; typedef __m256i v256s8; @@ -253,8 +277,18 @@ typedef __m256i v256u16; typedef __m256i v256s16; typedef __m256i v256u32; typedef __m256i v256s32; + +#if defined(ENABLE_AVX512_0) +typedef __m512i v512u8; +typedef __m512i v512s8; +typedef __m512i v512u16; +typedef __m512i v512s16; +typedef __m512i v512u32; +typedef __m512i v512s32; #endif +#endif // defined(ENABLE_AVX) || defined(ENABLE_AVX512_0) + /*---------- GPU3D fixed-points types -----------*/ typedef s32 f32; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index 980bfdf7c..17755af12 100755 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2017 DeSmuME team + Copyright (C) 2016-2019 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,19 +18,31 @@ #include "colorspacehandler.h" #include -#if defined(ENABLE_AVX2) - #include "colorspacehandler_AVX2.cpp" - #include "colorspacehandler_SSE2.cpp" -#elif defined(ENABLE_SSE2) - #include "colorspacehandler_SSE2.cpp" -#elif defined(ENABLE_ALTIVEC) - #include "colorspacehandler_AltiVec.cpp" +#if defined(ENABLE_AVX512_1) 
+ #include "colorspacehandler_AVX512.cpp" #endif #if defined(ENABLE_AVX2) + #include "colorspacehandler_AVX2.cpp" +#endif + +#if defined(ENABLE_SSE2) + #include "colorspacehandler_SSE2.cpp" +#endif + +#if defined(ENABLE_ALTIVEC) + #include "colorspacehandler_AltiVec.cpp" +#endif + +#if defined(ENABLE_AVX512_1) + #define USEVECTORSIZE_512 + #define VECTORSIZE 64 +#elif defined(ENABLE_AVX2) #define USEVECTORSIZE_256 + #define VECTORSIZE 32 #elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) #define USEVECTORSIZE_128 + #define VECTORSIZE 16 #endif // By default, the hand-coded vectorized code will be used instead of a compiler's built-in @@ -42,7 +54,9 @@ #endif #ifdef USEMANUALVECTORIZATION - #if defined(ENABLE_AVX2) + #if defined(ENABLE_AVX512_1) + static const ColorspaceHandler_AVX512 csh; + #elif defined(ENABLE_AVX2) static const ColorspaceHandler_AVX2 csh; #elif defined(ENABLE_SSE2) static const ColorspaceHandler_SSE2 csh; @@ -153,14 +167,7 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if (SWAP_RB) { @@ -201,14 +208,7 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if 
(SWAP_RB) { @@ -249,14 +249,7 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 8); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 4); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); if (SWAP_RB) { @@ -297,14 +290,7 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 8); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 4); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); if (SWAP_RB) { @@ -345,14 +331,7 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if (SWAP_RB) { @@ -393,14 +372,7 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t 
pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if (SWAP_RB) { @@ -441,14 +413,7 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); if (SWAP_RB) { @@ -489,14 +454,7 @@ void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u16)) * 2)); if (SWAP_RB) { @@ -537,14 +495,7 @@ void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u32)) * 4)); if (SWAP_RB) { @@ -591,14 +542,7 @@ void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount) size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif 
defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if (IS_UNALIGNED) { @@ -631,14 +575,7 @@ void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount) size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 8); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 4); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); if (IS_UNALIGNED) { @@ -665,14 +602,7 @@ void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensi size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); if (SWAP_RB) { @@ -750,14 +680,7 @@ void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensi size_t i = 0; #ifdef USEMANUALVECTORIZATION - -#if defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); -#elif defined(USEVECTORSIZE_256) - const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); -#endif + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); if (SWAP_RB) { diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp 
b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp index b3e2880c0..50f99b363 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2018 DeSmuME team + Copyright (C) 2016-2019 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,19 +30,74 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 11), _mm256_srli_epi16(srcColor, 7)), _mm256_set1_epi16(0xF8F8) ); - v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits); + if (SWAP_RB) + { + v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707))); + + v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) ); + ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5)); + ga = _mm256_or_si256(ga, srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, ga); + dstHi = _mm256_unpackhi_epi8(rb, ga); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0707)) ); + + v256u16 ba = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) ); + ba = 
_mm256_or_si256(ba, _mm256_srli_epi16(ba, 5)); + ba = _mm256_or_si256(ba, srcAlphaBits); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + ba = _mm256_permute4x64_epi64(ba, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, ba); + dstHi = _mm256_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - rb = _mm256_permute4x64_epi64(rb, 0xD8); - ga = _mm256_permute4x64_epi64(ga, 0xD8); - - dstLo = _mm256_unpacklo_epi16(rb, ga); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); - dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); - - dstHi = _mm256_unpackhi_epi16(rb, ga); - dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) ); - dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? 
_mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); + if (SWAP_RB) + { + v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707))); + + v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) ); + g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5)); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, g); + dstHi = _mm256_unpackhi_epi8(rb, g); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi32(rg, 5), _mm256_set1_epi16(0x0707)) ); + + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) ); + b = _mm256_or_si256(b, _mm256_srli_epi32(b, 5)); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + b = _mm256_permute4x64_epi64( b, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, b); + dstHi = _mm256_unpackhi_epi16(rg, b); + } } template @@ -51,19 +106,75 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) ); - v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E)), srcAlphaBits); + if (SWAP_RB) + { + v256u16 rb = _mm256_and_si256( 
_mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101))); + + v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) ); + ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5)); + ga = _mm256_or_si256(ga, srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, ga); + dstHi = _mm256_unpackhi_epi8(rb, ga); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) ); + const v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) ); + + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) ); + + v256u16 ba = _mm256_or_si256(b, _mm256_srli_epi16(b, 5)); + ba = _mm256_or_si256(ba, srcAlphaBits); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + ba = _mm256_permute4x64_epi64(ba, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, ba); + dstHi = _mm256_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - rb = _mm256_permute4x64_epi64(rb, 0xD8); - ga = _mm256_permute4x64_epi64(ga, 0xD8); - - dstLo = _mm256_unpacklo_epi16(rb, ga); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); - dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? 
_mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); - - dstHi = _mm256_unpackhi_epi16(rb, ga); - dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) ); - dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); + if (SWAP_RB) + { + v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101))); + + v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) ); + g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5)); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, g); + dstHi = _mm256_unpackhi_epi8(rb, g); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) ); + + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) ); + b = _mm256_or_si256(b, _mm256_srli_epi16(b, 5)); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + b = _mm256_permute4x64_epi64( b, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, b); + dstHi = _mm256_unpackhi_epi16(rg, b); + } } template @@ -86,18 +197,13 @@ FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src) // Conversion 
algorithm: // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) - v256u32 rgb; - const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) ); + v256u32 rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) ); if (SWAP_RB) { - rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); } - else - { - rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); - } return _mm256_or_si256(rgb, a); } @@ -288,18 +394,16 @@ FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float in return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000)); } - v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x000000FF) ); - v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) ); - v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 16), _mm256_set1_epi32(0x000000FF) ); - v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) ); + v256u16 rb = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x00FF00FF) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) ); + v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) ); const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - r = _mm256_mulhi_epu16(r, intensity_v256); - g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 ); - b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 ); + rb = _mm256_mulhi_epu16(rb, intensity_v256); + g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 ); - return _mm256_or_si256( _mm256_or_si256( 
_mm256_or_si256(r, g), b), a); + return _mm256_or_si256( _mm256_or_si256(rb, g), a); } template @@ -307,7 +411,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict { size_t i = 0; - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); v256u32 dstConvertedLo, dstConvertedHi; @@ -315,13 +419,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict if (IS_UNALIGNED) { - _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); - _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); } else { - _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); - _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); } } @@ -333,7 +437,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3 { size_t i = 0; - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); v256u32 dstConvertedLo, dstConvertedHi; @@ -341,13 +445,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3 if (IS_UNALIGNED) { - _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); - _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); } else { - _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); - _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); } } @@ -359,7 +463,7 @@ size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t p { size_t i = 0; - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -379,7 +483,7 @@ size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t p { size_t i = 0; - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -399,15 +503,15 @@ size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__ { size_t i = 0; - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { if (IS_UNALIGNED) { - _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); } else { - _mm256_store_si256( (v256u16 *)(dst+i), 
ColorspaceConvert8888To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); } } @@ -419,15 +523,15 @@ size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__ { size_t i = 0; - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { if (IS_UNALIGNED) { - _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); } else { - _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); } } @@ -439,7 +543,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si { size_t i = 0; - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -461,17 +565,17 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re v256u16 src_v256u16[2]; v256u32 src_v256u32[4]; - for (; i < pixCountVec256; i+=32) + for (; i < pixCountVec256; i+=((sizeof(v256u16)/sizeof(u16)) * 2)) { if (IS_UNALIGNED) { - src_v256u16[0] = _mm256_loadu_si256((v256u16 *)(src + i + 0)); - src_v256u16[1] = _mm256_loadu_si256((v256u16 *)(src + i + 16)); + src_v256u16[0] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) ); 
+ src_v256u16[1] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) ); } else { - src_v256u16[0] = _mm256_load_si256((v256u16 *)(src + i + 0)); - src_v256u16[1] = _mm256_load_si256((v256u16 *)(src + i + 16)); + src_v256u16[0] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) ); + src_v256u16[1] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) ); } v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) ); @@ -516,15 +620,15 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re if (IS_UNALIGNED) { - _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); - _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); - _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); } else { - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 
(sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); } } @@ -537,21 +641,21 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re size_t i = 0; v256u32 src_v256u32[4]; - for (; i < pixCountVec256; i+=32) + for (; i < pixCountVec256; i+=((sizeof(v256u32)/sizeof(u32)) * 4)) { if (IS_UNALIGNED) { - src_v256u32[0] = _mm256_loadu_si256((v256u32 *)(src + i + 0)); - src_v256u32[1] = _mm256_loadu_si256((v256u32 *)(src + i + 8)); - src_v256u32[2] = _mm256_loadu_si256((v256u32 *)(src + i + 16)); - src_v256u32[3] = _mm256_loadu_si256((v256u32 *)(src + i + 24)); + src_v256u32[0] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) ); + src_v256u32[1] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) ); + src_v256u32[2] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) ); + src_v256u32[3] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) ); } else { - src_v256u32[0] = _mm256_load_si256((v256u32 *)(src + i + 0)); - src_v256u32[1] = _mm256_load_si256((v256u32 *)(src + i + 8)); - src_v256u32[2] = _mm256_load_si256((v256u32 *)(src + i + 16)); - src_v256u32[3] = _mm256_load_si256((v256u32 *)(src + i + 24)); + src_v256u32[0] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) ); + src_v256u32[1] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) ); + src_v256u32[2] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) ); + src_v256u32[3] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) ); } if (SWAP_RB) @@ -577,15 +681,15 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re if (IS_UNALIGNED) { - _mm256_storeu_si256( (v256u8 *)(dst + 
(i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); - _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); - _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); } else { - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); - _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); } } @@ -603,7 +707,7 @@ size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec2 size_t i = 0; - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); @@ -631,7 +735,7 @@ size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec2 size_t i = 0; - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i)); @@ -657,7 +761,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, { if (SWAP_RB) { - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { const v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i)); const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ); @@ -679,7 +783,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, } else if (intensity < 0.001f) { - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { if (IS_UNALIGNED) { @@ -695,7 +799,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, { const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - for (; i < pixCountVec256; i+=16) + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) { v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i)); v256u16 tempDst = (SWAP_RB) ? 
_mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256; @@ -734,7 +838,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, { if (SWAP_RB) { - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { const v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i)); const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); @@ -756,7 +860,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, } else if (intensity < 0.001f) { - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -772,21 +876,19 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, { const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - for (; i < pixCountVec256; i+=8) + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) { v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i)); v256u32 tempDst = (SWAP_RB) ? 
_mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v256; - v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x000000FF) ); - v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) ); - v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempDst, 16), _mm256_set1_epi32(0x000000FF) ); - v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) ); + v256u16 rb = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x00FF00FF) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) ); + v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) ); - r = _mm256_mulhi_epu16(r, intensity_v256); - g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 ); - b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 ); + rb = _mm256_mulhi_epu16(rb, intensity_v256); + g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 ); - tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a); + tempDst = _mm256_or_si256( _mm256_or_si256(rb, g), a); if (IS_UNALIGNED) { @@ -1045,9 +1147,15 @@ size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 * template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 
&srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h index 1dc6a1ed6..d32989e5d 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -25,7 +25,9 @@ #else template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.cpp new file mode 100644 index 000000000..ea31fcfc2 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.cpp @@ -0,0 +1,1145 @@ +/* + Copyright (C) 2016-2019 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it 
under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "colorspacehandler_AVX512.h" + +#ifndef ENABLE_AVX512_1 + #error This code requires AVX-512 Tier-1 support. +#else + +#include <string.h> +#include <immintrin.h> + +template <bool SWAP_RB> +FORCEINLINE void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ); + const v512u16 b = (SWAP_RB) ?
_mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 6), _mm512_set1_epi16(0xF800)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0707)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + ba = _mm512_or_si512(ba, srcAlphaBits); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 6), _mm512_set1_epi16(0xF800)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0707)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 4), _mm512_set1_epi16(0x3E00)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0101)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + ba = _mm512_or_si512(ba, srcAlphaBits); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 4), _mm512_set1_epi16(0x3E00)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0101)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + const v512u16 srcAlphaBits16 = _mm512_set1_epi16(0xFF00); + ColorspaceConvert555To8888_AVX512(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + const v512u16 srcAlphaBits16 = _mm512_set1_epi16(0x1F00); + ColorspaceConvert555To6665_AVX512(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v512u32 rgb = _mm512_and_si512( _mm512_srli_epi32(src, 2), _mm512_set1_epi32(0x003F3F3F) ); + const v512u32 a = _mm512_and_si512( _mm512_srli_epi32(src, 3), _mm512_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = _mm512_shuffle_epi8( rgb, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 
43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm512_or_si512(rgb, a); +} + +template +FORCEINLINE v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v512u32 rgb = _mm512_or_si512( _mm512_and_si512(_mm512_slli_epi32(src, 2), _mm512_set1_epi32(0x00FCFCFC)), _mm512_and_si512(_mm512_srli_epi32(src, 4), _mm512_set1_epi32(0x00030303)) ); + const v512u32 a = _mm512_or_si512( _mm512_and_si512(_mm512_slli_epi32(src, 3), _mm512_set1_epi32(0xF8000000)), _mm512_and_si512(_mm512_srli_epi32(src, 2), _mm512_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { + rgb = _mm512_shuffle_epi8( rgb, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm512_or_si512(rgb, a); +} + +template +FORCEINLINE v512u16 _ConvertColorBaseTo5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v512u32 rgbLo; + v512u32 rgbHi; + v512u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm512_and_si512(_mm512_srli_epi32(srcLo, 17), _mm512_set1_epi32(0x0000001F)); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 4), _mm512_set1_epi32(0x000003E0)) ); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 9), _mm512_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm512_and_si512(_mm512_srli_epi32(srcHi, 17), _mm512_set1_epi32(0x0000001F)); + rgbHi = _mm512_or_si512(rgbHi, 
_mm512_and_si512(_mm512_srli_epi32(srcHi, 4), _mm512_set1_epi32(0x000003E0)) ); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 9), _mm512_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm512_and_si512(_mm512_srli_epi32(srcLo, 1), _mm512_set1_epi32(0x0000001F)); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 4), _mm512_set1_epi32(0x000003E0)) ); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 7), _mm512_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm512_and_si512(_mm512_srli_epi32(srcHi, 1), _mm512_set1_epi32(0x0000001F)); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 4), _mm512_set1_epi32(0x000003E0)) ); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 7), _mm512_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm512_packus_epi32( _mm512_and_si512(_mm512_srli_epi32(srcLo, 24), _mm512_set1_epi32(0x0000001F)), _mm512_and_si512(_mm512_srli_epi32(srcHi, 24), _mm512_set1_epi32(0x0000001F)) ); + alpha = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), alpha); + alpha = _mm512_maskz_set1_epi16(_mm512_cmpgt_epi16_mask(alpha, _mm512_setzero_si512()), 0x8000); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm512_and_si512(_mm512_srli_epi32(srcLo, 19), _mm512_set1_epi32(0x0000001F)); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 6), _mm512_set1_epi32(0x000003E0)) ); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_slli_epi32(srcLo, 7), _mm512_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm512_and_si512(_mm512_srli_epi32(srcHi, 19), _mm512_set1_epi32(0x0000001F)); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 6), _mm512_set1_epi32(0x000003E0)) ); + rgbHi = _mm512_or_si512(rgbHi, 
_mm512_and_si512(_mm512_slli_epi32(srcHi, 7), _mm512_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm512_and_si512(_mm512_srli_epi32(srcLo, 3), _mm512_set1_epi32(0x0000001F)); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 6), _mm512_set1_epi32(0x000003E0)) ); + rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo, 9), _mm512_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm512_and_si512(_mm512_srli_epi32(srcHi, 3), _mm512_set1_epi32(0x0000001F)); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 6), _mm512_set1_epi32(0x000003E0)) ); + rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi, 9), _mm512_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm512_packus_epi32( _mm512_srli_epi32(srcLo, 24), _mm512_srli_epi32(srcHi, 24) ); + alpha = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), alpha); + alpha = _mm512_maskz_set1_epi16(_mm512_cmpgt_epi16_mask(alpha, _mm512_setzero_si512()), 0x8000); + } + + return _mm512_or_si512( _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), _mm512_packus_epi32(rgbLo, rgbHi)), alpha ); +} + +template +FORCEINLINE v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX512(srcLo, srcHi); +} + +template +FORCEINLINE v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX512(srcLo, srcHi); +} + +template +FORCEINLINE v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src) +{ + if (SWAP_RB) + { + return _mm512_or_si512( _mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm512_set1_epi32(0xFF000000) ); + } + + 
return _mm512_or_si512(src, _mm512_set1_epi32(0xFF000000)); +} + +template +FORCEINLINE v512u16 ColorspaceCopy16_AVX512(const v512u16 &src) +{ + if (SWAP_RB) + { + return _mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(src, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(src, _mm512_set1_epi16(0x8000)) ); + } + + return src; +} + +template +FORCEINLINE v512u32 ColorspaceCopy32_AVX512(const v512u32 &src) +{ + if (SWAP_RB) + { + return _mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + } + + return src; +} + +template +FORCEINLINE v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity) +{ + v512u16 tempSrc = (SWAP_RB) ? 
_mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(src, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(src, _mm512_set1_epi16(0x8000)) ) : src; + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm512_and_si512(tempSrc, _mm512_set1_epi16(0x8000)); + } + + v512u16 r = _mm512_and_si512( tempSrc, _mm512_set1_epi16(0x001F) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi16(tempSrc, 5), _mm512_set1_epi16(0x001F) ); + v512u16 b = _mm512_and_si512( _mm512_srli_epi16(tempSrc, 10), _mm512_set1_epi16(0x001F) ); + v512u16 a = _mm512_and_si512( tempSrc, _mm512_set1_epi16(0x8000) ); + + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + r = _mm512_mulhi_epu16(r, intensity_v512); + g = _mm512_slli_epi16( _mm512_mulhi_epu16(g, intensity_v512), 5 ); + b = _mm512_slli_epi16( _mm512_mulhi_epu16(b, intensity_v512), 10 ); + + return _mm512_or_si512( _mm512_or_si512( _mm512_or_si512(r, g), b), a); +} + +template +FORCEINLINE v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity) +{ + v512u32 tempSrc = (SWAP_RB) ? 
_mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src; + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm512_and_si512(tempSrc, _mm512_set1_epi32(0xFF000000)); + } + + v512u16 rb = _mm512_and_si512( tempSrc, _mm512_set1_epi32(0x00FF00FF) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi32(tempSrc, 8), _mm512_set1_epi32(0x000000FF) ); + v512u32 a = _mm512_and_si512( tempSrc, _mm512_set1_epi32(0xFF000000) ); + + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = _mm512_mulhi_epu16(rb, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16( g, intensity_v512), 8 ); + + return _mm512_or_si512( _mm512_or_si512(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AVX512(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + v512u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_AVX512(src_vec512, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AVX512(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + v512u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_AVX512(src_vec512, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert8888To6665_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), 
ColorspaceConvert8888To6665_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert6665To8888_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), ColorspaceConvert6665To8888_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AVX512(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), ColorspaceConvert8888To5551_AVX512(_mm512_loadu_si512((v512u32 *)(src+i)), _mm512_loadu_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), ColorspaceConvert8888To5551_AVX512(_mm512_load_si512((v512u32 *)(src+i)), _mm512_load_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AVX512(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), ColorspaceConvert6665To5551_AVX512(_mm512_loadu_si512((v512u32 *)(src+i)), _mm512_loadu_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), ColorspaceConvert6665To5551_AVX512(_mm512_load_si512((v512u32 *)(src+i)), _mm512_load_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t 
ColorspaceConvertBuffer888XTo8888Opaque_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_AVX512(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + v512u16 src_v512u16[2]; + v512u32 src_v512u32[4]; + + for (; i < pixCountVec512; i+=((sizeof(v512u16)/sizeof(u16)) * 2)) + { + if (IS_UNALIGNED) + { + src_v512u16[0] = _mm512_loadu_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 0)) ); + src_v512u16[1] = _mm512_loadu_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 1)) ); + } + else + { + src_v512u16[0] = _mm512_load_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 0)) ); + src_v512u16[1] = _mm512_load_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 1)) ); + } + + v512u16 rb = _mm512_and_si512( _mm512_or_si512(_mm512_slli_epi16(src_v512u16[0], 11), _mm512_srli_epi16(src_v512u16[0], 7)), _mm512_set1_epi16(0xF8F8) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi16(src_v512u16[0], 2), _mm512_set1_epi16(0x00F8) ); + rb = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), rb); + g = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), g); + src_v512u32[0] = _mm512_unpacklo_epi16(rb, g); + src_v512u32[1] = _mm512_unpackhi_epi16(rb, g); + + rb = _mm512_and_si512( _mm512_or_si512(_mm512_slli_epi16(src_v512u16[1], 11), _mm512_srli_epi16(src_v512u16[1], 7)), _mm512_set1_epi16(0xF8F8) ); + g = _mm512_and_si512( _mm512_srli_epi16(src_v512u16[1], 2), _mm512_set1_epi16(0x00F8) ); + rb = 
_mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), rb); + g = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), g); + src_v512u32[2] = _mm512_unpacklo_epi16(rb, g); + src_v512u32[3] = _mm512_unpackhi_epi16(rb, g); + + src_v512u32[0] = _mm512_or_si512( src_v512u32[0], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[0], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[1] = _mm512_or_si512( src_v512u32[1], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[1], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[2] = _mm512_or_si512( src_v512u32[2], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[2], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[3] = _mm512_or_si512( src_v512u32[3], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[3], 5), _mm512_set1_epi32(0x00070707)) ); + +#ifdef ENABLE_AVX512_2 // The vpermb instruction requires AVX512VBMI. + if (SWAP_RB) + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 
15,11, 7, 3), src_v512u32[3] ); + } + else + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } +#else + if (SWAP_RB) + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 
22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + } + else + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. 
+ src_v512u32[0] = _mm512_permutexvar_epi32( _mm512_set_epi32(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi32( _mm512_set_epi32( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi32( _mm512_set_epi32( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi32( _mm512_set_epi32(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3), src_v512u32[3] ); +#endif + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + else + { + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AVX512(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + v512u32 src_v512u32[4]; + + for (; i < pixCountVec512; i+=((sizeof(v512u32)/sizeof(u32)) * 4)) + { + if (IS_UNALIGNED) + { + src_v512u32[0] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 0)) ); + src_v512u32[1] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 1)) ); + src_v512u32[2] = _mm512_loadu_si512( (v512u32 *)(src + i + 
((sizeof(v512u32)/sizeof(u32)) * 2)) ); + src_v512u32[3] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 3)) ); + } + else + { + src_v512u32[0] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 0)) ); + src_v512u32[1] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 1)) ); + src_v512u32[2] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 2)) ); + src_v512u32[3] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 3)) ); + } + +#ifdef ENABLE_AVX512_2 // The vpermb instruction requires AVX512VBMI. + if (SWAP_RB) + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } + else + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 
30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } +#else + if (SWAP_RB) + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 
31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + } + else + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. 
+ src_v512u32[0] = _mm512_permutexvar_epi32( _mm512_set_epi32(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi32( _mm512_set_epi32( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi32( _mm512_set_epi32( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi32( _mm512_set_epi32(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3), src_v512u32[3] ); +#endif + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + else + { + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_AVX512(const u16 *src, u16 *dst, size_t pixCountVec512) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec512 * sizeof(u16)); + return pixCountVec512; + } + + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u16 *)(dst+i), ColorspaceCopy16_AVX512(src_vec512)); + } + else + { + _mm512_store_si512((v512u16 *)(dst+i), ColorspaceCopy16_AVX512(src_vec512)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec512 * sizeof(u32)); + return pixCountVec512; + } + + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + v512u32 src_vec512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u32 *)(src+i)) : _mm512_load_si512((v512u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i), ColorspaceCopy32_AVX512(src_vec512)); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i), ColorspaceCopy32_AVX512(src_vec512)); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_AVX512(u16 *dst, size_t pixCountVec512, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + const v512u16 dst_v512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(dst+i)) : _mm512_load_si512((v512u16 *)(dst+i)); + const v512u16 tempDst = _mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(dst_v512, _mm512_set1_epi16(0x8000)) ); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec512; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), _mm512_and_si512(_mm512_loadu_si512((v512u16 *)(dst+i)), _mm512_set1_epi16(0x8000)) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), _mm512_and_si512(_mm512_load_si512((v512u16 *)(dst+i)), _mm512_set1_epi16(0x8000)) ); + } + } + } + else + { + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 dst_v512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u16 *)(dst+i)) : _mm512_load_si512((v512u16 *)(dst+i)); + v512u16 tempDst = (SWAP_RB) ? 
_mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(dst_v512, _mm512_set1_epi16(0x8000)) ) : dst_v512; + + v512u16 r = _mm512_and_si512( tempDst, _mm512_set1_epi16(0x001F) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi16(tempDst, 5), _mm512_set1_epi16(0x001F) ); + v512u16 b = _mm512_and_si512( _mm512_srli_epi16(tempDst, 10), _mm512_set1_epi16(0x001F) ); + v512u16 a = _mm512_and_si512( tempDst, _mm512_set1_epi16(0x8000) ); + + r = _mm512_mulhi_epu16(r, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16(g, intensity_v512), 5 ); + b = _mm512_slli_epi32( _mm512_mulhi_epu16(b, intensity_v512), 10 ); + + tempDst = _mm512_or_si512( _mm512_or_si512( _mm512_or_si512(r, g), b), a); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u16 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512((v512u16 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_AVX512(u32 *dst, size_t pixCountVec512, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + const v512u32 dst_v512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u32 *)(dst+i)) : _mm512_load_si512((v512u32 *)(dst+i)); + const v512u32 tempDst = _mm512_shuffle_epi8(dst_v512, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec512; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), _mm512_and_si512(_mm512_loadu_si512((v512u32 *)(dst+i)), _mm512_set1_epi32(0xFF000000)) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), _mm512_and_si512(_mm512_load_si512((v512u32 *)(dst+i)), _mm512_set1_epi32(0xFF000000)) ); + } + } + } + else + { + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + v512u32 dst_v512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u32 *)(dst+i)) : _mm512_load_si512((v512u32 *)(dst+i)); + v512u32 tempDst = (SWAP_RB) ? 
_mm512_shuffle_epi8(dst_v512, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v512; + + v512u16 rb = _mm512_and_si512( tempDst, _mm512_set1_epi32(0x00FF00FF) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi32(tempDst, 8), _mm512_set1_epi32(0x000000FF) ); + v512u32 a = _mm512_and_si512( tempDst, _mm512_set1_epi32(0xFF000000) ); + + rb = _mm512_mulhi_epu16(rb, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16( g, intensity_v512), 8 ); + + tempDst = _mm512_or_si512( _mm512_or_si512(rb, g), a); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i), tempDst); + } + } + } + + return i; +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512<true, false>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512<false, true>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512<true, true>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512<false, false>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512<true, false>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512<false, true>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512<true, true>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512<false, false>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512<true, false>(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return
ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, 
pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, 
u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) 
const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 
&dstHi); +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); + +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); + +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); + +template v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); + +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); + +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); + +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); + +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); + +template v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity); +template v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity); + +#endif // ENABLE_AVX512_1 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.h 
new file mode 100644 index 000000000..b46b6d684 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX512.h @@ -0,0 +1,114 @@ +/* + Copyright (C) 2016-2019 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_AVX512_H +#define COLORSPACEHANDLER_AVX512_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_AVX512_1 + #warning This header requires AVX-512 Tier-1 support. +#else + +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 
ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); + +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); + +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); +template v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity); + +class ColorspaceHandler_AVX512 : public ColorspaceHandler +{ +public: + ColorspaceHandler_AVX512() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + 
size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 
*__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_AVX512_1 + +#endif // COLORSPACEHANDLER_AVX512_H diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp index a736646a8..c5bb48fb1 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp +++ 
b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2017 DeSmuME team + Copyright (C) 2016-2019 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,6 +38,21 @@ FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, con dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); } +template +FORCEINLINE void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? 
((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); +} + template FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) { @@ -53,6 +68,21 @@ FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, con dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); } +template +FORCEINLINE void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); + dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); + dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? 
((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); +} + template FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) { @@ -513,9 +543,15 @@ size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h index 684338e7d..8fffc72e2 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -25,7 +25,9 @@ #else template void 
ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp index dc1d65056..5e66311ed 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2017 DeSmuME team + Copyright (C) 2016-2019 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,28 +38,62 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) -#ifdef ENABLE_SSSE3 - v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 11), _mm_srli_epi16(srcColor, 7)), _mm_set1_epi16(0xF8F8) ); - v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8)), srcAlphaBits); + if (SWAP_RB) + { + v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), 
_mm_set1_epi16(0x0707))); + + v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) ); + ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5)); + ga = _mm_or_si128(ga, srcAlphaBits); + + dstLo = _mm_unpacklo_epi8(rb, ga); + dstHi = _mm_unpackhi_epi8(rb, ga); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) ); + + v128u16 ba = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); + ba = _mm_or_si128(ba, _mm_srli_epi16(ba, 5)); + ba = _mm_or_si128(ba, srcAlphaBits); + + dstLo = _mm_unpacklo_epi16(rg, ba); + dstHi = _mm_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - dstLo = _mm_unpacklo_epi16(rb, ga); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); - dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); - - dstHi = _mm_unpackhi_epi16(rb, ga); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); - dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); -#else - v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); - v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800) ); - v128u16 b = (SWAP_RB) ? 
_mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); - - dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); - - dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); -#endif + if (SWAP_RB) + { + v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707))); + + v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) ); + g = _mm_or_si128(g, _mm_srli_epi16(g, 5)); + + dstLo = _mm_unpacklo_epi8(rb, g); + dstHi = _mm_unpackhi_epi8(rb, g); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) ); + + v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); + b = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + + dstLo = _mm_unpacklo_epi16(rg, b); + dstHi = _mm_unpackhi_epi16(rg, b); + } } template @@ -68,28 +102,63 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) -#ifdef ENABLE_SSSE3 - v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); - v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E)), srcAlphaBits); + if (SWAP_RB) + { + v128u16 rb = 
_mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101))); + + v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) ); + ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5)); + ga = _mm_or_si128(ga, srcAlphaBits); + + dstLo = _mm_unpacklo_epi8(rb, ga); + dstHi = _mm_unpackhi_epi8(rb, ga); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); + const v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); + + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) ); + + v128u16 ba = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + ba = _mm_or_si128(ba, srcAlphaBits); + + dstLo = _mm_unpacklo_epi16(rg, ba); + dstHi = _mm_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - dstLo = _mm_unpacklo_epi16(rb, ga); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); - dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); - - dstHi = _mm_unpackhi_epi16(rb, ga); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); - dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); -#else - v128u16 r = (SWAP_RB) ? 
_mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); - v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00) ); - v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); - - dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); - - dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); -#endif + if (SWAP_RB) + { + v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101))); + + v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) ); + g = _mm_or_si128(g, _mm_srli_epi16(g, 5)); + + dstLo = _mm_unpacklo_epi8(rb, g); + dstHi = _mm_unpackhi_epi8(rb, g); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) ); + + v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); + b = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + + dstLo = _mm_unpacklo_epi16(rg, b); + dstHi = _mm_unpackhi_epi16(rg, b); + } } template @@ -332,18 +401,16 @@ FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float in return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000)); } - v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi32(0x000000FF) ); - v128u16 g = 
_mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) ); - v128u16 b = _mm_and_si128( _mm_srli_epi32(tempSrc, 16), _mm_set1_epi32(0x000000FF) ); - v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) ); + v128u16 rb = _mm_and_si128( tempSrc, _mm_set1_epi32(0x00FF00FF) ); + v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) ); + v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) ); const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - r = _mm_mulhi_epu16(r, intensity_v128); - g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 ); - b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 ); + rb = _mm_mulhi_epu16(rb, intensity_v128); + g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 ); - return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a); + return _mm_or_si128( _mm_or_si128(rb, g), a); } template @@ -351,7 +418,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict { size_t i = 0; - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); v128u32 dstConvertedLo, dstConvertedHi; @@ -359,13 +426,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict if (IS_UNALIGNED) { - _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); - _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); } else { - _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); - _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); } } @@ -377,7 +444,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3 { size_t i = 0; - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); v128u32 dstConvertedLo, dstConvertedHi; @@ -385,13 +452,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3 if (IS_UNALIGNED) { - _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); - _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); } else { - _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); - _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); } } @@ -403,7 +470,7 @@ size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t p { size_t i = 0; - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -423,7 +490,7 @@ size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t p { size_t i = 0; - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -443,7 +510,7 @@ size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__ { size_t i = 0; - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { if (IS_UNALIGNED) { @@ -463,7 +530,7 @@ size_t ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__ { size_t i = 0; - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { if (IS_UNALIGNED) { @@ -483,7 +550,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si { size_t i = 0; - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; 
i+=(sizeof(v128u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -507,17 +574,17 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r v128u16 src_v128u16[2]; v128u32 src_v128u32[4]; - for (; i < pixCountVec128; i+=16) + for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2)) { if (IS_UNALIGNED) { - src_v128u16[0] = _mm_loadu_si128((v128u16 *)(src + i + 0)); - src_v128u16[1] = _mm_loadu_si128((v128u16 *)(src + i + 8)); + src_v128u16[0] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) ); + src_v128u16[1] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) ); } else { - src_v128u16[0] = _mm_load_si128((v128u16 *)(src + i + 0)); - src_v128u16[1] = _mm_load_si128((v128u16 *)(src + i + 8)); + src_v128u16[0] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) ); + src_v128u16[1] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) ); } v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) ); @@ -553,28 +620,28 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r #ifdef ENABLE_SSE4_1 if (IS_UNALIGNED) { - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); } else { - 
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); } #else if (IS_UNALIGNED) { - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 
0xFFFFFFFF))) ); } else { - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); } #endif } @@ -588,21 +655,21 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r size_t i = 0; v128u32 src_v128u32[4]; - for (; i < pixCountVec128; i+=16) + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) { if (IS_UNALIGNED) { - src_v128u32[0] = _mm_loadu_si128((v128u32 *)(src + i + 0)); - src_v128u32[1] = _mm_loadu_si128((v128u32 *)(src + i + 4)); - src_v128u32[2] = _mm_loadu_si128((v128u32 *)(src + i + 8)); - src_v128u32[3] = _mm_loadu_si128((v128u32 *)(src + i + 12)); + src_v128u32[0] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) ); + src_v128u32[1] = _mm_loadu_si128( (v128u32 *)(src + i + 
((sizeof(v128u32)/sizeof(u32)) * 1)) ); + src_v128u32[2] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) ); + src_v128u32[3] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) ); } else { - src_v128u32[0] = _mm_load_si128((v128u32 *)(src + i + 0)); - src_v128u32[1] = _mm_load_si128((v128u32 *)(src + i + 4)); - src_v128u32[2] = _mm_load_si128((v128u32 *)(src + i + 8)); - src_v128u32[3] = _mm_load_si128((v128u32 *)(src + i + 12)); + src_v128u32[0] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) ); + src_v128u32[1] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) ); + src_v128u32[2] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) ); + src_v128u32[3] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) ); } if (SWAP_RB) @@ -623,28 +690,28 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r #ifdef ENABLE_SSE4_1 if (IS_UNALIGNED) { - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); } else { - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); - _mm_store_si128( (v128u8 
*)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); } #else if (IS_UNALIGNED) { - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), 
_mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); } else { - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); - _mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); } #endif } @@ -665,7 +732,7 @@ size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec1 size_t i = 0; - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); @@ -693,7 +760,7 @@ size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec1 size_t i = 0; - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i)); @@ -719,7 +786,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, { if (SWAP_RB) { - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { const v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i)); const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ); @@ -741,7 +808,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, } else if (intensity < 0.001f) { - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { if (IS_UNALIGNED) { @@ -757,7 +824,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, { const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - for (; i < pixCountVec128; i+=8) + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) { v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i)); v128u16 tempDst = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128; @@ -796,7 +863,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, { if (SWAP_RB) { - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { const v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i)); #ifdef ENABLE_SSSE3 @@ -821,7 +888,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, } else if (intensity < 0.001f) { - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { if (IS_UNALIGNED) { @@ -837,7 +904,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, { const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); - for (; i < pixCountVec128; i+=4) + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) { v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i)); #ifdef ENABLE_SSSE3 @@ -846,16 +913,14 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, v128u32 tempDst = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128; #endif - v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi32(0x000000FF) ); - v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) ); - v128u16 b = _mm_and_si128( _mm_srli_epi32(tempDst, 16), _mm_set1_epi32(0x000000FF) ); - v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) ); + v128u16 rb = _mm_and_si128( tempDst, _mm_set1_epi32(0x00FF00FF) ); + v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) ); + v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) ); - r = _mm_mulhi_epu16(r, intensity_v128); - g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 ); - b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 ); + rb = _mm_mulhi_epu16(rb, intensity_v128); + g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 ); - tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a); + tempDst = _mm_or_si128( _mm_or_si128(rb, g), a); if (IS_UNALIGNED) { @@ -1118,9 +1183,15 @@ size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 * template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void 
ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h index ca2aa14ac..007b9f800 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -25,7 +25,9 @@ #else template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); diff --git a/desmume/src/version.cpp b/desmume/src/version.cpp index 0afd68513..84f0cc138 100644 --- a/desmume/src/version.cpp +++ b/desmume/src/version.cpp @@ -60,40 +60,44 @@ #define DESMUME_PLATFORM_STRING "" #endif -#define DESMUME_CPUEXT_PRIMARY_STRING "" -#define DESMUME_CPUEXT_SECONDARY_STRING "" - #if defined(ENABLE_SSE4_2) - #undef 
DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.2" #elif defined(ENABLE_SSE4_1) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.1" #elif defined(ENABLE_SSSE3) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSSE3" #elif defined(ENABLE_SSE3) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSE3" #elif defined(ENABLE_SSE2) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSE2" #elif defined(ENABLE_SSE) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " SSE" #elif defined(ENABLE_ALTIVEC) - #undef DESMUME_CPUEXT_PRIMARY_STRING #define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec" #endif -#if defined(ENABLE_AVX2) - #undef DESMUME_CPUEXT_SECONDARY_STRING +#if defined(ENABLE_AVX512_3) + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-3" +#elif defined(ENABLE_AVX512_2) + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-2" +#elif defined(ENABLE_AVX512_1) + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-1" +#elif defined(ENABLE_AVX512_0) + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-0" +#elif defined(ENABLE_AVX2) #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX2" #elif defined(ENABLE_AVX) - #undef DESMUME_CPUEXT_SECONDARY_STRING #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX" #endif +#ifndef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING "" +#endif + +#ifndef DESMUME_CPUEXT_SECONDARY_STRING + #define DESMUME_CPUEXT_SECONDARY_STRING "" +#endif + #define DESMUME_CPUEXT_STRING DESMUME_CPUEXT_PRIMARY_STRING DESMUME_CPUEXT_SECONDARY_STRING #ifdef DEVELOPER