diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index f8e41636e..bc71c3eb1 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -26,12 +26,10 @@ #include "colorspacehandler_AltiVec.cpp" #endif -#if defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) - #define USEVECTORSIZE_128 -#endif - #if defined(ENABLE_AVX2) #define USEVECTORSIZE_256 +#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) + #define USEVECTORSIZE_128 #endif // By default, the hand-coded vectorized code will be used instead of a compiler's built-in @@ -147,12 +145,12 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); #endif if (SWAP_RB) @@ -195,12 +193,12 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); #endif if (SWAP_RB) @@ -243,12 +241,12 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 4); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 8); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); #endif if (SWAP_RB) @@ -291,12 +289,12 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 4); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 8); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); #endif if (SWAP_RB) @@ -339,12 +337,12 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); #endif if (SWAP_RB) @@ -387,12 +385,12 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric #ifdef USEMANUALVECTORIZATION -#if defined(USEVECTORSIZE_128) - const size_t pixCountVector = pixCount - (pixCount % 8); +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); #elif defined(USEVECTORSIZE_256) const size_t pixCountVector = pixCount - (pixCount % 16); -#elif defined(USEVECTORSIZE_512) - const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); #endif if (SWAP_RB) diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp index 6682bea12..3bbe895d9 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -30,14 +30,14 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) ); dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); - src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); @@ -52,14 +52,14 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) ); dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); - src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp index fb4ada420..31b6ff156 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -27,6 +27,10 @@ #include #endif +#ifdef ENABLE_SSE4_1 +#include +#endif + template FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) { @@ -34,14 +38,25 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + +#ifdef ENABLE_SSE4_1 + src32 = _mm_cvtepu16_epi32(srcColor); +#else src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); +#endif + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); +#ifdef ENABLE_SSE4_1 + src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) ); +#else src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); +#endif + dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); @@ -56,14 +71,25 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + +#ifdef ENABLE_SSE4_1 + src32 = _mm_cvtepu16_epi32(srcColor); +#else src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); +#endif + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); +#ifdef ENABLE_SSE4_1 + src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) ); +#else src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); +#endif + dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );