From 858b05df79fe63422fd96b99b21a8db9550dee5e Mon Sep 17 00:00:00 2001 From: rogerman Date: Sun, 22 Oct 2017 22:14:05 -0700 Subject: [PATCH] Colorspace Handler: Add new functions for converting 16-bit RGBA5551 and 32-bit RGBA8888 color buffers to 24-bit RGB888. - Also improve the performance of RGB555-to-RGBA6665 and RGB555-to-RGBA8888 conversions. --- desmume/src/texcache.cpp | 19 +- .../colorspacehandler/colorspacehandler.cpp | 174 ++++++++++++ .../colorspacehandler/colorspacehandler.h | 39 +++ .../colorspacehandler_AVX2.cpp | 236 ++++++++++++--- .../colorspacehandler_AVX2.h | 14 +- .../colorspacehandler_AltiVec.cpp | 139 +++++++-- .../colorspacehandler_AltiVec.h | 10 +- .../colorspacehandler_SSE2.cpp | 268 ++++++++++++++---- .../colorspacehandler_SSE2.h | 16 +- 9 files changed, 780 insertions(+), 135 deletions(-) diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 14ca0e34c..b449a250c 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -1114,7 +1114,6 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - __m128i tmpAlpha[2]; __m128i convertedColor[4]; if (TEXCACHEFORMAT == TexFormat_15bpp) @@ -1123,13 +1122,8 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], 
convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665_SSE2(palColor0, alphaLo, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665_SSE2(palColor1, alphaHi, convertedColor[2], convertedColor[3]); } else { @@ -1137,13 +1131,8 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888_SSE2(palColor0, alphaLo, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888_SSE2(palColor1, alphaHi, convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index c4720d5c1..58c4512a9 100755 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -483,6 +483,102 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi } } +template +void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif 
defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555XTo888_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555XTo888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555XTo888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } +} + +template +void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo888_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } +} + template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount) { @@ -992,6 +1088,74 @@ size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); } +size_t ColorspaceHandler::ConvertBuffer555XTo888(const u16 
*__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer555XTo888(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer555XTo888_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo888(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo888_SwapRB(src, dst, pixCount); +} + size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const { size_t i = 0; @@ -1231,6 +1395,16 @@ template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *sr template void 
ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); + template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h index d57cd8ff2..1cf935496 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -238,6 +238,32 @@ FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor) return ColorspaceConvert888XTo8888Opaque(srcColorComponent); } +template +FORCEINLINE void ColorspaceConvert888XTo888(FragmentColor srcColor, u8 *dst) +{ + dst[0] = (SWAP_RB) ? srcColor.b : srcColor.r; + dst[1] = srcColor.g; + dst[2] = (SWAP_RB) ? 
srcColor.r : srcColor.b; +} + +template +FORCEINLINE void ColorspaceConvert888XTo888(u32 srcColor, u8 *dst) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + ColorspaceConvert888XTo888(srcColorComponent, dst); +} + +template +FORCEINLINE void ColorspaceConvert555XTo888(u16 srcColor, u8 *dst) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = ColorspaceConvert555To8888Opaque(srcColor); + + ColorspaceConvert888XTo888(srcColorComponent, dst); +} + template FORCEINLINE u16 ColorspaceCopy16(u16 srcColor) { @@ -332,6 +358,9 @@ template void ColorspaceConvertBuffer8888To5551 template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); + template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); @@ -378,6 +407,16 @@ public: size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t 
pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp index ccc342c67..308ce7da2 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -25,61 +25,59 @@ #include template -FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi) { - v256u32 src32; - // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) ); - dstLo = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); - dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) ); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); - dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); - src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); - dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); - dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) ); - dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); + v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 11), _mm256_srli_epi16(srcColor, 7)), _mm256_set1_epi16(0xF8F8) ); + v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rb, ga); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); + dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); + + dstHi = _mm256_unpackhi_epi16(rb, ga); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) ); - dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); + dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ?
_mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); } template -FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi) { - v256u32 src32; - // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) ); - dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); - dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) ); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); - dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); - dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); - src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); - dstHi = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); - dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) ); - dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); + v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) ); + v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E)), srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rb, ga); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); + dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); + + dstHi = _mm256_unpackhi_epi16(rb, ga); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) ); - dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); + dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ?
_mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); } template FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) { - const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0xFF000000); - ColorspaceConvert555To8888_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v256u16 srcAlphaBits16 = _mm256_set1_epi16(0xFF00); + ColorspaceConvert555To8888_AVX2(srcColor, srcAlphaBits16, dstLo, dstHi); } template FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) { - const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0x1F000000); - ColorspaceConvert555To6665_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v256u16 srcAlphaBits16 = _mm256_set1_epi16(0x1F00); + ColorspaceConvert555To6665_AVX2(srcColor, srcAlphaBits16, dstLo, dstHi); } template @@ -456,6 +454,132 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si return i; } +template +size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + v256u16 src_v256u16[2]; + v256u32 src_v256u32[4]; + + for (; i < pixCountVec256; i+=32) + { + if (IS_UNALIGNED) + { + src_v256u16[0] = _mm256_loadu_si256((v256u16 *)(src + i + 0)); + src_v256u16[1] = _mm256_loadu_si256((v256u16 *)(src + i + 16)); + } + else + { + src_v256u16[0] = _mm256_load_si256((v256u16 *)(src + i + 0)); + src_v256u16[1] = _mm256_load_si256((v256u16 *)(src + i + 16)); + } + + v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) ); + v256u16 g = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(src_v256u16[0], 2), _mm256_set1_epi16(0x00F8)),
srcAlphaBits); + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + src_v256u32[0] = _mm256_unpacklo_epi16(rb, g); + src_v256u32[1] = _mm256_unpackhi_epi16(rb, g); + + rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(src_v256u16[1], 11), _mm256_srli_epi16(src_v256u16[1], 7)), _mm256_set1_epi16(0xF8F8) ); + g = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(src_v256u16[1], 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits); + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + src_v256u32[2] = _mm256_unpacklo_epi16(rb, g); + src_v256u32[3] = _mm256_unpackhi_epi16(rb, g); + + src_v256u32[0] = _mm256_or_si256( src_v256u32[0], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[0], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[1] = _mm256_or_si256( src_v256u32[1], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[1], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[2] = _mm256_or_si256( src_v256u32[2], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[2], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[3] = _mm256_or_si256( src_v256u32[3], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[3], 5), _mm256_set1_epi32(0x00070707)) ); + + if (SWAP_RB) + { + src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9) ); + src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22) ); + src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3) ); + } + else + { + src_v256u32[0] = 
_mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8) ); + src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22) ); + src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3) ); + } + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256( src_v256u32[3], _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + } + else + { + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 
0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256( src_v256u32[3], _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + v256u32 src_v256u32[4]; + + for (; i < pixCountVec256; i+=32) + { + if (IS_UNALIGNED) + { + src_v256u32[0] = _mm256_loadu_si256((v256u32 *)(src + i + 0)); + src_v256u32[1] = _mm256_loadu_si256((v256u32 *)(src + i + 8)); + src_v256u32[2] = _mm256_loadu_si256((v256u32 *)(src + i + 16)); + src_v256u32[3] = _mm256_loadu_si256((v256u32 *)(src + i + 24)); + } + else + { + src_v256u32[0] = _mm256_load_si256((v256u32 *)(src + i + 0)); + src_v256u32[1] = _mm256_load_si256((v256u32 *)(src + i + 8)); + src_v256u32[2] = _mm256_load_si256((v256u32 *)(src + i + 16)); + src_v256u32[3] = _mm256_load_si256((v256u32 *)(src + i + 24)); + } + + if (SWAP_RB) + { + src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8)); + src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21)); + src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3)); + } + else + { + src_v256u32[0] = 
_mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10)); + src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21)); + src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3)); + } + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + } + else + { + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + } + } + + return i; +} + template size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256) { @@ -806,6 +930,46 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned( return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); } +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t 
pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const { return ColorspaceCopyBuffer16_AVX2(src, dst, pixCount); @@ -866,11 +1030,11 @@ size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 * return ColorspaceApplyIntensityToBuffer32_AVX2(dst, pixCount, intensity); } -template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); -template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); -template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const 
v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); -template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h index a83b27271..1dc6a1ed6 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -24,8 +24,8 @@ #warning This header requires AVX2 support.
#else -template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); -template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); @@ -80,6 +80,16 @@ public: size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 
*__restrict dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp index 98c6f99dd..a736646a8 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -24,57 +24,47 @@ #include template -FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) { // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + dstLo = vec_unpackl((vector pixel)srcColor); dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); - dstLo = vec_sel(dstLo, srcAlphaBits32Lo, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); + dstLo = vec_perm(dstLo, srcAlphaBits, (SWAP_RB) ? 
((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); dstHi = vec_unpackh((vector pixel)srcColor); dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); - dstHi = vec_sel(dstHi, srcAlphaBits32Hi, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); - - if (SWAP_RB) - { - dstLo = vec_perm(dstLo, dstLo, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13})); - dstHi = vec_perm(dstHi, dstHi, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13})); - } + dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); } template -FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) { // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + dstLo = vec_unpackl((vector pixel)srcColor); dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); - dstLo = vec_sel(dstLo, srcAlphaBits32Lo, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); + dstLo = vec_perm(dstLo, srcAlphaBits, (SWAP_RB) ? 
((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); dstHi = vec_unpackh((vector pixel)srcColor); dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); - dstHi = vec_sel(dstHi, srcAlphaBits32Hi, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); - - if (SWAP_RB) - { - dstLo = vec_perm(dstLo, dstLo, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13})); - dstHi = vec_perm(dstHi, dstHi, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13})); - } + dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); } template FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) { - const v128u32 srcAlphaBits32 = {0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}; - ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v128u16 srcAlphaBits16 = {0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00}; + ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); } template FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) { - const v128u32 srcAlphaBits32 = {0x1F000000, 0x1F000000, 0x1F000000, 0x1F000000}; - ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v128u16 srcAlphaBits16 = {0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00}; + ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); } template @@ -305,6 +295,83 @@ size_t 
ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst, return i; } +template +size_t ColorspaceConvertBuffer555XTo888_AltiVec(const u16 *src, u8 *dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 src_v128u16[2]; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + src_v128u16[0] = vec_ld( 0, src+i); + src_v128u16[1] = vec_ld(16, src+i); + + src_v128u32[0] = vec_unpackl((vector pixel)src_v128u16[0]); + src_v128u32[1] = vec_unpackh((vector pixel)src_v128u16[0]); + src_v128u32[2] = vec_unpackl((vector pixel)src_v128u16[1]); + src_v128u32[3] = vec_unpackh((vector pixel)src_v128u16[1]); + + src_v128u32[0] = vec_or( vec_sl((v128u8)src_v128u32[0], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[0], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[1] = vec_or( vec_sl((v128u8)src_v128u32[1], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[1], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[2] = vec_or( vec_sl((v128u8)src_v128u32[2], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[2], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[3] = vec_or( vec_sl((v128u8)src_v128u32[3], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[3], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + + if (SWAP_RB) + { + src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) ); + src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16}) ); + src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) ); + } + else + { + src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 
0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) ); + src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) ); + src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) ); + } + + vec_st( src_v128u32[0], 0, dst + (i * 3) ); + vec_st( src_v128u32[1], 16, dst + (i * 3) ); + vec_st( src_v128u32[2], 32, dst + (i * 3) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AltiVec(const u32 *src, u8 *dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + src_v128u32[0] = vec_ld( 0, src+i); + src_v128u32[1] = vec_ld(16, src+i); + src_v128u32[2] = vec_ld(32, src+i); + src_v128u32[3] = vec_ld(48, src+i); + + if (SWAP_RB) + { + src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) ); + src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16}) ); + src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) ); + } + else + { + src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) ); + src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) ); + src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) ); + } + + vec_st( src_v128u32[0], 0, dst + (i * 3) ); + vec_st( src_v128u32[1], 16, dst + (i * 3) ); + vec_st( 
src_v128u32[2], 32, dst + (i * 3) ); + } + + return i; +} + template size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128) { @@ -413,6 +480,26 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); } +size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AltiVec(src, dst, pixCount); +} + size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const { return ColorspaceCopyBuffer16_AltiVec(src, dst, pixCount); @@ -423,11 +510,11 @@ size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, return ColorspaceCopyBuffer32_AltiVec(src, dst, pixCount); } -template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); 
+template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h index 0428c16c5..684338e7d 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -24,8 +24,8 @@ #warning This header requires PowerPC AltiVec support. 
#else -template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); @@ -65,6 +65,12 @@ public: size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp index 9b34e0266..f95c988a6 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp +++ 
b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -33,83 +33,77 @@ #endif template -FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) { - v128u32 src32; - // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) -#ifdef ENABLE_SSE4_1 - src32 = _mm_cvtepu16_epi32(srcColor); -#else - src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); -#endif +#ifdef ENABLE_SSSE3 + v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 11), _mm_srli_epi16(srcColor, 7)), _mm_set1_epi16(0xF8F8) ); + v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8)), srcAlphaBits); - dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstLo = _mm_unpacklo_epi16(rb, ga); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); -#ifdef ENABLE_SSE4_1 - src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) ); -#else - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); -#endif - - dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstHi = _mm_unpackhi_epi16(rb, ga); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); + dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); +#else + v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); + v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800) ); + v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); + + dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); + + dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); +#endif } template -FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) { - v128u32 src32; - // Conversion algorithm: // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) -#ifdef ENABLE_SSE4_1 - src32 = _mm_cvtepu16_epi32(srcColor); -#else - src32 = 
_mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); -#endif +#ifdef ENABLE_SSSE3 + v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); + v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E)), srcAlphaBits); - dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstLo = _mm_unpacklo_epi16(rb, ga); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); -#ifdef ENABLE_SSE4_1 - src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) ); -#else - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); -#endif - - dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstHi = _mm_unpackhi_epi16(rb, ga); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); + dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) ); +#else + v128u16 r = (SWAP_RB) ? 
_mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); + v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00) ); + v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); + + dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); + + dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); +#endif } template FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) { - const v128u32 srcAlphaBits32 = _mm_set1_epi32(0xFF000000); - ColorspaceConvert555To8888_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v128u16 srcAlphaBits16 = _mm_set1_epi16(0xFF00); + ColorspaceConvert555To8888_SSE2(srcColor, srcAlphaBits16, dstLo, dstHi); } template FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) { - const v128u32 srcAlphaBits32 = _mm_set1_epi32(0x1F000000); - ColorspaceConvert555To6665_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); + const v128u16 srcAlphaBits16 = _mm_set1_epi16(0x1F00); + ColorspaceConvert555To6665_SSE2(srcColor, srcAlphaBits16, dstLo, dstHi); } template @@ -504,6 +498,132 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si return i; } +#ifdef ENABLE_SSSE3 + +template +size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 src_v128u16[2]; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + if 
(IS_UNALIGNED) + { + src_v128u16[0] = _mm_loadu_si128((v128u16 *)(src + i + 0)); + src_v128u16[1] = _mm_loadu_si128((v128u16 *)(src + i + 8)); + } + else + { + src_v128u16[0] = _mm_load_si128((v128u16 *)(src + i + 0)); + src_v128u16[1] = _mm_load_si128((v128u16 *)(src + i + 8)); + } + + v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(src_v128u16[0], 2), _mm_set1_epi16(0x00F8) ); + src_v128u32[0] = _mm_unpacklo_epi16(rb, g); + src_v128u32[1] = _mm_unpackhi_epi16(rb, g); + + rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[1], 11), _mm_srli_epi16(src_v128u16[1], 7)), _mm_set1_epi16(0xF8F8) ); + g = _mm_and_si128( _mm_srli_epi16(src_v128u16[1], 2), _mm_set1_epi16(0x00F8) ); + src_v128u32[2] = _mm_unpacklo_epi16(rb, g); + src_v128u32[3] = _mm_unpackhi_epi16(rb, g); + + src_v128u32[0] = _mm_or_si128( src_v128u32[0], _mm_and_si128(_mm_srli_epi32(src_v128u32[0], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[1] = _mm_or_si128( src_v128u32[1], _mm_and_si128(_mm_srli_epi32(src_v128u32[1], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[2] = _mm_or_si128( src_v128u32[2], _mm_and_si128(_mm_srli_epi32(src_v128u32[2], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[3] = _mm_or_si128( src_v128u32[3], _mm_and_si128(_mm_srli_epi32(src_v128u32[3], 5), _mm_set1_epi32(0x00070707)) ); + + if (SWAP_RB) + { + src_v128u32[0] = _mm_shuffle_epi8( src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6) ); + src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9) ); + src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3) ); + } + else + { + src_v128u32[0] = _mm_shuffle_epi8( 
src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6) ); + src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8) ); + src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3) ); + } + + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + if (IS_UNALIGNED) + { + src_v128u32[0] = 
_mm_loadu_si128((v128u32 *)(src + i + 0)); + src_v128u32[1] = _mm_loadu_si128((v128u32 *)(src + i + 4)); + src_v128u32[2] = _mm_loadu_si128((v128u32 *)(src + i + 8)); + src_v128u32[3] = _mm_loadu_si128((v128u32 *)(src + i + 12)); + } + else + { + src_v128u32[0] = _mm_load_si128((v128u32 *)(src + i + 0)); + src_v128u32[1] = _mm_load_si128((v128u32 *)(src + i + 4)); + src_v128u32[2] = _mm_load_si128((v128u32 *)(src + i + 8)); + src_v128u32[3] = _mm_load_si128((v128u32 *)(src + i + 12)); + } + + if (SWAP_RB) + { + src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5)); + src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8)); + src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3)); + } + else + { + src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5)); + src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10)); + src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3)); + } + + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 
0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + } + + return i; +} + +#endif + template size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128) { @@ -861,6 +981,50 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned( return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); } +#ifdef ENABLE_SSSE3 + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, 
pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +#endif + size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const { return ColorspaceCopyBuffer16_SSE2(src, dst, pixCount); @@ -921,11 +1085,11 @@ size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 * return ColorspaceApplyIntensityToBuffer32_SSE2(dst, pixCount, intensity); } -template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 
&srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h index 50b8597d1..ca2aa14ac 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -24,8 +24,8 @@ #warning This header requires SSE2 support. 
#else -template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); -template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); @@ -80,6 +80,18 @@ public: size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; +#ifdef ENABLE_SSSE3 + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const 
u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; +#endif + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;