Colorspace Handler: Add new functions for converting 16-bit RGBA5551 and 32-bit RGBA8888 color buffers to 24-bit RGB888.

- Also improve the performance of RGB555-to-RGBA6665 and RGB555-to-RGBA8888 conversions.
This commit is contained in:
rogerman 2017-10-22 22:14:05 -07:00
parent 2bc3be0711
commit 858b05df79
9 changed files with 780 additions and 135 deletions

View File

@ -1114,7 +1114,6 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co
const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0);
const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1);
__m128i tmpAlpha[2];
__m128i convertedColor[4]; __m128i convertedColor[4];
if (TEXCACHEFORMAT == TexFormat_15bpp) if (TEXCACHEFORMAT == TexFormat_15bpp)
@ -1123,13 +1122,8 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co
const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha);
const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha);
tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); ColorspaceConvert555To6665_SSE2<false>(palColor0, alphaLo, convertedColor[0], convertedColor[1]);
tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); ColorspaceConvert555To6665_SSE2<false>(palColor1, alphaHi, convertedColor[2], convertedColor[3]);
ColorspaceConvert555To6665_SSE2<false>(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]);
tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi);
tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi);
ColorspaceConvert555To6665_SSE2<false>(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]);
} }
else else
{ {
@ -1137,13 +1131,8 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co
const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha);
const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha);
tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); ColorspaceConvert555To8888_SSE2<false>(palColor0, alphaLo, convertedColor[0], convertedColor[1]);
tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); ColorspaceConvert555To8888_SSE2<false>(palColor1, alphaHi, convertedColor[2], convertedColor[3]);
ColorspaceConvert555To8888_SSE2<false>(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]);
tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi);
tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi);
ColorspaceConvert555To8888_SSE2<false>(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]);
} }
_mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]);

View File

@ -483,6 +483,102 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi
} }
} }
template <bool SWAP_RB, bool IS_UNALIGNED>
void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount)
{
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
if (SWAP_RB)
{
if (IS_UNALIGNED)
{
i = csh.ConvertBuffer555XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector);
}
else
{
i = csh.ConvertBuffer555XTo888_SwapRB(src, dst, pixCountVector);
}
}
else
{
if (IS_UNALIGNED)
{
i = csh.ConvertBuffer555XTo888_IsUnaligned(src, dst, pixCountVector);
}
else
{
i = csh.ConvertBuffer555XTo888(src, dst, pixCountVector);
}
}
#pragma LOOPVECTORIZE_DISABLE
#endif // USEMANUALVECTORIZATION
for (; i < pixCount; i++)
{
ColorspaceConvert555XTo888<SWAP_RB>(src[i], &dst[i*3]);
}
}
template <bool SWAP_RB, bool IS_UNALIGNED>
void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount)
{
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
if (SWAP_RB)
{
if (IS_UNALIGNED)
{
i = csh.ConvertBuffer888XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector);
}
else
{
i = csh.ConvertBuffer888XTo888_SwapRB(src, dst, pixCountVector);
}
}
else
{
if (IS_UNALIGNED)
{
i = csh.ConvertBuffer888XTo888_IsUnaligned(src, dst, pixCountVector);
}
else
{
i = csh.ConvertBuffer888XTo888(src, dst, pixCountVector);
}
}
#pragma LOOPVECTORIZE_DISABLE
#endif // USEMANUALVECTORIZATION
for (; i < pixCount; i++)
{
ColorspaceConvert888XTo888<SWAP_RB>(src[i], &dst[i*3]);
}
}
template <bool SWAP_RB, bool IS_UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount) void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount)
{ {
@ -992,6 +1088,74 @@ size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const
return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount);
} }
size_t ColorspaceHandler::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
size_t i = 0;
for (; i < pixCount; i++)
{
ColorspaceConvert555XTo888<false>(src[i], &dst[i*3]);
}
return i;
}
size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
size_t i = 0;
for (; i < pixCount; i++)
{
ColorspaceConvert555XTo888<true>(src[i], &dst[i*3]);
}
return i;
}
size_t ColorspaceHandler::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return this->ConvertBuffer555XTo888(src, dst, pixCount);
}
size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return this->ConvertBuffer555XTo888_SwapRB(src, dst, pixCount);
}
size_t ColorspaceHandler::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
size_t i = 0;
for (; i < pixCount; i++)
{
ColorspaceConvert888XTo888<false>(src[i], &dst[i*3]);
}
return i;
}
size_t ColorspaceHandler::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
size_t i = 0;
for (; i < pixCount; i++)
{
ColorspaceConvert888XTo888<true>(src[i], &dst[i*3]);
}
return i;
}
size_t ColorspaceHandler::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return this->ConvertBuffer888XTo888(src, dst, pixCount);
}
size_t ColorspaceHandler::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return this->ConvertBuffer888XTo888_SwapRB(src, dst, pixCount);
}
size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{ {
size_t i = 0; size_t i = 0;
@ -1231,6 +1395,16 @@ template void ColorspaceConvertBuffer888XTo8888Opaque<true, false>(const u32 *sr
template void ColorspaceConvertBuffer888XTo8888Opaque<false, true>(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque<false, true>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo8888Opaque<false, false>(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque<false, false>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceConvertBuffer555XTo888<true, true>(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer555XTo888<true, false>(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer555XTo888<false, true>(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer555XTo888<false, false>(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo888<true, true>(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo888<true, false>(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo888<false, true>(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo888<false, false>(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
template void ColorspaceCopyBuffer16<true, true>(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer16<true, true>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<true, false>(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer16<true, false>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<false, true>(const u16 *src, u16 *dst, size_t pixCount); template void ColorspaceCopyBuffer16<false, true>(const u16 *src, u16 *dst, size_t pixCount);

View File

@ -238,6 +238,32 @@ FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor)
return ColorspaceConvert888XTo8888Opaque<SWAP_RB>(srcColorComponent); return ColorspaceConvert888XTo8888Opaque<SWAP_RB>(srcColorComponent);
} }
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert888XTo888(FragmentColor srcColor, u8 *dst)
{
dst[0] = (SWAP_RB) ? srcColor.b : srcColor.r;
dst[1] = srcColor.g;
dst[2] = (SWAP_RB) ? srcColor.r : srcColor.b;
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert888XTo888(u32 srcColor, u8 *dst)
{
FragmentColor srcColorComponent;
srcColorComponent.color = srcColor;
ColorspaceConvert888XTo888<SWAP_RB>(srcColorComponent, dst);
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo888(u16 srcColor, u8 *dst)
{
FragmentColor srcColorComponent;
srcColorComponent.color = ColorspaceConvert555To8888Opaque<SWAP_RB>(srcColor);
ColorspaceConvert888XTo888<SWAP_RB>(srcColorComponent, dst);
}
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE u16 ColorspaceCopy16(u16 srcColor) FORCEINLINE u16 ColorspaceCopy16(u16 srcColor)
{ {
@ -332,6 +358,9 @@ template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To5551
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
@ -378,6 +407,16 @@ public:
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;

View File

@ -25,61 +25,59 @@
#include <immintrin.h> #include <immintrin.h>
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi)
{ {
v256u32 src32;
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9));
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(srcColor, 11), _mm256_srli_epi16(srcColor, 7)), _mm256_set1_epi16(0xF8F8) );
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits);
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) );
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi16(rb, ga);
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm256_unpackhi_epi16(rb, ga);
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) );
dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi)
{ {
v256u32 src32;
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7));
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) ); v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(srcColor, 9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E)), srcAlphaBits);
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) );
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi16(rb, ga);
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm256_unpackhi_epi16(rb, ga);
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) );
dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
{ {
const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0xFF000000); const v256u16 srcAlphaBits16 = _mm256_set1_epi16(0xFF00);
ColorspaceConvert555To8888_AVX2<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To8888_AVX2<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
{ {
const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0x1F000000); const v256u16 srcAlphaBits16 = _mm256_set1_epi32(0x1F00);
ColorspaceConvert555To6665_AVX2<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To6665_AVX2<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
@ -456,6 +454,132 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si
return i; return i;
} }
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec256)
{
size_t i = 0;
v256u16 src_v256u16[2];
v256u32 src_v256u32[4];
for (; i < pixCountVec256; i+=32)
{
if (IS_UNALIGNED)
{
src_v256u16[0] = _mm256_loadu_si256((v256u16 *)(src + i + 0));
src_v256u16[1] = _mm256_loadu_si256((v256u16 *)(src + i + 16));
}
else
{
src_v256u16[0] = _mm256_load_si256((v256u16 *)(src + i + 0));
src_v256u16[1] = _mm256_load_si256((v256u16 *)(src + i + 16));
}
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) );
v256u16 g = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(src_v256u16[0], 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits);
rb = _mm256_permute4x64_epi64(rb, 0xD8);
g = _mm256_permute4x64_epi64( g, 0xD8);
src_v256u32[0] = _mm256_unpacklo_epi16(rb, g);
src_v256u32[1] = _mm256_unpackhi_epi16(rb, g);
rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi32(src_v256u16[1], 11), _mm256_srli_epi16(src_v256u16[1], 7)), _mm256_set1_epi16(0xF8F8) );
g = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(src_v256u16[1], 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits);
rb = _mm256_permute4x64_epi64(rb, 0xD8);
g = _mm256_permute4x64_epi64( g, 0xD8);
src_v256u32[2] = _mm256_unpacklo_epi16(rb, g);
src_v256u32[3] = _mm256_unpackhi_epi16(rb, g);
src_v256u32[0] = _mm256_or_si256( src_v256u32[0], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[0], 5), _mm256_set1_epi32(0x00070707)) );
src_v256u32[1] = _mm256_or_si256( src_v256u32[1], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[1], 5), _mm256_set1_epi32(0x00070707)) );
src_v256u32[2] = _mm256_or_si256( src_v256u32[2], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[2], 5), _mm256_set1_epi32(0x00070707)) );
src_v256u32[3] = _mm256_or_si256( src_v256u32[3], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[3], 5), _mm256_set1_epi32(0x00070707)) );
if (SWAP_RB)
{
src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) );
src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9) );
src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3, 29,30,28,25, 26,24,21,22) );
src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(29,30,28,25, 26,24,21,22, 20,17,18,16, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 31,27,23,19, 15,11, 7, 3) );
}
else
{
src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) );
src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8) );
src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3, 28,30,29,24, 26,25,20,22) );
src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(28,30,29,24, 26,25,20,22, 21,16,18,17, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 31,27,23,19, 15,11, 7, 3) );
}
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256( src_v256u32[3], _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
}
else
{
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256( src_v256u32[3], _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
}
}
return i;
}
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec256)
{
size_t i = 0;
v256u32 src_v256u32[4];
for (; i < pixCountVec256; i+=32)
{
if (IS_UNALIGNED)
{
src_v256u32[0] = _mm256_loadu_si256((v256u32 *)(src + i + 0));
src_v256u32[1] = _mm256_loadu_si256((v256u32 *)(src + i + 8));
src_v256u32[2] = _mm256_loadu_si256((v256u32 *)(src + i + 16));
src_v256u32[3] = _mm256_loadu_si256((v256u32 *)(src + i + 24));
}
else
{
src_v256u32[0] = _mm256_load_si256((v256u32 *)(src + i + 0));
src_v256u32[1] = _mm256_load_si256((v256u32 *)(src + i + 8));
src_v256u32[2] = _mm256_load_si256((v256u32 *)(src + i + 16));
src_v256u32[3] = _mm256_load_si256((v256u32 *)(src + i + 24));
}
if (SWAP_RB)
{
src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2));
src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8));
src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3, 28,29,30,24, 25,26,20,21));
src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 31,27,23,19, 15,11, 7, 3));
}
else
{
src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0));
src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10));
src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3, 30,29,28,26, 25,24,22,21));
src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 31,27,23,19, 15,11, 7, 3));
}
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
}
else
{
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
}
}
return i;
}
template <bool SWAP_RB, bool IS_UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256) size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256)
{ {
@ -806,6 +930,46 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
return ColorspaceConvertBuffer888XTo8888Opaque_AVX2<true, true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_AVX2<true, true>(src, dst, pixCount);
} }
size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AVX2<false, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AVX2<true, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AVX2<false, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AVX2<true, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AVX2<false, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AVX2<true, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AVX2<false, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AVX2<true, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{ {
return ColorspaceCopyBuffer16_AVX2<true, false>(src, dst, pixCount); return ColorspaceCopyBuffer16_AVX2<true, false>(src, dst, pixCount);
@ -866,11 +1030,11 @@ size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
return ColorspaceApplyIntensityToBuffer32_AVX2<true, true>(dst, pixCount, intensity); return ColorspaceApplyIntensityToBuffer32_AVX2<true, true>(dst, pixCount, intensity);
} }
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To6665_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits16Lo, const v256u16 &srcAlphaBits16Hi, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To6665_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To6665_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits16Lo, const v256u16 &srcAlphaBits16Hi, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);

View File

@ -24,8 +24,8 @@
#warning This header requires AVX2 support. #warning This header requires AVX2 support.
#else #else
template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); template<bool SWAP_RB> v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src);
@ -80,6 +80,16 @@ public:
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;

View File

@ -24,57 +24,47 @@
#include <string.h> #include <string.h>
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
{ {
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
dstLo = vec_unpackl((vector pixel)srcColor); dstLo = vec_unpackl((vector pixel)srcColor);
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
dstLo = vec_sel(dstLo, srcAlphaBits32Lo, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); dstLo = vec_perm(dstLo, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
dstHi = vec_unpackh((vector pixel)srcColor); dstHi = vec_unpackh((vector pixel)srcColor);
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
dstHi = vec_sel(dstHi, srcAlphaBits32Hi, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
if (SWAP_RB)
{
dstLo = vec_perm(dstLo, dstLo, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13}));
dstHi = vec_perm(dstHi, dstHi, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13}));
}
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
{ {
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
dstLo = vec_unpackl((vector pixel)srcColor); dstLo = vec_unpackl((vector pixel)srcColor);
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
dstLo = vec_sel(dstLo, srcAlphaBits32Lo, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); dstLo = vec_perm(dstLo, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
dstHi = vec_unpackh((vector pixel)srcColor); dstHi = vec_unpackh((vector pixel)srcColor);
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
dstHi = vec_sel(dstHi, srcAlphaBits32Hi, ((v128u32){0xFF000000,0xFF000000,0xFF000000,0xFF000000})); dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
if (SWAP_RB)
{
dstLo = vec_perm(dstLo, dstLo, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13}));
dstHi = vec_perm(dstHi, dstHi, ((v128u8){0,3,2,1, 4,7,6,5, 8,11,10,9, 12,15,14,13}));
}
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{ {
const v128u32 srcAlphaBits32 = {0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}; const v128u16 srcAlphaBits16 = {0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00, 0xFF00};
ColorspaceConvert555To8888_AltiVec<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To8888_AltiVec<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{ {
const v128u32 srcAlphaBits32 = {0x1F000000, 0x1F000000, 0x1F000000, 0x1F000000}; const v128u16 srcAlphaBits16 = {0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00, 0x1F00};
ColorspaceConvert555To6665_AltiVec<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To6665_AltiVec<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
@ -305,6 +295,83 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst,
return i; return i;
} }
template <bool SWAP_RB>
size_t ColorspaceConvertBuffer555XTo888_AltiVec(const u16 *src, u8 *dst, size_t pixCountVec128)
{
size_t i = 0;
v128u16 src_v128u16[2];
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
{
src_v128u16[0] = vec_ld( 0, src+i);
src_v128u16[1] = vec_ld(16, src+i);
src_v128u32[0] = vec_unpackl((vector pixel)src_v128u16[0]);
src_v128u32[1] = vec_unpackh((vector pixel)src_v128u16[0]);
src_v128u32[2] = vec_unpackl((vector pixel)src_v128u16[1]);
src_v128u32[3] = vec_unpackh((vector pixel)src_v128u16[1]);
src_v128u32[0] = vec_or( vec_sl((v128u8)src_v128u32[0], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[0], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
src_v128u32[1] = vec_or( vec_sl((v128u8)src_v128u32[1], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[1], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
src_v128u32[2] = vec_or( vec_sl((v128u8)src_v128u32[2], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[2], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
src_v128u32[3] = vec_or( vec_sl((v128u8)src_v128u32[3], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[3], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
if (SWAP_RB)
{
src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) );
src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16}) );
src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) );
}
else
{
src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) );
src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) );
src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) );
}
vec_st( src_v128u32[0], 0, dst + (i * 3) );
vec_st( src_v128u32[1], 16, dst + (i * 3) );
vec_st( src_v128u32[2], 32, dst + (i * 3) );
}
return i;
}
template <bool SWAP_RB>
size_t ColorspaceConvertBuffer888XTo888_AltiVec(const u32 *src, u8 *dst, size_t pixCountVec128)
{
size_t i = 0;
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
{
src_v128u32[0] = vec_ld( 0, src+i);
src_v128u32[1] = vec_ld(16, src+i);
src_v128u32[2] = vec_ld(32, src+i);
src_v128u32[3] = vec_ld(48, src+i);
if (SWAP_RB)
{
src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) );
src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16}) );
src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) );
}
else
{
src_v128u32[0] = vec_perm( src_v128u32[0], src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) );
src_v128u32[1] = vec_perm( src_v128u32[1], src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) );
src_v128u32[2] = vec_perm( src_v128u32[2], src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) );
}
vec_st( src_v128u32[0], 0, dst + (i * 3) );
vec_st( src_v128u32[1], 16, dst + (i * 3) );
vec_st( src_v128u32[2], 32, dst + (i * 3) );
}
return i;
}
template <bool SWAP_RB> template <bool SWAP_RB>
size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128) size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128)
{ {
@ -413,6 +480,26 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32
return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec<true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec<true>(src, dst, pixCount);
} }
size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AltiVec<false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_AltiVec<true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AltiVec<false>(src, dst, pixCount);
}
size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_AltiVec<true>(src, dst, pixCount);
}
size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{ {
return ColorspaceCopyBuffer16_AltiVec<true>(src, dst, pixCount); return ColorspaceCopyBuffer16_AltiVec<true>(src, dst, pixCount);
@ -423,11 +510,11 @@ size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst,
return ColorspaceCopyBuffer32_AltiVec<true>(src, dst, pixCount); return ColorspaceCopyBuffer32_AltiVec<true>(src, dst, pixCount);
} }
template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_AltiVec<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_AltiVec<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);

View File

@ -24,8 +24,8 @@
#warning This header requires PowerPC AltiVec support. #warning This header requires PowerPC AltiVec support.
#else #else
template<bool SWAP_RB> void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src);
@ -65,6 +65,12 @@ public:
size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;

View File

@ -33,83 +33,77 @@
#endif #endif
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
{ {
v128u32 src32;
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
#ifdef ENABLE_SSE4_1 #ifdef ENABLE_SSSE3
src32 = _mm_cvtepu16_epi32(srcColor); v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 11), _mm_srli_epi16(srcColor, 7)), _mm_set1_epi16(0xF8F8) );
#else v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8)), srcAlphaBits);
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
#endif
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); dstLo = _mm_unpacklo_epi16(rb, ga);
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#ifdef ENABLE_SSE4_1 dstHi = _mm_unpackhi_epi16(rb, ga);
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
#else
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
#endif
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9));
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#else
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800) );
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
#endif
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
{ {
v128u32 src32;
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
#ifdef ENABLE_SSE4_1 #ifdef ENABLE_SSSE3
src32 = _mm_cvtepu16_epi32(srcColor); v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
#else v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E)), srcAlphaBits);
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
#endif
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); dstLo = _mm_unpacklo_epi16(rb, ga);
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#ifdef ENABLE_SSE4_1 dstHi = _mm_unpackhi_epi16(rb, ga);
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
#else
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
#endif
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7));
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#else
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00) );
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
#endif
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{ {
const v128u32 srcAlphaBits32 = _mm_set1_epi32(0xFF000000); const v128u16 srcAlphaBits16 = _mm_set1_epi16(0xFF00);
ColorspaceConvert555To8888_SSE2<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To8888_SSE2<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{ {
const v128u32 srcAlphaBits32 = _mm_set1_epi32(0x1F000000); const v128u16 srcAlphaBits16 = _mm_set1_epi16(0x1F00);
ColorspaceConvert555To6665_SSE2<SWAP_RB>(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); ColorspaceConvert555To6665_SSE2<SWAP_RB>(srcColor, srcAlphaBits16, dstLo, dstHi);
} }
template <bool SWAP_RB> template <bool SWAP_RB>
@ -504,6 +498,132 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si
return i; return i;
} }
#ifdef ENABLE_SSSE3
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec128)
{
size_t i = 0;
v128u16 src_v128u16[2];
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
{
if (IS_UNALIGNED)
{
src_v128u16[0] = _mm_loadu_si128((v128u16 *)(src + i + 0));
src_v128u16[1] = _mm_loadu_si128((v128u16 *)(src + i + 8));
}
else
{
src_v128u16[0] = _mm_load_si128((v128u16 *)(src + i + 0));
src_v128u16[1] = _mm_load_si128((v128u16 *)(src + i + 8));
}
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) );
v128u16 g = _mm_and_si128( _mm_srli_epi16(src_v128u16[0], 2), _mm_set1_epi16(0x00F8) );
src_v128u32[0] = _mm_unpacklo_epi16(rb, g);
src_v128u32[1] = _mm_unpackhi_epi16(rb, g);
rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[1], 11), _mm_srli_epi16(src_v128u16[1], 7)), _mm_set1_epi16(0xF8F8) );
g = _mm_and_si128( _mm_srli_epi16(src_v128u16[1], 2), _mm_set1_epi16(0x00F8) );
src_v128u32[2] = _mm_unpacklo_epi16(rb, g);
src_v128u32[3] = _mm_unpackhi_epi16(rb, g);
src_v128u32[0] = _mm_or_si128( src_v128u32[0], _mm_and_si128(_mm_srli_epi32(src_v128u32[0], 5), _mm_set1_epi32(0x00070707)) );
src_v128u32[1] = _mm_or_si128( src_v128u32[1], _mm_and_si128(_mm_srli_epi32(src_v128u32[1], 5), _mm_set1_epi32(0x00070707)) );
src_v128u32[2] = _mm_or_si128( src_v128u32[2], _mm_and_si128(_mm_srli_epi32(src_v128u32[2], 5), _mm_set1_epi32(0x00070707)) );
src_v128u32[3] = _mm_or_si128( src_v128u32[3], _mm_and_si128(_mm_srli_epi32(src_v128u32[3], 5), _mm_set1_epi32(0x00070707)) );
if (SWAP_RB)
{
src_v128u32[0] = _mm_shuffle_epi8( src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) );
src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6) );
src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9) );
src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3) );
}
else
{
src_v128u32[0] = _mm_shuffle_epi8( src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) );
src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6) );
src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8) );
src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3) );
}
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
}
return i;
}
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec128)
{
size_t i = 0;
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
{
if (IS_UNALIGNED)
{
src_v128u32[0] = _mm_loadu_si128((v128u32 *)(src + i + 0));
src_v128u32[1] = _mm_loadu_si128((v128u32 *)(src + i + 4));
src_v128u32[2] = _mm_loadu_si128((v128u32 *)(src + i + 8));
src_v128u32[3] = _mm_loadu_si128((v128u32 *)(src + i + 12));
}
else
{
src_v128u32[0] = _mm_load_si128((v128u32 *)(src + i + 0));
src_v128u32[1] = _mm_load_si128((v128u32 *)(src + i + 4));
src_v128u32[2] = _mm_load_si128((v128u32 *)(src + i + 8));
src_v128u32[3] = _mm_load_si128((v128u32 *)(src + i + 12));
}
if (SWAP_RB)
{
src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2));
src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5));
src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8));
src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3));
}
else
{
src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0));
src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5));
src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10));
src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3));
}
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
}
return i;
}
#endif
template <bool SWAP_RB, bool IS_UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128) size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128)
{ {
@ -861,6 +981,50 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
return ColorspaceConvertBuffer888XTo8888Opaque_SSE2<true, true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_SSE2<true, true>(src, dst, pixCount);
} }
#ifdef ENABLE_SSSE3
size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_SSSE3<false, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_SSSE3<true, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_SSSE3<false, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer555XTo888_SSSE3<true, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_SSSE3<false, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_SSSE3<true, false>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_SSSE3<false, true>(src, dst, pixCount);
}
size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const
{
return ColorspaceConvertBuffer888XTo888_SSSE3<true, true>(src, dst, pixCount);
}
#endif
size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{ {
return ColorspaceCopyBuffer16_SSE2<true, false>(src, dst, pixCount); return ColorspaceCopyBuffer16_SSE2<true, false>(src, dst, pixCount);
@ -921,11 +1085,11 @@ size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
return ColorspaceApplyIntensityToBuffer32_SSE2<true, true>(dst, pixCount, intensity); return ColorspaceApplyIntensityToBuffer32_SSE2<true, true>(dst, pixCount, intensity);
} }
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To6665_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888Opaque_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);

View File

@ -24,8 +24,8 @@
#warning This header requires SSE2 support. #warning This header requires SSE2 support.
#else #else
template<bool SWAP_RB> void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src);
@ -80,6 +80,18 @@ public:
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
#ifdef ENABLE_SSSE3
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
#endif
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;