Colorspace Handler: Fix pixel alignment bug in ColorspaceConvert555To8888_AVX2() and ColorspaceConvert555To6665_AVX2().

Fixes issue #8.
This commit is contained in:
rogerman 2016-12-12 21:30:14 -08:00
parent 1cc8cb6fc6
commit de4f1c9c69
3 changed files with 56 additions and 32 deletions

View File

@ -26,12 +26,10 @@
#include "colorspacehandler_AltiVec.cpp" #include "colorspacehandler_AltiVec.cpp"
#endif #endif
#if defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#endif
#if defined(ENABLE_AVX2) #if defined(ENABLE_AVX2)
#define USEVECTORSIZE_256 #define USEVECTORSIZE_256
#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#endif #endif
// By default, the hand-coded vectorized code will be used instead of a compiler's built-in // By default, the hand-coded vectorized code will be used instead of a compiler's built-in
@ -147,12 +145,12 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 32); const size_t pixCountVector = pixCount - (pixCount % 8);
#endif #endif
if (SWAP_RB) if (SWAP_RB)
@ -195,12 +193,12 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 32); const size_t pixCountVector = pixCount - (pixCount % 8);
#endif #endif
if (SWAP_RB) if (SWAP_RB)
@ -243,12 +241,12 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 4); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 4);
#endif #endif
if (SWAP_RB) if (SWAP_RB)
@ -291,12 +289,12 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 4); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 4);
#endif #endif
if (SWAP_RB) if (SWAP_RB)
@ -339,12 +337,12 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 32); const size_t pixCountVector = pixCount - (pixCount % 8);
#endif #endif
if (SWAP_RB) if (SWAP_RB)
@ -387,12 +385,12 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric
#ifdef USEMANUALVECTORIZATION #ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_128) #if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 8); const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256) #elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16); const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_512) #elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 32); const size_t pixCountVector = pixCount - (pixCount % 8);
#endif #endif
if (SWAP_RB) if (SWAP_RB)

View File

@ -30,14 +30,14 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9));
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) ); dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) );
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9));
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) ); dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) );
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) );
@ -52,14 +52,14 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7));
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) ); dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) );
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) );
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7));
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) ); dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) );
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) );

View File

@ -27,6 +27,10 @@
#include <tmmintrin.h> #include <tmmintrin.h>
#endif #endif
#ifdef ENABLE_SSE4_1
#include <smmintrin.h>
#endif
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi)
{ {
@ -34,14 +38,25 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
#ifdef ENABLE_SSE4_1
src32 = _mm_cvtepu16_epi32(srcColor);
#else
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
#endif
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9));
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo );
#ifdef ENABLE_SSE4_1
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
#else
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
#endif
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9));
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
@ -56,14 +71,25 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
#ifdef ENABLE_SSE4_1
src32 = _mm_cvtepu16_epi32(srcColor);
#else
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
#endif
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7));
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo );
#ifdef ENABLE_SSE4_1
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
#else
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
#endif
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7));
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );