Colorspace Handler: Fix pixel alignment bug in ColorspaceConvert555To8888_AVX2() and ColorspaceConvert555To6665_AVX2().
Fixes issue #8.
This commit is contained in:
parent
1cc8cb6fc6
commit
de4f1c9c69
|
@ -26,12 +26,10 @@
|
|||
#include "colorspacehandler_AltiVec.cpp"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
|
||||
#define USEVECTORSIZE_128
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX2)
|
||||
#define USEVECTORSIZE_256
|
||||
#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
|
||||
#define USEVECTORSIZE_128
|
||||
#endif
|
||||
|
||||
// By default, the hand-coded vectorized code will be used instead of a compiler's built-in
|
||||
|
@ -147,12 +145,12 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -195,12 +193,12 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -243,12 +241,12 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -291,12 +289,12 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -339,12 +337,12 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -387,12 +385,12 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric
|
|||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
|
|
|
@ -30,14 +30,14 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const
|
|||
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256());
|
||||
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
|
||||
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9));
|
||||
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) );
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) );
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
|
||||
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
|
||||
|
||||
src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256());
|
||||
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) );
|
||||
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9));
|
||||
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) );
|
||||
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) );
|
||||
|
@ -52,14 +52,14 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const
|
|||
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256());
|
||||
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 0) );
|
||||
dstLo = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7));
|
||||
dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) );
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) );
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
|
||||
dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo );
|
||||
|
||||
src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256());
|
||||
src32 = _mm256_cvtepu16_epi32( _mm256_extracti128_si256(srcColor, 1) );
|
||||
dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7));
|
||||
dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) );
|
||||
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) );
|
||||
|
|
|
@ -27,6 +27,10 @@
|
|||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
|
@ -34,14 +38,25 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const
|
|||
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
#ifdef ENABLE_SSE4_1
|
||||
src32 = _mm_cvtepu16_epi32(srcColor);
|
||||
#else
|
||||
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
|
||||
#endif
|
||||
|
||||
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9));
|
||||
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
|
||||
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo );
|
||||
|
||||
#ifdef ENABLE_SSE4_1
|
||||
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
|
||||
#else
|
||||
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
|
||||
#endif
|
||||
|
||||
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9));
|
||||
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) );
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) );
|
||||
|
@ -56,14 +71,25 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const
|
|||
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
#ifdef ENABLE_SSE4_1
|
||||
src32 = _mm_cvtepu16_epi32(srcColor);
|
||||
#else
|
||||
src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128());
|
||||
#endif
|
||||
|
||||
dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7));
|
||||
dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
|
||||
dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo );
|
||||
|
||||
#ifdef ENABLE_SSE4_1
|
||||
src32 = _mm_cvtepu16_epi32( _mm_srli_si128(srcColor, 8) );
|
||||
#else
|
||||
src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128());
|
||||
#endif
|
||||
|
||||
dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7));
|
||||
dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) );
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) );
|
||||
|
|
Loading…
Reference in New Issue