Colorspace Handler: Add dedicated functions ColorspaceApplyIntensityToBuffer16() and ColorspaceApplyIntensityToBuffer32() for applying an RGB intensity value to a framebuffer.
This commit is contained in:
parent
174dcc11bb
commit
74ba49e168
|
@ -563,6 +563,181 @@ void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
|
|||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer16_IsUnaligned(dst, pixCountVector, intensity);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer16_SwapRB(dst, pixCountVector, intensity);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer16_IsUnaligned(dst, pixCountVector, intensity);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer16(dst, pixCountVector, intensity);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
|
||||
#endif // USEMANUALVECTORIZATION
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = COLOR5551_SWAP_RB(dst[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = dst[i] & 0x8000;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
|
||||
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(dst[i]) : dst[i];
|
||||
|
||||
u8 r = (u8)( (((outColor >> 0) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 g = (u8)( (((outColor >> 5) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 a = outColor & 0x8000;
|
||||
|
||||
dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a );
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer32_IsUnaligned(dst, pixCountVector, intensity);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer32_SwapRB(dst, pixCountVector, intensity);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer32_IsUnaligned(dst, pixCountVector, intensity);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.ApplyIntensityToBuffer32(dst, pixCountVector, intensity);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
|
||||
#endif // USEMANUALVECTORIZATION
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
FragmentColor dstColor;
|
||||
dstColor.color = dst[i];
|
||||
|
||||
FragmentColor &outColor = (FragmentColor &)dst[i];
|
||||
outColor.r = dstColor.b;
|
||||
outColor.b = dstColor.r;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = dst[i] & 0xFF000000;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
FragmentColor dstColor;
|
||||
dstColor.color = dst[i];
|
||||
|
||||
FragmentColor &outColor = (FragmentColor &)dst[i];
|
||||
outColor.r = (u8)( ((u16)dstColor.b * intensity_u16) >> 16 );
|
||||
outColor.g = (u8)( ((u16)dstColor.g * intensity_u16) >> 16 );
|
||||
outColor.b = (u8)( ((u16)dstColor.r * intensity_u16) >> 16 );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
FragmentColor &outColor = (FragmentColor &)dst[i];
|
||||
outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
|
||||
outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
|
||||
outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
size_t i = 0;
|
||||
|
@ -835,6 +1010,176 @@ size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *d
|
|||
return this->CopyBuffer32_SwapRB(src, dst, pixCount);
|
||||
}
|
||||
|
||||
// Scalar implementation: scales the three 5-bit color channels of each 16-bit
// pixel in dst by 'intensity', in place, keeping bit 15 as-is.
//
// Parameters:
//   dst       - pixel buffer, read and overwritten.
//   pixCount  - number of pixels to process.
//   intensity - scale factor. Above 0.999 the buffer is left untouched; below
//               0.001 every bit except bit 15 is cleared.
//
// Returns the number of pixels handled, which is always pixCount.
size_t ColorspaceHandler::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const
{
	size_t i = 0;
	
	if (intensity > 0.999f)
	{
		// Nothing to scale and nothing to swap.
		return pixCount;
	}
	else if (intensity < 0.001f)
	{
		for (; i < pixCount; i++)
		{
			dst[i] = dst[i] & 0x8000;
		}
		
		return i;
	}
	
	// 0.16 fixed-point multiplier; (channel * multiplier) >> 16 scales the channel.
	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
	
	for (; i < pixCount; i++)
	{
		u16 outColor = dst[i];
		
		u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
		u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
		u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
		// Bit 15 does not fit in 8 bits, so it must be kept in a 16-bit variable.
		u16 a = outColor & 0x8000;
		
		dst[i] = (u16)( (r << 0) | (g << 5) | (b << 10) | a );
	}
	
	return i;
}
|
||||
|
||||
// Scalar implementation: swaps the R and B channels of each 16-bit pixel in
// dst and scales the three 5-bit color channels by 'intensity', in place,
// keeping bit 15 as-is.
//
// Parameters:
//   dst       - pixel buffer, read and overwritten.
//   pixCount  - number of pixels to process.
//   intensity - scale factor. Above 0.999 only the swap is applied; below
//               0.001 every bit except bit 15 is cleared.
//
// Returns the number of pixels handled, which is always pixCount.
size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const
{
	size_t i = 0;
	
	if (intensity > 0.999f)
	{
		for (; i < pixCount; i++)
		{
			dst[i] = COLOR5551_SWAP_RB(dst[i]);
		}
		
		return i;
	}
	else if (intensity < 0.001f)
	{
		// All color bits end up 0, so the swap makes no difference here.
		for (; i < pixCount; i++)
		{
			dst[i] = dst[i] & 0x8000;
		}
		
		return i;
	}
	
	// 0.16 fixed-point multiplier; (channel * multiplier) >> 16 scales the channel.
	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
	
	for (; i < pixCount; i++)
	{
		u16 outColor = COLOR5551_SWAP_RB(dst[i]);
		
		u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
		u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
		u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
		// Bit 15 does not fit in 8 bits, so it must be kept in a 16-bit variable.
		u16 a = outColor & 0x8000;
		
		dst[i] = (u16)( (r << 0) | (g << 5) | (b << 10) | a );
	}
	
	return i;
}
|
||||
|
||||
// This class has no separate unaligned path for 16-bit buffers; the call is
// forwarded to ApplyIntensityToBuffer16() and its result is returned.
size_t ColorspaceHandler::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = this->ApplyIntensityToBuffer16(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// This class has no separate unaligned path for the swapping 16-bit routine;
// the call is forwarded to ApplyIntensityToBuffer16_SwapRB() and its result is
// returned.
size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = this->ApplyIntensityToBuffer16_SwapRB(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// Scalar implementation: scales the r, g and b members of each 32-bit pixel in
// dst by 'intensity', in place. The top byte (mask 0xFF000000) is not modified.
//
// Parameters:
//   dst       - pixel buffer, read and overwritten.
//   pixCount  - number of pixels to process.
//   intensity - scale factor. Above 0.999 the buffer is left untouched; below
//               0.001 the low 24 bits of every pixel are cleared.
//
// Returns the number of pixels handled, which is always pixCount.
size_t ColorspaceHandler::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const
{
	// Full intensity: nothing to do.
	if (intensity > 0.999f)
	{
		return pixCount;
	}
	
	// Zero intensity: keep only the top byte of each pixel.
	if (intensity < 0.001f)
	{
		for (size_t px = 0; px < pixCount; px++)
		{
			dst[px] &= 0xFF000000;
		}
		
		return pixCount;
	}
	
	// 0.16 fixed-point multiplier; (channel * multiplier) >> 16 scales the channel.
	const u16 scale = (u16)(intensity * (float)(0xFFFF));
	
	for (size_t px = 0; px < pixCount; px++)
	{
		FragmentColor &pixel = (FragmentColor &)dst[px];
		pixel.r = (u8)( ((u16)pixel.r * scale) >> 16 );
		pixel.g = (u8)( ((u16)pixel.g * scale) >> 16 );
		pixel.b = (u8)( ((u16)pixel.b * scale) >> 16 );
	}
	
	return pixCount;
}
|
||||
|
||||
// Scalar implementation: exchanges the r and b members of each 32-bit pixel in
// dst and scales r, g and b by 'intensity', in place. The top byte (mask
// 0xFF000000) is not modified.
//
// Parameters:
//   dst       - pixel buffer, read and overwritten.
//   pixCount  - number of pixels to process.
//   intensity - scale factor. Above 0.999 only the swap is applied; below
//               0.001 the low 24 bits of every pixel are cleared.
//
// Returns the number of pixels handled, which is always pixCount.
size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const
{
	// Full intensity: swap only.
	if (intensity > 0.999f)
	{
		for (size_t px = 0; px < pixCount; px++)
		{
			// Snapshot the pixel so both channels are read before either is written.
			FragmentColor original;
			original.color = dst[px];
			
			FragmentColor &pixel = (FragmentColor &)dst[px];
			pixel.r = original.b;
			pixel.b = original.r;
		}
		
		return pixCount;
	}
	
	// Zero intensity: all color bits become 0, so the swap makes no difference.
	if (intensity < 0.001f)
	{
		for (size_t px = 0; px < pixCount; px++)
		{
			dst[px] &= 0xFF000000;
		}
		
		return pixCount;
	}
	
	// 0.16 fixed-point multiplier; (channel * multiplier) >> 16 scales the channel.
	const u16 scale = (u16)(intensity * (float)(0xFFFF));
	
	for (size_t px = 0; px < pixCount; px++)
	{
		FragmentColor original;
		original.color = dst[px];
		
		FragmentColor &pixel = (FragmentColor &)dst[px];
		pixel.r = (u8)( ((u16)original.b * scale) >> 16 );
		pixel.g = (u8)( ((u16)original.g * scale) >> 16 );
		pixel.b = (u8)( ((u16)original.r * scale) >> 16 );
	}
	
	return pixCount;
}
|
||||
|
||||
// This class has no separate unaligned path for 32-bit buffers; the call is
// forwarded to ApplyIntensityToBuffer32() and its result is returned.
size_t ColorspaceHandler::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = this->ApplyIntensityToBuffer32(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// This class has no separate unaligned path for the swapping 32-bit routine;
// the call is forwarded to ApplyIntensityToBuffer32_SwapRB() and its result is
// returned.
size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = this->ApplyIntensityToBuffer32_SwapRB(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
|
@ -879,3 +1224,13 @@ template void ColorspaceCopyBuffer32<true, true>(const u32 *src, u32 *dst, size_
|
|||
template void ColorspaceCopyBuffer32<true, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer32<false, true>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer32<false, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
||||
template void ColorspaceApplyIntensityToBuffer16<true, true>(u16 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer16<true, false>(u16 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer16<false, true>(u16 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer16<false, false>(u16 *dst, size_t pixCount, float intensity);
|
||||
|
||||
template void ColorspaceApplyIntensityToBuffer32<true, true>(u32 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer32<true, false>(u32 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer32<false, true>(u32 *dst, size_t pixCount, float intensity);
|
||||
template void ColorspaceApplyIntensityToBuffer32<false, false>(u32 *dst, size_t pixCount, float intensity);
|
||||
|
|
|
@ -265,6 +265,65 @@ FORCEINLINE u32 ColorspaceCopy32(u32 srcColor)
|
|||
return ColorspaceCopy32<SWAP_RB>(srcColorComponent);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u16 ColorspaceApplyIntensity16(u16 srcColor, float intensity)
|
||||
{
|
||||
u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return outColor;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return (outColor & 0x8000);
|
||||
}
|
||||
|
||||
const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
|
||||
u8 r = (u8)( (((outColor >> 0) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 g = (u8)( (((outColor >> 5) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
|
||||
u8 a = outColor & 0x8000;
|
||||
|
||||
return ( (r << 0) | (g << 5) | (b << 10) | a );
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u32 ColorspaceApplyIntensity32(FragmentColor srcColor, float intensity)
|
||||
{
|
||||
FragmentColor outColor;
|
||||
outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r);
|
||||
outColor.g = srcColor.g;
|
||||
outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b);
|
||||
outColor.a = srcColor.a;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return outColor.color;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return (outColor.color & 0xFF000000);
|
||||
}
|
||||
|
||||
const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
|
||||
outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
|
||||
outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
|
||||
outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
|
||||
outColor.a = outColor.a;
|
||||
|
||||
return outColor.color;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u32 ColorspaceApplyIntensity32(u32 srcColor, float intensity)
|
||||
{
|
||||
FragmentColor srcColorComponent;
|
||||
srcColorComponent.color = srcColor;
|
||||
|
||||
return ColorspaceApplyIntensity32<SWAP_RB>(srcColorComponent);
|
||||
}
|
||||
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
@ -276,6 +335,9 @@ template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888
|
|||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity);
|
||||
|
||||
class ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -321,6 +383,16 @@ public:
|
|||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
|
||||
size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
};
|
||||
|
||||
FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a)
|
||||
|
|
|
@ -248,6 +248,62 @@ FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src)
|
|||
return src;
|
||||
}
|
||||
|
||||
template<bool SWAP_RB>
|
||||
FORCEINLINE v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity)
|
||||
{
|
||||
v256u16 tempSrc = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) ) : src;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return tempSrc;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return _mm256_and_si256(tempSrc, _mm256_set1_epi16(0x8000));
|
||||
}
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi16(0x001F) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempSrc, 5), _mm256_set1_epi16(0x001F) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempSrc, 10), _mm256_set1_epi16(0x001F) );
|
||||
v256u16 a = _mm256_and_si256( tempSrc, _mm256_set1_epi16(0x8000) );
|
||||
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi16( _mm256_mulhi_epu16(g, intensity_v256), 5 );
|
||||
b = _mm256_slli_epi16( _mm256_mulhi_epu16(b, intensity_v256), 10 );
|
||||
|
||||
return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
}
|
||||
|
||||
template<bool SWAP_RB>
|
||||
FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity)
|
||||
{
|
||||
v256u32 tempSrc = (SWAP_RB) ? _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return tempSrc;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 16), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) );
|
||||
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
|
||||
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
|
||||
|
||||
return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256)
|
||||
{
|
||||
|
@ -456,6 +512,160 @@ size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec2
|
|||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
{
|
||||
const v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
|
||||
const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) );
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return pixCountVec256;
|
||||
}
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
{
|
||||
v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
|
||||
v256u16 tempDst = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256;
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi16(0x001F) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempDst, 5), _mm256_set1_epi16(0x001F) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempDst, 10), _mm256_set1_epi16(0x001F) );
|
||||
v256u16 a = _mm256_and_si256( tempDst, _mm256_set1_epi16(0x8000) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 5 );
|
||||
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 10 );
|
||||
|
||||
tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u16 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u16 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
{
|
||||
const v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
|
||||
const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u32 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u32 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return pixCountVec256;
|
||||
}
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
{
|
||||
v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
|
||||
v256u32 tempDst = (SWAP_RB) ? _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v256;
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempDst, 16), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
|
||||
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
|
||||
|
||||
tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u32 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceConvertBuffer555To8888Opaque_AVX2<false, false>(src, dst, pixCount);
|
||||
|
@ -616,6 +826,46 @@ size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u
|
|||
return ColorspaceCopyBuffer32_AVX2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
// AVX2 version for 16-bit buffers: no R/B swap, aligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer16_AVX2<false, false>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 16-bit buffers: R/B swap enabled, aligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer16_AVX2<true, false>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 16-bit buffers: no R/B swap, unaligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer16_AVX2<false, true>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 16-bit buffers: R/B swap enabled, unaligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer16_AVX2<true, true>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 32-bit buffers: no R/B swap, aligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer32_AVX2<false, false>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 32-bit buffers: R/B swap enabled, aligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer32_AVX2<true, false>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 32-bit buffers: no R/B swap, unaligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer32_AVX2<false, true>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
// AVX2 version for 32-bit buffers: R/B swap enabled, unaligned memory access.
// Returns whatever the AVX2 buffer routine reports as processed.
size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t processedCount = ColorspaceApplyIntensityToBuffer32_AVX2<true, true>(dst, pixCount, intensity);
	return processedCount;
}
|
||||
|
||||
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
|
@ -649,4 +899,10 @@ template v256u16 ColorspaceCopy16_AVX2<false>(const v256u16 &src);
|
|||
template v256u32 ColorspaceCopy32_AVX2<true>(const v256u32 &src);
|
||||
template v256u32 ColorspaceCopy32_AVX2<false>(const v256u32 &src);
|
||||
|
||||
template v256u16 ColorspaceApplyIntensity16_AVX2<true>(const v256u16 &src, float intensity);
|
||||
template v256u16 ColorspaceApplyIntensity16_AVX2<false>(const v256u16 &src, float intensity);
|
||||
|
||||
template v256u32 ColorspaceApplyIntensity32_AVX2<true>(const v256u32 &src, float intensity);
|
||||
template v256u32 ColorspaceApplyIntensity32_AVX2<false>(const v256u32 &src, float intensity);
|
||||
|
||||
#endif // ENABLE_AVX2
|
||||
|
|
|
@ -37,6 +37,9 @@ template<bool SWAP_RB> v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256
|
|||
template<bool SWAP_RB> v256u16 ColorspaceCopy16_AVX2(const v256u16 &src);
|
||||
template<bool SWAP_RB> v256u32 ColorspaceCopy32_AVX2(const v256u32 &src);
|
||||
|
||||
template<bool SWAP_RB> v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity);
|
||||
template<bool SWAP_RB> v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity);
|
||||
|
||||
class ColorspaceHandler_AVX2 : public ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -82,6 +85,16 @@ public:
|
|||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
|
||||
size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_AVX2
|
||||
|
|
|
@ -292,6 +292,66 @@ FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src)
|
|||
return src;
|
||||
}
|
||||
|
||||
template<bool SWAP_RB>
|
||||
FORCEINLINE v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity)
|
||||
{
|
||||
v128u16 tempSrc = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) ) : src;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return tempSrc;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return _mm_and_si128(tempSrc, _mm_set1_epi16(0x8000));
|
||||
}
|
||||
|
||||
v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi16(0x001F) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi16(tempSrc, 5), _mm_set1_epi16(0x001F) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi16(tempSrc, 10), _mm_set1_epi16(0x001F) );
|
||||
v128u16 a = _mm_and_si128( tempSrc, _mm_set1_epi16(0x8000) );
|
||||
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128), 5 );
|
||||
b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 );
|
||||
|
||||
return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
}
|
||||
|
||||
template<bool SWAP_RB>
|
||||
FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity)
|
||||
{
|
||||
#ifdef ENABLE_SSSE3
|
||||
v128u32 tempSrc = (SWAP_RB) ? _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src;
|
||||
#else
|
||||
v128u32 tempSrc = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) ) : src;
|
||||
#endif
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
return tempSrc;
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempSrc, 16), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) );
|
||||
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
|
||||
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
|
||||
|
||||
return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
|
||||
{
|
||||
|
@ -500,6 +560,167 @@ size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec1
|
|||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
{
|
||||
const v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
|
||||
const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) );
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u16 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u16 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return pixCountVec128;
|
||||
}
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
{
|
||||
v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
|
||||
v128u16 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128;
|
||||
|
||||
v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi16(0x001F) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi16(tempDst, 5), _mm_set1_epi16(0x001F) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi16(tempDst, 10), _mm_set1_epi16(0x001F) );
|
||||
v128u16 a = _mm_and_si128( tempDst, _mm_set1_epi16(0x8000) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128), 5 );
|
||||
b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 );
|
||||
|
||||
tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u16 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u16 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, float intensity)
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
if (intensity > 0.999f)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
{
|
||||
const v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
|
||||
#ifdef ENABLE_SSSE3
|
||||
const v128u32 tempDst = _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
|
||||
#else
|
||||
const v128u32 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) );
|
||||
#endif
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u32 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u32 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return pixCountVec128;
|
||||
}
|
||||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
{
|
||||
v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
|
||||
#ifdef ENABLE_SSSE3
|
||||
v128u32 tempDst = (SWAP_RB) ? _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v128;
|
||||
#else
|
||||
v128u32 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128;
|
||||
#endif
|
||||
|
||||
v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempDst, 16), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
|
||||
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
|
||||
|
||||
tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u32 *)(dst+i), tempDst);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u32 *)(dst+i), tempDst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceConvertBuffer555To8888Opaque_SSE2<false, false>(src, dst, pixCount);
|
||||
|
@ -660,6 +881,46 @@ size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u
|
|||
return ColorspaceCopyBuffer32_SSE2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
// 16-bit pixels, no R/B swap, aligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer16_SSE2<false, false>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 16-bit pixels, with R/B swap, aligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer16_SSE2<true, false>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 16-bit pixels, no R/B swap, unaligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer16_SSE2<false, true>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 16-bit pixels, with R/B swap, unaligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer16_SSE2<true, true>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 32-bit pixels, no R/B swap, aligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer32_SSE2<false, false>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 32-bit pixels, with R/B swap, aligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer32_SSE2<true, false>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 32-bit pixels, no R/B swap, unaligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer32_SSE2<false, true>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
// 32-bit pixels, with R/B swap, unaligned loads/stores.
// Forwards to the SSE2 buffer routine and returns the pixel count it reports.
size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
{
	const size_t pixelsHandled = ColorspaceApplyIntensityToBuffer32_SSE2<true, true>(dst, pixCount, intensity);
	return pixelsHandled;
}
|
||||
|
||||
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
|
@ -693,4 +954,10 @@ template v128u16 ColorspaceCopy16_SSE2<false>(const v128u16 &src);
|
|||
template v128u32 ColorspaceCopy32_SSE2<true>(const v128u32 &src);
|
||||
template v128u32 ColorspaceCopy32_SSE2<false>(const v128u32 &src);
|
||||
|
||||
// Explicit instantiations of the per-vector intensity helpers for both values of SWAP_RB.
template v128u16 ColorspaceApplyIntensity16_SSE2<true>(const v128u16 &src, float intensity);
|
||||
template v128u16 ColorspaceApplyIntensity16_SSE2<false>(const v128u16 &src, float intensity);
|
||||
|
||||
template v128u32 ColorspaceApplyIntensity32_SSE2<true>(const v128u32 &src, float intensity);
|
||||
template v128u32 ColorspaceApplyIntensity32_SSE2<false>(const v128u32 &src, float intensity);
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
|
@ -37,6 +37,9 @@ template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128
|
|||
template<bool SWAP_RB> v128u16 ColorspaceCopy16_SSE2(const v128u16 &src);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
|
||||
|
||||
// Per-vector intensity helpers: apply `intensity` to one register of packed 16-bit or
// 32-bit pixels and return the result. SWAP_RB selects an R/B exchange before scaling.
template<bool SWAP_RB> v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity);
|
||||
|
||||
class ColorspaceHandler_SSE2 : public ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -82,6 +85,16 @@ public:
|
|||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
// Apply `intensity` in place to a buffer of 16-bit pixels. The name suffixes select the
// SWAP_RB and IS_UNALIGNED template arguments of ColorspaceApplyIntensityToBuffer16_SSE2,
// whose returned pixel count is passed back to the caller.
size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
|
||||
// Apply `intensity` in place to a buffer of 32-bit pixels. The name suffixes select the
// SWAP_RB and IS_UNALIGNED template arguments of ColorspaceApplyIntensityToBuffer32_SSE2,
// whose returned pixel count is passed back to the caller.
size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
Loading…
Reference in New Issue