OpenGL Renderer: Standardize 32-bit color red-blue swapping in OpenGLRenderer::_FlushFramebufferConvertOnCPU().
This commit is contained in:
parent
d16785eba8
commit
8600466498
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
Copyright (C) 2006 yopyop
|
||||
Copyright (C) 2006-2007 shash
|
||||
Copyright (C) 2008-2016 DeSmuME team
|
||||
Copyright (C) 2008-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -1201,9 +1201,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) );
|
||||
_mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
|
@ -1230,29 +1228,18 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
{
|
||||
if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) )
|
||||
{
|
||||
#ifdef ENABLE_SSSE3
|
||||
#ifdef ENABLE_SSE2
|
||||
const size_t ssePixCount = pixCount - (pixCount % 8);
|
||||
for (; i < ssePixCount; i += 8)
|
||||
{
|
||||
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
|
||||
const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) );
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), srcColorLo);
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), srcColorHi);
|
||||
}
|
||||
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorLo));
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorHi));
|
||||
_mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
|
@ -1266,33 +1253,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
}
|
||||
else if (dstFramebufferMain != NULL)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
#ifdef ENABLE_SSSE3
|
||||
const size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
for (; i < ssePixCount; i += 4)
|
||||
{
|
||||
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dstFramebufferMain[i].r = srcFramebuffer[i].b;
|
||||
dstFramebufferMain[i].g = srcFramebuffer[i].g;
|
||||
dstFramebufferMain[i].b = srcFramebuffer[i].r;
|
||||
dstFramebufferMain[i].a = srcFramebuffer[i].a;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dstFramebufferMain, srcFramebuffer, this->_framebufferWidth * this->_framebufferHeight * sizeof(FragmentColor));
|
||||
}
|
||||
|
||||
ColorspaceCopyBuffer32<SWAP_RB, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
|
||||
this->_renderNeedsFlushMain = false;
|
||||
}
|
||||
else
|
||||
|
@ -1324,9 +1285,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebufferMain + iw + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) );
|
||||
_mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; x < pixCount; x++, ir++, iw++)
|
||||
|
@ -1372,29 +1331,11 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0));
|
||||
const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
#ifdef ENABLE_SSSE3
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
|
||||
#else
|
||||
const __m128i swappedLo = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x000000FF)), 16)) );
|
||||
const __m128i swappedHi = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x000000FF)), 16)) );
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), swappedLo);
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), swappedHi);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), srcColorLo);
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), srcColorHi);
|
||||
}
|
||||
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + iw + 0), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorLo));
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + iw + 4), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorHi));
|
||||
_mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; x < pixCount; x++, ir++, iw++)
|
||||
|
@ -1409,45 +1350,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
}
|
||||
else if (dstFramebufferMain != NULL)
|
||||
{
|
||||
const FragmentColor *__restrict srcPtr = srcFramebuffer;
|
||||
FragmentColor *__restrict dstPtr = dstFramebufferMain + ((this->_framebufferHeight - 1) * this->_framebufferWidth);
|
||||
|
||||
for (size_t y = 0; y < this->_framebufferHeight; y++)
|
||||
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
#ifdef ENABLE_SSE2
|
||||
const size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
for (; i < ssePixCount; i += 4)
|
||||
{
|
||||
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
|
||||
#ifdef ENABLE_SSSE3
|
||||
_mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
|
||||
#else
|
||||
const __m128i swappedColor = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColor, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x000000FF)), 16)));
|
||||
_mm_store_si128((__m128i *)(dstFramebufferMain + i), swappedColor);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (size_t x = 0; x < this->_framebufferWidth; x++)
|
||||
{
|
||||
dstPtr[x].r = srcPtr[x].b;
|
||||
dstPtr[x].g = srcPtr[x].g;
|
||||
dstPtr[x].b = srcPtr[x].r;
|
||||
dstPtr[x].a = srcPtr[x].a;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dstPtr, srcPtr, this->_framebufferWidth * sizeof(FragmentColor));
|
||||
}
|
||||
|
||||
srcPtr += this->_framebufferWidth;
|
||||
dstPtr -= this->_framebufferWidth;
|
||||
ColorspaceCopyBuffer32<SWAP_RB, false>((u32 *)srcFramebuffer + ir, (u32 *)dstFramebufferMain + iw, pixCount);
|
||||
}
|
||||
|
||||
this->_renderNeedsFlushMain = false;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
|
||||
#include "colorspacehandler.h"
|
||||
#include <string.h>
|
||||
|
||||
#if defined(ENABLE_AVX2)
|
||||
#include "colorspacehandler_AVX2.cpp"
|
||||
|
@ -54,6 +55,7 @@
|
|||
static const ColorspaceHandler csh;
|
||||
#endif
|
||||
|
||||
CACHE_ALIGN u16 color_5551_swap_rb[32768];
|
||||
CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
|
||||
CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
|
||||
CACHE_ALIGN u32 color_555_to_666[32768];
|
||||
|
@ -120,6 +122,7 @@ void ColorspaceHandlerInit()
|
|||
|
||||
if (needInitTables)
|
||||
{
|
||||
#define RGB16_SWAP_RB_BITLOGIC(col) ( (((col)&0x001F)<<10) | ((col)&0x03E0) | (((col)&0x7C00)>>10) | ((col)&0x8000) )
|
||||
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
|
||||
#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) )
|
||||
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
|
||||
|
@ -127,6 +130,8 @@ void ColorspaceHandlerInit()
|
|||
|
||||
for (size_t i = 0; i < 32768; i++)
|
||||
{
|
||||
color_5551_swap_rb[i] = LE_TO_LOCAL_16( RGB16_SWAP_RB_BITLOGIC(i) );
|
||||
|
||||
color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) );
|
||||
color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 );
|
||||
color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 );
|
||||
|
@ -474,6 +479,86 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi
|
|||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCount * sizeof(u16));
|
||||
return;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.CopyBuffer16_SwapRB_IsUnaligned(src, dst, pixCountVector);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.CopyBuffer16_SwapRB(src, dst, pixCountVector);
|
||||
}
|
||||
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
|
||||
#endif // USEMANUALVECTORIZATION
|
||||
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ColorspaceCopy16<SWAP_RB>(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCount * sizeof(u32));
|
||||
return;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
i = csh.CopyBuffer32_SwapRB_IsUnaligned(src, dst, pixCountVector);
|
||||
}
|
||||
else
|
||||
{
|
||||
i = csh.CopyBuffer32_SwapRB(src, dst, pixCountVector);
|
||||
}
|
||||
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
|
||||
#endif // USEMANUALVECTORIZATION
|
||||
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ColorspaceCopy32<SWAP_RB>(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
size_t i = 0;
|
||||
|
@ -712,6 +797,40 @@ size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const
|
|||
return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ColorspaceCopy16<true>(src[i]);
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return this->CopyBuffer16_SwapRB(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ColorspaceCopy32<true>(src[i]);
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return this->CopyBuffer32_SwapRB(src, dst, pixCount);
|
||||
}
|
||||
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
|
@ -746,3 +865,13 @@ template void ColorspaceConvertBuffer888XTo8888Opaque<true, true>(const u32 *src
|
|||
template void ColorspaceConvertBuffer888XTo8888Opaque<true, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer888XTo8888Opaque<false, true>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceConvertBuffer888XTo8888Opaque<false, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
||||
template void ColorspaceCopyBuffer16<true, true>(const u16 *src, u16 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer16<true, false>(const u16 *src, u16 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer16<false, true>(const u16 *src, u16 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer16<false, false>(const u16 *src, u16 *dst, size_t pixCount);
|
||||
|
||||
template void ColorspaceCopyBuffer32<true, true>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer32<true, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer32<false, true>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
template void ColorspaceCopyBuffer32<false, false>(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -96,6 +96,7 @@ extern CACHE_ALIGN const u8 material_3bit_to_5bit[8];
|
|||
extern CACHE_ALIGN const u8 material_3bit_to_6bit[8];
|
||||
extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
|
||||
|
||||
extern CACHE_ALIGN u16 color_5551_swap_rb[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_666[32768];
|
||||
|
@ -103,6 +104,7 @@ extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
|
|||
extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_888[32768];
|
||||
|
||||
#define COLOR5551_SWAP_RB(col) (color_5551_swap_rb[(col)]) // Swaps the red-blue colors of a 16-bit RGBA5551 color
|
||||
#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
|
||||
#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped
|
||||
#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color
|
||||
|
@ -236,6 +238,33 @@ FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor)
|
|||
return ColorspaceConvert888XTo8888Opaque<SWAP_RB>(srcColorComponent);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u16 ColorspaceCopy16(u16 srcColor)
|
||||
{
|
||||
return (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u32 ColorspaceCopy32(FragmentColor srcColor)
|
||||
{
|
||||
FragmentColor outColor;
|
||||
outColor.r = (SWAP_RB) ? srcColor.b : srcColor.r;
|
||||
outColor.g = srcColor.g;
|
||||
outColor.b = (SWAP_RB) ? srcColor.r : srcColor.b;
|
||||
outColor.a = srcColor.a;
|
||||
|
||||
return outColor.color;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE u32 ColorspaceCopy32(u32 srcColor)
|
||||
{
|
||||
FragmentColor srcColorComponent;
|
||||
srcColorComponent.color = srcColor;
|
||||
|
||||
return ColorspaceCopy32<SWAP_RB>(srcColorComponent);
|
||||
}
|
||||
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
@ -244,6 +273,9 @@ template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To5551
|
|||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
|
||||
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
|
||||
|
||||
class ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -283,6 +315,12 @@ public:
|
|||
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
};
|
||||
|
||||
FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -225,6 +225,28 @@ FORCEINLINE v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src)
|
|||
return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v256u16 ColorspaceCopy16_AVX2(const v256u16 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
return _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) );
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
return _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256)
|
||||
{
|
||||
|
@ -377,6 +399,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si
|
|||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec256 * sizeof(u16));
|
||||
return pixCountVec256;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
{
|
||||
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2<SWAP_RB>(src_vec256));
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2<SWAP_RB>(src_vec256));
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec256 * sizeof(u32));
|
||||
return pixCountVec256;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
{
|
||||
v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2<SWAP_RB>(src_vec256));
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2<SWAP_RB>(src_vec256));
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceConvertBuffer555To8888Opaque_AVX2<false, false>(src, dst, pixCount);
|
||||
|
@ -517,6 +595,26 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
|
|||
return ColorspaceConvertBuffer888XTo8888Opaque_AVX2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer16_AVX2<true, false>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer16_AVX2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer32_AVX2<true, false>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer32_AVX2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
|
@ -544,4 +642,10 @@ template v256u16 ColorspaceConvert6665To5551_AVX2<false>(const v256u32 &srcLo, c
|
|||
template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<true>(const v256u32 &src);
|
||||
template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<false>(const v256u32 &src);
|
||||
|
||||
template v256u16 ColorspaceCopy16_AVX2<true>(const v256u16 &src);
|
||||
template v256u16 ColorspaceCopy16_AVX2<false>(const v256u16 &src);
|
||||
|
||||
template v256u32 ColorspaceCopy32_AVX2<true>(const v256u32 &src);
|
||||
template v256u32 ColorspaceCopy32_AVX2<false>(const v256u32 &src);
|
||||
|
||||
#endif // ENABLE_AVX2
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -34,6 +34,9 @@ template<bool SWAP_RB> v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &s
|
|||
template<bool SWAP_RB> v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi);
|
||||
template<bool SWAP_RB> v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src);
|
||||
|
||||
template<bool SWAP_RB> v256u16 ColorspaceCopy16_AVX2(const v256u16 &src);
|
||||
template<bool SWAP_RB> v256u32 ColorspaceCopy32_AVX2(const v256u32 &src);
|
||||
|
||||
class ColorspaceHandler_AVX2 : public ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -73,6 +76,12 @@ public:
|
|||
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_AVX2
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -183,6 +183,28 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src
|
|||
return vec_or(src, vec_splat_u32(0xFF000000));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
return vec_or( vec_or(vec_sr(vec_and(src, vec_splat_u16(0x7C00)), vec_splat_u16(10)), vec_or(vec_and(src, vec_splat_u16(0x0E30)), vec_sl(vec_and(src, vec_splat_u16(0x001F)), vec_splat_u16(10)))), vec_and(src, vec_splat_u16(0x8000)) );
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
return vec_perm(src, src, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}));
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
|
||||
{
|
||||
|
@ -282,6 +304,44 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst,
|
|||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec128 * sizeof(u16));
|
||||
return pixCountVec128;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
{
|
||||
vec_st( ColorspaceCopy16_AltiVec<SWAP_RB>(vec_ld(0, src+i)), 0, dst+i );
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
size_t ColorspaceCopyBuffer32_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec128 * sizeof(u32));
|
||||
return pixCountVec128;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
{
|
||||
vec_st( ColorspaceCopy32_AltiVec<SWAP_RB>(vec_ld(0, src+i)), 0, dst+i );
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceConvertBuffer555To8888Opaque_AltiVec<false>(src, dst, pixCount);
|
||||
|
@ -352,6 +412,16 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32
|
|||
return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec<true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer16_AltiVec<true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer32_AltiVec<true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
|
@ -379,4 +449,10 @@ template v128u16 ColorspaceConvert6665To5551_AltiVec<false>(const v128u32 &srcLo
|
|||
template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<true>(const v128u32 &src);
|
||||
template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<false>(const v128u32 &src);
|
||||
|
||||
template v128u16 ColorspaceCopy16_AltiVec<true>(const v128u16 &src);
|
||||
template v128u16 ColorspaceCopy16_AltiVec<false>(const v128u16 &src);
|
||||
|
||||
template v128u32 ColorspaceCopy32_AltiVec<true>(const v128u32 &src);
|
||||
template v128u32 ColorspaceCopy32_AltiVec<false>(const v128u32 &src);
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -34,6 +34,9 @@ template<bool SWAP_RB> v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32
|
|||
template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src);
|
||||
|
||||
template<bool SWAP_RB> v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src);
|
||||
|
||||
// AltiVec has very poor support for dealing with unaligned addresses (it's possible, just
|
||||
// very obtuse), so we're not even going to bother dealing with any unaligned addresses.
|
||||
class ColorspaceHandler_AltiVec : public ColorspaceHandler
|
||||
|
@ -61,6 +64,10 @@ public:
|
|||
|
||||
size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_ALTIVEC
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -265,6 +265,32 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src)
|
|||
return _mm_or_si128(src, _mm_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v128u16 ColorspaceCopy16_SSE2(const v128u16 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
return _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) );
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src)
|
||||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
#ifdef ENABLE_SSSE3
|
||||
return _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
|
||||
#else
|
||||
return _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) );
|
||||
#endif
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
|
||||
{
|
||||
|
@ -417,6 +443,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si
|
|||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec128 * sizeof(u16));
|
||||
return pixCountVec128;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
{
|
||||
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2<SWAP_RB>(src_vec128));
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2<SWAP_RB>(src_vec128));
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128)
|
||||
{
|
||||
if (!SWAP_RB)
|
||||
{
|
||||
memcpy(dst, src, pixCountVec128 * sizeof(u32));
|
||||
return pixCountVec128;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
{
|
||||
v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2<SWAP_RB>(src_vec128));
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2<SWAP_RB>(src_vec128));
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceConvertBuffer555To8888Opaque_SSE2<false, false>(src, dst, pixCount);
|
||||
|
@ -557,6 +639,26 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
|
|||
return ColorspaceConvertBuffer888XTo8888Opaque_SSE2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer16_SSE2<true, false>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer16_SSE2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer32_SSE2<true, false>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
|
||||
{
|
||||
return ColorspaceCopyBuffer32_SSE2<true, true>(src, dst, pixCount);
|
||||
}
|
||||
|
||||
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
|
@ -584,4 +686,10 @@ template v128u16 ColorspaceConvert6665To5551_SSE2<false>(const v128u32 &srcLo, c
|
|||
template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<true>(const v128u32 &src);
|
||||
template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<false>(const v128u32 &src);
|
||||
|
||||
template v128u16 ColorspaceCopy16_SSE2<true>(const v128u16 &src);
|
||||
template v128u16 ColorspaceCopy16_SSE2<false>(const v128u16 &src);
|
||||
|
||||
template v128u32 ColorspaceCopy32_SSE2<true>(const v128u32 &src);
|
||||
template v128u32 ColorspaceCopy32_SSE2<false>(const v128u32 &src);
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016 DeSmuME team
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -34,6 +34,9 @@ template<bool SWAP_RB> v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &s
|
|||
template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src);
|
||||
|
||||
template<bool SWAP_RB> v128u16 ColorspaceCopy16_SSE2(const v128u16 &src);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
|
||||
|
||||
class ColorspaceHandler_SSE2 : public ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
|
@ -73,6 +76,12 @@ public:
|
|||
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
Loading…
Reference in New Issue