OpenGL Renderer: Standardize 32-bit color red-blue swapping in OpenGLRenderer::_FlushFramebufferConvertOnCPU().

This commit is contained in:
rogerman 2017-07-27 22:04:42 -07:00
parent d16785eba8
commit 8600466498
9 changed files with 497 additions and 112 deletions

113
desmume/src/OGLRender.cpp Normal file → Executable file
View File

@ -1,7 +1,7 @@
/* /*
Copyright (C) 2006 yopyop Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash Copyright (C) 2006-2007 shash
Copyright (C) 2008-2016 DeSmuME team Copyright (C) 2008-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -1201,9 +1201,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE #pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
@ -1230,29 +1228,18 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
{ {
if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) ) if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) )
{ {
#ifdef ENABLE_SSSE3 #ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4));
if (SWAP_RB) _mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorLo));
{ _mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorHi));
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) );
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) );
}
else
{
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), srcColorLo);
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), srcColorHi);
}
_mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSSE3
#pragma LOOPVECTORIZE_DISABLE #pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
@ -1266,33 +1253,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
} }
else if (dstFramebufferMain != NULL) else if (dstFramebufferMain != NULL)
{ {
if (SWAP_RB) ColorspaceCopyBuffer32<SWAP_RB, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
{
#ifdef ENABLE_SSSE3
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
_mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
}
#endif
#ifdef ENABLE_SSSE3
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++)
{
dstFramebufferMain[i].r = srcFramebuffer[i].b;
dstFramebufferMain[i].g = srcFramebuffer[i].g;
dstFramebufferMain[i].b = srcFramebuffer[i].r;
dstFramebufferMain[i].a = srcFramebuffer[i].a;
}
}
else
{
memcpy(dstFramebufferMain, srcFramebuffer, this->_framebufferWidth * this->_framebufferHeight * sizeof(FragmentColor));
}
this->_renderNeedsFlushMain = false; this->_renderNeedsFlushMain = false;
} }
else else
@ -1324,9 +1285,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebufferMain + iw + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebufferMain + iw + 4), ColorspaceConvert8888To6665_SSE2<SWAP_RB>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE #pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
@ -1372,29 +1331,11 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0));
const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4));
if (SWAP_RB) _mm_store_si128((__m128i *)(dstFramebufferMain + iw + 0), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorLo));
{ _mm_store_si128((__m128i *)(dstFramebufferMain + iw + 4), ColorspaceCopy32_SSE2<SWAP_RB>(srcColorHi));
#ifdef ENABLE_SSSE3
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
_mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
#else
const __m128i swappedLo = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x000000FF)), 16)) );
const __m128i swappedHi = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x000000FF)), 16)) );
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), swappedLo);
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), swappedHi);
#endif
}
else
{
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), srcColorLo);
_mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), srcColorHi);
}
_mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2<SWAP_RB>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE #pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
@ -1409,45 +1350,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
} }
else if (dstFramebufferMain != NULL) else if (dstFramebufferMain != NULL)
{ {
const FragmentColor *__restrict srcPtr = srcFramebuffer; for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth)
FragmentColor *__restrict dstPtr = dstFramebufferMain + ((this->_framebufferHeight - 1) * this->_framebufferWidth);
for (size_t y = 0; y < this->_framebufferHeight; y++)
{ {
if (SWAP_RB) ColorspaceCopyBuffer32<SWAP_RB, false>((u32 *)srcFramebuffer + ir, (u32 *)dstFramebufferMain + iw, pixCount);
{
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
#ifdef ENABLE_SSSE3
_mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) );
#else
const __m128i swappedColor = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColor, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x000000FF)), 16)));
_mm_store_si128((__m128i *)(dstFramebufferMain + i), swappedColor);
#endif
}
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (size_t x = 0; x < this->_framebufferWidth; x++)
{
dstPtr[x].r = srcPtr[x].b;
dstPtr[x].g = srcPtr[x].g;
dstPtr[x].b = srcPtr[x].r;
dstPtr[x].a = srcPtr[x].a;
}
}
else
{
memcpy(dstPtr, srcPtr, this->_framebufferWidth * sizeof(FragmentColor));
}
srcPtr += this->_framebufferWidth;
dstPtr -= this->_framebufferWidth;
} }
this->_renderNeedsFlushMain = false; this->_renderNeedsFlushMain = false;

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -16,6 +16,7 @@
*/ */
#include "colorspacehandler.h" #include "colorspacehandler.h"
#include <string.h>
#if defined(ENABLE_AVX2) #if defined(ENABLE_AVX2)
#include "colorspacehandler_AVX2.cpp" #include "colorspacehandler_AVX2.cpp"
@ -54,6 +55,7 @@
static const ColorspaceHandler csh; static const ColorspaceHandler csh;
#endif #endif
CACHE_ALIGN u16 color_5551_swap_rb[32768];
CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
CACHE_ALIGN u32 color_555_to_666[32768]; CACHE_ALIGN u32 color_555_to_666[32768];
@ -120,6 +122,7 @@ void ColorspaceHandlerInit()
if (needInitTables) if (needInitTables)
{ {
#define RGB16_SWAP_RB_BITLOGIC(col) ( (((col)&0x001F)<<10) | ((col)&0x03E0) | (((col)&0x7C00)>>10) | ((col)&0x8000) )
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) #define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) #define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) )
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) #define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
@ -127,6 +130,8 @@ void ColorspaceHandlerInit()
for (size_t i = 0; i < 32768; i++) for (size_t i = 0; i < 32768; i++)
{ {
color_5551_swap_rb[i] = LE_TO_LOCAL_16( RGB16_SWAP_RB_BITLOGIC(i) );
color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) );
color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 );
color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 );
@ -474,6 +479,86 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi
} }
} }
// Copies pixCount 16-bit pixels from src to dst.
//
// SWAP_RB == false: the buffer is duplicated verbatim with memcpy().
// SWAP_RB == true:  each pixel is passed through ColorspaceCopy16<SWAP_RB>().
//                   When USEMANUALVECTORIZATION is defined, the largest
//                   vector-sized multiple of pixCount is first handed to the
//                   file-level csh handler (the IS_UNALIGNED flag selects which
//                   handler method is called), and the scalar loop finishes
//                   whatever the handler did not cover.
template <bool SWAP_RB, bool IS_UNALIGNED>
void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount)
{
	if (SWAP_RB)
	{
		size_t pixIndex = 0;
		
#ifdef USEMANUALVECTORIZATION
		
#if defined(USEVECTORSIZE_512)
		const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
		const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
		const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
		
		// The handler returns the index of the first pixel it did not process.
		pixIndex = (IS_UNALIGNED) ? csh.CopyBuffer16_SwapRB_IsUnaligned(src, dst, pixCountVector)
		                          : csh.CopyBuffer16_SwapRB(src, dst, pixCountVector);
		
#pragma LOOPVECTORIZE_DISABLE
		
#endif // USEMANUALVECTORIZATION
		
		// Scalar tail (or the whole buffer when manual vectorization is off).
		for (; pixIndex < pixCount; pixIndex++)
		{
			dst[pixIndex] = ColorspaceCopy16<SWAP_RB>(src[pixIndex]);
		}
	}
	else
	{
		memcpy(dst, src, pixCount * sizeof(u16));
	}
}
// Copies pixCount 32-bit pixels from src to dst.
//
// SWAP_RB == false: the buffer is duplicated verbatim with memcpy().
// SWAP_RB == true:  each pixel is passed through ColorspaceCopy32<SWAP_RB>().
//                   When USEMANUALVECTORIZATION is defined, the largest
//                   vector-sized multiple of pixCount is first handed to the
//                   file-level csh handler (the IS_UNALIGNED flag selects which
//                   handler method is called), and the scalar loop finishes
//                   whatever the handler did not cover.
template <bool SWAP_RB, bool IS_UNALIGNED>
void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
{
	if (SWAP_RB)
	{
		size_t pixIndex = 0;
		
#ifdef USEMANUALVECTORIZATION
		
#if defined(USEVECTORSIZE_512)
		const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256)
		const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_128)
		const size_t pixCountVector = pixCount - (pixCount % 4);
#endif
		
		// The handler returns the index of the first pixel it did not process.
		pixIndex = (IS_UNALIGNED) ? csh.CopyBuffer32_SwapRB_IsUnaligned(src, dst, pixCountVector)
		                          : csh.CopyBuffer32_SwapRB(src, dst, pixCountVector);
		
#pragma LOOPVECTORIZE_DISABLE
		
#endif // USEMANUALVECTORIZATION
		
		// Scalar tail (or the whole buffer when manual vectorization is off).
		for (; pixIndex < pixCount; pixIndex++)
		{
			dst[pixIndex] = ColorspaceCopy32<SWAP_RB>(src[pixIndex]);
		}
	}
	else
	{
		memcpy(dst, src, pixCount * sizeof(u32));
	}
}
size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
{ {
size_t i = 0; size_t i = 0;
@ -712,6 +797,40 @@ size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const
return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount);
} }
// Scalar implementation: writes ColorspaceCopy16<true>() of every src pixel to
// dst and returns the number of pixels processed.
size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{
	size_t pixIndex = 0;
	
	while (pixIndex < pixCount)
	{
		dst[pixIndex] = ColorspaceCopy16<true>(src[pixIndex]);
		pixIndex++;
	}
	
	return pixIndex;
}
// The scalar handler has a single code path, so the unaligned entry point
// simply forwards to CopyBuffer16_SwapRB() and returns its result.
size_t ColorspaceHandler::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = this->CopyBuffer16_SwapRB(src, dst, pixCount);
	return pixelsProcessed;
}
// Scalar implementation: writes ColorspaceCopy32<true>() of every src pixel to
// dst and returns the number of pixels processed.
size_t ColorspaceHandler::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
{
	size_t pixIndex = 0;
	
	while (pixIndex < pixCount)
	{
		dst[pixIndex] = ColorspaceCopy32<true>(src[pixIndex]);
		pixIndex++;
	}
	
	return pixIndex;
}
// The scalar handler has a single code path, so the unaligned entry point
// simply forwards to CopyBuffer32_SwapRB() and returns its result.
size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = this->CopyBuffer32_SwapRB(src, dst, pixCount);
	return pixelsProcessed;
}
template void ColorspaceConvertBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ColorspaceConvertBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
@ -746,3 +865,13 @@ template void ColorspaceConvertBuffer888XTo8888Opaque<true, true>(const u32 *src
template void ColorspaceConvertBuffer888XTo8888Opaque<true, false>(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque<true, false>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo8888Opaque<false, true>(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque<false, true>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceConvertBuffer888XTo8888Opaque<false, false>(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque<false, false>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<true, true>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<true, false>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<false, true>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer16<false, false>(const u16 *src, u16 *dst, size_t pixCount);
template void ColorspaceCopyBuffer32<true, true>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceCopyBuffer32<true, false>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceCopyBuffer32<false, true>(const u32 *src, u32 *dst, size_t pixCount);
template void ColorspaceCopyBuffer32<false, false>(const u32 *src, u32 *dst, size_t pixCount);

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -96,6 +96,7 @@ extern CACHE_ALIGN const u8 material_3bit_to_5bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; extern CACHE_ALIGN const u8 material_3bit_to_6bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
extern CACHE_ALIGN u16 color_5551_swap_rb[32768];
extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
extern CACHE_ALIGN u32 color_555_to_666[32768]; extern CACHE_ALIGN u32 color_555_to_666[32768];
@ -103,6 +104,7 @@ extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
extern CACHE_ALIGN u32 color_555_to_888[32768]; extern CACHE_ALIGN u32 color_555_to_888[32768];
#define COLOR5551_SWAP_RB(col) (color_5551_swap_rb[(col)]) // Swaps the red-blue colors of a 16-bit RGBA5551 color
#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color #define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped #define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped
#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color #define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color
@ -236,6 +238,33 @@ FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor)
return ColorspaceConvert888XTo8888Opaque<SWAP_RB>(srcColorComponent); return ColorspaceConvert888XTo8888Opaque<SWAP_RB>(srcColorComponent);
} }
// Returns srcColor unchanged when SWAP_RB is false; otherwise returns the
// color with its two outer 5-bit fields exchanged, using the
// color_5551_swap_rb lookup table.
//
// The lookup table is declared with 32768 entries, so it can only be indexed
// with the low 15 bits of the color. Indexing it with a full u16 would read
// past the end of the table whenever bit 15 is set, so the index is masked and
// bit 15 is carried over to the result separately.
//
// NOTE(review): the table entries are stored through LE_TO_LOCAL_16(). On a
// big-endian build, confirm that OR-ing bit 15 in its native position is the
// byte order callers expect.
template <bool SWAP_RB>
FORCEINLINE u16 ColorspaceCopy16(u16 srcColor)
{
	if (!SWAP_RB)
	{
		return srcColor;
	}
	
	const u16 colorBits = srcColor & 0x7FFF; // always a valid table index
	const u16 topBit    = srcColor & 0x8000; // passed through untouched
	
	return (u16)(COLOR5551_SWAP_RB(colorBits) | topBit);
}
// Builds a new 32-bit color from srcColor's components. The g and a components
// are always copied straight across; r and b are exchanged when SWAP_RB is
// true and copied straight across otherwise. Returns the packed .color value.
template <bool SWAP_RB>
FORCEINLINE u32 ColorspaceCopy32(FragmentColor srcColor)
{
	FragmentColor result;
	
	if (SWAP_RB)
	{
		result.r = srcColor.b;
		result.b = srcColor.r;
	}
	else
	{
		result.r = srcColor.r;
		result.b = srcColor.b;
	}
	
	result.g = srcColor.g;
	result.a = srcColor.a;
	
	return result.color;
}
// Convenience overload taking a packed 32-bit value: wraps it in a
// FragmentColor and defers to the FragmentColor overload of ColorspaceCopy32.
template <bool SWAP_RB>
FORCEINLINE u32 ColorspaceCopy32(u32 srcColor)
{
	FragmentColor unpacked;
	unpacked.color = srcColor;
	
	const u32 packedResult = ColorspaceCopy32<SWAP_RB>(unpacked);
	return packedResult;
}
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
@ -244,6 +273,9 @@ template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To5551
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
class ColorspaceHandler class ColorspaceHandler
{ {
public: public:
@ -283,6 +315,12 @@ public:
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
}; };
FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a)

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -225,6 +225,28 @@ FORCEINLINE v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src)
return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000)); return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000));
} }
// Returns src unchanged when SWAP_RB is false. Otherwise, for each of the
// sixteen 16-bit lanes, exchanges bits 0-4 with bits 10-14 while keeping
// bits 5-9 and bit 15 in place, which is the same layout as the scalar
// RGB16_SWAP_RB_BITLOGIC() macro.
//
// The middle-field mask must be 0x03E0 (bits 5-9). The previous value, 0x0E30,
// dropped bits 6-8 and leaked bits 10-11 into the result.
template <bool SWAP_RB>
FORCEINLINE v256u16 ColorspaceCopy16_AVX2(const v256u16 &src)
{
	if (SWAP_RB)
	{
		const v256u16 highToLow = _mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10);
		const v256u16 middle    = _mm256_and_si256(src, _mm256_set1_epi16(0x03E0));
		const v256u16 lowToHigh = _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10);
		const v256u16 topBit    = _mm256_and_si256(src, _mm256_set1_epi16(0x8000));
		
		return _mm256_or_si256( _mm256_or_si256(highToLow, _mm256_or_si256(middle, lowToHigh)), topBit );
	}
	
	return src;
}
// Returns src unchanged when SWAP_RB is false. Otherwise byte-shuffles every
// 32-bit element so that its byte 0 and byte 2 trade places while bytes 1 and
// 3 stay where they are.
template <bool SWAP_RB>
FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src)
{
	if (!SWAP_RB)
	{
		return src;
	}
	
	// Per-dword byte selector: {2,1,0,3} repeated for all eight dwords.
	const v256u32 swapByte0And2 = _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2);
	
	return _mm256_shuffle_epi8(src, swapByte0And2);
}
template <bool SWAP_RB, bool IS_UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256)
{ {
@ -377,6 +399,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si
return i; return i;
} }
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256)
{
if (!SWAP_RB)
{
memcpy(dst, src, pixCountVec256 * sizeof(u16));
return pixCountVec256;
}
size_t i = 0;
for (; i < pixCountVec256; i+=16)
{
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
if (IS_UNALIGNED)
{
_mm256_storeu_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2<SWAP_RB>(src_vec256));
}
else
{
_mm256_store_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2<SWAP_RB>(src_vec256));
}
}
return i;
}
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256)
{
if (!SWAP_RB)
{
memcpy(dst, src, pixCountVec256 * sizeof(u32));
return pixCountVec256;
}
size_t i = 0;
for (; i < pixCountVec256; i+=8)
{
v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i));
if (IS_UNALIGNED)
{
_mm256_storeu_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2<SWAP_RB>(src_vec256));
}
else
{
_mm256_store_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2<SWAP_RB>(src_vec256));
}
}
return i;
}
size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
{ {
return ColorspaceConvertBuffer555To8888Opaque_AVX2<false, false>(src, dst, pixCount); return ColorspaceConvertBuffer555To8888Opaque_AVX2<false, false>(src, dst, pixCount);
@ -517,6 +595,26 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
return ColorspaceConvertBuffer888XTo8888Opaque_AVX2<true, true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_AVX2<true, true>(src, dst, pixCount);
} }
// 16-bit copy with the swap enabled, using the aligned load/store variant of
// ColorspaceCopyBuffer16_AVX2. Returns that function's result.
size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = ColorspaceCopyBuffer16_AVX2<true, false>(src, dst, pixCount);
	return pixelsProcessed;
}
// 16-bit copy with the swap enabled, using the unaligned load/store variant of
// ColorspaceCopyBuffer16_AVX2. Returns that function's result.
size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = ColorspaceCopyBuffer16_AVX2<true, true>(src, dst, pixCount);
	return pixelsProcessed;
}
// 32-bit copy with the swap enabled, using the aligned load/store variant of
// ColorspaceCopyBuffer32_AVX2. Returns that function's result.
size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = ColorspaceCopyBuffer32_AVX2<true, false>(src, dst, pixCount);
	return pixelsProcessed;
}
// 32-bit copy with the swap enabled, using the unaligned load/store variant of
// ColorspaceCopyBuffer32_AVX2. Returns that function's result.
size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t pixelsProcessed = ColorspaceCopyBuffer32_AVX2<true, true>(src, dst, pixCount);
	return pixelsProcessed;
}
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
@ -544,4 +642,10 @@ template v256u16 ColorspaceConvert6665To5551_AVX2<false>(const v256u32 &srcLo, c
template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<true>(const v256u32 &src); template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<true>(const v256u32 &src);
template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<false>(const v256u32 &src); template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2<false>(const v256u32 &src);
template v256u16 ColorspaceCopy16_AVX2<true>(const v256u16 &src);
template v256u16 ColorspaceCopy16_AVX2<false>(const v256u16 &src);
template v256u32 ColorspaceCopy32_AVX2<true>(const v256u32 &src);
template v256u32 ColorspaceCopy32_AVX2<false>(const v256u32 &src);
#endif // ENABLE_AVX2 #endif // ENABLE_AVX2

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -34,6 +34,9 @@ template<bool SWAP_RB> v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &s
template<bool SWAP_RB> v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); template<bool SWAP_RB> v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi);
template<bool SWAP_RB> v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); template<bool SWAP_RB> v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src);
template<bool SWAP_RB> v256u16 ColorspaceCopy16_AVX2(const v256u16 &src);
template<bool SWAP_RB> v256u32 ColorspaceCopy32_AVX2(const v256u32 &src);
class ColorspaceHandler_AVX2 : public ColorspaceHandler class ColorspaceHandler_AVX2 : public ColorspaceHandler
{ {
public: public:
@ -73,6 +76,12 @@ public:
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
}; };
#endif // ENABLE_AVX2 #endif // ENABLE_AVX2

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -183,6 +183,28 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src
return vec_or(src, vec_splat_u32(0xFF000000)); return vec_or(src, vec_splat_u32(0xFF000000));
} }
// Copies one vector of 16-bit pixels, optionally exchanging the two outer
// 5-bit color fields of every pixel.
//
// SWAP_RB == true : bits 0-4 and bits 10-14 of each 16-bit lane trade places,
//                   while the middle 5-bit field (bits 5-9) and the top bit
//                   (bit 15) are kept where they are.
// SWAP_RB == false: the input vector is returned untouched.
//
// src     : vector of 16-bit pixels.
// returns : the (optionally swapped) vector of 16-bit pixels.
template <bool SWAP_RB>
FORCEINLINE v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src)
{
	if (!SWAP_RB)
	{
		return src;
	}
	
	// NOTE(review): vec_splat_u16() is documented as taking a 5-bit signed
	// literal; confirm that these wide mask constants build on the target
	// toolchain, or replace them with vector literals.
	const v128u16 highFieldMovedLow = vec_sr(vec_and(src, vec_splat_u16(0x7C00)), vec_splat_u16(10));
	const v128u16 lowFieldMovedHigh = vec_sl(vec_and(src, vec_splat_u16(0x001F)), vec_splat_u16(10));
	
	// The middle field occupies bits 5-9, so its mask is 0x03E0. The previous
	// mask (0x0E30) dropped bits 6-8 and let bits 4, 10 and 11 leak through
	// unswapped.
	const v128u16 middleField = vec_and(src, vec_splat_u16(0x03E0));
	const v128u16 topBit = vec_and(src, vec_splat_u16(0x8000));
	
	return vec_or( vec_or(highFieldMovedLow, vec_or(middleField, lowFieldMovedHigh)), topBit );
}
// Copies one vector of 32-bit pixels. With SWAP_RB enabled, bytes 0 and 2 of
// every 4-byte pixel are exchanged (bytes 1 and 3 stay in place); otherwise
// the input vector is returned unchanged.
template <bool SWAP_RB>
FORCEINLINE v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src)
{
	if (!SWAP_RB)
	{
		return src;
	}
	
	// Per-pixel byte order 2,1,0,3 repeated for all four pixels in the vector.
	return vec_perm(src, src, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}));
}
template <bool SWAP_RB> template <bool SWAP_RB>
static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
{ {
@ -282,6 +304,44 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst,
return i; return i;
} }
// Copies a buffer of 16-bit pixels from src to dst.
//
// Without SWAP_RB this is a straight memcpy() of pixCountVec128 pixels.
// With SWAP_RB the buffer is walked eight pixels (one 128-bit vector) at a
// time through ColorspaceCopy16_AltiVec<SWAP_RB>().
//
// Returns the number of pixels handled: pixCountVec128 for the memcpy() path,
// or the final loop index for the vector path.
// NOTE(review): the vector loop assumes pixCountVec128 is a multiple of 8 and
// that src/dst are suitably aligned for vec_ld()/vec_st() -- confirm at the call sites.
template <bool SWAP_RB>
size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128)
{
	if (SWAP_RB)
	{
		size_t pixIndex = 0;
		
		while (pixIndex < pixCountVec128)
		{
			vec_st( ColorspaceCopy16_AltiVec<SWAP_RB>(vec_ld(0, src+pixIndex)), 0, dst+pixIndex );
			pixIndex += 8;
		}
		
		return pixIndex;
	}
	
	memcpy(dst, src, pixCountVec128 * sizeof(u16));
	return pixCountVec128;
}
// Copies a buffer of 32-bit pixels from src to dst.
//
// Without SWAP_RB this is a straight memcpy() of pixCountVec128 pixels.
// With SWAP_RB the buffer is walked four pixels (one 128-bit vector) at a
// time through ColorspaceCopy32_AltiVec<SWAP_RB>().
//
// Returns the number of pixels handled: pixCountVec128 for the memcpy() path,
// or the final loop index for the vector path.
// NOTE(review): the vector loop assumes pixCountVec128 is a multiple of 4 and
// that src/dst are suitably aligned for vec_ld()/vec_st() -- confirm at the call sites.
template <bool SWAP_RB>
size_t ColorspaceCopyBuffer32_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128)
{
	if (SWAP_RB)
	{
		size_t pixIndex = 0;
		
		while (pixIndex < pixCountVec128)
		{
			vec_st( ColorspaceCopy32_AltiVec<SWAP_RB>(vec_ld(0, src+pixIndex)), 0, dst+pixIndex );
			pixIndex += 4;
		}
		
		return pixIndex;
	}
	
	memcpy(dst, src, pixCountVec128 * sizeof(u32));
	return pixCountVec128;
}
size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
{ {
return ColorspaceConvertBuffer555To8888Opaque_AltiVec<false>(src, dst, pixCount); return ColorspaceConvertBuffer555To8888Opaque_AltiVec<false>(src, dst, pixCount);
@ -352,6 +412,16 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32
return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec<true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec<true>(src, dst, pixCount);
} }
// Copies pixCount 16-bit pixels from src to dst with the swap enabled, by
// forwarding to ColorspaceCopyBuffer16_AltiVec<true>() and returning its result.
size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer16_AltiVec<true>(src, dst, pixCount);
	return processedPixCount;
}
// Copies pixCount 32-bit pixels from src to dst with the swap enabled, by
// forwarding to ColorspaceCopyBuffer32_AltiVec<true>() and returning its result.
size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer32_AltiVec<true>(src, dst, pixCount);
	return processedPixCount;
}
template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
@ -379,4 +449,10 @@ template v128u16 ColorspaceConvert6665To5551_AltiVec<false>(const v128u32 &srcLo
template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<true>(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<true>(const v128u32 &src);
template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<false>(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec<false>(const v128u32 &src);
// Explicit instantiations of the per-vector copy helpers for both values of
// SWAP_RB, for the 16-bit and the 32-bit pixel variants.
template v128u16 ColorspaceCopy16_AltiVec<true>(const v128u16 &src);
template v128u16 ColorspaceCopy16_AltiVec<false>(const v128u16 &src);
template v128u32 ColorspaceCopy32_AltiVec<true>(const v128u32 &src);
template v128u32 ColorspaceCopy32_AltiVec<false>(const v128u32 &src);
#endif // ENABLE_SSE2 #endif // ENABLE_SSE2

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -34,6 +34,9 @@ template<bool SWAP_RB> v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32
template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src);
// Per-vector pixel copy helpers for 16-bit and 32-bit pixels, selected at
// compile time by SWAP_RB.
// NOTE(review): presumably SWAP_RB=true exchanges the red and blue components
// and SWAP_RB=false is a plain pass-through -- confirm against the implementation file.
template<bool SWAP_RB> v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src);
template<bool SWAP_RB> v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src);
// AltiVec has very poor support for dealing with unaligned addresses (it's possible, just // AltiVec has very poor support for dealing with unaligned addresses (it's possible, just
// very obtuse), so we're not even going to bother dealing with any unaligned addresses. // very obtuse), so we're not even going to bother dealing with any unaligned addresses.
class ColorspaceHandler_AltiVec : public ColorspaceHandler class ColorspaceHandler_AltiVec : public ColorspaceHandler
@ -61,6 +64,10 @@ public:
size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
// Buffer copies from src to dst for 16-bit and 32-bit pixels; each returns a size_t.
// NOTE(review): presumably these swap red/blue and return the number of
// pixels processed -- confirm against the implementation file.
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
}; };
#endif // ENABLE_ALTIVEC #endif // ENABLE_ALTIVEC

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -265,6 +265,32 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src)
return _mm_or_si128(src, _mm_set1_epi32(0xFF000000)); return _mm_or_si128(src, _mm_set1_epi32(0xFF000000));
} }
// Copies one vector of eight 16-bit pixels, optionally exchanging the two
// outer 5-bit color fields of every pixel.
//
// SWAP_RB == true : bits 0-4 and bits 10-14 of each 16-bit lane trade places,
//                   while the middle 5-bit field (bits 5-9) and the top bit
//                   (bit 15) are kept where they are.
// SWAP_RB == false: the input vector is returned untouched.
//
// src     : vector of 16-bit pixels.
// returns : the (optionally swapped) vector of 16-bit pixels.
template <bool SWAP_RB>
FORCEINLINE v128u16 ColorspaceCopy16_SSE2(const v128u16 &src)
{
	if (!SWAP_RB)
	{
		return src;
	}
	
	const v128u16 highFieldMovedLow = _mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10);
	const v128u16 lowFieldMovedHigh = _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10);
	
	// The middle field occupies bits 5-9, so its mask is 0x03E0. The previous
	// mask (0x0E30) dropped bits 6-8 and let bits 4, 10 and 11 leak through
	// unswapped.
	const v128u16 middleField = _mm_and_si128(src, _mm_set1_epi16(0x03E0));
	const v128u16 topBit = _mm_and_si128(src, _mm_set1_epi16(0x8000));
	
	return _mm_or_si128( _mm_or_si128(highFieldMovedLow, _mm_or_si128(middleField, lowFieldMovedHigh)), topBit );
}
// Copies one vector of four 32-bit pixels. With SWAP_RB enabled, the lowest
// byte (mask 0x000000FF) and the third byte (mask 0x00FF0000) of every pixel
// are exchanged, while the 0x0000FF00 and 0xFF000000 bytes stay in place.
// Without SWAP_RB the input vector is returned unchanged.
template <bool SWAP_RB>
FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src)
{
	if (!SWAP_RB)
	{
		return src;
	}
	
#ifdef ENABLE_SSSE3
	// A single byte shuffle performs the exchange when SSSE3 is available.
	return _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
#else
	// Plain SSE2 fallback: isolate each byte with a mask, shift the two outer
	// color bytes into each other's position, then merge everything back.
	const v128u32 thirdByteMovedLow = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16);
	const v128u32 lowByteMovedThird = _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16);
	const v128u32 secondByte = _mm_and_si128(src, _mm_set1_epi32(0x0000FF00));
	const v128u32 topByte = _mm_and_si128(src, _mm_set1_epi32(0xFF000000));
	
	return _mm_or_si128( _mm_or_si128(thirdByteMovedLow, _mm_or_si128(secondByte, lowByteMovedThird)), topByte );
#endif
}
template <bool SWAP_RB, bool IS_UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
{ {
@ -417,6 +443,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si
return i; return i;
} }
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128)
{
if (!SWAP_RB)
{
memcpy(dst, src, pixCountVec128 * sizeof(u16));
return pixCountVec128;
}
size_t i = 0;
for (; i < pixCountVec128; i+=8)
{
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
if (IS_UNALIGNED)
{
_mm_storeu_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2<SWAP_RB>(src_vec128));
}
else
{
_mm_store_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2<SWAP_RB>(src_vec128));
}
}
return i;
}
template <bool SWAP_RB, bool IS_UNALIGNED>
size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128)
{
if (!SWAP_RB)
{
memcpy(dst, src, pixCountVec128 * sizeof(u32));
return pixCountVec128;
}
size_t i = 0;
for (; i < pixCountVec128; i+=4)
{
v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i));
if (IS_UNALIGNED)
{
_mm_storeu_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2<SWAP_RB>(src_vec128));
}
else
{
_mm_store_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2<SWAP_RB>(src_vec128));
}
}
return i;
}
size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
{ {
return ColorspaceConvertBuffer555To8888Opaque_SSE2<false, false>(src, dst, pixCount); return ColorspaceConvertBuffer555To8888Opaque_SSE2<false, false>(src, dst, pixCount);
@ -557,6 +639,26 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(
return ColorspaceConvertBuffer888XTo8888Opaque_SSE2<true, true>(src, dst, pixCount); return ColorspaceConvertBuffer888XTo8888Opaque_SSE2<true, true>(src, dst, pixCount);
} }
// Copies pixCount 16-bit pixels from src to dst with the swap enabled, using
// the aligned load/store variant of ColorspaceCopyBuffer16_SSE2(); returns
// that function's result.
size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer16_SSE2<true, false>(src, dst, pixCount);
	return processedPixCount;
}
// Copies pixCount 16-bit pixels from src to dst with the swap enabled, using
// the unaligned load/store variant of ColorspaceCopyBuffer16_SSE2(); returns
// that function's result.
size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer16_SSE2<true, true>(src, dst, pixCount);
	return processedPixCount;
}
// Copies pixCount 32-bit pixels from src to dst with the swap enabled, using
// the aligned load/store variant of ColorspaceCopyBuffer32_SSE2(); returns
// that function's result.
size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer32_SSE2<true, false>(src, dst, pixCount);
	return processedPixCount;
}
// Copies pixCount 32-bit pixels from src to dst with the swap enabled, using
// the unaligned load/store variant of ColorspaceCopyBuffer32_SSE2(); returns
// that function's result.
size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const
{
	const size_t processedPixCount = ColorspaceCopyBuffer32_SSE2<true, true>(src, dst, pixCount);
	return processedPixCount;
}
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
@ -584,4 +686,10 @@ template v128u16 ColorspaceConvert6665To5551_SSE2<false>(const v128u32 &srcLo, c
template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<true>(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<true>(const v128u32 &src);
template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<false>(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2<false>(const v128u32 &src);
// Explicit instantiations of the per-vector copy helpers for both values of
// SWAP_RB, for the 16-bit and the 32-bit pixel variants.
template v128u16 ColorspaceCopy16_SSE2<true>(const v128u16 &src);
template v128u16 ColorspaceCopy16_SSE2<false>(const v128u16 &src);
template v128u32 ColorspaceCopy32_SSE2<true>(const v128u32 &src);
template v128u32 ColorspaceCopy32_SSE2<false>(const v128u32 &src);
#endif // ENABLE_SSE2 #endif // ENABLE_SSE2

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2016 DeSmuME team Copyright (C) 2016-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -34,6 +34,9 @@ template<bool SWAP_RB> v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &s
template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); template<bool SWAP_RB> v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); template<bool SWAP_RB> v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src);
// Per-vector pixel copy helpers for 16-bit and 32-bit pixels, selected at
// compile time by SWAP_RB.
// NOTE(review): presumably SWAP_RB=true exchanges the red and blue components
// and SWAP_RB=false is a plain pass-through -- confirm against the implementation file.
template<bool SWAP_RB> v128u16 ColorspaceCopy16_SSE2(const v128u16 &src);
template<bool SWAP_RB> v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
class ColorspaceHandler_SSE2 : public ColorspaceHandler class ColorspaceHandler_SSE2 : public ColorspaceHandler
{ {
public: public:
@ -73,6 +76,12 @@ public:
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
// Buffer copies from src to dst for 16-bit and 32-bit pixels; each returns a size_t.
// NOTE(review): presumably these swap red/blue, return the number of pixels
// processed, and the _IsUnaligned variants tolerate unaligned src/dst
// addresses -- confirm against the implementation file.
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
}; };
#endif // ENABLE_SSE2 #endif // ENABLE_SSE2