- Add 555-to-6665 opaque color conversion.
- Add UNALIGNED switch to 555-to-8888, 555-to-6665, 8888-to-5551, and 6665-to-5551 color buffer conversion functions, allowing clients to inform these functions that the incoming buffer pointers may not be 16-byte aligned.
- Rendered lines from GPUEngineBase::_HandleDisplayModeOff(), GPUEngineA::_HandleDisplayModeVRAM(), and GPUEngineA::_HandleDisplayModeMainMemory() now output colors with the alpha bits filled in. This is working towards a time when clients that work directly in 16-bit and 32-bit colorspaces don’t have to fill in the alpha bits themselves.
- Unify more color conversion code.
This commit is contained in:
rogerman 2016-06-20 18:47:45 +00:00
parent d1a8663acb
commit 4d2307538d
7 changed files with 156 additions and 56 deletions

View File

@ -50,6 +50,7 @@
u32 Render3DFramesPerSecond;
CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
CACHE_ALIGN u32 color_555_to_666[32768];
CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
@ -4387,15 +4388,15 @@ void GPUEngineBase::_HandleDisplayModeOff(const size_t l)
switch (GPU->GetDisplayInfo().colorFormat)
{
case NDSColorFormat_BGR555_Rev:
memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u16 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x7FFF);
memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u16 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0xFFFF);
break;
case NDSColorFormat_BGR666_Rev:
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x003F3F3F);
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x1F3F3F3F);
break;
case NDSColorFormat_BGR888_Rev:
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x00FFFFFF);
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0xFFFFFFFF);
break;
}
}
@ -5915,11 +5916,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
{
const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
{
dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF);
}
ConvertColorBuffer555To6665Opaque<false, false>(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
break;
}
@ -5927,11 +5924,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
{
const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
{
dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF);
}
ConvertColorBuffer555To8888Opaque<false, false>(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
break;
}
}
@ -5951,11 +5944,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
{
const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth);
FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth);
for (size_t i = 0; i < customPixCount; i++)
{
dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF);
}
ConvertColorBuffer555To6665Opaque<false, false>(src, (u32 *)dst, customPixCount);
break;
}
@ -5963,11 +5952,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
{
const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth);
FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth);
for (size_t i = 0; i < customPixCount; i++)
{
dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF);
}
ConvertColorBuffer555To8888Opaque<false, false>(src, (u32 *)dst, customPixCount);
break;
}
}
@ -5993,17 +5978,17 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l)
u32 *dst = dstColorLine;
#ifdef ENABLE_SSE2
const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF);
const __m128i alphaBit = _mm_set1_epi16(0x8000);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
{
__m128i fifoColor = _mm_set_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv());
fifoColor = _mm_shuffle_epi32(fifoColor, 0x1B); // We need to shuffle the four FIFO values back into the correct order, since they were originally loaded in reverse order.
_mm_store_si128((__m128i *)dst + i, _mm_and_si128(fifoColor, fifoMask));
_mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit));
}
#else
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
dst[i] = DISP_FIFOrecv() & 0x7FFF7FFF;
dst[i] = DISP_FIFOrecv() | 0x80008000;
}
#endif
break;
@ -6323,14 +6308,17 @@ GPUSubsystem::GPUSubsystem()
if (needInitTables)
{
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) )
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) )
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) )
for (size_t i = 0; i < 32768; i++)
{
color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) );
color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 );
color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 );
color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) );
color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 );
color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 );
@ -7081,8 +7069,8 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID)
this->_gpu->SetDisplayByID(this->_ID);
}
template <bool SWAP_RB>
void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size_t pixCount)
template <bool SWAP_RB, bool UNALIGNED>
void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount)
{
size_t i = 0;
@ -7090,12 +7078,20 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
__m128i src_vec128 = _mm_load_si128((__m128i *)(src + i));
__m128i src_vec128 = (UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i));
__m128i dstConvertedLo, dstConvertedHi;
ConvertColor555To8888Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi);
_mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo);
_mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi);
if (UNALIGNED)
{
_mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo);
_mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi);
}
else
{
_mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo);
_mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi);
}
}
#endif
@ -7105,6 +7101,38 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size
}
}
// Converts pixCount 16-bit 555 colors from src into opaque 32-bit 6665 colors in dst.
//
// Template parameters:
//   SWAP_RB   - Forwarded unchanged to ConvertColor555To6665Opaque<SWAP_RB>().
//   UNALIGNED - When true, the SSE2 path uses the unaligned load/store intrinsics.
//               When false, it uses the aligned intrinsics, so both src and dst
//               must satisfy the alignment that _mm_load_si128()/_mm_store_si128()
//               require.
//
// Without ENABLE_SSE2, every pixel is converted by the scalar loop at the bottom.
template <bool SWAP_RB, bool UNALIGNED>
void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount)
{
	size_t pix = 0;
	
#ifdef ENABLE_SSE2
	// Vector path: consume as many whole groups of 8 pixels as possible.
	const size_t vecPixEnd = pixCount - (pixCount % 8);
	
	while (pix < vecPixEnd)
	{
		__m128i packed555;
		if (UNALIGNED)
		{
			packed555 = _mm_loadu_si128((__m128i *)(src + pix));
		}
		else
		{
			packed555 = _mm_load_si128((__m128i *)(src + pix));
		}
		
		// One 128-bit input of eight 16-bit colors expands to two 128-bit outputs
		// of four 32-bit colors each.
		__m128i firstFour, lastFour;
		ConvertColor555To6665Opaque<SWAP_RB>(packed555, firstFour, lastFour);
		
		__m128i *out0 = (__m128i *)(dst + pix + 0);
		__m128i *out1 = (__m128i *)(dst + pix + 4);
		
		if (UNALIGNED)
		{
			_mm_storeu_si128(out0, firstFour);
			_mm_storeu_si128(out1, lastFour);
		}
		else
		{
			_mm_store_si128(out0, firstFour);
			_mm_store_si128(out1, lastFour);
		}
		
		pix += 8;
	}
#endif
	
	// Scalar path: handles the remaining pixels (or all of them without SSE2).
	while (pix < pixCount)
	{
		dst[pix] = ConvertColor555To6665Opaque<SWAP_RB>(src[pix]);
		pix++;
	}
}
template <bool SWAP_RB>
void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount)
{
@ -7143,7 +7171,7 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
}
}
template <bool SWAP_RB>
template <bool SWAP_RB, bool UNALIGNED>
void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount)
{
size_t i = 0;
@ -7152,7 +7180,14 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
if (UNALIGNED)
{
_mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) );
}
else
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
}
}
#endif
@ -7162,7 +7197,7 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
}
}
template <bool SWAP_RB>
template <bool SWAP_RB, bool UNALIGNED>
void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount)
{
size_t i = 0;
@ -7171,7 +7206,14 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
if (UNALIGNED)
{
_mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) );
}
else
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
}
}
#endif
@ -7217,8 +7259,15 @@ template void GPUEngineBase::RenderLayerBG<GPULayerID_BG1>(u16 *dstColorBuffer);
template void GPUEngineBase::RenderLayerBG<GPULayerID_BG2>(u16 *dstColorBuffer);
template void GPUEngineBase::RenderLayerBG<GPULayerID_BG3>(u16 *dstColorBuffer);
template void ConvertColorBuffer555To8888Opaque<true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To8888Opaque<false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To8888Opaque<false, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To6665Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To6665Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To6665Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer555To6665Opaque<false, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To6665<true>(const u32 *src, u32 *dst, size_t pixCount);
template void ConvertColorBuffer8888To6665<false>(const u32 *src, u32 *dst, size_t pixCount);
@ -7226,8 +7275,12 @@ template void ConvertColorBuffer8888To6665<false>(const u32 *src, u32 *dst, size
template void ConvertColorBuffer6665To8888<true>(const u32 *src, u32 *dst, size_t pixCount);
template void ConvertColorBuffer6665To8888<false>(const u32 *src, u32 *dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<true, true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<true, false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<false, true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer8888To5551<false, false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<true, true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<true, false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<false, true>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffer6665To5551<false, false>(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);

View File

@ -1638,12 +1638,14 @@ extern CACHE_ALIGN const u8 material_3bit_to_6bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
extern CACHE_ALIGN u32 color_555_to_666[32768];
extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
extern CACHE_ALIGN u32 color_555_to_888[32768];
#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped
#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color
#ifdef LOCAL_LE
@ -1681,6 +1683,12 @@ FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src)
return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF);
}
// Converts a single 16-bit 555 color into an opaque 32-bit 6665 color via table lookup.
// SWAP_RB selects the lookup table whose R and B components are swapped.
template <bool SWAP_RB>
FORCEINLINE u32 ConvertColor555To6665Opaque(const u16 src)
{
	// Drop bit 15 so that the index stays inside the 32768-entry lookup tables.
	const u16 color555 = src & 0x7FFF;
	
	if (SWAP_RB)
	{
		return COLOR555TO6665_OPAQUE_SWAP_RB(color555);
	}
	
	return COLOR555TO6665_OPAQUE(color555);
}
template <bool SWAP_RB>
FORCEINLINE u32 ConvertColor8888To6665(u32 srcColor)
{
@ -1790,6 +1798,41 @@ FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo,
dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
}
// SSE2 version: converts the eight 16-bit 555 colors packed in src into eight
// 32-bit 6665 colors with the alpha field (bits 24-28) forced to 0x1F.
// On return, dstLo holds the colors for source elements 0-3 and dstHi holds the
// colors for source elements 4-7, in source order.
//
// When SWAP_RB is false, the 5-bit field at source bits 0-4 lands in destination
// bits 0-5 and the field at source bits 10-14 lands in destination bits 16-21.
// When SWAP_RB is true, those two destinations are exchanged. The middle field
// (source bits 5-9) always lands in destination bits 8-13.
template <bool SWAP_RB>
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
//
// src is treated as four 32-bit lanes, each containing two 16-bit colors.
// In both branches, dstLo is built from the color in the low 16 bits of each
// lane and dstHi is built from the color in the high 16 bits of each lane.
// For every 5-bit field, the first shift/mask places the five source bits in the
// upper five bits of the 6-bit destination field, and the second shift/mask
// copies the field's top bit into the lowest bit of the destination field.
if (SWAP_RB)
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00000001))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000));
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00010000)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 25), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 30), _mm_set1_epi32(0x00000001))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000));
}
else
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00000001)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00010000))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000));
dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 20), _mm_set1_epi32(0x00000001)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00010000))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000));
}
// At this point dstLo holds the colors for source elements 0, 2, 4, 6 and dstHi
// holds the colors for source elements 1, 3, 5, 7. The shuffles and lane masks
// below interleave them so that dstLo ends up with elements 0, 1, 2, 3 and dstHi
// with elements 4, 5, 6, 7. tmpDstLo keeps the pre-interleave dstLo because
// dstLo is overwritten before dstHi is computed.
__m128i tmpDstLo = dstLo;
dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
}
template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
{
@ -1957,10 +2000,13 @@ FORCEINLINE __m128i ConvertColor6665To5551(const __m128i srcLo, const __m128i sr
#endif
template<bool SWAP_RB> void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool UNALIGNED> void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool UNALIGNED> void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool UNALIGNED> void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB, bool UNALIGNED> void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
#endif

View File

@ -1007,7 +1007,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
}
else
{
ConvertColorBuffer8888To5551<true>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
ConvertColorBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
}
}
else if (this->_outputFormat == NDSColorFormat_BGR888_Rev)
@ -1038,7 +1038,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
}
else
{
ConvertColorBuffer8888To5551<true>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
ConvertColorBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
}
}
}
@ -1083,7 +1083,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
{
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
ConvertColorBuffer8888To5551<true>((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
ConvertColorBuffer8888To5551<true, false>((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
}
}
}
@ -1130,7 +1130,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
{
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
ConvertColorBuffer8888To5551<true>((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
ConvertColorBuffer8888To5551<true, false>((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
}
}
}

View File

@ -754,7 +754,7 @@
if (dispInfo.pixelBytes == 2)
{
RGB555ToRGBA8888Buffer((u16 *)displayBuffer, bitmapData, (w * h));
ConvertColorBuffer555To8888Opaque<false, false>((u16 *)displayBuffer, bitmapData, (w * h));
}
else if (dispInfo.pixelBytes == 4)
{

View File

@ -22,6 +22,7 @@
#import "cocoa_util.h"
#include "../NDSSystem.h"
#include "../GPU.h"
#include "../common.h"
#include "../mc.h"
#undef BOOL
@ -691,7 +692,7 @@ void RomIconToRGBA8888(uint32_t *bitmapData)
//
// The first entry always represents the alpha, so we can just ignore it.
clut[0] = 0x00000000;
RGB555ToRGBA8888Buffer(iconClutPtr, &clut[1], 15);
ConvertColorBuffer555To8888Opaque<false, true>((u16 *)iconClutPtr, &clut[1], 15);
// Load the image from the icon pixel data.
//

View File

@ -625,11 +625,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
{
if (this->_outputFormat == NDSColorFormat_BGR666_Rev)
{
ConvertColorBuffer6665To5551<false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
ConvertColorBuffer6665To5551<false, false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
}
else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev)
{
ConvertColorBuffer8888To5551<false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
ConvertColorBuffer8888To5551<false, false>((u32 *)srcFramebuffer, dstRGBA5551, pixCount);
}
}

View File

@ -1920,7 +1920,7 @@ static void DoDisplay(bool firstTime)
//convert pixel format to 32bpp for compositing
//why do we do this over and over? well, we are compositing to
//filteredbuffer32bpp, and it needs to get refreshed each frame.
ConvertColorBuffer555To8888Opaque<true>((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16));
ConvertColorBuffer555To8888Opaque<true, false>((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16));
if(firstTime)
{