From 4d2307538d32cead83a14395dad4c89a1e7ec282 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 20 Jun 2016 18:47:45 +0000 Subject: [PATCH] =?UTF-8?q?GPU:=20-=20Add=20555-to-6665=20opaque=20color?= =?UTF-8?q?=20conversion.=20-=20Add=20UNALIGNED=20switch=20to=20555-to-888?= =?UTF-8?q?8,=20555-to-6665,=208888-to-5551,=20and=206665-to-5551=20color?= =?UTF-8?q?=20buffer=20conversion=20functions,=20allowing=20clients=20to?= =?UTF-8?q?=20inform=20these=20functions=20that=20the=20incoming=20buffer?= =?UTF-8?q?=20pointers=20may=20not=20be=2016-byte=20aligned.=20-=20Rendere?= =?UTF-8?q?d=20lines=20from=20GPUEngineBase::=5FHandleDisplayModeOff(),=20?= =?UTF-8?q?GPUEngineA::=5FHandleDisplayModeVRAM(),=20and=20GPUEngineA::=5F?= =?UTF-8?q?HandleDisplayModeMainMemory()=20now=20output=20colors=20with=20?= =?UTF-8?q?the=20alpha=20bits=20filled=20in.=20This=20is=20working=20towar?= =?UTF-8?q?ds=20a=20time=20when=20clients=20that=20work=20directly=20in=20?= =?UTF-8?q?16-bit=20and=2032-bit=20colorspaces=20don=E2=80=99t=20have=20to?= =?UTF-8?q?=20fill=20in=20the=20alpha=20bits=20themselves.=20-=20Unify=20m?= =?UTF-8?q?ore=20color=20conversion=20code.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desmume/src/GPU.cpp | 141 ++++++++++++++++++++---------- desmume/src/GPU.h | 52 ++++++++++- desmume/src/OGLRender.cpp | 8 +- desmume/src/cocoa/cocoa_output.mm | 2 +- desmume/src/cocoa/cocoa_rom.mm | 3 +- desmume/src/render3D.cpp | 4 +- desmume/src/windows/main.cpp | 2 +- 7 files changed, 156 insertions(+), 56 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 43947b8bc..7c3ff493d 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -50,6 +50,7 @@ u32 Render3DFramesPerSecond; CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; CACHE_ALIGN u32 color_555_to_666[32768]; CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; @@ -4387,15 +4388,15 @@ void GPUEngineBase::_HandleDisplayModeOff(const size_t l) switch (GPU->GetDisplayInfo().colorFormat) { case NDSColorFormat_BGR555_Rev: - memset_u16_fast((u16 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x7FFF); + memset_u16_fast((u16 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0xFFFF); break; case NDSColorFormat_BGR666_Rev: - memset_u32_fast((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x003F3F3F); + memset_u32_fast((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x1F3F3F3F); break; case NDSColorFormat_BGR888_Rev: - memset_u32_fast((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0x00FFFFFF); + memset_u32_fast((u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH), 0xFFFFFFFF); break; } } @@ -5915,11 +5916,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF); - } + ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } @@ -5927,11 +5924,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF); - } + ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } } @@ -5951,11 +5944,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth); - - for (size_t i = 0; i < customPixCount; i++) - { - dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF); - } + ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, customPixCount); break; } @@ -5963,11 +5952,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth); - - for (size_t i = 0; i < customPixCount; i++) - { - dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF); - } + ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, customPixCount); break; } } @@ -5993,17 +5978,17 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l) u32 *dst = dstColorLine; #ifdef ENABLE_SSE2 - const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF); + const __m128i alphaBit = _mm_set1_epi16(0x8000); for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++) { __m128i fifoColor = _mm_set_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv()); fifoColor = _mm_shuffle_epi32(fifoColor, 0x1B); // We need to shuffle the four FIFO values back into the correct order, since they were originally loaded in reverse order. - _mm_store_si128((__m128i *)dst + i, _mm_and_si128(fifoColor, fifoMask)); + _mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit)); } #else for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) { - dst[i] = DISP_FIFOrecv() & 0x7FFF7FFF; + dst[i] = DISP_FIFOrecv() | 0x80008000; } #endif break; @@ -6323,14 +6308,17 @@ GPUSubsystem::GPUSubsystem() if (needInitTables) { -#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) -#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) -#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) +#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) +#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) +#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) +#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) for (size_t i = 0; i < 32768; i++) { color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); + color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); + color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); @@ -7081,8 +7069,8 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID) this->_gpu->SetDisplayByID(this->_ID); } -template -void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size_t pixCount) +template +void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) { size_t i = 0; @@ -7090,12 +7078,20 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size const size_t ssePixCount = pixCount - (pixCount % 8); for (; i < ssePixCount; i += 8) { - __m128i src_vec128 = _mm_load_si128((__m128i *)(src + i)); + __m128i src_vec128 = (UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); __m128i dstConvertedLo, dstConvertedHi; ConvertColor555To8888Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); + if (UNALIGNED) + { + _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); + _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); + } + else + { + _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); + _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); + } } #endif @@ -7105,6 +7101,38 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size } } +template +void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef ENABLE_SSE2 + const size_t ssePixCount = pixCount - (pixCount % 8); + for (; i < ssePixCount; i += 8) + { + __m128i src_vec128 = (UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); + __m128i dstConvertedLo, dstConvertedHi; + ConvertColor555To6665Opaque(src_vec128, dstConvertedLo, dstConvertedHi); + + if (UNALIGNED) + { + _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); + _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); + } + else + { + _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); + _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); + } + } +#endif + + for (; i < pixCount; i++) + { + dst[i] = ConvertColor555To6665Opaque(src[i]); + } +} + template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) { @@ -7143,7 +7171,7 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) } } -template +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) { size_t i = 0; @@ -7152,7 +7180,14 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst const size_t ssePixCount = pixCount - (pixCount % 8); for (; i < ssePixCount; i += 8) { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); + if (UNALIGNED) + { + _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); + } + else + { + _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); + } } #endif @@ -7162,7 +7197,7 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst } } -template +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) { size_t i = 0; @@ -7171,7 +7206,14 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst const size_t ssePixCount = pixCount - (pixCount % 8); for (; i < ssePixCount; i += 8) { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); + if (UNALIGNED) + { + _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); + } + else + { + _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); + } } #endif @@ -7217,8 +7259,15 @@ template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); @@ -7226,8 +7275,12 @@ template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 91aa2e608..8fbfa57d7 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1638,12 +1638,14 @@ extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_666[32768]; extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_888[32768]; #define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color +#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped #define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color #ifdef LOCAL_LE @@ -1681,6 +1683,12 @@ FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src) return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); } +template +FORCEINLINE u32 ConvertColor555To6665Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); +} + template FORCEINLINE u32 ConvertColor8888To6665(u32 srcColor) { @@ -1790,6 +1798,41 @@ FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo, dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); } +template +FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + if (SWAP_RB) + { + dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000))); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) ); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00000001))) ); + dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000)); + + dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00010000))); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) ); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 25), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 30), _mm_set1_epi32(0x00000001))) ); + dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000)); + } + else + { + dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00000001))); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) ); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00010000))) ); + dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000)); + + dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 20), _mm_set1_epi32(0x00000001))); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) ); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00010000))) ); + dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000)); + } + + __m128i tmpDstLo = dstLo; + dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); + dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); +} + template FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src) { @@ -1957,10 +2000,13 @@ FORCEINLINE __m128i ConvertColor6665To5551(const __m128i srcLo, const __m128i sr #endif -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); #endif diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 27443bf60..cc842e7d9 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -1007,7 +1007,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } else if (this->_outputFormat == NDSColorFormat_BGR888_Rev) @@ -1038,7 +1038,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } } @@ -1083,7 +1083,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } @@ -1130,7 +1130,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } diff --git a/desmume/src/cocoa/cocoa_output.mm b/desmume/src/cocoa/cocoa_output.mm index 6f9c9425a..5d1ef23bf 100644 --- a/desmume/src/cocoa/cocoa_output.mm +++ b/desmume/src/cocoa/cocoa_output.mm @@ -754,7 +754,7 @@ if (dispInfo.pixelBytes == 2) { - RGB555ToRGBA8888Buffer((u16 *)displayBuffer, bitmapData, (w * h)); + ConvertColorBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); } else if (dispInfo.pixelBytes == 4) { diff --git a/desmume/src/cocoa/cocoa_rom.mm b/desmume/src/cocoa/cocoa_rom.mm index 692dba661..a987e61c7 100644 --- a/desmume/src/cocoa/cocoa_rom.mm +++ b/desmume/src/cocoa/cocoa_rom.mm @@ -22,6 +22,7 @@ #import "cocoa_util.h" #include "../NDSSystem.h" +#include "../GPU.h" #include "../common.h" #include "../mc.h" #undef BOOL @@ -691,7 +692,7 @@ void RomIconToRGBA8888(uint32_t *bitmapData) // // The first entry always represents the alpha, so we can just ignore it. clut[0] = 0x00000000; - RGB555ToRGBA8888Buffer(iconClutPtr, &clut[1], 15); + ConvertColorBuffer555To8888Opaque((u16 *)iconClutPtr, &clut[1], 15); // Load the image from the icon pixel data. // diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 7fd479255..5ac323d92 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -625,11 +625,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if (this->_outputFormat == NDSColorFormat_BGR666_Rev) { - ConvertColorBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index ab02d72f9..68c3fe179 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -1920,7 +1920,7 @@ static void DoDisplay(bool firstTime) //convert pixel format to 32bpp for compositing //why do we do this over and over? well, we are compositing to //filteredbuffer32bpp, and it needs to get refreshed each frame. - ConvertColorBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); + ConvertColorBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); if(firstTime) {