From d1a8663acbd3e4b888c5e164403b06cb047e94cf Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 18 Jun 2016 22:20:07 +0000 Subject: [PATCH] GPU: - In the SSE2 version of ConvertColor555To8888Opaque(), change the algorithm to use computation instead of memory lookups. Although memory lookups are faster on newer CPUs, computation is much faster on older CPUs, which have smaller caches and longer memory latencies. I believe this is the correct decision, since older CPUs are the ones that need as much performance as they can get. --- desmume/src/GPU.h | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 39e364755..91aa2e608 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1758,12 +1758,8 @@ FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) template FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) { -#if 0 - // I'm shelving this code until the time when I figure out how to do this conversion faster in SSE2 - // without using any memory lookups. This code does work, albeit slowly. -- rogerman, 2016-06-17 - // Conversion algorithm: - // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB8 << 3) | ((srcRGB8 >> 2) & 0x07) + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) if (SWAP_RB) { dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000))); @@ -1792,35 +1788,6 @@ FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo, __m128i tmpDstLo = dstLo; dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); -#else - // This code does the same thing as the above, but with memory lookups. It's faster, but kinda - // defeats the purpose of using SSE2 due to the memory lookups. -- rogerman, 2016-06-17 - - __m128i srcMasked = _mm_and_si128(src, _mm_set1_epi16(0x7FFF)); - - if (SWAP_RB) - { - dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 7)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 6)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 5)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 4))); - dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 3)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 2)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 1)), - COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 0))); - } - else - { - dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 7)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 6)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 5)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 4))); - dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 3)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 2)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 1)), - COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 0))); - } -#endif } template