diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 2a8f9bc63..43947b8bc 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -52,6 +52,7 @@ u32 Render3DFramesPerSecond; CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; CACHE_ALIGN u32 color_555_to_666[32768]; CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; CACHE_ALIGN u32 color_555_to_888[32768]; //is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX @@ -6324,13 +6325,15 @@ GPUSubsystem::GPUSubsystem() { #define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) #define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) +#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) for (size_t i = 0; i < 32768; i++) { - color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); - color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); - color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); - color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); + color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); + color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); + color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); + color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); + color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); } needInitTables = false; @@ -7088,11 +7091,11 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size for (; i < ssePixCount; i += 8) { __m128i src_vec128 = _mm_load_si128((__m128i *)(src + i)); - __m128i dstConverted0, dstConverted1; - ConvertColor555To8888Opaque(src_vec128, dstConverted0, dstConverted1); + __m128i dstConvertedLo, dstConvertedHi; + ConvertColor555To8888Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - _mm_store_si128((__m128i *)(dst + i + 0), dstConverted0); - _mm_store_si128((__m128i *)(dst + i + 4), dstConverted1); + _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); + _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); } #endif diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index c54ac3c32..39e364755 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1640,24 +1640,26 @@ extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; extern CACHE_ALIGN u32 color_555_to_666[32768]; extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_888[32768]; -#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color -#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color +#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color +#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color #ifdef LOCAL_LE - #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian + #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian #else - #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian + #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian #endif -#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color -#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color +#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color +#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped +#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color #ifdef LOCAL_LE - #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian + #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian #else - #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian + #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian #endif //produce a 15bpp color from individual 5bit components @@ -1676,13 +1678,7 @@ inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const template FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src) { - FragmentColor outColor; - outColor.r = material_5bit_to_8bit[((SWAP_RB) ? ((src >> 10) & 0x001F) : ((src >> 0) & 0x001F))]; - outColor.g = material_5bit_to_8bit[((src >> 5) & 0x001F)]; - outColor.b = material_5bit_to_8bit[((SWAP_RB) ? ((src >> 0) & 0x001F) : ((src >> 10) & 0x001F))]; - outColor.a = 0xFF; - - return outColor.color; + return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); } template @@ -1760,34 +1756,71 @@ FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) #ifdef ENABLE_SSE2 template -FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dst0, __m128i &dst1) +FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) { +#if 0 + // I'm shelving this code until the time when I figure out how to do this conversion faster in SSE2 + // without using any memory lookups. This code does work, albeit slowly. -- rogerman, 2016-06-17 + // Conversion algorithm: // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB8 << 3) | ((srcRGB8 >> 2) & 0x07) if (SWAP_RB) { - dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000))); - dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); - dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) ); - dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000)); + dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000))); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) ); + dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000)); - dst1 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000))); - dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); - dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) ); - dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000)); + dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000))); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) ); + dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000)); } else { - dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007))); - dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); - dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) ); - dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000)); + dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007))); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); + dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) ); + dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000)); - dst1 = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007))); - dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); - dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) ); - dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000)); + dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007))); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); + dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) ); + dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000)); } + + __m128i tmpDstLo = dstLo; + dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); + dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) ); +#else + // This code does the same thing as the above, but with memory lookups. It's faster, but kinda + // defeats the purpose of using SSE2 due to the memory lookups. -- rogerman, 2016-06-17 + + __m128i srcMasked = _mm_and_si128(src, _mm_set1_epi16(0x7FFF)); + + if (SWAP_RB) + { + dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 7)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 6)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 5)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 4))); + dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 3)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 2)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 1)), + COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 0))); + } + else + { + dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 7)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 6)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 5)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 4))); + dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 3)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 2)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 1)), + COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 0))); + } +#endif } template diff --git a/desmume/src/frontend/modules/ImageOut.cpp b/desmume/src/frontend/modules/ImageOut.cpp index afb88c500..9359b32fb 100644 --- a/desmume/src/frontend/modules/ImageOut.cpp +++ b/desmume/src/frontend/modules/ImageOut.cpp @@ -21,7 +21,7 @@ #include "ImageOut.h" #include "formats/rpng.h" #include "formats/rbmp.h" -#include "gfx3d.h" +#include "GPU.h" static u8* Convert15To24(const u16* src, int width, int height) { @@ -33,11 +33,9 @@ static u8* Convert15To24(const u16* src, int width, int height) { for(int x=0;x>16)&0xFF; - *tmp_inc++ = (expanded>>8)&0xFF; - *tmp_inc++ = expanded&0xFF; + u32 dst = ConvertColor555To8888Opaque(*src++); + *(u32 *)tmp_inc[i] = (dst & 0x00FFFFFF) | (*(u32 *)tmp_inc & 0xFF000000); + tmp_inc += 3; } } return tmp_buffer; diff --git a/desmume/src/windows/aviout.cpp b/desmume/src/windows/aviout.cpp index 8757a534d..cd34ac559 100644 --- a/desmume/src/windows/aviout.cpp +++ b/desmume/src/windows/aviout.cpp @@ -320,12 +320,9 @@ static void do_video_conversion(AVIFile* avi, const u16* buffer) { for(int x=0;x>16)&0xFF; - *outbuf++ = (col24>>8)&0xFF; - *outbuf++ = col24&0xFF; + u32 dst = ConvertColor555To8888Opaque(*buffer++); + *(u32 *)outbuf = (dst & 0x00FFFFFF) | (*(u32 *)outbuf & 0xFF000000); + outbuf += 3; } outbuf -= width*3*2; diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 0d15713ed..ab02d72f9 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -1919,11 +1919,8 @@ static void DoDisplay(bool firstTime) //convert pixel format to 32bpp for compositing //why do we do this over and over? well, we are compositing to - //filteredbuffer32bpp, and it needs to get refreshed each frame.. - const int size = video.srcBufferSize/2; - u16* src = (u16*)video.srcBuffer; - for(int i=0;i((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); if(firstTime) {