From 29ff68cda95d366bfdc45feff5386a43385e0c41 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 17 Jun 2016 22:36:56 +0000 Subject: [PATCH] GPU: - Add color 555 to 8888-opaque conversions. - In the new color buffer conversion functions, change the FragmentColor data types to u32. (Related to r5455.) --- desmume/src/GPU.cpp | 53 +++++++++++++++------ desmume/src/GPU.h | 96 +++++++++++++++++++++++++++++++++++---- desmume/src/OGLRender.cpp | 24 +++++----- desmume/src/OGLRender.h | 2 - desmume/src/render3D.cpp | 8 ++-- 5 files changed, 144 insertions(+), 39 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 39ca518a1..2a8f9bc63 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -4561,7 +4561,7 @@ void GPUEngineBase::ResolveCustomRendering() void GPUEngineBase::ResolveRGB666ToRGB888() { - ConvertColorBuffers6665To8888((FragmentColor *)this->renderedBuffer, (FragmentColor *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); + ConvertColorBuffer6665To8888((u32 *)this->renderedBuffer, (u32 *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); } void GPUEngineBase::ResolveToCustomFramebuffer() @@ -7079,7 +7079,31 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID) } template -void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount) +void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef ENABLE_SSE2 + const size_t ssePixCount = pixCount - (pixCount % 8); + for (; i < ssePixCount; i += 8) + { + __m128i src_vec128 = _mm_load_si128((__m128i *)(src + i)); + __m128i dstConverted0, dstConverted1; + ConvertColor555To8888Opaque(src_vec128, dstConverted0, dstConverted1); + + _mm_store_si128((__m128i *)(dst + i + 0), dstConverted0); + _mm_store_si128((__m128i *)(dst + i + 4), dstConverted1); + } +#endif + + for (; i < pixCount; i++) + { + dst[i] = ConvertColor555To8888Opaque(src[i]); + } +} + +template 
+void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) { size_t i = 0; @@ -7098,7 +7122,7 @@ void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, } template -void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount) +void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) { size_t i = 0; @@ -7117,7 +7141,7 @@ void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, } template -void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount) +void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) { size_t i = 0; @@ -7136,7 +7160,7 @@ void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__r } template -void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount) +void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) { size_t i = 0; @@ -7190,14 +7214,17 @@ template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG(u16 *dstColorBuffer); -template void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount); -template void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount); -template void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount); +template void 
ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index b6f8dfb19..c54ac3c32 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1674,7 +1674,28 @@ inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const } template -FORCEINLINE FragmentColor ConvertColor8888To6665(FragmentColor srcColor) +FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src) +{ + FragmentColor outColor; + outColor.r = material_5bit_to_8bit[((SWAP_RB) ? ((src >> 10) & 0x001F) : ((src >> 0) & 0x001F))]; + outColor.g = material_5bit_to_8bit[((src >> 5) & 0x001F)]; + outColor.b = material_5bit_to_8bit[((SWAP_RB) ? 
((src >> 0) & 0x001F) : ((src >> 10) & 0x001F))]; + outColor.a = 0xFF; + + return outColor.color; +} + +template +FORCEINLINE u32 ConvertColor8888To6665(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ConvertColor8888To6665(srcColorComponent); +} + +template +FORCEINLINE u32 ConvertColor8888To6665(FragmentColor srcColor) { FragmentColor outColor; outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; @@ -1682,11 +1703,20 @@ FORCEINLINE FragmentColor ConvertColor8888To6665(FragmentColor srcColor) outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2; outColor.a = srcColor.a >> 3; - return outColor; + return outColor.color; } template -FORCEINLINE FragmentColor ConvertColor6665To8888(FragmentColor srcColor) +FORCEINLINE u32 ConvertColor6665To8888(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ConvertColor6665To8888(srcColorComponent); +} + +template +FORCEINLINE u32 ConvertColor6665To8888(FragmentColor srcColor) { FragmentColor outColor; outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; @@ -1694,7 +1724,16 @@ FORCEINLINE FragmentColor ConvertColor6665To8888(FragmentColor srcColor) outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)]; outColor.a = material_5bit_to_8bit[srcColor.a]; - return outColor; + return outColor.color; +} + +template +FORCEINLINE u16 ConvertColor8888To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ConvertColor8888To5551(srcColorComponent); } template @@ -1703,6 +1742,15 @@ FORCEINLINE u16 ConvertColor8888To5551(FragmentColor srcColor) return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 
0x0000 : 0x8000 ); } +template +FORCEINLINE u16 ConvertColor6665To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ConvertColor6665To5551(srcColorComponent); +} + template FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) { @@ -1711,6 +1759,37 @@ FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) #ifdef ENABLE_SSE2 +template +FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dst0, __m128i &dst1) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + if (SWAP_RB) + { + dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000))); + dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); + dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) ); + dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000)); + + dst1 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000))); + dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); + dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) ); + dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000)); + } + else + { + dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007))); + dst0 = _mm_or_si128(dst0, 
_mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) ); + dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) ); + dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000)); + + dst1 = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007))); + dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) ); + dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) ); + dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000)); + } +} + template FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src) { @@ -1878,9 +1957,10 @@ FORCEINLINE __m128i ConvertColor6665To5551(const __m128i srcLo, const __m128i sr #endif -template void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount); -template void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount); -template void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, 
size_t pixCount); +template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); #endif diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 1c20abc07..27443bf60 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -997,17 +997,17 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i] = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) { - ConvertColorBuffers8888To6665(srcFramebuffer, dstFramebuffer, pixCount); + ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else { - ConvertColorBuffers8888To5551(srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } else if (this->_outputFormat == NDSColorFormat_BGR888_Rev) @@ -1028,8 +1028,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i] = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) @@ -1038,7 +1038,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else { - ConvertColorBuffers8888To5551(srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } } @@ -1067,8 +1067,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; x < pixCount; x++, ir++, iw++) { - dstFramebuffer[iw] = 
ConvertColor8888To6665(srcFramebuffer[ir]); - dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); + dstFramebuffer[iw].color = ConvertColor8888To6665(srcFramebuffer[ir]); + dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); } } } @@ -1076,14 +1076,14 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - ConvertColorBuffers8888To6665(srcFramebuffer + ir, dstFramebuffer + iw, pixCount); + ConvertColorBuffer8888To6665((u32 *)srcFramebuffer + ir, (u32 *)dstFramebuffer + iw, pixCount); } } else { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - ConvertColorBuffers8888To5551(srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } @@ -1130,7 +1130,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - ConvertColorBuffers8888To5551(srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index e5d2633b7..a5cbf305d 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -557,8 +557,6 @@ void ENDGL(); extern void (*OGLLoadEntryPoints_3_2_Func)(); extern void (*OGLCreateRenderer_3_2_Func)(OpenGLRenderer **rendererPtr); -FORCEINLINE u32 BGRA8888_32_To_RGBA6665_32(const u32 srcPix); -FORCEINLINE u32 BGRA8888_32Rev_To_RGBA6665_32Rev(const u32 srcPix); bool IsVersionSupported(unsigned int 
checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision); #if defined(ENABLE_SSE2) diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 5cbbcbbd8..7fd479255 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -608,11 +608,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) ) { - ConvertColorBuffers8888To6665(srcFramebuffer, dstFramebuffer, pixCount); + ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) ) { - ConvertColorBuffers6665To8888(srcFramebuffer, dstFramebuffer, pixCount); + ConvertColorBuffer6665To8888((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) || ((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) ) @@ -625,11 +625,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if (this->_outputFormat == NDSColorFormat_BGR666_Rev) { - ConvertColorBuffers6665To5551(srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev) { - ConvertColorBuffers8888To5551(srcFramebuffer, dstRGBA5551, pixCount); + ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } }