diff --git a/desmume/src/frontend/cocoa/OGLDisplayOutput.cpp b/desmume/src/frontend/cocoa/OGLDisplayOutput.cpp index 411c408d2..bcf40ba02 100644 --- a/desmume/src/frontend/cocoa/OGLDisplayOutput.cpp +++ b/desmume/src/frontend/cocoa/OGLDisplayOutput.cpp @@ -18,6 +18,7 @@ #include "OGLDisplayOutput.h" #include "cocoa_globals.h" #include "utilities.h" +#include "../../utils/colorspacehandler/colorspacehandler.h" #include "../../filter/videofilter.h" #include @@ -7384,11 +7385,11 @@ void OGLDisplayLayer::LoadFrameOGL(bool isMainSizeNative, bool isTouchSizeNative { if (this->_videoColorFormat == GL_UNSIGNED_SHORT_1_5_5_5_REV) { - RGB555ToBGRA8888Buffer((const uint16_t *)this->_videoSrcNativeBuffer[0], this->_vf[0]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); + ColorspaceConvertBuffer555To8888Opaque((const uint16_t *)this->_videoSrcNativeBuffer[0], this->_vf[0]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); } else { - RGB888ToBGRA8888Buffer((const uint32_t *)this->_videoSrcNativeBuffer[0], this->_vf[0]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); + ColorspaceConvertBuffer888XTo8888Opaque((const uint32_t *)this->_videoSrcNativeBuffer[0], this->_vf[0]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); } } } @@ -7426,11 +7427,11 @@ void OGLDisplayLayer::LoadFrameOGL(bool isMainSizeNative, bool isTouchSizeNative { if (this->_videoColorFormat == GL_UNSIGNED_SHORT_1_5_5_5_REV) { - RGB555ToBGRA8888Buffer((const uint16_t *)this->_videoSrcNativeBuffer[1], this->_vf[1]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); + ColorspaceConvertBuffer555To8888Opaque((const uint16_t *)this->_videoSrcNativeBuffer[1], this->_vf[1]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); } else { - RGB888ToBGRA8888Buffer((const uint32_t *)this->_videoSrcNativeBuffer[1], this->_vf[1]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); + ColorspaceConvertBuffer888XTo8888Opaque((const uint32_t *)this->_videoSrcNativeBuffer[1], this->_vf[1]->GetSrcBufferPtr(), GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT); } } } diff --git a/desmume/src/frontend/cocoa/cocoa_output.mm b/desmume/src/frontend/cocoa/cocoa_output.mm index 0a9d272c4..00ff212ec 100644 --- a/desmume/src/frontend/cocoa/cocoa_output.mm +++ b/desmume/src/frontend/cocoa/cocoa_output.mm @@ -718,11 +718,11 @@ if (dispInfo.pixelBytes == 2) { - ColorspaceConvertBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); + ColorspaceConvertBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); } else if (dispInfo.pixelBytes == 4) { - RGBA8888ForceOpaqueBuffer((u32 *)displayBuffer, bitmapData, (w * h)); + memcpy(bitmapData, displayBuffer, w * h * sizeof(uint32_t)); } pthread_rwlock_unlock(self.rwlockProducer); diff --git a/desmume/src/frontend/cocoa/cocoa_videofilter.mm b/desmume/src/frontend/cocoa/cocoa_videofilter.mm index cced55964..d8e53efd0 100644 --- a/desmume/src/frontend/cocoa/cocoa_videofilter.mm +++ b/desmume/src/frontend/cocoa/cocoa_videofilter.mm @@ -18,7 +18,7 @@ #import "cocoa_videofilter.h" #import -#include "utilities.h" +#include "../../utils/colorspacehandler/colorspacehandler.h" @implementation CocoaVideoFilter @@ -139,7 +139,7 @@ } uint32_t *bitmapData = (uint32_t *)[imageRep bitmapData]; - RGBA8888ForceOpaqueBuffer((const uint32_t *)[self runFilter], bitmapData, (w * h)); + ColorspaceConvertBuffer888XTo8888Opaque((const uint32_t *)[self runFilter], bitmapData, w * h); #ifdef MSB_FIRST uint32_t *bitmapDataEnd = bitmapData + (w * h); diff --git a/desmume/src/frontend/cocoa/utilities.c b/desmume/src/frontend/cocoa/utilities.c index 823b76dfc..09968abe8 100644 --- a/desmume/src/frontend/cocoa/utilities.c +++ b/desmume/src/frontend/cocoa/utilities.c @@ -126,212 +126,6 @@ bool IsOSXVersionSupported(const unsigned int major, const unsigned int minor, c return result; } -/******************************************************************************************** - RGB555ToRGBA8888() - INLINE - - Converts a color from 15-bit RGB555 format into 32-bit RGBA8888 format. - - Takes: - color16 - The pixel in 15-bit RGB555 format. - - Returns: - A 32-bit unsigned integer containing the RGBA8888 formatted color. - - Details: - The input and output pixels are expected to have little-endian byte order. - ********************************************************************************************/ -inline uint32_t RGB555ToRGBA8888(const uint16_t color16) -{ - return (bits5to8[((color16 >> 0) & 0x001F)] << 0) | - (bits5to8[((color16 >> 5) & 0x001F)] << 8) | - (bits5to8[((color16 >> 10) & 0x001F)] << 16) | - 0xFF000000; -} - -/******************************************************************************************** - RGB555ToBGRA8888() - INLINE - - Converts a color from 15-bit RGB555 format into 32-bit BGRA8888 format. - - Takes: - color16 - The pixel in 15-bit RGB555 format. - - Returns: - A 32-bit unsigned integer containing the BGRA8888 formatted color. - - Details: - The input and output pixels are expected to have little-endian byte order. - ********************************************************************************************/ -inline uint32_t RGB555ToBGRA8888(const uint16_t color16) -{ - return (bits5to8[((color16 >> 10) & 0x001F)] << 0) | - (bits5to8[((color16 >> 5) & 0x001F)] << 8) | - (bits5to8[((color16 >> 0) & 0x001F)] << 16) | - 0xFF000000; -} - -/******************************************************************************************** - RGB888ToBGRA8888() - INLINE - - Converts a color from 24-bit RGB888 format into 32-bit BGRA8888 format. - - Takes: - color32 - The pixel in 24-bit RGB888 format. - - Returns: - A 32-bit unsigned integer containing the BGRA8888 formatted color. - - Details: - The input and output pixels are expected to have little-endian byte order. - ********************************************************************************************/ -inline uint32_t RGB888ToBGRA8888(const uint32_t color32) -{ - return ((color32 & 0x000000FF) << 16) | - ((color32 & 0x0000FF00) ) | - ((color32 & 0x00FF0000) >> 16) | - 0xFF000000; -} - -/******************************************************************************************** - RGBA8888ForceOpaque() - INLINE - - Forces the alpha channel of a 32-bit RGBA8888 color to a value of 0xFF. - - Takes: - color32 - The pixel in 32-bit RGBA8888 format. - - Returns: - A 32-bit unsigned integer containing the RGBA8888 formatted color. - - Details: - The input and output pixels are expected to have little-endian byte order. - ********************************************************************************************/ -inline uint32_t RGBA8888ForceOpaque(const uint32_t color32) -{ - return color32 | 0xFF000000; -} - -/******************************************************************************************** - RGB555ToRGBA8888Buffer() - - Copies a 15-bit RGB555 pixel buffer into a 32-bit RGBA8888 pixel buffer. - - Takes: - srcBuffer - Pointer to the source 15-bit RGB555 pixel buffer. - - destBuffer - Pointer to the destination 32-bit RGBA8888 pixel buffer. - - pixelCount - The number of pixels to copy. - - Returns: - Nothing. - - Details: - The source and destination pixels are expected to have little-endian byte order. - Also, it is the caller's responsibility to ensure that the source and destination - buffers are large enough to accomodate the requested number of pixels. - ********************************************************************************************/ -void RGB555ToRGBA8888Buffer(const uint16_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount) -{ - const uint32_t *__restrict__ destBufferEnd = destBuffer + pixelCount; - - while (destBuffer < destBufferEnd) - { - *destBuffer++ = RGB555ToRGBA8888(*srcBuffer++); - } -} - -/******************************************************************************************** - RGB555ToBGRA8888Buffer() - - Copies a 15-bit RGB555 pixel buffer into a 32-bit BGRA8888 pixel buffer. - - Takes: - srcBuffer - Pointer to the source 15-bit RGB555 pixel buffer. - - destBuffer - Pointer to the destination 32-bit BGRA8888 pixel buffer. - - pixelCount - The number of pixels to copy. - - Returns: - Nothing. - - Details: - The source and destination pixels are expected to have little-endian byte order. - Also, it is the caller's responsibility to ensure that the source and destination - buffers are large enough to accomodate the requested number of pixels. - ********************************************************************************************/ -void RGB555ToBGRA8888Buffer(const uint16_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount) -{ - const uint32_t *__restrict__ destBufferEnd = destBuffer + pixelCount; - - while (destBuffer < destBufferEnd) - { - *destBuffer++ = RGB555ToBGRA8888(*srcBuffer++); - } -} - -/******************************************************************************************** - RGB888ToBGRA8888Buffer() - - Copies a 24-bit RGB888 pixel buffer into a 32-bit BGRA8888 pixel buffer. - - Takes: - srcBuffer - Pointer to the source 24-bit RGB888 pixel buffer. - - destBuffer - Pointer to the destination 32-bit BGRA8888 pixel buffer. - - pixelCount - The number of pixels to copy. - - Returns: - Nothing. - - Details: - The source and destination pixels are expected to have little-endian byte order. - Also, it is the caller's responsibility to ensure that the source and destination - buffers are large enough to accomodate the requested number of pixels. - ********************************************************************************************/ -void RGB888ToBGRA8888Buffer(const uint32_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount) -{ - const uint32_t *__restrict__ destBufferEnd = destBuffer + pixelCount; - - while (destBuffer < destBufferEnd) - { - *destBuffer++ = RGB888ToBGRA8888(*srcBuffer++); - } -} - -/******************************************************************************************** - RGBA8888ForceOpaqueBuffer() - - Copies a 32-bit RGBA8888 pixel buffer into another 32-bit RGBA8888 pixel buffer. - The pixels in the destination buffer will have an alpha value of 0xFF. - - Takes: - srcBuffer - Pointer to the source 32-bit RGBA8888 pixel buffer. - - destBuffer - Pointer to the destination 32-bit RGBA8888 pixel buffer. - - pixelCount - The number of pixels to copy. - - Returns: - Nothing. - - Details: - The source and destination pixels are expected to have little-endian byte order. - Also, it is the caller's responsibility to ensure that the source and destination - buffers are large enough to accomodate the requested number of pixels. - ********************************************************************************************/ -void RGBA8888ForceOpaqueBuffer(const uint32_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount) -{ - const uint32_t *__restrict__ destBufferEnd = destBuffer + pixelCount; - - while (destBuffer < destBufferEnd) - { - *destBuffer++ = RGBA8888ForceOpaque(*srcBuffer++); - } -} - /******************************************************************************************** GetNearestPositivePOT() diff --git a/desmume/src/frontend/cocoa/utilities.h b/desmume/src/frontend/cocoa/utilities.h index 6498059ba..551394964 100644 --- a/desmume/src/frontend/cocoa/utilities.h +++ b/desmume/src/frontend/cocoa/utilities.h @@ -27,17 +27,7 @@ extern "C" { #endif -bool IsOSXVersionSupported(const unsigned int major, const unsigned int minor, const unsigned int revision); - -uint32_t RGB555ToRGBA8888(const uint16_t color16); -uint32_t RGB555ToBGRA8888(const uint16_t color16); -uint32_t RGB888ToBGRA8888(const uint32_t color32); -uint32_t RGBA8888ForceOpaque(const uint32_t color32); -void RGB555ToRGBA8888Buffer(const uint16_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount); -void RGB555ToBGRA8888Buffer(const uint16_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount); -void RGB888ToBGRA8888Buffer(const uint32_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount); -void RGBA8888ForceOpaqueBuffer(const uint32_t *__restrict__ srcBuffer, uint32_t *__restrict__ destBuffer, size_t pixelCount); - +bool IsOSXVersionSupported(const unsigned int major, const unsigned int minor, const unsigned int revision); uint32_t GetNearestPositivePOT(uint32_t value); #ifdef __cplusplus diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index bc71c3eb1..d7622ac2f 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -426,6 +426,54 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric } } +template +void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo8888Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo8888Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } +} + size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { size_t i = 0; @@ -612,7 +660,7 @@ size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict s { size_t i = 0; - for (;i < pixCount; i++) + for (; i < pixCount; i++) { dst[i] = ColorspaceConvert6665To5551(src[i]); } @@ -630,6 +678,40 @@ size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 * return this->ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(src, dst, pixCount); } +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo8888Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); +} + template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); @@ -659,3 +741,8 @@ template void ColorspaceConvertBuffer6665To5551(const u32 *__restric template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h index 3cecbee9a..b73b3d021 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -214,12 +214,34 @@ FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor) return ColorspaceConvert6665To5551(srcColorComponent); } +template +FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = (SWAP_RB) ? srcColor.b : srcColor.r; + outColor.g = srcColor.g; + outColor.b = (SWAP_RB) ? srcColor.r : srcColor.b; + outColor.a = 0xFF; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert888XTo8888Opaque(srcColorComponent); +} + template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); class ColorspaceHandler { @@ -255,6 +277,11 @@ public: size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp index 3bbe895d9..72a482a13 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -212,6 +212,17 @@ FORCEINLINE v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); } +template +FORCEINLINE v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src) +{ + if (SWAP_RB) + { + return _mm256_or_si256( _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm256_set1_epi32(0xFF000000) ); + } + + return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000)); +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) { @@ -344,6 +355,26 @@ size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__ return i; } +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); @@ -464,6 +495,26 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); } +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); @@ -488,4 +539,7 @@ template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, c template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); + #endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h index 730bf730f..2ac8dd7b6 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -32,6 +32,7 @@ template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &s template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); class ColorspaceHandler_AVX2 : public ColorspaceHandler { @@ -67,6 +68,11 @@ public: size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp index b4b39f751..dd9b34ea1 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -172,6 +172,17 @@ FORCEINLINE v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, co return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); } +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src) +{ + if (SWAP_RB) + { + return vec_or( vec_perm(src, src, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15})), vec_splat_u32(0xFF000000) ); + } + + return vec_or(src, vec_splat_u32(0xFF000000)); +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) { @@ -258,6 +269,19 @@ size_t ColorspaceConvertBuffer6665To5551_AltiVec(const u32 *__restrict src, u16 return i; } +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert888XTo8888Opaque_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); @@ -318,6 +342,16 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551_SwapRB(const u32 *__re return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); } +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); @@ -342,4 +376,7 @@ template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); + #endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h index d26e05eba..b5041b638 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -32,6 +32,7 @@ template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); // AltiVec has very poor support for dealing with unaligned addresses (it's possible, just // very obtuse), so we're not even going to bother dealing with any unaligned addresses. @@ -57,6 +58,9 @@ public: size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_ALTIVEC diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp index 31b6ff156..0c21a4e55 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -250,6 +250,21 @@ FORCEINLINE v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); } +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src) +{ + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + return _mm_or_si128( _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm_set1_epi32(0xFF000000) ); +#else + return _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_set1_epi32(0xFF000000) ); +#endif + } + + return _mm_or_si128(src, _mm_set1_epi32(0xFF000000)); +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) { @@ -382,6 +397,26 @@ size_t ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__ return i; } +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); @@ -502,6 +537,26 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); } +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); @@ -526,4 +581,7 @@ template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, c template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); + #endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h index 5b44577ea..08df5d8cd 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -32,6 +32,7 @@ template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &s template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); class ColorspaceHandler_SSE2 : public ColorspaceHandler { @@ -67,6 +68,11 @@ public: size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_SSE2