From 8600466498105e6a4121ba0076237fc495de340b Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 27 Jul 2017 22:04:42 -0700 Subject: [PATCH] OpenGL Renderer: Standardize 32-bit color red-blue swapping in OpenGLRenderer::_FlushFramebufferConvertOnCPU(). --- desmume/src/OGLRender.cpp | 113 ++------------- .../colorspacehandler/colorspacehandler.cpp | 131 +++++++++++++++++- .../colorspacehandler/colorspacehandler.h | 40 +++++- .../colorspacehandler_AVX2.cpp | 106 +++++++++++++- .../colorspacehandler_AVX2.h | 11 +- .../colorspacehandler_AltiVec.cpp | 78 ++++++++++- .../colorspacehandler_AltiVec.h | 9 +- .../colorspacehandler_SSE2.cpp | 110 ++++++++++++++- .../colorspacehandler_SSE2.h | 11 +- 9 files changed, 497 insertions(+), 112 deletions(-) mode change 100644 => 100755 desmume/src/OGLRender.cpp diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp old mode 100644 new mode 100755 index 1c5b0b85e..af1e72c06 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -1,7 +1,7 @@ /* Copyright (C) 2006 yopyop Copyright (C) 2006-2007 shash - Copyright (C) 2008-2016 DeSmuME team + Copyright (C) 2008-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1201,9 +1201,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } -#endif -#ifdef ENABLE_SSE2 #pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) @@ -1230,29 +1228,18 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) ) { -#ifdef ENABLE_SSSE3 +#ifdef ENABLE_SSE2 const size_t ssePixCount = pixCount - (pixCount % 8); for (; i < ssePixCount; i += 8) { const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4)); - if (SWAP_RB) - { - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) ); - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)) ); - } - else - { - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), srcColorLo); - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), srcColorHi); - } - + _mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), ColorspaceCopy32_SSE2(srcColorLo)); + _mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), ColorspaceCopy32_SSE2(srcColorHi)); _mm_store_si128( (__m128i *)(dstFramebuffer16 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } -#endif -#ifdef ENABLE_SSSE3 #pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) @@ -1266,33 +1253,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else if (dstFramebufferMain != NULL) { - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i)); - _mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) ); - } -#endif - -#ifdef ENABLE_SSSE3 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dstFramebufferMain[i].r = srcFramebuffer[i].b; - dstFramebufferMain[i].g = srcFramebuffer[i].g; - dstFramebufferMain[i].b = srcFramebuffer[i].r; - dstFramebufferMain[i].a = srcFramebuffer[i].a; - } - } - else - { - memcpy(dstFramebufferMain, srcFramebuffer, this->_framebufferWidth * this->_framebufferHeight * sizeof(FragmentColor)); - } - + ColorspaceCopyBuffer32((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount); this->_renderNeedsFlushMain = false; } else @@ -1324,9 +1285,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebufferMain + iw + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } -#endif -#ifdef ENABLE_SSE2 #pragma LOOPVECTORIZE_DISABLE #endif for (; x < pixCount; x++, ir++, iw++) @@ -1372,29 +1331,11 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4)); - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 0), _mm_shuffle_epi8(srcColorLo, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) ); - _mm_store_si128( (__m128i *)(dstFramebufferMain + i + 4), _mm_shuffle_epi8(srcColorHi, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) ); -#else - const __m128i swappedLo = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorLo, _mm_set1_epi32(0x000000FF)), 16)) ); - const __m128i swappedHi = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColorHi, _mm_set1_epi32(0x000000FF)), 16)) ); - _mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), swappedLo); - _mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), swappedHi); -#endif - } - else - { - _mm_store_si128((__m128i *)(dstFramebufferMain + i + 0), srcColorLo); - _mm_store_si128((__m128i *)(dstFramebufferMain + i + 4), srcColorHi); - } - + _mm_store_si128((__m128i *)(dstFramebufferMain + iw + 0), ColorspaceCopy32_SSE2(srcColorLo)); + _mm_store_si128((__m128i *)(dstFramebufferMain + iw + 4), ColorspaceCopy32_SSE2(srcColorHi)); _mm_store_si128( (__m128i *)(dstFramebuffer16 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } -#endif -#ifdef ENABLE_SSE2 #pragma LOOPVECTORIZE_DISABLE #endif for (; x < pixCount; x++, ir++, iw++) @@ -1409,45 +1350,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else if (dstFramebufferMain != NULL) { - const FragmentColor *__restrict srcPtr = srcFramebuffer; - FragmentColor *__restrict dstPtr = dstFramebufferMain + ((this->_framebufferHeight - 1) * this->_framebufferWidth); - - for (size_t y = 0; y < this->_framebufferHeight; y++) + for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - if (SWAP_RB) - { -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i)); -#ifdef ENABLE_SSSE3 - _mm_store_si128( (__m128i *)(dstFramebufferMain + i), _mm_shuffle_epi8(srcColor, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) ); -#else - const __m128i swappedColor = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(srcColor, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(srcColor, _mm_set1_epi32(0x000000FF)), 16))); - _mm_store_si128((__m128i *)(dstFramebufferMain + i), swappedColor); -#endif - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (size_t x = 0; x < this->_framebufferWidth; x++) - { - dstPtr[x].r = srcPtr[x].b; - dstPtr[x].g = srcPtr[x].g; - dstPtr[x].b = srcPtr[x].r; - dstPtr[x].a = srcPtr[x].a; - } - } - else - { - memcpy(dstPtr, srcPtr, this->_framebufferWidth * sizeof(FragmentColor)); - } - - srcPtr += this->_framebufferWidth; - dstPtr -= this->_framebufferWidth; + ColorspaceCopyBuffer32((u32 *)srcFramebuffer + ir, (u32 *)dstFramebufferMain + iw, pixCount); } this->_renderNeedsFlushMain = false; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index d7622ac2f..30ff4cac8 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ */ #include "colorspacehandler.h" +#include #if defined(ENABLE_AVX2) #include "colorspacehandler_AVX2.cpp" @@ -54,6 +55,7 @@ static const ColorspaceHandler csh; #endif +CACHE_ALIGN u16 color_5551_swap_rb[32768]; CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; CACHE_ALIGN u32 color_555_to_666[32768]; @@ -120,6 +122,7 @@ void ColorspaceHandlerInit() if (needInitTables) { +#define RGB16_SWAP_RB_BITLOGIC(col) ( (((col)&0x001F)<<10) | ((col)&0x03E0) | (((col)&0x7C00)>>10) | ((col)&0x8000) ) #define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) #define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) #define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) @@ -127,6 +130,8 @@ void ColorspaceHandlerInit() for (size_t i = 0; i < 32768; i++) { + color_5551_swap_rb[i] = LE_TO_LOCAL_16( RGB16_SWAP_RB_BITLOGIC(i) ); + color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); @@ -474,6 +479,86 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi } } +template +void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCount * sizeof(u16)); + return; + } + + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#endif + + if (IS_UNALIGNED) + { + i = csh.CopyBuffer16_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.CopyBuffer16_SwapRB(src, dst, pixCountVector); + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy16(src[i]); + } +} + +template +void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCount * sizeof(u32)); + return; + } + + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); +#endif + + if (IS_UNALIGNED) + { + i = csh.CopyBuffer32_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.CopyBuffer32_SwapRB(src, dst, pixCountVector); + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy32(src[i]); + } +} + size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { size_t i = 0; @@ -712,6 +797,40 @@ size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); } +size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy16(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return this->CopyBuffer16_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy32(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->CopyBuffer32_SwapRB(src, dst, pixCount); +} + template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); @@ -746,3 +865,13 @@ template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); + +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h index d878fb6f6..6e14d2918 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -96,6 +96,7 @@ extern CACHE_ALIGN const u8 material_3bit_to_5bit[8]; extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; +extern CACHE_ALIGN u16 color_5551_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_666[32768]; @@ -103,6 +104,7 @@ extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; extern CACHE_ALIGN u32 color_555_to_888[32768]; +#define COLOR5551_SWAP_RB(col) (color_5551_swap_rb[(col)]) // Swaps the red-blue colors of a 16-bit RGBA5551 color #define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color #define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped #define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color @@ -236,6 +238,33 @@ FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor) return ColorspaceConvert888XTo8888Opaque(srcColorComponent); } +template +FORCEINLINE u16 ColorspaceCopy16(u16 srcColor) +{ + return (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor; +} + +template +FORCEINLINE u32 ColorspaceCopy32(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = (SWAP_RB) ? srcColor.b : srcColor.r; + outColor.g = srcColor.g; + outColor.b = (SWAP_RB) ? srcColor.r : srcColor.b; + outColor.a = srcColor.a; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceCopy32(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceCopy32(srcColorComponent); +} + template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); @@ -244,6 +273,9 @@ template void ColorspaceConvertBuffer8888To5551 template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); + class ColorspaceHandler { public: @@ -283,6 +315,12 @@ public: size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp index 82b66fe32..56f1fe4aa 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -225,6 +225,28 @@ FORCEINLINE v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src) return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000)); } +template +FORCEINLINE v256u16 ColorspaceCopy16_AVX2(const v256u16 &src) +{ + if (SWAP_RB) + { + return _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) ); + } + + return src; +} + +template +FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src) +{ + if (SWAP_RB) + { + return _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + } + + return src; +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) { @@ -377,6 +399,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si return i; } +template +size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec256 * sizeof(u16)); + return pixCountVec256; + } + + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2(src_vec256)); + } + else + { + _mm256_store_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2(src_vec256)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec256 * sizeof(u32)); + return pixCountVec256; + } + + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2(src_vec256)); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2(src_vec256)); + } + } + + return i; +} + size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); @@ -517,6 +595,26 @@ size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned( return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); } +size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX2(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); @@ -544,4 +642,10 @@ template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, c template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); + +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); + #endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h index 2ac8dd7b6..b2e926200 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,9 @@ template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &s template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); + class ColorspaceHandler_AVX2 : public ColorspaceHandler { public: @@ -73,6 +76,12 @@ public: size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp index dd9b34ea1..6d8eb217e 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -183,6 +183,28 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src return vec_or(src, vec_splat_u32(0xFF000000)); } +template +FORCEINLINE v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src) +{ + if (SWAP_RB) + { + return vec_or( vec_or(vec_sr(vec_and(src, vec_splat_u16(0x7C00)), vec_splat_u16(10)), vec_or(vec_and(src, vec_splat_u16(0x0E30)), vec_sl(vec_and(src, vec_splat_u16(0x001F)), vec_splat_u16(10)))), vec_and(src, vec_splat_u16(0x8000)) ); + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src) +{ + if (SWAP_RB) + { + return vec_perm(src, src, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15})); + } + + return src; +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) { @@ -282,6 +304,44 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst, return i; } +template +size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceCopy16_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceCopy32_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); @@ -352,6 +412,16 @@ size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); } +size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AltiVec(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); @@ -379,4 +449,10 @@ template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); + +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); + #endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h index b5041b638..0428c16c5 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,9 @@ template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); + // AltiVec has very poor support for dealing with unaligned addresses (it's possible, just // very obtuse), so we're not even going to bother dealing with any unaligned addresses. class ColorspaceHandler_AltiVec : public ColorspaceHandler @@ -61,6 +64,10 @@ public: size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_ALTIVEC diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp index 9295852f4..1c1394858 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -265,6 +265,32 @@ FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src) return _mm_or_si128(src, _mm_set1_epi32(0xFF000000)); } +template +FORCEINLINE v128u16 ColorspaceCopy16_SSE2(const v128u16 &src) +{ + if (SWAP_RB) + { + return _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) ); + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src) +{ + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + return _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); +#else + return _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) ); +#endif + } + + return src; +} + template static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) { @@ -417,6 +443,62 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si return i; } +template +size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2(src_vec128)); + } + else + { + _mm_store_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2(src_vec128)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2(src_vec128)); + } + else + { + _mm_store_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2(src_vec128)); + } + } + + return i; +} + size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const { return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); @@ -557,6 +639,26 @@ size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned( return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); } +size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_SSE2(src, dst, pixCount); +} + template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); @@ -584,4 +686,10 @@ template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, c template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); + +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); + #endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h index 08df5d8cd..81cf3a5a4 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 DeSmuME team + Copyright (C) 2016-2017 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,9 @@ template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &s template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); + class ColorspaceHandler_SSE2 : public ColorspaceHandler { public: @@ -73,6 +76,12 @@ public: size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; }; #endif // ENABLE_SSE2