diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp
index 6454042b9..42574d8a1 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp
@@ -563,6 +563,181 @@ void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
 	}
 }
 
+template <bool SWAP_RB, bool IS_UNALIGNED>
+void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) // In place: scales the three 5-bit color channels of each pixel by intensity (optionally swapping R/B); bit 15 is preserved.
+{
+	size_t i = 0;
+
+#ifdef USEMANUALVECTORIZATION
+
+#if defined(USEVECTORSIZE_512)
+	const size_t pixCountVector = pixCount - (pixCount % 32);
+#elif defined(USEVECTORSIZE_256)
+	const size_t pixCountVector = pixCount - (pixCount % 16);
+#elif defined(USEVECTORSIZE_128)
+	const size_t pixCountVector = pixCount - (pixCount % 8);
+#endif
+
+	if (SWAP_RB)
+	{
+		if (IS_UNALIGNED)
+		{
+			i = csh.ApplyIntensityToBuffer16_SwapRB_IsUnaligned(dst, pixCountVector, intensity); // must be the SwapRB variant, otherwise the vectorized part skips the swap
+		}
+		else
+		{
+			i = csh.ApplyIntensityToBuffer16_SwapRB(dst, pixCountVector, intensity);
+		}
+	}
+	else
+	{
+		if (IS_UNALIGNED)
+		{
+			i = csh.ApplyIntensityToBuffer16_IsUnaligned(dst, pixCountVector, intensity);
+		}
+		else
+		{
+			i = csh.ApplyIntensityToBuffer16(dst, pixCountVector, intensity);
+		}
+	}
+
+#pragma LOOPVECTORIZE_DISABLE
+
+#endif // USEMANUALVECTORIZATION
+
+	if (intensity > 0.999f) // treated as full intensity: no scaling, only the optional R/B swap on the remaining pixels
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCount; i++)
+			{
+				dst[i] = COLOR5551_SWAP_RB(dst[i]);
+			}
+		}
+
+		return;
+	}
+	else if (intensity < 0.001f) // treated as zero intensity: clear the color bits, keep bit 15
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0x8000;
+		}
+
+		return;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); // 16-bit fixed-point factor; (channel * factor) >> 16 scales the channel
+
+	for (; i < pixCount; i++)
+	{
+		u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(dst[i]) : dst[i];
+
+		u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
+		u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
+		u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
+		u16 a = outColor & 0x8000; // u16, not u8: the flag lives in bit 15 and would be truncated to 0 in a u8
+
+		dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a );
+	}
+}
+
+template <bool SWAP_RB, bool IS_UNALIGNED>
+void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) // In place: scales the r, g and b bytes of each pixel by intensity (optionally swapping R/B); the top byte is preserved.
+{
+	size_t i = 0;
+
+#ifdef USEMANUALVECTORIZATION
+
+#if defined(USEVECTORSIZE_512)
+	const size_t pixCountVector = pixCount - (pixCount % 32); // NOTE(review): same moduli as the 16-bit version; the SIMD loops in this patch step by 8 (AVX2) or 4 (SSE2), which divide these
+#elif defined(USEVECTORSIZE_256)
+	const size_t pixCountVector = pixCount - (pixCount % 16);
+#elif defined(USEVECTORSIZE_128)
+	const size_t pixCountVector = pixCount - (pixCount % 8);
+#endif
+
+	if (SWAP_RB)
+	{
+		if (IS_UNALIGNED)
+		{
+			i = csh.ApplyIntensityToBuffer32_SwapRB_IsUnaligned(dst, pixCountVector, intensity); // must be the SwapRB variant, otherwise the vectorized part skips the swap
+		}
+		else
+		{
+			i = csh.ApplyIntensityToBuffer32_SwapRB(dst, pixCountVector, intensity);
+		}
+	}
+	else
+	{
+		if (IS_UNALIGNED)
+		{
+			i = csh.ApplyIntensityToBuffer32_IsUnaligned(dst, pixCountVector, intensity);
+		}
+		else
+		{
+			i = csh.ApplyIntensityToBuffer32(dst, pixCountVector, intensity);
+		}
+	}
+
+#pragma LOOPVECTORIZE_DISABLE
+
+#endif // USEMANUALVECTORIZATION
+
+	if (intensity > 0.999f) // treated as full intensity: no scaling, only the optional R/B swap on the remaining pixels
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCount; i++)
+			{
+				FragmentColor dstColor;
+				dstColor.color = dst[i];
+
+				FragmentColor &outColor = (FragmentColor &)dst[i];
+				outColor.r = dstColor.b;
+				outColor.b = dstColor.r;
+			}
+		}
+
+		return;
+	}
+	else if (intensity < 0.001f) // treated as zero intensity: clear the low 24 bits, keep the top byte
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0xFF000000;
+		}
+
+		return;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); // 16-bit fixed-point factor; (channel * factor) >> 16 scales the channel
+
+	if (SWAP_RB)
+	{
+		for (; i < pixCount; i++)
+		{
+			FragmentColor dstColor;
+			dstColor.color = dst[i];
+
+			FragmentColor &outColor = (FragmentColor &)dst[i];
+			outColor.r = (u8)( ((u16)dstColor.b * intensity_u16) >> 16 );
+			outColor.g = (u8)( ((u16)dstColor.g * intensity_u16) >> 16 );
+			outColor.b = (u8)( ((u16)dstColor.r * intensity_u16) >> 16 );
+		}
+	}
+	else
+	{
+		for (; i < pixCount; i++)
+		{
+			FragmentColor &outColor = (FragmentColor &)dst[i];
+			outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
+			outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
+			outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
+		}
+	}
+}
+
 size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
 {
 	size_t i = 0;
@@ -835,6 +1010,176 @@ size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *d
 	return this->CopyBuffer32_SwapRB(src, dst, pixCount);
 }
 
+size_t ColorspaceHandler::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const // Scalar version without R/B swap; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		return pixCount; // nothing to change at full intensity without a swap
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0x8000;
+		}
+
+		return i;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+
+	for (; i < pixCount; i++)
+	{
+		u16 outColor = dst[i];
+
+		u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
+		u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
+		u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
+		u16 a = outColor & 0x8000; // u16, not u8: bit 15 would be truncated to 0 in a u8
+
+		dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a );
+	}
+
+	return i;
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const // Scalar version that also swaps R and B; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = COLOR5551_SWAP_RB(dst[i]);
+		}
+
+		return i;
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0x8000;
+		}
+
+		return i;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+
+	for (; i < pixCount; i++)
+	{
+		u16 outColor = COLOR5551_SWAP_RB(dst[i]);
+
+		u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
+		u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
+		u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
+		u16 a = outColor & 0x8000; // u16, not u8: bit 15 would be truncated to 0 in a u8
+
+		dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a );
+	}
+
+	return i;
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const // Scalar code has no alignment requirement, so this forwards to the aligned-named version.
+{
+	return this->ApplyIntensityToBuffer16(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const // Forwards to the scalar SwapRB version.
+{
+	return this->ApplyIntensityToBuffer16_SwapRB(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const // Scalar version without R/B swap; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		return pixCount; // nothing to change at full intensity without a swap
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0xFF000000;
+		}
+
+		return i;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+
+	for (; i < pixCount; i++)
+	{
+		FragmentColor &outColor = (FragmentColor &)dst[i];
+		outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
+		outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
+		outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
+	}
+
+	return i;
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const // Scalar version that also swaps R and B; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		for (; i < pixCount; i++)
+		{
+			FragmentColor dstColor;
+			dstColor.color = dst[i];
+
+			FragmentColor &outColor = (FragmentColor &)dst[i];
+			outColor.r = dstColor.b;
+			outColor.b = dstColor.r;
+		}
+
+		return i;
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCount; i++)
+		{
+			dst[i] = dst[i] & 0xFF000000;
+		}
+
+		return i;
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+
+	for (; i < pixCount; i++)
+	{
+		FragmentColor dstColor;
+		dstColor.color = dst[i];
+
+		FragmentColor &outColor = (FragmentColor &)dst[i];
+		outColor.r = (u8)( ((u16)dstColor.b * intensity_u16) >> 16 );
+		outColor.g = (u8)( ((u16)dstColor.g * intensity_u16) >> 16 );
+		outColor.b = (u8)( ((u16)dstColor.r * intensity_u16) >> 16 );
+	}
+
+	return i;
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const // Forwards to the scalar non-swapping version.
+{
+	return this->ApplyIntensityToBuffer32(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const // Forwards to the scalar SwapRB version.
+{
+	return this->ApplyIntensityToBuffer32_SwapRB(dst, pixCount, intensity);
+}
+
 template void ColorspaceConvertBuffer555To8888Opaque<true, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
 template void ColorspaceConvertBuffer555To8888Opaque<true, false>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
 template void ColorspaceConvertBuffer555To8888Opaque<false, true>(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
@@ -879,3 +1224,13 @@ template void ColorspaceCopyBuffer32<true, true>(const u32 *src, u32 *dst, size_
 template void ColorspaceCopyBuffer32<true, false>(const u32 *src, u32 *dst, size_t pixCount);
 template void ColorspaceCopyBuffer32<false, true>(const u32 *src, u32 *dst, size_t pixCount);
 template void ColorspaceCopyBuffer32<false, false>(const u32 *src, u32 *dst, size_t pixCount);
+
+template void ColorspaceApplyIntensityToBuffer16<true, true>(u16 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer16<true, false>(u16 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer16<false, true>(u16 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer16<false, false>(u16 *dst, size_t pixCount, float intensity);
+
+template void ColorspaceApplyIntensityToBuffer32<true, true>(u32 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer32<true, false>(u32 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer32<false, true>(u32 *dst, size_t pixCount, float intensity);
+template void ColorspaceApplyIntensityToBuffer32<false, false>(u32 *dst, size_t pixCount, float intensity);
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h
index a3204b5da..d57cd8ff2 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler.h
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h
@@ -265,6 +265,65 @@ FORCEINLINE u32 ColorspaceCopy32(u32 srcColor)
 	return ColorspaceCopy32<SWAP_RB>(srcColorComponent);
 }
 
+template <bool SWAP_RB>
+FORCEINLINE u16 ColorspaceApplyIntensity16(u16 srcColor, float intensity) // Single-pixel version: returns srcColor (optionally R/B-swapped) with its 5-bit channels scaled by intensity; bit 15 is preserved.
+{
+	u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor;
+
+	if (intensity > 0.999f)
+	{
+		return outColor;
+	}
+	else if (intensity < 0.001f)
+	{
+		return (outColor & 0x8000);
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+	u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
+	u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
+	u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
+	u16 a = outColor & 0x8000; // u16, not u8: bit 15 would be truncated to 0 in a u8
+
+	return ( (r << 0) | (g << 5) | (b << 10) | a );
+}
+
+template <bool SWAP_RB>
+FORCEINLINE u32 ColorspaceApplyIntensity32(FragmentColor srcColor, float intensity) // Single-pixel version: returns srcColor (optionally R/B-swapped) with r, g and b scaled by intensity; a is copied through.
+{
+	FragmentColor outColor;
+	outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r);
+	outColor.g = srcColor.g;
+	outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b);
+	outColor.a = srcColor.a;
+
+	if (intensity > 0.999f)
+	{
+		return outColor.color;
+	}
+	else if (intensity < 0.001f)
+	{
+		return (outColor.color & 0xFF000000);
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+	outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
+	outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
+	outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
+	// outColor.a is left untouched; only r, g and b are scaled.
+
+	return outColor.color;
+}
+
+template <bool SWAP_RB>
+FORCEINLINE u32 ColorspaceApplyIntensity32(u32 srcColor, float intensity) // Convenience overload taking the packed 32-bit value.
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceApplyIntensity32<SWAP_RB>(srcColorComponent, intensity); // intensity must be forwarded; the FragmentColor overload takes two arguments
+}
+
 template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
 template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
 template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
@@ -276,6 +335,9 @@ template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceConvertBuffer888XTo8888
 template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
 template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
 
+template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity);
+template<bool SWAP_RB, bool IS_UNALIGNED> void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity);
+
 class ColorspaceHandler
 {
 public:
@@ -321,6 +383,16 @@ public:
 	
 	size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
 	size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
+
+	size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+
+	size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
 };
 
 FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a)
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
index a5d272411..ccc342c67 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
@@ -248,6 +248,62 @@ FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src)
 	return src;
 }
 
+template <bool SWAP_RB>
+FORCEINLINE v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity) // Per-vector version for sixteen 16-bit pixels: optional R/B swap, then scales the 5-bit channels by intensity; bit 15 is preserved.
+{
+	v256u16 tempSrc = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) ) : src; // green mask is 0x03E0 (bits 5-9)
+
+	if (intensity > 0.999f)
+	{
+		return tempSrc;
+	}
+	else if (intensity < 0.001f)
+	{
+		return _mm256_and_si256(tempSrc, _mm256_set1_epi16(0x8000));
+	}
+
+	v256u16 r = _mm256_and_si256(                   tempSrc,      _mm256_set1_epi16(0x001F) );
+	v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempSrc,  5), _mm256_set1_epi16(0x001F) );
+	v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempSrc, 10), _mm256_set1_epi16(0x001F) );
+	v256u16 a = _mm256_and_si256(                   tempSrc,      _mm256_set1_epi16(0x8000) );
+
+	const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+	r =                    _mm256_mulhi_epu16(r, intensity_v256); // mulhi keeps the upper 16 bits of the product, i.e. (channel * factor) >> 16
+	g = _mm256_slli_epi16( _mm256_mulhi_epu16(g, intensity_v256),  5 );
+	b = _mm256_slli_epi16( _mm256_mulhi_epu16(b, intensity_v256), 10 );
+
+	return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
+}
+
+template <bool SWAP_RB>
+FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity) // Per-vector version for eight 32-bit pixels: optional R/B swap, then scales the low three bytes by intensity; the top byte is preserved.
+{
+	v256u32 tempSrc = (SWAP_RB) ? _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30,  27,24,25,26,  23,20,21,22,  19,16,17,18,  15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2)) : src;
+
+	if (intensity > 0.999f)
+	{
+		return tempSrc;
+	}
+	else if (intensity < 0.001f)
+	{
+		return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000));
+	}
+
+	v256u16 r = _mm256_and_si256(                   tempSrc,      _mm256_set1_epi32(0x000000FF) );
+	v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc,  8), _mm256_set1_epi32(0x000000FF) );
+	v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 16), _mm256_set1_epi32(0x000000FF) );
+	v256u32 a = _mm256_and_si256(                   tempSrc,      _mm256_set1_epi32(0xFF000000) );
+
+	const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+	r =                    _mm256_mulhi_epu16(r, intensity_v256); // each channel sits in the low 16 bits of its 32-bit lane, so the 16-bit multiply leaves the upper half at zero
+	g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256),  8 );
+	b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
+
+	return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
+}
+
 template <bool SWAP_RB, bool IS_UNALIGNED>
 static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256)
 {
@@ -456,6 +512,160 @@ size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec2
 	return i;
 }
 
+template <bool SWAP_RB, bool IS_UNALIGNED>
+size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, float intensity) // In-place buffer version, 16 pixels per iteration; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCountVec256; i+=16)
+			{
+				const v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
+				const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ); // green mask is 0x03E0 (bits 5-9)
+
+				if (IS_UNALIGNED)
+				{
+					_mm256_storeu_si256( (v256u16 *)(dst+i), tempDst);
+				}
+				else
+				{
+					_mm256_store_si256( (v256u16 *)(dst+i), tempDst);
+				}
+			}
+		}
+		else
+		{
+			return pixCountVec256;
+		}
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCountVec256; i+=16)
+		{
+			if (IS_UNALIGNED)
+			{
+				_mm256_storeu_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) );
+			}
+			else
+			{
+				_mm256_store_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) );
+			}
+		}
+	}
+	else
+	{
+		const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+		for (; i < pixCountVec256; i+=16)
+		{
+			v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
+			v256u16 tempDst = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256; // green mask is 0x03E0 (bits 5-9)
+
+			v256u16 r = _mm256_and_si256(                   tempDst,      _mm256_set1_epi16(0x001F) );
+			v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempDst,  5), _mm256_set1_epi16(0x001F) );
+			v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempDst, 10), _mm256_set1_epi16(0x001F) );
+			v256u16 a = _mm256_and_si256(                   tempDst,      _mm256_set1_epi16(0x8000) );
+
+			r =                    _mm256_mulhi_epu16(r, intensity_v256);
+			g = _mm256_slli_epi16( _mm256_mulhi_epu16(g, intensity_v256),  5 ); // 16-bit lanes, so shift with epi16 like the per-vector helper does
+			b = _mm256_slli_epi16( _mm256_mulhi_epu16(b, intensity_v256), 10 );
+
+			tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
+
+			if (IS_UNALIGNED)
+			{
+				_mm256_storeu_si256((v256u16 *)(dst+i), tempDst);
+			}
+			else
+			{
+				_mm256_store_si256((v256u16 *)(dst+i), tempDst);
+			}
+		}
+	}
+
+	return i;
+}
+
+template <bool SWAP_RB, bool IS_UNALIGNED>
+size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, float intensity) // In-place buffer version, 8 pixels per iteration; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCountVec256; i+=8)
+			{
+				const v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
+				const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30,  27,24,25,26,  23,20,21,22,  19,16,17,18,  15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2));
+
+				if (IS_UNALIGNED)
+				{
+					_mm256_storeu_si256( (v256u32 *)(dst+i), tempDst);
+				}
+				else
+				{
+					_mm256_store_si256( (v256u32 *)(dst+i), tempDst);
+				}
+			}
+		}
+		else
+		{
+			return pixCountVec256;
+		}
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCountVec256; i+=8)
+		{
+			if (IS_UNALIGNED)
+			{
+				_mm256_storeu_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) );
+			}
+			else
+			{
+				_mm256_store_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) );
+			}
+		}
+	}
+	else
+	{
+		const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+		for (; i < pixCountVec256; i+=8)
+		{
+			v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
+			v256u32 tempDst = (SWAP_RB) ? _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30,  27,24,25,26,  23,20,21,22,  19,16,17,18,  15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2)) : dst_v256;
+
+			v256u16 r = _mm256_and_si256(                   tempDst,      _mm256_set1_epi32(0x000000FF) );
+			v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst,  8), _mm256_set1_epi32(0x000000FF) );
+			v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempDst, 16), _mm256_set1_epi32(0x000000FF) );
+			v256u32 a = _mm256_and_si256(                   tempDst,      _mm256_set1_epi32(0xFF000000) );
+
+			r =                    _mm256_mulhi_epu16(r, intensity_v256);
+			g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256),  8 );
+			b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
+
+			tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
+
+			if (IS_UNALIGNED)
+			{
+				_mm256_storeu_si256((v256u32 *)(dst+i), tempDst);
+			}
+			else
+			{
+				_mm256_store_si256((v256u32 *)(dst+i), tempDst);
+			}
+		}
+	}
+
+	return i;
+}
+
 size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
 {
 	return ColorspaceConvertBuffer555To8888Opaque_AVX2<false, false>(src, dst, pixCount);
@@ -616,6 +826,46 @@ size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u
 	return ColorspaceCopyBuffer32_AVX2<true, true>(src, dst, pixCount);
 }
 
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_AVX2<false, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_AVX2<true, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_AVX2<false, true>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_AVX2<true, true>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_AVX2<false, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_AVX2<true, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_AVX2<false, true>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_AVX2<true, true>(dst, pixCount, intensity);
+}
+
 template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
 template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi);
 
@@ -649,4 +899,10 @@ template v256u16 ColorspaceCopy16_AVX2<false>(const v256u16 &src);
 template v256u32 ColorspaceCopy32_AVX2<true>(const v256u32 &src);
 template v256u32 ColorspaceCopy32_AVX2<false>(const v256u32 &src);
 
+template v256u16 ColorspaceApplyIntensity16_AVX2<true>(const v256u16 &src, float intensity);
+template v256u16 ColorspaceApplyIntensity16_AVX2<false>(const v256u16 &src, float intensity);
+
+template v256u32 ColorspaceApplyIntensity32_AVX2<true>(const v256u32 &src, float intensity);
+template v256u32 ColorspaceApplyIntensity32_AVX2<false>(const v256u32 &src, float intensity);
+
 #endif // ENABLE_AVX2
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h
index b2e926200..a83b27271 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h
@@ -37,6 +37,9 @@ template<bool SWAP_RB> v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256
 template<bool SWAP_RB> v256u16 ColorspaceCopy16_AVX2(const v256u16 &src);
 template<bool SWAP_RB> v256u32 ColorspaceCopy32_AVX2(const v256u32 &src);
 
+template<bool SWAP_RB> v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity);
+template<bool SWAP_RB> v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity);
+
 class ColorspaceHandler_AVX2 : public ColorspaceHandler
 {
 public:
@@ -82,6 +85,16 @@ public:
 	
 	size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
 	size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
+
+	size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+
+	size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
 };
 
 #endif // ENABLE_AVX2
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
index 920faa16d..9b34e0266 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
@@ -292,6 +292,66 @@ FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src)
 	return src;
 }
 
+template <bool SWAP_RB>
+FORCEINLINE v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity) // Per-vector version for eight 16-bit pixels: optional R/B swap, then scales the 5-bit channels by intensity; bit 15 is preserved.
+{
+	v128u16 tempSrc = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) ) : src; // green mask is 0x03E0 (bits 5-9)
+
+	if (intensity > 0.999f)
+	{
+		return tempSrc;
+	}
+	else if (intensity < 0.001f)
+	{
+		return _mm_and_si128(tempSrc, _mm_set1_epi16(0x8000));
+	}
+
+	v128u16 r = _mm_and_si128(                tempSrc,      _mm_set1_epi16(0x001F) );
+	v128u16 g = _mm_and_si128( _mm_srli_epi16(tempSrc,  5), _mm_set1_epi16(0x001F) );
+	v128u16 b = _mm_and_si128( _mm_srli_epi16(tempSrc, 10), _mm_set1_epi16(0x001F) );
+	v128u16 a = _mm_and_si128(                tempSrc,      _mm_set1_epi16(0x8000) );
+
+	const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+	r =                 _mm_mulhi_epu16(r, intensity_v128); // mulhi keeps the upper 16 bits of the product, i.e. (channel * factor) >> 16
+	g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128),  5 );
+	b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 );
+
+	return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
+}
+
+template <bool SWAP_RB>
+FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity) // Per-vector version for four 32-bit pixels: optional R/B swap, then scales the low three bytes by intensity; the top byte is preserved.
+{
+#ifdef ENABLE_SSSE3
+	v128u32 tempSrc = (SWAP_RB) ? _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2)) : src;
+#else
+	v128u32 tempSrc = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) ) : src;
+#endif
+
+	if (intensity > 0.999f)
+	{
+		return tempSrc;
+	}
+	else if (intensity < 0.001f)
+	{
+		return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000));
+	}
+
+	v128u16 r = _mm_and_si128(                tempSrc,      _mm_set1_epi32(0x000000FF) );
+	v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc,  8), _mm_set1_epi32(0x000000FF) );
+	v128u16 b = _mm_and_si128( _mm_srli_epi32(tempSrc, 16), _mm_set1_epi32(0x000000FF) );
+	v128u32 a = _mm_and_si128(                tempSrc,      _mm_set1_epi32(0xFF000000) );
+
+	const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+	r =                 _mm_mulhi_epu16(r, intensity_v128); // each channel sits in the low 16 bits of its 32-bit lane, so the 16-bit multiply leaves the upper half at zero
+	g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128),  8 );
+	b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
+
+	return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
+}
+
 template <bool SWAP_RB, bool IS_UNALIGNED>
 static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128)
 {
@@ -500,6 +560,167 @@ size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec1
 	return i;
 }
 
+template <bool SWAP_RB, bool IS_UNALIGNED>
+size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, float intensity) // In-place buffer version, 8 pixels per iteration; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCountVec128; i+=8)
+			{
+				const v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
+				const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ); // green mask is 0x03E0 (bits 5-9)
+
+				if (IS_UNALIGNED)
+				{
+					_mm_storeu_si128( (v128u16 *)(dst+i), tempDst);
+				}
+				else
+				{
+					_mm_store_si128( (v128u16 *)(dst+i), tempDst);
+				}
+			}
+		}
+		else
+		{
+			return pixCountVec128;
+		}
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCountVec128; i+=8)
+		{
+			if (IS_UNALIGNED)
+			{
+				_mm_storeu_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) );
+			}
+			else
+			{
+				_mm_store_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) );
+			}
+		}
+	}
+	else
+	{
+		const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+		for (; i < pixCountVec128; i+=8)
+		{
+			v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
+			v128u16 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128; // green mask is 0x03E0 (bits 5-9)
+
+			v128u16 r = _mm_and_si128(                tempDst,      _mm_set1_epi16(0x001F) );
+			v128u16 g = _mm_and_si128( _mm_srli_epi16(tempDst,  5), _mm_set1_epi16(0x001F) );
+			v128u16 b = _mm_and_si128( _mm_srli_epi16(tempDst, 10), _mm_set1_epi16(0x001F) );
+			v128u16 a = _mm_and_si128(                tempDst,      _mm_set1_epi16(0x8000) );
+
+			r =                 _mm_mulhi_epu16(r, intensity_v128);
+			g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128),  5 );
+			b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 );
+
+			tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
+
+			if (IS_UNALIGNED)
+			{
+				_mm_storeu_si128((v128u16 *)(dst+i), tempDst);
+			}
+			else
+			{
+				_mm_store_si128((v128u16 *)(dst+i), tempDst);
+			}
+		}
+	}
+
+	return i;
+}
+
+template <bool SWAP_RB, bool IS_UNALIGNED>
+size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, float intensity) // In-place buffer version, 4 pixels per iteration; returns the number of pixels handled.
+{
+	size_t i = 0;
+
+	if (intensity > 0.999f)
+	{
+		if (SWAP_RB)
+		{
+			for (; i < pixCountVec128; i+=4)
+			{
+				const v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
+#ifdef ENABLE_SSSE3
+				const v128u32 tempDst = _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2));
+#else
+				const v128u32 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) );
+#endif
+				if (IS_UNALIGNED)
+				{
+					_mm_storeu_si128( (v128u32 *)(dst+i), tempDst);
+				}
+				else
+				{
+					_mm_store_si128( (v128u32 *)(dst+i), tempDst);
+				}
+			}
+		}
+		else
+		{
+			return pixCountVec128;
+		}
+	}
+	else if (intensity < 0.001f)
+	{
+		for (; i < pixCountVec128; i+=4)
+		{
+			if (IS_UNALIGNED)
+			{
+				_mm_storeu_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) );
+			}
+			else
+			{
+				_mm_store_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) );
+			}
+		}
+	}
+	else
+	{
+		const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+
+		for (; i < pixCountVec128; i+=4)
+		{
+			v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
+#ifdef ENABLE_SSSE3
+			v128u32 tempDst = (SWAP_RB) ? _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14,  11,8,9,10,  7,4,5,6,  3,0,1,2)) : dst_v128;
+#else
+			v128u32 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128;
+#endif
+
+			v128u16 r = _mm_and_si128(                tempDst,      _mm_set1_epi32(0x000000FF) );
+			v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst,  8), _mm_set1_epi32(0x000000FF) );
+			v128u16 b = _mm_and_si128( _mm_srli_epi32(tempDst, 16), _mm_set1_epi32(0x000000FF) );
+			v128u32 a = _mm_and_si128(                tempDst,      _mm_set1_epi32(0xFF000000) );
+
+			r =                 _mm_mulhi_epu16(r, intensity_v128);
+			g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128),  8 );
+			b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
+
+			tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
+
+			if (IS_UNALIGNED)
+			{
+				_mm_storeu_si128((v128u32 *)(dst+i), tempDst);
+			}
+			else
+			{
+				_mm_store_si128((v128u32 *)(dst+i), tempDst);
+			}
+		}
+	}
+
+	return i;
+}
+
 size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const
 {
 	return ColorspaceConvertBuffer555To8888Opaque_SSE2<false, false>(src, dst, pixCount);
@@ -660,6 +881,46 @@ size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u
 	return ColorspaceCopyBuffer32_SSE2<true, true>(src, dst, pixCount);
 }
 
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_SSE2<false, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_SSE2<true, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_SSE2<false, true>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer16_SSE2<true, true>(dst, pixCount, intensity); // NOTE(review): <SWAP_RB, IS_UNALIGNED> arguments reconstructed from the method-name suffixes - confirm
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_SSE2<false, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_SSE2<true, false>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_SSE2<false, true>(dst, pixCount, intensity);
+}
+
+size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const
+{
+	return ColorspaceApplyIntensityToBuffer32_SSE2<true, true>(dst, pixCount, intensity);
+}
+
 template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
 template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi);
 
@@ -693,4 +954,10 @@ template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src);
 template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
 template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
 
+template v128u16 ColorspaceApplyIntensity16_SSE2<true>(const v128u16 &src, float intensity); // NOTE(review): <true>/<false> reconstructed - the two lines were otherwise identical duplicate instantiations
+template v128u16 ColorspaceApplyIntensity16_SSE2<false>(const v128u16 &src, float intensity);
+
+template v128u32 ColorspaceApplyIntensity32_SSE2<true>(const v128u32 &src, float intensity);
+template v128u32 ColorspaceApplyIntensity32_SSE2<false>(const v128u32 &src, float intensity);
+
 #endif // ENABLE_SSE2
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h
index 81cf3a5a4..50b8597d1 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h
@@ -37,6 +37,9 @@ template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128
 template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src);
 template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src);
 
+template<bool SWAP_RB> v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity); // NOTE(review): single bool parameter inferred from the paired explicit instantiations; name assumed - confirm
+template<bool SWAP_RB> v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity);
+
 class ColorspaceHandler_SSE2 : public ColorspaceHandler
 {
 public:
@@ -82,6 +85,16 @@ public:
 	
 	size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
 	size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
+
+	size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; // 16-bit pixels, scaled in place; returns a pixel count
+	size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
+
+	size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; // 32-bit pixels, scaled in place; returns a pixel count
+	size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
+	size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
 };
 
 #endif // ENABLE_SSE2