Colorspace Handler: Add support for AVX-512, new 16-bit to 32-bit alpha-agnostic conversion functions, and minor optimizations to some functions.

- New 16-bit to 32-bit alpha-agnostic conversion functions: ColorspaceConvert555XTo888X_*(), ColorspaceConvert555XTo666X_*() (a scalar sketch of these follows below).
- Minor optimizations to the following functions: ColorspaceConvert555To8888_*(), ColorspaceConvert555To6665_*(), ColorspaceApplyIntensity32_*().
rogerman 2019-05-20 14:57:34 -07:00
parent 2d2320f4d1
commit de198c00a0
13 changed files with 1845 additions and 356 deletions
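
For orientation (an editorial sketch, not code from this commit), the two new alpha-agnostic conversions follow the per-pixel bit-replication formulas documented in the vectorized code below: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) and dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01). A minimal scalar equivalent, assuming RGB555 input with red in the low bits and a 32-bit output with red in the low byte and the top byte treated as "don't care":

#include <cstdint>

// Editorial scalar sketch of the new alpha-agnostic conversions. The commit itself
// implements these with SSE2/SSSE3, AVX2, AVX-512 and AltiVec intrinsics; only the
// per-pixel formulas are shared with this sketch.
static inline uint32_t ConvertPixel555XTo888X(const uint16_t src)
{
	const uint32_t r5 = (src >>  0) & 0x1F;
	const uint32_t g5 = (src >>  5) & 0x1F;
	const uint32_t b5 = (src >> 10) & 0x1F;

	// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
	const uint32_t r8 = (r5 << 3) | (r5 >> 2);
	const uint32_t g8 = (g5 << 3) | (g5 >> 2);
	const uint32_t b8 = (b5 << 3) | (b5 >> 2);

	return (b8 << 16) | (g8 << 8) | r8; // alpha agnostic: the top byte is left unspecified
}

static inline uint32_t ConvertPixel555XTo666X(const uint16_t src)
{
	const uint32_t r5 = (src >>  0) & 0x1F;
	const uint32_t g5 = (src >>  5) & 0x1F;
	const uint32_t b5 = (src >> 10) & 0x1F;

	// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
	const uint32_t r6 = (r5 << 1) | (r5 >> 4);
	const uint32_t g6 = (g5 << 1) | (g5 >> 4);
	const uint32_t b6 = (b5 << 1) | (b5 >> 4);

	return (b6 << 16) | (g6 << 8) | r6; // alpha agnostic: the top byte is left unspecified
}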

View File

@ -2685,6 +2685,8 @@
ABC570D0134431CE00E7B0B1 /* AudioUnit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AudioUnit.framework; path = System/Library/Frameworks/AudioUnit.framework; sourceTree = SDKROOT; };
ABC570D4134431DA00E7B0B1 /* OpenGL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenGL.framework; path = System/Library/Frameworks/OpenGL.framework; sourceTree = SDKROOT; };
ABC719E1138CB25E002827A9 /* DefaultKeyMappings.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = DefaultKeyMappings.plist; sourceTree = "<group>"; };
ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX512.cpp; sourceTree = "<group>"; };
ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX512.h; sourceTree = "<group>"; };
ABCFA9F2178BDE920030C8BA /* encrypt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = encrypt.h; sourceTree = "<group>"; };
ABCFA9F3178BDE920030C8BA /* encrypt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = encrypt.cpp; sourceTree = "<group>"; };
ABD103FE1346652500AF11D1 /* cocoa_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_core.h; sourceTree = "<group>"; };
@ -3842,10 +3844,12 @@
children = (
ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */,
ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */,
ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */,
ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */,
ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */,
ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */,
ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */,
ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */,
ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */,
ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */,
);
@ -7073,6 +7077,7 @@
GDB_STUB,
);
MACOSX_DEPLOYMENT_TARGET = 10.7;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
PRODUCT_NAME = "DeSmuME (Debug, dev+)";
};
name = Debug;
@ -7087,6 +7092,7 @@
GDB_STUB,
);
MACOSX_DEPLOYMENT_TARGET = 10.7;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
PRODUCT_NAME = "DeSmuME (dev+)";
};
name = Release;
@ -7257,6 +7263,9 @@
INFOPLIST_FILE = "Info (Debug).plist";
LD_NO_PIE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.5;
MTL_FAST_MATH = YES;
MTL_LANGUAGE_REVISION = Metal11;
MTL_OPTIMIZATION_LEVEL = 3;
ONLY_ACTIVE_ARCH = YES;
OTHER_CFLAGS = "-I./../../";
PRODUCT_NAME = "DeSmuME (Debug)";
@ -7309,6 +7318,9 @@
INFOPLIST_FILE = Info.plist;
LD_NO_PIE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.5;
MTL_FAST_MATH = YES;
MTL_LANGUAGE_REVISION = Metal11;
MTL_OPTIMIZATION_LEVEL = 3;
OTHER_CFLAGS = "-I./../../";
PRODUCT_NAME = DeSmuME;
SDKROOT = macosx;

View File

@ -37,10 +37,6 @@
#include <smmintrin.h>
#endif
#ifdef ENABLE_AVX
#include <immintrin.h>
#endif
enum MatrixMode
{
MATRIXMODE_PROJECTION = 0,
@ -159,7 +155,47 @@ FORCEINLINE s32 sfx32_shiftdown(const s64 a)
// SIMD Functions
//-------------
#if defined(ENABLE_AVX)
#if defined(ENABLE_AVX512_0)
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
v512u16 *dst_vec512 = (v512u16 *)dst;
const size_t length_vec512 = elementCount / (sizeof(v512u16) / sizeof(u16));
const v512u16 val_vec512 = _mm512_set1_epi16(val);
for (size_t i = 0; i < length_vec512; i++)
_mm512_stream_si512(dst_vec512 + i, val_vec512);
}
template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
v512u16 *dst_vec512 = (v512u16 *)dst;
const v512u16 val_vec512 = _mm512_set1_epi16(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v512u16) / sizeof(u16)), _mm512_store_si512(dst_vec512 + (X), val_vec512));
}
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
v512u32 *dst_vec512 = (v512u32 *)dst;
const size_t length_vec512 = elementCount / (sizeof(v512u32) / sizeof(u32));
const v512u32 val_vec512 = _mm512_set1_epi32(val);
for (size_t i = 0; i < length_vec512; i++)
_mm512_stream_si512(dst_vec512 + i, val_vec512);
}
template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
v512u32 *dst_vec512 = (v512u32 *)dst;
const v512u32 val_vec512 = _mm512_set1_epi32(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v512u32) / sizeof(u32)), _mm512_store_si512(dst_vec512 + (X), val_vec512));
}
#elif defined(ENABLE_AVX)
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
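
A note on the AVX-512 memset helpers above: both the streaming variants (_mm512_stream_si512) and the _fast variants (_mm512_store_si512) issue full 64-byte stores, so the destination pointer must be 64-byte aligned, and any element count that is not a whole multiple of the vector width leaves a tail unwritten. A hedged, self-contained sketch of the same pattern (it declares a stand-in helper rather than pulling in the header above, which also relies on the MACRODO_N macro and the v512 typedefs):

#include <immintrin.h>
#include <cstdint>
#include <cstddef>

// Stand-in for the AVX-512 memset_u16 added above: fill a 64-byte-aligned buffer of
// u16 elements with a constant value using 512-bit non-temporal stores.
static void memset_u16_avx512(void *dst, const uint16_t val, const size_t elementCount)
{
	__m512i *dst_vec512 = (__m512i *)dst;
	const size_t length_vec512 = elementCount / (sizeof(__m512i) / sizeof(uint16_t)); // 32 u16 per vector
	const __m512i val_vec512 = _mm512_set1_epi16(val);

	for (size_t i = 0; i < length_vec512; i++)
		_mm512_stream_si512(dst_vec512 + i, val_vec512);

	_mm_sfence(); // make the non-temporal stores globally visible before ordinary loads
}

int main()
{
	alignas(64) static uint16_t framebuffer[256 * 192]; // one NDS-sized plane of u16 pixels, 64-byte aligned
	memset_u16_avx512(framebuffer, 0x8000, 256 * 192);  // 49152 elements is a whole multiple of 32, so nothing is left unwritten
	return 0;
}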

View File

@ -49,6 +49,10 @@
#endif
#ifdef __GNUC__
#ifdef __ALTIVEC__
#define ENABLE_ALTIVEC
#endif
#ifdef __SSE__
#define ENABLE_SSE
#endif
@ -81,8 +85,27 @@
#define ENABLE_AVX2
#endif
#ifdef __ALTIVEC__
#define ENABLE_ALTIVEC
// AVX-512 is special because it has multiple tiers of support.
//
// For our case, Tier-0 will be the baseline AVX-512 tier that includes the basic Foundation and
// Conflict Detection extensions, which should be supported on all AVX-512 CPUs. Higher tiers
// include more extensions, where each higher tier also assumes support for all lower tiers.
//
// For typical use cases in DeSmuME, the most practical AVX-512 tier will be Tier-1.
#if defined(__AVX512F__) && defined(__AVX512CD__)
#define ENABLE_AVX512_0
#endif
#if defined(ENABLE_AVX512_0) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define ENABLE_AVX512_1
#endif
#if defined(ENABLE_AVX512_1) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__)
#define ENABLE_AVX512_2
#endif
#if defined(ENABLE_AVX512_2) && defined(__AVX512VNNI__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__)
#define ENABLE_AVX512_3
#endif
#endif
@ -245,7 +268,8 @@ typedef __m128i v128u32;
typedef __m128i v128s32;
#endif
#ifdef ENABLE_AVX
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)
#include <immintrin.h>
typedef __m256i v256u8;
typedef __m256i v256s8;
@ -253,8 +277,18 @@ typedef __m256i v256u16;
typedef __m256i v256s16;
typedef __m256i v256u32;
typedef __m256i v256s32;
#if defined(ENABLE_AVX512_0)
typedef __m512i v512u8;
typedef __m512i v512s8;
typedef __m512i v512u16;
typedef __m512i v512s16;
typedef __m512i v512u32;
typedef __m512i v512s32;
#endif
#endif // defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)
/*---------- GPU3D fixed-points types -----------*/
typedef s32 f32;
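
The AVX-512 tier macros above are driven entirely by the compiler's feature-test macros, so the tier is fixed at build time (with GCC/Clang, for example, -mavx512f -mavx512cd should yield Tier-0, and adding -mavx512bw -mavx512dq should yield Tier-1, the tier the new colorspace handler requires). A small illustrative sketch that replays the same ladder outside of the header and reports the result:

#include <cstdio>

// Re-derive the tier macros from the compiler's feature macros, mirroring the ladder
// added above (an illustrative copy for demonstration, not the header itself).
#if defined(__AVX512F__) && defined(__AVX512CD__)
	#define ENABLE_AVX512_0
#endif
#if defined(ENABLE_AVX512_0) && defined(__AVX512BW__) && defined(__AVX512DQ__)
	#define ENABLE_AVX512_1
#endif
#if defined(ENABLE_AVX512_1) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__)
	#define ENABLE_AVX512_2
#endif
#if defined(ENABLE_AVX512_2) && defined(__AVX512VNNI__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__)
	#define ENABLE_AVX512_3
#endif

int main()
{
#if defined(ENABLE_AVX512_3)
	std::printf("AVX-512 Tier-3 enabled\n");
#elif defined(ENABLE_AVX512_2)
	std::printf("AVX-512 Tier-2 enabled\n");
#elif defined(ENABLE_AVX512_1)
	std::printf("AVX-512 Tier-1 enabled (enough for the AVX-512 colorspace handler)\n");
#elif defined(ENABLE_AVX512_0)
	std::printf("AVX-512 Tier-0 enabled\n");
#else
	std::printf("No AVX-512 tier enabled by the current build flags\n");
#endif
	return 0;
}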

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2016-2017 DeSmuME team
Copyright (C) 2016-2019 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -18,19 +18,31 @@
#include "colorspacehandler.h"
#include <string.h>
#if defined(ENABLE_AVX2)
#include "colorspacehandler_AVX2.cpp"
#include "colorspacehandler_SSE2.cpp"
#elif defined(ENABLE_SSE2)
#include "colorspacehandler_SSE2.cpp"
#elif defined(ENABLE_ALTIVEC)
#include "colorspacehandler_AltiVec.cpp"
#if defined(ENABLE_AVX512_1)
#include "colorspacehandler_AVX512.cpp"
#endif
#if defined(ENABLE_AVX2)
#include "colorspacehandler_AVX2.cpp"
#endif
#if defined(ENABLE_SSE2)
#include "colorspacehandler_SSE2.cpp"
#endif
#if defined(ENABLE_ALTIVEC)
#include "colorspacehandler_AltiVec.cpp"
#endif
#if defined(ENABLE_AVX512_1)
#define USEVECTORSIZE_512
#define VECTORSIZE 64
#elif defined(ENABLE_AVX2)
#define USEVECTORSIZE_256
#define VECTORSIZE 32
#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#endif
// By default, the hand-coded vectorized code will be used instead of a compiler's built-in
@ -42,7 +54,9 @@
#endif
#ifdef USEMANUALVECTORIZATION
#if defined(ENABLE_AVX2)
#if defined(ENABLE_AVX512_1)
static const ColorspaceHandler_AVX512 csh;
#elif defined(ENABLE_AVX2)
static const ColorspaceHandler_AVX2 csh;
#elif defined(ENABLE_SSE2)
static const ColorspaceHandler_SSE2 csh;
@ -153,14 +167,7 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (SWAP_RB)
{
@ -201,14 +208,7 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (SWAP_RB)
{
@ -249,14 +249,7 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 4);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
if (SWAP_RB)
{
@ -297,14 +290,7 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 4);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
if (SWAP_RB)
{
@ -345,14 +331,7 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (SWAP_RB)
{
@ -393,14 +372,7 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (SWAP_RB)
{
@ -441,14 +413,7 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
if (SWAP_RB)
{
@ -489,14 +454,7 @@ void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u16)) * 2));
if (SWAP_RB)
{
@ -537,14 +495,7 @@ void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u32)) * 4));
if (SWAP_RB)
{
@ -591,14 +542,7 @@ void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount)
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (IS_UNALIGNED)
{
@ -631,14 +575,7 @@ void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 8);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 4);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
if (IS_UNALIGNED)
{
@ -665,14 +602,7 @@ void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensi
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
if (SWAP_RB)
{
@ -750,14 +680,7 @@ void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensi
size_t i = 0;
#ifdef USEMANUALVECTORIZATION
#if defined(USEVECTORSIZE_512)
const size_t pixCountVector = pixCount - (pixCount % 32);
#elif defined(USEVECTORSIZE_256)
const size_t pixCountVector = pixCount - (pixCount % 16);
#elif defined(USEVECTORSIZE_128)
const size_t pixCountVector = pixCount - (pixCount % 8);
#endif
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
if (SWAP_RB)
{
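
The repeated per-width #if ladders above are collapsed into a single expression: pixCount - (pixCount % (VECTORSIZE / sizeof(element))) rounds the pixel count down to a whole number of vectors, and the remainder is handled by the scalar loop that follows each vectorized call. The two buffer-to-888 converters scale the divisor because they consume several vectors per iteration. A small hedged sketch of the arithmetic:

#include <cstdio>
#include <cstdint>
#include <cstddef>

// Mirror of the refactored pixCountVector computation: round the pixel count down
// to a whole number of vectors, leaving the remainder for the scalar fallback.
static size_t VectorizedPixelCount(const size_t pixCount, const size_t vectorSizeBytes, const size_t bytesPerPixel)
{
	const size_t pixelsPerVector = vectorSizeBytes / bytesPerPixel;
	return pixCount - (pixCount % pixelsPerVector);
}

int main()
{
	const size_t pixCount = 1000;

	// VECTORSIZE is 16 for SSE2/AltiVec, 32 for AVX2, and 64 for AVX-512 Tier-1.
	const size_t v16 = VectorizedPixelCount(pixCount, 64, sizeof(uint16_t)); // 32 u16 pixels per vector
	const size_t v32 = VectorizedPixelCount(pixCount, 64, sizeof(uint32_t)); // 16 u32 pixels per vector

	std::printf("u16: %zu vectorized, %zu scalar tail\n", v16, pixCount - v16); // 992 vectorized, 8 tail
	std::printf("u32: %zu vectorized, %zu scalar tail\n", v32, pixCount - v32); // 992 vectorized, 8 tail
	return 0;
}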

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2016-2018 DeSmuME team
Copyright (C) 2016-2019 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,19 +30,74 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 11), _mm256_srli_epi16(srcColor, 7)), _mm256_set1_epi16(0xF8F8) );
v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits);
if (SWAP_RB)
{
v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) );
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707)));
v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) );
ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5));
ga = _mm256_or_si256(ga, srcAlphaBits);
rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi8(rb, ga);
dstHi = _mm256_unpackhi_epi8(rb, ga);
}
else
{
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) );
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) );
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0707)) );
v256u16 ba = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) );
ba = _mm256_or_si256(ba, _mm256_srli_epi16(ba, 5));
ba = _mm256_or_si256(ba, srcAlphaBits);
rg = _mm256_permute4x64_epi64(rg, 0xD8);
ba = _mm256_permute4x64_epi64(ba, 0xD8);
dstLo = _mm256_unpacklo_epi16(rg, ba);
dstHi = _mm256_unpackhi_epi16(rg, ba);
}
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi16(rb, ga);
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm256_unpackhi_epi16(rb, ga);
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) );
dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
if (SWAP_RB)
{
v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) );
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707)));
v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) );
g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5));
rb = _mm256_permute4x64_epi64(rb, 0xD8);
g = _mm256_permute4x64_epi64( g, 0xD8);
dstLo = _mm256_unpacklo_epi8(rb, g);
dstHi = _mm256_unpackhi_epi8(rb, g);
}
else
{
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) );
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) );
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi32(rg, 5), _mm256_set1_epi16(0x0707)) );
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) );
b = _mm256_or_si256(b, _mm256_srli_epi32(b, 5));
rg = _mm256_permute4x64_epi64(rg, 0xD8);
b = _mm256_permute4x64_epi64( b, 0xD8);
dstLo = _mm256_unpacklo_epi16(rg, b);
dstHi = _mm256_unpackhi_epi16(rg, b);
}
}
template <bool SWAP_RB>
@ -51,19 +106,75 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E)), srcAlphaBits);
if (SWAP_RB)
{
v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101)));
v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) );
ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5));
ga = _mm256_or_si256(ga, srcAlphaBits);
rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi8(rb, ga);
dstHi = _mm256_unpackhi_epi8(rb, ga);
}
else
{
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) );
const v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) );
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) );
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) );
v256u16 ba = _mm256_or_si256(b, _mm256_srli_epi16(b, 5));
ba = _mm256_or_si256(ba, srcAlphaBits);
rg = _mm256_permute4x64_epi64(rg, 0xD8);
ba = _mm256_permute4x64_epi64(ba, 0xD8);
dstLo = _mm256_unpacklo_epi16(rg, ba);
dstHi = _mm256_unpackhi_epi16(rg, ba);
}
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
rb = _mm256_permute4x64_epi64(rb, 0xD8);
ga = _mm256_permute4x64_epi64(ga, 0xD8);
dstLo = _mm256_unpacklo_epi16(rb, ga);
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm256_unpackhi_epi16(rb, ga);
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) );
dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
if (SWAP_RB)
{
v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101)));
v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) );
g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5));
rb = _mm256_permute4x64_epi64(rb, 0xD8);
g = _mm256_permute4x64_epi64( g, 0xD8);
dstLo = _mm256_unpacklo_epi8(rb, g);
dstHi = _mm256_unpackhi_epi8(rb, g);
}
else
{
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) );
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) );
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) );
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) );
b = _mm256_or_si256(b, _mm256_srli_epi16(b, 5));
rg = _mm256_permute4x64_epi64(rg, 0xD8);
b = _mm256_permute4x64_epi64( b, 0xD8);
dstLo = _mm256_unpacklo_epi16(rg, b);
dstHi = _mm256_unpackhi_epi16(rg, b);
}
}
template <bool SWAP_RB>
@ -86,18 +197,13 @@ FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src)
// Conversion algorithm:
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
// Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3)
v256u32 rgb;
const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) );
v256u32 rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) );
if (SWAP_RB)
{
rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) );
}
else
{
rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
}
return _mm256_or_si256(rgb, a);
}
@ -288,18 +394,16 @@ FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float in
return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000));
}
v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x000000FF) );
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) );
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 16), _mm256_set1_epi32(0x000000FF) );
v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) );
v256u16 rb = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x00FF00FF) );
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) );
v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) );
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
r = _mm256_mulhi_epu16(r, intensity_v256);
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
rb = _mm256_mulhi_epu16(rb, intensity_v256);
g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 );
return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
return _mm256_or_si256( _mm256_or_si256(rb, g), a);
}
template <bool SWAP_RB, bool IS_UNALIGNED>
@ -307,7 +411,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict
{
size_t i = 0;
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
v256u32 dstConvertedLo, dstConvertedHi;
@ -315,13 +419,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict
if (IS_UNALIGNED)
{
_mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo);
_mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi);
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
}
else
{
_mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo);
_mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi);
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
}
}
@ -333,7 +437,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3
{
size_t i = 0;
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
v256u32 dstConvertedLo, dstConvertedHi;
@ -341,13 +445,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3
if (IS_UNALIGNED)
{
_mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo);
_mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi);
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
}
else
{
_mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo);
_mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi);
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
}
}
@ -359,7 +463,7 @@ size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t p
{
size_t i = 0;
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -379,7 +483,7 @@ size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t p
{
size_t i = 0;
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -399,15 +503,15 @@ size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__
{
size_t i = 0;
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) );
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
}
else
{
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) );
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
}
}
@ -419,15 +523,15 @@ size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__
{
size_t i = 0;
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) );
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
}
else
{
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) );
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
}
}
@ -439,7 +543,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si
{
size_t i = 0;
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -461,17 +565,17 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re
v256u16 src_v256u16[2];
v256u32 src_v256u32[4];
for (; i < pixCountVec256; i+=32)
for (; i < pixCountVec256; i+=((sizeof(v256u16)/sizeof(u16)) * 2))
{
if (IS_UNALIGNED)
{
src_v256u16[0] = _mm256_loadu_si256((v256u16 *)(src + i + 0));
src_v256u16[1] = _mm256_loadu_si256((v256u16 *)(src + i + 16));
src_v256u16[0] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) );
src_v256u16[1] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) );
}
else
{
src_v256u16[0] = _mm256_load_si256((v256u16 *)(src + i + 0));
src_v256u16[1] = _mm256_load_si256((v256u16 *)(src + i + 16));
src_v256u16[0] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) );
src_v256u16[1] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) );
}
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) );
@ -516,15 +620,15 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
}
else
{
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
}
}
@ -537,21 +641,21 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re
size_t i = 0;
v256u32 src_v256u32[4];
for (; i < pixCountVec256; i+=32)
for (; i < pixCountVec256; i+=((sizeof(v256u32)/sizeof(u32)) * 4))
{
if (IS_UNALIGNED)
{
src_v256u32[0] = _mm256_loadu_si256((v256u32 *)(src + i + 0));
src_v256u32[1] = _mm256_loadu_si256((v256u32 *)(src + i + 8));
src_v256u32[2] = _mm256_loadu_si256((v256u32 *)(src + i + 16));
src_v256u32[3] = _mm256_loadu_si256((v256u32 *)(src + i + 24));
src_v256u32[0] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) );
src_v256u32[1] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) );
src_v256u32[2] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) );
src_v256u32[3] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) );
}
else
{
src_v256u32[0] = _mm256_load_si256((v256u32 *)(src + i + 0));
src_v256u32[1] = _mm256_load_si256((v256u32 *)(src + i + 8));
src_v256u32[2] = _mm256_load_si256((v256u32 *)(src + i + 16));
src_v256u32[3] = _mm256_load_si256((v256u32 *)(src + i + 24));
src_v256u32[0] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) );
src_v256u32[1] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) );
src_v256u32[2] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) );
src_v256u32[3] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) );
}
if (SWAP_RB)
@ -577,15 +681,15 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re
if (IS_UNALIGNED)
{
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
}
else
{
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
}
}
@ -603,7 +707,7 @@ size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec2
size_t i = 0;
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
@ -631,7 +735,7 @@ size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec2
size_t i = 0;
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i));
@ -657,7 +761,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
{
if (SWAP_RB)
{
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
const v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) );
@ -679,7 +783,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
}
else if (intensity < 0.001f)
{
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
@ -695,7 +799,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
{
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
for (; i < pixCountVec256; i+=16)
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
{
v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
v256u16 tempDst = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256;
@ -734,7 +838,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
{
if (SWAP_RB)
{
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
const v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
@ -756,7 +860,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
}
else if (intensity < 0.001f)
{
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -772,21 +876,19 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
{
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
for (; i < pixCountVec256; i+=8)
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
{
v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
v256u32 tempDst = (SWAP_RB) ? _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v256;
v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x000000FF) );
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) );
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempDst, 16), _mm256_set1_epi32(0x000000FF) );
v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) );
v256u16 rb = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x00FF00FF) );
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) );
v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) );
r = _mm256_mulhi_epu16(r, intensity_v256);
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
rb = _mm256_mulhi_epu16(rb, intensity_v256);
g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 );
tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
tempDst = _mm256_or_si256( _mm256_or_si256(rb, g), a);
if (IS_UNALIGNED)
{
@ -1045,9 +1147,15 @@ size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555XTo888X_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555XTo888X_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To6665_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To6665_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555XTo666X_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555XTo666X_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
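
One of the "minor optimizations" from the commit message is visible in ColorspaceApplyIntensity32_AVX2 and ColorspaceApplyIntensityToBuffer32_AVX2 above: red and blue are masked together with 0x00FF00FF and scaled by a single _mm256_mulhi_epu16 (they already occupy separate 16-bit lanes), green keeps its own multiply, and alpha passes through untouched. Per channel the math is an unsigned 0.16 fixed-point multiply, out = (channel * (u16)(intensity * 0xFFFF)) >> 16, with intensities near 1.0 and 0.0 special-cased before this path. A hedged scalar model:

#include <cstdio>
#include <cstdint>

// Scalar model of the AVX2 intensity path: each 8-bit channel is scaled by a 0.16
// fixed-point factor via a 16-bit multiply-high, i.e. (channel * k) >> 16. The
// vectorized code handles full and zero intensity separately, so this multiply only
// runs for in-between values.
static uint32_t ApplyIntensity8888(const uint32_t rgba, const float intensity)
{
	const uint32_t k = (uint16_t)(intensity * 65535.0f);

	const uint32_t r = ( (rgba        & 0xFF) * k) >> 16;
	const uint32_t g = (((rgba >>  8) & 0xFF) * k) >> 16;
	const uint32_t b = (((rgba >> 16) & 0xFF) * k) >> 16;
	const uint32_t a =   rgba & 0xFF000000; // alpha is left untouched, as in the vector code

	return a | (b << 16) | (g << 8) | r;
}

int main()
{
	// 50% intensity on an opaque pixel: each channel is roughly halved.
	std::printf("%08X\n", ApplyIntensity8888(0xFF80C040u, 0.5f)); // prints FF3F5F1F
	return 0;
}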

View File

@ -25,7 +25,9 @@
#else
template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
template<bool SWAP_RB> v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src);

File diff suppressed because it is too large.

View File

@ -0,0 +1,114 @@
/*
Copyright (C) 2016-2019 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef COLORSPACEHANDLER_AVX512_H
#define COLORSPACEHANDLER_AVX512_H
#include "colorspacehandler.h"
#ifndef ENABLE_AVX512_1
#warning This header requires AVX-512 Tier-1 support.
#else
template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
template<bool SWAP_RB> v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src);
template<bool SWAP_RB> v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src);
template<bool SWAP_RB> v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi);
template<bool SWAP_RB> v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi);
template<bool SWAP_RB> v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src);
template<bool SWAP_RB> v512u16 ColorspaceCopy16_AVX512(const v512u16 &src);
template<bool SWAP_RB> v512u32 ColorspaceCopy32_AVX512(const v512u32 &src);
template<bool SWAP_RB> v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity);
template<bool SWAP_RB> v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity);
class ColorspaceHandler_AVX512 : public ColorspaceHandler
{
public:
ColorspaceHandler_AVX512() {};
size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
};
#endif // ENABLE_AVX512_1
#endif // COLORSPACEHANDLER_AVX512_H
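
When the build enables Tier-1, the dispatching code earlier in this commit instantiates this class as its static csh handler, but it can also be used directly. A hedged usage sketch, assuming the DeSmuME headers are on the include path; note that the methods without an _IsUnaligned suffix use aligned 512-bit accesses, and that the pixel count passed to them should be a whole multiple of the vector width (the generic wrappers take care of the rounding and the scalar tail):

#include "colorspacehandler_AVX512.h" // header added by this commit; u16/u32 come from the emulator's types header

#ifdef ENABLE_AVX512_1
// Convert one 256x192 RGB555 framebuffer to opaque 32-bit color.
// 256 * 192 = 49152 pixels is a whole number of 32-pixel vectors, so no scalar tail is left over.
static void ConvertFramebufferExample(const u16 *src555, u32 *dst8888)
{
	static const ColorspaceHandler_AVX512 csh;
	csh.ConvertBuffer555To8888Opaque(src555, dst8888, 256 * 192);
}
#endif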

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2016-2017 DeSmuME team
Copyright (C) 2016-2019 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -38,6 +38,21 @@ FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, con
dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
dstLo = vec_unpackl((vector pixel)srcColor);
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
dstHi = vec_unpackh((vector pixel)srcColor);
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}), (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
{
@ -53,6 +68,21 @@ FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, con
dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
dstLo = vec_unpackl((vector pixel)srcColor);
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
dstHi = vec_unpackh((vector pixel)srcColor);
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}), (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{
@ -513,9 +543,15 @@ size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst,
template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo888X_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo888X_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo666X_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo666X_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);

View File

@ -25,7 +25,9 @@
#else
template<bool SWAP_RB> void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src);
@ -1,5 +1,5 @@
/*
Copyright (C) 2016-2017 DeSmuME team
Copyright (C) 2016-2019 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -38,28 +38,62 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
#ifdef ENABLE_SSSE3
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 11), _mm_srli_epi16(srcColor, 7)), _mm_set1_epi16(0xF8F8) );
v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8)), srcAlphaBits);
if (SWAP_RB)
{
v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) );
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707)));
v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) );
ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5));
ga = _mm_or_si128(ga, srcAlphaBits);
dstLo = _mm_unpacklo_epi8(rb, ga);
dstHi = _mm_unpackhi_epi8(rb, ga);
}
else
{
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) );
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) );
v128u16 ba = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
ba = _mm_or_si128(ba, _mm_srli_epi16(ba, 5));
ba = _mm_or_si128(ba, srcAlphaBits);
dstLo = _mm_unpacklo_epi16(rg, ba);
dstHi = _mm_unpackhi_epi16(rg, ba);
}
}
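// Illustrative scalar reference for the 5-bit to 8-bit expansion with an
// externally supplied alpha byte (a minimal sketch with standard types,
// assuming the non-swapped RGBA byte order; not tied to the SIMD lane layout).
#include <cstdint>

static inline uint32_t Convert555To8888_Scalar(uint16_t src, uint8_t alpha)
{
	const uint32_t r5 = (src >>  0) & 0x1F;
	const uint32_t g5 = (src >>  5) & 0x1F;
	const uint32_t b5 = (src >> 10) & 0x1F;
	
	// dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
	const uint32_t r8 = (r5 << 3) | ((r5 >> 2) & 0x07);
	const uint32_t g8 = (g5 << 3) | ((g5 >> 2) & 0x07);
	const uint32_t b8 = (b5 << 3) | ((b5 >> 2) & 0x07);
	
	return ((uint32_t)alpha << 24) | (b8 << 16) | (g8 << 8) | r8;
}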
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
dstLo = _mm_unpacklo_epi16(rb, ga);
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm_unpackhi_epi16(rb, ga);
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#else
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800) );
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
#endif
if (SWAP_RB)
{
v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) );
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707)));
v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) );
g = _mm_or_si128(g, _mm_srli_epi16(g, 5));
dstLo = _mm_unpacklo_epi8(rb, g);
dstHi = _mm_unpackhi_epi8(rb, g);
}
else
{
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) );
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) );
v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
b = _mm_or_si128(b, _mm_srli_epi16(b, 5));
dstLo = _mm_unpacklo_epi16(rg, b);
dstHi = _mm_unpackhi_epi16(rg, b);
}
}
template <bool SWAP_RB>
@ -68,28 +102,63 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
#ifdef ENABLE_SSSE3
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E)), srcAlphaBits);
if (SWAP_RB)
{
v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101)));
v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) );
ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5));
ga = _mm_or_si128(ga, srcAlphaBits);
dstLo = _mm_unpacklo_epi8(rb, ga);
dstHi = _mm_unpackhi_epi8(rb, ga);
}
else
{
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
const v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) );
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) );
v128u16 ba = _mm_or_si128(b, _mm_srli_epi16(b, 5));
ba = _mm_or_si128(ba, srcAlphaBits);
dstLo = _mm_unpacklo_epi16(rg, ba);
dstHi = _mm_unpackhi_epi16(rg, ba);
}
}
template <bool SWAP_RB>
FORCEINLINE void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
{
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
dstLo = _mm_unpacklo_epi16(rb, ga);
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
dstHi = _mm_unpackhi_epi16(rb, ga);
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
#else
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00) );
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
#endif
if (SWAP_RB)
{
v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101)));
v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) );
g = _mm_or_si128(g, _mm_srli_epi16(g, 5));
dstLo = _mm_unpacklo_epi8(rb, g);
dstHi = _mm_unpackhi_epi8(rb, g);
}
else
{
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) );
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) );
v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
b = _mm_or_si128(b, _mm_srli_epi16(b, 5));
dstLo = _mm_unpacklo_epi16(rg, b);
dstHi = _mm_unpackhi_epi16(rg, b);
}
}
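// The SWAP_RB template parameter only changes the final component order. A
// scalar equivalent of the red/blue exchange, shown as a minimal sketch that
// is independent of the shuffle and mask strategies above:
#include <cstdint>

static inline uint32_t SwapRB8888_Scalar(uint32_t c)
{
	// Exchange byte 0 and byte 2; leave the green and alpha bytes in place.
	return (c & 0xFF00FF00u) | ((c & 0x00FF0000u) >> 16) | ((c & 0x000000FFu) << 16);
}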
template <bool SWAP_RB>
@ -332,18 +401,16 @@ FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float in
return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000));
}
v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi32(0x000000FF) );
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) );
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempSrc, 16), _mm_set1_epi32(0x000000FF) );
v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) );
v128u16 rb = _mm_and_si128( tempSrc, _mm_set1_epi32(0x00FF00FF) );
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) );
v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) );
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
r = _mm_mulhi_epu16(r, intensity_v128);
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
rb = _mm_mulhi_epu16(rb, intensity_v128);
g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 );
return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
return _mm_or_si128( _mm_or_si128(rb, g), a);
}
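// Illustrative scalar counterpart of the fixed-point intensity multiply above:
// intensity in [0,1] becomes a 0.16 fixed-point factor, and each channel keeps
// the high 16 bits of the product, which is what _mm_mulhi_epu16 computes per
// 16-bit lane. Because an 8-bit channel times a 16-bit factor never produces a
// high half larger than 8 bits, red and blue can share a single multiply via
// the 0x00FF00FF mask, which is the optimization applied above. (A minimal
// sketch; alpha is passed through unchanged.)
#include <cstdint>

static inline uint32_t ApplyIntensity32_Scalar(uint32_t src, float intensity)
{
	const uint32_t f = (uint32_t)(intensity * 65535.0f); // 0.16 fixed point
	
	const uint32_t r = (((src >>  0) & 0xFF) * f) >> 16;
	const uint32_t g = (((src >>  8) & 0xFF) * f) >> 16;
	const uint32_t b = (((src >> 16) & 0xFF) * f) >> 16;
	
	return (src & 0xFF000000u) | (b << 16) | (g << 8) | r;
}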
template <bool SWAP_RB, bool IS_UNALIGNED>
@ -351,7 +418,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict
{
size_t i = 0;
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
v128u32 dstConvertedLo, dstConvertedHi;
@ -359,13 +426,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict
if (IS_UNALIGNED)
{
_mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo);
_mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi);
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
}
else
{
_mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo);
_mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi);
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
}
}
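// Note on the loop-stride rewrite used throughout the buffer routines above:
// advancing by (sizeof(vector) / sizeof(element)) ties the per-iteration pixel
// count to the vector width instead of a hard-coded 8 or 4. A minimal sketch,
// assuming the project's v128u16/v128u32 typedefs map to a 128-bit __m128i:
#include <emmintrin.h>
#include <cstdint>

typedef __m128i v128u16_sketch; // hypothetical stand-ins for the real typedefs
typedef __m128i v128u32_sketch;

static_assert(sizeof(v128u16_sketch) / sizeof(uint16_t) == 8, "8 u16 elements per 128-bit vector");
static_assert(sizeof(v128u32_sketch) / sizeof(uint32_t) == 4, "4 u32 elements per 128-bit vector");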
@ -377,7 +444,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3
{
size_t i = 0;
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
v128u32 dstConvertedLo, dstConvertedHi;
@ -385,13 +452,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3
if (IS_UNALIGNED)
{
_mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo);
_mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi);
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
}
else
{
_mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo);
_mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi);
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
}
}
@ -403,7 +470,7 @@ size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t p
{
size_t i = 0;
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -423,7 +490,7 @@ size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t p
{
size_t i = 0;
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -443,7 +510,7 @@ size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__
{
size_t i = 0;
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
@ -463,7 +530,7 @@ size_t ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__
{
size_t i = 0;
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
@ -483,7 +550,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si
{
size_t i = 0;
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -507,17 +574,17 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r
v128u16 src_v128u16[2];
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2))
{
if (IS_UNALIGNED)
{
src_v128u16[0] = _mm_loadu_si128((v128u16 *)(src + i + 0));
src_v128u16[1] = _mm_loadu_si128((v128u16 *)(src + i + 8));
src_v128u16[0] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) );
src_v128u16[1] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) );
}
else
{
src_v128u16[0] = _mm_load_si128((v128u16 *)(src + i + 0));
src_v128u16[1] = _mm_load_si128((v128u16 *)(src + i + 8));
src_v128u16[0] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) );
src_v128u16[1] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) );
}
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) );
@ -553,28 +620,28 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r
#ifdef ENABLE_SSE4_1
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
}
#else
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
#endif
}
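// Illustrative scalar counterpart of the 555X-to-888 buffer conversion above:
// each 16-bit source pixel expands to 8-bit components and is written out as a
// packed 3-byte triplet (a minimal sketch, assuming the non-swapped order; the
// SIMD path batches the same work with shuffles and blends).
#include <cstdint>
#include <cstddef>

static void ConvertBuffer555XTo888_Scalar(const uint16_t *src, uint8_t *dst, size_t pixCount)
{
	for (size_t i = 0; i < pixCount; i++)
	{
		const uint32_t r5 = (src[i] >>  0) & 0x1F;
		const uint32_t g5 = (src[i] >>  5) & 0x1F;
		const uint32_t b5 = (src[i] >> 10) & 0x1F;
		
		dst[(i * 3) + 0] = (uint8_t)((r5 << 3) | (r5 >> 2));
		dst[(i * 3) + 1] = (uint8_t)((g5 << 3) | (g5 >> 2));
		dst[(i * 3) + 2] = (uint8_t)((b5 << 3) | (b5 >> 2));
	}
}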
@ -588,21 +655,21 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
size_t i = 0;
v128u32 src_v128u32[4];
for (; i < pixCountVec128; i+=16)
for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4))
{
if (IS_UNALIGNED)
{
src_v128u32[0] = _mm_loadu_si128((v128u32 *)(src + i + 0));
src_v128u32[1] = _mm_loadu_si128((v128u32 *)(src + i + 4));
src_v128u32[2] = _mm_loadu_si128((v128u32 *)(src + i + 8));
src_v128u32[3] = _mm_loadu_si128((v128u32 *)(src + i + 12));
src_v128u32[0] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) );
src_v128u32[1] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) );
src_v128u32[2] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) );
src_v128u32[3] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) );
}
else
{
src_v128u32[0] = _mm_load_si128((v128u32 *)(src + i + 0));
src_v128u32[1] = _mm_load_si128((v128u32 *)(src + i + 4));
src_v128u32[2] = _mm_load_si128((v128u32 *)(src + i + 8));
src_v128u32[3] = _mm_load_si128((v128u32 *)(src + i + 12));
src_v128u32[0] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) );
src_v128u32[1] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) );
src_v128u32[2] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) );
src_v128u32[3] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) );
}
if (SWAP_RB)
@ -623,28 +690,28 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
#ifdef ENABLE_SSE4_1
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
}
#else
if (IS_UNALIGNED)
{
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
else
{
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
}
#endif
}
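// Illustrative scalar counterpart of the 888X-to-888 buffer packing above: the
// unused top byte of each 32-bit pixel is dropped and three bytes per pixel
// are written back to back (a minimal sketch; the SIMD version above handles
// 16 pixels, i.e. 48 output bytes, per iteration).
#include <cstdint>
#include <cstddef>

static void ConvertBuffer888XTo888_Scalar(const uint32_t *src, uint8_t *dst, size_t pixCount)
{
	for (size_t i = 0; i < pixCount; i++)
	{
		dst[(i * 3) + 0] = (uint8_t)(src[i] >>  0);
		dst[(i * 3) + 1] = (uint8_t)(src[i] >>  8);
		dst[(i * 3) + 2] = (uint8_t)(src[i] >> 16);
	}
}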
@ -665,7 +732,7 @@ size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec1
size_t i = 0;
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
@ -693,7 +760,7 @@ size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec1
size_t i = 0;
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i));
@ -719,7 +786,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
{
if (SWAP_RB)
{
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
const v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) );
@ -741,7 +808,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
}
else if (intensity < 0.001f)
{
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
if (IS_UNALIGNED)
{
@ -757,7 +824,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
{
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
for (; i < pixCountVec128; i+=8)
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
{
v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
v128u16 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128;
@ -796,7 +863,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
{
if (SWAP_RB)
{
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
const v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
#ifdef ENABLE_SSSE3
@ -821,7 +888,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
}
else if (intensity < 0.001f)
{
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
if (IS_UNALIGNED)
{
@ -837,7 +904,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
{
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
for (; i < pixCountVec128; i+=4)
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
{
v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
#ifdef ENABLE_SSSE3
@ -846,16 +913,14 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
v128u32 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128;
#endif
v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi32(0x000000FF) );
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) );
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempDst, 16), _mm_set1_epi32(0x000000FF) );
v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) );
v128u16 rb = _mm_and_si128( tempDst, _mm_set1_epi32(0x00FF00FF) );
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) );
v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) );
r = _mm_mulhi_epu16(r, intensity_v128);
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
rb = _mm_mulhi_epu16(rb, intensity_v128);
g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 );
tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
tempDst = _mm_or_si128( _mm_or_si128(rb, g), a);
if (IS_UNALIGNED)
{
@ -1118,9 +1183,15 @@ size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo888X_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo888X_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To6665_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo666X_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555XTo666X_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template void ColorspaceConvert555To8888Opaque_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
@ -25,7 +25,9 @@
#else
template<bool SWAP_RB> void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src);
@ -60,40 +60,44 @@
#define DESMUME_PLATFORM_STRING ""
#endif
#define DESMUME_CPUEXT_PRIMARY_STRING ""
#define DESMUME_CPUEXT_SECONDARY_STRING ""
#if defined(ENABLE_SSE4_2)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.2"
#elif defined(ENABLE_SSE4_1)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.1"
#elif defined(ENABLE_SSSE3)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSSE3"
#elif defined(ENABLE_SSE3)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE3"
#elif defined(ENABLE_SSE2)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE2"
#elif defined(ENABLE_SSE)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE"
#elif defined(ENABLE_ALTIVEC)
#undef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec"
#endif
#if defined(ENABLE_AVX2)
#undef DESMUME_CPUEXT_SECONDARY_STRING
#if defined(ENABLE_AVX512_3)
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-3"
#elif defined(ENABLE_AVX512_2)
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-2"
#elif defined(ENABLE_AVX512_1)
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-1"
#elif defined(ENABLE_AVX512_0)
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-0"
#elif defined(ENABLE_AVX2)
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX2"
#elif defined(ENABLE_AVX)
#undef DESMUME_CPUEXT_SECONDARY_STRING
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX"
#endif
#ifndef DESMUME_CPUEXT_PRIMARY_STRING
#define DESMUME_CPUEXT_PRIMARY_STRING ""
#endif
#ifndef DESMUME_CPUEXT_SECONDARY_STRING
#define DESMUME_CPUEXT_SECONDARY_STRING ""
#endif
#define DESMUME_CPUEXT_STRING DESMUME_CPUEXT_PRIMARY_STRING DESMUME_CPUEXT_SECONDARY_STRING
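/* Illustrative expansion of the strings defined above (a sketch, not an
   exhaustive list): building with ENABLE_SSE4_1 and ENABLE_AVX2 yields
   DESMUME_CPUEXT_STRING == " SSE4.1+AVX2", while ENABLE_SSE4_2 together with
   ENABLE_AVX512_1 yields " SSE4.2+AVX-512,Tier-1". If neither group is
   enabled, the #ifndef guards fall back to empty strings. */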
#ifdef DEVELOPER