Colorspace Handler: Add support for AVX-512, add new 16-bit to 32-bit alpha-agnostic conversion functions, and make minor optimizations to some functions.
- New 16-bit to 32-bit alpha-agnostic conversion functions: ColorspaceConvert555XTo888X_*(), ColorspaceConvert555XTo666X_*().
- Minor optimizations to the following functions: ColorspaceConvert555To8888_*(), ColorspaceConvert555To6665_*(), ColorspaceApplyIntensity32_*().
This commit is contained in:
parent
2d2320f4d1
commit
de198c00a0
|
@ -2685,6 +2685,8 @@
|
|||
ABC570D0134431CE00E7B0B1 /* AudioUnit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AudioUnit.framework; path = System/Library/Frameworks/AudioUnit.framework; sourceTree = SDKROOT; };
|
||||
ABC570D4134431DA00E7B0B1 /* OpenGL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenGL.framework; path = System/Library/Frameworks/OpenGL.framework; sourceTree = SDKROOT; };
|
||||
ABC719E1138CB25E002827A9 /* DefaultKeyMappings.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = DefaultKeyMappings.plist; sourceTree = "<group>"; };
|
||||
ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX512.cpp; sourceTree = "<group>"; };
|
||||
ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX512.h; sourceTree = "<group>"; };
|
||||
ABCFA9F2178BDE920030C8BA /* encrypt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = encrypt.h; sourceTree = "<group>"; };
|
||||
ABCFA9F3178BDE920030C8BA /* encrypt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = encrypt.cpp; sourceTree = "<group>"; };
|
||||
ABD103FE1346652500AF11D1 /* cocoa_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_core.h; sourceTree = "<group>"; };
|
||||
|
@ -3842,10 +3844,12 @@
|
|||
children = (
|
||||
ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */,
|
||||
ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */,
|
||||
ABCC19332287879000DFA471 /* colorspacehandler_AVX512.cpp */,
|
||||
ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */,
|
||||
ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */,
|
||||
ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */,
|
||||
ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */,
|
||||
ABCC19342287879000DFA471 /* colorspacehandler_AVX512.h */,
|
||||
ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */,
|
||||
ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */,
|
||||
);
|
||||
|
@ -7073,6 +7077,7 @@
|
|||
GDB_STUB,
|
||||
);
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.7;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
PRODUCT_NAME = "DeSmuME (Debug, dev+)";
|
||||
};
|
||||
name = Debug;
|
||||
|
@ -7087,6 +7092,7 @@
|
|||
GDB_STUB,
|
||||
);
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.7;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
PRODUCT_NAME = "DeSmuME (dev+)";
|
||||
};
|
||||
name = Release;
|
||||
|
@ -7257,6 +7263,9 @@
|
|||
INFOPLIST_FILE = "Info (Debug).plist";
|
||||
LD_NO_PIE = YES;
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.5;
|
||||
MTL_FAST_MATH = YES;
|
||||
MTL_LANGUAGE_REVISION = Metal11;
|
||||
MTL_OPTIMIZATION_LEVEL = 3;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
OTHER_CFLAGS = "-I./../../";
|
||||
PRODUCT_NAME = "DeSmuME (Debug)";
|
||||
|
@ -7309,6 +7318,9 @@
|
|||
INFOPLIST_FILE = Info.plist;
|
||||
LD_NO_PIE = YES;
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.5;
|
||||
MTL_FAST_MATH = YES;
|
||||
MTL_LANGUAGE_REVISION = Metal11;
|
||||
MTL_OPTIMIZATION_LEVEL = 3;
|
||||
OTHER_CFLAGS = "-I./../../";
|
||||
PRODUCT_NAME = DeSmuME;
|
||||
SDKROOT = macosx;
|
||||
|
|
|
@ -37,10 +37,6 @@
|
|||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
enum MatrixMode
|
||||
{
|
||||
MATRIXMODE_PROJECTION = 0,
|
||||
|
@ -159,7 +155,47 @@ FORCEINLINE s32 sfx32_shiftdown(const s64 a)
|
|||
|
||||
// SIMD Functions
|
||||
//-------------
|
||||
#if defined(ENABLE_AVX)
|
||||
#if defined(ENABLE_AVX512_0)
|
||||
|
||||
// Fills a buffer with a repeated 16-bit value using 512-bit vector stores (AVX-512 path).
//   dst          - destination buffer; it is cast directly to a 512-bit vector pointer.
//                  NOTE(review): _mm512_stream_si512 is documented as requiring a 64-byte
//                  aligned address -- confirm that every caller guarantees this.
//   val          - 16-bit value that is broadcast to every lane of the vector.
//   elementCount - number of u16 elements requested. Only whole vectors are written; a
//                  remainder smaller than one vector is left untouched.
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||
{
|
||||
v512u16 *dst_vec512 = (v512u16 *)dst;
|
||||
// Count of whole 512-bit vectors covered by elementCount (integer division drops any tail).
const size_t length_vec512 = elementCount / (sizeof(v512u16) / sizeof(u16));
|
||||

||||
const v512u16 val_vec512 = _mm512_set1_epi16(val);
|
||||
for (size_t i = 0; i < length_vec512; i++)
|
||||
// Streaming (non-temporal) store of one full vector per iteration.
_mm512_stream_si512(dst_vec512 + i, val_vec512);
|
||||
}
|
||||
|
||||
// Compile-time-sized variant of the 16-bit fill (AVX-512 path).
//   ELEMENTCOUNT - number of u16 elements, fixed at compile time; it is divided by the number
//                  of u16 lanes in a 512-bit vector, so a partial trailing vector is dropped.
//   dst          - destination buffer; it is cast directly to a 512-bit vector pointer.
//                  NOTE(review): _mm512_store_si512 is the aligned store form -- confirm that
//                  callers pass a 64-byte aligned buffer.
//   val          - 16-bit value that is broadcast to every lane of the vector.
template <size_t ELEMENTCOUNT>
|
||||
static void memset_u16_fast(void *dst, const u16 val)
|
||||
{
|
||||
v512u16 *dst_vec512 = (v512u16 *)dst;
|
||||

||||
const v512u16 val_vec512 = _mm512_set1_epi16(val);
|
||||
// MACRODO_N is defined elsewhere; presumably it repeats the statement once per vector with
// X as the vector index -- verify against its definition.
MACRODO_N(ELEMENTCOUNT / (sizeof(v512u16) / sizeof(u16)), _mm512_store_si512(dst_vec512 + (X), val_vec512));
|
||||
}
|
||||
|
||||
// Fills a buffer with a repeated 32-bit value using 512-bit vector stores (AVX-512 path).
//   dst          - destination buffer; it is cast directly to a 512-bit vector pointer.
//                  NOTE(review): _mm512_stream_si512 is documented as requiring a 64-byte
//                  aligned address -- confirm that every caller guarantees this.
//   val          - 32-bit value that is broadcast to every lane of the vector.
//   elementCount - number of u32 elements requested. Only whole vectors are written; a
//                  remainder smaller than one vector is left untouched.
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
|
||||
{
|
||||
v512u32 *dst_vec512 = (v512u32 *)dst;
|
||||
// Count of whole 512-bit vectors covered by elementCount (integer division drops any tail).
const size_t length_vec512 = elementCount / (sizeof(v512u32) / sizeof(u32));
|
||||

||||
const v512u32 val_vec512 = _mm512_set1_epi32(val);
|
||||
for (size_t i = 0; i < length_vec512; i++)
|
||||
// Streaming (non-temporal) store of one full vector per iteration.
_mm512_stream_si512(dst_vec512 + i, val_vec512);
|
||||
}
|
||||
|
||||
// Compile-time-sized variant of the 32-bit fill (AVX-512 path).
//   ELEMENTCOUNT - number of u32 elements, fixed at compile time; it is divided by the number
//                  of u32 lanes in a 512-bit vector, so a partial trailing vector is dropped.
//   dst          - destination buffer; it is cast directly to a 512-bit vector pointer.
//                  NOTE(review): _mm512_store_si512 is the aligned store form -- confirm that
//                  callers pass a 64-byte aligned buffer.
//   val          - 32-bit value that is broadcast to every lane of the vector.
template <size_t ELEMENTCOUNT>
|
||||
static void memset_u32_fast(void *dst, const u32 val)
|
||||
{
|
||||
v512u32 *dst_vec512 = (v512u32 *)dst;
|
||||

||||
const v512u32 val_vec512 = _mm512_set1_epi32(val);
|
||||
// MACRODO_N is defined elsewhere; presumably it repeats the statement once per vector with
// X as the vector index -- verify against its definition.
MACRODO_N(ELEMENTCOUNT / (sizeof(v512u32) / sizeof(u32)), _mm512_store_si512(dst_vec512 + (X), val_vec512));
|
||||
}
|
||||
|
||||
#elif defined(ENABLE_AVX)
|
||||
|
||||
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||
{
|
||||
|
|
|
@ -49,6 +49,10 @@
|
|||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __ALTIVEC__
|
||||
#define ENABLE_ALTIVEC
|
||||
#endif
|
||||
|
||||
#ifdef __SSE__
|
||||
#define ENABLE_SSE
|
||||
#endif
|
||||
|
@ -81,8 +85,27 @@
|
|||
#define ENABLE_AVX2
|
||||
#endif
|
||||
|
||||
#ifdef __ALTIVEC__
|
||||
#define ENABLE_ALTIVEC
|
||||
// AVX-512 is special because it has multiple tiers of support.
|
||||
//
|
||||
// For our case, Tier-0 will be the baseline AVX-512 tier that includes the basic Foundation and
|
||||
// Conflict Detection extensions, which should be supported on all AVX-512 CPUs. Higher tiers
|
||||
// include more extensions, where each higher tier also assumes support for all lower tiers.
|
||||
//
|
||||
// For typical use cases in DeSmuME, the most practical AVX-512 tier will be Tier-1.
|
||||
#if defined(__AVX512F__) && defined(__AVX512CD__)
|
||||
#define ENABLE_AVX512_0
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX512_0) && defined(__AVX512BW__) && defined(__AVX512DQ__)
|
||||
#define ENABLE_AVX512_1
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX512_1) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__)
|
||||
#define ENABLE_AVX512_2
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX512_2) && defined(__AVX512VNNI__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__)
|
||||
#define ENABLE_AVX512_3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -245,7 +268,8 @@ typedef __m128i v128u32;
|
|||
typedef __m128i v128s32;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)
|
||||
|
||||
#include <immintrin.h>
|
||||
typedef __m256i v256u8;
|
||||
typedef __m256i v256s8;
|
||||
|
@ -253,8 +277,18 @@ typedef __m256i v256u16;
|
|||
typedef __m256i v256s16;
|
||||
typedef __m256i v256u32;
|
||||
typedef __m256i v256s32;
|
||||
|
||||
#if defined(ENABLE_AVX512_0)
|
||||
typedef __m512i v512u8;
|
||||
typedef __m512i v512s8;
|
||||
typedef __m512i v512u16;
|
||||
typedef __m512i v512s16;
|
||||
typedef __m512i v512u32;
|
||||
typedef __m512i v512s32;
|
||||
#endif
|
||||
|
||||
#endif // defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)
|
||||
|
||||
/*---------- GPU3D fixed-points types -----------*/
|
||||
|
||||
typedef s32 f32;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
Copyright (C) 2016-2019 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -18,19 +18,31 @@
|
|||
#include "colorspacehandler.h"
|
||||
#include <string.h>
|
||||
|
||||
#if defined(ENABLE_AVX2)
|
||||
#include "colorspacehandler_AVX2.cpp"
|
||||
#include "colorspacehandler_SSE2.cpp"
|
||||
#elif defined(ENABLE_SSE2)
|
||||
#include "colorspacehandler_SSE2.cpp"
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
#include "colorspacehandler_AltiVec.cpp"
|
||||
#if defined(ENABLE_AVX512_1)
|
||||
#include "colorspacehandler_AVX512.cpp"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX2)
|
||||
#include "colorspacehandler_AVX2.cpp"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SSE2)
|
||||
#include "colorspacehandler_SSE2.cpp"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_ALTIVEC)
|
||||
#include "colorspacehandler_AltiVec.cpp"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX512_1)
|
||||
#define USEVECTORSIZE_512
|
||||
#define VECTORSIZE 64
|
||||
#elif defined(ENABLE_AVX2)
|
||||
#define USEVECTORSIZE_256
|
||||
#define VECTORSIZE 32
|
||||
#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
|
||||
#define USEVECTORSIZE_128
|
||||
#define VECTORSIZE 16
|
||||
#endif
|
||||
|
||||
// By default, the hand-coded vectorized code will be used instead of a compiler's built-in
|
||||
|
@ -42,7 +54,9 @@
|
|||
#endif
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
#if defined(ENABLE_AVX2)
|
||||
#if defined(ENABLE_AVX512_1)
|
||||
static const ColorspaceHandler_AVX512 csh;
|
||||
#elif defined(ENABLE_AVX2)
|
||||
static const ColorspaceHandler_AVX2 csh;
|
||||
#elif defined(ENABLE_SSE2)
|
||||
static const ColorspaceHandler_SSE2 csh;
|
||||
|
@ -153,14 +167,7 @@ void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__re
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -201,14 +208,7 @@ void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__re
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -249,14 +249,7 @@ void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -297,14 +290,7 @@ void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -345,14 +331,7 @@ void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restric
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -393,14 +372,7 @@ void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restric
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -441,14 +413,7 @@ void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pi
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -489,14 +454,7 @@ void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u16)) * 2));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -537,14 +495,7 @@ void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u32)) * 4));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -591,14 +542,7 @@ void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount)
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -631,14 +575,7 @@ void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount)
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 4);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -665,14 +602,7 @@ void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensi
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
@ -750,14 +680,7 @@ void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensi
|
|||
size_t i = 0;
|
||||
|
||||
#ifdef USEMANUALVECTORIZATION
|
||||
|
||||
#if defined(USEVECTORSIZE_512)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 32);
|
||||
#elif defined(USEVECTORSIZE_256)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 16);
|
||||
#elif defined(USEVECTORSIZE_128)
|
||||
const size_t pixCountVector = pixCount - (pixCount % 8);
|
||||
#endif
|
||||
const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32)));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016-2018 DeSmuME team
|
||||
Copyright (C) 2016-2019 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -30,19 +30,74 @@ FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const
|
|||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 11), _mm256_srli_epi16(srcColor, 7)), _mm256_set1_epi16(0xF8F8) );
|
||||
v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8)), srcAlphaBits);
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) );
|
||||
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707)));
|
||||
|
||||
v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) );
|
||||
ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5));
|
||||
ga = _mm256_or_si256(ga, srcAlphaBits);
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
ga = _mm256_permute4x64_epi64(ga, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi8(rb, ga);
|
||||
dstHi = _mm256_unpackhi_epi8(rb, ga);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) );
|
||||
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) );
|
||||
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0707)) );
|
||||
|
||||
v256u16 ba = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) );
|
||||
ba = _mm256_or_si256(ba, _mm256_srli_epi16(ba, 5));
|
||||
ba = _mm256_or_si256(ba, srcAlphaBits);
|
||||
|
||||
rg = _mm256_permute4x64_epi64(rg, 0xD8);
|
||||
ba = _mm256_permute4x64_epi64(ba, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rg, ba);
|
||||
dstHi = _mm256_unpackhi_epi16(rg, ba);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
ga = _mm256_permute4x64_epi64(ga, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rb, ga);
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) );
|
||||
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
|
||||
dstHi = _mm256_unpackhi_epi16(rb, ga);
|
||||
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) );
|
||||
dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) );
|
||||
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707)));
|
||||
|
||||
v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) );
|
||||
g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5));
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
g = _mm256_permute4x64_epi64( g, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi8(rb, g);
|
||||
dstHi = _mm256_unpackhi_epi8(rb, g);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) );
|
||||
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) );
|
||||
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi32(rg, 5), _mm256_set1_epi16(0x0707)) );
|
||||
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) );
|
||||
b = _mm256_or_si256(b, _mm256_srli_epi32(b, 5));
|
||||
|
||||
rg = _mm256_permute4x64_epi64(rg, 0xD8);
|
||||
b = _mm256_permute4x64_epi64( b, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rg, b);
|
||||
dstHi = _mm256_unpackhi_epi16(rg, b);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
@ -51,19 +106,75 @@ FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const
|
|||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(srcColor, 9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
|
||||
v256u16 ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E)), srcAlphaBits);
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
|
||||
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101)));
|
||||
|
||||
v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) );
|
||||
ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5));
|
||||
ga = _mm256_or_si256(ga, srcAlphaBits);
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
ga = _mm256_permute4x64_epi64(ga, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi8(rb, ga);
|
||||
dstHi = _mm256_unpackhi_epi8(rb, ga);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) );
|
||||
const v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) );
|
||||
|
||||
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) );
|
||||
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) );
|
||||
|
||||
v256u16 ba = _mm256_or_si256(b, _mm256_srli_epi16(b, 5));
|
||||
ba = _mm256_or_si256(ba, srcAlphaBits);
|
||||
|
||||
rg = _mm256_permute4x64_epi64(rg, 0xD8);
|
||||
ba = _mm256_permute4x64_epi64(ba, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rg, ba);
|
||||
dstHi = _mm256_unpackhi_epi16(rg, ba);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
ga = _mm256_permute4x64_epi64(ga, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rb, ga);
|
||||
dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) );
|
||||
dstLo = _mm256_shuffle_epi8( dstLo, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
|
||||
dstHi = _mm256_unpackhi_epi16(rb, ga);
|
||||
dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) );
|
||||
dstHi = _mm256_shuffle_epi8( dstHi, (SWAP_RB) ? _mm256_set_epi8(31,29,30,28, 27,25,26,24, 23,21,22,20, 19,17,18,16, 15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm256_set_epi8(31,28,30,29, 27,24,26,25, 23,20,22,21, 19,16,18,17, 15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) );
|
||||
rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101)));
|
||||
|
||||
v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) );
|
||||
g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5));
|
||||
|
||||
rb = _mm256_permute4x64_epi64(rb, 0xD8);
|
||||
g = _mm256_permute4x64_epi64( g, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi8(rb, g);
|
||||
dstHi = _mm256_unpackhi_epi8(rb, g);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) );
|
||||
v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) );
|
||||
rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) );
|
||||
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) );
|
||||
b = _mm256_or_si256(b, _mm256_srli_epi16(b, 5));
|
||||
|
||||
rg = _mm256_permute4x64_epi64(rg, 0xD8);
|
||||
b = _mm256_permute4x64_epi64( b, 0xD8);
|
||||
|
||||
dstLo = _mm256_unpacklo_epi16(rg, b);
|
||||
dstHi = _mm256_unpackhi_epi16(rg, b);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
@ -86,18 +197,13 @@ FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src)
|
|||
// Conversion algorithm:
|
||||
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
|
||||
// Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3)
|
||||
v256u32 rgb;
|
||||
const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) );
|
||||
v256u32 rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
|
||||
const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) );
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
|
||||
rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) );
|
||||
}
|
||||
else
|
||||
{
|
||||
rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) );
|
||||
}
|
||||
|
||||
return _mm256_or_si256(rgb, a);
|
||||
}
|
||||
|
@ -288,18 +394,16 @@ FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float in
|
|||
return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 16), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) );
|
||||
v256u16 rb = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x00FF00FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) );
|
||||
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
|
||||
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
|
||||
rb = _mm256_mulhi_epu16(rb, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 );
|
||||
|
||||
return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
return _mm256_or_si256( _mm256_or_si256(rb, g), a);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
|
@ -307,7 +411,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
|
||||
v256u32 dstConvertedLo, dstConvertedHi;
|
||||
|
@ -315,13 +419,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -333,7 +437,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
|
||||
v256u32 dstConvertedLo, dstConvertedHi;
|
||||
|
@ -341,13 +445,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u3
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -359,7 +463,7 @@ size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t p
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -379,7 +483,7 @@ size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t p
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -399,15 +503,15 @@ size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) );
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) );
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -419,15 +523,15 @@ size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) );
|
||||
_mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) );
|
||||
_mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2<SWAP_RB>(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -439,7 +543,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 *dst, si
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -461,17 +565,17 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re
|
|||
v256u16 src_v256u16[2];
|
||||
v256u32 src_v256u32[4];
|
||||
|
||||
for (; i < pixCountVec256; i+=32)
|
||||
for (; i < pixCountVec256; i+=((sizeof(v256u16)/sizeof(u16)) * 2))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
src_v256u16[0] = _mm256_loadu_si256((v256u16 *)(src + i + 0));
|
||||
src_v256u16[1] = _mm256_loadu_si256((v256u16 *)(src + i + 16));
|
||||
src_v256u16[0] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) );
|
||||
src_v256u16[1] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
src_v256u16[0] = _mm256_load_si256((v256u16 *)(src + i + 0));
|
||||
src_v256u16[1] = _mm256_load_si256((v256u16 *)(src + i + 16));
|
||||
src_v256u16[0] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) );
|
||||
src_v256u16[1] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) );
|
||||
}
|
||||
|
||||
v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) );
|
||||
|
@ -516,15 +620,15 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -537,21 +641,21 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re
|
|||
size_t i = 0;
|
||||
v256u32 src_v256u32[4];
|
||||
|
||||
for (; i < pixCountVec256; i+=32)
|
||||
for (; i < pixCountVec256; i+=((sizeof(v256u32)/sizeof(u32)) * 4))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
src_v256u32[0] = _mm256_loadu_si256((v256u32 *)(src + i + 0));
|
||||
src_v256u32[1] = _mm256_loadu_si256((v256u32 *)(src + i + 8));
|
||||
src_v256u32[2] = _mm256_loadu_si256((v256u32 *)(src + i + 16));
|
||||
src_v256u32[3] = _mm256_loadu_si256((v256u32 *)(src + i + 24));
|
||||
src_v256u32[0] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) );
|
||||
src_v256u32[1] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) );
|
||||
src_v256u32[2] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) );
|
||||
src_v256u32[3] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
src_v256u32[0] = _mm256_load_si256((v256u32 *)(src + i + 0));
|
||||
src_v256u32[1] = _mm256_load_si256((v256u32 *)(src + i + 8));
|
||||
src_v256u32[2] = _mm256_load_si256((v256u32 *)(src + i + 16));
|
||||
src_v256u32[3] = _mm256_load_si256((v256u32 *)(src + i + 24));
|
||||
src_v256u32[0] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) );
|
||||
src_v256u32[1] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) );
|
||||
src_v256u32[2] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) );
|
||||
src_v256u32[3] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) );
|
||||
}
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -577,15 +681,15 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
|
||||
_mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -603,7 +707,7 @@ size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec2
|
|||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i));
|
||||
|
||||
|
@ -631,7 +735,7 @@ size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec2
|
|||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i));
|
||||
|
||||
|
@ -657,7 +761,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
|
|||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
const v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
|
||||
const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) );
|
||||
|
@ -679,7 +783,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
|
|||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -695,7 +799,7 @@ size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256,
|
|||
{
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec256; i+=16)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16)))
|
||||
{
|
||||
v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i));
|
||||
v256u16 tempDst = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x0E30)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256;
|
||||
|
@ -734,7 +838,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
|
|||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
const v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
|
||||
const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
|
||||
|
@ -756,7 +860,7 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
|
|||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -772,21 +876,19 @@ size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256,
|
|||
{
|
||||
const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec256; i+=8)
|
||||
for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32)))
|
||||
{
|
||||
v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i));
|
||||
v256u32 tempDst = (SWAP_RB) ? _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v256;
|
||||
|
||||
v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u16 b = _mm256_and_si256( _mm256_srli_epi32(tempDst, 16), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) );
|
||||
v256u16 rb = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x00FF00FF) );
|
||||
v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) );
|
||||
v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) );
|
||||
|
||||
r = _mm256_mulhi_epu16(r, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16(g, intensity_v256), 8 );
|
||||
b = _mm256_slli_epi32( _mm256_mulhi_epu16(b, intensity_v256), 16 );
|
||||
rb = _mm256_mulhi_epu16(rb, intensity_v256);
|
||||
g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 );
|
||||
|
||||
tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a);
|
||||
tempDst = _mm256_or_si256( _mm256_or_si256(rb, g), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -1045,9 +1147,15 @@ size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
|
|||
template void ColorspaceConvert555To8888_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555XTo888X_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo888X_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To6665_AVX2<true>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555To6665_AVX2<false>(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555XTo666X_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo666X_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To8888Opaque_AVX2<true>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888Opaque_AVX2<false>(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@
|
|||
#else
|
||||
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi);
|
||||
template<bool SWAP_RB> v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src);
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,114 @@
|
|||
/*
|
||||
Copyright (C) 2016-2019 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this software. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef COLORSPACEHANDLER_AVX512_H
|
||||
#define COLORSPACEHANDLER_AVX512_H
|
||||
|
||||
#include "colorspacehandler.h"
|
||||
|
||||
#ifndef ENABLE_AVX512_1
|
||||
#warning This header requires AVX-512 Tier-1 support.
|
||||
#else
|
||||
|
||||
// Declarations of the per-vector AVX-512 colorspace helpers. Each is a function
// template on the compile-time flag SWAP_RB and works on 512-bit vectors of
// 16-bit (v512u16) or 32-bit (v512u32) elements.
// NOTE(review): SWAP_RB presumably selects red/blue channel swapping, as in the
// other SIMD backends -- confirm against colorspacehandler_AVX512.cpp.

// 16-bit to 32-bit conversions. One source vector expands into two destination
// vectors (dstLo, dstHi). The variants taking srcAlphaBits receive a separate
// alpha vector; the 555XTo888X / 555XTo666X variants are the alpha agnostic
// conversions and take no alpha input.
template<bool SWAP_RB> void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi);
|
||||
// 32-bit to 32-bit conversions: one vector in, one vector returned.
template<bool SWAP_RB> v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src);
|
||||
template<bool SWAP_RB> v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src);
|
||||
// 32-bit to 16-bit conversions: two 32-bit source vectors (srcLo, srcHi) are
// combined into a single returned 16-bit vector.
template<bool SWAP_RB> v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi);
|
||||
template<bool SWAP_RB> v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi);
|
||||
template<bool SWAP_RB> v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src);
|
||||
|
||||
// Same-format copies of a 16-bit or 32-bit color vector.
template<bool SWAP_RB> v512u16 ColorspaceCopy16_AVX512(const v512u16 &src);
|
||||
template<bool SWAP_RB> v512u32 ColorspaceCopy32_AVX512(const v512u32 &src);
|
||||
|
||||
// Intensity application on a 16-bit or 32-bit color vector.
// NOTE(review): intensity is presumably a scale factor in [0.0, 1.0] -- confirm
// against the implementation.
template<bool SWAP_RB> v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity);
|
||||
template<bool SWAP_RB> v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity);
|
||||
|
||||
// ColorspaceHandler subclass that declares the AVX-512 buffer-level colorspace
// routines. Every method is const and returns a size_t.
// NOTE(review): the return value is presumably the number of pixels processed
// -- confirm against colorspacehandler_AVX512.cpp.
//
// Most operations are declared in four variants: the base name, _SwapRB,
// _IsUnaligned and _SwapRB_IsUnaligned.
// NOTE(review): by analogy with the SWAP_RB / IS_UNALIGNED template flags of the
// AVX2 backend, _SwapRB presumably exchanges the red and blue channels and
// _IsUnaligned presumably uses unaligned loads/stores -- confirm.
class ColorspaceHandler_AVX512 : public ColorspaceHandler
|
||||
{
|
||||
public:
|
||||
ColorspaceHandler_AVX512() {};
|
||||
|
||||
// 16-bit source buffer to 32-bit destination buffer; src and dst are declared
// __restrict.
size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
// 32-bit to 32-bit conversions. src and dst are not declared __restrict here.
size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
// 32-bit source buffer to 16-bit destination buffer.
size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
// Conversions whose destination is a byte (u8) buffer.
// NOTE(review): presumably 3 bytes are written per pixel (packed 888) -- confirm.
size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const;
|
||||
|
||||
// Buffer copies. Only the _SwapRB variants are declared in this class; no
// plain CopyBuffer16 / CopyBuffer32 appears here.
size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const;
|
||||
|
||||
size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const;
|
||||
|
||||
// Intensity routines take a single non-const dst pointer and no source buffer.
size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const;
|
||||
|
||||
size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const;
|
||||
};
|
||||
|
||||
#endif // ENABLE_AVX512_1
|
||||
|
||||
#endif // COLORSPACEHANDLER_AVX512_H
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
Copyright (C) 2016-2019 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -38,6 +38,21 @@ FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, con
|
|||
dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
dstLo = vec_unpackl((vector pixel)srcColor);
|
||||
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
|
||||
dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
|
||||
|
||||
dstHi = vec_unpackh((vector pixel)srcColor);
|
||||
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) );
|
||||
dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
|
@ -53,6 +68,21 @@ FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, con
|
|||
dstHi = vec_perm(dstHi, srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
dstLo = vec_unpackl((vector pixel)srcColor);
|
||||
dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
|
||||
dstLo = vec_perm(dstLo, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F}));
|
||||
|
||||
dstHi = vec_unpackh((vector pixel)srcColor);
|
||||
dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) );
|
||||
dstHi = vec_perm(dstHi, ((v128u8){0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0})), (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F}));
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
|
@ -513,9 +543,15 @@ size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst,
|
|||
// Explicit instantiations of the AltiVec conversion templates.
// Each template is instantiated twice, once per value of its bool template argument (true and false).
template void ColorspaceConvert555To8888_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
// 555X -> 888X variant: same signature as above but without the srcAlphaBits argument.
template void ColorspaceConvert555XTo888X_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo888X_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To6665_AltiVec<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To6665_AltiVec<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
// 555X -> 666X variant: same signature as above but without the srcAlphaBits argument.
template void ColorspaceConvert555XTo666X_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo666X_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To8888Opaque_AltiVec<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888Opaque_AltiVec<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@
|
|||
#else
|
||||
|
||||
// Forward declarations of the AltiVec conversion templates.
// SWAP_RB is a compile-time bool; presumably it selects whether the red and blue
// channels are exchanged in the output -- confirm against the definitions.
// The 16-bit -> 32-bit converters take one v128u16 input and write two v128u32 outputs (dstLo, dstHi).
template<bool SWAP_RB> void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
// Same as the declaration above but without the srcAlphaBits argument.
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
// Same as the declaration above but without the srcAlphaBits argument.
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
// 32-bit -> 32-bit conversion: takes one v128u32 and returns the converted vector by value.
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
Copyright (C) 2016-2019 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -38,28 +38,62 @@ FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const
|
|||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 11), _mm_srli_epi16(srcColor, 7)), _mm_set1_epi16(0xF8F8) );
|
||||
v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8)), srcAlphaBits);
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) );
|
||||
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707)));
|
||||
|
||||
v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) );
|
||||
ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5));
|
||||
ga = _mm_or_si128(ga, srcAlphaBits);
|
||||
|
||||
dstLo = _mm_unpacklo_epi8(rb, ga);
|
||||
dstHi = _mm_unpackhi_epi8(rb, ga);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
|
||||
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) );
|
||||
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) );
|
||||
|
||||
v128u16 ba = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
|
||||
ba = _mm_or_si128(ba, _mm_srli_epi16(ba, 5));
|
||||
ba = _mm_or_si128(ba, srcAlphaBits);
|
||||
|
||||
dstLo = _mm_unpacklo_epi16(rg, ba);
|
||||
dstHi = _mm_unpackhi_epi16(rg, ba);
|
||||
}
|
||||
}
|
||||
|
||||
// Alpha-agnostic 555 -> 888 conversion (SSE2).
// Expands the three 5-bit color fields of every 16-bit element of srcColor to 8 bits each
// and writes the results as 32-bit pixels: the low half of the input goes to dstLo, the
// high half to dstHi. In the if/else code path below no alpha is merged in, so the most
// significant byte of every output pixel is zero.
// SWAP_RB == true : least significant byte <- source bits 10-14, third byte <- source bits 0-4.
// SWAP_RB == false: least significant byte <- source bits 0-4,   third byte <- source bits 10-14.
// In both cases the second byte comes from source bits 5-9.
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
|
||||
|
||||
// NOTE(review): the statements from here through the #endif below use rb, ga and
// srcAlphaBits, none of which are declared in this function at this point, and the
// #else/#endif have no matching #if inside this function. They look like removed-side
// diff lines belonging to ColorspaceConvert555To8888_SSE2 -- confirm against the committed file.
dstLo = _mm_unpacklo_epi16(rb, ga);
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
|
||||
dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
|
||||
dstHi = _mm_unpackhi_epi16(rb, ga);
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
|
||||
dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
#else
|
||||
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
|
||||
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800) );
|
||||
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
|
||||
|
||||
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) );
|
||||
|
||||
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) );
|
||||
#endif
|
||||
if (SWAP_RB)
|
||||
{
|
||||
// Each 16-bit lane of rb: low byte holds source bits 10-14, high byte holds source
// bits 0-4, each placed in the upper five bits of its byte.
v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) );
|
||||
// Copy the top three bits of each 5-bit value into the low three bits of its byte.
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707)));
|
||||
|
||||
// Source bits 5-9 expanded to 8 bits in the low byte; the high byte of each lane stays zero.
v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) );
|
||||
g = _mm_or_si128(g, _mm_srli_epi16(g, 5));
|
||||
|
||||
// Byte interleave: each 32-bit pixel becomes [rb low byte, g, rb high byte, 0].
dstLo = _mm_unpacklo_epi8(rb, g);
|
||||
dstHi = _mm_unpackhi_epi8(rb, g);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Each 16-bit lane of rg: low byte from source bits 0-4, high byte from source bits 5-9.
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) );
|
||||
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) );
|
||||
// Copy the top three bits of each 5-bit value into the low three bits of its byte.
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) );
|
||||
|
||||
// Source bits 10-14 expanded to 8 bits in the low byte; the high byte of each lane stays zero.
v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) );
|
||||
b = _mm_or_si128(b, _mm_srli_epi16(b, 5));
|
||||
|
||||
// 16-bit interleave: each 32-bit pixel becomes [rg (2 bytes), b, 0].
dstLo = _mm_unpacklo_epi16(rg, b);
|
||||
dstHi = _mm_unpackhi_epi16(rg, b);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
@ -68,28 +102,63 @@ FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const
|
|||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(srcColor, 9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
|
||||
v128u16 ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E)), srcAlphaBits);
|
||||
if (SWAP_RB)
|
||||
{
|
||||
v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
|
||||
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101)));
|
||||
|
||||
v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) );
|
||||
ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5));
|
||||
ga = _mm_or_si128(ga, srcAlphaBits);
|
||||
|
||||
dstLo = _mm_unpacklo_epi8(rb, ga);
|
||||
dstHi = _mm_unpackhi_epi8(rb, ga);
|
||||
}
|
||||
else
|
||||
{
|
||||
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
|
||||
const v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
|
||||
|
||||
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) );
|
||||
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) );
|
||||
|
||||
v128u16 ba = _mm_or_si128(b, _mm_srli_epi16(b, 5));
|
||||
ba = _mm_or_si128(ba, srcAlphaBits);
|
||||
|
||||
dstLo = _mm_unpacklo_epi16(rg, ba);
|
||||
dstHi = _mm_unpackhi_epi16(rg, ba);
|
||||
}
|
||||
}
|
||||
|
||||
// Alpha-agnostic 555 -> 666 conversion (SSE2).
// Expands the three 5-bit color fields of every 16-bit element of srcColor to 6 bits each
// (stored in the low six bits of a byte) and writes the results as 32-bit pixels: the low
// half of the input goes to dstLo, the high half to dstHi. In the if/else code path below
// no alpha is merged in, so the most significant byte of every output pixel is zero.
// SWAP_RB == true : least significant byte <- source bits 10-14, third byte <- source bits 0-4.
// SWAP_RB == false: least significant byte <- source bits 0-4,   third byte <- source bits 10-14.
// In both cases the second byte comes from source bits 5-9.
template <bool SWAP_RB>
|
||||
FORCEINLINE void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
|
||||
// NOTE(review): the statements from here through the #endif below use rb, ga and
// srcAlphaBits, none of which are declared in this function at this point, and the
// #else/#endif have no matching #if inside this function. They look like removed-side
// diff lines belonging to ColorspaceConvert555To6665_SSE2 -- confirm against the committed file.
dstLo = _mm_unpacklo_epi16(rb, ga);
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
|
||||
dstLo = _mm_shuffle_epi8( dstLo, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
|
||||
dstHi = _mm_unpackhi_epi16(rb, ga);
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
|
||||
dstHi = _mm_shuffle_epi8( dstHi, (SWAP_RB) ? _mm_set_epi8(15,13,14,12, 11,9,10,8, 7,5,6,4, 3,1,2,0) : _mm_set_epi8(15,12,14,13, 11,8,10,9, 7,4,6,5, 3,0,2,1) );
|
||||
#else
|
||||
v128u16 r = (SWAP_RB) ? _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
|
||||
v128u16 g = _mm_and_si128( _mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00) );
|
||||
v128u16 b = (SWAP_RB) ? _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ) : _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
|
||||
|
||||
dstLo = _mm_or_si128( _mm_unpacklo_epi16(r, b), _mm_unpacklo_epi16(g, srcAlphaBits) );
|
||||
dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) );
|
||||
|
||||
dstHi = _mm_or_si128( _mm_unpackhi_epi16(r, b), _mm_unpackhi_epi16(g, srcAlphaBits) );
|
||||
dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) );
|
||||
#endif
|
||||
if (SWAP_RB)
|
||||
{
|
||||
// Each 16-bit lane of rb: low byte holds source bits 10-14, high byte holds source
// bits 0-4, each placed in bits 1-5 of its byte (the 0x3E3E mask).
v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) );
|
||||
// Copy the top bit of each 5-bit value into bit 0 of its byte.
rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101)));
|
||||
|
||||
// Source bits 5-9 expanded to 6 bits in the low byte; the high byte of each lane stays zero.
v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) );
|
||||
g = _mm_or_si128(g, _mm_srli_epi16(g, 5));
|
||||
|
||||
// Byte interleave: each 32-bit pixel becomes [rb low byte, g, rb high byte, 0].
dstLo = _mm_unpacklo_epi8(rb, g);
|
||||
dstHi = _mm_unpackhi_epi8(rb, g);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Each 16-bit lane of rg: low byte from source bits 0-4, high byte from source bits 5-9.
const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) );
|
||||
v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) );
|
||||
// Copy the top bit of each 5-bit value into bit 0 of its byte.
rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) );
|
||||
|
||||
// Source bits 10-14 expanded to 6 bits in the low byte; the high byte of each lane stays zero.
v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) );
|
||||
b = _mm_or_si128(b, _mm_srli_epi16(b, 5));
|
||||
|
||||
// 16-bit interleave: each 32-bit pixel becomes [rg (2 bytes), b, 0].
dstLo = _mm_unpacklo_epi16(rg, b);
|
||||
dstHi = _mm_unpackhi_epi16(rg, b);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
@ -332,18 +401,16 @@ FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float in
|
|||
return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempSrc, 16), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) );
|
||||
v128u16 rb = _mm_and_si128( tempSrc, _mm_set1_epi32(0x00FF00FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) );
|
||||
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
|
||||
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
|
||||
rb = _mm_mulhi_epu16(rb, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 );
|
||||
|
||||
return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
return _mm_or_si128( _mm_or_si128(rb, g), a);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB, bool IS_UNALIGNED>
|
||||
|
@ -351,7 +418,7 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
|
||||
v128u32 dstConvertedLo, dstConvertedHi;
|
||||
|
@ -359,13 +426,13 @@ static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi);
|
||||
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -377,7 +444,7 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
|
||||
v128u32 dstConvertedLo, dstConvertedHi;
|
||||
|
@ -385,13 +452,13 @@ size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u3
|
|||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo);
|
||||
_mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi);
|
||||
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo);
|
||||
_mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -403,7 +470,7 @@ size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t p
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -423,7 +490,7 @@ size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t p
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -443,7 +510,7 @@ size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -463,7 +530,7 @@ size_t ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -483,7 +550,7 @@ size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, si
|
|||
{
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -507,17 +574,17 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r
|
|||
v128u16 src_v128u16[2];
|
||||
v128u32 src_v128u32[4];
|
||||
|
||||
for (; i < pixCountVec128; i+=16)
|
||||
for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
src_v128u16[0] = _mm_loadu_si128((v128u16 *)(src + i + 0));
|
||||
src_v128u16[1] = _mm_loadu_si128((v128u16 *)(src + i + 8));
|
||||
src_v128u16[0] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) );
|
||||
src_v128u16[1] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
src_v128u16[0] = _mm_load_si128((v128u16 *)(src + i + 0));
|
||||
src_v128u16[1] = _mm_load_si128((v128u16 *)(src + i + 8));
|
||||
src_v128u16[0] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) );
|
||||
src_v128u16[1] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) );
|
||||
}
|
||||
|
||||
v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) );
|
||||
|
@ -553,28 +620,28 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r
|
|||
#ifdef ENABLE_SSE4_1
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
}
|
||||
#else
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -588,21 +655,21 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
|
|||
size_t i = 0;
|
||||
v128u32 src_v128u32[4];
|
||||
|
||||
for (; i < pixCountVec128; i+=16)
|
||||
for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
src_v128u32[0] = _mm_loadu_si128((v128u32 *)(src + i + 0));
|
||||
src_v128u32[1] = _mm_loadu_si128((v128u32 *)(src + i + 4));
|
||||
src_v128u32[2] = _mm_loadu_si128((v128u32 *)(src + i + 8));
|
||||
src_v128u32[3] = _mm_loadu_si128((v128u32 *)(src + i + 12));
|
||||
src_v128u32[0] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) );
|
||||
src_v128u32[1] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) );
|
||||
src_v128u32[2] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) );
|
||||
src_v128u32[3] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) );
|
||||
}
|
||||
else
|
||||
{
|
||||
src_v128u32[0] = _mm_load_si128((v128u32 *)(src + i + 0));
|
||||
src_v128u32[1] = _mm_load_si128((v128u32 *)(src + i + 4));
|
||||
src_v128u32[2] = _mm_load_si128((v128u32 *)(src + i + 8));
|
||||
src_v128u32[3] = _mm_load_si128((v128u32 *)(src + i + 12));
|
||||
src_v128u32[0] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) );
|
||||
src_v128u32[1] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) );
|
||||
src_v128u32[2] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) );
|
||||
src_v128u32[3] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) );
|
||||
}
|
||||
|
||||
if (SWAP_RB)
|
||||
|
@ -623,28 +690,28 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
|
|||
#ifdef ENABLE_SSE4_1
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
|
||||
}
|
||||
#else
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
|
||||
_mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -665,7 +732,7 @@ size_t ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec1
|
|||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i));
|
||||
|
||||
|
@ -693,7 +760,7 @@ size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec1
|
|||
|
||||
size_t i = 0;
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i));
|
||||
|
||||
|
@ -719,7 +786,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
|
|||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
const v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
|
||||
const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) );
|
||||
|
@ -741,7 +808,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
|
|||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -757,7 +824,7 @@ size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128,
|
|||
{
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec128; i+=8)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16)))
|
||||
{
|
||||
v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i));
|
||||
v128u16 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x0E30)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128;
|
||||
|
@ -796,7 +863,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
|
|||
{
|
||||
if (SWAP_RB)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
const v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
|
||||
#ifdef ENABLE_SSSE3
|
||||
|
@ -821,7 +888,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
|
|||
}
|
||||
else if (intensity < 0.001f)
|
||||
{
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -837,7 +904,7 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
|
|||
{
|
||||
const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
|
||||
|
||||
for (; i < pixCountVec128; i+=4)
|
||||
for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32)))
|
||||
{
|
||||
v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i));
|
||||
#ifdef ENABLE_SSSE3
|
||||
|
@ -846,16 +913,14 @@ size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128,
|
|||
v128u32 tempDst = (SWAP_RB) ? _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128;
|
||||
#endif
|
||||
|
||||
v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u16 b = _mm_and_si128( _mm_srli_epi32(tempDst, 16), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) );
|
||||
v128u16 rb = _mm_and_si128( tempDst, _mm_set1_epi32(0x00FF00FF) );
|
||||
v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) );
|
||||
v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) );
|
||||
|
||||
r = _mm_mulhi_epu16(r, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16(g, intensity_v128), 8 );
|
||||
b = _mm_slli_epi32( _mm_mulhi_epu16(b, intensity_v128), 16 );
|
||||
rb = _mm_mulhi_epu16(rb, intensity_v128);
|
||||
g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 );
|
||||
|
||||
tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a);
|
||||
tempDst = _mm_or_si128( _mm_or_si128(rb, g), a);
|
||||
|
||||
if (IS_UNALIGNED)
|
||||
{
|
||||
|
@ -1118,9 +1183,15 @@ size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *
|
|||
template void ColorspaceConvert555To8888_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555XTo888X_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo888X_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To6665_SSE2<true>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To6665_SSE2<false>(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555XTo666X_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555XTo666X_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
template void ColorspaceConvert555To8888Opaque_SSE2<true>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template void ColorspaceConvert555To8888Opaque_SSE2<false>(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@
|
|||
#else
|
||||
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi);
|
||||
template<bool SWAP_RB> v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src);
|
||||
|
|
|
@ -60,40 +60,44 @@
|
|||
#define DESMUME_PLATFORM_STRING ""
|
||||
#endif
|
||||
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING ""
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING ""
|
||||
|
||||
#if defined(ENABLE_SSE4_2)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.2"
|
||||
#elif defined(ENABLE_SSE4_1)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.1"
|
||||
#elif defined(ENABLE_SSSE3)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSSE3"
|
||||
#elif defined(ENABLE_SSE3)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE3"
|
||||
#elif defined(ENABLE_SSE2)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE2"
|
||||
#elif defined(ENABLE_SSE)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE"
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
#undef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX2)
|
||||
#undef DESMUME_CPUEXT_SECONDARY_STRING
|
||||
#if defined(ENABLE_AVX512_3)
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-3"
|
||||
#elif defined(ENABLE_AVX512_2)
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-2"
|
||||
#elif defined(ENABLE_AVX512_1)
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-1"
|
||||
#elif defined(ENABLE_AVX512_0)
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX-512,Tier-0"
|
||||
#elif defined(ENABLE_AVX2)
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX2"
|
||||
#elif defined(ENABLE_AVX)
|
||||
#undef DESMUME_CPUEXT_SECONDARY_STRING
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING "+AVX"
|
||||
#endif
|
||||
|
||||
#ifndef DESMUME_CPUEXT_PRIMARY_STRING
|
||||
#define DESMUME_CPUEXT_PRIMARY_STRING ""
|
||||
#endif
|
||||
|
||||
#ifndef DESMUME_CPUEXT_SECONDARY_STRING
|
||||
#define DESMUME_CPUEXT_SECONDARY_STRING ""
|
||||
#endif
|
||||
|
||||
#define DESMUME_CPUEXT_STRING DESMUME_CPUEXT_PRIMARY_STRING DESMUME_CPUEXT_SECONDARY_STRING
|
||||
|
||||
#ifdef DEVELOPER
|
||||
|
|
Loading…
Reference in New Issue