From 0db98725dc62d2c4bfdb3e73013e3f2dac5249c7 Mon Sep 17 00:00:00 2001
From: rogerman
Date: Mon, 6 Sep 2021 18:00:38 -0700
Subject: [PATCH] GPU: Do a massive refactor of the manually vectorized code
 and add full support for AVX2.

- Most notably, each version of the manually vectorized code now resides
  in its own file.
- Depending on the rendering situation, the new AVX2 code may increase
  rendering performance by 5% to 50%.
- Certain functions automatically gain manual vectorization support,
  since the new GPU code makes use of the general-purpose copy functions
  that were added in commit e991b16. In other words, AVX-512 and AltiVec
  builds also benefit from this.
---
 desmume/src/GPU.cpp                 | 4229 ++-------------------------
 desmume/src/GPU.h                   |  166 +-
 desmume/src/GPU_Operations.cpp      | 1336 +++++++++
 desmume/src/GPU_Operations.h        |   96 +
 desmume/src/GPU_Operations_AVX2.cpp | 3130 ++++++++++++++++++++
 desmume/src/GPU_Operations_AVX2.h   |  122 +
 desmume/src/GPU_Operations_SSE2.cpp | 2896 ++++++++++++++++++
 desmume/src/GPU_Operations_SSE2.h   |  122 +
 desmume/src/gfx3d.cpp               |    2 +-
 9 files changed, 8003 insertions(+), 4096 deletions(-)
 mode change 100644 => 100755 desmume/src/GPU.cpp
 create mode 100644 desmume/src/GPU_Operations.cpp
 create mode 100644 desmume/src/GPU_Operations.h
 create mode 100644 desmume/src/GPU_Operations_AVX2.cpp
 create mode 100644 desmume/src/GPU_Operations_AVX2.h
 create mode 100644 desmume/src/GPU_Operations_SSE2.cpp
 create mode 100644 desmume/src/GPU_Operations_SSE2.h

diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
old mode 100644
new mode 100755
index 091bb6ebf..42c122be4
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -48,30 +48,29 @@
 #define DISABLE_COMPOSITOR_FAST_PATHS
 #endif
+#include "GPU_Operations.cpp"
+
+#if defined(ENABLE_AVX2)
+ #define USEVECTORSIZE_256
+ #define VECTORSIZE 32
+#elif defined(ENABLE_SSE2)
+ #define USEVECTORSIZE_128
+ #define VECTORSIZE 16
+#endif
+
+#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
+ #define VECELEMENTCOUNT8 (VECTORSIZE/sizeof(s8))
+ #define VECELEMENTCOUNT16 (VECTORSIZE/sizeof(s16))
+ #define VECELEMENTCOUNT32 (VECTORSIZE/sizeof(s32))
+ // Comment out USEMANUALVECTORIZATION to disable the hand-coded vectorized code.
+ #define USEMANUALVECTORIZATION +#endif + //instantiate static instance -u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; -FragmentColor GPUEngineBase::_brightnessUpTable666[17][0x8000]; -FragmentColor GPUEngineBase::_brightnessUpTable888[17][0x8000]; -u16 GPUEngineBase::_brightnessDownTable555[17][0x8000]; -FragmentColor GPUEngineBase::_brightnessDownTable666[17][0x8000]; -FragmentColor GPUEngineBase::_brightnessDownTable888[17][0x8000]; -u8 GPUEngineBase::_blendTable555[17][17][32][32]; GPUEngineBase::MosaicLookup GPUEngineBase::_mosaicLookup; GPUSubsystem *GPU = NULL; -static size_t _gpuLargestDstLineCount = 1; -static size_t _gpuVRAMBlockOffset = GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH; - -static u16 *_gpuDstToSrcIndex = NULL; // Key: Destination pixel index / Value: Source pixel index -static u8 *_gpuDstToSrcSSSE3_u8_8e = NULL; -static u8 *_gpuDstToSrcSSSE3_u8_16e = NULL; -static u8 *_gpuDstToSrcSSSE3_u16_8e = NULL; -static u8 *_gpuDstToSrcSSSE3_u32_4e = NULL; - -static CACHE_ALIGN size_t _gpuDstPitchCount[GPU_FRAMEBUFFER_NATIVE_WIDTH]; // Key: Source pixel index in x-dimension / Value: Number of x-dimension destination pixels for the source pixel -static CACHE_ALIGN size_t _gpuDstPitchIndex[GPU_FRAMEBUFFER_NATIVE_WIDTH]; // Key: Source pixel index in x-dimension / Value: First destination pixel that maps to the source pixel - const CACHE_ALIGN SpriteSize GPUEngineBase::_sprSizeTab[4][4] = { {{8, 8}, {16, 8}, {8, 16}, {8, 8}}, {{16, 16}, {32, 8}, {8, 32}, {8, 8}}, @@ -103,1159 +102,6 @@ const CACHE_ALIGN BGLayerSize GPUEngineBase::_BGLayerSizeLUT[8][4] = { {{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext direct }; -template -static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead, size_t lineWidth, size_t lineCount) -{ - u8 *__restrict dst = (u8 *)dstLineHead + (lineWidth * ELEMENTSIZE); - - for (size_t line = 1; line < lineCount; line++) - { - memcpy(dst, dstLineHead, lineWidth * ELEMENTSIZE); - dst += (lineWidth * ELEMENTSIZE); - } -} - -template -static FORCEINLINE void CopyLineExpand_C(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount) -{ - if (INTEGERSCALEHINT == 0) - { -#if defined(MSB_FIRST) - if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) - { - for (size_t i = 0; i < dstWidth; i++) - { - if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); - } - } - } - else -#endif - { - memcpy(dst, src, dstWidth * ELEMENTSIZE); - } - } - else if (INTEGERSCALEHINT == 1) - { -#if defined(MSB_FIRST) - if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) - { - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); - } - } - } - else -#endif - { - memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE); - } - } - else if ( (INTEGERSCALEHINT >= 2) && (INTEGERSCALEHINT <= 16) ) - { - const size_t S = INTEGERSCALEHINT; - - if (SCALEVERTICAL) - { - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - for (size_t q = 0; q < S; q++) - { - for (size_t p = 0; p < S; p++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = ( (u8 *)src)[x]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = (NEEDENDIANSWAP) ? 
LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; - } - } - } - } - } - else - { - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - for (size_t p = 0; p < S; p++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[(x * S) + p] = ( (u8 *)src)[x]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[(x * S) + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[(x * S) + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; - } - } - } - } - } - else - { - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; - } - } - } - - if (SCALEVERTICAL) - { - CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); - } - } -} - -#ifdef ENABLE_SSE2 -template -static FORCEINLINE void CopyLineExpand_SSE2(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount) -{ - if (INTEGERSCALEHINT == 0) - { - memcpy(dst, src, dstWidth * ELEMENTSIZE); - } - else if (INTEGERSCALEHINT == 1) - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) ); - } - else if (INTEGERSCALEHINT == 2) - { - __m128i srcPix; - __m128i srcPixOut[2]; - - switch (ELEMENTSIZE) - { - case 1: - { - if (SCALEVERTICAL) - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ - srcPix = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ - srcPixOut[0] = _mm_unpacklo_epi8(srcPix, srcPix); \ - srcPixOut[1] = _mm_unpackhi_epi8(srcPix, srcPix); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ - ); - } - else - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ - srcPix = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ - srcPixOut[0] = _mm_unpacklo_epi8(srcPix, srcPix); \ - srcPixOut[1] = _mm_unpackhi_epi8(srcPix, srcPix); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + 1, srcPixOut[1]); \ - ); - } - break; - } - - case 2: - { - if (SCALEVERTICAL) - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ - srcPix = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ - srcPixOut[0] = _mm_unpacklo_epi16(srcPix, srcPix); \ - srcPixOut[1] = _mm_unpackhi_epi16(srcPix, srcPix); \ - _mm_store_si128((__m128i *)dst + ((X) 
* 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ - ); - } - else - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ - srcPix = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ - srcPixOut[0] = _mm_unpacklo_epi16(srcPix, srcPix); \ - srcPixOut[1] = _mm_unpackhi_epi16(srcPix, srcPix); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + 0, srcPixOut[0]); \ - _mm_store_si128((__m128i *)dst + ((X) * 2) + 1, srcPixOut[1]); \ - ); - } - break; - } - - case 4: - { - // If we're also doing vertical expansion, then the total number of instructions for a fully - // unrolled loop is 448 instructions. Therefore, let's not unroll the loop in this case in - // order to avoid overusing the CPU's instruction cache. - for (size_t i = 0; i < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); i++) - { - srcPix = _mm_load_si128((__m128i *)((__m128i *)src + i)); - srcPixOut[0] = _mm_unpacklo_epi32(srcPix, srcPix); - srcPixOut[1] = _mm_unpackhi_epi32(srcPix, srcPix); - - _mm_store_si128((__m128i *)dst + (i * 2) + 0, srcPixOut[0]); - _mm_store_si128((__m128i *)dst + (i * 2) + 1, srcPixOut[1]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)dst + (i * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); - _mm_store_si128((__m128i *)dst + (i * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); - } - } - break; - } - } - } - else if (INTEGERSCALEHINT == 3) - { - __m128i srcPixOut[3]; - - for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; ) - { - if (ELEMENTSIZE == 1) - { - const __m128i src8 = _mm_load_si128((__m128i *)((u8 *)src + srcX)); - -#ifdef ENABLE_SSSE3 - srcPixOut[0] = _mm_shuffle_epi8(src8, _mm_set_epi8( 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0)); - srcPixOut[1] = _mm_shuffle_epi8(src8, _mm_set_epi8(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5)); - srcPixOut[2] = _mm_shuffle_epi8(src8, _mm_set_epi8(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10)); -#else - __m128i src8As32[4]; - src8As32[0] = _mm_unpacklo_epi8(src8, src8); - src8As32[1] = _mm_unpackhi_epi8(src8, src8); - src8As32[2] = _mm_unpacklo_epi8(src8As32[1], src8As32[1]); - src8As32[3] = _mm_unpackhi_epi8(src8As32[1], src8As32[1]); - src8As32[1] = _mm_unpackhi_epi8(src8As32[0], src8As32[0]); - src8As32[0] = _mm_unpacklo_epi8(src8As32[0], src8As32[0]); - - src8As32[0] = _mm_and_si128(src8As32[0], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); - src8As32[1] = _mm_and_si128(src8As32[1], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); - src8As32[2] = _mm_and_si128(src8As32[2], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); - src8As32[3] = _mm_and_si128(src8As32[3], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); - - __m128i srcWorking[4]; - - srcWorking[0] = _mm_shuffle_epi32(src8As32[0], 0x40); - srcWorking[1] = _mm_shuffle_epi32(src8As32[0], 0xA5); - srcWorking[2] = _mm_shuffle_epi32(src8As32[0], 0xFE); - 
srcWorking[3] = _mm_shuffle_epi32(src8As32[1], 0x40); - srcPixOut[0] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); - - srcWorking[0] = _mm_shuffle_epi32(src8As32[1], 0xA5); - srcWorking[1] = _mm_shuffle_epi32(src8As32[1], 0xFE); - srcWorking[2] = _mm_shuffle_epi32(src8As32[2], 0x40); - srcWorking[3] = _mm_shuffle_epi32(src8As32[2], 0xA5); - srcPixOut[1] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); - - srcWorking[0] = _mm_shuffle_epi32(src8As32[2], 0xFE); - srcWorking[1] = _mm_shuffle_epi32(src8As32[3], 0x40); - srcWorking[2] = _mm_shuffle_epi32(src8As32[3], 0xA5); - srcWorking[3] = _mm_shuffle_epi32(src8As32[3], 0xFE); - srcPixOut[2] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); -#endif - _mm_store_si128((__m128i *)((u8 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)((u8 *)dst + dstX + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)((u8 *)dst + dstX + 32), srcPixOut[2]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 32), srcPixOut[2]); - - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 32), srcPixOut[2]); - } - - srcX += 16; - dstX += 48; - } - else if (ELEMENTSIZE == 2) - { - const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX)); - -#ifdef ENABLE_SSSE3 - srcPixOut[0] = _mm_shuffle_epi8(src16, _mm_set_epi8( 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0)); - srcPixOut[1] = _mm_shuffle_epi8(src16, _mm_set_epi8(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4)); - srcPixOut[2] = _mm_shuffle_epi8(src16, _mm_set_epi8(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10)); -#else - const __m128i src16lo = _mm_shuffle_epi32(src16, 0x44); - const __m128i src16hi = _mm_shuffle_epi32(src16, 0xEE); - - srcPixOut[0] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16lo, 0x40), 0xA5); - srcPixOut[1] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16, 0xFE), 0x40); - srcPixOut[2] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16hi, 0xA5), 0xFE); -#endif - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), srcPixOut[2]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 16), srcPixOut[2]); - - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + 
((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 16), srcPixOut[2]); - } - - srcX += 8; - dstX += 24; - } - else if (ELEMENTSIZE == 4) - { - const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX)); - - srcPixOut[0] = _mm_shuffle_epi32(src32, 0x40); - srcPixOut[1] = _mm_shuffle_epi32(src32, 0xA5); - srcPixOut[2] = _mm_shuffle_epi32(src32, 0xFE); - - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 8), srcPixOut[2]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 1) + 8), srcPixOut[2]); - - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 3) * 2) + 8), srcPixOut[2]); - } - - srcX += 4; - dstX += 12; - } - } - } - else if (INTEGERSCALEHINT == 4) - { - __m128i srcPixOut[4]; - - for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; ) - { - if (ELEMENTSIZE == 1) - { - const __m128i src8 = _mm_load_si128((__m128i *)( (u8 *)src + srcX)); - -#ifdef ENABLE_SSSE3 - srcPixOut[0] = _mm_shuffle_epi8(src8, _mm_set_epi8( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0)); - srcPixOut[1] = _mm_shuffle_epi8(src8, _mm_set_epi8( 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4)); - srcPixOut[2] = _mm_shuffle_epi8(src8, _mm_set_epi8(11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8)); - srcPixOut[3] = _mm_shuffle_epi8(src8, _mm_set_epi8(15, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12)); -#else - const __m128i src8_lo = _mm_unpacklo_epi8(src8, src8); - const __m128i src8_hi = _mm_unpackhi_epi8(src8, src8); - - srcPixOut[0] = _mm_unpacklo_epi8(src8_lo, src8_lo); - srcPixOut[1] = _mm_unpackhi_epi8(src8_lo, src8_lo); - srcPixOut[2] = _mm_unpacklo_epi8(src8_hi, src8_hi); - srcPixOut[3] = _mm_unpackhi_epi8(src8_hi, src8_hi); -#endif - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + 32), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + 48), srcPixOut[3]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 32), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 48), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 32), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 
48), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 16), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 32), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 48), srcPixOut[3]); - } - - srcX += 16; - dstX += 64; - } - else if (ELEMENTSIZE == 2) - { - const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX)); - -#ifdef ENABLE_SSSE3 - srcPixOut[0] = _mm_shuffle_epi8(src16, _mm_set_epi8( 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0)); - srcPixOut[1] = _mm_shuffle_epi8(src16, _mm_set_epi8( 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4)); - srcPixOut[2] = _mm_shuffle_epi8(src16, _mm_set_epi8(11, 10, 11, 10, 11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8)); - srcPixOut[3] = _mm_shuffle_epi8(src16, _mm_set_epi8(15, 14, 15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12)); -#else - const __m128i src16_lo = _mm_unpacklo_epi16(src16, src16); - const __m128i src16_hi = _mm_unpackhi_epi16(src16, src16); - - srcPixOut[0] = _mm_unpacklo_epi16(src16_lo, src16_lo); - srcPixOut[1] = _mm_unpackhi_epi16(src16_lo, src16_lo); - srcPixOut[2] = _mm_unpacklo_epi16(src16_hi, src16_hi); - srcPixOut[3] = _mm_unpackhi_epi16(src16_hi, src16_hi); -#endif - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), srcPixOut[2]); - _mm_store_si128((__m128i *)((u16 *)dst + dstX + 24), srcPixOut[3]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 16), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 24), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 16), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 24), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 8), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 16), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u16 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 24), srcPixOut[3]); - } - - srcX += 8; - dstX += 32; - } - else if (ELEMENTSIZE == 4) - { - const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX)); - -#ifdef ENABLE_SSSE3 - srcPixOut[0] = _mm_shuffle_epi8(src32, _mm_set_epi8( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0)); - srcPixOut[1] = _mm_shuffle_epi8(src32, _mm_set_epi8( 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4)); - srcPixOut[2] = _mm_shuffle_epi8(src32, _mm_set_epi8(11, 
10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8)); - srcPixOut[3] = _mm_shuffle_epi8(src32, _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12)); -#else - const __m128i src32_lo = _mm_unpacklo_epi32(src32, src32); - const __m128i src32_hi = _mm_unpackhi_epi32(src32, src32); - - srcPixOut[0] = _mm_unpacklo_epi32(src32_lo, src32_lo); - srcPixOut[1] = _mm_unpackhi_epi32(src32_lo, src32_lo); - srcPixOut[2] = _mm_unpacklo_epi32(src32_hi, src32_hi); - srcPixOut[3] = _mm_unpackhi_epi32(src32_hi, src32_hi); -#endif - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 8), srcPixOut[2]); - _mm_store_si128((__m128i *)((u32 *)dst + dstX + 12), srcPixOut[3]); - - if (SCALEVERTICAL) - { - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 8), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 1) + 12), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 8), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 2) + 12), srcPixOut[3]); - - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 0), srcPixOut[0]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 4), srcPixOut[1]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 8), srcPixOut[2]); - _mm_store_si128((__m128i *)( (u32 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) * 3) + 12), srcPixOut[3]); - } - - srcX += 4; - dstX += 16; - } - } - } -#ifdef ENABLE_SSSE3 - else if (INTEGERSCALEHINT >= 0) - { - const size_t scale = dstWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; - - for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; ) - { - if (ELEMENTSIZE == 1) - { - const __m128i src8 = _mm_load_si128((__m128i *)((u8 *)src + srcX)); - - for (size_t s = 0; s < scale; s++) - { - const __m128i ssse3idx_u8 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u8_16e + (s * 16))); - _mm_store_si128( (__m128i *)( (u8 *)dst + dstX + (s * 16)), _mm_shuffle_epi8( src8, ssse3idx_u8 ) ); - } - - srcX += 16; - dstX += (16 * scale); - } - else if (ELEMENTSIZE == 2) - { - const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX)); - - for (size_t s = 0; s < scale; s++) - { - const __m128i ssse3idx_u16 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u16_8e + (s * 16))); - _mm_store_si128( (__m128i *)((u16 *)dst + dstX + (s * 8)), _mm_shuffle_epi8(src16, ssse3idx_u16) ); - } - - srcX += 8; - dstX += (8 * scale); - } - else if (ELEMENTSIZE == 4) - { - const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX)); - - for (size_t s = 0; s < scale; s++) - { - const __m128i ssse3idx_u32 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u32_4e + (s * 16))); - _mm_store_si128( (__m128i *)((u32 *)dst + dstX + (s * 4)), 
_mm_shuffle_epi8(src32, ssse3idx_u32) ); - } - - srcX += 4; - dstX += (4 * scale); - } - } - - if (SCALEVERTICAL) - { - CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); - } - } -#endif - else - { - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x]; - } - } - } - - if (SCALEVERTICAL) - { - CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); - } - } -} -#endif - -template -static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount) -{ - // Use INTEGERSCALEHINT to provide a hint to CopyLineExpand() for the fastest execution path. - // INTEGERSCALEHINT represents the scaling value of the framebuffer width, and is always - // assumed to be a positive integer. - // - // Use cases: - // - Passing a value of 0 causes CopyLineExpand() to perform a simple copy, using dstWidth - // to copy dstWidth elements. - // - Passing a value of 1 causes CopyLineExpand() to perform a simple copy, ignoring dstWidth - // and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements. - // - Passing any negative value causes CopyLineExpand() to assume that the framebuffer width - // is NOT scaled by an integer value, and will therefore take the safest (but slowest) - // execution path. - // - Passing any positive value greater than 1 causes CopyLineExpand() to expand the line - // using the integer scaling value. - -#ifdef ENABLE_SSE2 - CopyLineExpand_SSE2(dst, src, dstWidth, dstLineCount); -#else - CopyLineExpand_C(dst, src, dstWidth, dstLineCount); -#endif -} - -template -void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, - void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount) -{ - switch (INTEGERSCALEHINT) - { - case 0: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer; - - CopyLineExpand(dst, src, dstLineWidth * dstLineCount, 1); - break; - } - - case 1: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer; - - CopyLineExpand(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH, 1); - break; - } - - default: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer; - - // TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized. - // - // The implementation below is a stopgap measure for getting the faster code paths to run. - // However, this setup is not ideal, since the code size will greatly increase in order to - // include all possible code paths, possibly causing cache misses on lesser CPUs. 
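
/* For reference: the switch below is the stopgap the TODO describes, mapping
   dstLineWidth onto a compile-time INTEGERSCALEHINT per the use cases
   documented for CopyLineExpand(). A minimal sketch of that mapping as a
   runtime function (ScaleHintFromWidth is a hypothetical name for
   illustration, not part of DeSmuME's API):

   static int ScaleHintFromWidth(size_t dstLineWidth)
   {
       if ((dstLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) != 0)
           return -1;     // non-integer scaling factor: take the safe, slow path
       const size_t s = dstLineWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH;
       if ((s >= 2) && (s <= 4))
           return (int)s; // dedicated unrolled paths exist for 2x, 3x, and 4x
       return 0xFFFF;     // any other integer scale: generic integer-scale path
   }
*/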
- switch (dstLineWidth) - { - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2): - CopyLineExpand<2, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2, 2); - break; - - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3): - CopyLineExpand<3, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3, 3); - break; - - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4): - CopyLineExpand<4, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4, 4); - break; - - default: - { - if ((dstLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) - { - CopyLineExpand<0xFFFF, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLineWidth, dstLineCount); - } - else - { - CopyLineExpand<-1, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLineWidth, dstLineCount); - } - break; - } - } - break; - } - } -} - -template -void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer) -{ - CopyLineExpandHinted(srcBuffer, lineInfo.indexNative, - dstBuffer, lineInfo.indexCustom, lineInfo.widthCustom, lineInfo.renderCount); -} - -template -static FORCEINLINE void CopyLineReduce_C(void *__restrict dst, const void *__restrict src, size_t srcWidth) -{ - if (INTEGERSCALEHINT == 0) - { -#if defined(MSB_FIRST) - if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) - { - for (size_t i = 0; i < srcWidth; i++) - { - if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); - } - } - } - else -#endif - { - memcpy(dst, src, srcWidth * ELEMENTSIZE); - } - } - else if (INTEGERSCALEHINT == 1) - { -#if defined(MSB_FIRST) - if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) - { - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); - } - } - } - else -#endif - { - memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE); - } - } - else if ( (INTEGERSCALEHINT >= 2) && (INTEGERSCALEHINT <= 16) ) - { - const size_t S = INTEGERSCALEHINT; - - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - if (ELEMENTSIZE == 1) - { - ((u8 *)dst)[x] = ((u8 *)src)[x * S]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[x] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x * S] ) : ((u16 *)src)[x * S]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[x] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x * S] ) : ((u32 *)src)[x * S]; - } - } - } - else - { - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[i] = ((u8 *)src)[_gpuDstPitchIndex[i]]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[_gpuDstPitchIndex[i]] ) : ((u16 *)src)[_gpuDstPitchIndex[i]]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = (NEEDENDIANSWAP) ? 
LE_TO_LOCAL_32( ((u32 *)src)[_gpuDstPitchIndex[i]] ) : ((u32 *)src)[_gpuDstPitchIndex[i]]; - } - } - } -} - -#ifdef ENABLE_SSE2 -template -static FORCEINLINE void CopyLineReduce_SSE2(void *__restrict dst, const void *__restrict src, size_t srcWidth) -{ - if (INTEGERSCALEHINT == 0) - { - memcpy(dst, src, srcWidth * ELEMENTSIZE); - } - else if (INTEGERSCALEHINT == 1) - { - MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) ); - } - else if (INTEGERSCALEHINT == 2) - { - __m128i srcPix[2]; - - for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) - { - srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 2) + 0); - srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 2) + 1); - - if (ELEMENTSIZE == 1) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x00FF00FF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x00FF00FF)); - - _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi16(srcPix[0], srcPix[1])); - } - else if (ELEMENTSIZE == 2) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x0000FFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x0000FFFF)); - -#if defined(ENABLE_SSE4_1) - _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi32(srcPix[0], srcPix[1])); -#elif defined(ENABLE_SSSE3) - srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); - srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2)); - - _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); -#else - srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0xD8); - srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0xD8); - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); - - srcPix[1] = _mm_shufflelo_epi16(srcPix[1], 0xD8); - srcPix[1] = _mm_shufflehi_epi16(srcPix[1], 0xD8); - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); - - _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); -#endif - } - else if (ELEMENTSIZE == 4) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF)); - - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); - - _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); - } - } - } - else if (INTEGERSCALEHINT == 3) - { - __m128i srcPix[3]; - - for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) - { - srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 3) + 0); - srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 3) + 1); - srcPix[2] = _mm_load_si128((__m128i *)src + (dstX * 3) + 2); - - if (ELEMENTSIZE == 1) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFF0000FF, 0x0000FF00, 0x00FF0000, 0xFF0000FF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00FF0000, 0xFF0000FF, 0x0000FF00, 0x00FF0000)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x0000FF00, 0x00FF0000, 0xFF0000FF, 0x0000FF00)); - -#ifdef ENABLE_SSSE3 - srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(14, 13, 11, 10, 8, 7, 5, 4, 2, 1, 15, 12, 9, 6, 3, 0)); - srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(15, 13, 12, 10, 9, 14, 11, 8, 5, 2, 7, 6, 4, 3, 1, 0)); - srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(13, 10, 7, 4, 1, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0)); 
- - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); -#else - __m128i srcWorking[3]; - - srcWorking[0] = _mm_unpacklo_epi8(srcPix[0], _mm_setzero_si128()); - srcWorking[1] = _mm_unpackhi_epi8(srcPix[0], _mm_setzero_si128()); - srcWorking[2] = _mm_unpacklo_epi8(srcPix[1], _mm_setzero_si128()); - srcPix[0] = _mm_or_si128(srcWorking[0], srcWorking[1]); - srcPix[0] = _mm_or_si128(srcPix[0], srcWorking[2]); - - srcWorking[0] = _mm_unpackhi_epi8(srcPix[1], _mm_setzero_si128()); - srcWorking[1] = _mm_unpacklo_epi8(srcPix[2], _mm_setzero_si128()); - srcWorking[2] = _mm_unpackhi_epi8(srcPix[2], _mm_setzero_si128()); - srcPix[1] = _mm_or_si128(srcWorking[0], srcWorking[1]); - srcPix[1] = _mm_or_si128(srcPix[1], srcWorking[2]); - - srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0x6C); - srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0x6C); - srcPix[1] = _mm_shufflelo_epi16(srcPix[1], 0x6C); - srcPix[1] = _mm_shufflehi_epi16(srcPix[1], 0x6C); - - srcPix[0] = _mm_packus_epi16(srcPix[0], srcPix[1]); - srcPix[1] = _mm_shuffle_epi32(srcPix[0], 0xB1); - - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000)); - - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); -#endif - _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); - } - else if (ELEMENTSIZE == 2) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x0000FFFF, 0x00000000, 0xFFFF0000, 0x0000FFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0xFFFF0000, 0x0000FFFF, 0x00000000, 0xFFFF0000)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0xFFFF0000, 0x0000FFFF, 0x00000000)); - -#ifdef ENABLE_SSSE3 - srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(15, 14, 11, 10, 9, 8, 5, 4, 3, 2, 13, 12, 7, 6, 1, 0)); - srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(13, 12, 11, 10, 15, 14, 9, 8, 3, 2, 7, 6, 5, 4, 1, 0)); - srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(11, 10, 5, 4, 15, 14, 13, 12, 9, 8, 7, 6, 3, 2, 1, 0)); -#else - srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0x9C); - srcPix[1] = _mm_shufflehi_epi16(srcPix[1], 0x9C); - srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0x9C); - - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0x9C); - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0xE1); - srcPix[2] = _mm_shufflehi_epi16(srcPix[2], 0xC9); -#endif - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); - - _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); - } - else if (ELEMENTSIZE == 4) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000)); - - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0x9C); - srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0x78); - - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); - - _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); - } - } - } - else if (INTEGERSCALEHINT == 4) - { - __m128i srcPix[4]; - - for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) - { - srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 4) + 0); - srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 4) + 1); - srcPix[2] = _mm_load_si128((__m128i *)src + 
(dstX * 4) + 2); - srcPix[3] = _mm_load_si128((__m128i *)src + (dstX * 4) + 3); - - if (ELEMENTSIZE == 1) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x000000FF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x000000FF)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set1_epi32(0x000000FF)); - srcPix[3] = _mm_and_si128(srcPix[3], _mm_set1_epi32(0x000000FF)); - - srcPix[0] = _mm_packus_epi16(srcPix[0], srcPix[1]); - srcPix[1] = _mm_packus_epi16(srcPix[2], srcPix[3]); - - _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi16(srcPix[0], srcPix[1])); - } - else if (ELEMENTSIZE == 2) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); - srcPix[3] = _mm_and_si128(srcPix[3], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); - -#if defined(ENABLE_SSE4_1) - srcPix[0] = _mm_packus_epi32(srcPix[0], srcPix[1]); - srcPix[1] = _mm_packus_epi32(srcPix[2], srcPix[3]); - - _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi32(srcPix[0], srcPix[1])); -#elif defined(ENABLE_SSSE3) - srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(15, 14, 13, 12, 11, 10, 7, 6, 5, 4, 3, 2, 9, 8, 1, 0)); - srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(13, 12, 13, 12, 11, 10, 7, 6, 9, 8, 1, 0, 5, 4, 3, 2)); - srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(13, 12, 13, 12, 9, 8, 1, 0, 11, 10, 7, 6, 5, 4, 3, 2)); - srcPix[3] = _mm_shuffle_epi8(srcPix[3], _mm_set_epi8( 9, 8, 1, 0, 15, 14, 13, 12, 11, 10, 7, 6, 5, 4, 3, 2)); - - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); - srcPix[1] = _mm_or_si128(srcPix[2], srcPix[3]); - - _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); -#else - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0xD8); - srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0xD8); - srcPix[3] = _mm_shuffle_epi32(srcPix[3], 0xD8); - - srcPix[0] = _mm_unpacklo_epi32(srcPix[0], srcPix[1]); - srcPix[1] = _mm_unpacklo_epi32(srcPix[2], srcPix[3]); - - srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); - - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); - srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0xD8); - srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0xD8); - - _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); -#endif - } - else if (ELEMENTSIZE == 4) - { - srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); - srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); - srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); - srcPix[3] = _mm_and_si128(srcPix[3], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); - - srcPix[0] = _mm_unpacklo_epi32(srcPix[0], srcPix[1]); - srcPix[1] = _mm_unpacklo_epi32(srcPix[2], srcPix[3]); -#ifdef HOST_64 - srcPix[0] = _mm_unpacklo_epi64(srcPix[0], srcPix[1]); -#else - srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x4E); - srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); -#endif - _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); - } - } - } - else if ( (INTEGERSCALEHINT >= 5) && (INTEGERSCALEHINT <= 16) ) - { - const size_t S = INTEGERSCALEHINT; - - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) - { - if (ELEMENTSIZE == 
1) - { - ((u8 *)dst)[x] = ((u8 *)src)[x * S]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[x] = ((u16 *)src)[x * S]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[x] = ((u32 *)src)[x * S]; - } - } - } - else - { - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - if (ELEMENTSIZE == 1) - { - ( (u8 *)dst)[i] = ( (u8 *)src)[_gpuDstPitchIndex[i]]; - } - else if (ELEMENTSIZE == 2) - { - ((u16 *)dst)[i] = ((u16 *)src)[_gpuDstPitchIndex[i]]; - } - else if (ELEMENTSIZE == 4) - { - ((u32 *)dst)[i] = ((u32 *)src)[_gpuDstPitchIndex[i]]; - } - } - } -} -#endif - -template -static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src, size_t srcWidth) -{ - // Use INTEGERSCALEHINT to provide a hint to CopyLineReduce() for the fastest execution path. - // INTEGERSCALEHINT represents the scaling value of the source framebuffer width, and is always - // assumed to be a positive integer. - // - // Use cases: - // - Passing a value of 0 causes CopyLineReduce() to perform a simple copy, using srcWidth - // to copy srcWidth elements. - // - Passing a value of 1 causes CopyLineReduce() to perform a simple copy, ignoring srcWidth - // and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements. - // - Passing any negative value causes CopyLineReduce() to assume that the framebuffer width - // is NOT scaled by an integer value, and will therefore take the safest (but slowest) - // execution path. - // - Passing any positive value greater than 1 causes CopyLineReduce() to expand the line - // using the integer scaling value. - -#ifdef ENABLE_SSE2 - CopyLineReduce_SSE2(dst, src, srcWidth); -#else - CopyLineReduce_C(dst, src, srcWidth); -#endif -} - -template -void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth, - void *__restrict dstBuffer, const size_t dstLineIndex) -{ - switch (INTEGERSCALEHINT) - { - case 0: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer; - - CopyLineReduce(dst, src, srcLineWidth); - break; - } - - case 1: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer; - - CopyLineReduce(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH); - break; - } - - default: - { - const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer; - u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer; - - // TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized. - // - // The implementation below is a stopgap measure for getting the faster code paths to run. - // However, this setup is not ideal, since the code size will greatly increase in order to - // include all possible code paths, possibly causing cache misses on lesser CPUs. 
- switch (srcLineWidth) - { - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2): - CopyLineReduce<2, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2); - break; - - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3): - CopyLineReduce<3, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3); - break; - - case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4): - CopyLineReduce<4, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4); - break; - - default: - { - if ((srcLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) - { - CopyLineReduce<0xFFFF, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, srcLineWidth); - } - else - { - CopyLineReduce<-1, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, srcLineWidth); - } - break; - } - } - break; - } - } -} - -template -void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer) -{ - CopyLineReduceHinted(srcBuffer, lineInfo.indexCustom, lineInfo.widthCustom, - dstBuffer, lineInfo.indexNative); -} - /*****************************************************************************/ // BACKGROUND RENDERING -ROTOSCALE- /*****************************************************************************/ @@ -1308,71 +154,6 @@ bool gpu_loadstate(EMUFILE &is, int size) /*****************************************************************************/ // INITIALIZATION /*****************************************************************************/ -void GPUEngineBase::_InitLUTs() -{ - static bool didInit = false; - - if (didInit) - { - return; - } - - /* - NOTE: gbatek (in the reference above) seems to expect 6bit values - per component, but as desmume works with 5bit per component, - we use 31 as top, instead of 63. Testing it on a few games, - using 63 seems to give severe color wraping, and 31 works - nicely, so for now we'll just that, until proven wrong. - - i have seen pics of pokemon ranger getting white with 31, with 63 it is nice. - it could be pb of alpha or blending or... - - MightyMax> created a test NDS to check how the brightness values work, - and 31 seems to be correct. FactorEx is a override for max brighten/darken - See: http://mightymax.org/gfx_test_brightness.nds - The Pokemon Problem could be a problem with 8/32 bit writes not recognized yet, - i'll add that so you can check back. 
- */ - - for (u16 i = 0; i <= 16; i++) - { - for (u16 j = 0x0000; j < 0x8000; j++) - { - COLOR cur; - - cur.val = j; - cur.bits.red = (cur.bits.red + ((31 - cur.bits.red) * i / 16)); - cur.bits.green = (cur.bits.green + ((31 - cur.bits.green) * i / 16)); - cur.bits.blue = (cur.bits.blue + ((31 - cur.bits.blue) * i / 16)); - cur.bits.alpha = 0; - GPUEngineBase::_brightnessUpTable555[i][j] = cur.val; - GPUEngineBase::_brightnessUpTable666[i][j].color = COLOR555TO666(cur.val); - GPUEngineBase::_brightnessUpTable888[i][j].color = COLOR555TO888(cur.val); - - cur.val = j; - cur.bits.red = (cur.bits.red - (cur.bits.red * i / 16)); - cur.bits.green = (cur.bits.green - (cur.bits.green * i / 16)); - cur.bits.blue = (cur.bits.blue - (cur.bits.blue * i / 16)); - cur.bits.alpha = 0; - GPUEngineBase::_brightnessDownTable555[i][j] = cur.val; - GPUEngineBase::_brightnessDownTable666[i][j].color = COLOR555TO666(cur.val); - GPUEngineBase::_brightnessDownTable888[i][j].color = COLOR555TO888(cur.val); - } - } - - for(int c0=0;c0<=31;c0++) - for(int c1=0;c1<=31;c1++) - for(int eva=0;eva<=16;eva++) - for(int evb=0;evb<=16;evb++) - { - int blend = ((c0 * eva) + (c1 * evb) ) / 16; - int final = std::min(31,blend); - GPUEngineBase::_blendTable555[eva][evb][c0][c1] = final; - } - - didInit = true; -} - GPUEngineBase::GPUEngineBase() { _IORegisterMap = NULL; @@ -1393,7 +174,6 @@ GPUEngineBase::GPUEngineBase() _BGLayer[GPULayerID_BG2].extPalette = NULL; _BGLayer[GPULayerID_BG3].extPalette = NULL; - _InitLUTs(); _internalRenderLineTargetCustom = NULL; _renderLineLayerIDCustom = NULL; _deferredIndexCustom = NULL; @@ -1496,9 +276,9 @@ void GPUEngineBase::_Reset_Base() memset(this->_sprColor, 0, sizeof(this->_sprColor)); memset(this->_sprNum, 0, sizeof(this->_sprNum)); - memset(this->_didPassWindowTestNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8)); - memset(this->_enableColorEffectNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8)); - memset(this->_didPassWindowTestCustomMasterPtr, 1, 10 * dispInfo.customWidth * sizeof(u8)); + memset(this->_didPassWindowTestNative, 0xFF, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8)); + memset(this->_enableColorEffectNative, 0xFF, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8)); + memset(this->_didPassWindowTestCustomMasterPtr, 0xFF, 10 * dispInfo.customWidth * sizeof(u8)); memset(this->_h_win[0], 0, sizeof(this->_h_win[0])); memset(this->_h_win[1], 0, sizeof(this->_h_win[1])); @@ -1594,115 +374,33 @@ void GPUEngineBase::_Reset_Base() renderState.masterBrightnessIntensity = 0; renderState.masterBrightnessIsFullIntensity = false; renderState.masterBrightnessIsMaxOrMin = true; - renderState.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[renderState.blendEVA][renderState.blendEVB][0][0]; - renderState.brightnessUpTable555 = &GPUEngineBase::_brightnessUpTable555[renderState.blendEVY][0]; - renderState.brightnessUpTable666 = &GPUEngineBase::_brightnessUpTable666[renderState.blendEVY][0]; - renderState.brightnessUpTable888 = &GPUEngineBase::_brightnessUpTable888[renderState.blendEVY][0]; - renderState.brightnessDownTable555 = &GPUEngineBase::_brightnessDownTable555[renderState.blendEVY][0]; - renderState.brightnessDownTable666 = &GPUEngineBase::_brightnessDownTable666[renderState.blendEVY][0]; - renderState.brightnessDownTable888 = &GPUEngineBase::_brightnessDownTable888[renderState.blendEVY][0]; + renderState.blendTable555 = (TBlendTable *)&PixelOperation::BlendTable555[renderState.blendEVA][renderState.blendEVB][0][0]; + renderState.brightnessUpTable555 = 
&PixelOperation::BrightnessUpTable555[renderState.blendEVY][0]; + renderState.brightnessUpTable666 = &PixelOperation::BrightnessUpTable666[renderState.blendEVY][0]; + renderState.brightnessUpTable888 = &PixelOperation::BrightnessUpTable888[renderState.blendEVY][0]; + renderState.brightnessDownTable555 = &PixelOperation::BrightnessDownTable555[renderState.blendEVY][0]; + renderState.brightnessDownTable666 = &PixelOperation::BrightnessDownTable666[renderState.blendEVY][0]; + renderState.brightnessDownTable888 = &PixelOperation::BrightnessDownTable888[renderState.blendEVY][0]; - renderState.srcEffectEnable[GPULayerID_BG0] = false; - renderState.srcEffectEnable[GPULayerID_BG1] = false; - renderState.srcEffectEnable[GPULayerID_BG2] = false; - renderState.srcEffectEnable[GPULayerID_BG3] = false; - renderState.srcEffectEnable[GPULayerID_OBJ] = false; - renderState.srcEffectEnable[GPULayerID_Backdrop] = false; - - renderState.dstBlendEnable[GPULayerID_BG0] = false; - renderState.dstBlendEnable[GPULayerID_BG1] = false; - renderState.dstBlendEnable[GPULayerID_BG2] = false; - renderState.dstBlendEnable[GPULayerID_BG3] = false; - renderState.dstBlendEnable[GPULayerID_OBJ] = false; - renderState.dstBlendEnable[GPULayerID_Backdrop] = false; - renderState.dstAnyBlendEnable = false; - -#ifdef ENABLE_SSE2 - renderState.srcEffectEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.srcEffectEnable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.srcEffectEnable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.srcEffectEnable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.srcEffectEnable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.srcEffectEnable_SSE2[GPULayerID_Backdrop] = _mm_setzero_si128(); -#ifdef ENABLE_SSSE3 - renderState.dstBlendEnable_SSSE3 = _mm_setzero_si128(); -#else - renderState.dstBlendEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.dstBlendEnable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.dstBlendEnable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.dstBlendEnable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.dstBlendEnable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_setzero_si128(); -#endif -#endif - - renderState.WIN0_enable[GPULayerID_BG0] = 0; - renderState.WIN0_enable[GPULayerID_BG1] = 0; - renderState.WIN0_enable[GPULayerID_BG2] = 0; - renderState.WIN0_enable[GPULayerID_BG3] = 0; - renderState.WIN0_enable[GPULayerID_OBJ] = 0; - renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = 0; - - renderState.WIN1_enable[GPULayerID_BG0] = 0; - renderState.WIN1_enable[GPULayerID_BG1] = 0; - renderState.WIN1_enable[GPULayerID_BG2] = 0; - renderState.WIN1_enable[GPULayerID_BG3] = 0; - renderState.WIN1_enable[GPULayerID_OBJ] = 0; - renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = 0; - - renderState.WINOUT_enable[GPULayerID_BG0] = 0; - renderState.WINOUT_enable[GPULayerID_BG1] = 0; - renderState.WINOUT_enable[GPULayerID_BG2] = 0; - renderState.WINOUT_enable[GPULayerID_BG3] = 0; - renderState.WINOUT_enable[GPULayerID_OBJ] = 0; - renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = 0; - - renderState.WINOBJ_enable[GPULayerID_BG0] = 0; - renderState.WINOBJ_enable[GPULayerID_BG1] = 0; - renderState.WINOBJ_enable[GPULayerID_BG2] = 0; - renderState.WINOBJ_enable[GPULayerID_BG3] = 0; - renderState.WINOBJ_enable[GPULayerID_OBJ] = 0; - renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = 0; - -#if defined(ENABLE_SSE2) - 
renderState.WIN0_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.WIN0_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.WIN0_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.WIN0_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.WIN0_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128(); - - renderState.WIN1_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.WIN1_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.WIN1_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.WIN1_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.WIN1_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128(); - - renderState.WINOUT_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.WINOUT_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.WINOUT_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.WINOUT_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.WINOUT_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128(); - - renderState.WINOBJ_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128(); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128(); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128(); - renderState.WINOBJ_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128(); - renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128(); -#endif + memset(&renderState.WIN0_enable[0], 0, sizeof(renderState.WIN0_enable[0]) * 6); + memset(&renderState.WIN1_enable[0], 0, sizeof(renderState.WIN1_enable[0]) * 6); + memset(&renderState.WINOUT_enable[0], 0, sizeof(renderState.WINOUT_enable[0]) * 6); + memset(&renderState.WINOBJ_enable[0], 0, sizeof(renderState.WINOBJ_enable[0]) * 6); + memset(&renderState.dstBlendEnableVecLookup, 0, sizeof(renderState.dstBlendEnableVecLookup)); renderState.WIN0_ENABLED = false; renderState.WIN1_ENABLED = false; renderState.WINOBJ_ENABLED = false; renderState.isAnyWindowEnabled = false; - renderState.mosaicWidthBG = this->_mosaicLookup.table[0]; - renderState.mosaicHeightBG = this->_mosaicLookup.table[0]; - renderState.mosaicWidthOBJ = this->_mosaicLookup.table[0]; - renderState.mosaicHeightOBJ = this->_mosaicLookup.table[0]; + memset(&renderState.srcEffectEnable[0], 0, sizeof(renderState.srcEffectEnable[0]) * 6); + memset(&renderState.dstBlendEnable[0], 0, sizeof(renderState.dstBlendEnable[0]) * 6); + renderState.dstAnyBlendEnable = false; + + renderState.mosaicWidthBG = &this->_mosaicLookup.table[0]; + renderState.mosaicHeightBG = &this->_mosaicLookup.table[0]; + renderState.mosaicWidthOBJ = &this->_mosaicLookup.table[0]; + renderState.mosaicHeightOBJ = &this->_mosaicLookup.table[0]; renderState.isBGMosaicSet = false; renderState.isOBJMosaicSet = false; @@ -1787,464 +485,6 @@ void GPUEngineBase::_ResortBGLayers() #endif } -FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB) -{ - u16 ra = colA & 0x001F; - u16 ga = (colA >> 5) & 0x001F; - u16 ba = (colA >> 10) & 0x001F; - u16 rb = colB & 0x001F; - u16 gb = (colB >> 5) & 0x001F; - u16 bb = (colB >> 10) & 0x001F; - - ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16; - ga 
= ( (ga * blendEVA) + (gb * blendEVB) ) / 16; - ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16; - - ra = (ra > 31) ? 31 : ra; - ga = (ga > 31) ? 31 : ga; - ba = (ba > 31) ? 31 : ba; - - return ra | (ga << 5) | (ba << 10); -} - -template -FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB) -{ - FragmentColor outColor; - - u16 r16 = ( (colA.r * blendEVA) + (colB.r * blendEVB) ) / 16; - u16 g16 = ( (colA.g * blendEVA) + (colB.g * blendEVB) ) / 16; - u16 b16 = ( (colA.b * blendEVA) + (colB.b * blendEVB) ) / 16; - - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - outColor.r = (r16 > 63) ? 63 : r16; - outColor.g = (g16 > 63) ? 63 : g16; - outColor.b = (b16 > 63) ? 63 : b16; - } - else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) - { - outColor.r = (r16 > 255) ? 255 : r16; - outColor.g = (g16 > 255) ? 255 : g16; - outColor.b = (b16 > 255) ? 255 : b16; - } - - outColor.a = 0; - return outColor; -} - -FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable) -{ - const u8 r = (*blendTable)[ colA & 0x1F][ colB & 0x1F]; - const u8 g = (*blendTable)[(colA >> 5) & 0x1F][(colB >> 5) & 0x1F]; - const u8 b = (*blendTable)[(colA >> 10) & 0x1F][(colB >> 10) & 0x1F]; - - return r | (g << 5) | (b << 10); -} - -FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const u16 colB) -{ - const u16 alpha = colA.a + 1; - COLOR c2; - COLOR cfinal; - - c2.val = colB; - - cfinal.bits.red = ((colA.r * alpha) + ((c2.bits.red << 1) * (32 - alpha))) >> 6; - cfinal.bits.green = ((colA.g * alpha) + ((c2.bits.green << 1) * (32 - alpha))) >> 6; - cfinal.bits.blue = ((colA.b * alpha) + ((c2.bits.blue << 1) * (32 - alpha))) >> 6; - cfinal.bits.alpha = 0; - - return cfinal.val; -} - -template -FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB) -{ - FragmentColor blendedColor; - const u16 alpha = colA.a + 1; - - if (COLORFORMATB == NDSColorFormat_BGR666_Rev) - { - blendedColor.r = ((colA.r * alpha) + (colB.r * (32 - alpha))) >> 5; - blendedColor.g = ((colA.g * alpha) + (colB.g * (32 - alpha))) >> 5; - blendedColor.b = ((colA.b * alpha) + (colB.b * (32 - alpha))) >> 5; - } - else if (COLORFORMATB == NDSColorFormat_BGR888_Rev) - { - blendedColor.r = ((colA.r * alpha) + (colB.r * (256 - alpha))) >> 8; - blendedColor.g = ((colA.g * alpha) + (colB.g * (256 - alpha))) >> 8; - blendedColor.b = ((colA.b * alpha) + (colB.b * (256 - alpha))) >> 8; - } - - blendedColor.a = 0; - return blendedColor; -} - -FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY) -{ - u16 r = col & 0x001F; - u16 g = (col >> 5) & 0x001F; - u16 b = (col >> 10) & 0x001F; - - r = (r + ((31 - r) * blendEVY / 16)); - g = (g + ((31 - g) * blendEVY / 16)); - b = (b + ((31 - b) * blendEVY / 16)); - - return r | (g << 5) | (b << 10); -} - -template -FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectIncreaseBrightness(const FragmentColor col, const u16 blendEVY) -{ - FragmentColor newColor; - newColor.color = 0; - - u32 r = col.r; - u32 g = col.g; - u32 b = col.b; - - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - newColor.r = (r + ((63 - r) * blendEVY / 16)); - newColor.g = (g + ((63 - g) * blendEVY / 16)); - newColor.b = (b + ((63 - b) * blendEVY / 16)); - } - else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) - { - newColor.r = (r + ((255 - r) * blendEVY / 16)); - newColor.g = (g + 
((255 - g) * blendEVY / 16)); - newColor.b = (b + ((255 - b) * blendEVY / 16)); - } - - return newColor; -} - -FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY) -{ - u16 r = col & 0x001F; - u16 g = (col >> 5) & 0x001F; - u16 b = (col >> 10) & 0x001F; - - r = (r - (r * blendEVY / 16)); - g = (g - (g * blendEVY / 16)); - b = (b - (b * blendEVY / 16)); - - return r | (g << 5) | (b << 10); -} - -FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY) -{ - FragmentColor newColor; - newColor.color = 0; - - u32 r = col.r; - u32 g = col.g; - u32 b = col.b; - - newColor.r = (r - (r * blendEVY / 16)); - newColor.g = (g - (g * blendEVY / 16)); - newColor.b = (b - (b * blendEVY / 16)); - - return newColor; -} - -#ifdef ENABLE_SSE2 - -template -FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY) -{ - if (COLORFORMAT == NDSColorFormat_BGR555_Rev) - { - __m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); - __m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); - __m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); - - r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) ); - g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) ); - b_vec128 = _mm_add_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), b_vec128), blendEVY), 4) ); - - return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); - } - else - { - __m128i rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128()); - __m128i rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128()); - - rgbLo = _mm_add_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbLo), blendEVY), 4) ); - rgbHi = _mm_add_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 
63 : 255), rgbHi), blendEVY), 4) ); - - return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) ); - } -} - -template -FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY) -{ - if (COLORFORMAT == NDSColorFormat_BGR555_Rev) - { - __m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); - __m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); - __m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); - - r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) ); - g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) ); - b_vec128 = _mm_sub_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(b_vec128, blendEVY), 4) ); - - return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); - } - else - { - __m128i rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128()); - __m128i rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128()); - - rgbLo = _mm_sub_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(rgbLo, blendEVY), 4) ); - rgbHi = _mm_sub_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(rgbHi, blendEVY), 4) ); - - return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) ); - } -} - -// Note that if USECONSTANTBLENDVALUESHINT is true, then this method will assume that blendEVA contains identical values -// for each 16-bit vector element, and also that blendEVB contains identical values for each 16-bit vector element. If -// this assumption is broken, then the resulting color will be undefined. -template -FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB) -{ - if (COLORFORMAT == NDSColorFormat_BGR555_Rev) - { - __m128i ra; - __m128i ga; - __m128i ba; - __m128i colorBitMask = _mm_set1_epi16(0x001F); - -#ifdef ENABLE_SSSE3 - ra = _mm_or_si128( _mm_and_si128( colA, colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) ); - ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) ); - ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) ); - - const __m128i blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8)); - ra = _mm_maddubs_epi16(ra, blendAB); - ga = _mm_maddubs_epi16(ga, blendAB); - ba = _mm_maddubs_epi16(ba, blendAB); -#else - ra = _mm_and_si128( colA, colorBitMask); - ga = _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask); - ba = _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask); - - __m128i rb = _mm_and_si128( colB, colorBitMask); - __m128i gb = _mm_and_si128(_mm_srli_epi16(colB, 5), colorBitMask); - __m128i bb = _mm_and_si128(_mm_srli_epi16(colB, 10), colorBitMask); - - ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) ); - ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) ); - ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) ); -#endif - - ra = _mm_srli_epi16(ra, 4); - ga = _mm_srli_epi16(ga, 4); - ba = _mm_srli_epi16(ba, 4); - - ra = _mm_min_epi16(ra, colorBitMask); - ga = _mm_min_epi16(ga, colorBitMask); - ba = _mm_min_epi16(ba, colorBitMask); - - return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) ); - } - 
else - { - __m128i outColorLo; - __m128i outColorHi; - __m128i outColor; - -#ifdef ENABLE_SSSE3 - const __m128i blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8)); - - outColorLo = _mm_unpacklo_epi8(colA, colB); - outColorHi = _mm_unpackhi_epi8(colA, colB); - - if (USECONSTANTBLENDVALUESHINT) - { - outColorLo = _mm_maddubs_epi16(outColorLo, blendAB); - outColorHi = _mm_maddubs_epi16(outColorHi, blendAB); - } - else - { - const __m128i blendABLo = _mm_unpacklo_epi16(blendAB, blendAB); - const __m128i blendABHi = _mm_unpackhi_epi16(blendAB, blendAB); - outColorLo = _mm_maddubs_epi16(outColorLo, blendABLo); - outColorHi = _mm_maddubs_epi16(outColorHi, blendABHi); - } -#else - const __m128i colALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128()); - const __m128i colAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128()); - const __m128i colBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128()); - const __m128i colBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128()); - - if (USECONSTANTBLENDVALUESHINT) - { - outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendEVA), _mm_mullo_epi16(colBLo, blendEVB) ); - outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendEVA), _mm_mullo_epi16(colBHi, blendEVB) ); - } - else - { - const __m128i blendALo = _mm_unpacklo_epi16(blendEVA, blendEVA); - const __m128i blendAHi = _mm_unpackhi_epi16(blendEVA, blendEVA); - const __m128i blendBLo = _mm_unpacklo_epi16(blendEVB, blendEVB); - const __m128i blendBHi = _mm_unpackhi_epi16(blendEVB, blendEVB); - - outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendALo), _mm_mullo_epi16(colBLo, blendBLo) ); - outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendAHi), _mm_mullo_epi16(colBHi, blendBHi) ); - } -#endif - - outColorLo = _mm_srli_epi16(outColorLo, 4); - outColorHi = _mm_srli_epi16(outColorHi, 4); - outColor = _mm_packus_epi16(outColorLo, outColorHi); - - // When the color format is 888, the packuswb instruction will naturally clamp - // the color component values to 255. However, when the color format is 666, the - // color component values must be clamped to 63. In this case, we must call pminub - // to do the clamp. - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63)); - } - - outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF)); - - return outColor; - } -} - -template -FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB) -{ - if (COLORFORMATB == NDSColorFormat_BGR555_Rev) - { - // If the color format of B is 555, then the colA_Hi parameter is required. - // The color format of A is assumed to be RGB666. 
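- // For reference, the scalar model of this branch (see _ColorEffectBlend3D
- // above) is, per component:
- //   out5 = ( a6*(alpha + 1) + (b5 << 1)*(32 - (alpha + 1)) ) >> 6
- // where a6 is a 6-bit component of A, b5 is a 5-bit component of B
- // (rescaled to 6 bits by the shift), and alpha is the 5-bit alpha of A.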
- __m128i ra_lo = _mm_and_si128( colA_Lo, _mm_set1_epi32(0x000000FF) ); - __m128i ga_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 8), _mm_set1_epi32(0x000000FF) ); - __m128i ba_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 16), _mm_set1_epi32(0x000000FF) ); - __m128i aa_lo = _mm_srli_epi32(colA_Lo, 24); - - __m128i ra_hi = _mm_and_si128( colA_Hi, _mm_set1_epi32(0x000000FF) ); - __m128i ga_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 8), _mm_set1_epi32(0x000000FF) ); - __m128i ba_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 16), _mm_set1_epi32(0x000000FF) ); - __m128i aa_hi = _mm_srli_epi32(colA_Hi, 24); - - __m128i ra = _mm_packs_epi32(ra_lo, ra_hi); - __m128i ga = _mm_packs_epi32(ga_lo, ga_hi); - __m128i ba = _mm_packs_epi32(ba_lo, ba_hi); - __m128i aa = _mm_packs_epi32(aa_lo, aa_hi); - -#ifdef ENABLE_SSSE3 - ra = _mm_or_si128( ra, _mm_and_si128(_mm_slli_epi16(colB, 9), _mm_set1_epi16(0x3E00)) ); - ga = _mm_or_si128( ga, _mm_and_si128(_mm_slli_epi16(colB, 4), _mm_set1_epi16(0x3E00)) ); - ba = _mm_or_si128( ba, _mm_and_si128(_mm_srli_epi16(colB, 1), _mm_set1_epi16(0x3E00)) ); - - aa = _mm_adds_epu8(aa, _mm_set1_epi16(1)); - aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(32), aa), 8) ); - - ra = _mm_maddubs_epi16(ra, aa); - ga = _mm_maddubs_epi16(ga, aa); - ba = _mm_maddubs_epi16(ba, aa); -#else - aa = _mm_adds_epu16(aa, _mm_set1_epi16(1)); - __m128i rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) ); - __m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) ); - __m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) ); - __m128i ab = _mm_subs_epu16( _mm_set1_epi16(32), aa ); - - ra = _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) ); - ga = _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) ); - ba = _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) ); -#endif - - ra = _mm_srli_epi16(ra, 6); - ga = _mm_srli_epi16(ga, 6); - ba = _mm_srli_epi16(ba, 6); - - return _mm_or_si128( _mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10) ); - } - else - { - // If the color format of B is 666 or 888, then the colA_Hi parameter is ignored. - // The color format of A is assumed to match the color format of B. - __m128i rgbALo; - __m128i rgbAHi; - -#ifdef ENABLE_SSSE3 - if (COLORFORMATB == NDSColorFormat_BGR666_Rev) - { - // Does not work for RGBA8888 color format. The reason is because this - // algorithm depends on the pmaddubsw instruction, which multiplies - // two unsigned 8-bit integers into an intermediate signed 16-bit - // integer. This means that we can overrun the signed 16-bit value - // range, which would be limited to [-32767 - 32767]. For example, a - // color component of value 255 multiplied by an alpha value of 255 - // would equal 65025, which is greater than the upper range of a signed - // 16-bit value. 
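- // For RGB666, by contrast, the pairwise products stay in range: with
- // components <= 63, alpha <= 32, and invAlpha <= 31, each pmaddubsw sum
- // is at most 63*32 + 63*31 = 3969, well below 32767, which is why the
- // SSSE3 path is gated to the 666 format only.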
- rgbALo = _mm_unpacklo_epi8(colA_Lo, colB); - rgbAHi = _mm_unpackhi_epi8(colA_Lo, colB); - - __m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x0000001F) ); - alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) ); - alpha = _mm_adds_epu8(alpha, _mm_set1_epi8(1)); - - __m128i invAlpha = _mm_subs_epu8(_mm_set1_epi8(32), alpha); - __m128i alphaLo = _mm_unpacklo_epi8(alpha, invAlpha); - __m128i alphaHi = _mm_unpackhi_epi8(alpha, invAlpha); - - rgbALo = _mm_maddubs_epi16(rgbALo, alphaLo); - rgbAHi = _mm_maddubs_epi16(rgbAHi, alphaHi); - } - else -#endif - { - rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128()); - rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128()); - __m128i rgbBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128()); - __m128i rgbBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128()); - - __m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x000000FF) ); - alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) ); - - __m128i alphaLo = _mm_unpacklo_epi8(alpha, _mm_setzero_si128()); - __m128i alphaHi = _mm_unpackhi_epi8(alpha, _mm_setzero_si128()); - alphaLo = _mm_add_epi16(alphaLo, _mm_set1_epi16(1)); - alphaHi = _mm_add_epi16(alphaHi, _mm_set1_epi16(1)); - - if (COLORFORMATB == NDSColorFormat_BGR666_Rev) - { - rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) ); - rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) ); - } - else if (COLORFORMATB == NDSColorFormat_BGR888_Rev) - { - rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) ); - rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) ); - } - } - - if (COLORFORMATB == NDSColorFormat_BGR666_Rev) - { - rgbALo = _mm_srli_epi16(rgbALo, 5); - rgbAHi = _mm_srli_epi16(rgbAHi, 5); - } - else if (COLORFORMATB == NDSColorFormat_BGR888_Rev) - { - rgbALo = _mm_srli_epi16(rgbALo, 8); - rgbAHi = _mm_srli_epi16(rgbAHi, 8); - } - - return _mm_and_si128( _mm_packus_epi16(rgbALo, rgbAHi), _mm_set1_epi32(0x00FFFFFF) ); - } -} - -#endif - void GPUEngineBase::ParseReg_MASTER_BRIGHT() { const IOREG_MASTER_BRIGHT &MASTER_BRIGHT = this->_IORegisterMap->MASTER_BRIGHT; @@ -2830,1111 +1070,6 @@ void GPUEngineBase::_TransitionLineNativeToCustom(GPUEngineCompositorInfo &compI this->_nativeLineRenderCount--; } -/*****************************************************************************/ -// PIXEL RENDERING -/*****************************************************************************/ -template -FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = srcColor16 | 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); - break; - } - - if (!ISDEBUGRENDER) - { - dstLayerID = compInfo.renderState.selectedLayerID; - } -} - -template -FORCEINLINE void 
GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = ColorspaceConvert6665To5551(srcColor32); - dstColor16 = dstColor16 | 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32 = srcColor32; - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32 = srcColor32; - dstColor32.a = 0xFF; - break; - - default: - return; - } - - if (!ISDEBUGRENDER) - { - dstLayerID = compInfo.renderState.selectedLayerID; - } -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF] | 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF]; - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF]; - dstColor32.a = 0xFF; - break; - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); - dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; - dstColor16 = dstColor16 | 0x8000; - } - else - { - dstColor32 = this->_ColorEffectIncreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 
0xFF : 0x1F; - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF] | 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF]; - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF]; - dstColor32.a = 0xFF; - break; - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); - dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; - dstColor16 = dstColor16 | 0x8000; - } - else - { - dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - TBlendTable *selectedBlendTable = compInfo.renderState.blendTable555; - u8 blendEVA = compInfo.renderState.blendEVA; - u8 blendEVB = compInfo.renderState.blendEVB; - - const bool dstTargetBlendEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; - bool forceDstTargetBlend = false; - - if (LAYERTYPE == GPULayerType_OBJ) - { - //translucent-capable OBJ are forcing the function to blend when the second target is satisfied - const bool isObjTranslucentType = (spriteMode == OBJMode_Transparent) || (spriteMode == OBJMode_Bitmap); - if (isObjTranslucentType && dstTargetBlendEnable) - { - // OBJ without fine-grained alpha are using EVA/EVB for blending. This is signified by receiving 0xFF in the alpha. - // Test cases: - // * The spriteblend demo - // * Glory of Heracles - fairy on the title screen - // * Phoenix Wright: Ace Attorney - character fade-in/fade-out - if (spriteAlpha != 0xFF) - { - blendEVA = spriteAlpha; - blendEVB = 16 - spriteAlpha; - selectedBlendTable = &GPUEngineBase::_blendTable555[blendEVA][blendEVB]; - } - - forceDstTargetBlend = true; - } - } - - ColorEffect selectedEffect = ColorEffect_Disable; - - if (forceDstTargetBlend) - { - selectedEffect = ColorEffect_Blend; - } - else - { - // If we're not forcing blending, then select the color effect based on the BLDCNT target flags. 
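- // The same selection logic is repeated in the FragmentColor overload of
- // _PixelUnknownEffect below and, as per-pixel lane masks, in
- // _PixelUnknownEffectWithMask16_SSE2.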
- if (enableColorEffect && compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]) - { - switch (compInfo.renderState.colorEffect) - { - // For the Blend effect, both first and second target flags must be checked. - case ColorEffect_Blend: - { - if (dstTargetBlendEnable) selectedEffect = compInfo.renderState.colorEffect; - break; - } - - // For the Increase/Decrease Brightness effects, only the first target flag needs to be checked. - // Test case: Bomberman Land Touch! dialog boxes will render too dark without this check. - case ColorEffect_IncreaseBrightness: - case ColorEffect_DecreaseBrightness: - selectedEffect = compInfo.renderState.colorEffect; - break; - - default: - break; - } - } - } - - // Render the pixel using the selected color effect. - switch (selectedEffect) - { - case ColorEffect_Disable: - { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = srcColor16; - dstColor16 |= 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); - break; - } - break; - } - - case ColorEffect_IncreaseBrightness: - { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; - dstColor16 |= 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF]; - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF]; - dstColor32.a = 0xFF; - break; - } - break; - } - - case ColorEffect_DecreaseBrightness: - { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; - dstColor16 |= 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF]; - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF]; - dstColor32.a = 0xFF; - break; - } - break; - } - - case ColorEffect_Blend: - { - FragmentColor srcColor32; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - dstColor16 = this->_ColorEffectBlend(srcColor16, dstColor16, selectedBlendTable); - dstColor16 |= 0x8000; - break; - - case NDSColorFormat_BGR666_Rev: - srcColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); - dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); - dstColor32.a = 0x1F; - break; - - case NDSColorFormat_BGR888_Rev: - srcColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); - dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); - dstColor32.a = 0xFF; - break; - } - break; - } - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - u8 blendEVA = compInfo.renderState.blendEVA; - u8 blendEVB = compInfo.renderState.blendEVB; - - const bool dstTargetBlendEnable = (dstLayerID != 
compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; - - // 3D rendering has a special override: If the destination pixel is set to blend, then always blend. - // Test case: When starting a stage in Super Princess Peach, the screen will be solid black unless - // blending is forced here. - // - // This behavior must take priority over checking for the window color effect enable flag. - // Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless - // blend forcing takes priority. - bool forceDstTargetBlend = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnable : false; - - if (LAYERTYPE == GPULayerType_OBJ) - { - //translucent-capable OBJ are forcing the function to blend when the second target is satisfied - const bool isObjTranslucentType = (spriteMode == OBJMode_Transparent) || (spriteMode == OBJMode_Bitmap); - if (isObjTranslucentType && dstTargetBlendEnable) - { - // OBJ without fine-grained alpha are using EVA/EVB for blending. This is signified by receiving 0xFF in the alpha. - // Test cases: - // * The spriteblend demo - // * Glory of Heracles - fairy on the title screen - // * Phoenix Wright: Ace Attorney - character fade-in/fade-out - if (spriteAlpha != 0xFF) - { - blendEVA = spriteAlpha; - blendEVB = 16 - spriteAlpha; - } - - forceDstTargetBlend = true; - } - } - - ColorEffect selectedEffect = ColorEffect_Disable; - - if (forceDstTargetBlend) - { - selectedEffect = ColorEffect_Blend; - } - else - { - // If we're not forcing blending, then select the color effect based on the BLDCNT target flags. - if (enableColorEffect && compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]) - { - switch (compInfo.renderState.colorEffect) - { - // For the Blend effect, both first and second target flags must be checked. - case ColorEffect_Blend: - { - if (dstTargetBlendEnable) selectedEffect = compInfo.renderState.colorEffect; - break; - } - - // For the Increase/Decrease Brightness effects, only the first target flag needs to be checked. - // Test case: Bomberman Land Touch! dialog boxes will render too dark without this check. - case ColorEffect_IncreaseBrightness: - case ColorEffect_DecreaseBrightness: - selectedEffect = compInfo.renderState.colorEffect; - break; - - default: - break; - } - } - } - - // Render the pixel using the selected color effect. - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); - - switch (selectedEffect) - { - case ColorEffect_Disable: - dstColor16 = srcColor16; - break; - - case ColorEffect_IncreaseBrightness: - dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; - break; - - case ColorEffect_DecreaseBrightness: - dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; - break; - - case ColorEffect_Blend: - dstColor16 = this->_ColorEffectBlend3D(srcColor32, dstColor16); - break; - } - - dstColor16 |= 0x8000; - } - else - { - switch (selectedEffect) - { - case ColorEffect_Disable: - dstColor32 = srcColor32; - break; - - case ColorEffect_IncreaseBrightness: - dstColor32 = this->_ColorEffectIncreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_DecreaseBrightness: - dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_Blend: - dstColor32 = (LAYERTYPE == GPULayerType_3D) ? 
this->_ColorEffectBlend3D(srcColor32, dstColor32) : this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); - break; - } - - dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; - } - - dstLayerID = compInfo.renderState.selectedLayerID; -} - -template -FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) -{ - switch (COMPOSITORMODE) - { - case GPUCompositorMode_Debug: - this->_PixelCopy(compInfo, srcColor16); - break; - - case GPUCompositorMode_Copy: - this->_PixelCopy(compInfo, srcColor16); - break; - - case GPUCompositorMode_BrightUp: - this->_PixelBrightnessUp(compInfo, srcColor16); - break; - - case GPUCompositorMode_BrightDown: - this->_PixelBrightnessDown(compInfo, srcColor16); - break; - - default: - this->_PixelUnknownEffect(compInfo, srcColor16, enableColorEffect, spriteAlpha, (OBJMode)spriteMode); - break; - } -} - -template -FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) -{ - switch (COMPOSITORMODE) - { - case GPUCompositorMode_Debug: - this->_PixelCopy(compInfo, srcColor32); - break; - - case GPUCompositorMode_Copy: - this->_PixelCopy(compInfo, srcColor32); - break; - - case GPUCompositorMode_BrightUp: - this->_PixelBrightnessUp(compInfo, srcColor32); - break; - - case GPUCompositorMode_BrightDown: - this->_PixelBrightnessDown(compInfo, srcColor32); - break; - - default: - this->_PixelUnknownEffect(compInfo, srcColor32, enableColorEffect, spriteAlpha, (OBJMode)spriteMode); - break; - } -} - -#ifdef ENABLE_SSE2 - -template -FORCEINLINE void GPUEngineBase::_PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_or_si128(src0, alphaBits); - dst1 = _mm_or_si128(src1, alphaBits); - } - else - { - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - dst0 = _mm_or_si128(src0, alphaBits); - dst1 = _mm_or_si128(src1, alphaBits); - dst2 = _mm_or_si128(src2, alphaBits); - dst3 = _mm_or_si128(src3, alphaBits); - } - - if (!ISDEBUGRENDER) - { - dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID); - } -} - -template -FORCEINLINE void GPUEngineBase::_PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), - _mm_unpackhi_epi8(passMask8, passMask8) }; - - // Do the masked copy. 
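- // passMask8 carries one 0x00/0xFF byte per pixel. The unpacks above (and
- // the 32-bit widening below) duplicate each byte across its pixel's full
- // 16-bit or 32-bit lane so that _mm_blendv_epi8 selects whole pixels.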
- if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alphaBits), passMask16[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alphaBits), passMask16[1]); - } - else - { - const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]), - _mm_unpackhi_epi16(passMask16[0], passMask16[0]), - _mm_unpacklo_epi16(passMask16[1], passMask16[1]), - _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; - - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alphaBits), passMask32[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alphaBits), passMask32[1]); - dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(src2, alphaBits), passMask32[2]); - dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(src3, alphaBits), passMask32[3]); - } - - if (!ISDEBUGRENDER) - { - const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID); - dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8); - } -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessUp16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src0, evy_vec128), alphaBits); - dst1 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src1, evy_vec128), alphaBits); - } - else - { - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000); - dst0 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src0, evy_vec128), alphaBits); - dst1 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src1, evy_vec128), alphaBits); - dst2 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src2, evy_vec128), alphaBits); - dst3 = _mm_or_si128(this->_ColorEffectIncreaseBrightness(src3, evy_vec128), alphaBits); - } - - dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID); -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), - _mm_unpackhi_epi8(passMask8, passMask8) }; - - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src0, evy_vec128), alphaBits), passMask16[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src1, evy_vec128), alphaBits), passMask16[1]); - } - else - { - const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]), - _mm_unpackhi_epi16(passMask16[0], passMask16[0]), - _mm_unpacklo_epi16(passMask16[1], passMask16[1]), - _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; - - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src0, evy_vec128), alphaBits), passMask32[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src1, evy_vec128), alphaBits), passMask32[1]); - dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src2, evy_vec128), alphaBits), passMask32[2]); - dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectIncreaseBrightness(src3, evy_vec128), alphaBits), passMask32[3]); - } - - const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID); - dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8); -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src0, evy_vec128), alphaBits); - dst1 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src1, evy_vec128), alphaBits); - } - else - { - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000); - dst0 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src0, evy_vec128), alphaBits); - dst1 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src1, evy_vec128), alphaBits); - dst2 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src2, evy_vec128), alphaBits); - dst3 = _mm_or_si128(this->_ColorEffectDecreaseBrightness(src3, evy_vec128), alphaBits); - } - - dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID); -} - -template -FORCEINLINE void GPUEngineBase::_PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), - _mm_unpackhi_epi8(passMask8, passMask8) }; - - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i alphaBits = _mm_set1_epi16(0x8000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src0, evy_vec128), alphaBits), passMask16[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src1, evy_vec128), alphaBits), passMask16[1]); - } - else - { - const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]), - _mm_unpackhi_epi16(passMask16[0], passMask16[0]), - _mm_unpacklo_epi16(passMask16[1], passMask16[1]), - _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; - - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src0, evy_vec128), alphaBits), passMask32[0]); - dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src1, evy_vec128), alphaBits), passMask32[1]); - dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src2, evy_vec128), alphaBits), passMask32[2]); - dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectDecreaseBrightness(src3, evy_vec128), alphaBits), passMask32[3]); - } - - const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID); - dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8); -} - -template -FORCEINLINE void GPUEngineBase::_PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &srcEffectEnableMask, - const __m128i &enableColorEffectMask, - const __m128i &spriteAlpha, - const __m128i &spriteMode, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID); - const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), - _mm_unpackhi_epi8(passMask8, passMask8) }; - - const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]), - _mm_unpackhi_epi16(passMask16[0], passMask16[0]), - _mm_unpacklo_epi16(passMask16[1], passMask16[1]), - _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; - - __m128i dstTargetBlendEnableMask; - -#ifdef ENABLE_SSSE3 - dstTargetBlendEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID); - dstTargetBlendEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstTargetBlendEnableMask, 
_mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) ); -#else - dstTargetBlendEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]); - dstTargetBlendEnableMask = _mm_or_si128(dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) ); - dstTargetBlendEnableMask = _mm_or_si128(dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) ); - dstTargetBlendEnableMask = _mm_or_si128(dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) ); - dstTargetBlendEnableMask = _mm_or_si128(dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) ); - dstTargetBlendEnableMask = _mm_or_si128(dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) ); -#endif - - dstTargetBlendEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID_vec128), dstTargetBlendEnableMask ); - - // Select the color effect based on the BLDCNT target flags. - const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - __m128i forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm_setzero_si128(); - - // Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers. - // Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then - // convert EVA and EVB into vectors of uint16 right before we use them. - __m128i eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVA) : _mm_set1_epi16(compInfo.renderState.blendEVA); - __m128i evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVB) : _mm_set1_epi16(compInfo.renderState.blendEVB); - - if (LAYERTYPE == GPULayerType_OBJ) - { - const __m128i isObjTranslucentMask = _mm_and_si128( dstTargetBlendEnableMask, _mm_or_si128(_mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Bitmap))) ); - forceDstTargetBlendMask = isObjTranslucentMask; - - const __m128i spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask); - eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask); - evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask); - } - - __m128i tmpSrc[4]; - - if ( (LAYERTYPE == GPULayerType_3D) && (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ) - { - // 3D layer blending requires that all src colors are preserved as 32-bit values. - // Since dst2 and dst3 are currently unused for RGB555 output, we used these variables - // to store the converted 16-bit src colors in a previous step. 
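- // (That previous step is in _PixelComposite16_SSE2 below, which packs the
- // RGB666 sources down to RGB555 into dst[2]/dst[3] before dispatching here.)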
- tmpSrc[0] = dst2; - tmpSrc[1] = dst3; - } - else - { - tmpSrc[0] = src0; - tmpSrc[1] = src1; - tmpSrc[2] = src2; - tmpSrc[3] = src3; - } - - switch (compInfo.renderState.colorEffect) - { - case ColorEffect_IncreaseBrightness: - { - const __m128i brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) ); - const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)}; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask16[1] ); - } - else - { - const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) }; - - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectIncreaseBrightness(tmpSrc[2], evy_vec128), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectIncreaseBrightness(tmpSrc[3], evy_vec128), brightnessMask32[3] ); - } - break; - } - - case ColorEffect_DecreaseBrightness: - { - const __m128i brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) ); - const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)}; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask16[1] ); - } - else - { - const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) }; - - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectDecreaseBrightness(tmpSrc[2], evy_vec128), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectDecreaseBrightness(tmpSrc[3], evy_vec128), brightnessMask32[3] ); - } - break; - } - - default: - break; - } - - // Render the pixel using the selected color effect. 
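- // Brightness effects were already applied to tmpSrc in the switch above,
- // so only blending remains to be selected here:
- //   blendMask8 = forceDstTargetBlend | (srcEffectEnable & dstBlendEnable
- //                                       & (colorEffect == Blend))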
- const __m128i blendMask8 = _mm_or_si128( forceDstTargetBlendMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstTargetBlendEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) ); - const __m128i blendMask16[2] = {_mm_unpacklo_epi8(blendMask8, blendMask8), _mm_unpackhi_epi8(blendMask8, blendMask8)}; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - __m128i blendSrc16[2]; - - switch (LAYERTYPE) - { - case GPULayerType_3D: - blendSrc16[0] = this->_ColorEffectBlend3D(src0, src1, dst0); - blendSrc16[1] = this->_ColorEffectBlend3D(src2, src3, dst1); - break; - - case GPULayerType_BG: - blendSrc16[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128); - blendSrc16[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: - { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - const __m128i tempEVA[2] = { - _mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128()) - }; - const __m128i tempEVB[2] = { - _mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128()) - }; - - blendSrc16[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, tempEVA[0], tempEVB[0]); - blendSrc16[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, tempEVA[1], tempEVB[1]); - break; - } - } - - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); - - // Combine the final colors. - tmpSrc[0] = _mm_or_si128(tmpSrc[0], _mm_set1_epi16(0x8000)); - tmpSrc[1] = _mm_or_si128(tmpSrc[1], _mm_set1_epi16(0x8000)); - - dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask16[0]); - dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask16[1]); - } - else - { - __m128i blendSrc32[4]; - - switch (LAYERTYPE) - { - case GPULayerType_3D: - blendSrc32[0] = this->_ColorEffectBlend3D(src0, src0, dst0); - blendSrc32[1] = this->_ColorEffectBlend3D(src1, src1, dst1); - blendSrc32[2] = this->_ColorEffectBlend3D(src2, src2, dst2); - blendSrc32[3] = this->_ColorEffectBlend3D(src3, src3, dst3); - break; - - case GPULayerType_BG: - blendSrc32[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128); - blendSrc32[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128); - blendSrc32[2] = this->_ColorEffectBlend(tmpSrc[2], dst2, eva_vec128, evb_vec128); - blendSrc32[3] = this->_ColorEffectBlend(tmpSrc[3], dst3, eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: - { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - // - // Note that we are sending only 4 colors for each _ColorEffectBlend() call, and so we are only - // going to send the 4 correspending EVA/EVB vectors as well. In this case, each individual - // EVA/EVB value is mirrored for each adjacent 16-bit boundary. 
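- // For example, eva bytes [a0 a1 a2 a3 ...] become [a0 a0 a1 a1 ...] after
- // _mm_unpacklo_epi8(eva, eva); the second unpack against zero then widens
- // the mirrored bytes into the four u16 vectors (tempEVA[0..3]) consumed by
- // the four blend calls below.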
- __m128i tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128); - __m128i tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128); - - const __m128i tempEVA[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128); - tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128); - - const __m128i tempEVB[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - blendSrc32[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, tempEVA[0], tempEVB[0]); - blendSrc32[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, tempEVA[1], tempEVB[1]); - blendSrc32[2] = this->_ColorEffectBlend(tmpSrc[2], dst2, tempEVA[2], tempEVB[2]); - blendSrc32[3] = this->_ColorEffectBlend(tmpSrc[3], dst3, tempEVA[3], tempEVB[3]); - break; - } - } - - const __m128i blendMask32[4] = { _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), - _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), - _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), - _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) }; - - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); - tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); - tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); - - tmpSrc[0] = _mm_or_si128(tmpSrc[0], alphaBits); - tmpSrc[1] = _mm_or_si128(tmpSrc[1], alphaBits); - tmpSrc[2] = _mm_or_si128(tmpSrc[2], alphaBits); - tmpSrc[3] = _mm_or_si128(tmpSrc[3], alphaBits); - - dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask32[0]); - dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask32[1]); - dst2 = _mm_blendv_epi8(dst2, tmpSrc[2], passMask32[2]); - dst3 = _mm_blendv_epi8(dst3, tmpSrc[3], passMask32[3]); - } - - dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8); -} - -template -FORCEINLINE void GPUEngineBase::_PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, - const bool didAllPixelsPass, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &srcEffectEnableMask, - const u8 *__restrict enableColorEffectPtr, - const u8 *__restrict sprAlphaPtr, - const u8 *__restrict sprModePtr) -{ - const bool is555and3D = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) && (LAYERTYPE == GPULayerType_3D); - __m128i dst[4]; - __m128i dstLayerID_vec128; - - if (is555and3D) - { - // 3D layer blending requires that all src colors are preserved as 32-bit values. - // Since dst2 and dst3 are currently unused for RGB555 output, we using these variables - // to store the converted 16-bit src colors. 
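- // The 0x3E masks keep the top 5 bits of each 6-bit component; the shifts
- // then repack them as RGB555: red -> bits 0-4, green -> bits 5-9,
- // blue -> bits 10-14.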
- dst[2] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x003E0000)), 7)), - _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x003E0000)), 7)) ); - dst[3] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x003E0000)), 7)), - _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x003E0000)), 7)) ); - } - - if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass) - { - switch (COMPOSITORMODE) - { - case GPUCompositorMode_Debug: - this->_PixelCopy16_SSE2(compInfo, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_Copy: - this->_PixelCopy16_SSE2(compInfo, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_BrightUp: - this->_PixelBrightnessUp16_SSE2(compInfo, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_BrightDown: - this->_PixelBrightnessDown16_SSE2(compInfo, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - default: - break; - } - } - else - { - // Read the destination pixels into registers if we're doing a masked pixel write. - dst[0] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 0); - dst[1] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 1); - - if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev) - { - dst[2] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 2); - dst[3] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 3); - } - - dstLayerID_vec128 = _mm_load_si128((__m128i *)compInfo.target.lineLayerID); - - switch (COMPOSITORMODE) - { - case GPUCompositorMode_Debug: - this->_PixelCopyWithMask16_SSE2(compInfo, - passMask8, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_Copy: - this->_PixelCopyWithMask16_SSE2(compInfo, - passMask8, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_BrightUp: - this->_PixelBrightnessUpWithMask16_SSE2(compInfo, - passMask8, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - case GPUCompositorMode_BrightDown: - this->_PixelBrightnessDownWithMask16_SSE2(compInfo, - passMask8, - src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? 
src0 : dst[2], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - - default: - { - const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_cmpeq_epi8( _mm_load_si128((__m128i *)enableColorEffectPtr), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF); - const __m128i spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((__m128i *)sprAlphaPtr) : _mm_setzero_si128(); - const __m128i spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((__m128i *)sprModePtr) : _mm_setzero_si128(); - - this->_PixelUnknownEffectWithMask16_SSE2(compInfo, - passMask8, - src3, src2, src1, src0, - srcEffectEnableMask, - enableColorEffectMask, - spriteAlpha, - spriteMode, - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); - break; - } - } - } - - _mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]); - _mm_store_si128((__m128i *)*compInfo.target.lineColor + 1, dst[1]); - - if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev) - { - _mm_store_si128((__m128i *)*compInfo.target.lineColor + 2, dst[2]); - _mm_store_si128((__m128i *)*compInfo.target.lineColor + 3, dst[3]); - } - - _mm_store_si128((__m128i *)compInfo.target.lineLayerID, dstLayerID_vec128); -} - -#endif - //this is fantastically inaccurate. //we do the early return even though it reduces the resulting accuracy //because we need the speed, and because it is inaccurate anyway @@ -3953,9 +1088,9 @@ void GPUEngineBase::_MosaicSpriteLinePixel(GPUEngineCompositorInfo &compInfo, co const size_t y = compInfo.line.indexNative; - if (!compInfo.renderState.mosaicWidthOBJ[x].begin || !compInfo.renderState.mosaicHeightOBJ[y].begin) + if (!compInfo.renderState.mosaicWidthOBJ->begin[x] || !compInfo.renderState.mosaicHeightOBJ->begin[y]) { - objColor = this->_mosaicColors.obj[compInfo.renderState.mosaicWidthOBJ[x].trunc]; + objColor = this->_mosaicColors.obj[compInfo.renderState.mosaicWidthOBJ->trunc[x]]; } this->_mosaicColors.obj[x] = objColor; @@ -3982,8 +1117,6 @@ templatesize.width : GPU_FRAMEBUFFER_NATIVE_WIDTH; - const s16 dx = (s16)LOCAL_TO_LE_16(param.BGnPA.value); - const s16 dy = (s16)LOCAL_TO_LE_16(param.BGnPC.value); const s32 wh = compInfo.renderState.selectedBGLayer->size.width; const s32 ht = compInfo.renderState.selectedBGLayer->size.height; const s32 wmask = wh - 1; @@ -4011,7 +1144,7 @@ void GPUEngineBase::_RenderPixelIterate_Final(GPUEngineCompositorInfo &compInfo, // as an optimization, specially handle the fairly common case of // "unrotated + unscaled + no boundary checking required" - if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0) + if ( (param.BGnPA.Integer == 1) && (param.BGnPA.Fraction == 0) && (param.BGnPC.value == 0) ) { s32 auxX = (WRAP) ? (x.Integer & wmask) : x.Integer; const s32 auxY = (WRAP) ? (y.Integer & hmask) : y.Integer; @@ -4044,6 +1177,9 @@ void GPUEngineBase::_RenderPixelIterate_Final(GPUEngineCompositorInfo &compInfo, } } + const s16 dx = (s16)LOCAL_TO_LE_16(param.BGnPA.value); + const s16 dy = (s16)LOCAL_TO_LE_16(param.BGnPC.value); + for (size_t i = 0; i < lineWidth; i++, x.value+=dx, y.value+=dy) { const s32 auxX = (WRAP) ? 
(x.Integer & wmask) : x.Integer; @@ -4098,32 +1234,34 @@ TILEENTRY GPUEngineBase::_GetTileEntry(const u32 tileMapAddress, const u16 xOffs } template -FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool opaque) +FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool isOpaque) { if (MOSAIC) { //due to this early out, we will get incorrect behavior in cases where //we enable mosaic in the middle of a frame. this is deemed unlikely. - if (compInfo.renderState.mosaicWidthBG[srcX].begin && compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin) + u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID]; + + if (compInfo.renderState.mosaicHeightBG->begin[compInfo.line.indexNative] && compInfo.renderState.mosaicWidthBG->begin[srcX]) { - srcColor16 = (!opaque) ? 0xFFFF : (srcColor16 & 0x7FFF); - this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][srcX] = srcColor16; + srcColor16 = (isOpaque) ? (srcColor16 & 0x7FFF) : 0xFFFF; + mosaicColorBG[srcX] = srcColor16; } else { - srcColor16 = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][compInfo.renderState.mosaicWidthBG[srcX].trunc]; + srcColor16 = mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc[srcX]]; } - opaque = (srcColor16 != 0xFFFF); + isOpaque = (srcColor16 != 0xFFFF); } - if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) + if (!isOpaque) { return; } - if (!opaque) + if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) { return; } @@ -4135,7 +1273,7 @@ FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative + srcX; const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, srcColor16, enableColorEffect, 0, 0); + pixelop.Composite16(compInfo, srcColor16, enableColorEffect, 0, 0); } template @@ -4143,157 +1281,33 @@ void GPUEngineBase::_PrecompositeNativeToCustomLineBG(GPUEngineCompositorInfo &c { if (MOSAIC) { -#ifdef ENABLE_SSE2 - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=8) + if (compInfo.renderState.mosaicHeightBG->begin[compInfo.line.indexNative]) { - __m128i mosaicColor_vec128; - u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID]; - - const __m128i index_vec128 = _mm_loadl_epi64((__m128i *)(this->_deferredIndexNative + x)); - const __m128i col_vec128 = _mm_load_si128((__m128i *)(this->_deferredColorNative + x)); - - if (compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin) - { - const __m128i mosaicSetColorMask = _mm_cmpgt_epi16( _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_loadu_si128((__m128i *)(compInfo.renderState.mosaicWidthBG + x))), _mm_setzero_si128() ); - const __m128i idxMask = _mm_cmpeq_epi16(_mm_unpacklo_epi8(index_vec128, _mm_setzero_si128()), _mm_setzero_si128()); - mosaicColor_vec128 = _mm_blendv_epi8(_mm_and_si128(col_vec128, _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask); - - _mm_storeu_si128( (__m128i *)(mosaicColorBG + x), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(mosaicColorBG + x)), mosaicColor_vec128, mosaicSetColorMask) ); - } - - mosaicColor_vec128 = _mm_set_epi16(mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+7].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+6].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+5].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+4].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+3].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+2].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+1].trunc], - mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+0].trunc]); - - const __m128i writeColorMask = _mm_cmpeq_epi16(mosaicColor_vec128, _mm_set1_epi16(0xFFFF)); - _mm_storel_epi64( (__m128i *)(this->_deferredIndexNative + x), _mm_andnot_si128(_mm_packs_epi16(writeColorMask, _mm_setzero_si128()), index_vec128) ); - _mm_store_si128( (__m128i *)(this->_deferredColorNative + x), _mm_blendv_epi8(mosaicColor_vec128, col_vec128, writeColorMask) ); + this->_MosaicLine(compInfo); } -#else - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + else { - u16 mosaicColor; - - if (compInfo.renderState.mosaicWidthBG[x].begin && compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin) - { - mosaicColor = (this->_deferredIndexNative[x] == 0) ? 
0xFFFF : this->_deferredColorNative[x] & 0x7FFF; - this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][x] = mosaicColor; - } - else - { - mosaicColor = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][compInfo.renderState.mosaicWidthBG[x].trunc]; - } - - if (mosaicColor == 0xFFFF) - { - this->_deferredIndexNative[x] = 0; - } - else - { - this->_deferredColorNative[x] = mosaicColor; - } + this->_MosaicLine(compInfo); } -#endif } CopyLineExpand<0xFFFF, false, false, 2>(this->_deferredColorCustom, this->_deferredColorNative, compInfo.line.widthCustom, 1); CopyLineExpand<0xFFFF, false, false, 1>(this->_deferredIndexCustom, this->_deferredIndexNative, compInfo.line.widthCustom, 1); } -template +template void GPUEngineBase::_CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32) { - const bool isUsingSrc32 = (srcColorNative32 != NULL); - compInfo.target.xNative = 0; compInfo.target.xCustom = 0; compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead; compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead; compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead; -#ifdef ENABLE_SSE2 - const __m128i srcEffectEnableMask = compInfo.renderState.srcEffectEnable_SSE2[GPULayerID_OBJ]; - - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=16, srcColorNative16+=16, srcColorNative32+=16, compInfo.target.xNative+=16, compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) - { - __m128i passMask8; - int passMaskValue; - bool didAllPixelsPass; - - if (WILLPERFORMWINDOWTEST) - { - // Do the window test. - passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestNative[GPULayerID_OBJ] + i)), _mm_set1_epi8(1) ); - - // If none of the pixels within the vector pass, then reject them all at once. - passMaskValue = _mm_movemask_epi8(passMask8); - if (passMaskValue == 0) - { - continue; - } - - didAllPixelsPass = (passMaskValue == 0xFFFF); - } - else - { - passMask8 = _mm_set1_epi8(0xFF); - passMaskValue = 0xFFFF; - didAllPixelsPass = true; - } - - __m128i src[4]; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - src[0] = _mm_load_si128((__m128i *)srcColorNative16 + 0); - src[1] = _mm_load_si128((__m128i *)srcColorNative16 + 1); - } - else - { - if (isUsingSrc32) - { - src[0] = _mm_load_si128((__m128i *)srcColorNative32 + 0); - src[1] = _mm_load_si128((__m128i *)srcColorNative32 + 1); - src[2] = _mm_load_si128((__m128i *)srcColorNative32 + 2); - src[3] = _mm_load_si128((__m128i *)srcColorNative32 + 3); - } - else - { - const __m128i src16[2] = { - _mm_load_si128((__m128i *)srcColorNative16 + 0), - _mm_load_si128((__m128i *)srcColorNative16 + 1) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) - { - ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); - ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); - ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); - } - } - } - - // Write out the pixels. 
- this->_PixelComposite16_SSE2(compInfo, - didAllPixelsPass, - passMask8, - src[3], src[2], src[1], src[0], - srcEffectEnableMask, - this->_enableColorEffectNative[GPULayerID_OBJ] + i, - this->_sprAlpha[compInfo.line.indexNative] + i, - this->_sprType[compInfo.line.indexNative] + i); - } +#ifdef USEMANUALVECTORIZATION + this->_CompositeNativeLineOBJ_LoopOp(compInfo, srcColorNative16, srcColorNative32); #else - if (isUsingSrc32) + if (srcColorNative32 != NULL) { for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++, srcColorNative32++, compInfo.target.xNative++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) { @@ -4303,7 +1317,7 @@ void GPUEngineBase::_CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, c } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[GPULayerID_OBJ][i] != 0) : true; - this->_PixelComposite(compInfo, *srcColorNative32, enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][i], this->_sprType[compInfo.line.indexNative][i]); + pixelop.Composite32(compInfo, *srcColorNative32, enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][i], this->_sprType[compInfo.line.indexNative][i]); } } else @@ -4316,13 +1330,13 @@ void GPUEngineBase::_CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, c } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[GPULayerID_OBJ][i] != 0) : true; - this->_PixelComposite(compInfo, *srcColorNative16, enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][i], this->_sprType[compInfo.line.indexNative][i]); + pixelop.Composite16(compInfo, *srcColorNative16, enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][i], this->_sprType[compInfo.line.indexNative][i]); } } #endif } -template +template void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) { compInfo.target.xNative = 0; @@ -4333,99 +1347,8 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, co size_t i = 0; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % 16)); - const __m128i srcEffectEnableMask = compInfo.renderState.srcEffectEnable_SSE2[compInfo.renderState.selectedLayerID]; - - for (; i < ssePixCount; i+=16, compInfo.target.xCustom+=16, compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) - { - if (compInfo.target.xCustom >= compInfo.line.widthCustom) - { - compInfo.target.xCustom -= compInfo.line.widthCustom; - } - - __m128i passMask8; - int passMaskValue; - bool didAllPixelsPass; - - if (WILLPERFORMWINDOWTEST || (LAYERTYPE == GPULayerType_BG)) - { - if (WILLPERFORMWINDOWTEST) - { - // Do the window test. - passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ); - } - - if (LAYERTYPE == GPULayerType_BG) - { - const __m128i tempPassMask = _mm_cmpeq_epi8(_mm_load_si128((__m128i *)(srcIndexCustom + compInfo.target.xCustom)), _mm_setzero_si128()); - - // Do the index test. Pixels with an index value of 0 are rejected. - if (WILLPERFORMWINDOWTEST) - { - passMask8 = _mm_andnot_si128(tempPassMask, passMask8); - } - else - { - passMask8 = _mm_xor_si128(tempPassMask, _mm_set1_epi32(0xFFFFFFFF)); - } - } - - // If none of the pixels within the vector pass, then reject them all at once. 
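-
- // _mm_movemask_epi8 gathers the most significant bit of each of the 16 bytes
- // into one integer: 0x0000 means every pixel failed (skip this vector
- // entirely), 0xFFFF means every pixel passed, and anything in between falls
- // through to the masked write path.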
- passMaskValue = _mm_movemask_epi8(passMask8); - if (passMaskValue == 0) - { - continue; - } - - didAllPixelsPass = (passMaskValue == 0xFFFF); - } - else - { - passMask8 = _mm_set1_epi8(0xFF); - passMaskValue = 0xFFFF; - didAllPixelsPass = true; - } - - __m128i src[4]; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - src[0] = _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 0)); - src[1] = _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 8)); - } - else - { - const __m128i src16[2] = { - _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 0)), - _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 8)) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) - { - ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); - ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); - ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); - } - } - - // Write out the pixels. - this->_PixelComposite16_SSE2(compInfo, - didAllPixelsPass, - passMask8, - src[3], src[2], src[1], src[0], - srcEffectEnableMask, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, - this->_sprAlphaCustom + compInfo.target.xCustom, - this->_sprTypeCustom + compInfo.target.xCustom); - } -#endif - -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION + i = this->_CompositeLineDeferred_LoopOp(compInfo, srcColorCustom16, srcIndexCustom); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -4446,11 +1369,11 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, co } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - this->_PixelComposite(compInfo, srcColorCustom16[compInfo.target.xCustom], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); + pixelop.Composite16(compInfo, srcColorCustom16[compInfo.target.xCustom], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); } } -template +template void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) { compInfo.target.xNative = 0; @@ -4461,118 +1384,8 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo size_t i = 0; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % 16)); - const __m128i srcEffectEnableMask = compInfo.renderState.srcEffectEnable_SSE2[compInfo.renderState.selectedLayerID]; - - for (; i < ssePixCount; i+=16, compInfo.target.xCustom+=16, compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) - { - if (compInfo.target.xCustom >= compInfo.line.widthCustom) - { - compInfo.target.xCustom -= compInfo.line.widthCustom; - } - - __m128i passMask8; - int passMaskValue; - - if (WILLPERFORMWINDOWTEST) - { - // Do the window test. 
- passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ); - - // If none of the pixels within the vector pass, then reject them all at once. - passMaskValue = _mm_movemask_epi8(passMask8); - if (passMaskValue == 0) - { - continue; - } - } - else - { - passMask8 = _mm_set1_epi8(0xFF); - passMaskValue = 0xFFFF; - } - - __m128i src[4]; - - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - { - src[0] = _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)); - src[1] = _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)); - - if (LAYERTYPE != GPULayerType_OBJ) - { - __m128i tempPassMask = _mm_packus_epi16( _mm_srli_epi16(src[0], 15), _mm_srli_epi16(src[1], 15) ); - tempPassMask = _mm_cmpeq_epi8(tempPassMask, _mm_set1_epi8(1)); - - passMask8 = _mm_and_si128(tempPassMask, passMask8); - passMaskValue = _mm_movemask_epi8(passMask8); - } - break; - } - - case NDSColorFormat_BGR666_Rev: - { - const __m128i src16[2] = { - _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)), - _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)) - }; - - ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); - ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); - - if (LAYERTYPE != GPULayerType_OBJ) - { - __m128i tempPassMask = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) ); - tempPassMask = _mm_cmpeq_epi8(tempPassMask, _mm_set1_epi8(1)); - - passMask8 = _mm_and_si128(tempPassMask, passMask8); - passMaskValue = _mm_movemask_epi8(passMask8); - } - break; - } - - case NDSColorFormat_BGR888_Rev: - { - src[0] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 0)); - src[1] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 4)); - src[2] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 8)); - src[3] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 12)); - - if (LAYERTYPE != GPULayerType_OBJ) - { - __m128i tempPassMask = _mm_packus_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)), _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) ); - tempPassMask = _mm_cmpeq_epi8(tempPassMask, _mm_setzero_si128()); - - passMask8 = _mm_andnot_si128(tempPassMask, passMask8); - passMaskValue = _mm_movemask_epi8(passMask8); - } - break; - } - } - - // If none of the pixels within the vector pass, then reject them all at once. - if (passMaskValue == 0) - { - continue; - } - - // Write out the pixels. - const bool didAllPixelsPass = (passMaskValue == 0xFFFF); - this->_PixelComposite16_SSE2(compInfo, - didAllPixelsPass, - passMask8, - src[3], src[2], src[1], src[0], - srcEffectEnableMask, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, - this->_sprAlphaCustom + compInfo.target.xCustom, - this->_sprTypeCustom + compInfo.target.xCustom); - } -#endif - -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION + i = _CompositeVRAMLineDeferred_LoopOp(compInfo, vramColorPtr); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -4595,7 +1408,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - this->_PixelComposite(compInfo, ((FragmentColor *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); + pixelop.Composite32(compInfo, ((FragmentColor *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); } else { @@ -4605,7 +1418,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - this->_PixelComposite(compInfo, ((u16 *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); + pixelop.Composite16(compInfo, ((u16 *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); } } } @@ -4848,7 +1661,7 @@ void GPUEngineBase::_RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, co const bool isRotationScaled = ( (param.BGnPA.value != 0x100) || (param.BGnPC.value != 0) || (param.BGnX.value != 0) || - (param.BGnY.value != (0x100 * compInfo.line.indexNative)) ); + (param.BGnY.value != (0x100 * (s32)compInfo.line.indexNative)) ); if (!isRotationScaled) { const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(compInfo.renderState.selectedBGLayer->BMPAddress) - MMU.ARM9_LCD) / sizeof(u16); @@ -4992,48 +1805,10 @@ void GPUEngineBase::_RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u16 *__restrict vramBuffer = (u16 *)MMU_gpu_map(objAddress); size_t i = 0; -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION if (readXStep == 1) { - if (ISDEBUGRENDER) - { - const size_t ssePixCount = length - (length % 8); - for (; i < ssePixCount; i += 8, spriteX += 8, frameX += 8) - { - const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX)); - const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) ); - _mm_storeu_si128( (__m128i *)(dst + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX)), color_vec128, alphaCompare) ); - } - } - else - { - const __m128i prio_vec128 = _mm_set1_epi8(prio); - - const size_t ssePixCount = length - (length % 16); - for (; i < ssePixCount; i += 16, spriteX += 16, frameX += 16) - { - const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + frameX)); - const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX)); - const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX + 8)); - - const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128); - const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) ); - - const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, alphaCompare); - const __m128i combinedLoCompare = _mm_unpacklo_epi8(combinedPackedCompare, combinedPackedCompare); - const __m128i combinedHiCompare = _mm_unpackhi_epi8(combinedPackedCompare, combinedPackedCompare); - - // Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because - // maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the 
sprite - // to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs. - _mm_storeu_si128( (__m128i *)(dst + frameX + 0), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX + 0)), colorLo_vec128, combinedLoCompare) ); - _mm_storeu_si128( (__m128i *)(dst + frameX + 8), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX + 8)), colorHi_vec128, combinedHiCompare) ); - _mm_storeu_si128( (__m128i *)(dst_alpha + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + frameX)), _mm_set1_epi8(spriteAlpha + 1), combinedPackedCompare) ); - _mm_storeu_si128( (__m128i *)(typeTab + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + frameX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) ); - _mm_storeu_si128( (__m128i *)(prioTab + frameX), _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) ); - _mm_storeu_si128( (__m128i *)(this->_sprNum + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + frameX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) ); - } - } + i = this->_RenderSpriteBMP_LoopOp(length, spriteAlpha, prio, spriteNum, vramBuffer, frameX, spriteX, dst, dst_alpha, typeTab, prioTab); } #endif @@ -5759,7 +2534,7 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[GPULayerID_OBJ][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, vramColorPtr[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); + pixelop.Composite32(compInfo, vramColorPtr[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); } } else @@ -5780,7 +2555,7 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectNative[GPULayerID_OBJ][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, this->_sprColor[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); + pixelop.Composite16(compInfo, this->_sprColor[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); } } } @@ -5823,11 +2598,11 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { - this->_PixelComposite(compInfo, ((FragmentColor *)vramColorPtr)[dstX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); + pixelop.Composite32(compInfo, ((FragmentColor *)vramColorPtr)[dstX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); } else { - this->_PixelComposite(compInfo, ((u16 *)vramColorPtr)[dstX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); + pixelop.Composite16(compInfo, ((u16 *)vramColorPtr)[dstX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); } } } @@ -5866,7 +2641,7 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineLayerID = dstLayerIDPtr + dstX; const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[GPULayerID_OBJ][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, this->_sprColor[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); + pixelop.Composite16(compInfo, this->_sprColor[srcX], enableColorEffect, this->_sprAlpha[compInfo.line.indexNative][srcX], this->_sprType[compInfo.line.indexNative][srcX]); } } @@ -5965,62 +2740,21 @@ void GPUEngineBase::ApplyMasterBrightness(void *dst, const size_t pixCount, cons { size_t i = 0; - switch (OUTPUTFORMAT) +#ifdef USEMANUALVECTORIZATION + i = this->_ApplyMasterBrightnessUp_LoopOp(dst, pixCount, intensityClamped); +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) { - case NDSColorFormat_BGR555_Rev: + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { -#ifdef ENABLE_SSE2 - const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped); - - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); - dstColor_vec128 = this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128); - dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); - _mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - ((u16 *)dst)[i] = GPUEngineBase::_brightnessUpTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; - } - break; + ((u16 *)dst)[i] = PixelOperation::BrightnessUpTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; } - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: + else { -#ifdef ENABLE_SSE2 - const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped); - - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - __m128i dstColor_vec128 = 
_mm_load_si128((__m128i *)((FragmentColor *)dst + i)); - dstColor_vec128 = this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128); - dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); - _mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - ((FragmentColor *)dst)[i] = this->_ColorEffectIncreaseBrightness(((FragmentColor *)dst)[i], intensityClamped); - ((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF; - } - break; + ((FragmentColor *)dst)[i] = colorop.increase(((FragmentColor *)dst)[i], intensityClamped); + ((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF; } - - default: - break; } } else @@ -6053,62 +2787,21 @@ void GPUEngineBase::ApplyMasterBrightness(void *dst, const size_t pixCount, cons { size_t i = 0; - switch (OUTPUTFORMAT) +#ifdef USEMANUALVECTORIZATION + i = this->_ApplyMasterBrightnessDown_LoopOp(dst, pixCount, intensityClamped); +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) { - case NDSColorFormat_BGR555_Rev: + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { -#ifdef ENABLE_SSE2 - const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped); - - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); - dstColor_vec128 = this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128); - dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); - _mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - ((u16 *)dst)[i] = GPUEngineBase::_brightnessDownTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; - } - break; + ((u16 *)dst)[i] = PixelOperation::BrightnessDownTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; } - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: + else { -#ifdef ENABLE_SSE2 - const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped); - - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((FragmentColor *)dst + i)); - dstColor_vec128 = this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128); - dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); - _mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - ((FragmentColor *)dst)[i] = this->_ColorEffectDecreaseBrightness(((FragmentColor *)dst)[i], intensityClamped); - ((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF; - } - break; + ((FragmentColor *)dst)[i] = colorop.decrease(((FragmentColor *)dst)[i], intensityClamped); + ((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F : 0xFF; } - - default: - break; } } else @@ -6143,26 +2836,36 @@ void GPUEngineBase::ApplyMasterBrightness(void *dst, const size_t pixCount, cons template bool GPUEngineBase::_IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo) { + if (WIN_NUM == 0 && !compInfo.renderState.WIN0_ENABLED) + { + return false; + } + + if (WIN_NUM == 1 && !compInfo.renderState.WIN1_ENABLED) + { + return false; + } + const u16 windowTop = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Top : this->_IORegisterMap->WIN1V.Top; const u16 windowBottom = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Bottom : this->_IORegisterMap->WIN1V.Bottom; - - if (WIN_NUM == 0 && !compInfo.renderState.WIN0_ENABLED) goto allout; - if (WIN_NUM == 1 && !compInfo.renderState.WIN1_ENABLED) goto allout; if (windowTop > windowBottom) { - if ((compInfo.line.indexNative < windowTop) && (compInfo.line.indexNative > windowBottom)) goto allout; + if ((compInfo.line.indexNative < windowTop) && (compInfo.line.indexNative > windowBottom)) + { + return false; + } } else { - if ((compInfo.line.indexNative < windowTop) || (compInfo.line.indexNative >= windowBottom)) goto allout; + if ((compInfo.line.indexNative < windowTop) || (compInfo.line.indexNative >= windowBottom)) + { + return false; + } } //the x windows will apply for this scanline return true; - -allout: - return false; } template @@ -6203,6 +2906,10 @@ void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo) if (this->_needUpdateWINH[0]) this->_UpdateWINH<0>(compInfo); if (this->_needUpdateWINH[1]) this->_UpdateWINH<1>(compInfo); + const u8 *__restrict win0Ptr = (this->_IsWindowInsideVerticalRange<0>(compInfo)) ? this->_h_win[0] : NULL; + const u8 *__restrict win1Ptr = (this->_IsWindowInsideVerticalRange<1>(compInfo)) ? this->_h_win[1] : NULL; + const u8 *__restrict winObjPtr = (compInfo.renderState.WINOBJ_ENABLED) ? this->_sprWin[compInfo.line.indexNative] : NULL; + for (size_t layerID = GPULayerID_BG0; layerID <= GPULayerID_OBJ; layerID++) { if (!this->_isBGLayerShown[layerID]) @@ -6210,99 +2917,8 @@ void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo) continue; } -#ifdef ENABLE_SSE2 - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=16) - { - __m128i win_vec128; - - __m128i didPassWindowTest = _mm_setzero_si128(); - __m128i enableColorEffect = _mm_setzero_si128(); - - __m128i win0HandledMask = _mm_setzero_si128(); - __m128i win1HandledMask = _mm_setzero_si128(); - __m128i winOBJHandledMask = _mm_setzero_si128(); - - // Window 0 has the highest priority, so always check this first. - if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo)) - { - win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[0] + i)); - win0HandledMask = _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)); - - didPassWindowTest = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[layerID]); - enableColorEffect = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]); - } - - // Window 1 has medium priority, and is checked after Window 0. 
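- // The priority is enforced with _mm_andnot_si128: a pixel already claimed by
- // Window 0 (0xFF in win0HandledMask) is masked out of Window 1's hits. For
- // example (hypothetical masks), win0HandledMask = [FF 00 ...] with Window 1
- // matching both pixels yields win1HandledMask = [00 FF ...].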
- if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo)) - { - win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[1] + i)); - win1HandledMask = _mm_andnot_si128(win0HandledMask, _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1))); - - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[layerID]) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) ); - } - - // Window OBJ has low priority, and is checked after both Window 0 and Window 1. - if (compInfo.renderState.WINOBJ_ENABLED) - { - win_vec128 = _mm_load_si128((__m128i *)(this->_sprWin[compInfo.line.indexNative] + i)); - winOBJHandledMask = _mm_andnot_si128( _mm_or_si128(win0HandledMask, win1HandledMask), _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)) ); - - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[layerID]) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) ); - } - - // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. - // This has the lowest priority, and is always checked last. - const __m128i winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF) ); - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[layerID]) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) ); - - _mm_store_si128((__m128i *)(this->_didPassWindowTestNative[layerID] + i), _mm_and_si128(didPassWindowTest, _mm_set1_epi8(0x01))); - _mm_store_si128((__m128i *)(this->_enableColorEffectNative[layerID] + i), _mm_and_si128(enableColorEffect, _mm_set1_epi8(0x01))); - } -#else - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) - { - // Window 0 has the highest priority, so always check this first. - if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo)) - { - if (this->_h_win[0][i] != 0) - { - this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN0_enable[layerID]; - this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG]; - continue; - } - } - - // Window 1 has medium priority, and is checked after Window 0. - if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo)) - { - if (this->_h_win[1][i] != 0) - { - this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN1_enable[layerID]; - this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG]; - continue; - } - } - - // Window OBJ has low priority, and is checked after both Window 0 and Window 1. - if (compInfo.renderState.WINOBJ_ENABLED) - { - if (this->_sprWin[compInfo.line.indexNative][i] != 0) - { - this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOBJ_enable[layerID]; - this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]; - continue; - } - } - - // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. 
- // This has the lowest priority, and is always checked last. - this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOUT_enable[layerID]; - this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]; - } -#endif + this->_PerformWindowTestingNative(compInfo, layerID, win0Ptr, win1Ptr, winObjPtr, this->_didPassWindowTestNative[layerID], this->_enableColorEffectNative[layerID]); + if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 1)) { CopyLineExpand<1, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 1, 1); @@ -6323,6 +2939,26 @@ void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo) CopyLineExpand<4, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4, 1); CopyLineExpand<4, false, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4, 1); } + else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 5)) + { + CopyLineExpand<5, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 5, 1); + CopyLineExpand<5, false, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 5, 1); + } + else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 6)) + { + CopyLineExpand<6, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 6, 1); + CopyLineExpand<6, false, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 6, 1); + } + else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 7)) + { + CopyLineExpand<7, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 7, 1); + CopyLineExpand<7, false, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 7, 1); + } + else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 8)) + { + CopyLineExpand<8, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 8, 1); + CopyLineExpand<8, false, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 8, 1); + } else if ((compInfo.line.widthCustom % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) { CopyLineExpand<0xFFFF, false, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], compInfo.line.widthCustom, 1); @@ -6510,80 +3146,48 @@ void GPUEngineBase::ParseReg_WININ() { GPUEngineRenderState &renderState = this->_currentRenderState; - renderState.WIN0_enable[GPULayerID_BG0] = this->_IORegisterMap->WIN0IN.BG0_Enable; - renderState.WIN0_enable[GPULayerID_BG1] = this->_IORegisterMap->WIN0IN.BG1_Enable; - renderState.WIN0_enable[GPULayerID_BG2] = this->_IORegisterMap->WIN0IN.BG2_Enable; - renderState.WIN0_enable[GPULayerID_BG3] = this->_IORegisterMap->WIN0IN.BG3_Enable; - renderState.WIN0_enable[GPULayerID_OBJ] = this->_IORegisterMap->WIN0IN.OBJ_Enable; - renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WIN0IN.Effect_Enable; + 
renderState.WIN0_enable[GPULayerID_BG0] = (this->_IORegisterMap->WIN0IN.BG0_Enable != 0) ? 0xFF : 0x00; + renderState.WIN0_enable[GPULayerID_BG1] = (this->_IORegisterMap->WIN0IN.BG1_Enable != 0) ? 0xFF : 0x00; + renderState.WIN0_enable[GPULayerID_BG2] = (this->_IORegisterMap->WIN0IN.BG2_Enable != 0) ? 0xFF : 0x00; + renderState.WIN0_enable[GPULayerID_BG3] = (this->_IORegisterMap->WIN0IN.BG3_Enable != 0) ? 0xFF : 0x00; + renderState.WIN0_enable[GPULayerID_OBJ] = (this->_IORegisterMap->WIN0IN.OBJ_Enable != 0) ? 0xFF : 0x00; + renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = (this->_IORegisterMap->WIN0IN.Effect_Enable != 0) ? 0xFF : 0x00; - renderState.WIN1_enable[GPULayerID_BG0] = this->_IORegisterMap->WIN1IN.BG0_Enable; - renderState.WIN1_enable[GPULayerID_BG1] = this->_IORegisterMap->WIN1IN.BG1_Enable; - renderState.WIN1_enable[GPULayerID_BG2] = this->_IORegisterMap->WIN1IN.BG2_Enable; - renderState.WIN1_enable[GPULayerID_BG3] = this->_IORegisterMap->WIN1IN.BG3_Enable; - renderState.WIN1_enable[GPULayerID_OBJ] = this->_IORegisterMap->WIN1IN.OBJ_Enable; - renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WIN1IN.Effect_Enable; - -#if defined(ENABLE_SSE2) - renderState.WIN0_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.BG0_Enable != 0) ? 0xFF : 0x00); - renderState.WIN0_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.BG1_Enable != 0) ? 0xFF : 0x00); - renderState.WIN0_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.BG2_Enable != 0) ? 0xFF : 0x00); - renderState.WIN0_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.BG3_Enable != 0) ? 0xFF : 0x00); - renderState.WIN0_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.OBJ_Enable != 0) ? 0xFF : 0x00); - renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((this->_IORegisterMap->WIN0IN.Effect_Enable != 0) ? 0xFF : 0x00); - - renderState.WIN1_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.BG0_Enable != 0) ? 0xFF : 0x00); - renderState.WIN1_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.BG1_Enable != 0) ? 0xFF : 0x00); - renderState.WIN1_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.BG2_Enable != 0) ? 0xFF : 0x00); - renderState.WIN1_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.BG3_Enable != 0) ? 0xFF : 0x00); - renderState.WIN1_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.OBJ_Enable != 0) ? 0xFF : 0x00); - renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((this->_IORegisterMap->WIN1IN.Effect_Enable != 0) ? 0xFF : 0x00); -#endif + renderState.WIN1_enable[GPULayerID_BG0] = (this->_IORegisterMap->WIN1IN.BG0_Enable != 0) ? 0xFF : 0x00; + renderState.WIN1_enable[GPULayerID_BG1] = (this->_IORegisterMap->WIN1IN.BG1_Enable != 0) ? 0xFF : 0x00; + renderState.WIN1_enable[GPULayerID_BG2] = (this->_IORegisterMap->WIN1IN.BG2_Enable != 0) ? 0xFF : 0x00; + renderState.WIN1_enable[GPULayerID_BG3] = (this->_IORegisterMap->WIN1IN.BG3_Enable != 0) ? 0xFF : 0x00; + renderState.WIN1_enable[GPULayerID_OBJ] = (this->_IORegisterMap->WIN1IN.OBJ_Enable != 0) ? 0xFF : 0x00; + renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = (this->_IORegisterMap->WIN1IN.Effect_Enable != 0) ? 
0xFF : 0x00; } void GPUEngineBase::ParseReg_WINOUT() { GPUEngineRenderState &renderState = this->_currentRenderState; - renderState.WINOUT_enable[GPULayerID_BG0] = this->_IORegisterMap->WINOUT.BG0_Enable; - renderState.WINOUT_enable[GPULayerID_BG1] = this->_IORegisterMap->WINOUT.BG1_Enable; - renderState.WINOUT_enable[GPULayerID_BG2] = this->_IORegisterMap->WINOUT.BG2_Enable; - renderState.WINOUT_enable[GPULayerID_BG3] = this->_IORegisterMap->WINOUT.BG3_Enable; - renderState.WINOUT_enable[GPULayerID_OBJ] = this->_IORegisterMap->WINOUT.OBJ_Enable; - renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WINOUT.Effect_Enable; + renderState.WINOUT_enable[GPULayerID_BG0] = (this->_IORegisterMap->WINOUT.BG0_Enable != 0) ? 0xFF : 0x00; + renderState.WINOUT_enable[GPULayerID_BG1] = (this->_IORegisterMap->WINOUT.BG1_Enable != 0) ? 0xFF : 0x00; + renderState.WINOUT_enable[GPULayerID_BG2] = (this->_IORegisterMap->WINOUT.BG2_Enable != 0) ? 0xFF : 0x00; + renderState.WINOUT_enable[GPULayerID_BG3] = (this->_IORegisterMap->WINOUT.BG3_Enable != 0) ? 0xFF : 0x00; + renderState.WINOUT_enable[GPULayerID_OBJ] = (this->_IORegisterMap->WINOUT.OBJ_Enable != 0) ? 0xFF : 0x00; + renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = (this->_IORegisterMap->WINOUT.Effect_Enable != 0) ? 0xFF : 0x00; - renderState.WINOBJ_enable[GPULayerID_BG0] = this->_IORegisterMap->WINOBJ.BG0_Enable; - renderState.WINOBJ_enable[GPULayerID_BG1] = this->_IORegisterMap->WINOBJ.BG1_Enable; - renderState.WINOBJ_enable[GPULayerID_BG2] = this->_IORegisterMap->WINOBJ.BG2_Enable; - renderState.WINOBJ_enable[GPULayerID_BG3] = this->_IORegisterMap->WINOBJ.BG3_Enable; - renderState.WINOBJ_enable[GPULayerID_OBJ] = this->_IORegisterMap->WINOBJ.OBJ_Enable; - renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WINOBJ.Effect_Enable; - -#if defined(ENABLE_SSE2) - renderState.WINOUT_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.BG0_Enable != 0) ? 0xFF : 0x00); - renderState.WINOUT_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.BG1_Enable != 0) ? 0xFF : 0x00); - renderState.WINOUT_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.BG2_Enable != 0) ? 0xFF : 0x00); - renderState.WINOUT_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.BG3_Enable != 0) ? 0xFF : 0x00); - renderState.WINOUT_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.OBJ_Enable != 0) ? 0xFF : 0x00); - renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((this->_IORegisterMap->WINOUT.Effect_Enable != 0) ? 0xFF : 0x00); - - renderState.WINOBJ_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.BG0_Enable != 0) ? 0xFF : 0x00); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.BG1_Enable != 0) ? 0xFF : 0x00); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.BG2_Enable != 0) ? 0xFF : 0x00); - renderState.WINOBJ_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.BG3_Enable != 0) ? 0xFF : 0x00); - renderState.WINOBJ_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.OBJ_Enable != 0) ? 0xFF : 0x00); - renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((this->_IORegisterMap->WINOBJ.Effect_Enable != 0) ? 0xFF : 0x00); -#endif + renderState.WINOBJ_enable[GPULayerID_BG0] = (this->_IORegisterMap->WINOBJ.BG0_Enable != 0) ? 
0xFF : 0x00; + renderState.WINOBJ_enable[GPULayerID_BG1] = (this->_IORegisterMap->WINOBJ.BG1_Enable != 0) ? 0xFF : 0x00; + renderState.WINOBJ_enable[GPULayerID_BG2] = (this->_IORegisterMap->WINOBJ.BG2_Enable != 0) ? 0xFF : 0x00; + renderState.WINOBJ_enable[GPULayerID_BG3] = (this->_IORegisterMap->WINOBJ.BG3_Enable != 0) ? 0xFF : 0x00; + renderState.WINOBJ_enable[GPULayerID_OBJ] = (this->_IORegisterMap->WINOBJ.OBJ_Enable != 0) ? 0xFF : 0x00; + renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = (this->_IORegisterMap->WINOBJ.Effect_Enable != 0) ? 0xFF : 0x00; } void GPUEngineBase::ParseReg_MOSAIC() { GPUEngineRenderState &renderState = this->_currentRenderState; - renderState.mosaicWidthBG = this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.BG_MosaicH]; - renderState.mosaicHeightBG = this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.BG_MosaicV]; - renderState.mosaicWidthOBJ = this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.OBJ_MosaicH]; - renderState.mosaicHeightOBJ = this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.OBJ_MosaicV]; + renderState.mosaicWidthBG = &this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.BG_MosaicH]; + renderState.mosaicHeightBG = &this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.BG_MosaicV]; + renderState.mosaicWidthOBJ = &this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.OBJ_MosaicH]; + renderState.mosaicHeightOBJ = &this->_mosaicLookup.table[this->_IORegisterMap->MOSAIC.OBJ_MosaicV]; renderState.isBGMosaicSet = (this->_IORegisterMap->MOSAIC.BG_MosaicH != 0) || (this->_IORegisterMap->MOSAIC.BG_MosaicV != 0); renderState.isOBJMosaicSet = (this->_IORegisterMap->MOSAIC.OBJ_MosaicH != 0) || (this->_IORegisterMap->MOSAIC.OBJ_MosaicV != 0); @@ -6596,19 +3200,19 @@ void GPUEngineBase::ParseReg_BLDCNT() renderState.colorEffect = (ColorEffect)BLDCNT.ColorEffect; - renderState.srcEffectEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target1 != 0); - renderState.srcEffectEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target1 != 0); - renderState.srcEffectEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target1 != 0); - renderState.srcEffectEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target1 != 0); - renderState.srcEffectEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target1 != 0); - renderState.srcEffectEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target1 != 0); + renderState.srcEffectEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target1 != 0) ? 0xFF : 0x00; + renderState.srcEffectEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target1 != 0) ? 0xFF : 0x00; + renderState.srcEffectEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target1 != 0) ? 0xFF : 0x00; + renderState.srcEffectEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target1 != 0) ? 0xFF : 0x00; + renderState.srcEffectEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target1 != 0) ? 0xFF : 0x00; + renderState.srcEffectEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target1 != 0) ? 0xFF : 0x00; - renderState.dstBlendEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target2 != 0); - renderState.dstBlendEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target2 != 0); - renderState.dstBlendEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target2 != 0); - renderState.dstBlendEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target2 != 0); - renderState.dstBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target2 != 0); - renderState.dstBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target2 != 0); + renderState.dstBlendEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target2 != 0) ? 0xFF : 0x00; + renderState.dstBlendEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target2 != 0) ? 
0xFF : 0x00; + renderState.dstBlendEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target2 != 0) ? 0xFF : 0x00; + renderState.dstBlendEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target2 != 0) ? 0xFF : 0x00; + renderState.dstBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target2 != 0) ? 0xFF : 0x00; + renderState.dstBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target2 != 0) ? 0xFF : 0x00; renderState.dstAnyBlendEnable = renderState.dstBlendEnable[GPULayerID_BG0] || renderState.dstBlendEnable[GPULayerID_BG1] || @@ -6617,34 +3221,16 @@ void GPUEngineBase::ParseReg_BLDCNT() renderState.dstBlendEnable[GPULayerID_OBJ] || renderState.dstBlendEnable[GPULayerID_Backdrop]; -#ifdef ENABLE_SSE2 - const __m128i one_vec128 = _mm_set1_epi8(1); - - renderState.srcEffectEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target1), one_vec128); - renderState.srcEffectEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG1_Target1), one_vec128); - renderState.srcEffectEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG2_Target1), one_vec128); - renderState.srcEffectEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG3_Target1), one_vec128); - renderState.srcEffectEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.OBJ_Target1), one_vec128); - renderState.srcEffectEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.Backdrop_Target1), one_vec128); - -#ifdef ENABLE_SSSE3 - renderState.dstBlendEnable_SSSE3 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - BLDCNT.Backdrop_Target2, - BLDCNT.OBJ_Target2, - BLDCNT.BG3_Target2, - BLDCNT.BG2_Target2, - BLDCNT.BG1_Target2, - BLDCNT.BG0_Target2); -#else - renderState.dstBlendEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target2), one_vec128); - renderState.dstBlendEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG1_Target2), one_vec128); - renderState.dstBlendEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG2_Target2), one_vec128); - renderState.dstBlendEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG3_Target2), one_vec128); - renderState.dstBlendEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.OBJ_Target2), one_vec128); - renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.Backdrop_Target2), one_vec128); -#endif - -#endif // ENABLE_SSE2 + // For the vectorized rendering loops, create a lookup table for each 128-bit lane. + for (size_t i = 0; i < sizeof(renderState.dstBlendEnableVecLookup); i+=16) + { + renderState.dstBlendEnableVecLookup[i+0] = renderState.dstBlendEnable[GPULayerID_BG0]; + renderState.dstBlendEnableVecLookup[i+1] = renderState.dstBlendEnable[GPULayerID_BG1]; + renderState.dstBlendEnableVecLookup[i+2] = renderState.dstBlendEnable[GPULayerID_BG2]; + renderState.dstBlendEnableVecLookup[i+3] = renderState.dstBlendEnable[GPULayerID_BG3]; + renderState.dstBlendEnableVecLookup[i+4] = renderState.dstBlendEnable[GPULayerID_OBJ]; + renderState.dstBlendEnableVecLookup[i+5] = renderState.dstBlendEnable[GPULayerID_Backdrop]; + } } void GPUEngineBase::ParseReg_BLDALPHA() @@ -6654,7 +3240,7 @@ void GPUEngineBase::ParseReg_BLDALPHA() renderState.blendEVA = (BLDALPHA.EVA >= 16) ? 16 : BLDALPHA.EVA; renderState.blendEVB = (BLDALPHA.EVB >= 16) ? 
16 : BLDALPHA.EVB; - renderState.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[renderState.blendEVA][renderState.blendEVB][0][0]; + renderState.blendTable555 = (TBlendTable *)&PixelOperation::BlendTable555[renderState.blendEVA][renderState.blendEVB][0][0]; } void GPUEngineBase::ParseReg_BLDY() @@ -6663,12 +3249,12 @@ void GPUEngineBase::ParseReg_BLDY() GPUEngineRenderState &renderState = this->_currentRenderState; renderState.blendEVY = (BLDY.EVY >= 16) ? 16 : BLDY.EVY; - renderState.brightnessUpTable555 = &GPUEngineBase::_brightnessUpTable555[renderState.blendEVY][0]; - renderState.brightnessUpTable666 = &GPUEngineBase::_brightnessUpTable666[renderState.blendEVY][0]; - renderState.brightnessUpTable888 = &GPUEngineBase::_brightnessUpTable888[renderState.blendEVY][0]; - renderState.brightnessDownTable555 = &GPUEngineBase::_brightnessDownTable555[renderState.blendEVY][0]; - renderState.brightnessDownTable666 = &GPUEngineBase::_brightnessDownTable666[renderState.blendEVY][0]; - renderState.brightnessDownTable888 = &GPUEngineBase::_brightnessDownTable888[renderState.blendEVY][0]; + renderState.brightnessUpTable555 = &PixelOperation::BrightnessUpTable555[renderState.blendEVY][0]; + renderState.brightnessUpTable666 = &PixelOperation::BrightnessUpTable666[renderState.blendEVY][0]; + renderState.brightnessUpTable888 = &PixelOperation::BrightnessUpTable888[renderState.blendEVY][0]; + renderState.brightnessDownTable555 = &PixelOperation::BrightnessDownTable555[renderState.blendEVY][0]; + renderState.brightnessDownTable666 = &PixelOperation::BrightnessDownTable666[renderState.blendEVY][0]; + renderState.brightnessDownTable888 = &PixelOperation::BrightnessDownTable888[renderState.blendEVY][0]; } const BGLayerInfo& GPUEngineBase::GetBGLayerInfoByID(const GPULayerID layerID) @@ -7261,73 +3847,8 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) { size_t i = 0; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % 16)); - const __m128i srcEffectEnableMask = compInfo.renderState.srcEffectEnable_SSE2[GPULayerID_BG0]; - - for (; i < ssePixCount; i+=16, srcLinePtr+=16, compInfo.target.xCustom+=16, compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) - { - if (compInfo.target.xCustom >= compInfo.line.widthCustom) - { - compInfo.target.xCustom -= compInfo.line.widthCustom; - } - - // Determine which pixels pass by doing the window test and the alpha test. - __m128i passMask8; - int passMaskValue; - - if (WILLPERFORMWINDOWTEST) - { - // Do the window test. - passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[GPULayerID_BG0] + compInfo.target.xCustom)), _mm_set1_epi8(1) ); - - // If none of the pixels within the vector pass, then reject them all at once. - passMaskValue = _mm_movemask_epi8(passMask8); - if (passMaskValue == 0) - { - continue; - } - } - else - { - passMask8 = _mm_set1_epi8(0xFF); - passMaskValue = 0xFFFF; - } - - const __m128i src[4] = { - _mm_load_si128((__m128i *)srcLinePtr + 0), - _mm_load_si128((__m128i *)srcLinePtr + 1), - _mm_load_si128((__m128i *)srcLinePtr + 2), - _mm_load_si128((__m128i *)srcLinePtr + 3) - }; - - // Do the alpha test. Pixels with an alpha value of 0 are rejected. 
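The ParseReg_WINOUT and ParseReg_BLDCNT hunks above replace the old bool flags, and the precomputed __m128i copies of them, with plain u8 values of 0x00/0xFF. A byte that is already an all-zeroes or all-ones mask can be broadcast to any vector width at the point of use, which is what lets one render-state layout serve the SSE2, AVX2, AVX-512, and AltiVec builds alike. A minimal sketch of the idea, with illustrative names that are not from this patch:

    #include <tmmintrin.h> // SSSE3, for _mm_shuffle_epi8
    #include <cstdint>

    // A 0x00/0xFF byte flag doubles as a lane mask once it is broadcast.
    static inline __m128i MaskFromFlag(uint8_t flag) // flag is 0x00 or 0xFF
    {
        return _mm_set1_epi8((char)flag); // every lane becomes 0x00 or 0xFF
    }

    // With the six per-layer masks repeated in every 128-bit lane (the
    // dstBlendEnableVecLookup layout built above), one byte shuffle maps a
    // vector of layer IDs (0..5) to their blend-enable masks.
    static inline __m128i BlendEnableForLayers(const __m128i laneLookup, const __m128i layerIDs)
    {
        return _mm_shuffle_epi8(laneLookup, layerIDs);
    }

Since the AVX2 and AVX-512 forms of this shuffle operate within each 128-bit lane independently, repeating the six masks every 16 bytes makes the same table usable at any width, which is why the lookup buffer is sized for vectors up to 1024 bits.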
- const __m128i srcAlpha = _mm_packs_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)), - _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) ); - - passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(srcAlpha, _mm_setzero_si128()), passMask8); - - // If none of the pixels within the vector pass, then reject them all at once. - passMaskValue = _mm_movemask_epi8(passMask8); - if (passMaskValue == 0) - { - continue; - } - - // Write out the pixels. - const bool didAllPixelsPass = (passMaskValue == 0xFFFF); - this->_PixelComposite16_SSE2(compInfo, - didAllPixelsPass, - passMask8, - src[3], src[2], src[1], src[0], - srcEffectEnableMask, - this->_enableColorEffectCustom[GPULayerID_BG0] + compInfo.target.xCustom, - NULL, - NULL); - } -#endif - -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION + i = this->_RenderLine_Layer3D_LoopOp(compInfo, srcLinePtr); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, srcLinePtr++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -7343,7 +3864,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[GPULayerID_BG0][compInfo.target.xCustom] != 0) : true; - this->_PixelComposite(compInfo, *srcLinePtr, enableColorEffect, 0, 0); + pixelop.Composite32(compInfo, *srcLinePtr, enableColorEffect, 0, 0); } } else @@ -7369,7 +3890,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[GPULayerID_BG0][compInfo.target.xCustom] != 0) : true; - this->_PixelComposite(compInfo, srcLinePtr[srcX], enableColorEffect, 0, 0); + pixelop.Composite32(compInfo, srcLinePtr[srcX], enableColorEffect, 0, 0); } srcLinePtr += compInfo.line.widthCustom; @@ -7477,7 +3998,7 @@ void GPUEngineA::_RenderLine_DisplayCaptureCustom(const IOREG_DISPCAPCNT &DISPCA } } - this->_RenderLine_DispCapture_Blend(lineInfo, srcAPtr, srcBPtr, dstCustomPtr, captureLengthExt); + this->_RenderLine_DispCapture_Blend(lineInfo, srcAPtr, srcBPtr, dstCustomPtr, captureLengthExt); break; } } @@ -7635,10 +4156,10 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI dstCustomPtr = (u16 *)this->_VRAMCustomBlockPtr[DISPCAPCNT.VRAMWriteBlock] + dstCustomOffset; } - if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + if ( (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) || (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ) { // Note that although RGB666 colors are 32-bit values, this particular mode uses 16-bit color depth for line captures. 
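RenderLine_Layer3D now hands its vector body to _RenderLine_Layer3D_LoopOp and resumes the scalar loop at the index the helper returns, the same split used by the other *_LoopOp helpers this patch declares in GPU.h. A sketch of the pattern with a hypothetical operation (not code from this patch):

    #include <emmintrin.h>
    #include <cstddef>
    #include <cstdint>

    // Vector body: processes as many whole vectors as fit and reports how
    // many elements it consumed.
    static size_t SetAlphaBit_VecLoop(uint16_t *dst, size_t pixCount)
    {
        const size_t vecPixCount = pixCount - (pixCount % 8); // 8 u16 per __m128i
        for (size_t i = 0; i < vecPixCount; i += 8)
        {
            __m128i v = _mm_loadu_si128((const __m128i *)(dst + i));
            v = _mm_or_si128(v, _mm_set1_epi16((short)0x8000));
            _mm_storeu_si128((__m128i *)(dst + i), v);
        }
        return vecPixCount;
    }

    static void SetAlphaBit(uint16_t *dst, size_t pixCount)
    {
        size_t i = 0;
    #ifdef USEMANUALVECTORIZATION
        i = SetAlphaBit_VecLoop(dst, pixCount); // vector body
    #endif
        for (; i < pixCount; i++) // scalar tail, and non-vector builds
            dst[i] |= 0x8000;
    }

As in the removed SSE2 loop, the real helper can also reject an entire vector of pixels early by testing the window/alpha pass mask with _mm_movemask_epi8.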
- if (needConvertDisplayLine23) + if ( (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) && needConvertDisplayLine23 ) { ColorspaceConvertBuffer6665To5551((u32 *)compInfo.target.lineColorHead, this->_captureWorkingDisplay16, compInfo.line.pixelCount); srcAPtr = this->_captureWorkingDisplay16; @@ -7747,28 +4268,25 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI case 0x42000000: // Display + FIFO - ((DISPCAPCNT.CaptureSrc == 2) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1)) case 0x43000000: // 3D + FIFO - ((DISPCAPCNT.CaptureSrc == 2) && (DISPCAPCNT.SrcA == 1) && (DISPCAPCNT.SrcB == 1)) case 0x60000000: // Display + VRAM - ((DISPCAPCNT.CaptureSrc == 3) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 0)) - case 0x62000000: // Display + FIFO - ((DISPCAPCNT.CaptureSrc == 3) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1)) case 0x61000000: // 3D + VRAM - ((DISPCAPCNT.CaptureSrc == 3) && (DISPCAPCNT.SrcA == 1) && (DISPCAPCNT.SrcB == 0)) + case 0x62000000: // Display + FIFO - ((DISPCAPCNT.CaptureSrc == 3) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1)) case 0x63000000: // 3D + FIFO - ((DISPCAPCNT.CaptureSrc == 3) && (DISPCAPCNT.SrcA == 1) && (DISPCAPCNT.SrcB == 1)) { - if ( ((DISPCAPCNT.SrcA == 0) && isReadDisplayLineNative) || ((DISPCAPCNT.SrcA != 0) && isRead3DLineNative) ) + if ( ((DISPCAPCNT.SrcA == 0) && !isReadDisplayLineNative) || ((DISPCAPCNT.SrcA != 0) && !isRead3DLineNative) ) { - this->_RenderLine_DispCapture_Blend(compInfo.line, srcAPtr, srcBPtr, dstNative16, CAPTURELENGTH); - } - else - { - this->_RenderLine_DispCapture_Blend(compInfo.line, srcAPtr, srcBPtr, dstNative16, CAPTURELENGTH); + CopyLineReduceHinted<0xFFFF, false, false, 2>(srcAPtr, 0, CAPTURELENGTH, this->_captureWorkingA16, 0); + srcAPtr = this->_captureWorkingA16; } + + this->_RenderLine_DispCapture_Blend(compInfo.line, srcAPtr, srcBPtr, dstNative16, CAPTURELENGTH); break; } } } -#ifdef ENABLE_SSE2 - MACRODO_N( CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_stream_si128((__m128i *)(this->_VRAMNativeBlockCaptureCopyPtr[DISPCAPCNT.VRAMWriteBlock] + dstNativeOffset) + (X), _mm_load_si128((__m128i *)dstNative16 + (X))) ); -#else - memcpy(this->_VRAMNativeBlockCaptureCopyPtr[DISPCAPCNT.VRAMWriteBlock] + dstNativeOffset, dstNative16, CAPTURELENGTH * sizeof(u16)); -#endif + // Save a fresh copy of the current native VRAM buffer so that we have something to compare against + // in the case where something else decides to make direct changes to VRAM behind our back. + stream_copy_fast(this->_VRAMNativeBlockCaptureCopyPtr[DISPCAPCNT.VRAMWriteBlock] + dstNativeOffset, dstNative16); if (this->_isLineCaptureNative[DISPCAPCNT.VRAMWriteBlock][writeLineIndexWithOffset] && !willWriteVRAMLineNative) { @@ -7793,24 +4311,20 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const GPUEngineLineInfo &lineInfo, const u16 alphaBit16 = (SOURCESWITCH == 0) ? 0x8000 : 0x0000; const u32 alphaBit32 = (SOURCESWITCH == 0) ? ((COLORFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF000000 : 0x1F000000) : 0x00000000; -#ifdef ENABLE_SSE2 - const __m128i alpha_vec128 = (COLORFORMAT == NDSColorFormat_BGR555_Rev) ? 
_mm_set1_epi16(alphaBit16) : _mm_set1_epi32(alphaBit32); -#endif - if (CAPTURETONATIVEDST) { if (CAPTUREFROMNATIVESRC) { -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION switch (COLORFORMAT) { case NDSColorFormat_BGR555_Rev: - MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) )); + buffer_copy_or_constant_s16_fast(dst, src, alphaBit16); break; case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR888_Rev: - MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u32)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) )); + buffer_copy_or_constant_s32_fast(dst, src, alphaBit32); break; } #else @@ -7892,33 +4406,26 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const GPUEngineLineInfo &lineInfo, const size_t pixCountExt = captureLengthExt * lineInfo.renderCount; size_t i = 0; -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION switch (COLORFORMAT) { case NDSColorFormat_BGR555_Rev: { - const size_t ssePixCount = pixCountExt - (pixCountExt % 8); - for (; i < ssePixCount; i += 8) - { - _mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) ); - } + const size_t vecLength = (pixCountExt * sizeof(u16)) - ((pixCountExt * sizeof(u16)) % VECTORSIZE); + buffer_copy_or_constant_s16(dst, src, vecLength, alphaBit16); + i += vecLength; break; } case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR888_Rev: { - const size_t ssePixCount = pixCountExt - (pixCountExt % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) ); - } + const size_t vecLength = (pixCountExt * sizeof(u32)) - ((pixCountExt * sizeof(u32)) % VECTORSIZE); + buffer_copy_or_constant_s32(dst, src, vecLength, alphaBit32); + i += vecLength; break; } } -#endif - -#ifdef ENABLE_SSE2 #pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCountExt; i++) @@ -7946,15 +4453,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const GPUEngineLineInfo &lineInfo, { case NDSColorFormat_BGR555_Rev: { -#ifdef ENABLE_SSE2 - const size_t ssePixCount = captureLengthExt - (captureLengthExt % 8); - for (; i < ssePixCount; i += 8) - { - _mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) ); - } -#endif - -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION + const size_t vecLength = (captureLengthExt * sizeof(u16)) - ((captureLengthExt * sizeof(u16)) % VECTORSIZE); + buffer_copy_or_constant_s16(dst, src, vecLength, alphaBit16); + i += vecLength; #pragma LOOPVECTORIZE_DISABLE #endif for (; i < captureLengthExt; i++) @@ -7970,15 +4472,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const GPUEngineLineInfo &lineInfo, case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR888_Rev: { -#ifdef ENABLE_SSE2 - const size_t ssePixCount = captureLengthExt - (captureLengthExt % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) ); - } -#endif - -#ifdef ENABLE_SSE2 +#ifdef USEMANUALVECTORIZATION + const size_t vecLength = (captureLengthExt * sizeof(u32)) - ((captureLengthExt * sizeof(u32)) % VECTORSIZE); + buffer_copy_or_constant_s32(dst, src, vecLength, alphaBit32); + i += vecLength; #pragma LOOPVECTORIZE_DISABLE #endif for (; i < captureLengthExt; i++) @@ 
-8081,151 +4578,22 @@ FragmentColor GPUEngineA::_RenderLine_DispCapture_BlendFunc(const FragmentColor return outColor; } -#ifdef ENABLE_SSE2 -template -__m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB) -{ -#ifdef ENABLE_SSSE3 - __m128i blendAB = _mm_or_si128( blendEVA, _mm_slli_epi16(blendEVB, 8) ); -#endif - - switch (COLORFORMAT) - { - case NDSColorFormat_BGR555_Rev: - { - __m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi16(0x8000)); - __m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi16(0x8000)); - __m128i srcA_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcA_alpha, _mm_setzero_si128()), srcA ); - __m128i srcB_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcB_alpha, _mm_setzero_si128()), srcB ); - __m128i colorBitMask = _mm_set1_epi16(0x001F); - - __m128i ra; - __m128i ga; - __m128i ba; - -#ifdef ENABLE_SSSE3 - ra = _mm_or_si128( _mm_and_si128( srcA_masked, colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 8), _mm_set1_epi16(0x1F00)) ); - ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 3), _mm_set1_epi16(0x1F00)) ); - ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(srcB_masked, 2), _mm_set1_epi16(0x1F00)) ); - - ra = _mm_maddubs_epi16(ra, blendAB); - ga = _mm_maddubs_epi16(ga, blendAB); - ba = _mm_maddubs_epi16(ba, blendAB); -#else - ra = _mm_and_si128( srcA_masked, colorBitMask); - ga = _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask); - ba = _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask); - - __m128i rb = _mm_and_si128( srcB_masked, colorBitMask); - __m128i gb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 5), colorBitMask); - __m128i bb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 10), colorBitMask); - - ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) ); - ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) ); - ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) ); -#endif - - ra = _mm_srli_epi16(ra, 4); - ga = _mm_srli_epi16(ga, 4); - ba = _mm_srli_epi16(ba, 4); - - ra = _mm_min_epi16(ra, colorBitMask); - ga = _mm_min_epi16(ga, colorBitMask); - ba = _mm_min_epi16(ba, colorBitMask); - - return _mm_or_si128( _mm_or_si128(_mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10)), _mm_or_si128(srcA_alpha, srcB_alpha) ); - } - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: - { - // Get color masks based on if the alpha value is 0. Colors with an alpha value - // equal to 0 are rejected. - __m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi32(0xFF000000)); - __m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi32(0xFF000000)); - __m128i srcA_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcA_alpha, _mm_setzero_si128()), srcA); - __m128i srcB_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcB_alpha, _mm_setzero_si128()), srcB); - - __m128i outColorLo; - __m128i outColorHi; - __m128i outColor; - - // Temporarily convert the color component values from 8-bit to 16-bit, and then - // do the blend calculation. 
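The routine being removed here computes the display capture blend (srcA*EVA + srcB*EVB) / 16 per component; its replacements live in the per-ISA GPU_Operations files but keep the same shape: widen the 8-bit components to 16 bits so the multiply cannot overflow, blend, then narrow back with a saturating pack. A condensed sketch of the core arithmetic, with the zero-alpha rejection omitted:

    #include <emmintrin.h>

    // evaVec/evbVec are _mm_set1_epi16() broadcasts of the 0..16 blend factors.
    static inline __m128i CaptureBlend8888(const __m128i srcA, const __m128i srcB,
                                           const __m128i evaVec, const __m128i evbVec)
    {
        // 8-bit -> 16-bit, so the multiply by a 0..16 factor cannot overflow.
        const __m128i aLo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
        const __m128i aHi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
        const __m128i bLo = _mm_unpacklo_epi8(srcB, _mm_setzero_si128());
        const __m128i bHi = _mm_unpackhi_epi8(srcB, _mm_setzero_si128());

        __m128i lo = _mm_add_epi16(_mm_mullo_epi16(aLo, evaVec), _mm_mullo_epi16(bLo, evbVec));
        __m128i hi = _mm_add_epi16(_mm_mullo_epi16(aHi, evaVec), _mm_mullo_epi16(bHi, evbVec));

        lo = _mm_srli_epi16(lo, 4); // divide by 16
        hi = _mm_srli_epi16(hi, 4);

        return _mm_packus_epi16(lo, hi); // saturating 16-bit -> 8-bit pack
    }

For RGB888 output the pack's clamp to 255 is already the right one; for RGB666, as the code below notes, the components still need an explicit _mm_min_epu8() clamp to 63, and the alpha bits masked off here have to be OR'd back in.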
-#ifdef ENABLE_SSSE3 - outColorLo = _mm_unpacklo_epi8(srcA_masked, srcB_masked); - outColorHi = _mm_unpackhi_epi8(srcA_masked, srcB_masked); - - outColorLo = _mm_maddubs_epi16(outColorLo, blendAB); - outColorHi = _mm_maddubs_epi16(outColorHi, blendAB); -#else - __m128i srcA_maskedLo = _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128()); - __m128i srcA_maskedHi = _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128()); - __m128i srcB_maskedLo = _mm_unpacklo_epi8(srcB_masked, _mm_setzero_si128()); - __m128i srcB_maskedHi = _mm_unpackhi_epi8(srcB_masked, _mm_setzero_si128()); - - outColorLo = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedLo, blendEVA), _mm_mullo_epi16(srcB_maskedLo, blendEVB) ); - outColorHi = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedHi, blendEVA), _mm_mullo_epi16(srcB_maskedHi, blendEVB) ); -#endif - - outColorLo = _mm_srli_epi16(outColorLo, 4); - outColorHi = _mm_srli_epi16(outColorHi, 4); - - // Convert the color components back from 16-bit to 8-bit using a saturated pack. - outColor = _mm_packus_epi16(outColorLo, outColorHi); - - // When the color format is 8888, the packuswb instruction will naturally clamp - // the color component values to 255. However, when the color format is 6665, the - // color component values must be clamped to 63. In this case, we must call pminub - // to do the clamp. - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63)); - } - - // Add the alpha components back in. - outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF)); - outColor = _mm_or_si128(outColor, srcA_alpha); - outColor = _mm_or_si128(outColor, srcB_alpha); - - return outColor; - } - } - - return srcA; -} -#endif - template -void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length) +void GPUEngineA::_RenderLine_DispCapture_Blend_Buffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length) { -#ifdef ENABLE_SSE2 - const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA); - const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB); -#endif - size_t i = 0; +#ifdef USEMANUALVECTORIZATION + i = this->_RenderLine_DispCapture_Blend_VecLoop(srcA, srcB, dst, blendEVA, blendEVB, length); +#pragma LOOPVECTORIZE_DISABLE +#endif + if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { const FragmentColor *srcA_32 = (const FragmentColor *)srcA; const FragmentColor *srcB_32 = (const FragmentColor *)srcB; FragmentColor *dst32 = (FragmentColor *)dst; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = length - (length % 4); - for (; i < ssePixCount; i+=4) - { - const __m128i srcA_vec128 = _mm_load_si128((__m128i *)(srcA_32 + i)); - const __m128i srcB_vec128 = _mm_load_si128((__m128i *)(srcB_32 + i)); - - _mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif for (; i < length; i++) { const FragmentColor colorA = srcA_32[i]; @@ -8240,20 +4608,6 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA const u16 *srcB_16 = (const u16 *)srcB; u16 *dst16 = (u16 *)dst; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = length - (length % 8); - for (; i < ssePixCount; i+=8) - { - const __m128i srcA_vec128 = _mm_load_si128((__m128i *)(srcA_16 + i)); - const __m128i srcB_vec128 = _mm_load_si128((__m128i *)(srcB_16 + 
i)); - - _mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif for (; i < length; i++) { const u16 colorA = srcA_16[i]; @@ -8264,99 +4618,27 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA } } -template +template void GPUEngineA::_RenderLine_DispCapture_Blend(const GPUEngineLineInfo &lineInfo, const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt) { const u8 blendEVA = this->_dispCapCnt.EVA; const u8 blendEVB = this->_dispCapCnt.EVB; - if (CAPTURETONATIVEDST) + if (ISCAPTURENATIVE) { -#ifdef ENABLE_SSE2 - const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA); - const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB); - - if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) - { - const u32 *srcA_32 = (const u32 *)srcA; - const u32 *srcB_32 = (const u32 *)srcB; - FragmentColor *dst32 = (FragmentColor *)dst; - - for (size_t i = 0; i < CAPTURELENGTH; i+=4) - { - __m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_32 + i)) : _mm_set_epi32(srcA_32[_gpuDstPitchIndex[i+3]], - srcA_32[_gpuDstPitchIndex[i+2]], - srcA_32[_gpuDstPitchIndex[i+1]], - srcA_32[_gpuDstPitchIndex[i+0]]); - - __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_32 + i)) : _mm_set_epi32(srcB_32[_gpuDstPitchIndex[i+3]], - srcB_32[_gpuDstPitchIndex[i+2]], - srcB_32[_gpuDstPitchIndex[i+1]], - srcB_32[_gpuDstPitchIndex[i+0]]); - - _mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); - } - } - else - { - const u16 *srcA_16 = (const u16 *)srcA; - const u16 *srcB_16 = (const u16 *)srcB; - u16 *dst16 = (u16 *)dst; - - for (size_t i = 0; i < CAPTURELENGTH; i+=8) - { - __m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_16 + i)) : _mm_set_epi16(srcA_16[_gpuDstPitchIndex[i+7]], - srcA_16[_gpuDstPitchIndex[i+6]], - srcA_16[_gpuDstPitchIndex[i+5]], - srcA_16[_gpuDstPitchIndex[i+4]], - srcA_16[_gpuDstPitchIndex[i+3]], - srcA_16[_gpuDstPitchIndex[i+2]], - srcA_16[_gpuDstPitchIndex[i+1]], - srcA_16[_gpuDstPitchIndex[i+0]]); - - __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_16 + i)) : _mm_set_epi16(srcB_16[_gpuDstPitchIndex[i+7]], - srcB_16[_gpuDstPitchIndex[i+6]], - srcB_16[_gpuDstPitchIndex[i+5]], - srcB_16[_gpuDstPitchIndex[i+4]], - srcB_16[_gpuDstPitchIndex[i+3]], - srcB_16[_gpuDstPitchIndex[i+2]], - srcB_16[_gpuDstPitchIndex[i+1]], - srcB_16[_gpuDstPitchIndex[i+0]]); - - _mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); - } - } -#else - for (size_t i = 0; i < CAPTURELENGTH; i++) - { - if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) - { - const FragmentColor colorA = (CAPTUREFROMNATIVESRCA) ? ((const FragmentColor *)srcA)[i] : ((const FragmentColor *)srcA)[_gpuDstPitchIndex[i]]; - const FragmentColor colorB = (CAPTUREFROMNATIVESRCB) ? ((const FragmentColor *)srcB)[i] : ((const FragmentColor *)srcB)[_gpuDstPitchIndex[i]]; - - ((FragmentColor *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); - } - else - { - const u16 colorA = (CAPTUREFROMNATIVESRCA) ? ((u16 *)srcA)[i] : ((u16 *)srcA)[_gpuDstPitchIndex[i]]; - const u16 colorB = (CAPTUREFROMNATIVESRCB) ? 
((u16 *)srcB)[i] : ((u16 *)srcB)[_gpuDstPitchIndex[i]]; - - ((u16 *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); - } - } -#endif + this->_RenderLine_DispCapture_Blend_Buffer(srcA, srcB, dst, blendEVA, blendEVB, CAPTURELENGTH); } else { if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) { - this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * lineInfo.renderCount); + this->_RenderLine_DispCapture_Blend_Buffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * lineInfo.renderCount); } else { for (size_t line = 0; line < lineInfo.renderCount; line++) { - this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt); + this->_RenderLine_DispCapture_Blend_Buffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt); srcA = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcA + lineInfo.widthCustom) : (void *)((u16 *)srcA + lineInfo.widthCustom); srcB = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcB + lineInfo.widthCustom) : (void *)((u16 *)srcB + lineInfo.widthCustom); dst = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)dst + lineInfo.widthCustom) : (void *)((u16 *)dst + lineInfo.widthCustom); @@ -8597,6 +4879,7 @@ void GPUEngineB::RenderLine(const size_t l) GPUSubsystem::GPUSubsystem() { ColorspaceHandlerInit(); + PixelOperation::InitLUTs(); _defaultEventHandler = new GPUEventHandlerDefault; _event = _defaultEventHandler; @@ -9244,7 +5527,7 @@ int GPUSubsystem::Get3DRendererID() void GPUSubsystem::Set3DRendererByID(int rendererID) { Render3DInterface *newRenderInterface = core3DList[rendererID]; - if (newRenderInterface->NDS_3D_Init == NULL) + if ( (newRenderInterface == NULL) || (newRenderInterface->NDS_3D_Init == NULL) ) { return; } @@ -9262,7 +5545,7 @@ bool GPUSubsystem::Change3DRendererByID(int rendererID) this->_needChange3DRenderer = false; Render3DInterface *newRenderInterface = core3DList[rendererID]; - if (newRenderInterface->NDS_3D_Init == NULL) + if ( (newRenderInterface == NULL) || (newRenderInterface->NDS_3D_Init == NULL) ) { return result; } @@ -10357,7 +6640,3 @@ template void GPUEngineBase::ParseReg_BGnY(); template void GPUSubsystem::RenderLine(const size_t l); template void GPUSubsystem::RenderLine(const size_t l); template void GPUSubsystem::RenderLine(const size_t l); - -// These functions are used in gfx3d.cpp -template void CopyLineExpandHinted<0xFFFF, true, false, true, 4>(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer); -template void CopyLineReduceHinted<0xFFFF, false, true, 4>(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 82d460578..473cbeaaf 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -2,7 +2,7 @@ Copyright (C) 2006 yopyop Copyright (C) 2006-2007 Theo Berkau Copyright (C) 2007 shash - Copyright (C) 2009-2019 DeSmuME team + Copyright (C) 2009-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,7 +28,6 @@ #include "./utils/colorspacehandler/colorspacehandler.h" #ifdef ENABLE_SSE2 -#include <emmintrin.h> #include "./utils/colorspacehandler/colorspacehandler_SSE2.h" #endif @@ -40,6 +39,14 @@ #include #endif +#ifdef ENABLE_AVX2 +#include
"./utils/colorspacehandler/colorspacehandler_AVX2.h" +#endif + +#ifdef ENABLE_AVX512_1 +#include "./utils/colorspacehandler/colorspacehandler_AVX512.h" +#endif + // Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the // shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can // then substitute the palignr instruction with an SSE2 equivalent. @@ -1201,8 +1208,9 @@ typedef struct typedef struct { - u8 begin; - u8 trunc; + u8 begin[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + u8 trunc[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + u32 trunc32[GPU_FRAMEBUFFER_NATIVE_WIDTH]; } MosaicTableEntry; typedef struct @@ -1272,34 +1280,21 @@ typedef struct FragmentColor *brightnessDownTable666; FragmentColor *brightnessDownTable888; - bool srcEffectEnable[6]; - bool dstBlendEnable[6]; -#ifdef ENABLE_SSE2 - __m128i srcEffectEnable_SSE2[6]; -#ifdef ENABLE_SSSE3 - __m128i dstBlendEnable_SSSE3; -#else - __m128i dstBlendEnable_SSE2[6]; -#endif -#endif // ENABLE_SSE2 - bool dstAnyBlendEnable; - u8 WIN0_enable[6]; u8 WIN1_enable[6]; u8 WINOUT_enable[6]; u8 WINOBJ_enable[6]; -#if defined(ENABLE_SSE2) - __m128i WIN0_enable_SSE2[6]; - __m128i WIN1_enable_SSE2[6]; - __m128i WINOUT_enable_SSE2[6]; - __m128i WINOBJ_enable_SSE2[6]; -#endif bool WIN0_ENABLED; bool WIN1_ENABLED; bool WINOBJ_ENABLED; bool isAnyWindowEnabled; + u8 srcEffectEnable[6]; + u8 dstBlendEnable[6]; + bool dstAnyBlendEnable; + CACHE_ALIGN u8 dstBlendEnableVecLookup[128]; // Supports up to 1024-bit vectors + MosaicTableEntry *mosaicWidthBG; MosaicTableEntry *mosaicHeightBG; MosaicTableEntry *mosaicWidthOBJ; @@ -1340,32 +1335,26 @@ class GPUEngineBase { protected: - static CACHE_ALIGN u16 _brightnessUpTable555[17][0x8000]; - static CACHE_ALIGN FragmentColor _brightnessUpTable666[17][0x8000]; - static CACHE_ALIGN FragmentColor _brightnessUpTable888[17][0x8000]; - static CACHE_ALIGN u16 _brightnessDownTable555[17][0x8000]; - static CACHE_ALIGN FragmentColor _brightnessDownTable666[17][0x8000]; - static CACHE_ALIGN FragmentColor _brightnessDownTable888[17][0x8000]; - static CACHE_ALIGN u8 _blendTable555[17][17][32][32]; - static const CACHE_ALIGN SpriteSize _sprSizeTab[4][4]; static const CACHE_ALIGN BGLayerSize _BGLayerSizeLUT[8][4]; static const CACHE_ALIGN BGType _mode2type[8][4]; static struct MosaicLookup { - CACHE_ALIGN MosaicTableEntry table[16][256]; + CACHE_ALIGN MosaicTableEntry table[16]; MosaicLookup() { for (size_t m = 0; m < 16; m++) { - for (size_t i = 0; i < 256; i++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) { size_t mosaic = m+1; - MosaicTableEntry &te = table[m][i]; - te.begin = ((i % mosaic) == 0); - te.trunc = (i / mosaic) * mosaic; + MosaicTableEntry &te = table[m]; + + te.begin[i] = ((i % mosaic) == 0); + te.trunc[i] = (i / mosaic) * mosaic; + te.trunc32[i] = te.trunc[i]; } } } @@ -1413,12 +1402,12 @@ protected: itemsForPriority_t _itemsForPriority[NB_PRIORITIES]; struct MosaicColor { - u16 bg[4][256]; + CACHE_ALIGN u16 bg[4][GPU_FRAMEBUFFER_NATIVE_WIDTH + sizeof(u32)]; // Pad this buffer a little bit to avoid buffer overruns with vectorized gather instructions. struct Obj { u16 color; u8 alpha; u8 opaque; - } obj[256]; + } obj[GPU_FRAMEBUFFER_NATIVE_WIDTH]; } _mosaicColors; GPUEngineID _engineID; @@ -1452,7 +1441,6 @@ protected: FragmentColor _asyncClearBackdropColor32; // Do not modify this variable directly. bool _asyncClearUseInternalCustomBuffer; // Do not modify this variable directly.
- void _InitLUTs(); void _Reset_Base(); void _ResortBGLayers(); @@ -1466,12 +1454,19 @@ protected: template void _RenderPixelIterate(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter ¶m, const u32 map, const u32 tile, const u16 *__restrict pal); TILEENTRY _GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask); - template FORCEINLINE void _CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool opaque); + template FORCEINLINE void _CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool isOpaque); + template void _MosaicLine(GPUEngineCompositorInfo &compInfo); + template void _PrecompositeNativeToCustomLineBG(GPUEngineCompositorInfo &compInfo); + template void _CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32); template void _CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom); template void _CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); + template void _CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32); + template size_t _CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom); + template size_t _CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); + template void _RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG); template void _RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter ¶m); template void _RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter ¶m, bool &outUseCustomVRAM); @@ -1487,79 +1482,22 @@ protected: template void _HandleDisplayModeOff(const size_t l); template void _HandleDisplayModeNormal(const size_t l); + template size_t _ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped); + template size_t _ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped); + template void _UpdateWINH(GPUEngineCompositorInfo &compInfo); template bool _IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo); void _PerformWindowTesting(GPUEngineCompositorInfo &compInfo); + void _PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative); template FORCEINLINE void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); template FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); - template FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16); - template FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32); - template FORCEINLINE void _PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const u16 srcColor16); - template FORCEINLINE void 
_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32); - template FORCEINLINE void _PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const u16 srcColor16); - template FORCEINLINE void _PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32); - template FORCEINLINE void _PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode); - template FORCEINLINE void _PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode); - - template FORCEINLINE void _PixelComposite(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode); - template FORCEINLINE void _PixelComposite(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode); - - FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB); - FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable); - template FORCEINLINE FragmentColor _ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB); - - FORCEINLINE u16 _ColorEffectBlend3D(const FragmentColor colA, const u16 colB); - template FORCEINLINE FragmentColor _ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB); - - FORCEINLINE u16 _ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY); - template FORCEINLINE FragmentColor _ColorEffectIncreaseBrightness(const FragmentColor col, const u16 blendEVY); - - FORCEINLINE u16 _ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY); - FORCEINLINE FragmentColor _ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY); - -#ifdef ENABLE_SSE2 - template FORCEINLINE __m128i _ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB); - template FORCEINLINE __m128i _ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB); - template FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY); - template FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY); - template FORCEINLINE void _RenderPixel_CheckWindows16_SSE2(GPUEngineCompositorInfo &compInfo, const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const; - - template FORCEINLINE void _PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelBrightnessUp16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, 
const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - - template - FORCEINLINE void _PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &srcEffectEnableMask, - const __m128i &enableColorEffectMask, - const __m128i &spriteAlpha, - const __m128i &spriteMode, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID); - - template - FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, - const bool didAllPixelsPass, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &srcEffectEnableMask, - const u8 *__restrict enableColorEffectPtr, - const u8 *__restrict sprAlphaPtr, - const u8 *__restrict sprModePtr); -#endif - template FORCEINLINE void _RenderSpriteUpdatePixel(GPUEngineCompositorInfo &compInfo, size_t frameX, const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u8 spriteAlpha, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); + template size_t _RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer, size_t &frameX, size_t &spriteX, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir); @@ -1703,14 +1641,16 @@ protected: u16 _RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB, const u8 blendEVA, const u8 blendEVB); template FragmentColor _RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, 
const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB); -#ifdef ENABLE_SSE2 - template __m128i _RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB); -#endif + template + size_t _RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr); template - void _RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length); // Do not use restrict pointers, since srcB and dst can be the same + void _RenderLine_DispCapture_Blend_Buffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t pixCount); // Do not use restrict pointers, since srcB and dst can be the same - template + template + size_t _RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length); + + template void _RenderLine_DispCapture_Blend(const GPUEngineLineInfo &lineInfo, const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt); // Do not use restrict pointers, since srcB and dst can be the same template void _HandleDisplayModeVRAM(const GPUEngineLineInfo &lineInfo); @@ -1986,20 +1926,6 @@ public: void SetClientData(void *clientData); }; -template -void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, - void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount); - -template -void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer); - -template -void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth, - void *__restrict dstBuffer, const size_t dstLineIndex); - -template -void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer); - extern GPUSubsystem *GPU; extern MMU_struct MMU; diff --git a/desmume/src/GPU_Operations.cpp b/desmume/src/GPU_Operations.cpp new file mode 100644 index 000000000..0916fe1cc --- /dev/null +++ b/desmume/src/GPU_Operations.cpp @@ -0,0 +1,1336 @@ +/* + Copyright (C) 2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#include "GPU_Operations.h" + + +static size_t _gpuLargestDstLineCount = 1; +static size_t _gpuVRAMBlockOffset = GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH; + +static u16 *_gpuDstToSrcIndex = NULL; // Key: Destination pixel index / Value: Source pixel index +static u8 *_gpuDstToSrcSSSE3_u8_8e = NULL; +static u8 *_gpuDstToSrcSSSE3_u8_16e = NULL; +static u8 *_gpuDstToSrcSSSE3_u16_8e = NULL; +static u8 *_gpuDstToSrcSSSE3_u32_4e = NULL; + +static CACHE_ALIGN u32 _gpuDstPitchCount[GPU_FRAMEBUFFER_NATIVE_WIDTH]; // Key: Source pixel index in x-dimension / Value: Number of x-dimension destination pixels for the source pixel +static CACHE_ALIGN u32 _gpuDstPitchIndex[GPU_FRAMEBUFFER_NATIVE_WIDTH]; // Key: Source pixel index in x-dimension / Value: First destination pixel that maps to the source pixel + +u8 PixelOperation::BlendTable555[17][17][32][32]; +u16 PixelOperation::BrightnessUpTable555[17][0x8000]; +FragmentColor PixelOperation::BrightnessUpTable666[17][0x8000]; +FragmentColor PixelOperation::BrightnessUpTable888[17][0x8000]; +u16 PixelOperation::BrightnessDownTable555[17][0x8000]; +FragmentColor PixelOperation::BrightnessDownTable666[17][0x8000]; +FragmentColor PixelOperation::BrightnessDownTable888[17][0x8000]; + +static CACHE_ALIGN ColorOperation colorop; +static CACHE_ALIGN PixelOperation pixelop; + +FORCEINLINE u16 ColorOperation::blend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB) const +{ + u16 ra = colA & 0x001F; + u16 ga = (colA >> 5) & 0x001F; + u16 ba = (colA >> 10) & 0x001F; + u16 rb = colB & 0x001F; + u16 gb = (colB >> 5) & 0x001F; + u16 bb = (colB >> 10) & 0x001F; + + ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16; + ga = ( (ga * blendEVA) + (gb * blendEVB) ) / 16; + ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16; + + ra = (ra > 31) ? 31 : ra; + ga = (ga > 31) ? 31 : ga; + ba = (ba > 31) ? 31 : ba; + + return ra | (ga << 5) | (ba << 10); +} + +FORCEINLINE u16 ColorOperation::blend(const u16 colA, const u16 colB, const TBlendTable *blendTable) const +{ + const u8 r = (*blendTable)[ colA & 0x1F][ colB & 0x1F]; + const u8 g = (*blendTable)[(colA >> 5) & 0x1F][(colB >> 5) & 0x1F]; + const u8 b = (*blendTable)[(colA >> 10) & 0x1F][(colB >> 10) & 0x1F]; + + return r | (g << 5) | (b << 10); +} + +template +FORCEINLINE FragmentColor ColorOperation::blend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB) const +{ + FragmentColor outColor; + + u16 r16 = ( (colA.r * blendEVA) + (colB.r * blendEVB) ) / 16; + u16 g16 = ( (colA.g * blendEVA) + (colB.g * blendEVB) ) / 16; + u16 b16 = ( (colA.b * blendEVA) + (colB.b * blendEVB) ) / 16; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + outColor.r = (r16 > 63) ? 63 : r16; + outColor.g = (g16 > 63) ? 63 : g16; + outColor.b = (b16 > 63) ? 63 : b16; + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + outColor.r = (r16 > 255) ? 255 : r16; + outColor.g = (g16 > 255) ? 255 : g16; + outColor.b = (b16 > 255) ? 
255 : b16; + } + + outColor.a = 0; + return outColor; +} + +FORCEINLINE u16 ColorOperation::blend3D(const FragmentColor colA, const u16 colB) const +{ + const u16 alpha = colA.a + 1; + COLOR c2; + COLOR cfinal; + + c2.val = colB; + + cfinal.bits.red = ((colA.r * alpha) + ((c2.bits.red << 1) * (32 - alpha))) >> 6; + cfinal.bits.green = ((colA.g * alpha) + ((c2.bits.green << 1) * (32 - alpha))) >> 6; + cfinal.bits.blue = ((colA.b * alpha) + ((c2.bits.blue << 1) * (32 - alpha))) >> 6; + cfinal.bits.alpha = 0; + + return cfinal.val; +} + +template +FORCEINLINE FragmentColor ColorOperation::blend3D(const FragmentColor colA, const FragmentColor colB) const +{ + FragmentColor blendedColor; + const u16 alpha = colA.a + 1; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + blendedColor.r = ((colA.r * alpha) + (colB.r * (32 - alpha))) >> 5; + blendedColor.g = ((colA.g * alpha) + (colB.g * (32 - alpha))) >> 5; + blendedColor.b = ((colA.b * alpha) + (colB.b * (32 - alpha))) >> 5; + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + blendedColor.r = ((colA.r * alpha) + (colB.r * (256 - alpha))) >> 8; + blendedColor.g = ((colA.g * alpha) + (colB.g * (256 - alpha))) >> 8; + blendedColor.b = ((colA.b * alpha) + (colB.b * (256 - alpha))) >> 8; + } + + blendedColor.a = 0; + return blendedColor; +} + +FORCEINLINE u16 ColorOperation::increase(const u16 col, const u16 blendEVY) const +{ + u16 r = col & 0x001F; + u16 g = (col >> 5) & 0x001F; + u16 b = (col >> 10) & 0x001F; + + r = (r + ((31 - r) * blendEVY / 16)); + g = (g + ((31 - g) * blendEVY / 16)); + b = (b + ((31 - b) * blendEVY / 16)); + + return r | (g << 5) | (b << 10); +} + +template +FORCEINLINE FragmentColor ColorOperation::increase(const FragmentColor col, const u16 blendEVY) const +{ + FragmentColor newColor; + newColor.color = 0; + + u32 r = col.r; + u32 g = col.g; + u32 b = col.b; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + newColor.r = (r + ((63 - r) * blendEVY / 16)); + newColor.g = (g + ((63 - g) * blendEVY / 16)); + newColor.b = (b + ((63 - b) * blendEVY / 16)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + newColor.r = (r + ((255 - r) * blendEVY / 16)); + newColor.g = (g + ((255 - g) * blendEVY / 16)); + newColor.b = (b + ((255 - b) * blendEVY / 16)); + } + + return newColor; +} + +FORCEINLINE u16 ColorOperation::decrease(const u16 col, const u16 blendEVY) const +{ + u16 r = col & 0x001F; + u16 g = (col >> 5) & 0x001F; + u16 b = (col >> 10) & 0x001F; + + r = (r - (r * blendEVY / 16)); + g = (g - (g * blendEVY / 16)); + b = (b - (b * blendEVY / 16)); + + return r | (g << 5) | (b << 10); +} + +template +FORCEINLINE FragmentColor ColorOperation::decrease(const FragmentColor col, const u16 blendEVY) const +{ + FragmentColor newColor; + newColor.color = 0; + + u32 r = col.r; + u32 g = col.g; + u32 b = col.b; + + newColor.r = (r - (r * blendEVY / 16)); + newColor.g = (g - (g * blendEVY / 16)); + newColor.b = (b - (b * blendEVY / 16)); + + return newColor; +} + +void PixelOperation::InitLUTs() +{ + static bool didInit = false; + + if (didInit) + { + return; + } + + /* + NOTE: gbatek (in the reference above) seems to expect 6-bit values + per component, but as desmume works with 5-bit per component, + we use 31 as top, instead of 63. Testing it on a few games, + using 63 seems to give severe color wrapping, and 31 works + nicely, so for now we'll just use that, until proven wrong. + + I have seen pics of Pokemon Ranger getting white with 31, with 63 it is nice. + It could be a problem of alpha or blending or...
+ + MightyMax> created a test NDS to check how the brightness values work, + and 31 seems to be correct. FactorEx is a override for max brighten/darken + See: http://mightymax.org/gfx_test_brightness.nds + The Pokemon Problem could be a problem with 8/32 bit writes not recognized yet, + i'll add that so you can check back. + */ + + for (u16 i = 0; i <= 16; i++) + { + for (u16 j = 0x0000; j < 0x8000; j++) + { + COLOR cur; + + cur.val = j; + cur.bits.red = (cur.bits.red + ((31 - cur.bits.red) * i / 16)); + cur.bits.green = (cur.bits.green + ((31 - cur.bits.green) * i / 16)); + cur.bits.blue = (cur.bits.blue + ((31 - cur.bits.blue) * i / 16)); + cur.bits.alpha = 0; + PixelOperation::BrightnessUpTable555[i][j] = cur.val; + PixelOperation::BrightnessUpTable666[i][j].color = COLOR555TO666(cur.val); + PixelOperation::BrightnessUpTable888[i][j].color = COLOR555TO888(cur.val); + + cur.val = j; + cur.bits.red = (cur.bits.red - (cur.bits.red * i / 16)); + cur.bits.green = (cur.bits.green - (cur.bits.green * i / 16)); + cur.bits.blue = (cur.bits.blue - (cur.bits.blue * i / 16)); + cur.bits.alpha = 0; + PixelOperation::BrightnessDownTable555[i][j] = cur.val; + PixelOperation::BrightnessDownTable666[i][j].color = COLOR555TO666(cur.val); + PixelOperation::BrightnessDownTable888[i][j].color = COLOR555TO888(cur.val); + } + } + + for (u16 c0 = 0; c0 <= 31; c0++) + { + for (u16 c1 = 0; c1 <= 31; c1++) + { + for (u16 eva = 0; eva <= 16; eva++) + { + for (u16 evb = 0; evb <= 16; evb++) + { + u8 color = (u8)( ((c0 * eva) + (c1 * evb)) / 16 ); + u8 clampedColor = std::min(31, color); + PixelOperation::BlendTable555[eva][evb][c0][c1] = clampedColor; + } + } + } + } + + didInit = true; +} + +template +FORCEINLINE void PixelOperation::_copy16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + dstColor16 = srcColor16 | 0x8000; + break; + + case NDSColorFormat_BGR666_Rev: + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); + break; + + case NDSColorFormat_BGR888_Rev: + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); + break; + } + + if (!ISDEBUGRENDER) + { + dstLayerID = compInfo.renderState.selectedLayerID; + } +} + +template +FORCEINLINE void PixelOperation::_copy32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + dstColor16 = ColorspaceConvert6665To5551(srcColor32); + dstColor16 = dstColor16 | 0x8000; + break; + + case NDSColorFormat_BGR666_Rev: + dstColor32 = srcColor32; + dstColor32.a = 0x1F; + break; + + case NDSColorFormat_BGR888_Rev: + dstColor32 = srcColor32; + dstColor32.a = 0xFF; + break; + + default: + return; + } + + if (!ISDEBUGRENDER) + { + dstLayerID = compInfo.renderState.selectedLayerID; + } +} + +template +FORCEINLINE void PixelOperation::_brightnessUp16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + dstColor16 = 
compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF] | 0x8000; + break; + + case NDSColorFormat_BGR666_Rev: + dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF]; + dstColor32.a = 0x1F; + break; + + case NDSColorFormat_BGR888_Rev: + dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF]; + dstColor32.a = 0xFF; + break; + } + + dstLayerID = compInfo.renderState.selectedLayerID; +} + +template +FORCEINLINE void PixelOperation::_brightnessUp32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); + dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; + dstColor16 = dstColor16 | 0x8000; + } + else + { + dstColor32 = colorop.increase(srcColor32, compInfo.renderState.blendEVY); + dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; + } + + dstLayerID = compInfo.renderState.selectedLayerID; +} + +template +FORCEINLINE void PixelOperation::_brightnessDown16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF] | 0x8000; + break; + + case NDSColorFormat_BGR666_Rev: + dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF]; + dstColor32.a = 0x1F; + break; + + case NDSColorFormat_BGR888_Rev: + dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF]; + dstColor32.a = 0xFF; + break; + } + + dstLayerID = compInfo.renderState.selectedLayerID; +} + +template +FORCEINLINE void PixelOperation::_brightnessDown32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const +{ + u16 &dstColor16 = *compInfo.target.lineColor16; + FragmentColor &dstColor32 = *compInfo.target.lineColor32; + u8 &dstLayerID = *compInfo.target.lineLayerID; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); + dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; + dstColor16 = dstColor16 | 0x8000; + } + else + { + dstColor32 = colorop.decrease(srcColor32, compInfo.renderState.blendEVY); + dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; + } + + dstLayerID = compInfo.renderState.selectedLayerID; +} + +template +FORCEINLINE void PixelOperation::__selectedEffect(const GPUEngineCompositorInfo &compInfo, const u8 &dstLayerID, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode, ColorEffect &selectedEffect, TBlendTable **selectedBlendTable, u8 &blendEVA, u8 &blendEVB) const +{ + const bool dstTargetBlendEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; + + // 3D rendering has a special override: If the destination pixel is set to blend, then always blend. + // Test case: When starting a stage in Super Princess Peach, the screen will be solid black unless + // blending is forced here. 
+	//
+	// This behavior must take priority over checking for the window color effect enable flag.
+	// Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless
+	// blend forcing takes priority.
+	bool forceDstTargetBlend = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnable : false;
+
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		// Translucent-capable OBJ force the function to blend when the second target is satisfied.
+		const bool isObjTranslucentType = (spriteMode == OBJMode_Transparent) || (spriteMode == OBJMode_Bitmap);
+		if (isObjTranslucentType && dstTargetBlendEnable)
+		{
+			// OBJ without fine-grained alpha use EVA/EVB for blending. This is signified by receiving 0xFF in the alpha.
+			// Test cases:
+			// * The spriteblend demo
+			// * Glory of Heracles - fairy on the title screen
+			// * Phoenix Wright: Ace Attorney - character fade-in/fade-out
+			if (spriteAlpha != 0xFF)
+			{
+				blendEVA = spriteAlpha;
+				blendEVB = 16 - spriteAlpha;
+				*selectedBlendTable = &PixelOperation::BlendTable555[blendEVA][blendEVB];
+			}
+
+			forceDstTargetBlend = true;
+		}
+	}
+
+	if (forceDstTargetBlend)
+	{
+		selectedEffect = ColorEffect_Blend;
+	}
+	else
+	{
+		// If we're not forcing blending, then select the color effect based on the BLDCNT target flags.
+		if (enableColorEffect && compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID])
+		{
+			switch (compInfo.renderState.colorEffect)
+			{
+				// For the Blend effect, both the first and second target flags must be checked.
+				case ColorEffect_Blend:
+				{
+					if (dstTargetBlendEnable) selectedEffect = compInfo.renderState.colorEffect;
+					break;
+				}
+
+				// For the Increase/Decrease Brightness effects, only the first target flag needs to be checked.
+				// Test case: Bomberman Land Touch! dialog boxes will render too dark without this check.
+				case ColorEffect_IncreaseBrightness:
+				case ColorEffect_DecreaseBrightness:
+					selectedEffect = compInfo.renderState.colorEffect;
+					break;
+
+				default:
+					break;
+			}
+		}
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation::_unknownEffect16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const
+{
+	u8 &dstLayerID = *compInfo.target.lineLayerID;
+	TBlendTable *selectedBlendTable = compInfo.renderState.blendTable555;
+	u8 blendEVA = compInfo.renderState.blendEVA;
+	u8 blendEVB = compInfo.renderState.blendEVB;
+	ColorEffect selectedEffect = ColorEffect_Disable;
+
+	this->__selectedEffect<LAYERTYPE>(compInfo, dstLayerID, enableColorEffect, spriteAlpha, spriteMode, selectedEffect, &selectedBlendTable, blendEVA, blendEVB);
+
+	// Render the pixel using the selected color effect.
+	dstLayerID = compInfo.renderState.selectedLayerID;
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		u16 &dstColor16 = *compInfo.target.lineColor16;
+
+		switch (selectedEffect)
+		{
+			case ColorEffect_Disable:
+				dstColor16 = srcColor16;
+				break;
+
+			case ColorEffect_IncreaseBrightness:
+				dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
+				break;
+
+			case ColorEffect_DecreaseBrightness:
+				dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
+				break;
+
+			case ColorEffect_Blend:
+			{
+				if (LAYERTYPE == GPULayerType_3D)
+				{
+					//dstColor16 = colorop.blend3D(srcColor16, dstColor16);
+					printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffect32() method instead.\n");
+					assert(false);
+				}
+				else
+				{
+					dstColor16 = colorop.blend(srcColor16, dstColor16, selectedBlendTable);
+				}
+				break;
+			}
+		}
+
+		dstColor16 |= 0x8000;
+	}
+	else
+	{
+		FragmentColor &dstColor32 = *compInfo.target.lineColor32;
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			switch (selectedEffect)
+			{
+				case ColorEffect_Disable:
+					dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16);
+					break;
+
+				case ColorEffect_IncreaseBrightness:
+					dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF];
+					break;
+
+				case ColorEffect_DecreaseBrightness:
+					dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF];
+					break;
+
+				case ColorEffect_Blend:
+				{
+					FragmentColor srcColor32;
+					srcColor32.color = ColorspaceConvert555To6665Opaque(srcColor16);
+					dstColor32 = (LAYERTYPE == GPULayerType_3D) ? colorop.blend3D<OUTPUTFORMAT>(srcColor32, dstColor32) : colorop.blend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
+					break;
+				}
+			}
+		}
+		else
+		{
+			switch (selectedEffect)
+			{
+				case ColorEffect_Disable:
+					dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16);
+					break;
+
+				case ColorEffect_IncreaseBrightness:
+					dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF];
+					break;
+
+				case ColorEffect_DecreaseBrightness:
+					dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF];
+					break;
+
+				case ColorEffect_Blend:
+				{
+					FragmentColor srcColor32;
+					srcColor32.color = ColorspaceConvert555To8888Opaque(srcColor16);
+					dstColor32 = (LAYERTYPE == GPULayerType_3D) ? colorop.blend3D<OUTPUTFORMAT>(srcColor32, dstColor32) : colorop.blend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
+					break;
+				}
+			}
+		}
+
+		dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF;
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation::_unknownEffect32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const
+{
+	u8 &dstLayerID = *compInfo.target.lineLayerID;
+	TBlendTable *selectedBlendTable = compInfo.renderState.blendTable555;
+	u8 blendEVA = compInfo.renderState.blendEVA;
+	u8 blendEVB = compInfo.renderState.blendEVB;
+	ColorEffect selectedEffect = ColorEffect_Disable;
+
+	this->__selectedEffect<LAYERTYPE>(compInfo, dstLayerID, enableColorEffect, spriteAlpha, spriteMode, selectedEffect, &selectedBlendTable, blendEVA, blendEVB);
+
+	// Render the pixel using the selected color effect.
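+	// Note that unlike _unknownEffect16() above, the BGR555 blend path below can
+	// composite a 3D source directly: colorop.blend3D() provides an overload that
+	// takes a 32-bit source color together with a 16-bit destination color.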
+	dstLayerID = compInfo.renderState.selectedLayerID;
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32);
+		u16 &dstColor16 = *compInfo.target.lineColor16;
+
+		switch (selectedEffect)
+		{
+			case ColorEffect_Disable:
+				dstColor16 = srcColor16;
+				break;
+
+			case ColorEffect_IncreaseBrightness:
+				dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
+				break;
+
+			case ColorEffect_DecreaseBrightness:
+				dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
+				break;
+
+			case ColorEffect_Blend:
+			{
+				if (LAYERTYPE == GPULayerType_3D)
+				{
+					dstColor16 = colorop.blend3D(srcColor32, dstColor16);
+				}
+				else
+				{
+					dstColor16 = colorop.blend(srcColor16, dstColor16, selectedBlendTable);
+				}
+				break;
+			}
+		}
+
+		dstColor16 |= 0x8000;
+	}
+	else
+	{
+		FragmentColor &dstColor32 = *compInfo.target.lineColor32;
+
+		switch (selectedEffect)
+		{
+			case ColorEffect_Disable:
+				dstColor32 = srcColor32;
+				break;
+
+			case ColorEffect_IncreaseBrightness:
+				dstColor32 = colorop.increase<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
+				break;
+
+			case ColorEffect_DecreaseBrightness:
+				dstColor32 = colorop.decrease<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
+				break;
+
+			case ColorEffect_Blend:
+				dstColor32 = (LAYERTYPE == GPULayerType_3D) ? colorop.blend3D<OUTPUTFORMAT>(srcColor32, dstColor32) : colorop.blend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
+				break;
+		}
+
+		dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF;
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation::Composite16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const
+{
+	switch (COMPOSITORMODE)
+	{
+		case GPUCompositorMode_Debug:
+			this->_copy16<OUTPUTFORMAT, true>(compInfo, srcColor16);
+			break;
+
+		case GPUCompositorMode_Copy:
+			this->_copy16<OUTPUTFORMAT, false>(compInfo, srcColor16);
+			break;
+
+		case GPUCompositorMode_BrightUp:
+			this->_brightnessUp16<OUTPUTFORMAT>(compInfo, srcColor16);
+			break;
+
+		case GPUCompositorMode_BrightDown:
+			this->_brightnessDown16<OUTPUTFORMAT>(compInfo, srcColor16);
+			break;
+
+		default:
+			this->_unknownEffect16<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor16, enableColorEffect, spriteAlpha, (OBJMode)spriteMode);
+			break;
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation::Composite32(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const
+{
+	switch (COMPOSITORMODE)
+	{
+		case GPUCompositorMode_Debug:
+			this->_copy32<OUTPUTFORMAT, true>(compInfo, srcColor32);
+			break;
+
+		case GPUCompositorMode_Copy:
+			this->_copy32<OUTPUTFORMAT, false>(compInfo, srcColor32);
+			break;
+
+		case GPUCompositorMode_BrightUp:
+			this->_brightnessUp32<OUTPUTFORMAT>(compInfo, srcColor32);
+			break;
+
+		case GPUCompositorMode_BrightDown:
+			this->_brightnessDown32<OUTPUTFORMAT>(compInfo, srcColor32);
+			break;
+
+		default:
+			this->_unknownEffect32<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor32, enableColorEffect, spriteAlpha, (OBJMode)spriteMode);
+			break;
+	}
+}
+
+template <s32 ELEMENTSIZE>
+static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead, size_t lineWidth, size_t lineCount)
+{
+	u8 *__restrict dst = (u8 *)dstLineHead + (lineWidth * ELEMENTSIZE);
+
+	for (size_t line = 1; line < lineCount; line++)
+	{
+		memcpy(dst, dstLineHead, lineWidth * ELEMENTSIZE);
+		dst += (lineWidth * ELEMENTSIZE);
+	}
+}
+
+#if defined(ENABLE_AVX2)
+	#include "GPU_Operations_AVX2.cpp"
+#elif defined(ENABLE_SSE2)
+	#include "GPU_Operations_SSE2.cpp"
+#else
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src,
size_t dstWidth, size_t dstLineCount) +{ + // Use INTEGERSCALEHINT to provide a hint to CopyLineExpand() for the fastest execution path. + // INTEGERSCALEHINT represents the scaling value of the framebuffer width, and is always + // assumed to be a positive integer. + // + // Use cases: + // - Passing a value of 0 causes CopyLineExpand() to perform a simple copy, using dstWidth + // to copy dstWidth elements. + // - Passing a value of 1 causes CopyLineExpand() to perform a simple copy, ignoring dstWidth + // and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements. + // - Passing any negative value causes CopyLineExpand() to assume that the framebuffer width + // is NOT scaled by an integer value, and will therefore take the safest (but slowest) + // execution path. + // - Passing any positive value greater than 1 causes CopyLineExpand() to expand the line + // using the integer scaling value. + + if (INTEGERSCALEHINT == 0) + { +#if defined(MSB_FIRST) + if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) + { + for (size_t i = 0; i < dstWidth; i++) + { + if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); + } + } + } + else +#endif + { + memcpy(dst, src, dstWidth * ELEMENTSIZE); + } + } + else if (INTEGERSCALEHINT == 1) + { +#if defined(MSB_FIRST) + if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) + { + if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] ); + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] ); + } + } + } + else +#endif + { + memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE); + } + } + else if (INTEGERSCALEHINT > 1) + { + const size_t S = INTEGERSCALEHINT; + + if (SCALEVERTICAL) + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t q = 0; q < S; q++) + { + for (size_t p = 0; p < S; p++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = ( (u8 *)src)[x]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[(q * (GPU_FRAMEBUFFER_NATIVE_WIDTH * S)) + ((x * S) + p)] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; + } + } + } + } + } + else + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t p = 0; p < S; p++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[(x * S) + p] = ( (u8 *)src)[x]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[(x * S) + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[(x * S) + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; + } + } + } + } + } + else + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? 
LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x];
+				}
+			}
+		}
+
+		if (SCALEVERTICAL)
+		{
+			CopyLinesForVerticalCount<ELEMENTSIZE>(dst, dstWidth, dstLineCount);
+		}
+	}
+}
+
+template <s32 INTEGERSCALEHINT, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src, size_t srcWidth)
+{
+	// Use INTEGERSCALEHINT to provide a hint to CopyLineReduce() for the fastest execution path.
+	// INTEGERSCALEHINT represents the scaling value of the source framebuffer width, and is always
+	// assumed to be a positive integer.
+	//
+	// Use cases:
+	// - Passing a value of 0 causes CopyLineReduce() to perform a simple copy, using srcWidth
+	//   to copy srcWidth elements.
+	// - Passing a value of 1 causes CopyLineReduce() to perform a simple copy, ignoring srcWidth
+	//   and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements.
+	// - Passing any negative value causes CopyLineReduce() to assume that the framebuffer width
+	//   is NOT scaled by an integer value, and will therefore take the safest (but slowest)
+	//   execution path.
+	// - Passing any positive value greater than 1 causes CopyLineReduce() to reduce the line
+	//   using the integer scaling value.
+
+	if (INTEGERSCALEHINT == 0)
+	{
+#if defined(MSB_FIRST)
+		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1))
+		{
+			for (size_t i = 0; i < srcWidth; i++)
+			{
+				if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
+				}
+			}
+		}
+		else
+#endif
+		{
+			memcpy(dst, src, srcWidth * ELEMENTSIZE);
+		}
+	}
+	else if (INTEGERSCALEHINT == 1)
+	{
+#if defined(MSB_FIRST)
+		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1))
+		{
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
+			{
+				if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
+				}
+			}
+		}
+		else
+#endif
+		{
+			memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE);
+		}
+	}
+	else if (INTEGERSCALEHINT > 1)
+	{
+		const size_t scale = srcWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			if (ELEMENTSIZE == 1)
+			{
+				((u8 *)dst)[x] = ((u8 *)src)[x * scale];
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				((u16 *)dst)[x] = ((u16 *)src)[x * scale];
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				((u32 *)dst)[x] = ((u32 *)src)[x * scale];
+			}
+		}
+	}
+	else
+	{
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
+		{
+			if (ELEMENTSIZE == 1)
+			{
+				( (u8 *)dst)[i] = ((u8 *)src)[_gpuDstPitchIndex[i]];
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				((u16 *)dst)[i] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[_gpuDstPitchIndex[i]] ) : ((u16 *)src)[_gpuDstPitchIndex[i]];
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				((u32 *)dst)[i] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[_gpuDstPitchIndex[i]] ) : ((u32 *)src)[_gpuDstPitchIndex[i]];
+			}
+		}
+	}
+}
+
+template <bool ISFIRSTLINE>
+void GPUEngineBase::_MosaicLine(GPUEngineCompositorInfo &compInfo)
+{
+	u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID];
+	u16 outColor16;
+	bool isOpaque;
+
+	for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+	{
+		if (ISFIRSTLINE && (compInfo.renderState.mosaicWidthBG->begin[x] != 0))
+		{
+			isOpaque = (this->_deferredIndexNative[x] != 0);
+			outColor16 = (isOpaque) ?
(this->_deferredColorNative[x] & 0x7FFF) : 0xFFFF; + mosaicColorBG[x] = outColor16; + } + else + { + outColor16 = mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc[x]]; + } + + isOpaque = (outColor16 != 0xFFFF); + if (isOpaque) + { + this->_deferredColorNative[x] = outColor16; + } + } +} + +template +void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32) +{ + // Do nothing. This is a placeholder for a manually vectorized version of this method. +} + +template +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +{ + // Do nothing. This is a placeholder for a manually vectorized version of this method. + return 0; +} + +template +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +{ + // Do nothing. This is a placeholder for a manually vectorized version of this method. + return 0; +} + +template +size_t GPUEngineBase::_RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer, + size_t &frameX, size_t &spriteX, + u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab) +{ + // Do nothing. This is a placeholder for a manually vectorized version of this method. + return 0; +} + +void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative) +{ + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) + { + // Window 0 has the highest priority, so always check this first. + if (win0 != NULL) + { + if (win0[i] != 0) + { + didPassWindowTestNative[i] = compInfo.renderState.WIN0_enable[layerID]; + enableColorEffectNative[i] = compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG]; + continue; + } + } + + // Window 1 has medium priority, and is checked after Window 0. + if (win1 != NULL) + { + if (win1[i] != 0) + { + didPassWindowTestNative[i] = compInfo.renderState.WIN1_enable[layerID]; + enableColorEffectNative[i] = compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG]; + continue; + } + } + + // Window OBJ has low priority, and is checked after both Window 0 and Window 1. + if (winObj != NULL) + { + if (winObj[i] != 0) + { + didPassWindowTestNative[i] = compInfo.renderState.WINOBJ_enable[layerID]; + enableColorEffectNative[i] = compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]; + continue; + } + } + + // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. + // This has the lowest priority, and is always checked last. + didPassWindowTestNative[i] = compInfo.renderState.WINOUT_enable[layerID]; + enableColorEffectNative[i] = compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]; + } +} + +template +size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr) +{ + // Do nothing. This is a placeholder for a manually vectorized version of this method. 
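+	// Returning 0 indicates that no pixels were processed here, which makes the
+	// caller composite the entire line with generic scalar code. A manually
+	// vectorized implementation of one of these _LoopOp methods instead returns
+	// the number of pixels it has already processed, so the caller only needs to
+	// handle the remainder.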
+	return 0;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineA::_RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length)
+{
+	// Do nothing. This is a placeholder for a manually vectorized version of this method.
+	return 0;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineBase::_ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	// Do nothing. This is a placeholder for a manually vectorized version of this method.
+	return 0;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineBase::_ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	// Do nothing. This is a placeholder for a manually vectorized version of this method.
+	return 0;
+}
+
+#endif
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex,
+                          void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount)
+{
+	switch (INTEGERSCALEHINT)
+	{
+		case 0:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			CopyLineExpand<0, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLineWidth * dstLineCount, 1);
+			break;
+		}
+
+		case 1:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			CopyLineExpand<1, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH, 1);
+			break;
+		}
+
+		default:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * dstLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			// TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized.
+			//
+			// The implementation below is a stopgap measure for getting the faster code paths to run.
+			// However, this setup is not ideal, since the code size will greatly increase in order to
+			// include all possible code paths, possibly causing cache misses on lesser CPUs.
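+			// For example, a 2x-scaled framebuffer gives dstLineWidth == GPU_FRAMEBUFFER_NATIVE_WIDTH * 2
+			// (512 pixels), which dispatches to the compile-time instance
+			// CopyLineExpand<2, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2, 2).
+			// Native-width multiples beyond 16x resolve to the generic <0xFFFF> instance, and
+			// non-integer widths fall back to the safe (but slow) <-1> instance.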
+			switch (dstLineWidth)
+			{
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2):
+					CopyLineExpand<2, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2, 2);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3):
+					CopyLineExpand<3, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3, 3);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4):
+					CopyLineExpand<4, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4, 4);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 5):
+					CopyLineExpand<5, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 5, 5);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 6):
+					CopyLineExpand<6, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 6, 6);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 7):
+					CopyLineExpand<7, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 7, 7);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 8):
+					CopyLineExpand<8, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 8, 8);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 9):
+					CopyLineExpand<9, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 9, 9);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 10):
+					CopyLineExpand<10, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 10, 10);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 11):
+					CopyLineExpand<11, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 11, 11);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 12):
+					CopyLineExpand<12, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 12, 12);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 13):
+					CopyLineExpand<13, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 13, 13);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 14):
+					CopyLineExpand<14, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 14, 14);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 15):
+					CopyLineExpand<15, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 15, 15);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 16):
+					CopyLineExpand<16, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 16, 16);
+					break;
+
+				default:
+				{
+					if ((dstLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0)
+					{
+						CopyLineExpand<0xFFFF, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLineWidth, dstLineCount);
+					}
+					else
+					{
+						CopyLineExpand<-1, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLineWidth, dstLineCount);
+					}
+					break;
+				}
+			}
+			break;
+		}
+	}
+}
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer)
+{
+	CopyLineExpandHinted<INTEGERSCALEHINT, SCALEVERTICAL, USELINEINDEX, NEEDENDIANSWAP, ELEMENTSIZE>(srcBuffer, lineInfo.indexNative,
+	                                                                                                dstBuffer, lineInfo.indexCustom, lineInfo.widthCustom, lineInfo.renderCount);
+}
+
+template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth,
+                          void *__restrict dstBuffer, const size_t dstLineIndex)
+{
+	switch (INTEGERSCALEHINT)
+	{
+		case 0:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			CopyLineReduce<0, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, srcLineWidth);
+			break;
+		}
+
+		case 1:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			CopyLineReduce<1, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+			break;
+		}
+
+		default:
+		{
+			const u8 *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (srcLineIndex * srcLineWidth * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (dstLineIndex * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer;
+
+			// TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized.
+			//
+			// The implementation below is a stopgap measure for getting the faster code paths to run.
+			// However, this setup is not ideal, since the code size will greatly increase in order to
+			// include all possible code paths, possibly causing cache misses on lesser CPUs.
+			switch (srcLineWidth)
+			{
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2):
+					CopyLineReduce<2, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3):
+					CopyLineReduce<3, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
+					break;
+
+				case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4):
+					CopyLineReduce<4, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
+					break;
+
+				default:
+				{
+					if ((srcLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0)
+					{
+						CopyLineReduce<0xFFFF, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, srcLineWidth);
+					}
+					else
+					{
+						CopyLineReduce<-1, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, srcLineWidth);
+					}
+					break;
+				}
+			}
+			break;
+		}
+	}
+}
+
+template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer)
+{
+	CopyLineReduceHinted<INTEGERSCALEHINT, USELINEINDEX, NEEDENDIANSWAP, ELEMENTSIZE>(srcBuffer, lineInfo.indexCustom, lineInfo.widthCustom,
+	                                                                                 dstBuffer, lineInfo.indexNative);
+}
+
+// These functions are used in gfx3d.cpp
+template void CopyLineExpandHinted<0xFFFF, true, false, true, 4>(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
+template void CopyLineReduceHinted<0xFFFF, false, true, 4>(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
diff --git a/desmume/src/GPU_Operations.h b/desmume/src/GPU_Operations.h
new file mode 100644
index 000000000..de65f2d6e
--- /dev/null
+++ b/desmume/src/GPU_Operations.h
@@ -0,0 +1,96 @@
+/*
+	Copyright (C) 2021 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef GPU_OPERATIONS_H
+#define GPU_OPERATIONS_H
+
+#include <stddef.h>
+
+#include "types.h"
+#include "./utils/colorspacehandler/colorspacehandler.h"
+
+#include "GPU.h"
+
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex,
+                          void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount);
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
+
+template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth,
+                          void *__restrict dstBuffer, const size_t dstLineIndex);
+
+template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
+
+class ColorOperation
+{
+public:
+	ColorOperation() {};
+
+	FORCEINLINE u16 blend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB) const;
+	FORCEINLINE u16 blend(const u16 colA, const u16 colB, const TBlendTable *blendTable) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor blend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB) const;
+
+	FORCEINLINE u16 blend3D(const FragmentColor colA, const u16 colB) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor blend3D(const FragmentColor colA, const FragmentColor colB) const;
+
+	FORCEINLINE u16 increase(const u16 col, const u16 blendEVY) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor increase(const FragmentColor col, const u16 blendEVY) const;
+
+	FORCEINLINE u16 decrease(const u16 col, const u16 blendEVY) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor decrease(const FragmentColor col, const u16 blendEVY) const;
+};
+
+class PixelOperation
+{
+private:
+	template <GPULayerType LAYERTYPE> FORCEINLINE void __selectedEffect(const GPUEngineCompositorInfo &compInfo, const u8 &dstLayerID, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode, ColorEffect &selectedEffect, TBlendTable **selectedBlendTable, u8 &blendEVA, u8 &blendEVB) const;
+
+protected:
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
+
+	template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _unknownEffect16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const;
+	template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _unknownEffect32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const;
+
+public:
+	static CACHE_ALIGN u8 BlendTable555[17][17][32][32];
+	static CACHE_ALIGN u16 BrightnessUpTable555[17][0x8000];
+	static CACHE_ALIGN FragmentColor BrightnessUpTable666[17][0x8000];
+	static CACHE_ALIGN FragmentColor BrightnessUpTable888[17][0x8000];
+	static CACHE_ALIGN u16 BrightnessDownTable555[17][0x8000];
+	static CACHE_ALIGN FragmentColor BrightnessDownTable666[17][0x8000];
+	static CACHE_ALIGN FragmentColor BrightnessDownTable888[17][0x8000];
+
+	static void InitLUTs();
+
+	PixelOperation() {};
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const;
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const;
+};
+
+#endif // GPU_OPERATIONS_H
diff --git a/desmume/src/GPU_Operations_AVX2.cpp b/desmume/src/GPU_Operations_AVX2.cpp
new file mode 100644
index 000000000..9b4bbfdf6
--- /dev/null
+++ b/desmume/src/GPU_Operations_AVX2.cpp
@@ -0,0 +1,3130 @@
+/*
+	Copyright (C) 2021 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef ENABLE_AVX2
+	#error This code requires AVX2 support.
+	#warning This error might occur if this file is compiled directly. Do not compile this file directly, as it is already included in GPU_Operations.cpp.
+#else
+
+#include "GPU_Operations_AVX2.h"
+
+
+static const ColorOperation_AVX2 colorop_vec;
+static const PixelOperation_AVX2 pixelop_vec;
+
+template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool NEEDENDIANSWAP, s32 ELEMENTSIZE>
+static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount)
+{
+	if (INTEGERSCALEHINT == 0)
+	{
+		memcpy(dst, src, dstWidth * ELEMENTSIZE);
+	}
+	else if (INTEGERSCALEHINT == 1)
+	{
+		MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256s8) / ELEMENTSIZE), _mm256_store_si256((v256s8 *)dst + (X), _mm256_load_si256((v256s8 *)src + (X))) );
+	}
+	else if (INTEGERSCALEHINT == 2)
+	{
+		__m256i srcPix;
+		__m256i srcPixOut[2];
+
+		switch (ELEMENTSIZE)
+		{
+			case 1:
+			{
+				if (SCALEVERTICAL)
+				{
+					MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u8) / ELEMENTSIZE), \
+							   srcPix = _mm256_load_si256((v256u8 *)((v256u8 *)src + (X))); \
+							   srcPix = _mm256_permute4x64_epi64(srcPix, 0xD8); \
+							   srcPixOut[0] = _mm256_unpacklo_epi8(srcPix, srcPix); \
+							   srcPixOut[1] = _mm256_unpackhi_epi8(srcPix, srcPix); \
+							   _mm256_store_si256((v256u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u8) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \
+							   _mm256_store_si256((v256u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u8) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \
+							   _mm256_store_si256((v256u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u8) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \
+							   _mm256_store_si256((v256u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u8) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \
+							   );
+				}
+				else
+				{
+					MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u8) / ELEMENTSIZE), \
+							   srcPix = _mm256_load_si256((v256u8 *)((v256u8 *)src + (X))); \
+							   srcPix = _mm256_permute4x64_epi64(srcPix, 0xD8); \
+							   srcPixOut[0] = _mm256_unpacklo_epi8(srcPix, srcPix); \
+							   srcPixOut[1] = _mm256_unpackhi_epi8(srcPix, srcPix); \
+							   _mm256_store_si256((v256u8
*)dst + ((X) * 2) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u8 *)dst + ((X) * 2) + 1, srcPixOut[1]); \ + ); + } + break; + } + + case 2: + { + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u16) / ELEMENTSIZE), \ + srcPix = _mm256_load_si256((v256u16 *)((v256u16 *)src + (X))); \ + srcPix = _mm256_permute4x64_epi64(srcPix, 0xD8); \ + srcPixOut[0] = _mm256_unpacklo_epi16(srcPix, srcPix); \ + srcPixOut[1] = _mm256_unpackhi_epi16(srcPix, srcPix); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u16) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u16) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u16) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u16) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u16) / ELEMENTSIZE), \ + srcPix = _mm256_load_si256((v256u16 *)((v256u16 *)src + (X))); \ + srcPix = _mm256_permute4x64_epi64(srcPix, 0xD8); \ + srcPixOut[0] = _mm256_unpacklo_epi16(srcPix, srcPix); \ + srcPixOut[1] = _mm256_unpackhi_epi16(srcPix, srcPix); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u16 *)dst + ((X) * 2) + 1, srcPixOut[1]); \ + ); + } + break; + } + + case 4: + { + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u32) / ELEMENTSIZE), \ + srcPix = _mm256_load_si256((v256u32 *)((v256u32 *)src + (X))); \ + srcPixOut[0] = _mm256_permutevar8x32_epi32(srcPix, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); \ + srcPixOut[1] = _mm256_permutevar8x32_epi32(srcPix, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u32) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u32) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u32) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v256u32) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256u32) / ELEMENTSIZE), \ + srcPix = _mm256_load_si256((v256u32 *)((v256u32 *)src + (X))); \ + srcPixOut[0] = _mm256_permutevar8x32_epi32(srcPix, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); \ + srcPixOut[1] = _mm256_permutevar8x32_epi32(srcPix, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + 0, srcPixOut[0]); \ + _mm256_store_si256((v256u32 *)dst + ((X) * 2) + 1, srcPixOut[1]); \ + ); + } + break; + } + } + } + else if (INTEGERSCALEHINT == 3) + { + __m256i srcPixOut[3]; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX); + + if (ELEMENTSIZE == 1) + { + const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44); + const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE); + + srcPixOut[0] = _mm256_shuffle_epi8(src8lo, 
_mm256_set_epi8(10,10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0)); + srcPixOut[1] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(21,20,20,20,19,19,19,18,18,18,17,17,17,16,16,16, 15,15,15,14,14,14,13,13,13,12,12,12,11,11,11,10)); + srcPixOut[2] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,30,30,30,29,29,29,28,28,28,27,27,27,26, 26,26,25,25,25,24,24,24,23,23,23,22,22,22,21,21)); + } + else if (ELEMENTSIZE == 2) + { + const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44); + const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE); + + srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(11,10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0)); + srcPixOut[1] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(21,20,21,20,19,18,19,18,19,18,17,16,17,16,17,16, 15,14,15,14,15,14,13,12,13,12,13,12,11,10,11,10)); + srcPixOut[2] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,29,28,29,28,29,28,27,26,27,26, 27,26,25,24,25,24,25,24,23,22,23,22,23,22,21,20)); + } + else if (ELEMENTSIZE == 4) + { + srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(2, 2, 1, 1, 1, 0, 0, 0)); + srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(5, 4, 4, 4, 3, 3, 3, 2)); + srcPixOut[2] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(7, 7, 7, 6, 6, 6, 5, 5)); + } + + for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++) + { + _mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]); + } + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++) + { + for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++) + { + _mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]); + } + } + } + } + } + else if (INTEGERSCALEHINT == 4) + { + __m256i srcPixOut[4]; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX); + + if (ELEMENTSIZE == 1) + { + const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44); + const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE); + + srcPixOut[0] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0)); + srcPixOut[1] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(15,15,15,15,14,14,14,14,13,13,13,13,12,12,12,12, 11,11,11,11,10,10,10,10, 9, 9, 9, 9, 8, 8, 8, 8)); + srcPixOut[2] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(23,23,23,23,22,22,22,22,21,21,21,21,20,20,20,20, 19,19,19,19,18,18,18,18,17,17,17,17,16,16,16,16)); + srcPixOut[3] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,31,30,30,30,30,29,29,29,29,28,28,28,28, 27,27,27,27,26,26,26,26,25,25,25,25,24,24,24,24)); + } + else if (ELEMENTSIZE == 2) + { + const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44); + const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE); + + srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0)); + srcPixOut[1] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(15,14,15,14,15,14,15,14,13,12,13,12,13,12,13,12, 11,10,11,10,11,10,11,10, 9, 8, 9, 8, 9, 8, 9, 8)); + srcPixOut[2] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(23,22,23,22,23,22,23,22,21,20,21,20,21,20,21,20, 
19,18,19,18,19,18,19,18,17,16,17,16,17,16,17,16));
+				srcPixOut[3] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,31,30,29,28,29,28,29,28,29,28, 27,26,27,26,27,26,27,26,25,24,25,24,25,24,25,24));
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
+				srcPixOut[2] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4));
+				srcPixOut[3] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6));
+			}
+
+			for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+			{
+				_mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]);
+			}
+
+			if (SCALEVERTICAL)
+			{
+				for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++)
+				{
+					for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+					{
+						_mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]);
+					}
+				}
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 5)
+	{
+		__m256i srcPixOut[5];
+
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT)
+		{
+			const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX);
+
+			if (ELEMENTSIZE == 1)
+			{
+				const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 6, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(12,12,12,12,11,11,11,11,11,10,10,10,10,10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6));
+				srcPixOut[2] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(19,18,18,18,18,18,17,17,17,17,17,16,16,16,16,16, 15,15,15,15,15,14,14,14,14,14,13,13,13,13,13,12));
+				srcPixOut[3] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(25,25,25,24,24,24,24,24,23,23,23,23,23,22,22,22, 22,22,21,21,21,21,21,20,20,20,20,20,19,19,19,19));
+				srcPixOut[4] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,31,31,30,30,30,30,30,29,29,29,29,29,28, 28,28,28,28,27,27,27,27,27,26,26,26,26,26,25,25));
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(13,12,13,12,11,10,11,10,11,10,11,10,11,10, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 7, 6));
+				srcPixOut[2] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(19,18,19,18,19,18,17,16,17,16,17,16,17,16,17,16, 15,14,15,14,15,14,15,14,15,14,13,12,13,12,13,12));
+				srcPixOut[3] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(25,24,25,24,25,24,25,24,23,22,23,22,23,22,23,22, 23,22,21,20,21,20,21,20,21,20,21,20,19,18,19,18));
+				srcPixOut[4] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,31,30,31,30,29,28,29,28,29,28, 29,28,29,28,27,26,27,26,27,26,27,26,27,26,25,24));
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(1, 1, 1, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(3, 2, 2, 2, 2, 2, 1, 1));
+				srcPixOut[2] =
_mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(4, 4, 4, 4, 3, 3, 3, 3));
+				srcPixOut[3] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(6, 6, 5, 5, 5, 5, 5, 4));
+				srcPixOut[4] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(7, 7, 7, 7, 7, 6, 6, 6));
+			}
+
+			for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+			{
+				_mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]);
+			}
+
+			if (SCALEVERTICAL)
+			{
+				for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++)
+				{
+					for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+					{
+						_mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]);
+					}
+				}
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 6)
+	{
+		__m256i srcPixOut[6];
+
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT)
+		{
+			const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX);
+
+			if (ELEMENTSIZE == 1)
+			{
+				const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2,  2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(10,10,10,10, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8,  7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5));
+				srcPixOut[2] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(15,15,15,15,15,15,14,14,14,14,14,14,13,13,13,13, 13,13,12,12,12,12,12,12,11,11,11,11,11,11,10,10));
+				srcPixOut[3] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(21,21,20,20,20,20,20,20,19,19,19,19,19,19,18,18, 18,18,18,18,17,17,17,17,17,17,16,16,16,16,16,16));
+				srcPixOut[4] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(26,26,26,26,25,25,25,25,25,25,24,24,24,24,24,24, 23,23,23,23,23,23,22,22,22,22,22,22,21,21,21,21));
+				srcPixOut[5] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,31,31,31,30,30,30,30,30,30,29,29,29,29, 29,29,28,28,28,28,28,28,27,27,27,27,27,27,26,26));
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 5, 4, 5, 4, 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 3, 2,  3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(11,10,11,10, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8,  7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4));
+				srcPixOut[2] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(15,14,15,14,15,14,15,14,15,14,15,14,13,12,13,12, 13,12,13,12,13,12,13,12,11,10,11,10,11,10,11,10));
+				srcPixOut[3] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(21,20,21,20,21,20,21,20,19,18,19,18,19,18,19,18, 19,18,19,18,17,16,17,16,17,16,17,16,17,16,17,16));
+				srcPixOut[4] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(27,26,27,26,25,24,25,24,25,24,25,24,25,24,25,24, 23,22,23,22,23,22,23,22,23,22,23,22,21,20,21,20));
+				srcPixOut[5] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,31,30,31,30,31,30,29,28,29,28, 29,28,29,28,29,28,29,28,27,26,27,26,27,26,27,26));
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(1, 1, 0, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(2, 2, 2, 2, 1, 1, 1, 1));
+				srcPixOut[2] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(3, 3, 3, 3, 3, 3, 2, 2));
+				srcPixOut[3] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(5, 5, 4, 4, 4, 4, 4, 4));
+				srcPixOut[4] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(6, 6, 6, 6, 5, 5, 5, 5));
+				srcPixOut[5] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 6, 6));
+			}
+
+			for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+			{
+				_mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]);
+			}
+
+			if (SCALEVERTICAL)
+			{
+				for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++)
+				{
+					for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+					{
+						_mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]);
+					}
+				}
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 7)
+	{
+		__m256i srcPixOut[7];
+
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT)
+		{
+			const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX);
+
+			if (ELEMENTSIZE == 1)
+			{
+				const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,  2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6,  6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4));
+				srcPixOut[2] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(13,13,13,13,13,12,12,12,12,12,12,12,11,11,11,11, 11,11,11,10,10,10,10,10,10,10, 9, 9, 9, 9, 9, 9));
+				srcPixOut[3] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(18,18,17,17,17,17,17,17,17,16,16,16,16,16,16,16, 15,15,15,15,15,15,15,14,14,14,14,14,14,14,13,13));
+				srcPixOut[4] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(22,22,22,22,22,22,21,21,21,21,21,21,21,20,20,20, 20,20,20,20,19,19,19,19,19,19,19,18,18,18,18,18));
+				srcPixOut[5] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(27,27,27,26,26,26,26,26,26,26,25,25,25,25,25,25, 25,24,24,24,24,24,24,24,23,23,23,23,23,23,23,22));
+				srcPixOut[6] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,31,31,31,31,30,30,30,30,30,30,30,29,29, 29,29,29,29,29,28,28,28,28,28,28,28,27,27,27,27));
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2,  3, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 9, 8, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 7, 6,  7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4));
+				srcPixOut[2] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(13,12,13,12,13,12,13,12,13,12,13,12,11,10,11,10, 11,10,11,10,11,10,11,10,11,10, 9, 8, 9, 8, 9, 8));
+				srcPixOut[3] = _mm256_shuffle_epi8(srcVec, _mm256_set_epi8(19,18,17,16,17,16,17,16,17,16,17,16,17,16,17,16, 15,14,15,14,15,14,15,14,15,14,15,14,15,14,13,12));
+				srcPixOut[4] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(23,22,23,22,23,22,21,20,21,20,21,20,21,20,21,20, 21,20,21,20,19,18,19,18,19,18,19,18,19,18,19,18));
+				srcPixOut[5] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(27,26,27,26,27,26,27,26,27,26,25,24,25,24,25,24, 25,24,25,24,25,24,25,24,23,22,23,22,23,22,23,22));
+				srcPixOut[6] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,31,30,31,30,31,30,31,30,29,28, 29,28,29,28,29,28,29,28,29,28,29,28,27,26,27,26));
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(1, 0, 0, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(2, 2, 1, 1, 1, 1, 1, 1));
+				srcPixOut[2] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(3, 3, 3, 2, 2, 2, 2, 2));
+				srcPixOut[3] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(4, 4, 4, 4, 3, 3, 3, 3));
+				srcPixOut[4] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(5, 5, 5, 5, 5, 4, 4, 4));
+				srcPixOut[5] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(6, 6, 6, 6, 6, 6, 5, 5));
+				srcPixOut[6] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 6));
+			}
+
+			for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+			{
+				_mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]);
+			}
+
+			if (SCALEVERTICAL)
+			{
+				for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++)
+				{
+					for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+					{
+						_mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]);
+					}
+				}
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 8)
+	{
+		__m256i srcPixOut[8];
+
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT)
+		{
+			const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX);
+
+			if (ELEMENTSIZE == 1)
+			{
+				const v256u8 src8lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u8 src8hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8( 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,  5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4));
+				srcPixOut[2] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(11,11,11,11,11,11,11,11,10,10,10,10,10,10,10,10,  9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8));
+				srcPixOut[3] = _mm256_shuffle_epi8(src8lo, _mm256_set_epi8(15,15,15,15,15,15,15,15,14,14,14,14,14,14,14,14, 13,13,13,13,13,13,13,13,12,12,12,12,12,12,12,12));
+				srcPixOut[4] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(19,19,19,19,19,19,19,19,18,18,18,18,18,18,18,18, 17,17,17,17,17,17,17,17,16,16,16,16,16,16,16,16));
+				srcPixOut[5] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(23,23,23,23,23,23,23,23,22,22,22,22,22,22,22,22, 21,21,21,21,21,21,21,21,20,20,20,20,20,20,20,20));
+				srcPixOut[6] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(27,27,27,27,27,27,27,27,26,26,26,26,26,26,26,26, 25,25,25,25,25,25,25,25,24,24,24,24,24,24,24,24));
+				srcPixOut[7] = _mm256_shuffle_epi8(src8hi, _mm256_set_epi8(31,31,31,31,31,31,31,31,30,30,30,30,30,30,30,30, 29,29,29,29,29,29,29,29,28,28,28,28,28,28,28,28));
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const v256u16 src16lo = _mm256_permute4x64_epi64(srcVec, 0x44);
+				const v256u16 src16hi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+
+				srcPixOut[0] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2,  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0));
+				srcPixOut[1] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8( 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6,  5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4));
+				srcPixOut[2] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,  9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8));
+				srcPixOut[3] = _mm256_shuffle_epi8(src16lo, _mm256_set_epi8(15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14, 13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12));
+				srcPixOut[4] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(19,18,19,18,19,18,19,18,19,18,19,18,19,18,19,18, 17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16));
+				srcPixOut[5] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(23,22,23,22,23,22,23,22,23,22,23,22,23,22,23,22, 21,20,21,20,21,20,21,20,21,20,21,20,21,20,21,20));
+				srcPixOut[6] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(27,26,27,26,27,26,27,26,27,26,27,26,27,26,27,26, 25,24,25,24,25,24,25,24,25,24,25,24,25,24,25,24));
+				srcPixOut[7] = _mm256_shuffle_epi8(src16hi, _mm256_set_epi8(31,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30, 29,28,29,28,29,28,29,28,29,28,29,28,29,28,29,28));
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPixOut[0] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(0));
+				srcPixOut[1] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(1));
+				srcPixOut[2] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(2));
+				srcPixOut[3] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(3));
+				srcPixOut[4] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(4));
+				srcPixOut[5] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(5));
+				srcPixOut[6] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(6));
+				srcPixOut[7] = _mm256_permutevar8x32_epi32(srcVec, _mm256_set1_epi32(7));
+			}
+
+			for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+			{
+				_mm256_store_si256((__m256i *)dst + dstX + lx, srcPixOut[lx]);
+			}
+
+			if (SCALEVERTICAL)
+			{
+				for (size_t ly = 1; ly < INTEGERSCALEHINT; ly++)
+				{
+					for (size_t lx = 0; lx < INTEGERSCALEHINT; lx++)
+					{
+						_mm256_store_si256((__m256i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]);
+					}
+				}
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT > 1)
+	{
+		const size_t scale = dstWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		const size_t scaleLo = scale / 2;
+		const size_t scaleMid = scaleLo + (scale & 0x0001);
+
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX++, dstX+=scale)
+		{
+			const __m256i srcVec = _mm256_load_si256((__m256i *)src + srcX);
+			const __m256i srcVecLo = _mm256_permute4x64_epi64(srcVec, 0x44);
+			const __m256i srcVecHi = _mm256_permute4x64_epi64(srcVec, 0xEE);
+			v256u8 ssse3idx;
+
+			size_t lx = 0;
+
+			for (; lx < scaleLo; lx++)
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u8_16e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u16_8e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u32_4e + (lx * sizeof(v256u8))));
+				}
+
+				_mm256_store_si256( (__m256i *)dst + dstX + lx, _mm256_shuffle_epi8(srcVecLo, ssse3idx) );
+			}
+
+			if (scaleMid > scaleLo)
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u8_16e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u16_8e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u32_4e + (lx * sizeof(v256u8))));
+				}
+
+				_mm256_store_si256( (__m256i *)dst + dstX + lx, _mm256_shuffle_epi8(srcVec, ssse3idx) );
+				lx++;
+			}
+
+			for (; lx < scale; lx++)
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u8_16e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u16_8e + (lx * sizeof(v256u8))));
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					ssse3idx = _mm256_load_si256((v256u8 *)(_gpuDstToSrcSSSE3_u32_4e + (lx * sizeof(v256u8))));
+				}
+
+				_mm256_store_si256( (__m256i *)dst + dstX + lx, _mm256_shuffle_epi8(srcVecHi, ssse3idx) );
+			}
+		}
+
+		if (SCALEVERTICAL)
+		{
+			CopyLinesForVerticalCount<ELEMENTSIZE>(dst, dstWidth, dstLineCount);
+		}
+	}
+	else
+	{
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x];
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x];
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x];
+				}
+			}
+		}
+
+		if (SCALEVERTICAL)
+		{
+			CopyLinesForVerticalCount<ELEMENTSIZE>(dst, dstWidth, dstLineCount);
+		}
+	}
+}
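For reference, every INTEGERSCALEHINT specialization above computes the same nearest-neighbor mapping; only the shuffle strategy differs. A minimal scalar sketch of that mapping, assuming the 256-pixel native line width (CopyLineExpandScalar is an illustrative name, not part of this patch):

#include <cstddef>
#include <cstring>

// Nearest-neighbor expansion of one native scanline by integer factor S:
// every source element is repeated S times across the row, and the finished
// row is then repeated S times downward when scaleVertical is set.
template <size_t ELEMENTSIZE>
static void CopyLineExpandScalar(void *dst, const void *src, size_t S, bool scaleVertical)
{
	const unsigned char *s = (const unsigned char *)src;
	unsigned char *d = (unsigned char *)dst;
	const size_t dstPitch = 256 * S * ELEMENTSIZE; // 256 == GPU_FRAMEBUFFER_NATIVE_WIDTH

	for (size_t x = 0; x < 256; x++)
		for (size_t p = 0; p < S; p++)
			memcpy(d + (((x * S) + p) * ELEMENTSIZE), s + (x * ELEMENTSIZE), ELEMENTSIZE);

	if (scaleVertical)
		for (size_t q = 1; q < S; q++)
			memcpy(d + (q * dstPitch), d, dstPitch);
}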
+
+template <s32 INTEGERSCALEHINT, size_t ELEMENTSIZE>
+static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src, size_t srcWidth)
+{
+	if (INTEGERSCALEHINT == 0)
+	{
+		memcpy(dst, src, srcWidth * ELEMENTSIZE);
+	}
+	else if (INTEGERSCALEHINT == 1)
+	{
+		MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v256s8) / ELEMENTSIZE), _mm256_store_si256((v256s8 *)dst + (X), _mm256_load_si256((v256s8 *)src + (X))) );
+	}
+	else if (INTEGERSCALEHINT == 2)
+	{
+		__m256i srcPix[2];
+		__m256i dstPix;
+
+		for (size_t srcX = 0, dstX = 0; dstX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX+=INTEGERSCALEHINT, dstX++)
+		{
+			srcPix[0] = _mm256_load_si256((__m256i *)src + srcX + 0);
+			srcPix[1] = _mm256_load_si256((__m256i *)src + srcX + 1);
+
+			if (ELEMENTSIZE == 1)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set1_epi32(0x00FF00FF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set1_epi32(0x00FF00FF));
+				dstPix = _mm256_permute4x64_epi64(_mm256_packus_epi16(srcPix[0], srcPix[1]), 0xD8);
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set1_epi32(0x0000FFFF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set1_epi32(0x0000FFFF));
+				dstPix = _mm256_permute4x64_epi64(_mm256_packus_epi32(srcPix[0], srcPix[1]), 0xD8);
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPix[0] = _mm256_permutevar8x32_epi32(srcPix[0], _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+				srcPix[1] = _mm256_permutevar8x32_epi32(srcPix[1], _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+				dstPix = _mm256_permute2x128_si256(srcPix[0], srcPix[1], 0x20);
+				/*
+				// Pixel minification algorithm that takes the average value of each color component of the 2x2 pixel group.
+				__m256i workingPix[4];
+				__m256i finalPix[2];
+
+				srcPix[0] = _mm256_load_si256((__m256i *)src + srcX + 0);
+				srcPix[1] = _mm256_load_si256((__m256i *)src + srcX + 1);
+				srcPix[2] = _mm256_load_si256((__m256i *)src + srcX + (GPU_FRAMEBUFFER_NATIVE_WIDTH * INTEGERSCALEHINT / (sizeof(__m256i) / ELEMENTSIZE)) + 0);
+				srcPix[3] = _mm256_load_si256((__m256i *)src + srcX + (GPU_FRAMEBUFFER_NATIVE_WIDTH * INTEGERSCALEHINT / (sizeof(__m256i) / ELEMENTSIZE)) + 1);
+
+				srcPix[0] = _mm256_permutevar8x32_epi32(srcPix[0], _mm256_set_epi32(7, 5, 6, 4, 3, 1, 2, 0));
+				srcPix[1] = _mm256_permutevar8x32_epi32(srcPix[1], _mm256_set_epi32(7, 5, 6, 4, 3, 1, 2, 0));
+				srcPix[2] = _mm256_permutevar8x32_epi32(srcPix[2], _mm256_set_epi32(7, 5, 6, 4, 3, 1, 2, 0));
+				srcPix[3] = _mm256_permutevar8x32_epi32(srcPix[3], _mm256_set_epi32(7, 5, 6, 4, 3, 1, 2, 0));
+
+				workingPix[0] = _mm256_unpacklo_epi8(srcPix[0], _mm256_setzero_si256());
+				workingPix[1] = _mm256_unpackhi_epi8(srcPix[0], _mm256_setzero_si256());
+				workingPix[2] = _mm256_unpacklo_epi8(srcPix[2], _mm256_setzero_si256());
+				workingPix[3] = _mm256_unpackhi_epi8(srcPix[2], _mm256_setzero_si256());
+
+				finalPix[0] = _mm256_adds_epi16(workingPix[0], workingPix[1]);
+				finalPix[0] = _mm256_adds_epi16(finalPix[0], workingPix[2]);
+				finalPix[0] = _mm256_adds_epi16(finalPix[0], workingPix[3]);
+				finalPix[0] = _mm256_srli_epi16(finalPix[0], 2);
+
+				workingPix[0] = _mm256_unpacklo_epi8(srcPix[1], _mm256_setzero_si256());
+				workingPix[1] = _mm256_unpackhi_epi8(srcPix[1], _mm256_setzero_si256());
+				workingPix[2] = _mm256_unpacklo_epi8(srcPix[3], _mm256_setzero_si256());
+				workingPix[3] = _mm256_unpackhi_epi8(srcPix[3], _mm256_setzero_si256());
+
+				finalPix[1] = _mm256_adds_epi16(workingPix[0], workingPix[1]);
+				finalPix[1] = _mm256_adds_epi16(finalPix[1], workingPix[2]);
+				finalPix[1] = _mm256_adds_epi16(finalPix[1], workingPix[3]);
+				finalPix[1] = _mm256_srli_epi16(finalPix[1], 2);
+
+				dstPix = _mm256_permute4x64_epi64(_mm256_packus_epi16(finalPix[0], finalPix[1]), 0xD8);
+				*/
+			}
+
+			_mm256_store_si256((__m256i *)dst + dstX, dstPix);
+		}
+	}
+	else if (INTEGERSCALEHINT == 3)
+	{
+		static const u8 X = 0x80;
+		__m256i srcPix[3];
+
+		for (size_t srcX = 0, dstX = 0; dstX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX+=INTEGERSCALEHINT, dstX++)
+		{
+			srcPix[0] = _mm256_load_si256((__m256i *)src + srcX + 0);
+			srcPix[1] = _mm256_load_si256((__m256i *)src + srcX + 1);
+			srcPix[2] = _mm256_load_si256((__m256i *)src + srcX + 2);
+
+			if (ELEMENTSIZE == 1)
+			{
+				srcPix[0] = _mm256_shuffle_epi8(srcPix[0], _mm256_set_epi8(30,27,24,21,18, X, X, X, X, X, X, X, X, X, X, X,  X, X, X, X, X, X, X, X, X, X,15,12, 9, 6, 3, 0));
+				srcPix[0] = _mm256_permute4x64_epi64(srcPix[0], 0x9C);
+				srcPix[0] = _mm256_shuffle_epi8(srcPix[0], _mm256_set_epi8( X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  X, X, X, X, X,15,14,13,12,11, 5, 4, 3, 2, 1, 0));
+
+				srcPix[1] = _mm256_shuffle_epi8(srcPix[1], _mm256_set_epi8( X, X, X, X, X, X, X, X, X, X,31,28,25,22,19,16, 13,10, 7, 4, 1, X, X, X, X, X, X, X, X, X, X, X));
+
+				srcPix[2] = _mm256_shuffle_epi8(srcPix[2], _mm256_set_epi8(29,26,23,20,17, X, X, X, X, X, X, X, X, X, X, X,  X, X, X, X, X, X, X, X, X, X, X,14,11, 8, 5, 2));
+				srcPix[2] = _mm256_permute4x64_epi64(srcPix[2], 0xC9);
+				srcPix[2] = _mm256_shuffle_epi8(srcPix[2], _mm256_set_epi8(31,30,29,28,27,20,19,18,17,16, X, X, X, X, X, X,  X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X));
+
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[1]);
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[2]);
+
+				_mm256_store_si256((__m256i *)dst + dstX, srcPix[0]);
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				srcPix[0] = _mm256_shuffle_epi8(srcPix[0], _mm256_set_epi8(31,30,25,24,19,18, X, X, X, X, X, X, X, X, X, X,  X, X, X, X, X, X, X, X, X, X,13,12, 7, 6, 1, 0));
+				srcPix[0] = _mm256_permute4x64_epi64(srcPix[0], 0x9C);
+				srcPix[0] = _mm256_shuffle_epi8(srcPix[0], _mm256_set_epi8( X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  X, X, X, X,15,14,13,12,11,10, 5, 4, 3, 2, 1, 0));
+
+				srcPix[1] = _mm256_shuffle_epi8(srcPix[1], _mm256_set_epi8( X, X, X, X, X, X, X, X, X, X,29,28,23,22,17,16, 11,10, 5, 4, X, X, X, X, X, X, X, X, X, X, X, X));
+
+				srcPix[2] = _mm256_shuffle_epi8(srcPix[2], _mm256_set_epi8(27,26,21,20, X, X, X, X, X, X, X, X, X, X, X, X,  X, X, X, X,15,14, 9, 8, 3, 2, X, X, X, X, X, X));
+				srcPix[2] = _mm256_permutevar8x32_epi32(srcPix[2], _mm256_set_epi32( 7, 2, 1, 0, 0, 0, 0, 0));
+
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[1]);
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[2]);
+
+				_mm256_store_si256((__m256i *)dst + dstX, srcPix[0]);
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000));
+				srcPix[2] = _mm256_and_si256(srcPix[2], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000));
+
+				srcPix[0] = _mm256_permutevar8x32_epi32(srcPix[0], _mm256_set_epi32( 7, 7, 7, 7, 7, 6, 3, 0));
+				srcPix[1] = _mm256_permutevar8x32_epi32(srcPix[1], _mm256_set_epi32( 0, 0, 7, 4, 1, 0, 0, 0));
+				srcPix[2] = _mm256_permutevar8x32_epi32(srcPix[2], _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0));
+
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[1]);
+				srcPix[0] = _mm256_or_si256(srcPix[0], srcPix[2]);
+
+				_mm256_store_si256((__m256i *)dst + dstX, srcPix[0]);
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 4)
+	{
+		__m256i srcPix[4];
+		__m256i dstPix;
+
+		for (size_t srcX = 0, dstX = 0; dstX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m256i) / ELEMENTSIZE); srcX+=INTEGERSCALEHINT, dstX++)
+		{
+			srcPix[0] = _mm256_load_si256((__m256i *)src + srcX + 0);
+			srcPix[1] = _mm256_load_si256((__m256i *)src + srcX + 1);
+			srcPix[2] = _mm256_load_si256((__m256i *)src + srcX + 2);
+			srcPix[3] = _mm256_load_si256((__m256i *)src + srcX + 3);
+
+			if (ELEMENTSIZE == 1)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set1_epi32(0x000000FF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set1_epi32(0x000000FF));
+				srcPix[2] = _mm256_and_si256(srcPix[2], _mm256_set1_epi32(0x000000FF));
+				srcPix[3] = _mm256_and_si256(srcPix[3], _mm256_set1_epi32(0x000000FF));
+
+				srcPix[0] = _mm256_permute4x64_epi64(_mm256_packus_epi16(srcPix[0], srcPix[1]), 0xD8);
+				srcPix[1] = _mm256_permute4x64_epi64(_mm256_packus_epi16(srcPix[2], srcPix[3]), 0xD8);
+
+				dstPix = _mm256_permute4x64_epi64(_mm256_packus_epi16(srcPix[0], srcPix[1]), 0xD8);
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF));
+				srcPix[2] = _mm256_and_si256(srcPix[2], _mm256_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF));
+				srcPix[3] = _mm256_and_si256(srcPix[3], _mm256_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF));
+
+				srcPix[0] = _mm256_permute4x64_epi64(_mm256_packus_epi32(srcPix[0], srcPix[1]), 0xD8);
+				srcPix[1] = _mm256_permute4x64_epi64(_mm256_packus_epi32(srcPix[2], srcPix[3]), 0xD8);
+
+				dstPix = _mm256_permute4x64_epi64(_mm256_packus_epi32(srcPix[0], srcPix[1]), 0xD8);
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				srcPix[0] = _mm256_and_si256(srcPix[0], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF));
+				srcPix[1] = _mm256_and_si256(srcPix[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF));
+				srcPix[2] = _mm256_and_si256(srcPix[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF));
+				srcPix[3] = _mm256_and_si256(srcPix[3], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF));
+
+				// Here is a special case where we don't need to precede our vpunpckldq instructions with vpermq.
+				// Data swizzling is unnecessary here since our desired data is already aligned to their 128-bit
+				// lanes as-is.
+				srcPix[0] = _mm256_unpacklo_epi32(srcPix[0], srcPix[1]);
+				srcPix[1] = _mm256_unpacklo_epi32(srcPix[2], srcPix[3]);
+
+				dstPix = _mm256_unpacklo_epi64(srcPix[0], srcPix[1]);
+				dstPix = _mm256_permutevar8x32_epi32(dstPix, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+			}
+
+			_mm256_store_si256((__m256i *)dst + dstX, dstPix);
+		}
+	}
+	else if ( (INTEGERSCALEHINT >= 5) && (INTEGERSCALEHINT <= 32) )
+	{
+		if (ELEMENTSIZE == 1)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				((u8 *)dst)[x] = ((u8 *)src)[x * INTEGERSCALEHINT];
+			}
+		}
+		else if (ELEMENTSIZE == 2)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				((u16 *)dst)[x] = ((u16 *)src)[x * INTEGERSCALEHINT];
+			}
+		}
+		else if (ELEMENTSIZE == 4)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=(sizeof(__m256i)/ELEMENTSIZE))
+			{
+				const v256u32 idx = _mm256_madd_epi16(_mm256_set1_epi16(INTEGERSCALEHINT), _mm256_set_epi16(x, 7, x, 6, x, 5, x, 4, x, 3, x, 2, x, 1, x, 0));
+				_mm256_store_si256( (v256u32 *)((u32 *)dst + x), _mm256_i32gather_epi32((int const *)src, idx, sizeof(u32)) );
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT > 1)
+	{
+		const size_t scale = srcWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+
+		if (ELEMENTSIZE == 1)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				((u8 *)dst)[x] = ((u8 *)src)[x * scale];
+			}
+		}
+		else if (ELEMENTSIZE == 2)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				((u16 *)dst)[x] = ((u16 *)src)[x * scale];
+			}
+		}
+		else if (ELEMENTSIZE == 4)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=(sizeof(__m256i)/ELEMENTSIZE))
+			{
+				const v256u32 idx = _mm256_madd_epi16(_mm256_set1_epi16(scale), _mm256_set_epi16(x, 7, x, 6, x, 5, x, 4, x, 3, x, 2, x, 1, x, 0));
+				_mm256_store_si256( (v256u32 *)((u32 *)dst + x), _mm256_i32gather_epi32((int const *)src, idx, sizeof(u32)) );
+			}
+		}
+	}
+	else
+	{
+		if (ELEMENTSIZE == 1)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				( (u8 *)dst)[x] = ( (u8 *)src)[_gpuDstPitchIndex[x]];
+			}
+		}
+		else if (ELEMENTSIZE == 2)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+			{
+				((u16 *)dst)[x] = ((u16 *)src)[_gpuDstPitchIndex[x]];
+			}
+		}
+		else if (ELEMENTSIZE == 4)
+		{
+			for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=(sizeof(__m256i)/ELEMENTSIZE))
+			{
+				const v256u32 idx = _mm256_load_si256((v256u32 *)(_gpuDstPitchIndex + x));
+				_mm256_store_si256( (v256u32 *)((u32 *)dst + x), _mm256_i32gather_epi32((int const *)src, idx, sizeof(u32)) );
+			}
+		}
+	}
+}
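As with expansion, every active reduction path above point-samples rather than filters; the commented-out block in the INTEGERSCALEHINT == 2 case is the only averaging variant, and it is disabled. A scalar sketch of the active behavior (CopyLineReduceScalar is an illustrative name, not part of this patch):

#include <cstddef>
#include <cstring>

// Keep the first pixel of each group of S, shrinking a custom-width line
// back down to the native 256-pixel width.
template <size_t ELEMENTSIZE>
static void CopyLineReduceScalar(void *dst, const void *src, size_t S)
{
	for (size_t x = 0; x < 256; x++)
		memcpy((unsigned char *)dst + (x * ELEMENTSIZE),
		       (const unsigned char *)src + (x * S * ELEMENTSIZE), ELEMENTSIZE);
}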
+
+FORCEINLINE v256u16 ColorOperation_AVX2::blend(const v256u16 &colA, const v256u16 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const
+{
+	v256u16 ra;
+	v256u16 ga;
+	v256u16 ba;
+	v256u16 colorBitMask = _mm256_set1_epi16(0x001F);
+
+	ra = _mm256_or_si256( _mm256_and_si256(                  colA,      colorBitMask), _mm256_and_si256(_mm256_slli_epi16(colB, 8), _mm256_set1_epi16(0x1F00)) );
+	ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(colA,  5), colorBitMask), _mm256_and_si256(_mm256_slli_epi16(colB, 3), _mm256_set1_epi16(0x1F00)) );
+	ba = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(colA, 10), colorBitMask), _mm256_and_si256(_mm256_srli_epi16(colB, 2), _mm256_set1_epi16(0x1F00)) );
+
+	const v256u16 blendAB = _mm256_or_si256(blendEVA, _mm256_slli_epi16(blendEVB, 8));
+	ra = _mm256_maddubs_epi16(ra, blendAB);
+	ga = _mm256_maddubs_epi16(ga, blendAB);
+	ba = _mm256_maddubs_epi16(ba, blendAB);
+
+	ra = _mm256_srli_epi16(ra, 4);
+	ga = _mm256_srli_epi16(ga, 4);
+	ba = _mm256_srli_epi16(ba, 4);
+
+	ra = _mm256_min_epi16(ra, colorBitMask);
+	ga = _mm256_min_epi16(ga, colorBitMask);
+	ba = _mm256_min_epi16(ba, colorBitMask);
+
+	return _mm256_or_si256(ra, _mm256_or_si256( _mm256_slli_epi16(ga, 5), _mm256_slli_epi16(ba, 10)) );
+}
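The packing above is what lets a single vpmaddubsw do the whole weighted sum: each 16-bit lane holds A's component in its low byte and B's in its high byte, while blendAB pairs EVA with EVB the same way. Per 5-bit component, the arithmetic reduces to this scalar sketch (Blend555Component is an illustrative name; EVA and EVB are assumed to be in [0,16]):

// (a5 * EVA + b5 * EVB) / 16, saturated to the 5-bit maximum.
static inline unsigned int Blend555Component(unsigned int a5, unsigned int b5,
                                             unsigned int eva, unsigned int evb)
{
	const unsigned int mixed = (a5 * eva + b5 * evb) >> 4;
	return (mixed > 31) ? 31 : mixed;
}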
+
+// Note that if USECONSTANTBLENDVALUESHINT is true, then this method will assume that blendEVA contains identical values
+// for each 16-bit vector element, and also that blendEVB contains identical values for each 16-bit vector element. If
+// this assumption is broken, then the resulting color will be undefined.
+template <NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT>
+FORCEINLINE v256u32 ColorOperation_AVX2::blend(const v256u32 &colA, const v256u32 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const
+{
+	v256u16 outColorLo;
+	v256u16 outColorHi;
+	v256u32 outColor;
+
+	v256u16 blendAB = _mm256_or_si256(blendEVA, _mm256_slli_epi16(blendEVB, 8));
+
+	if (USECONSTANTBLENDVALUESHINT)
+	{
+		const v256u16 tempColorA = _mm256_permute4x64_epi64(colA, 0xD8);
+		const v256u16 tempColorB = _mm256_permute4x64_epi64(colB, 0xD8);
+
+		outColorLo = _mm256_unpacklo_epi8(tempColorA, tempColorB);
+		outColorHi = _mm256_unpackhi_epi8(tempColorA, tempColorB);
+
+		outColorLo = _mm256_maddubs_epi16(outColorLo, blendAB);
+		outColorHi = _mm256_maddubs_epi16(outColorHi, blendAB);
+
+		outColorLo = _mm256_srli_epi16(outColorLo, 4);
+		outColorHi = _mm256_srli_epi16(outColorHi, 4);
+		outColor = _mm256_packus_epi16(outColorLo, outColorHi);
+		outColor = _mm256_permute4x64_epi64(outColor, 0xD8);
+	}
+	else
+	{
+		const v256u16 tempColorA = _mm256_permute4x64_epi64(colA, 0xD8);
+		const v256u16 tempColorB = _mm256_permute4x64_epi64(colB, 0xD8);
+
+		outColorLo = _mm256_unpacklo_epi8(tempColorA, tempColorB);
+		outColorHi = _mm256_unpackhi_epi8(tempColorA, tempColorB);
+
+		blendAB = _mm256_permute4x64_epi64(blendAB, 0xD8);
+		const v256u16 blendABLo = _mm256_unpacklo_epi16(blendAB, blendAB);
+		const v256u16 blendABHi = _mm256_unpackhi_epi16(blendAB, blendAB);
+		outColorLo = _mm256_maddubs_epi16(outColorLo, blendABLo);
+		outColorHi = _mm256_maddubs_epi16(outColorHi, blendABHi);
+
+		outColorLo = _mm256_srli_epi16(outColorLo, 4);
+		outColorHi = _mm256_srli_epi16(outColorHi, 4);
+
+		outColor = _mm256_packus_epi16(outColorLo, outColorHi);
+		outColor = _mm256_permute4x64_epi64(outColor, 0xD8);
+	}
+
+	// When the color format is 888, the vpackuswb instruction will naturally clamp
+	// the color component values to 255. However, when the color format is 666, the
+	// color component values must be clamped to 63. In this case, we must call vpminub
+	// to do the clamp.
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		outColor = _mm256_min_epu8(outColor, _mm256_set1_epi8(63));
+	}
+
+	outColor = _mm256_and_si256(outColor, _mm256_set1_epi32(0x00FFFFFF));
+
+	return outColor;
+}
+
+FORCEINLINE v256u16 ColorOperation_AVX2::blend3D(const v256u32 &colA_Lo, const v256u32 &colA_Hi, const v256u16 &colB) const
+{
+	// If the color format of B is 555, then the colA_Hi parameter is required.
+	// The color format of A is assumed to be RGB666.
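	// (A arrives as two v256u32 halves because one v256u16 of destination
	// pixels covers sixteen pixels, while a v256u32 of 32-bit 3D source
	// pixels covers only eight; colA_Lo and colA_Hi are the low and high
	// eight of those same sixteen pixels.)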
+	v256u32 ra_lo = _mm256_and_si256(                  colA_Lo,      _mm256_set1_epi32(0x000000FF) );
+	v256u32 ga_lo = _mm256_and_si256( _mm256_srli_epi32(colA_Lo,  8), _mm256_set1_epi32(0x000000FF) );
+	v256u32 ba_lo = _mm256_and_si256( _mm256_srli_epi32(colA_Lo, 16), _mm256_set1_epi32(0x000000FF) );
+	v256u32 aa_lo =                   _mm256_srli_epi32(colA_Lo, 24);
+
+	v256u32 ra_hi = _mm256_and_si256(                  colA_Hi,      _mm256_set1_epi32(0x000000FF) );
+	v256u32 ga_hi = _mm256_and_si256( _mm256_srli_epi32(colA_Hi,  8), _mm256_set1_epi32(0x000000FF) );
+	v256u32 ba_hi = _mm256_and_si256( _mm256_srli_epi32(colA_Hi, 16), _mm256_set1_epi32(0x000000FF) );
+	v256u32 aa_hi =                   _mm256_srli_epi32(colA_Hi, 24);
+
+	v256u16 ra = _mm256_packus_epi32(ra_lo, ra_hi);
+	v256u16 ga = _mm256_packus_epi32(ga_lo, ga_hi);
+	v256u16 ba = _mm256_packus_epi32(ba_lo, ba_hi);
+	v256u16 aa = _mm256_packus_epi32(aa_lo, aa_hi);
+
+	ra = _mm256_permute4x64_epi64(ra, 0xD8);
+	ga = _mm256_permute4x64_epi64(ga, 0xD8);
+	ba = _mm256_permute4x64_epi64(ba, 0xD8);
+	aa = _mm256_permute4x64_epi64(aa, 0xD8);
+
+	ra = _mm256_or_si256( ra, _mm256_and_si256(_mm256_slli_epi16(colB, 9), _mm256_set1_epi16(0x3E00)) );
+	ga = _mm256_or_si256( ga, _mm256_and_si256(_mm256_slli_epi16(colB, 4), _mm256_set1_epi16(0x3E00)) );
+	ba = _mm256_or_si256( ba, _mm256_and_si256(_mm256_srli_epi16(colB, 1), _mm256_set1_epi16(0x3E00)) );
+
+	aa = _mm256_adds_epu8(aa, _mm256_set1_epi16(1));
+	aa = _mm256_or_si256( aa, _mm256_slli_epi16(_mm256_subs_epu16(_mm256_set1_epi8(32), aa), 8) );
+
+	ra = _mm256_maddubs_epi16(ra, aa);
+	ga = _mm256_maddubs_epi16(ga, aa);
+	ba = _mm256_maddubs_epi16(ba, aa);
+
+	ra = _mm256_srli_epi16(ra, 6);
+	ga = _mm256_srli_epi16(ga, 6);
+	ba = _mm256_srli_epi16(ba, 6);
+
+	return _mm256_or_si256( _mm256_or_si256(ra, _mm256_slli_epi16(ga, 5)), _mm256_slli_epi16(ba, 10) );
+}
+
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE v256u32 ColorOperation_AVX2::blend3D(const v256u32 &colA, const v256u32 &colB) const
+{
+	// If the color format of B is 666 or 888, then the colA_Hi parameter is ignored.
+	// The color format of A is assumed to match the color format of B.
+	v256u32 alpha;
+	v256u16 alphaLo;
+	v256u16 alphaHi;
+
+	v256u16 tempColor[2] = {
+		_mm256_permute4x64_epi64(colA, 0xD8),
+		_mm256_permute4x64_epi64(colB, 0xD8)
+	};
+
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		// Does not work for RGBA8888 color format. The reason is because this
+		// algorithm depends on the vpmaddubsw instruction, which multiplies
+		// two unsigned 8-bit integers into an intermediate signed 16-bit
+		// integer. This means that we can overrun the signed 16-bit value
+		// range, which would be limited to [-32768, 32767]. For example, a
+		// color component of value 255 multiplied by an alpha value of 255
+		// would equal 65025, which is greater than the upper range of a signed
+		// 16-bit value.
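		// (With 18-bit color this cannot happen: the biased alpha and its
		// inverse always sum to 32, so the worst case per lane is
		// 63 * 32 = 2016, comfortably inside the signed 16-bit range.)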
+		v256u16 tempColorLo = _mm256_unpacklo_epi8(tempColor[0], tempColor[1]);
+		v256u16 tempColorHi = _mm256_unpackhi_epi8(tempColor[0], tempColor[1]);
+
+		alpha = _mm256_and_si256( _mm256_srli_epi32(colA, 24), _mm256_set1_epi32(0x0000001F) );
+		alpha = _mm256_or_si256( alpha, _mm256_or_si256(_mm256_slli_epi32(alpha, 8), _mm256_slli_epi32(alpha, 16)) );
+		alpha = _mm256_adds_epu8(alpha, _mm256_set1_epi8(1));
+
+		v256u32 invAlpha = _mm256_subs_epu8(_mm256_set1_epi8(32), alpha);
+		invAlpha = _mm256_permute4x64_epi64(invAlpha, 0xD8);
+
+		alpha = _mm256_permute4x64_epi64(alpha, 0xD8);
+		alphaLo = _mm256_unpacklo_epi8(alpha, invAlpha);
+		alphaHi = _mm256_unpackhi_epi8(alpha, invAlpha);
+
+		tempColorLo = _mm256_maddubs_epi16(tempColorLo, alphaLo);
+		tempColorHi = _mm256_maddubs_epi16(tempColorHi, alphaHi);
+
+		tempColor[0] = _mm256_srli_epi16(tempColorLo, 5);
+		tempColor[1] = _mm256_srli_epi16(tempColorHi, 5);
+	}
+	else
+	{
+		v256u16 rgbALo = _mm256_unpacklo_epi8(tempColor[0], _mm256_setzero_si256());
+		v256u16 rgbAHi = _mm256_unpackhi_epi8(tempColor[0], _mm256_setzero_si256());
+		v256u16 rgbBLo = _mm256_unpacklo_epi8(tempColor[1], _mm256_setzero_si256());
+		v256u16 rgbBHi = _mm256_unpackhi_epi8(tempColor[1], _mm256_setzero_si256());
+
+		alpha = _mm256_and_si256( _mm256_srli_epi32(colA, 24), _mm256_set1_epi32(0x000000FF) );
+		alpha = _mm256_or_si256( alpha, _mm256_or_si256(_mm256_slli_epi32(alpha, 8), _mm256_slli_epi32(alpha, 16)) );
+		alpha = _mm256_permute4x64_epi64(alpha, 0xD8);
+
+		alphaLo = _mm256_unpacklo_epi8(alpha, _mm256_setzero_si256());
+		alphaHi = _mm256_unpackhi_epi8(alpha, _mm256_setzero_si256());
+		alphaLo = _mm256_add_epi16(alphaLo, _mm256_set1_epi16(1));
+		alphaHi = _mm256_add_epi16(alphaHi, _mm256_set1_epi16(1));
+
+		rgbALo = _mm256_add_epi16( _mm256_mullo_epi16(rgbALo, alphaLo), _mm256_mullo_epi16(rgbBLo, _mm256_sub_epi16(_mm256_set1_epi16(256), alphaLo)) );
+		rgbAHi = _mm256_add_epi16( _mm256_mullo_epi16(rgbAHi, alphaHi), _mm256_mullo_epi16(rgbBHi, _mm256_sub_epi16(_mm256_set1_epi16(256), alphaHi)) );
+
+		tempColor[0] = _mm256_srli_epi16(rgbALo, 8);
+		tempColor[1] = _mm256_srli_epi16(rgbAHi, 8);
+	}
+
+	tempColor[0] = _mm256_packus_epi16(tempColor[0], tempColor[1]);
+	tempColor[0] = _mm256_permute4x64_epi64(tempColor[0], 0xD8);
+
+	return _mm256_and_si256(tempColor[0], _mm256_set1_epi32(0x00FFFFFF));
+}
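In scalar terms, the 18-bit path above is a fixed 32-step linear interpolation per component (Blend3D666Component is an illustrative name; alpha5 is the 3D pixel's 5-bit alpha):

// Weights are (alpha5 + 1) and 32 - (alpha5 + 1); they always sum to 32,
// so >> 5 renormalizes and the 6-bit result needs no extra clamp.
static inline unsigned int Blend3D666Component(unsigned int srcA6, unsigned int dstB6,
                                               unsigned int alpha5)
{
	const unsigned int a = alpha5 + 1; // [1,32]
	return (srcA6 * a + dstB6 * (32 - a)) >> 5;
}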
+
+FORCEINLINE v256u16 ColorOperation_AVX2::increase(const v256u16 &col, const v256u16 &blendEVY) const
+{
+	v256u16 r = _mm256_and_si256(                  col,      _mm256_set1_epi16(0x001F) );
+	v256u16 g = _mm256_and_si256( _mm256_srli_epi16(col,  5), _mm256_set1_epi16(0x001F) );
+	v256u16 b = _mm256_and_si256( _mm256_srli_epi16(col, 10), _mm256_set1_epi16(0x001F) );
+
+	r = _mm256_add_epi16( r, _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(_mm256_set1_epi16(31), r), blendEVY), 4) );
+	g = _mm256_add_epi16( g, _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(_mm256_set1_epi16(31), g), blendEVY), 4) );
+	b = _mm256_add_epi16( b, _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(_mm256_set1_epi16(31), b), blendEVY), 4) );
+
+	return _mm256_or_si256(r, _mm256_or_si256( _mm256_slli_epi16(g, 5), _mm256_slli_epi16(b, 10)) );
+}
+
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE v256u32 ColorOperation_AVX2::increase(const v256u32 &col, const v256u16 &blendEVY) const
+{
+	const v256u32 tempCol = _mm256_permute4x64_epi64(col, 0xD8);
+	v256u16 rgbLo = _mm256_unpacklo_epi8(tempCol, _mm256_setzero_si256());
+	v256u16 rgbHi = _mm256_unpackhi_epi8(tempCol, _mm256_setzero_si256());
+
+	rgbLo = _mm256_add_epi16( rgbLo, _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(_mm256_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbLo), blendEVY), 4) );
+	rgbHi = _mm256_add_epi16( rgbHi, _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(_mm256_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbHi), blendEVY), 4) );
+
+	return _mm256_and_si256( _mm256_permute4x64_epi64(_mm256_packus_epi16(rgbLo, rgbHi), 0xD8), _mm256_set1_epi32(0x00FFFFFF) );
+}
+
+FORCEINLINE v256u16 ColorOperation_AVX2::decrease(const v256u16 &col, const v256u16 &blendEVY) const
+{
+	v256u16 r = _mm256_and_si256(                  col,      _mm256_set1_epi16(0x001F) );
+	v256u16 g = _mm256_and_si256( _mm256_srli_epi16(col,  5), _mm256_set1_epi16(0x001F) );
+	v256u16 b = _mm256_and_si256( _mm256_srli_epi16(col, 10), _mm256_set1_epi16(0x001F) );
+
+	r = _mm256_sub_epi16( r, _mm256_srli_epi16(_mm256_mullo_epi16(r, blendEVY), 4) );
+	g = _mm256_sub_epi16( g, _mm256_srli_epi16(_mm256_mullo_epi16(g, blendEVY), 4) );
+	b = _mm256_sub_epi16( b, _mm256_srli_epi16(_mm256_mullo_epi16(b, blendEVY), 4) );
+
+	return _mm256_or_si256(r, _mm256_or_si256( _mm256_slli_epi16(g, 5), _mm256_slli_epi16(b, 10)) );
+}
+
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE v256u32 ColorOperation_AVX2::decrease(const v256u32 &col, const v256u16 &blendEVY) const
+{
+	const v256u32 tempCol = _mm256_permute4x64_epi64(col, 0xD8);
+	v256u16 rgbLo = _mm256_unpacklo_epi8(tempCol, _mm256_setzero_si256());
+	v256u16 rgbHi = _mm256_unpackhi_epi8(tempCol, _mm256_setzero_si256());
+
+	rgbLo = _mm256_sub_epi16( rgbLo, _mm256_srli_epi16(_mm256_mullo_epi16(rgbLo, blendEVY), 4) );
+	rgbHi = _mm256_sub_epi16( rgbHi, _mm256_srli_epi16(_mm256_mullo_epi16(rgbHi, blendEVY), 4) );
+
+	return _mm256_and_si256( _mm256_permute4x64_epi64(_mm256_packus_epi16(rgbLo, rgbHi), 0xD8), _mm256_set1_epi32(0x00FFFFFF) );
+}
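The increase/decrease pairs above implement the NDS master-brightness formulas. A scalar sketch, assuming evy in [0,16] and max of 31, 63, or 255 for 15/18/24-bit components (function names are illustrative):

static inline unsigned int BrightnessUpComponent(unsigned int c, unsigned int evy, unsigned int max)
{
	return c + (((max - c) * evy) >> 4); // evy == 16 drives the component to max (white)
}

static inline unsigned int BrightnessDownComponent(unsigned int c, unsigned int evy)
{
	return c - ((c * evy) >> 4); // evy == 16 drives the component to 0 (black)
}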
+
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void PixelOperation_AVX2::_copy16(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(src0, alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(src1, alphaBits) );
+	}
+	else
+	{
+		v256u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555To6665Opaque_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555To6665Opaque_AVX2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555To8888Opaque_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555To8888Opaque_AVX2(src1, src32[2], src32[3]);
+		}
+
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, src32[0] );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, src32[1] );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, src32[2] );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, src32[3] );
+	}
+
+	if (!ISDEBUGRENDER)
+	{
+		_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void PixelOperation_AVX2::_copy32(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(src16[0], alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(src16[1], alphaBits) );
+	}
+	else
+	{
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, _mm256_or_si256(src0, alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, _mm256_or_si256(src1, alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, _mm256_or_si256(src2, alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, _mm256_or_si256(src3, alphaBits) );
+	}
+
+	if (!ISDEBUGRENDER)
+	{
+		_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void PixelOperation_AVX2::_copyMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(src0, alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(src1, alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		v256u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555To6665Opaque_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555To6665Opaque_AVX2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555To8888Opaque_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555To8888Opaque_AVX2(src1, src32[2], src32[3]);
+		}
+
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], src32[0] );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], src32[1] );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], src32[2] );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], src32[3] );
+	}
+
+	if (!ISDEBUGRENDER)
+	{
+		const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+		_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+	}
+}
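A note on an idiom that recurs throughout this file, expressed as a comment for reference:

// _mm256_permute4x64_epi64(x, 0xD8) reorders the four 64-bit lanes as
// { 0, 2, 1, 3 } (0xD8 == 0b11011000). AVX2's unpack and pack instructions
// operate independently within each 128-bit half, so this pre/post swizzle
// is what keeps the pixels in left-to-right order across the full 256-bit
// register.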
+
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void PixelOperation_AVX2::_copyMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(src16[0], alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(src16[1], alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(src0, alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(src1, alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(src2, alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(src3, alphaBits) );
+	}
+
+	if (!ISDEBUGRENDER)
+	{
+		const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+		_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+	}
+}
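The masked variants widen the per-pixel 8-bit pass mask to one mask per 16-bit or 32-bit lane, then either vpblendvb against the loaded destination or vpmaskmovd directly to memory. A scalar sketch of the store semantics (MaskedStore32 is an illustrative name; passMask8 bytes are assumed to be 0x00 or 0xFF):

#include <cstddef>
#include <cstdint>

// Write src[i] only where the pass mask is set; unwritten destination
// lanes keep their previous contents, matching _mm256_maskstore_epi32
// with an all-ones/all-zeros lane mask.
static inline void MaskedStore32(uint32_t *dst, const uint32_t *src,
                                 const uint8_t *passMask8, size_t count)
{
	for (size_t i = 0; i < count; i++)
	{
		if (passMask8[i] != 0)
			dst[i] = src[i];
	}
}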
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessUp16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(colorop_vec.increase(src0, evy16), alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(colorop_vec.increase(src1, evy16), alphaBits) );
+	}
+	else
+	{
+		v256u32 dst[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_AVX2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo666X_AVX2(src1, dst[2], dst[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_AVX2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo888X_AVX2(src1, dst[2], dst[3]);
+		}
+
+		const v256u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm256_set1_epi32(0x1F000000) : _mm256_set1_epi32(0xFF000000);
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(dst[0], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(dst[1], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(dst[2], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(dst[3], evy16), alphaBits) );
+	}
+
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessUp32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(colorop_vec.increase(src16[0], evy16), alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(colorop_vec.increase(src16[1], evy16), alphaBits) );
+	}
+	else
+	{
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src0, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src1, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src2, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src3, evy16), alphaBits) );
+	}
+
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(colorop_vec.increase(src0, evy16), alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(colorop_vec.increase(src1, evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		v256u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo666X_AVX2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo888X_AVX2(src1, src32[2], src32[3]);
+		}
+
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src32[0], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src32[1], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src32[2], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src32[3], evy16), alphaBits) );
+	}
+
+	const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(colorop_vec.increase(src16[0], evy16), alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(colorop_vec.increase(src16[1], evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src0, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src1, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src2, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(colorop_vec.increase<OUTPUTFORMAT>(src3, evy16), alphaBits) );
+	}
+
+	const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessDown16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(colorop_vec.decrease(src0, evy16), alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(colorop_vec.decrease(src1, evy16), alphaBits) );
+	}
+	else
+	{
+		v256u32 dst[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_AVX2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo666X_AVX2(src1, dst[2], dst[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_AVX2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo888X_AVX2(src1, dst[2], dst[3]);
+		}
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(dst[0], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(dst[1], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(dst[2], evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(dst[3], evy16), alphaBits) );
+	}
+
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessDown32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_or_si256(colorop_vec.decrease(src16[0], evy16), alphaBits) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_or_si256(colorop_vec.decrease(src16[1], evy16), alphaBits) );
+	}
+	else
+	{
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 0, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src0, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 1, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src1, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 2, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src2, evy16), alphaBits) );
+		_mm256_store_si256( (v256u32 *)compInfo.target.lineColor32 + 3, _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src3, evy16), alphaBits) );
+	}
+
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(colorop_vec.decrease(src0, evy16), alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(colorop_vec.decrease(src1, evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		v256u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo666X_AVX2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_AVX2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo888X_AVX2(src1, src32[2], src32[3]);
+		}
+
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src32[0], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src32[1], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src32[2], evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src32[3], evy16), alphaBits) );
+	}
+
+	const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_AVX2::_brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const
+{
+	const v256u8 tempPassMask8 = _mm256_permute4x64_epi64(passMask8, 0xD8);
+
+	v256u16 passMask16[2] = {
+		_mm256_unpacklo_epi8(tempPassMask8, tempPassMask8),
+		_mm256_unpackhi_epi8(tempPassMask8, tempPassMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v256u16 src16[2] = {
+			ColorspaceConvert6665To5551_AVX2(src0, src1),
+			ColorspaceConvert6665To5551_AVX2(src2, src3)
+		};
+
+		const v256u16 dst16[2] = {
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0),
+			_mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(colorop_vec.decrease(src16[0], evy16), alphaBits), passMask16[0]) );
+		_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(colorop_vec.decrease(src16[1], evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		passMask16[0] = _mm256_permute4x64_epi64(passMask16[0], 0xD8);
+		passMask16[1] = _mm256_permute4x64_epi64(passMask16[1], 0xD8);
+
+		const v256u32 passMask32[4] = {
+			_mm256_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm256_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm256_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src0, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src1, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src2, evy16), alphaBits) );
+		_mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(colorop_vec.decrease<OUTPUTFORMAT>(src3, evy16), alphaBits) );
+	}
+
+	const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID);
+	_mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
_mm256_set1_epi8(compInfo.renderState.blendEVB) : _mm256_set1_epi16(compInfo.renderState.blendEVB); + + if (LAYERTYPE == GPULayerType_OBJ) + { + const v256u8 isObjTranslucentMask = _mm256_and_si256( dstTargetBlendEnableMask, _mm256_or_si256(_mm256_cmpeq_epi8(spriteMode, _mm256_set1_epi8(OBJMode_Transparent)), _mm256_cmpeq_epi8(spriteMode, _mm256_set1_epi8(OBJMode_Bitmap))) ); + forceDstTargetBlendMask = isObjTranslucentMask; + + const v256u8 spriteAlphaMask = _mm256_andnot_si256(_mm256_cmpeq_epi8(spriteAlpha, _mm256_set1_epi8(0xFF)), isObjTranslucentMask); + eva_vec256 = _mm256_blendv_epi8(eva_vec256, spriteAlpha, spriteAlphaMask); + evb_vec256 = _mm256_blendv_epi8(evb_vec256, _mm256_sub_epi8(_mm256_set1_epi8(16), spriteAlpha), spriteAlphaMask); + } + + // ---------- + + __m256i tmpSrc[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = src0; + tmpSrc[1] = src1; + tmpSrc[2] = _mm256_setzero_si256(); + tmpSrc[3] = _mm256_setzero_si256(); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555XTo666X_AVX2(src0, tmpSrc[0], tmpSrc[1]); + ColorspaceConvert555XTo666X_AVX2(src1, tmpSrc[2], tmpSrc[3]); + } + else + { + ColorspaceConvert555XTo888X_AVX2(src0, tmpSrc[0], tmpSrc[1]); + ColorspaceConvert555XTo888X_AVX2(src1, tmpSrc[2], tmpSrc[3]); + } + + switch (compInfo.renderState.colorEffect) + { + case ColorEffect_IncreaseBrightness: + { + const v256u8 brightnessMask8 = _mm256_andnot_si256( forceDstTargetBlendMask, _mm256_and_si256(srcEffectEnableMask, _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_IncreaseBrightness))) ); + const int brightnessUpMaskValue = _mm256_movemask_epi8(brightnessMask8); + + if (brightnessUpMaskValue != 0x00000000) + { + const v256u8 brightnessMask8pre = _mm256_permute4x64_epi64(brightnessMask8, 0xD8); + const v256u16 brightnessMask16[2] = { + _mm256_unpacklo_epi8(brightnessMask8pre, brightnessMask8pre), + _mm256_unpackhi_epi8(brightnessMask8pre, brightnessMask8pre) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v256u16 brightnessMask16pre[2] = { + _mm256_permute4x64_epi64(brightnessMask16[0], 0xD8), + _mm256_permute4x64_epi64(brightnessMask16[1], 0xD8) + }; + const v256u32 brightnessMask32[4] = { + _mm256_unpacklo_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpackhi_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpacklo_epi16(brightnessMask16pre[1], brightnessMask16pre[1]), + _mm256_unpackhi_epi16(brightnessMask16pre[1], brightnessMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm256_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm256_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + } + } + break; + } + + case ColorEffect_DecreaseBrightness: + { + const v256u8 brightnessMask8 = _mm256_andnot_si256( forceDstTargetBlendMask, _mm256_and_si256(srcEffectEnableMask, _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_DecreaseBrightness))) ); + const int brightnessDownMaskValue = _mm256_movemask_epi8(brightnessMask8); + + 
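+				// _mm256_movemask_epi8() gathers the most significant bit of each of the 32
+				// bytes into a single 32-bit integer, so one scalar compare can reject the
+				// whole vector when no lane wants the effect. As a rough scalar sketch of
+				// the test below (the .u8[] lane notation is illustrative, not a real member):
+				//
+				//     bool anyLaneSelected = false;
+				//     for (size_t lane = 0; lane < 32; lane++)
+				//         anyLaneSelected = anyLaneSelected || ((brightnessMask8.u8[lane] & 0x80) != 0);
+				//     // Only run the masked brightness pass if anyLaneSelected is true.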
if (brightnessDownMaskValue != 0x00000000) + { + const v256u8 brightnessMask8pre = _mm256_permute4x64_epi64(brightnessMask8, 0xD8); + const v256u16 brightnessMask16[2] = { + _mm256_unpacklo_epi8(brightnessMask8pre, brightnessMask8pre), + _mm256_unpackhi_epi8(brightnessMask8pre, brightnessMask8pre) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v256u16 brightnessMask16pre[2] = { + _mm256_permute4x64_epi64(brightnessMask16[0], 0xD8), + _mm256_permute4x64_epi64(brightnessMask16[1], 0xD8) + }; + const v256u32 brightnessMask32[4] = { + _mm256_unpacklo_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpackhi_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpacklo_epi16(brightnessMask16pre[1], brightnessMask16pre[1]), + _mm256_unpackhi_epi16(brightnessMask16pre[1], brightnessMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm256_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm256_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + } + } + break; + } + + default: + break; + } + + // Render the pixel using the selected color effect. + const v256u8 blendMask8 = _mm256_or_si256( forceDstTargetBlendMask, _mm256_and_si256(_mm256_and_si256(srcEffectEnableMask, dstTargetBlendEnableMask), _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_Blend))) ); + const int blendMaskValue = _mm256_movemask_epi8(blendMask8); + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v256u16 dst16[2] = { + _mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0), + _mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1) + }; + + if (blendMaskValue != 0x00000000) + { + const v256u8 blendMask8pre = _mm256_permute4x64_epi64(blendMask8, 0xD8); + const v256u16 blendMask16[2] = { + _mm256_unpacklo_epi8(blendMask8pre, blendMask8pre), + _mm256_unpackhi_epi8(blendMask8pre, blendMask8pre) + }; + + v256u16 blendSrc16[2]; + + switch (LAYERTYPE) + { + case GPULayerType_3D: + //blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); + //blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); + printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); + assert(false); + break; + + case GPULayerType_BG: + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec256, evb_vec256); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec256, evb_vec256); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. 
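+						// AVX2 unpack instructions interleave within each 128-bit lane rather
+						// than across the full 256-bit register, so the vector is first
+						// reordered with _mm256_permute4x64_epi64(x, 0xD8), which rearranges
+						// the 64-bit qwords as 0,2,1,3; the unpacklo/unpackhi pair below then
+						// yields the widened values in ascending pixel order. The intended
+						// result, as a scalar sketch (lane notation is illustrative only):
+						//
+						//     for (size_t px = 0; px < 32; px++)
+						//         tempEVA_wide.u16[px] = (u16)eva_vec256.u8[px]; // zero-extend u8 -> u16
+						//
+						// where tempEVA_wide stands for the tempEVA[0]/tempEVA[1] pair.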
+						const v256u8 eva_pre = _mm256_permute4x64_epi64(eva_vec256, 0xD8);
+						const v256u16 tempEVA[2] = {
+							_mm256_unpacklo_epi8(eva_pre, _mm256_setzero_si256()),
+							_mm256_unpackhi_epi8(eva_pre, _mm256_setzero_si256())
+						};
+
+						const v256u8 evb_pre = _mm256_permute4x64_epi64(evb_vec256, 0xD8);
+						const v256u16 tempEVB[2] = {
+							_mm256_unpacklo_epi8(evb_pre, _mm256_setzero_si256()),
+							_mm256_unpackhi_epi8(evb_pre, _mm256_setzero_si256())
+						};
+
+						blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]);
+						blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]);
+						break;
+					}
+				}
+
+				tmpSrc[0] = _mm256_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
+				tmpSrc[1] = _mm256_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
+			}
+
+			// Store the final colors.
+			const v256u8 passMask8pre = _mm256_permute4x64_epi64(passMask8, 0xD8);
+			const v256u16 passMask16[2] = {
+				_mm256_unpacklo_epi8(passMask8pre, passMask8pre),
+				_mm256_unpackhi_epi8(passMask8pre, passMask8pre)
+			};
+
+			const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+			_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(tmpSrc[0], alphaBits), passMask16[0]) );
+			_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(tmpSrc[1], alphaBits), passMask16[1]) );
+		}
+		else
+		{
+			if (blendMaskValue != 0x00000000)
+			{
+				const v256u8 blendMask8pre = _mm256_permute4x64_epi64(blendMask8, 0xD8);
+				const v256u16 blendMask16[2] = {
+					_mm256_unpacklo_epi8(blendMask8pre, blendMask8pre),
+					_mm256_unpackhi_epi8(blendMask8pre, blendMask8pre)
+				};
+
+				const v256u32 dst32[4] = {
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 0),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 1),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 2),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 3)
+				};
+
+				v256u32 blendSrc32[4];
+
+				switch (LAYERTYPE)
+				{
+					case GPULayerType_3D:
+						//blendSrc32[0] = colorop_vec.blend3D(src0, dst32[0]);
+						//blendSrc32[1] = colorop_vec.blend3D(src1, dst32[1]);
+						//blendSrc32[2] = colorop_vec.blend3D(src2, dst32[2]);
+						//blendSrc32[3] = colorop_vec.blend3D(src3, dst32[3]);
+						printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n");
+						assert(false);
+						break;
+
+					case GPULayerType_BG:
+						blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec256, evb_vec256);
+						blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec256, evb_vec256);
+						blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec256, evb_vec256);
+						blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec256, evb_vec256);
+						break;
+
+					case GPULayerType_OBJ:
+					{
+						// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+						//
+						// Note that we are sending only 8 colors for each colorop_vec.blend() call, and so we are only
+						// going to send the 8 corresponding EVA/EVB values as well. In this case, each individual
+						// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
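+						// A scalar sketch of that mirrored expansion (lane notation is
+						// illustrative only):
+						//
+						//     for (size_t px = 0; px < 32; px++)
+						//     {
+						//         tempEVA_wide.u16[(2 * px) + 0] = (u16)eva_vec256.u8[px];
+						//         tempEVA_wide.u16[(2 * px) + 1] = (u16)eva_vec256.u8[px];
+						//     }
+						//
+						// so that each 32-bit color sees its own EVA/EVB value in both of the
+						// 16-bit component pairs that colorop_vec.blend() operates on.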
+ const v256u8 eva_pre = _mm256_permute4x64_epi64(eva_vec256, 0xD8); + v256u16 tempBlendLo = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(eva_pre, eva_pre), 0xD8); + v256u16 tempBlendHi = _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(eva_pre, eva_pre), 0xD8); + + const v256u16 tempEVA[4] = { + _mm256_unpacklo_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(tempBlendHi, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendHi, _mm256_setzero_si256()) + }; + + const v256u8 evb_pre = _mm256_permute4x64_epi64(evb_vec256, 0xD8); + tempBlendLo = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(evb_pre, evb_pre), 0xD8); + tempBlendHi = _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(evb_pre, evb_pre), 0xD8); + + const v256u16 tempEVB[4] = { + _mm256_unpacklo_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(tempBlendHi, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendHi, _mm256_setzero_si256()) + }; + + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); + break; + } + } + + const v256u16 blendMask16pre[2] = { + _mm256_permute4x64_epi64(blendMask16[0], 0xD8), + _mm256_permute4x64_epi64(blendMask16[1], 0xD8) + }; + const v256u32 blendMask32[4] = { + _mm256_unpacklo_epi16(blendMask16pre[0], blendMask16pre[0]), + _mm256_unpackhi_epi16(blendMask16pre[0], blendMask16pre[0]), + _mm256_unpacklo_epi16(blendMask16pre[1], blendMask16pre[1]), + _mm256_unpackhi_epi16(blendMask16pre[1], blendMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); + tmpSrc[1] = _mm256_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); + tmpSrc[2] = _mm256_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); + tmpSrc[3] = _mm256_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); + } + + // Store the final colors. + const v256u8 passMask8pre = _mm256_permute4x64_epi64(passMask8, 0xD8); + const v256u16 passMask16pre[2] = { + _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(passMask8pre, passMask8pre), 0xD8 ), + _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(passMask8pre, passMask8pre), 0xD8 ) + }; + + const v256u32 passMask32[4] = { + _mm256_unpacklo_epi16(passMask16pre[0], passMask16pre[0]), + _mm256_unpackhi_epi16(passMask16pre[0], passMask16pre[0]), + _mm256_unpacklo_epi16(passMask16pre[1], passMask16pre[1]), + _mm256_unpackhi_epi16(passMask16pre[1], passMask16pre[1]) + }; + + const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(tmpSrc[0], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(tmpSrc[1], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(tmpSrc[2], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(tmpSrc[3], alphaBits) ); + } +} + +template +FORCEINLINE void PixelOperation_AVX2::_unknownEffectMask32(GPUEngineCompositorInfo &compInfo, + const v256u8 &passMask8, + const v256u16 &evy16, + const v256u8 &srcLayerID, + const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0, + const v256u8 &srcEffectEnableMask, + const v256u8 &dstBlendEnableMaskLUT, + const v256u8 &enableColorEffectMask, + const v256u8 &spriteAlpha, + const v256u8 &spriteMode) const +{ + const v256u8 dstLayerID = _mm256_load_si256((v256u8 *)compInfo.target.lineLayerID); + _mm256_store_si256( (v256u8 *)compInfo.target.lineLayerID, _mm256_blendv_epi8(dstLayerID, srcLayerID, passMask8) ); + + v256u8 dstTargetBlendEnableMask = _mm256_shuffle_epi8(dstBlendEnableMaskLUT, dstLayerID); + dstTargetBlendEnableMask = _mm256_andnot_si256( _mm256_cmpeq_epi8(dstLayerID, srcLayerID), dstTargetBlendEnableMask ); + + // Select the color effect based on the BLDCNT target flags. + const v256u8 colorEffectMask = _mm256_blendv_epi8(_mm256_set1_epi8(ColorEffect_Disable), _mm256_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); + v256u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm256_setzero_si256(); + + // Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers. + // Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then + // convert EVA and EVB into vectors of uint16 right before we use them. + __m256i eva_vec256 = (LAYERTYPE == GPULayerType_OBJ) ? _mm256_set1_epi8(compInfo.renderState.blendEVA) : _mm256_set1_epi16(compInfo.renderState.blendEVA); + __m256i evb_vec256 = (LAYERTYPE == GPULayerType_OBJ) ? 
_mm256_set1_epi8(compInfo.renderState.blendEVB) : _mm256_set1_epi16(compInfo.renderState.blendEVB); + + if (LAYERTYPE == GPULayerType_OBJ) + { + const v256u8 isObjTranslucentMask = _mm256_and_si256( dstTargetBlendEnableMask, _mm256_or_si256(_mm256_cmpeq_epi8(spriteMode, _mm256_set1_epi8(OBJMode_Transparent)), _mm256_cmpeq_epi8(spriteMode, _mm256_set1_epi8(OBJMode_Bitmap))) ); + forceDstTargetBlendMask = isObjTranslucentMask; + + const v256u8 spriteAlphaMask = _mm256_andnot_si256(_mm256_cmpeq_epi8(spriteAlpha, _mm256_set1_epi8(0xFF)), isObjTranslucentMask); + eva_vec256 = _mm256_blendv_epi8(eva_vec256, spriteAlpha, spriteAlphaMask); + evb_vec256 = _mm256_blendv_epi8(evb_vec256, _mm256_sub_epi8(_mm256_set1_epi8(16), spriteAlpha), spriteAlphaMask); + } + + // ---------- + + __m256i tmpSrc[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = ColorspaceConvert6665To5551_AVX2(src0, src1); + tmpSrc[1] = ColorspaceConvert6665To5551_AVX2(src2, src3); + tmpSrc[2] = _mm256_setzero_si256(); + tmpSrc[3] = _mm256_setzero_si256(); + } + else + { + tmpSrc[0] = src0; + tmpSrc[1] = src1; + tmpSrc[2] = src2; + tmpSrc[3] = src3; + } + + switch (compInfo.renderState.colorEffect) + { + case ColorEffect_IncreaseBrightness: + { + const v256u8 brightnessMask8 = _mm256_andnot_si256( forceDstTargetBlendMask, _mm256_and_si256(srcEffectEnableMask, _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_IncreaseBrightness))) ); + const int brightnessUpMaskValue = _mm256_movemask_epi8(brightnessMask8); + + if (brightnessUpMaskValue != 0x00000000) + { + const v256u8 brightnessMask8pre = _mm256_permute4x64_epi64(brightnessMask8, 0xD8); + const v256u16 brightnessMask16[2] = { + _mm256_unpacklo_epi8(brightnessMask8pre, brightnessMask8pre), + _mm256_unpackhi_epi8(brightnessMask8pre, brightnessMask8pre) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v256u16 brightnessMask16pre[2] = { + _mm256_permute4x64_epi64(brightnessMask16[0], 0xD8), + _mm256_permute4x64_epi64(brightnessMask16[1], 0xD8) + }; + const v256u32 brightnessMask32[4] = { + _mm256_unpacklo_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpackhi_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpacklo_epi16(brightnessMask16pre[1], brightnessMask16pre[1]), + _mm256_unpackhi_epi16(brightnessMask16pre[1], brightnessMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm256_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm256_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + } + } + break; + } + + case ColorEffect_DecreaseBrightness: + { + const v256u8 brightnessMask8 = _mm256_andnot_si256( forceDstTargetBlendMask, _mm256_and_si256(srcEffectEnableMask, _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_DecreaseBrightness))) ); + const int brightnessDownMaskValue = _mm256_movemask_epi8(brightnessMask8); + + if (brightnessDownMaskValue != 0x00000000) + { + const v256u8 brightnessMask8pre = _mm256_permute4x64_epi64(brightnessMask8, 0xD8); + const v256u16 
brightnessMask16[2] = { + _mm256_unpacklo_epi8(brightnessMask8pre, brightnessMask8pre), + _mm256_unpackhi_epi8(brightnessMask8pre, brightnessMask8pre) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v256u16 brightnessMask16pre[2] = { + _mm256_permute4x64_epi64(brightnessMask16[0], 0xD8), + _mm256_permute4x64_epi64(brightnessMask16[1], 0xD8) + }; + const v256u32 brightnessMask32[4] = { + _mm256_unpacklo_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpackhi_epi16(brightnessMask16pre[0], brightnessMask16pre[0]), + _mm256_unpacklo_epi16(brightnessMask16pre[1], brightnessMask16pre[1]), + _mm256_unpackhi_epi16(brightnessMask16pre[1], brightnessMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm256_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm256_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm256_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + } + } + break; + } + + default: + break; + } + + // Render the pixel using the selected color effect. + const v256u8 blendMask8 = _mm256_or_si256( forceDstTargetBlendMask, _mm256_and_si256(_mm256_and_si256(srcEffectEnableMask, dstTargetBlendEnableMask), _mm256_cmpeq_epi8(colorEffectMask, _mm256_set1_epi8(ColorEffect_Blend))) ); + const int blendMaskValue = _mm256_movemask_epi8(blendMask8); + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v256u16 dst16[2] = { + _mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 0), + _mm256_load_si256((v256u16 *)compInfo.target.lineColor16 + 1) + }; + + if (blendMaskValue != 0x00000000) + { + const v256u8 blendMask8pre = _mm256_permute4x64_epi64(blendMask8, 0xD8); + const v256u16 blendMask16[2] = { + _mm256_unpacklo_epi8(blendMask8pre, blendMask8pre), + _mm256_unpackhi_epi8(blendMask8pre, blendMask8pre) + }; + + v256u16 blendSrc16[2]; + + switch (LAYERTYPE) + { + case GPULayerType_3D: + blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); + blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); + break; + + case GPULayerType_BG: + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec256, evb_vec256); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec256, evb_vec256); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. 
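+						// As in _unknownEffectMask16(), translucent OBJ pixels may have
+						// substituted their own sprite alpha into individual EVA/EVB bytes
+						// above, so the widening to 16-bit lanes must be done per pixel
+						// rather than with a single _mm256_set1_epi16() broadcast.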
+						const v256u8 eva_pre = _mm256_permute4x64_epi64(eva_vec256, 0xD8);
+						const v256u16 tempEVA[2] = {
+							_mm256_unpacklo_epi8(eva_pre, _mm256_setzero_si256()),
+							_mm256_unpackhi_epi8(eva_pre, _mm256_setzero_si256())
+						};
+
+						const v256u8 evb_pre = _mm256_permute4x64_epi64(evb_vec256, 0xD8);
+						const v256u16 tempEVB[2] = {
+							_mm256_unpacklo_epi8(evb_pre, _mm256_setzero_si256()),
+							_mm256_unpackhi_epi8(evb_pre, _mm256_setzero_si256())
+						};
+
+						blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]);
+						blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]);
+						break;
+					}
+				}
+
+				tmpSrc[0] = _mm256_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
+				tmpSrc[1] = _mm256_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
+			}
+
+			// Store the final colors.
+			const v256u8 passMask8pre = _mm256_permute4x64_epi64(passMask8, 0xD8);
+			const v256u16 passMask16[2] = {
+				_mm256_unpacklo_epi8(passMask8pre, passMask8pre),
+				_mm256_unpackhi_epi8(passMask8pre, passMask8pre)
+			};
+
+			const v256u16 alphaBits = _mm256_set1_epi16(0x8000);
+			_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 0, _mm256_blendv_epi8(dst16[0], _mm256_or_si256(tmpSrc[0], alphaBits), passMask16[0]) );
+			_mm256_store_si256( (v256u16 *)compInfo.target.lineColor16 + 1, _mm256_blendv_epi8(dst16[1], _mm256_or_si256(tmpSrc[1], alphaBits), passMask16[1]) );
+		}
+		else
+		{
+			if (blendMaskValue != 0x00000000)
+			{
+				const v256u8 blendMask8pre = _mm256_permute4x64_epi64(blendMask8, 0xD8);
+				const v256u16 blendMask16[2] = {
+					_mm256_unpacklo_epi8(blendMask8pre, blendMask8pre),
+					_mm256_unpackhi_epi8(blendMask8pre, blendMask8pre)
+				};
+
+				const v256u32 dst32[4] = {
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 0),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 1),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 2),
+					_mm256_load_si256((v256u32 *)compInfo.target.lineColor32 + 3)
+				};
+
+				v256u32 blendSrc32[4];
+
+				switch (LAYERTYPE)
+				{
+					case GPULayerType_3D:
+						blendSrc32[0] = colorop_vec.blend3D(tmpSrc[0], dst32[0]);
+						blendSrc32[1] = colorop_vec.blend3D(tmpSrc[1], dst32[1]);
+						blendSrc32[2] = colorop_vec.blend3D(tmpSrc[2], dst32[2]);
+						blendSrc32[3] = colorop_vec.blend3D(tmpSrc[3], dst32[3]);
+						break;
+
+					case GPULayerType_BG:
+						blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec256, evb_vec256);
+						blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec256, evb_vec256);
+						blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec256, evb_vec256);
+						blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec256, evb_vec256);
+						break;
+
+					case GPULayerType_OBJ:
+					{
+						// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+						//
+						// Note that we are sending only 8 colors for each colorop_vec.blend() call, and so we are only
+						// going to send the 8 corresponding EVA/EVB values as well. In this case, each individual
+						// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
+ const v256u8 eva_pre = _mm256_permute4x64_epi64(eva_vec256, 0xD8); + v256u16 tempBlendLo = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(eva_pre, eva_pre), 0xD8 ); + v256u16 tempBlendHi = _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(eva_pre, eva_pre), 0xD8 ); + + const v256u16 tempEVA[4] = { + _mm256_unpacklo_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(tempBlendHi, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendHi, _mm256_setzero_si256()) + }; + + const v256u8 evb_pre = _mm256_permute4x64_epi64(evb_vec256, 0xD8); + tempBlendLo = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(evb_pre, evb_pre), 0xD8 ); + tempBlendHi = _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(evb_pre, evb_pre), 0xD8 ); + + const v256u16 tempEVB[4] = { + _mm256_unpacklo_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendLo, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(tempBlendHi, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(tempBlendHi, _mm256_setzero_si256()) + }; + + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); + break; + } + } + + const v256u16 blendMask16pre[2] = { + _mm256_permute4x64_epi64(blendMask16[0], 0xD8), + _mm256_permute4x64_epi64(blendMask16[1], 0xD8) + }; + const v256u32 blendMask32[4] = { + _mm256_unpacklo_epi16(blendMask16pre[0], blendMask16pre[0]), + _mm256_unpackhi_epi16(blendMask16pre[0], blendMask16pre[0]), + _mm256_unpacklo_epi16(blendMask16pre[1], blendMask16pre[1]), + _mm256_unpackhi_epi16(blendMask16pre[1], blendMask16pre[1]) + }; + + tmpSrc[0] = _mm256_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); + tmpSrc[1] = _mm256_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); + tmpSrc[2] = _mm256_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); + tmpSrc[3] = _mm256_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); + } + + // Store the final colors. + const v256u8 passMask8pre = _mm256_permute4x64_epi64(passMask8, 0xD8); + const v256u16 passMask16pre[2] = { + _mm256_permute4x64_epi64( _mm256_unpacklo_epi8(passMask8pre, passMask8pre), 0xD8 ), + _mm256_permute4x64_epi64( _mm256_unpackhi_epi8(passMask8pre, passMask8pre), 0xD8 ) + }; + + const v256u32 passMask32[4] = { + _mm256_unpacklo_epi16(passMask16pre[0], passMask16pre[0]), + _mm256_unpackhi_epi16(passMask16pre[0], passMask16pre[0]), + _mm256_unpacklo_epi16(passMask16pre[1], passMask16pre[1]), + _mm256_unpackhi_epi16(passMask16pre[1], passMask16pre[1]) + }; + + const v256u32 alphaBits = _mm256_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 0), passMask32[0], _mm256_or_si256(tmpSrc[0], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 1), passMask32[1], _mm256_or_si256(tmpSrc[1], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 2), passMask32[2], _mm256_or_si256(tmpSrc[2], alphaBits) ); + _mm256_maskstore_epi32( (int *)compInfo.target.lineColor32 + ((sizeof(v256u32)/sizeof(int)) * 3), passMask32[3], _mm256_or_si256(tmpSrc[3], alphaBits) ); + } +} + +template +FORCEINLINE void PixelOperation_AVX2::Composite16(GPUEngineCompositorInfo &compInfo, + const bool didAllPixelsPass, + const v256u8 &passMask8, + const v256u16 &evy16, + const v256u8 &srcLayerID, + const v256u16 &src1, const v256u16 &src0, + const v256u8 &srcEffectEnableMask, + const v256u8 &dstBlendEnableMaskLUT, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr) const +{ + if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass) + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copy16(compInfo, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copy16(compInfo, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUp16(compInfo, evy16, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDown16(compInfo, evy16, srcLayerID, src1, src0); + break; + + default: + break; + } + } + else + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUpMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDownMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0); + break; + + default: + { + const v256u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm256_load_si256((v256u8 *)enableColorEffectPtr) : _mm256_set1_epi8(0xFF); + const v256u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm256_load_si256((v256u8 *)sprAlphaPtr) : _mm256_setzero_si256(); + const v256u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? 
_mm256_load_si256((v256u8 *)sprModePtr) : _mm256_setzero_si256(); + + this->_unknownEffectMask16(compInfo, + passMask8, + evy16, + srcLayerID, + src1, src0, + srcEffectEnableMask, + dstBlendEnableMaskLUT, + enableColorEffectMask, + spriteAlpha, + spriteMode); + break; + } + } + } +} + +template +FORCEINLINE void PixelOperation_AVX2::Composite32(GPUEngineCompositorInfo &compInfo, + const bool didAllPixelsPass, + const v256u8 &passMask8, + const v256u16 &evy16, + const v256u8 &srcLayerID, + const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0, + const v256u8 &srcEffectEnableMask, + const v256u8 &dstBlendEnableMaskLUT, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr) const +{ + if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass) + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUp32(compInfo, evy16, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDown32(compInfo, evy16, srcLayerID, src3, src2, src1, src0); + break; + + default: + break; + } + } + else + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUpMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDownMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0); + break; + + default: + { + const v256u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm256_load_si256((v256u8 *)enableColorEffectPtr) : _mm256_set1_epi8(0xFF); + const v256u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm256_load_si256((v256u8 *)sprAlphaPtr) : _mm256_setzero_si256(); + const v256u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? 
_mm256_load_si256((v256u8 *)sprModePtr) : _mm256_setzero_si256(); + + this->_unknownEffectMask32(compInfo, + passMask8, + evy16, + srcLayerID, + src3, src2, src1, src0, + srcEffectEnableMask, + dstBlendEnableMaskLUT, + enableColorEffectMask, + spriteAlpha, + spriteMode); + break; + } + } + } +} + +template +void GPUEngineBase::_MosaicLine(GPUEngineCompositorInfo &compInfo) +{ + const u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID]; + + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=sizeof(v256u16)) + { + const v256u16 dstColor16[2] = { + _mm256_load_si256((v256u16 *)(this->_deferredColorNative + x) + 0), + _mm256_load_si256((v256u16 *)(this->_deferredColorNative + x) + 1) + }; + + if (ISFIRSTLINE) + { + const v256u8 indexVec = _mm256_load_si256((v256u8 *)(this->_deferredIndexNative + x)); + const v256u8 idxMask8 = _mm256_permute4x64_epi64( _mm256_cmpeq_epi8(indexVec, _mm256_setzero_si256()), 0xD8 ); + const v256u16 idxMask16[2] = { + _mm256_unpacklo_epi8(idxMask8, idxMask8), + _mm256_unpackhi_epi8(idxMask8, idxMask8) + }; + + const v256u16 mosaicColor16[2] = { + _mm256_blendv_epi8(_mm256_and_si256(dstColor16[0], _mm256_set1_epi16(0x7FFF)), _mm256_set1_epi16(0xFFFF), idxMask16[0]), + _mm256_blendv_epi8(_mm256_and_si256(dstColor16[1], _mm256_set1_epi16(0x7FFF)), _mm256_set1_epi16(0xFFFF), idxMask16[1]) + }; + + const v256u16 mosaicSetColorMask8 = _mm256_permute4x64_epi64( _mm256_cmpeq_epi16(_mm256_loadu_si256((v256u8 *)(compInfo.renderState.mosaicWidthBG->begin + x)), _mm256_setzero_si256()), 0xD8 ); + const v256u16 mosaicSetColorMask16[2] = { + _mm256_unpacklo_epi8(mosaicSetColorMask8, mosaicSetColorMask8), + _mm256_unpackhi_epi8(mosaicSetColorMask8, mosaicSetColorMask8) + }; + + __m256i mosaicColorOut[2]; + mosaicColorOut[0] = _mm256_blendv_epi8(mosaicColor16[0], _mm256_loadu_si256((v256u16 *)(mosaicColorBG + x) + 0), mosaicSetColorMask16[0]); + mosaicColorOut[1] = _mm256_blendv_epi8(mosaicColor16[1], _mm256_loadu_si256((v256u16 *)(mosaicColorBG + x) + 1), mosaicSetColorMask16[1]); + + _mm256_storeu_si256((v256u16 *)(mosaicColorBG + x) + 0, mosaicColorOut[0]); + _mm256_storeu_si256((v256u16 *)(mosaicColorBG + x) + 1, mosaicColorOut[1]); + } + + const v256u32 outColor32idx[4] = { + _mm256_loadu_si256((v256u32 *)(compInfo.renderState.mosaicWidthBG->trunc32 + x) + 0), + _mm256_loadu_si256((v256u32 *)(compInfo.renderState.mosaicWidthBG->trunc32 + x) + 1), + _mm256_loadu_si256((v256u32 *)(compInfo.renderState.mosaicWidthBG->trunc32 + x) + 2), + _mm256_loadu_si256((v256u32 *)(compInfo.renderState.mosaicWidthBG->trunc32 + x) + 3) + }; + + const v256u16 outColor32[4] = { + _mm256_and_si256( _mm256_i32gather_epi32((int const *)mosaicColorBG, outColor32idx[0], sizeof(u16)), _mm256_set1_epi32(0x0000FFFF) ), + _mm256_and_si256( _mm256_i32gather_epi32((int const *)mosaicColorBG, outColor32idx[1], sizeof(u16)), _mm256_set1_epi32(0x0000FFFF) ), + _mm256_and_si256( _mm256_i32gather_epi32((int const *)mosaicColorBG, outColor32idx[2], sizeof(u16)), _mm256_set1_epi32(0x0000FFFF) ), + _mm256_and_si256( _mm256_i32gather_epi32((int const *)mosaicColorBG, outColor32idx[3], sizeof(u16)), _mm256_set1_epi32(0x0000FFFF) ) + }; + + const v256u16 outColor16[2] = { + _mm256_permute4x64_epi64( _mm256_packus_epi32(outColor32[0], outColor32[1]), 0xD8 ), + _mm256_permute4x64_epi64( _mm256_packus_epi32(outColor32[2], outColor32[3]), 0xD8 ) + }; + + const v256u16 writeColorMask16[2] = { + _mm256_cmpeq_epi16(outColor16[0], _mm256_set1_epi16(0xFFFF)), + 
_mm256_cmpeq_epi16(outColor16[1], _mm256_set1_epi16(0xFFFF)) + }; + + _mm256_store_si256( (v256u16 *)(this->_deferredColorNative + x) + 0, _mm256_blendv_epi8(outColor16[0], dstColor16[0], writeColorMask16[0]) ); + _mm256_store_si256( (v256u16 *)(this->_deferredColorNative + x) + 1, _mm256_blendv_epi8(outColor16[1], dstColor16[1], writeColorMask16[1]) ); + + } +} + +template +void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32) +{ + static const size_t step = sizeof(v256u8); + + const bool isUsingSrc32 = (srcColorNative32 != NULL); + const v256u16 evy16 = _mm256_set1_epi16(compInfo.renderState.blendEVY); + const v256u8 srcLayerID = _mm256_set1_epi8(compInfo.renderState.selectedLayerID); + const v256u8 srcEffectEnableMask = _mm256_set1_epi8(compInfo.renderState.srcEffectEnable[GPULayerID_OBJ]); + const v256u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm256_load_si256((v256u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm256_setzero_si256(); + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=step, srcColorNative16+=step, srcColorNative32+=step, compInfo.target.xNative+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + v256u8 passMask8; + int passMaskValue; + bool didAllPixelsPass; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestNative[GPULayerID_OBJ] + i)); + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = _mm256_movemask_epi8(passMask8); + if (passMaskValue == 0) + { + continue; + } + + didAllPixelsPass = (passMaskValue == 0xFFFFFFFF); + } + else + { + passMask8 = _mm256_set1_epi8(0xFF); + passMaskValue = 0xFFFFFFFF; + didAllPixelsPass = true; + } + + if (isUsingSrc32) + { + const v256u32 src[4] = { + _mm256_load_si256((v256u32 *)srcColorNative32 + 0), + _mm256_load_si256((v256u32 *)srcColorNative32 + 1), + _mm256_load_si256((v256u32 *)srcColorNative32 + 2), + _mm256_load_si256((v256u32 *)srcColorNative32 + 3) + }; + + pixelop_vec.Composite32(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src[3], src[2], src[1], src[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectNative[GPULayerID_OBJ] + i, + this->_sprAlpha[compInfo.line.indexNative] + i, + this->_sprType[compInfo.line.indexNative] + i); + } + else + { + const v256u16 src[2] = { + _mm256_load_si256((v256u16 *)srcColorNative16 + 0), + _mm256_load_si256((v256u16 *)srcColorNative16 + 1) + }; + + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src[1], src[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectNative[GPULayerID_OBJ] + i, + this->_sprAlpha[compInfo.line.indexNative] + i, + this->_sprType[compInfo.line.indexNative] + i); + } + } +} + +template +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +{ + static const size_t step = sizeof(v256u8); + + const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v256u16 evy16 = _mm256_set1_epi16(compInfo.renderState.blendEVY); + const v256u8 srcLayerID = _mm256_set1_epi8(compInfo.renderState.selectedLayerID); + const v256u8 srcEffectEnableMask = 
_mm256_set1_epi8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]); + const v256u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm256_load_si256((v256u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm256_setzero_si256(); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v256u8 passMask8; + int passMaskValue; + bool didAllPixelsPass; + + if (WILLPERFORMWINDOWTEST || (LAYERTYPE == GPULayerType_BG)) + { + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + } + + if (LAYERTYPE == GPULayerType_BG) + { + // Do the index test. Pixels with an index value of 0 are rejected. + const v256u8 idxPassMask8 = _mm256_cmpeq_epi8(_mm256_load_si256((v256u8 *)(srcIndexCustom + compInfo.target.xCustom)), _mm256_setzero_si256()); + + if (WILLPERFORMWINDOWTEST) + { + passMask8 = _mm256_andnot_si256(idxPassMask8, passMask8); + } + else + { + passMask8 = _mm256_xor_si256(idxPassMask8, _mm256_set1_epi32(0xFFFFFFFF)); + } + } + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = _mm256_movemask_epi8(passMask8); + if (passMaskValue == 0) + { + continue; + } + + didAllPixelsPass = (passMaskValue == 0xFFFFFFFF); + } + else + { + passMask8 = _mm256_set1_epi8(0xFF); + passMaskValue = 0xFFFFFFFF; + didAllPixelsPass = true; + } + + const v256u16 src[2] = { + _mm256_load_si256((v256u16 *)(srcColorCustom16 + compInfo.target.xCustom) + 0), + _mm256_load_si256((v256u16 *)(srcColorCustom16 + compInfo.target.xCustom) + 1) + }; + + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src[1], src[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + } + + return i; +} + +template +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +{ + static const size_t step = sizeof(v256u8); + + const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v256u16 evy16 = _mm256_set1_epi16(compInfo.renderState.blendEVY); + const v256u8 srcLayerID = _mm256_set1_epi8(compInfo.renderState.selectedLayerID); + const v256u8 srcEffectEnableMask = _mm256_set1_epi8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]); + const v256u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm256_load_si256((v256u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm256_setzero_si256(); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v256u8 passMask8; + int passMaskValue; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. 
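+			// The precomputed window test results use one byte per pixel: 0xFF for a
+			// pixel that passed and 0x00 for one that failed. That convention lets the
+			// mask feed _mm256_blendv_epi8() (which keys off each byte's high bit) and
+			// _mm256_movemask_epi8() directly, with no extra comparison needed here.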
+ passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = _mm256_movemask_epi8(passMask8); + if (passMaskValue == 0) + { + continue; + } + } + else + { + passMask8 = _mm256_set1_epi8(0xFF); + passMaskValue = 0xFFFFFFFF; + } + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + case NDSColorFormat_BGR666_Rev: + { + const v256u16 src16[2] = { + _mm256_load_si256((v256u16 *)((u16 *)vramColorPtr + i) + 0), + _mm256_load_si256((v256u16 *)((u16 *)vramColorPtr + i) + 1) + }; + + if (LAYERTYPE != GPULayerType_OBJ) + { + v256u8 tempPassMask = _mm256_packus_epi16( _mm256_srli_epi16(src16[0], 15), _mm256_srli_epi16(src16[1], 15) ); + tempPassMask = _mm256_permute4x64_epi64(tempPassMask, 0xD8); + tempPassMask = _mm256_cmpeq_epi8(tempPassMask, _mm256_set1_epi8(1)); + + passMask8 = _mm256_and_si256(tempPassMask, passMask8); + passMaskValue = _mm256_movemask_epi8(passMask8); + } + + // If none of the pixels within the vector pass, then reject them all at once. + if (passMaskValue == 0) + { + continue; + } + + // Write out the pixels. + const bool didAllPixelsPass = (passMaskValue == 0xFFFFFFFF); + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src16[1], src16[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + break; + } + + case NDSColorFormat_BGR888_Rev: + { + const v256u32 src32[4] = { + _mm256_load_si256((v256u32 *)((FragmentColor *)vramColorPtr + i) + 0), + _mm256_load_si256((v256u32 *)((FragmentColor *)vramColorPtr + i) + 1), + _mm256_load_si256((v256u32 *)((FragmentColor *)vramColorPtr + i) + 2), + _mm256_load_si256((v256u32 *)((FragmentColor *)vramColorPtr + i) + 3) + }; + + if (LAYERTYPE != GPULayerType_OBJ) + { + v256u8 tempPassMask = _mm256_packus_epi16( _mm256_permute4x64_epi64( _mm256_packus_epi32(_mm256_srli_epi32(src32[0], 24), _mm256_srli_epi32(src32[1], 24)), 0xD8 ), + _mm256_permute4x64_epi64( _mm256_packus_epi32(_mm256_srli_epi32(src32[2], 24), _mm256_srli_epi32(src32[3], 24)), 0xD8 ) ); + tempPassMask = _mm256_permute4x64_epi64(tempPassMask, 0xD8); + tempPassMask = _mm256_cmpeq_epi8(tempPassMask, _mm256_setzero_si256()); + + passMask8 = _mm256_andnot_si256(tempPassMask, passMask8); + passMaskValue = _mm256_movemask_epi8(passMask8); + } + + // If none of the pixels within the vector pass, then reject them all at once. + if (passMaskValue == 0) + { + continue; + } + + // Write out the pixels. 
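+				// didAllPixelsPass lets Composite32() take its unmasked fast path (plain
+				// full-vector stores) for the fixed compositor modes; the unknown-effect
+				// path still honors passMask8 on a per-pixel basis.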
+ const bool didAllPixelsPass = (passMaskValue == 0xFFFFFFFF); + pixelop_vec.Composite32(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src32[3], src32[2], src32[1], src32[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + break; + } + } + } + + return i; +} + +template +size_t GPUEngineBase::_RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer, + size_t &frameX, size_t &spriteX, + u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab) +{ + size_t i = 0; + + static const size_t step = sizeof(v256u16); + const v256u8 prioVec8 = _mm256_set1_epi8(prio); + + const size_t ssePixCount = length - (length % step); + for (; i < ssePixCount; i+=step, spriteX+=step, frameX+=step) + { + const v256u8 prioTabVec8 = _mm256_loadu_si256((v256u8 *)(prioTab + frameX)); + const v256u16 color16Lo = _mm256_loadu_si256((v256u16 *)(vramBuffer + spriteX) + 0); + const v256u16 color16Hi = _mm256_loadu_si256((v256u16 *)(vramBuffer + spriteX) + 1); + + const v256u8 alphaCompare = _mm256_cmpeq_epi8( _mm256_permute4x64_epi64(_mm256_packus_epi16(_mm256_srli_epi16(color16Lo, 15), _mm256_srli_epi16(color16Hi, 15)), 0xD8), _mm256_set1_epi8(0x01) ); + const v256u8 prioCompare = _mm256_cmpgt_epi8(prioTabVec8, prioVec8); + + const v256u8 combinedCompare = _mm256_and_si256(prioCompare, alphaCompare); + const v256u8 combinedComparePre = _mm256_permute4x64_epi64(combinedCompare, 0xD8); + const v256u16 combinedLoCompare = _mm256_unpacklo_epi8(combinedComparePre, combinedComparePre); + const v256u16 combinedHiCompare = _mm256_unpackhi_epi8(combinedComparePre, combinedComparePre); + + _mm256_storeu_si256( (v256u16 *)(dst + frameX) + 0, _mm256_blendv_epi8(_mm256_loadu_si256((v256u16 *)(dst + frameX) + 0), color16Lo, combinedLoCompare) ); + _mm256_storeu_si256( (v256u16 *)(dst + frameX) + 1, _mm256_blendv_epi8(_mm256_loadu_si256((v256u16 *)(dst + frameX) + 1), color16Hi, combinedHiCompare) ); + _mm256_storeu_si256( (v256u8 *)(prioTab + frameX), _mm256_blendv_epi8(prioTabVec8, prioVec8, combinedCompare) ); + + if (!ISDEBUGRENDER) + { + _mm256_storeu_si256( (v256u8 *)(dst_alpha + frameX), _mm256_blendv_epi8(_mm256_loadu_si256((v256u8 *)(dst_alpha + frameX)), _mm256_set1_epi8(spriteAlpha + 1), combinedCompare) ); + _mm256_storeu_si256( (v256u8 *)(typeTab + frameX), _mm256_blendv_epi8(_mm256_loadu_si256((v256u8 *)(typeTab + frameX)), _mm256_set1_epi8(OBJMode_Bitmap), combinedCompare) ); + _mm256_storeu_si256( (v256u8 *)(this->_sprNum + frameX), _mm256_blendv_epi8(_mm256_loadu_si256((v256u8 *)(this->_sprNum + frameX)), _mm256_set1_epi8(spriteNum), combinedCompare) ); + } + } + + return i; +} + +void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative) +{ + const v256u8 *__restrict win0Ptr = (const v256u8 *__restrict)win0; + const v256u8 *__restrict win1Ptr = (const v256u8 *__restrict)win1; + const v256u8 *__restrict winObjPtr = (const v256u8 *__restrict)winObj; + + v256u8 *__restrict didPassWindowTestNativePtr = (v256u8 *__restrict)didPassWindowTestNative; + v256u8 *__restrict 
enableColorEffectNativePtr = (v256u8 *__restrict)enableColorEffectNative; + + v256u8 didPassWindowTest; + v256u8 enableColorEffect; + + v256u8 win0HandledMask; + v256u8 win1HandledMask; + v256u8 winOBJHandledMask; + v256u8 winOUTHandledMask; + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH/sizeof(v256u8); i++) + { + didPassWindowTest = _mm256_setzero_si256(); + enableColorEffect = _mm256_setzero_si256(); + + win0HandledMask = _mm256_setzero_si256(); + win1HandledMask = _mm256_setzero_si256(); + winOBJHandledMask = _mm256_setzero_si256(); + + // Window 0 has the highest priority, so always check this first. + if (win0Ptr != NULL) + { + const v256u8 win0Enable = _mm256_set1_epi8(compInfo.renderState.WIN0_enable[layerID]); + const v256u8 win0Effect = _mm256_set1_epi8(compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG]); + + win0HandledMask = _mm256_cmpeq_epi8(_mm256_load_si256(win0Ptr + i), _mm256_set1_epi8(1)); + didPassWindowTest = _mm256_and_si256(win0HandledMask, win0Enable); + enableColorEffect = _mm256_and_si256(win0HandledMask, win0Effect); + } + + // Window 1 has medium priority, and is checked after Window 0. + if (win1Ptr != NULL) + { + const v256u8 win1Enable = _mm256_set1_epi8(compInfo.renderState.WIN1_enable[layerID]); + const v256u8 win1Effect = _mm256_set1_epi8(compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG]); + + win1HandledMask = _mm256_andnot_si256(win0HandledMask, _mm256_cmpeq_epi8(_mm256_load_si256(win1Ptr + i), _mm256_set1_epi8(1))); + didPassWindowTest = _mm256_blendv_epi8(didPassWindowTest, win1Enable, win1HandledMask); + enableColorEffect = _mm256_blendv_epi8(enableColorEffect, win1Effect, win1HandledMask); + } + + // Window OBJ has low priority, and is checked after both Window 0 and Window 1. + if (winObjPtr != NULL) + { + const v256u8 winObjEnable = _mm256_set1_epi8(compInfo.renderState.WINOBJ_enable[layerID]); + const v256u8 winObjEffect = _mm256_set1_epi8(compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]); + + winOBJHandledMask = _mm256_andnot_si256( _mm256_or_si256(win0HandledMask, win1HandledMask), _mm256_cmpeq_epi8(_mm256_load_si256(winObjPtr + i), _mm256_set1_epi8(1)) ); + didPassWindowTest = _mm256_blendv_epi8(didPassWindowTest, winObjEnable, winOBJHandledMask); + enableColorEffect = _mm256_blendv_epi8(enableColorEffect, winObjEffect, winOBJHandledMask); + } + + // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. + // This has the lowest priority, and is always checked last. + const v256u8 winOutEnable = _mm256_set1_epi8(compInfo.renderState.WINOUT_enable[layerID]); + const v256u8 winOutEffect = _mm256_set1_epi8(compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]); + + winOUTHandledMask = _mm256_xor_si256( _mm256_or_si256(win0HandledMask, _mm256_or_si256(win1HandledMask, winOBJHandledMask)), _mm256_set1_epi32(0xFFFFFFFF) ); + didPassWindowTest = _mm256_blendv_epi8(didPassWindowTest, winOutEnable, winOUTHandledMask); + enableColorEffect = _mm256_blendv_epi8(enableColorEffect, winOutEffect, winOUTHandledMask); + + _mm256_store_si256(didPassWindowTestNativePtr + i, didPassWindowTest); + _mm256_store_si256(enableColorEffectNativePtr + i, enableColorEffect); + } +} + +template +size_t GPUEngineBase::_ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped) +{ + size_t i = 0; + + const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 
pixCount * sizeof(u32) / sizeof(v256u32) : pixCount * sizeof(u16) / sizeof(v256u16); + for (; i < vecCount; i++) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + v256u16 dstColor = _mm256_load_si256((v256u16 *)dst + i); + dstColor = colorop_vec.increase(dstColor, _mm256_set1_epi16(intensityClamped)); + dstColor = _mm256_or_si256(dstColor, _mm256_set1_epi16(0x8000)); + _mm256_store_si256((v256u16 *)dst + i, dstColor); + } + else + { + v256u32 dstColor = _mm256_load_si256((v256u32 *)dst + i); + dstColor = colorop_vec.increase(dstColor, _mm256_set1_epi16(intensityClamped)); + dstColor = _mm256_or_si256(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm256_set1_epi32(0x1F000000) : _mm256_set1_epi32(0xFF000000)); + _mm256_store_si256((v256u32 *)dst + i, dstColor); + } + } + + return (i * sizeof(__m256i)); +} + +template +size_t GPUEngineBase::_ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped) +{ + size_t i = 0; + + const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? pixCount * sizeof(u32) / sizeof(v256u32) : pixCount * sizeof(u16) / sizeof(v256u16); + for (; i < vecCount; i++) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + v256u16 dstColor = _mm256_load_si256((v256u16 *)dst + i); + dstColor = colorop_vec.decrease(dstColor, _mm256_set1_epi16(intensityClamped)); + dstColor = _mm256_or_si256(dstColor, _mm256_set1_epi16(0x8000)); + _mm256_store_si256((v256u16 *)dst + i, dstColor); + } + else + { + v256u32 dstColor = _mm256_load_si256((v256u32 *)dst + i); + dstColor = colorop_vec.decrease(dstColor, _mm256_set1_epi16(intensityClamped)); + dstColor = _mm256_or_si256(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm256_set1_epi32(0x1F000000) : _mm256_set1_epi32(0xFF000000)); + _mm256_store_si256((v256u32 *)dst + i, dstColor); + } + } + + return (i * sizeof(__m256i)); +} + +template +size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr) +{ + static const size_t step = sizeof(v256u32); + + const size_t vecPixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v256u16 evy16 = _mm256_set1_epi16(compInfo.renderState.blendEVY); + const v256u8 srcLayerID = _mm256_set1_epi8(compInfo.renderState.selectedLayerID); + const v256u8 srcEffectEnableMask = _mm256_set1_epi8(compInfo.renderState.srcEffectEnable[GPULayerID_BG0]); + const v256u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm256_load_si256((v256u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm256_setzero_si256(); + + size_t i = 0; + for (; i < vecPixCount; i+=step, srcLinePtr+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + // Determine which pixels pass by doing the window test and the alpha test. + v256u8 passMask8; + int passMaskValue; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[GPULayerID_BG0] + compInfo.target.xCustom)); + + // If none of the pixels within the vector pass, then reject them all at once. 
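+			// For the 3D layer, this is only the first of two rejection points: even if
+			// some pixels survive the window test, the alpha test below can still zero
+			// out the pass mask and skip the whole vector.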
+			// If none of the pixels within the vector pass, then reject them all at once.
+			passMaskValue = _mm256_movemask_epi8(passMask8);
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+		}
+		else
+		{
+			passMask8 = _mm256_set1_epi8(0xFF);
+			passMaskValue = 0xFFFFFFFF;
+		}
+
+		const v256u32 src[4] = {
+			_mm256_load_si256((v256u32 *)srcLinePtr + 0),
+			_mm256_load_si256((v256u32 *)srcLinePtr + 1),
+			_mm256_load_si256((v256u32 *)srcLinePtr + 2),
+			_mm256_load_si256((v256u32 *)srcLinePtr + 3)
+		};
+
+		// Do the alpha test. Pixels with an alpha value of 0 are rejected.
+		const v256u32 srcAlpha = _mm256_permute4x64_epi64( _mm256_packus_epi16( _mm256_permute4x64_epi64( _mm256_packus_epi32(_mm256_srli_epi32(src[0], 24), _mm256_srli_epi32(src[1], 24)), 0xD8 ),
+		                                                                        _mm256_permute4x64_epi64( _mm256_packus_epi32(_mm256_srli_epi32(src[2], 24), _mm256_srli_epi32(src[3], 24)), 0xD8 ) ), 0xD8 );
+
+		passMask8 = _mm256_andnot_si256(_mm256_cmpeq_epi8(srcAlpha, _mm256_setzero_si256()), passMask8);
+
+		// If none of the pixels within the vector pass, then reject them all at once.
+		passMaskValue = _mm256_movemask_epi8(passMask8);
+		if (passMaskValue == 0)
+		{
+			continue;
+		}
+
+		// Write out the pixels.
+		const bool didAllPixelsPass = (passMaskValue == 0xFFFFFFFF);
+		pixelop_vec.Composite32(compInfo,
+		                        didAllPixelsPass,
+		                        passMask8, evy16,
+		                        srcLayerID,
+		                        src[3], src[2], src[1], src[0],
+		                        srcEffectEnableMask,
+		                        dstBlendEnableMaskLUT,
+		                        this->_enableColorEffectCustom[GPULayerID_BG0] + compInfo.target.xCustom,
+		                        NULL,
+		                        NULL);
+	}
+
+	return i;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineA::_RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length)
+{
+	const v256u16 blendEVA_vec = _mm256_set1_epi16(blendEVA);
+	const v256u16 blendEVB_vec = _mm256_set1_epi16(blendEVB);
+	const v256u8 blendAB = _mm256_or_si256( blendEVA_vec, _mm256_slli_epi16(blendEVB_vec, 8) );
+
+	__m256i srcA_vec;
+	__m256i srcB_vec;
+	__m256i dstColor;
+
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? length * sizeof(u32) / sizeof(v256u32) : length * sizeof(u16) / sizeof(v256u16);
+	for (; i < vecCount; i++)
+	{
+		srcA_vec = _mm256_load_si256((__m256i *)srcA + i);
+		srcB_vec = _mm256_load_si256((__m256i *)srcB + i);
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			// Get color masks based on if the alpha value is 0. Colors with an alpha value
+			// equal to 0 are rejected.
+			v256u32 srcA_alpha = _mm256_and_si256(srcA_vec, _mm256_set1_epi32(0xFF000000));
+			v256u32 srcB_alpha = _mm256_and_si256(srcB_vec, _mm256_set1_epi32(0xFF000000));
+			v256u32 srcA_masked = _mm256_andnot_si256(_mm256_cmpeq_epi32(srcA_alpha, _mm256_setzero_si256()), srcA_vec);
+			v256u32 srcB_masked = _mm256_andnot_si256(_mm256_cmpeq_epi32(srcB_alpha, _mm256_setzero_si256()), srcB_vec);
+
+			v256u16 outColorLo;
+			v256u16 outColorHi;
+
+			const v256u32 srcA_maskedPre = _mm256_permute4x64_epi64(srcA_masked, 0xD8);
+			const v256u32 srcB_maskedPre = _mm256_permute4x64_epi64(srcB_masked, 0xD8);
+
+			// Temporarily convert the color component values from 8-bit to 16-bit, and then
+			// do the blend calculation.
+			outColorLo = _mm256_unpacklo_epi8(srcA_maskedPre, srcB_maskedPre);
+			outColorHi = _mm256_unpackhi_epi8(srcA_maskedPre, srcB_maskedPre);
+
+			outColorLo = _mm256_maddubs_epi16(outColorLo, blendAB);
+			outColorHi = _mm256_maddubs_epi16(outColorHi, blendAB);
+
+			outColorLo = _mm256_srli_epi16(outColorLo, 4);
+			outColorHi = _mm256_srli_epi16(outColorHi, 4);
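
A note on the pack-and-permute pair used below: AVX2 packs and unpacks operate within each 128-bit lane, so packing two vectors yields their 64-bit quarters in the order [A.lo, B.lo, A.hi, B.hi]. Permuting those quarters with the immediate 0xD8 (binary 11 01 10 00, i.e. order 0,2,1,3) restores linear element order. A minimal sketch with an illustrative helper name:

    #include <immintrin.h>

    // Pack two vectors of 16-bit values down to bytes, then reorder the
    // 64-bit quarters so the elements come out in linear order.
    static inline __m256i packAndReorder(__m256i lo16, __m256i hi16)
    {
        const __m256i packed = _mm256_packus_epi16(lo16, hi16); // per-lane saturated pack
        return _mm256_permute4x64_epi64(packed, 0xD8);          // quarters 0,2,1,3
    }
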
+
+			// Convert the color components back from 16-bit to 8-bit using a saturated pack.
+			dstColor = _mm256_packus_epi16(outColorLo, outColorHi);
+			dstColor = _mm256_permute4x64_epi64(dstColor, 0xD8);
+
+			// Add the alpha components back in.
+			dstColor = _mm256_and_si256(dstColor, _mm256_set1_epi32(0x00FFFFFF));
+			dstColor = _mm256_or_si256(dstColor, srcA_alpha);
+			dstColor = _mm256_or_si256(dstColor, srcB_alpha);
+		}
+		else
+		{
+			v256u16 srcA_alpha = _mm256_and_si256(srcA_vec, _mm256_set1_epi16(0x8000));
+			v256u16 srcB_alpha = _mm256_and_si256(srcB_vec, _mm256_set1_epi16(0x8000));
+			v256u16 srcA_masked = _mm256_andnot_si256( _mm256_cmpeq_epi16(srcA_alpha, _mm256_setzero_si256()), srcA_vec );
+			v256u16 srcB_masked = _mm256_andnot_si256( _mm256_cmpeq_epi16(srcB_alpha, _mm256_setzero_si256()), srcB_vec );
+			v256u16 colorBitMask = _mm256_set1_epi16(0x001F);
+
+			v256u16 ra;
+			v256u16 ga;
+			v256u16 ba;
+
+			ra = _mm256_or_si256( _mm256_and_si256(                  srcA_masked,      colorBitMask), _mm256_and_si256(_mm256_slli_epi16(srcB_masked, 8), _mm256_set1_epi16(0x1F00)) );
+			ga = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcA_masked,  5), colorBitMask), _mm256_and_si256(_mm256_slli_epi16(srcB_masked, 3), _mm256_set1_epi16(0x1F00)) );
+			ba = _mm256_or_si256( _mm256_and_si256(_mm256_srli_epi16(srcA_masked, 10), colorBitMask), _mm256_and_si256(_mm256_srli_epi16(srcB_masked, 2), _mm256_set1_epi16(0x1F00)) );
+
+			ra = _mm256_maddubs_epi16(ra, blendAB);
+			ga = _mm256_maddubs_epi16(ga, blendAB);
+			ba = _mm256_maddubs_epi16(ba, blendAB);
+
+			ra = _mm256_srli_epi16(ra, 4);
+			ga = _mm256_srli_epi16(ga, 4);
+			ba = _mm256_srli_epi16(ba, 4);
+
+			ra = _mm256_min_epi16(ra, colorBitMask);
+			ga = _mm256_min_epi16(ga, colorBitMask);
+			ba = _mm256_min_epi16(ba, colorBitMask);
+
+			dstColor = _mm256_or_si256( _mm256_or_si256(_mm256_or_si256(ra, _mm256_slli_epi16(ga, 5)), _mm256_slli_epi16(ba, 10)), _mm256_or_si256(srcA_alpha, srcB_alpha) );
+		}
+
+		_mm256_store_si256((__m256i *)dst + i, dstColor);
+	}
+
+	return (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? i * sizeof(v256u32) / sizeof(u32) : i * sizeof(v256u16) / sizeof(u16);
+}
+
+#endif // ENABLE_AVX2
diff --git a/desmume/src/GPU_Operations_AVX2.h b/desmume/src/GPU_Operations_AVX2.h
new file mode 100644
index 000000000..b1873c576
--- /dev/null
+++ b/desmume/src/GPU_Operations_AVX2.h
@@ -0,0 +1,122 @@
+/*
+	Copyright (C) 2021 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef GPU_OPERATIONS_AVX2_H
+#define GPU_OPERATIONS_AVX2_H
+
+#include "GPU_Operations.h"
+
+#ifndef ENABLE_AVX2
+	#warning This header requires AVX2 support.
+#else
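
The ColorOperation class declared below vectorizes a simple per-channel formula: with 5-bit channels and blend factors EVA/EVB in 0..16, each output channel is (a*EVA + b*EVB) >> 4, clamped to 31 (this is the same math the SSE2 blend() further down spells out). A scalar reference sketch, with an illustrative helper name:

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of the vectorized RGB555 blend: per 5-bit channel,
    // (a*eva + b*evb) >> 4, saturated to 31.
    static inline uint16_t blend555(uint16_t colA, uint16_t colB, int eva, int evb)
    {
        uint16_t out = 0;
        for (int shift = 0; shift <= 10; shift += 5)
        {
            const int a = (colA >> shift) & 0x1F;
            const int b = (colB >> shift) & 0x1F;
            out |= static_cast<uint16_t>(std::min(31, (a * eva + b * evb) >> 4) << shift);
        }
        return out;
    }
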
+
+class ColorOperation_AVX2
+{
+public:
+	ColorOperation_AVX2() {};
+
+	FORCEINLINE v256u16 blend(const v256u16 &colA, const v256u16 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const;
+	template <NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE v256u32 blend(const v256u32 &colA, const v256u32 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const;
+
+	FORCEINLINE v256u16 blend3D(const v256u32 &colA_Lo, const v256u32 &colA_Hi, const v256u16 &colB) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 blend3D(const v256u32 &colA, const v256u32 &colB) const;
+
+	FORCEINLINE v256u16 increase(const v256u16 &col, const v256u16 &blendEVY) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 increase(const v256u32 &col, const v256u16 &blendEVY) const;
+
+	FORCEINLINE v256u16 decrease(const v256u16 &col, const v256u16 &blendEVY) const;
+	template <NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 decrease(const v256u32 &col, const v256u16 &blendEVY) const;
+};
+
+class PixelOperation_AVX2
+{
+protected:
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
+	                                      const v256u8 &passMask8,
+	                                      const v256u16 &evy16,
+	                                      const v256u8 &srcLayerID,
+	                                      const v256u16 &src1, const v256u16 &src0,
+	                                      const v256u8 &srcEffectEnableMask,
+	                                      const v256u8 &dstBlendEnableMaskLUT,
+	                                      const v256u8 &enableColorEffectMask,
+	                                      const v256u8 &spriteAlpha,
+	                                      const v256u8 &spriteMode) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
+	                                      const v256u8 &passMask8,
+	                                      const v256u16 &evy16,
+	                                      const v256u8 &srcLayerID,
+	                                      const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0,
+	                                      const v256u8 &srcEffectEnableMask,
+	                                      const v256u8 &dstBlendEnableMaskLUT,
+	                                      const v256u8 &enableColorEffectMask,
+	                                      const v256u8 &spriteAlpha,
+	                                      const v256u8 &spriteMode) const;
+
+public:
+	PixelOperation_AVX2() {};
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v256u8 &passMask8,
+	                             const v256u16 &evy16,
+	                             const v256u8 &srcLayerID,
+	                             const v256u16 &src1, const v256u16 &src0,
+	                             const v256u8 &srcEffectEnableMask,
+	                             const v256u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v256u8 &passMask8,
+	                             const v256u16 &evy16,
+	                             const v256u8 &srcLayerID,
+	                             const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0,
+	                             const v256u8 &srcEffectEnableMask,
+	                             const v256u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+};
+
+#endif // ENABLE_AVX2
+
+#endif // GPU_OPERATIONS_AVX2_H
diff --git a/desmume/src/GPU_Operations_SSE2.cpp b/desmume/src/GPU_Operations_SSE2.cpp
new file mode 100644
index 000000000..9f452ce4d
--- /dev/null
+++ b/desmume/src/GPU_Operations_SSE2.cpp
@@ -0,0 +1,2896 @@
+/*
+	Copyright (C) 2021 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef ENABLE_SSE2
+	#error This code requires SSE2 support.
+	#warning This error might occur if this file is compiled directly. Do not compile this file directly, as it is already included in GPU_Operations.cpp.
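
The #error/#warning pair above encodes a unity-build convention: the SIMD .cpp files are textual includes of a single parent translation unit, never standalone compile targets. A hypothetical sketch of that parent layout (the AVX2 branch is assumed by analogy; the warning itself only names GPU_Operations.cpp as the parent):

    // GPU_Operations.cpp (parent translation unit, sketch)
    #if defined(ENABLE_AVX2)
        #include "GPU_Operations_AVX2.cpp"
    #elif defined(ENABLE_SSE2)
        #include "GPU_Operations_SSE2.cpp"
    #endif
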
+#else + +#include "GPU_Operations_SSE2.h" +#include + + +static const ColorOperation_SSE2 colorop_vec; +static const PixelOperation_SSE2 pixelop_vec; + +template +static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount) +{ + if (INTEGERSCALEHINT == 0) + { + memcpy(dst, src, dstWidth * ELEMENTSIZE); + } + else if (INTEGERSCALEHINT == 1) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) ); + } + else if (INTEGERSCALEHINT == 2) + { + __m128i srcVec; + __m128i srcPixOut[2]; + + switch (ELEMENTSIZE) + { + case 1: + { + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ + srcVec = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ + srcPixOut[0] = _mm_unpacklo_epi8(srcVec, srcVec); \ + srcPixOut[1] = _mm_unpackhi_epi8(srcVec, srcVec); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ + srcVec = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ + srcPixOut[0] = _mm_unpacklo_epi8(srcVec, srcVec); \ + srcPixOut[1] = _mm_unpackhi_epi8(srcVec, srcVec); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + 1, srcPixOut[1]); \ + ); + } + break; + } + + case 2: + { + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ + srcVec = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ + srcPixOut[0] = _mm_unpacklo_epi16(srcVec, srcVec); \ + srcPixOut[1] = _mm_unpackhi_epi16(srcVec, srcVec); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 0) + 1, srcPixOut[1]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(__m128i) / ELEMENTSIZE)) * 1) + 1, srcPixOut[1]); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), \ + srcVec = _mm_load_si128((__m128i *)((__m128i *)src + (X))); \ + srcPixOut[0] = _mm_unpacklo_epi16(srcVec, srcVec); \ + srcPixOut[1] = _mm_unpackhi_epi16(srcVec, srcVec); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + 0, srcPixOut[0]); \ + _mm_store_si128((__m128i *)dst + ((X) * 2) + 1, srcPixOut[1]); \ + ); + } + break; + } + + case 4: + { + // If we're also doing vertical expansion, then the total number of instructions for a fully + // unrolled loop is 448 instructions. Therefore, let's not unroll the loop in this case in + // order to avoid overusing the CPU's instruction cache. 
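
The 2x expansion loop that follows leans on two `_mm_shuffle_epi32` immediates; decoding them makes the access pattern obvious. A minimal sketch (the helper name is illustrative):

    #include <emmintrin.h>

    // Widen four 32-bit pixels [p0 p1 p2 p3] to 2x:
    // 0x50 = binary 01 01 00 00 -> picks elements 0,0,1,1 -> [p0 p0 p1 p1]
    // 0xFA = binary 11 11 10 10 -> picks elements 2,2,3,3 -> [p2 p2 p3 p3]
    static inline void expand2x_u32(__m128i src, __m128i &outLo, __m128i &outHi)
    {
        outLo = _mm_shuffle_epi32(src, 0x50);
        outHi = _mm_shuffle_epi32(src, 0xFA);
    }
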
+ for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + srcVec = _mm_load_si128((__m128i *)src + srcX); + srcPixOut[0] = _mm_shuffle_epi32(srcVec, 0x50); + srcPixOut[1] = _mm_shuffle_epi32(srcVec, 0xFA); + + _mm_store_si128((__m128i *)dst + dstX + 0, srcPixOut[0]); + _mm_store_si128((__m128i *)dst + dstX + 1, srcPixOut[1]); + + if (SCALEVERTICAL) + { + _mm_store_si128((__m128i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE) * INTEGERSCALEHINT) * 1) + 0, srcPixOut[0]); + _mm_store_si128((__m128i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE) * INTEGERSCALEHINT) * 1) + 1, srcPixOut[1]); + } + } + break; + } + } + } + else if (INTEGERSCALEHINT == 3) + { + __m128i srcPixOut[3]; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const __m128i srcVec = _mm_load_si128((__m128i *)src + srcX); + + if (ELEMENTSIZE == 1) + { +#ifdef ENABLE_SSSE3 + srcPixOut[0] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0)); + srcPixOut[1] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(10,10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5)); + srcPixOut[2] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(15,15,15,14,14,14,13,13,13,12,12,12,11,11,11,10)); +#else + __m128i src8As32[4]; + src8As32[0] = _mm_unpacklo_epi8(srcVec, srcVec); + src8As32[1] = _mm_unpackhi_epi8(srcVec, srcVec); + src8As32[2] = _mm_unpacklo_epi8(src8As32[1], src8As32[1]); + src8As32[3] = _mm_unpackhi_epi8(src8As32[1], src8As32[1]); + src8As32[1] = _mm_unpackhi_epi8(src8As32[0], src8As32[0]); + src8As32[0] = _mm_unpacklo_epi8(src8As32[0], src8As32[0]); + + src8As32[0] = _mm_and_si128(src8As32[0], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); + src8As32[1] = _mm_and_si128(src8As32[1], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); + src8As32[2] = _mm_and_si128(src8As32[2], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); + src8As32[3] = _mm_and_si128(src8As32[3], _mm_set_epi32(0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF)); + + __m128i srcWorking[4]; + + srcWorking[0] = _mm_shuffle_epi32(src8As32[0], 0x40); + srcWorking[1] = _mm_shuffle_epi32(src8As32[0], 0xA5); + srcWorking[2] = _mm_shuffle_epi32(src8As32[0], 0xFE); + srcWorking[3] = _mm_shuffle_epi32(src8As32[1], 0x40); + srcPixOut[0] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); + + srcWorking[0] = _mm_shuffle_epi32(src8As32[1], 0xA5); + srcWorking[1] = _mm_shuffle_epi32(src8As32[1], 0xFE); + srcWorking[2] = _mm_shuffle_epi32(src8As32[2], 0x40); + srcWorking[3] = _mm_shuffle_epi32(src8As32[2], 0xA5); + srcPixOut[1] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); + + srcWorking[0] = _mm_shuffle_epi32(src8As32[2], 0xFE); + srcWorking[1] = _mm_shuffle_epi32(src8As32[3], 0x40); + srcWorking[2] = _mm_shuffle_epi32(src8As32[3], 0xA5); + srcWorking[3] = _mm_shuffle_epi32(src8As32[3], 0xFE); + srcPixOut[2] = _mm_packus_epi16( _mm_packus_epi16(srcWorking[0], srcWorking[1]), _mm_packus_epi16(srcWorking[2], srcWorking[3]) ); +#endif + } + else if (ELEMENTSIZE == 2) + { +#ifdef ENABLE_SSSE3 + srcPixOut[0] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0)); + srcPixOut[1] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(11,10, 
9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4)); + srcPixOut[2] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(15,14,15,14,15,14,13,12,13,12,13,12,11,10,11,10)); +#else + const __m128i src16lo = _mm_shuffle_epi32(srcVec, 0x44); + const __m128i src16hi = _mm_shuffle_epi32(srcVec, 0xEE); + + srcPixOut[0] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16lo, 0x40), 0xA5); + srcPixOut[1] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(srcVec, 0xFE), 0x40); + srcPixOut[2] = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16hi, 0xA5), 0xFE); +#endif + } + else if (ELEMENTSIZE == 4) + { + srcPixOut[0] = _mm_shuffle_epi32(srcVec, 0x40); + srcPixOut[1] = _mm_shuffle_epi32(srcVec, 0xA5); + srcPixOut[2] = _mm_shuffle_epi32(srcVec, 0xFE); + } + + for (size_t lx = 0; lx < (size_t)INTEGERSCALEHINT; lx++) + { + _mm_store_si128((__m128i *)dst + dstX + lx, srcPixOut[lx]); + } + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + for (size_t lx = 0; lx < (size_t)INTEGERSCALEHINT; lx++) + { + _mm_store_si128((__m128i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]); + } + } + } + } + } + else if (INTEGERSCALEHINT == 4) + { + __m128i srcPixOut[4]; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const __m128i srcVec = _mm_load_si128((__m128i *)src + srcX); + + if (ELEMENTSIZE == 1) + { +#ifdef ENABLE_SSSE3 + srcPixOut[0] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0)); + srcPixOut[1] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4)); + srcPixOut[2] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(11,11,11,11,10,10,10,10, 9, 9, 9, 9, 8, 8, 8, 8)); + srcPixOut[3] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(15,15,15,15,14,14,14,14,13,13,13,13,12,12,12,12)); +#else + const __m128i src8_lo = _mm_unpacklo_epi8(srcVec, srcVec); + const __m128i src8_hi = _mm_unpackhi_epi8(srcVec, srcVec); + + srcPixOut[0] = _mm_unpacklo_epi8(src8_lo, src8_lo); + srcPixOut[1] = _mm_unpackhi_epi8(src8_lo, src8_lo); + srcPixOut[2] = _mm_unpacklo_epi8(src8_hi, src8_hi); + srcPixOut[3] = _mm_unpackhi_epi8(src8_hi, src8_hi); +#endif + } + else if (ELEMENTSIZE == 2) + { +#ifdef ENABLE_SSSE3 + srcPixOut[0] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0)); + srcPixOut[1] = _mm_shuffle_epi8(srcVec, _mm_set_epi8( 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4)); + srcPixOut[2] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(11,10,11,10,11,10,11,10, 9, 8, 9, 8, 9, 8, 9, 8)); + srcPixOut[3] = _mm_shuffle_epi8(srcVec, _mm_set_epi8(15,14,15,14,15,14,15,14,13,12,13,12,13,12,13,12)); +#else + const __m128i src16_lo = _mm_unpacklo_epi16(srcVec, srcVec); + const __m128i src16_hi = _mm_unpackhi_epi16(srcVec, srcVec); + + srcPixOut[0] = _mm_unpacklo_epi16(src16_lo, src16_lo); + srcPixOut[1] = _mm_unpackhi_epi16(src16_lo, src16_lo); + srcPixOut[2] = _mm_unpacklo_epi16(src16_hi, src16_hi); + srcPixOut[3] = _mm_unpackhi_epi16(src16_hi, src16_hi); +#endif + } + else if (ELEMENTSIZE == 4) + { + srcPixOut[0] = _mm_shuffle_epi32(srcVec, 0x00); + srcPixOut[1] = _mm_shuffle_epi32(srcVec, 0x55); + srcPixOut[2] = _mm_shuffle_epi32(srcVec, 0xAA); + srcPixOut[3] = _mm_shuffle_epi32(srcVec, 0xFF); + } + + for (size_t lx = 0; lx < (size_t)INTEGERSCALEHINT; lx++) + { + _mm_store_si128((__m128i *)dst + dstX + lx, srcPixOut[lx]); + } + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < 
(size_t)INTEGERSCALEHINT; ly++) + { + for (size_t lx = 0; lx < (size_t)INTEGERSCALEHINT; lx++) + { + _mm_store_si128((__m128i *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + lx, srcPixOut[lx]); + } + } + } + } + } +#ifdef ENABLE_SSSE3 + else if (INTEGERSCALEHINT > 1) + { + const size_t scale = dstWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE); srcX++, dstX+=scale) + { + const __m128i srcVec = _mm_load_si128((__m128i *)src + srcX); + v128u8 ssse3idx; + + for (size_t lx = 0; lx < scale; lx++) + { + if (ELEMENTSIZE == 1) + { + ssse3idx = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u8_16e + (lx * sizeof(v128u8)))); + } + else if (ELEMENTSIZE == 2) + { + ssse3idx = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u16_8e + (lx * sizeof(v128u8)))); + } + else if (ELEMENTSIZE == 4) + { + ssse3idx = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u32_4e + (lx * sizeof(v128u8)))); + } + + _mm_store_si128( (__m128i *)dst + dstX + lx, _mm_shuffle_epi8(srcVec, ssse3idx) ); + } + } + + if (SCALEVERTICAL) + { + CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); + } + } +#endif + else + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x]; + } + } + } + + if (SCALEVERTICAL) + { + CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); + } + } +} + +template +static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src, size_t srcWidth) +{ + if (INTEGERSCALEHINT == 0) + { + memcpy(dst, src, srcWidth * ELEMENTSIZE); + } + else if (INTEGERSCALEHINT == 1) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) ); + } + else if (INTEGERSCALEHINT == 2) + { + __m128i srcPix[2]; + + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) + { + srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 2) + 0); + srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 2) + 1); + + if (ELEMENTSIZE == 1) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x00FF00FF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x00FF00FF)); + + _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi16(srcPix[0], srcPix[1])); + } + else if (ELEMENTSIZE == 2) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x0000FFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x0000FFFF)); + +#if defined(ENABLE_SSE4_1) + _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi32(srcPix[0], srcPix[1])); +#elif defined(ENABLE_SSSE3) + srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(15,14,11,10, 7, 6, 3, 2,13,12, 9, 8, 5, 4, 1, 0)); + srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(13,12, 9, 8, 5, 4, 1, 0,15,14,11,10, 7, 6, 3, 2)); + + _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); +#else + srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0xD8); + srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0xD8); + srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); + + srcPix[1] = _mm_shufflelo_epi16(srcPix[1], 0xD8); + srcPix[1] = 
_mm_shufflehi_epi16(srcPix[1], 0xD8); + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); + + _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); +#endif + } + else if (ELEMENTSIZE == 4) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF)); + + srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); + + _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); + } + } + } + else if (INTEGERSCALEHINT == 3) + { +#ifdef ENABLE_SSSE3 + static const u8 X = 0x80; +#endif + __m128i srcPix[3]; + + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) + { + srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 3) + 0); + srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 3) + 1); + srcPix[2] = _mm_load_si128((__m128i *)src + (dstX * 3) + 2); + + if (ELEMENTSIZE == 1) + { +#ifdef ENABLE_SSSE3 + srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8( X, X, X, X, X, X, X, X, X, X,15,12, 9, 6, 3, 0)); + srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8( X, X, X, X, X,14,11, 8, 5, 2, X, X, X, X, X, X)); + srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(13,10, 7, 4, 1, X, X, X, X, X, X, X, X, X, X, X)); + + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); +#else + __m128i srcWorking[3]; + + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFF0000FF, 0x0000FF00, 0x00FF0000, 0xFF0000FF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00FF0000, 0xFF0000FF, 0x0000FF00, 0x00FF0000)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x0000FF00, 0x00FF0000, 0xFF0000FF, 0x0000FF00)); + + srcWorking[0] = _mm_unpacklo_epi8(srcPix[0], _mm_setzero_si128()); + srcWorking[1] = _mm_unpackhi_epi8(srcPix[0], _mm_setzero_si128()); + srcWorking[2] = _mm_unpacklo_epi8(srcPix[1], _mm_setzero_si128()); + srcPix[0] = _mm_or_si128(srcWorking[0], srcWorking[1]); + srcPix[0] = _mm_or_si128(srcPix[0], srcWorking[2]); + + srcWorking[0] = _mm_unpackhi_epi8(srcPix[1], _mm_setzero_si128()); + srcWorking[1] = _mm_unpacklo_epi8(srcPix[2], _mm_setzero_si128()); + srcWorking[2] = _mm_unpackhi_epi8(srcPix[2], _mm_setzero_si128()); + srcPix[1] = _mm_or_si128(srcWorking[0], srcWorking[1]); + srcPix[1] = _mm_or_si128(srcPix[1], srcWorking[2]); + + srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0x6C); + srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0x6C); + srcPix[1] = _mm_shufflelo_epi16(srcPix[1], 0x6C); + srcPix[1] = _mm_shufflehi_epi16(srcPix[1], 0x6C); + + srcPix[0] = _mm_packus_epi16(srcPix[0], srcPix[1]); + srcPix[1] = _mm_shuffle_epi32(srcPix[0], 0xB1); + + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000)); + + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); +#endif + _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); + } + else if (ELEMENTSIZE == 2) + { +#ifdef ENABLE_SSSE3 + srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8( X, X, X, X, X, X, X, X, X, X,13,12, 7, 6, 1, 0)); + srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8( X, X, X, X,15,14, 9, 8, 3, 2, X, X, X, X, X, X)); + srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(11,10, 5, 4, X, X, X, X, X, X, X, X, X, X, X, X)); +#else + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x0000FFFF, 0x00000000, 
0xFFFF0000, 0x0000FFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0xFFFF0000, 0x0000FFFF, 0x00000000, 0xFFFF0000)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0xFFFF0000, 0x0000FFFF, 0x00000000)); + + srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0x9C); + srcPix[1] = _mm_shufflehi_epi16(srcPix[1], 0x9C); + srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0x9C); + + srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0x9C); + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0xE1); + srcPix[2] = _mm_shufflehi_epi16(srcPix[2], 0xC9); +#endif + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); + + _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); + } + else if (ELEMENTSIZE == 4) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000)); + + srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0x9C); + srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0x78); + + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[2]); + + _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); + } + } + } + else if (INTEGERSCALEHINT == 4) + { + __m128i srcPix[4]; + + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE)); dstX++) + { + srcPix[0] = _mm_load_si128((__m128i *)src + (dstX * 4) + 0); + srcPix[1] = _mm_load_si128((__m128i *)src + (dstX * 4) + 1); + srcPix[2] = _mm_load_si128((__m128i *)src + (dstX * 4) + 2); + srcPix[3] = _mm_load_si128((__m128i *)src + (dstX * 4) + 3); + + if (ELEMENTSIZE == 1) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set1_epi32(0x000000FF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set1_epi32(0x000000FF)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set1_epi32(0x000000FF)); + srcPix[3] = _mm_and_si128(srcPix[3], _mm_set1_epi32(0x000000FF)); + + srcPix[0] = _mm_packus_epi16(srcPix[0], srcPix[1]); + srcPix[1] = _mm_packus_epi16(srcPix[2], srcPix[3]); + + _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi16(srcPix[0], srcPix[1])); + } + else if (ELEMENTSIZE == 2) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); + srcPix[3] = _mm_and_si128(srcPix[3], _mm_set_epi32(0x00000000, 0x0000FFFF, 0x00000000, 0x0000FFFF)); + +#if defined(ENABLE_SSE4_1) + srcPix[0] = _mm_packus_epi32(srcPix[0], srcPix[1]); + srcPix[1] = _mm_packus_epi32(srcPix[2], srcPix[3]); + + _mm_store_si128((__m128i *)dst + dstX, _mm_packus_epi32(srcPix[0], srcPix[1])); +#elif defined(ENABLE_SSSE3) + srcPix[0] = _mm_shuffle_epi8(srcPix[0], _mm_set_epi8(15,14,13,12,11,10, 7, 6, 5, 4, 3, 2, 9, 8, 1, 0)); + srcPix[1] = _mm_shuffle_epi8(srcPix[1], _mm_set_epi8(13,12,13,12,11,10, 7, 6, 9, 8, 1, 0, 5, 4, 3, 2)); + srcPix[2] = _mm_shuffle_epi8(srcPix[2], _mm_set_epi8(13,12,13,12, 9, 8, 1, 0,11,10, 7, 6, 5, 4, 3, 2)); + srcPix[3] = _mm_shuffle_epi8(srcPix[3], _mm_set_epi8( 9, 8, 1, 0,15,14,13,12,11,10, 7, 6, 5, 4, 3, 2)); + + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); + srcPix[1] = _mm_or_si128(srcPix[2], srcPix[3]); + + _mm_store_si128((__m128i *)dst + dstX, _mm_or_si128(srcPix[0], srcPix[1])); +#else + 
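
The plain-SSE2 fallback that follows has to gather the low 16 bits of each 32-bit lane without SSE4.1's `_mm_packus_epi32`. One equivalent form of the same shuffle dance, isolated for readability (a sketch; the helper name is illustrative, and this variant sidesteps the pre-masking by using a 64-bit unpack instead of an OR):

    #include <emmintrin.h>

    // Gather the low u16 of each u32 lane of a and b into one vector.
    static inline __m128i packLoWords_sse2(__m128i a, __m128i b)
    {
        a = _mm_shufflelo_epi16(a, 0xD8);  // reorder words 0,2,1,3 in the low half
        a = _mm_shufflehi_epi16(a, 0xD8);  // same reorder in the high half
        a = _mm_shuffle_epi32(a, 0xD8);    // kept words now occupy the low 64 bits
        b = _mm_shufflelo_epi16(b, 0xD8);
        b = _mm_shufflehi_epi16(b, 0xD8);
        b = _mm_shuffle_epi32(b, 0xD8);
        return _mm_unpacklo_epi64(a, b);   // low halves of a and b side by side
    }
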
srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0xD8); + srcPix[2] = _mm_shuffle_epi32(srcPix[2], 0xD8); + srcPix[3] = _mm_shuffle_epi32(srcPix[3], 0xD8); + + srcPix[0] = _mm_unpacklo_epi32(srcPix[0], srcPix[1]); + srcPix[1] = _mm_unpacklo_epi32(srcPix[2], srcPix[3]); + + srcPix[0] = _mm_shuffle_epi32(srcPix[0], 0xD8); + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x8D); + + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); + srcPix[0] = _mm_shufflelo_epi16(srcPix[0], 0xD8); + srcPix[0] = _mm_shufflehi_epi16(srcPix[0], 0xD8); + + _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); +#endif + } + else if (ELEMENTSIZE == 4) + { + srcPix[0] = _mm_and_si128(srcPix[0], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); + srcPix[1] = _mm_and_si128(srcPix[1], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); + srcPix[2] = _mm_and_si128(srcPix[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); + srcPix[3] = _mm_and_si128(srcPix[3], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)); + + srcPix[0] = _mm_unpacklo_epi32(srcPix[0], srcPix[1]); + srcPix[1] = _mm_unpacklo_epi32(srcPix[2], srcPix[3]); +#ifdef HOST_64 + srcPix[0] = _mm_unpacklo_epi64(srcPix[0], srcPix[1]); +#else + srcPix[1] = _mm_shuffle_epi32(srcPix[1], 0x4E); + srcPix[0] = _mm_or_si128(srcPix[0], srcPix[1]); +#endif + _mm_store_si128((__m128i *)dst + dstX, srcPix[0]); + } + } + } + else if (INTEGERSCALEHINT > 1) + { + const size_t scale = srcWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; + + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + if (ELEMENTSIZE == 1) + { + ((u8 *)dst)[x] = ((u8 *)src)[x * scale]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[x] = ((u16 *)src)[x * scale]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[x] = ((u32 *)src)[x * scale]; + } + } + } + else + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[i] = ( (u8 *)src)[_gpuDstPitchIndex[i]]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[i] = ((u16 *)src)[_gpuDstPitchIndex[i]]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[i] = ((u32 *)src)[_gpuDstPitchIndex[i]]; + } + } + } +} + +FORCEINLINE v128u16 ColorOperation_SSE2::blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const +{ + v128u16 ra; + v128u16 ga; + v128u16 ba; + v128u16 colorBitMask = _mm_set1_epi16(0x001F); + +#ifdef ENABLE_SSSE3 + ra = _mm_or_si128( _mm_and_si128( colA, colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) ); + ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) ); + ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) ); + + const v128u16 blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8)); + ra = _mm_maddubs_epi16(ra, blendAB); + ga = _mm_maddubs_epi16(ga, blendAB); + ba = _mm_maddubs_epi16(ba, blendAB); +#else + ra = _mm_and_si128( colA, colorBitMask); + ga = _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask); + ba = _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask); + + v128u16 rb = _mm_and_si128( colB, colorBitMask); + v128u16 gb = _mm_and_si128(_mm_srli_epi16(colB, 5), colorBitMask); + v128u16 bb = _mm_and_si128(_mm_srli_epi16(colB, 10), colorBitMask); + + ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) ); + ga = 
_mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
+	ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
+#endif
+
+	ra = _mm_srli_epi16(ra, 4);
+	ga = _mm_srli_epi16(ga, 4);
+	ba = _mm_srli_epi16(ba, 4);
+
+	ra = _mm_min_epi16(ra, colorBitMask);
+	ga = _mm_min_epi16(ga, colorBitMask);
+	ba = _mm_min_epi16(ba, colorBitMask);
+
+	return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) );
+}
+
+// Note that if USECONSTANTBLENDVALUESHINT is true, then this method will assume that blendEVA contains identical values
+// for each 16-bit vector element, and also that blendEVB contains identical values for each 16-bit vector element. If
+// this assumption is broken, then the resulting color will be undefined.
+template <NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT>
+FORCEINLINE v128u32 ColorOperation_SSE2::blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const
+{
+	v128u16 outColorLo;
+	v128u16 outColorHi;
+	v128u32 outColor;
+
+#ifdef ENABLE_SSSE3
+	const v128u16 blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8));
+
+	outColorLo = _mm_unpacklo_epi8(colA, colB);
+	outColorHi = _mm_unpackhi_epi8(colA, colB);
+
+	if (USECONSTANTBLENDVALUESHINT)
+	{
+		outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
+		outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
+	}
+	else
+	{
+		const v128u16 blendABLo = _mm_unpacklo_epi16(blendAB, blendAB);
+		const v128u16 blendABHi = _mm_unpackhi_epi16(blendAB, blendAB);
+		outColorLo = _mm_maddubs_epi16(outColorLo, blendABLo);
+		outColorHi = _mm_maddubs_epi16(outColorHi, blendABHi);
+	}
+#else
+	const v128u16 colALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128());
+	const v128u16 colAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128());
+	const v128u16 colBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
+	const v128u16 colBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
+
+	if (USECONSTANTBLENDVALUESHINT)
+	{
+		outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendEVA), _mm_mullo_epi16(colBLo, blendEVB) );
+		outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendEVA), _mm_mullo_epi16(colBHi, blendEVB) );
+	}
+	else
+	{
+		const v128u16 blendALo = _mm_unpacklo_epi16(blendEVA, blendEVA);
+		const v128u16 blendAHi = _mm_unpackhi_epi16(blendEVA, blendEVA);
+		const v128u16 blendBLo = _mm_unpacklo_epi16(blendEVB, blendEVB);
+		const v128u16 blendBHi = _mm_unpackhi_epi16(blendEVB, blendEVB);
+
+		outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendALo), _mm_mullo_epi16(colBLo, blendBLo) );
+		outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendAHi), _mm_mullo_epi16(colBHi, blendBHi) );
+	}
+#endif
+
+	outColorLo = _mm_srli_epi16(outColorLo, 4);
+	outColorHi = _mm_srli_epi16(outColorHi, 4);
+	outColor = _mm_packus_epi16(outColorLo, outColorHi);
+
+	// When the color format is 888, the packuswb instruction will naturally clamp
+	// the color component values to 255. However, when the color format is 666, the
+	// color component values must be clamped to 63. In this case, we must call pminub
+	// to do the clamp.
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
+	}
+
+	outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
+
+	return outColor;
+}
+
+FORCEINLINE v128u16 ColorOperation_SSE2::blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const
+{
+	// If the color format of B is 555, then the colA_Hi parameter is required.
+	// The color format of A is assumed to be RGB666.
+	v128u32 ra_lo = _mm_and_si128(                  colA_Lo,      _mm_set1_epi32(0x000000FF) );
+	v128u32 ga_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo,  8), _mm_set1_epi32(0x000000FF) );
+	v128u32 ba_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 16), _mm_set1_epi32(0x000000FF) );
+	v128u32 aa_lo =                _mm_srli_epi32(colA_Lo, 24);
+
+	v128u32 ra_hi = _mm_and_si128(                  colA_Hi,      _mm_set1_epi32(0x000000FF) );
+	v128u32 ga_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi,  8), _mm_set1_epi32(0x000000FF) );
+	v128u32 ba_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 16), _mm_set1_epi32(0x000000FF) );
+	v128u32 aa_hi =                _mm_srli_epi32(colA_Hi, 24);
+
+	v128u16 ra = _mm_packs_epi32(ra_lo, ra_hi);
+	v128u16 ga = _mm_packs_epi32(ga_lo, ga_hi);
+	v128u16 ba = _mm_packs_epi32(ba_lo, ba_hi);
+	v128u16 aa = _mm_packs_epi32(aa_lo, aa_hi);
+
+#ifdef ENABLE_SSSE3
+	ra = _mm_or_si128( ra, _mm_and_si128(_mm_slli_epi16(colB, 9), _mm_set1_epi16(0x3E00)) );
+	ga = _mm_or_si128( ga, _mm_and_si128(_mm_slli_epi16(colB, 4), _mm_set1_epi16(0x3E00)) );
+	ba = _mm_or_si128( ba, _mm_and_si128(_mm_srli_epi16(colB, 1), _mm_set1_epi16(0x3E00)) );
+
+	aa = _mm_adds_epu8(aa, _mm_set1_epi16(1));
+	aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(32), aa), 8) );
+
+	ra = _mm_maddubs_epi16(ra, aa);
+	ga = _mm_maddubs_epi16(ga, aa);
+	ba = _mm_maddubs_epi16(ba, aa);
+#else
+	aa = _mm_adds_epu16(aa, _mm_set1_epi16(1));
+	v128u16 rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) );
+	v128u16 gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) );
+	v128u16 bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) );
+	v128u16 ab = _mm_subs_epu16( _mm_set1_epi16(32), aa );
+
+	ra = _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) );
+	ga = _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) );
+	ba = _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) );
+#endif
+
+	ra = _mm_srli_epi16(ra, 6);
+	ga = _mm_srli_epi16(ga, 6);
+	ba = _mm_srli_epi16(ba, 6);
+
+	return _mm_or_si128( _mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10) );
+}
+
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE v128u32 ColorOperation_SSE2::blend3D(const v128u32 &colA, const v128u32 &colB) const
+{
+	// If the color format of B is 666 or 888, then no separate colA_Hi parameter is needed.
+	// The color format of A is assumed to match the color format of B.
+	v128u16 rgbALo;
+	v128u16 rgbAHi;
+
+#ifdef ENABLE_SSSE3
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		// Does not work for the RGBA8888 color format. The reason is because this
+		// algorithm depends on the pmaddubsw instruction, which multiplies an
+		// unsigned 8-bit integer by a signed 8-bit integer into an intermediate
+		// signed 16-bit integer. This means that we can overrun the signed 16-bit
+		// value range, which is limited to [-32768, 32767]. For example, a
+		// color component of value 255 multiplied by an alpha value of 255
+		// would equal 65025, which is greater than the upper range of a signed
+		// 16-bit value.
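
The range argument above can be restated as a compile-time check. In the RGB666 path the two blend weights sum to 32, so the worst pmaddubsw pair sum is bounded by 63 * 32 = 2016, safely inside a signed 16-bit lane; with 8-bit components, a single product already exceeds the maximum. Illustrative asserts, not part of the patch:

    // RGB666: components <= 63, weights alpha and (32 - alpha) sum to 32.
    static_assert(63 * 32 <= 32767, "RGB666: worst-case pmaddubsw pair sum fits in s16");
    // RGB888: one 255 * 255 product alone overflows a signed 16-bit lane.
    static_assert(255 * 255 > 32767, "RGB888: a single product already overflows s16");
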
+ rgbALo = _mm_unpacklo_epi8(colA, colB); + rgbAHi = _mm_unpackhi_epi8(colA, colB); + + v128u32 alpha = _mm_and_si128( _mm_srli_epi32(colA, 24), _mm_set1_epi32(0x0000001F) ); + alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) ); + alpha = _mm_adds_epu8(alpha, _mm_set1_epi8(1)); + + v128u32 invAlpha = _mm_subs_epu8(_mm_set1_epi8(32), alpha); + v128u16 alphaLo = _mm_unpacklo_epi8(alpha, invAlpha); + v128u16 alphaHi = _mm_unpackhi_epi8(alpha, invAlpha); + + rgbALo = _mm_maddubs_epi16(rgbALo, alphaLo); + rgbAHi = _mm_maddubs_epi16(rgbAHi, alphaHi); + } + else +#endif + { + rgbALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128()); + rgbAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128()); + v128u16 rgbBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128()); + v128u16 rgbBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128()); + + v128u32 alpha = _mm_and_si128( _mm_srli_epi32(colA, 24), _mm_set1_epi32(0x000000FF) ); + alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) ); + + v128u16 alphaLo = _mm_unpacklo_epi8(alpha, _mm_setzero_si128()); + v128u16 alphaHi = _mm_unpackhi_epi8(alpha, _mm_setzero_si128()); + alphaLo = _mm_add_epi16(alphaLo, _mm_set1_epi16(1)); + alphaHi = _mm_add_epi16(alphaHi, _mm_set1_epi16(1)); + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) ); + rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) ); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) ); + rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) ); + } + } + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + rgbALo = _mm_srli_epi16(rgbALo, 5); + rgbAHi = _mm_srli_epi16(rgbAHi, 5); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + rgbALo = _mm_srli_epi16(rgbALo, 8); + rgbAHi = _mm_srli_epi16(rgbAHi, 8); + } + + return _mm_and_si128( _mm_packus_epi16(rgbALo, rgbAHi), _mm_set1_epi32(0x00FFFFFF) ); +} + +FORCEINLINE v128u16 ColorOperation_SSE2::increase(const v128u16 &col, const v128u16 &blendEVY) const +{ + v128u16 r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); + v128u16 g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); + v128u16 b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); + + r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) ); + g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) ); + b_vec128 = _mm_add_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), b_vec128), blendEVY), 4) ); + + return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); +} + +template +FORCEINLINE v128u32 ColorOperation_SSE2::increase(const v128u32 &col, const v128u16 &blendEVY) const +{ + v128u16 rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128()); + v128u16 rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128()); + + rgbLo = _mm_add_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 
63 : 255), rgbLo), blendEVY), 4) ); + rgbHi = _mm_add_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbHi), blendEVY), 4) ); + + return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) ); +} + +FORCEINLINE v128u16 ColorOperation_SSE2::decrease(const v128u16 &col, const v128u16 &blendEVY) const +{ + v128u16 r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); + v128u16 g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); + v128u16 b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); + + r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) ); + g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) ); + b_vec128 = _mm_sub_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(b_vec128, blendEVY), 4) ); + + return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); +} + +template +FORCEINLINE v128u32 ColorOperation_SSE2::decrease(const v128u32 &col, const v128u16 &blendEVY) const +{ + v128u16 rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128()); + v128u16 rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128()); + + rgbLo = _mm_sub_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(rgbLo, blendEVY), 4) ); + rgbHi = _mm_sub_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(rgbHi, blendEVY), 4) ); + + return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) ); +} + +template +FORCEINLINE void PixelOperation_SSE2::_copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = _mm_set1_epi16(0x8000); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(src0, alphaBits) ); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(src1, alphaBits) ); + } + else + { + v128u32 src32[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_SSE2(src0, src32[0], src32[1]); + ColorspaceConvert555To6665Opaque_SSE2(src1, src32[2], src32[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(src0, src32[0], src32[1]); + ColorspaceConvert555To8888Opaque_SSE2(src1, src32[2], src32[3]); + } + + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, src32[0] ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, src32[1] ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, src32[2] ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, src32[3] ); + } + + if (!ISDEBUGRENDER) + { + _mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID ); + } +} + +template +FORCEINLINE void PixelOperation_SSE2::_copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 src16[2] = { + ColorspaceConvert6665To5551_SSE2(src0, src1), + ColorspaceConvert6665To5551_SSE2(src2, src3) + //_mm_set1_epi16(0x801F), + //_mm_set1_epi16(0x801F) + }; + + const v128u16 alphaBits = _mm_set1_epi16(0x8000); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(src16[0], alphaBits) ); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(src16[1], alphaBits) ); + } + else + { + const v128u32 
alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_or_si128(src0, alphaBits) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_or_si128(src1, alphaBits) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_or_si128(src2, alphaBits) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_or_si128(src3, alphaBits) ); + } + + if (!ISDEBUGRENDER) + { + _mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID ); + } +} + +template +FORCEINLINE void PixelOperation_SSE2::_copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + const v128u16 passMask16[2] = { + _mm_unpacklo_epi8(passMask8, passMask8), + _mm_unpackhi_epi8(passMask8, passMask8) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 dst16[2] = { + _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0), + _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1) + }; + + const v128u16 alphaBits = _mm_set1_epi16(0x8000); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(src0, alphaBits), passMask16[0]) ); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(src1, alphaBits), passMask16[1]) ); + } + else + { + v128u32 src32[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_SSE2(src0, src32[0], src32[1]); + ColorspaceConvert555To6665Opaque_SSE2(src1, src32[2], src32[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(src0, src32[0], src32[1]); + ColorspaceConvert555To8888Opaque_SSE2(src1, src32[2], src32[3]); + } + + const v128u32 dst32[4] = { + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3), + }; + + const v128u32 passMask32[4] = { + _mm_unpacklo_epi16(passMask16[0], passMask16[0]), + _mm_unpackhi_epi16(passMask16[0], passMask16[0]), + _mm_unpacklo_epi16(passMask16[1], passMask16[1]), + _mm_unpackhi_epi16(passMask16[1], passMask16[1]) + }; + + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], src32[0], passMask32[0]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], src32[1], passMask32[1]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], src32[2], passMask32[2]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], src32[3], passMask32[3]) ); + } + + if (!ISDEBUGRENDER) + { + const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID); + _mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) ); + } +} + +template +FORCEINLINE void PixelOperation_SSE2::_copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + const v128u16 passMask16[2] = { + _mm_unpacklo_epi8(passMask8, passMask8), + _mm_unpackhi_epi8(passMask8, passMask8) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 src16[2] = { + 
ColorspaceConvert6665To5551_SSE2(src0, src1), + ColorspaceConvert6665To5551_SSE2(src2, src3) + }; + + const v128u16 dst16[2] = { + _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0), + _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1) + }; + + const v128u16 alphaBits = _mm_set1_epi16(0x8000); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(src16[0], alphaBits), passMask16[0]) ); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(src16[1], alphaBits), passMask16[1]) ); + } + else + { + const v128u32 passMask32[4] = { + _mm_unpacklo_epi16(passMask16[0], passMask16[0]), + _mm_unpackhi_epi16(passMask16[0], passMask16[0]), + _mm_unpacklo_epi16(passMask16[1], passMask16[1]), + _mm_unpackhi_epi16(passMask16[1], passMask16[1]) + }; + + const v128u32 dst[4] = { + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2), + _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3), + }; + + const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst[0], _mm_or_si128(src0, alphaBits), passMask32[0]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst[1], _mm_or_si128(src1, alphaBits), passMask32[1]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst[2], _mm_or_si128(src2, alphaBits), passMask32[2]) ); + _mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst[3], _mm_or_si128(src3, alphaBits), passMask32[3]) ); + } + + if (!ISDEBUGRENDER) + { + const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID); + _mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) ); + } +} + +template +FORCEINLINE void PixelOperation_SSE2::_brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = _mm_set1_epi16(0x8000); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(colorop_vec.increase(src0, evy16), alphaBits) ); + _mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(colorop_vec.increase(src1, evy16), alphaBits) ); + } + else + { + v128u32 dst[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555XTo666X_SSE2(src0, dst[0], dst[1]); + ColorspaceConvert555XTo666X_SSE2(src1, dst[2], dst[3]); + } + else + { + ColorspaceConvert555XTo888X_SSE2(src0, dst[0], dst[1]); + ColorspaceConvert555XTo888X_SSE2(src1, dst[2], dst[3]); + } + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_or_si128(colorop_vec.increase(dst[0], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_or_si128(colorop_vec.increase(dst[1], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_or_si128(colorop_vec.increase(dst[2], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_or_si128(colorop_vec.increase(dst[3], evy16), alphaBits) );
+	}
+
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+
+		const v128u16 src16[2] = {
+			ColorspaceConvert6665To5551_SSE2(src0, src1),
+			ColorspaceConvert6665To5551_SSE2(src2, src3)
+		};
+
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(colorop_vec.increase(src16[0], evy16), alphaBits) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(colorop_vec.increase(src16[1], evy16), alphaBits) );
+	}
+	else
+	{
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_or_si128(colorop_vec.increase(src0, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_or_si128(colorop_vec.increase(src1, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_or_si128(colorop_vec.increase(src2, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_or_si128(colorop_vec.increase(src3, evy16), alphaBits) );
+	}
+
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const
+{
+	const v128u16 passMask16[2] = {
+		_mm_unpacklo_epi8(passMask8, passMask8),
+		_mm_unpackhi_epi8(passMask8, passMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(colorop_vec.increase(src0, evy16), alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(colorop_vec.increase(src1, evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		v128u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_SSE2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo666X_SSE2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_SSE2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo888X_SSE2(src1, src32[2], src32[3]);
+		}
+
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(colorop_vec.increase(src32[0], evy16), alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(colorop_vec.increase(src32[1], evy16), alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(colorop_vec.increase(src32[2], evy16), alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(colorop_vec.increase(src32[3], evy16), alphaBits), passMask32[3]) );
+	}
+
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const
+{
+	const v128u16 passMask16[2] = {
+		_mm_unpacklo_epi8(passMask8, passMask8),
+		_mm_unpackhi_epi8(passMask8, passMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 src16[2] = {
+			ColorspaceConvert6665To5551_SSE2(src0, src1),
+			ColorspaceConvert6665To5551_SSE2(src2, src3)
+		};
+
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(colorop_vec.increase(src16[0], evy16), alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(colorop_vec.increase(src16[1], evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(colorop_vec.increase(src0, evy16), alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(colorop_vec.increase(src1, evy16), alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(colorop_vec.increase(src2, evy16), alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(colorop_vec.increase(src3, evy16), alphaBits), passMask32[3]) );
+	}
+
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(colorop_vec.decrease(src0, evy16), alphaBits) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(colorop_vec.decrease(src1, evy16), alphaBits) );
+	}
+	else
+	{
+		v128u32 dst[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_SSE2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo666X_SSE2(src1, dst[2], dst[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_SSE2(src0, dst[0], dst[1]);
+			ColorspaceConvert555XTo888X_SSE2(src1, dst[2], dst[3]);
+		}
+
+		const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm_set1_epi32(0x1F000000) : _mm_set1_epi32(0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_or_si128(colorop_vec.decrease(dst[0], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_or_si128(colorop_vec.decrease(dst[1], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_or_si128(colorop_vec.decrease(dst[2], evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_or_si128(colorop_vec.decrease(dst[3], evy16), alphaBits) );
+	}
+
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+
+		const v128u16 src16[2] = {
+			ColorspaceConvert6665To5551_SSE2(src0, src1),
+			ColorspaceConvert6665To5551_SSE2(src2, src3)
+		};
+
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_or_si128(colorop_vec.decrease(src16[0], evy16), alphaBits) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_or_si128(colorop_vec.decrease(src16[1], evy16), alphaBits) );
+	}
+	else
+	{
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_or_si128(colorop_vec.decrease(src0, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_or_si128(colorop_vec.decrease(src1, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_or_si128(colorop_vec.decrease(src2, evy16), alphaBits) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_or_si128(colorop_vec.decrease(src3, evy16), alphaBits) );
+	}
+
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, srcLayerID );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const
+{
+	const v128u16 passMask16[2] = {
+		_mm_unpacklo_epi8(passMask8, passMask8),
+		_mm_unpackhi_epi8(passMask8, passMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(colorop_vec.decrease(src0, evy16), alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(colorop_vec.decrease(src1, evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		v128u32 src32[4];
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			ColorspaceConvert555XTo666X_SSE2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo666X_SSE2(src1, src32[2], src32[3]);
+		}
+		else
+		{
+			ColorspaceConvert555XTo888X_SSE2(src0, src32[0], src32[1]);
+			ColorspaceConvert555XTo888X_SSE2(src1, src32[2], src32[3]);
+		}
+
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(colorop_vec.decrease(src32[0], evy16), alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(colorop_vec.decrease(src32[1], evy16), alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(colorop_vec.decrease(src32[2], evy16), alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(colorop_vec.decrease(src32[3], evy16), alphaBits), passMask32[3]) );
+	}
+
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_SSE2::_brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const
+{
+	const v128u16 passMask16[2] = {
+		_mm_unpacklo_epi8(passMask8, passMask8),
+		_mm_unpackhi_epi8(passMask8, passMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 src16[2] = {
+			ColorspaceConvert6665To5551_SSE2(src0, src1),
+			ColorspaceConvert6665To5551_SSE2(src2, src3)
+		};
+
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(colorop_vec.decrease(src16[0], evy16), alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(colorop_vec.decrease(src16[1], evy16), alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(colorop_vec.decrease(src0, evy16), alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(colorop_vec.decrease(src1, evy16), alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(colorop_vec.decrease(src2, evy16), alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(colorop_vec.decrease(src3, evy16), alphaBits), passMask32[3]) );
+	}
+
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
+                                                           const v128u8 &passMask8,
+                                                           const v128u16 &evy16,
+                                                           const v128u8 &srcLayerID,
+                                                           const v128u16 &src1, const v128u16 &src0,
+                                                           const v128u8 &srcEffectEnableMask,
+                                                           const v128u8 &dstBlendEnableMaskLUT,
+                                                           const v128u8 &enableColorEffectMask,
+                                                           const v128u8 &spriteAlpha,
+                                                           const v128u8 &spriteMode) const
+{
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+
+	v128u8 dstTargetBlendEnableMask;
+
+#ifdef ENABLE_SSSE3
+	dstTargetBlendEnableMask = _mm_shuffle_epi8(dstBlendEnableMaskLUT, dstLayerID);
+#else
+	dstTargetBlendEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG0]));
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG1])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG2])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG3])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_OBJ])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_Backdrop])) );
+#endif
+
+	dstTargetBlendEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID), dstTargetBlendEnableMask );
+
+	// Select the color effect based on the BLDCNT target flags.
+	const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask);
+	v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm_setzero_si128();
+
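+	// What the SSSE3 path above computes, written out as per-pixel scalar
+	// logic (a reference sketch; dstBlendEnable[] is presumed to hold 0xFF or
+	// 0x00 per layer ID so that it can be used directly as a byte mask):
+	//
+	//    for (each pixel p)
+	//       dstTargetBlendEnableMask[p] = dstBlendEnable[ dstLayerID[p] ];
+	//
+	// _mm_shuffle_epi8 treats each byte of dstLayerID as an index into the
+	// 16-byte dstBlendEnableMaskLUT, performing all 16 lookups in one shot.
+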
+	// Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers.
+	// Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then
+	// convert EVA and EVB into vectors of uint16 right before we use them.
+	__m128i eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVA) : _mm_set1_epi16(compInfo.renderState.blendEVA);
+	__m128i evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVB) : _mm_set1_epi16(compInfo.renderState.blendEVB);
+
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		const v128u8 isObjTranslucentMask = _mm_and_si128( dstTargetBlendEnableMask, _mm_or_si128(_mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Bitmap))) );
+		forceDstTargetBlendMask = isObjTranslucentMask;
+
+		const v128u8 spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask);
+		eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask);
+		evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask);
+	}
+
+	// ----------
+
+	__m128i tmpSrc[4];
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		tmpSrc[0] = src0;
+		tmpSrc[1] = src1;
+		tmpSrc[2] = _mm_setzero_si128();
+		tmpSrc[3] = _mm_setzero_si128();
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		ColorspaceConvert555XTo666X_SSE2(src0, tmpSrc[0], tmpSrc[1]);
+		ColorspaceConvert555XTo666X_SSE2(src1, tmpSrc[2], tmpSrc[3]);
+	}
+	else
+	{
+		ColorspaceConvert555XTo888X_SSE2(src0, tmpSrc[0], tmpSrc[1]);
+		ColorspaceConvert555XTo888X_SSE2(src1, tmpSrc[2], tmpSrc[3]);
+	}
+
+	switch (compInfo.renderState.colorEffect)
+	{
+		case ColorEffect_IncreaseBrightness:
+		{
+			const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) );
+
+			const v128u16 brightnessMask16[2] = {
+				_mm_unpacklo_epi8(brightnessMask8, brightnessMask8),
+				_mm_unpackhi_epi8(brightnessMask8, brightnessMask8)
+			};
+
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] );
+			}
+			else
+			{
+				const v128u32 brightnessMask32[4] = {
+					_mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+					_mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1])
+				};
+
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] );
+			}
+			break;
+		}
+
+		case ColorEffect_DecreaseBrightness:
+		{
+			const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) );
+
+			const v128u16 brightnessMask16[2] = {
+				_mm_unpacklo_epi8(brightnessMask8, brightnessMask8),
+				_mm_unpackhi_epi8(brightnessMask8, brightnessMask8)
+			};
+
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] );
+			}
+			else
+			{
+				const v128u32 brightnessMask32[4] = {
+					_mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+					_mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1])
+				};
+
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] );
+			}
+			break;
+		}
+
+		default:
+			break;
+	}
+
+	// Render the pixel using the selected color effect.
+	const v128u8 blendMask8 = _mm_or_si128( forceDstTargetBlendMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstTargetBlendEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) );
+
+	const v128u16 blendMask16[2] = {
+		_mm_unpacklo_epi8(blendMask8, blendMask8),
+		_mm_unpackhi_epi8(blendMask8, blendMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		v128u16 blendSrc16[2];
+
+		switch (LAYERTYPE)
+		{
+			case GPULayerType_3D:
+				//blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]);
+				//blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]);
+				printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n");
+				assert(false);
+				break;
+
+			case GPULayerType_BG:
+				blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128);
+				blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128);
+				break;
+
+			case GPULayerType_OBJ:
+			{
+				// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+				const v128u16 tempEVA[2] = {
+					_mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128())
+				};
+				const v128u16 tempEVB[2] = {
+					_mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128())
+				};
+
+				blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]);
+				blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]);
+				break;
+			}
+		}
+
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
+
+		// Store the final colors.
+		const v128u16 passMask16[2] = {
+			_mm_unpacklo_epi8(passMask8, passMask8),
+			_mm_unpackhi_epi8(passMask8, passMask8)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(tmpSrc[0], alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(tmpSrc[1], alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		v128u32 blendSrc32[4];
+
+		switch (LAYERTYPE)
+		{
+			case GPULayerType_3D:
+				//blendSrc32[0] = colorop_vec.blend3D(src0, dst32[0]);
+				//blendSrc32[1] = colorop_vec.blend3D(src1, dst32[1]);
+				//blendSrc32[2] = colorop_vec.blend3D(src2, dst32[2]);
+				//blendSrc32[3] = colorop_vec.blend3D(src3, dst32[3]);
+				printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n");
+				assert(false);
+				break;
+
+			case GPULayerType_BG:
+				blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128);
+				blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128);
+				blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128);
+				blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128);
+				break;
+
+			case GPULayerType_OBJ:
+			{
+				// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+				//
+				// Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only
+				// going to send the 4 corresponding EVA/EVB vectors as well. In this case, each individual
+				// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
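+				//
+				// Concretely, with hypothetical byte values: if eva_vec128 holds
+				// [a0 a1 a2 a3 ...], then
+				//    unpacklo_epi8(eva, eva)      -> bytes [a0 a0 a1 a1 a2 a2 a3 a3 ...]
+				//    unpacklo_epi8(result, zero)  -> words [a0 a0 a1 a1 ...]
+				// so both 16-bit lanes of each 32-bit pixel see that pixel's own
+				// EVA value, which is exactly what the blend math below expects.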
+				v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128);
+				v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128);
+
+				const v128u16 tempEVA[4] = {
+					_mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128())
+				};
+
+				tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128);
+				tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128);
+
+				const v128u16 tempEVB[4] = {
+					_mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128())
+				};
+
+				blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]);
+				blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]);
+				blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]);
+				blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]);
+				break;
+			}
+		}
+
+		const v128u32 blendMask32[4] = {
+			_mm_unpacklo_epi16(blendMask16[0], blendMask16[0]),
+			_mm_unpackhi_epi16(blendMask16[0], blendMask16[0]),
+			_mm_unpacklo_epi16(blendMask16[1], blendMask16[1]),
+			_mm_unpackhi_epi16(blendMask16[1], blendMask16[1])
+		};
+
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]);
+		tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]);
+		tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]);
+
+		// Store the final colors.
+		const v128u16 passMask16[2] = {
+			_mm_unpacklo_epi8(passMask8, passMask8),
+			_mm_unpackhi_epi8(passMask8, passMask8)
+		};
+
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(tmpSrc[0], alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(tmpSrc[1], alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(tmpSrc[2], alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(tmpSrc[3], alphaBits), passMask32[3]) );
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
+                                                           const v128u8 &passMask8,
+                                                           const v128u16 &evy16,
+                                                           const v128u8 &srcLayerID,
+                                                           const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+                                                           const v128u8 &srcEffectEnableMask,
+                                                           const v128u8 &dstBlendEnableMaskLUT,
+                                                           const v128u8 &enableColorEffectMask,
+                                                           const v128u8 &spriteAlpha,
+                                                           const v128u8 &spriteMode) const
+{
+	const v128u8 dstLayerID = _mm_load_si128((v128u8 *)compInfo.target.lineLayerID);
+	_mm_store_si128( (v128u8 *)compInfo.target.lineLayerID, _mm_blendv_epi8(dstLayerID, srcLayerID, passMask8) );
+
+	v128u8 dstTargetBlendEnableMask;
+
+#ifdef ENABLE_SSSE3
+	dstTargetBlendEnableMask = _mm_shuffle_epi8(dstBlendEnableMaskLUT, dstLayerID);
+#else
+	dstTargetBlendEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG0]));
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG1])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG2])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_BG3])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_OBJ])) );
+	dstTargetBlendEnableMask = _mm_or_si128( dstTargetBlendEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), _mm_set1_epi8(compInfo.renderState.dstBlendEnable[GPULayerID_Backdrop])) );
+#endif
+
+	dstTargetBlendEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID), dstTargetBlendEnableMask );
+
+	// Select the color effect based on the BLDCNT target flags.
+	const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask);
+	v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm_setzero_si128();
+
+	// Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers.
+	// Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then
+	// convert EVA and EVB into vectors of uint16 right before we use them.
+	__m128i eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVA) : _mm_set1_epi16(compInfo.renderState.blendEVA);
+	__m128i evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVB) : _mm_set1_epi16(compInfo.renderState.blendEVB);
+
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		const v128u8 isObjTranslucentMask = _mm_and_si128( dstTargetBlendEnableMask, _mm_or_si128(_mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Bitmap))) );
+		forceDstTargetBlendMask = isObjTranslucentMask;
+
+		const v128u8 spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask);
+		eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask);
+		evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask);
+	}
+
+	// ----------
+
+	__m128i tmpSrc[4];
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		tmpSrc[0] = ColorspaceConvert6665To5551_SSE2(src0, src1);
+		tmpSrc[1] = ColorspaceConvert6665To5551_SSE2(src2, src3);
+		tmpSrc[2] = _mm_setzero_si128();
+		tmpSrc[3] = _mm_setzero_si128();
+	}
+	else
+	{
+		tmpSrc[0] = src0;
+		tmpSrc[1] = src1;
+		tmpSrc[2] = src2;
+		tmpSrc[3] = src3;
+	}
+
+	switch (compInfo.renderState.colorEffect)
+	{
+		case ColorEffect_IncreaseBrightness:
+		{
+			const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) );
+
+			const v128u16 brightnessMask16[2] = {
+				_mm_unpacklo_epi8(brightnessMask8, brightnessMask8),
+				_mm_unpackhi_epi8(brightnessMask8, brightnessMask8)
+			};
+
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] );
+			}
+			else
+			{
+				const v128u32 brightnessMask32[4] = {
+					_mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+					_mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1])
+				};
+
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] );
+			}
+			break;
+		}
+
+		case ColorEffect_DecreaseBrightness:
+		{
+			const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) );
+
+			const v128u16 brightnessMask16[2] = {
+				_mm_unpacklo_epi8(brightnessMask8, brightnessMask8),
+				_mm_unpackhi_epi8(brightnessMask8, brightnessMask8)
+			};
+
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] );
+			}
+			else
+			{
+				const v128u32 brightnessMask32[4] = {
+					_mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+					_mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+					_mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1])
+				};
+
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] );
+			}
+			break;
+		}
+
+		default:
+			break;
+	}
+
+	// Render the pixel using the selected color effect.
+	const v128u8 blendMask8 = _mm_or_si128( forceDstTargetBlendMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstTargetBlendEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) );
+
+	const v128u16 blendMask16[2] = {
+		_mm_unpacklo_epi8(blendMask8, blendMask8),
+		_mm_unpackhi_epi8(blendMask8, blendMask8)
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const v128u16 dst16[2] = {
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 0),
+			_mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1)
+		};
+
+		v128u16 blendSrc16[2];
+
+		switch (LAYERTYPE)
+		{
+			case GPULayerType_3D:
+				blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]);
+				blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]);
+				break;
+
+			case GPULayerType_BG:
+				blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128);
+				blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128);
+				break;
+
+			case GPULayerType_OBJ:
+			{
+				// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+				const v128u16 tempEVA[2] = {
+					_mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128())
+				};
+				const v128u16 tempEVB[2] = {
+					_mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128())
+				};
+
+				blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]);
+				blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]);
+				break;
+			}
+		}
+
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
+
+		// Store the final colors.
+		const v128u16 passMask16[2] = {
+			_mm_unpacklo_epi8(passMask8, passMask8),
+			_mm_unpackhi_epi8(passMask8, passMask8)
+		};
+
+		const v128u16 alphaBits = _mm_set1_epi16(0x8000);
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 0, _mm_blendv_epi8(dst16[0], _mm_or_si128(tmpSrc[0], alphaBits), passMask16[0]) );
+		_mm_store_si128( (v128u16 *)compInfo.target.lineColor16 + 1, _mm_blendv_epi8(dst16[1], _mm_or_si128(tmpSrc[1], alphaBits), passMask16[1]) );
+	}
+	else
+	{
+		const v128u32 dst32[4] = {
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 2),
+			_mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3),
+		};
+
+		v128u32 blendSrc32[4];
+
+		switch (LAYERTYPE)
+		{
+			case GPULayerType_3D:
+				blendSrc32[0] = colorop_vec.blend3D(tmpSrc[0], dst32[0]);
+				blendSrc32[1] = colorop_vec.blend3D(tmpSrc[1], dst32[1]);
+				blendSrc32[2] = colorop_vec.blend3D(tmpSrc[2], dst32[2]);
+				blendSrc32[3] = colorop_vec.blend3D(tmpSrc[3], dst32[3]);
+				break;
+
+			case GPULayerType_BG:
+				blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128);
+				blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128);
+				blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128);
+				blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128);
+				break;
+
+			case GPULayerType_OBJ:
+			{
+				// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+				//
+				// Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only
+				// going to send the 4 corresponding EVA/EVB vectors as well. In this case, each individual
+				// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
+				v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128);
+				v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128);
+
+				const v128u16 tempEVA[4] = {
+					_mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128())
+				};
+
+				tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128);
+				tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128);
+
+				const v128u16 tempEVB[4] = {
+					_mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()),
+					_mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()),
+					_mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128())
+				};
+
+				blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]);
+				blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]);
+				blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]);
+				blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]);
+				break;
+			}
+		}
+
+		const v128u32 blendMask32[4] = {
+			_mm_unpacklo_epi16(blendMask16[0], blendMask16[0]),
+			_mm_unpackhi_epi16(blendMask16[0], blendMask16[0]),
+			_mm_unpacklo_epi16(blendMask16[1], blendMask16[1]),
+			_mm_unpackhi_epi16(blendMask16[1], blendMask16[1])
+		};
+
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]);
+		tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]);
+		tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]);
+
+		// Store the final colors.
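+		// The alpha bits are forced fully opaque on write-out; the constant
+		// depends on the framebuffer format (read straight off the stores
+		// below and the RGB555 branch above):
+		//    RGB555:  OR 0x8000      (bit 15 is the opaque bit)
+		//    RGB666:  OR 0x1F000000  (alpha = 0x1F)
+		//    RGB888:  OR 0xFF000000  (alpha = 0xFF)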
+		const v128u16 passMask16[2] = {
+			_mm_unpacklo_epi8(passMask8, passMask8),
+			_mm_unpackhi_epi8(passMask8, passMask8)
+		};
+
+		const v128u32 passMask32[4] = {
+			_mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+			_mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+			_mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+			_mm_unpackhi_epi16(passMask16[1], passMask16[1])
+		};
+
+		const v128u32 alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 0, _mm_blendv_epi8(dst32[0], _mm_or_si128(tmpSrc[0], alphaBits), passMask32[0]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 1, _mm_blendv_epi8(dst32[1], _mm_or_si128(tmpSrc[1], alphaBits), passMask32[1]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 2, _mm_blendv_epi8(dst32[2], _mm_or_si128(tmpSrc[2], alphaBits), passMask32[2]) );
+		_mm_store_si128( (v128u32 *)compInfo.target.lineColor32 + 3, _mm_blendv_epi8(dst32[3], _mm_or_si128(tmpSrc[3], alphaBits), passMask32[3]) );
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void PixelOperation_SSE2::Composite16(GPUEngineCompositorInfo &compInfo,
+                                                  const bool didAllPixelsPass,
+                                                  const v128u8 &passMask8,
+                                                  const v128u16 &evy16,
+                                                  const v128u8 &srcLayerID,
+                                                  const v128u16 &src1, const v128u16 &src0,
+                                                  const v128u8 &srcEffectEnableMask,
+                                                  const v128u8 &dstBlendEnableMaskLUT,
+                                                  const u8 *__restrict enableColorEffectPtr,
+                                                  const u8 *__restrict sprAlphaPtr,
+                                                  const u8 *__restrict sprModePtr) const
+{
+	if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass)
+	{
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_copy16(compInfo, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_Copy:
+				this->_copy16(compInfo, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightUp:
+				this->_brightnessUp16(compInfo, evy16, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightDown:
+				this->_brightnessDown16(compInfo, evy16, srcLayerID, src1, src0);
+				break;
+
+			default:
+				break;
+		}
+	}
+	else
+	{
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_Copy:
+				this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightUp:
+				this->_brightnessUpMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightDown:
+				this->_brightnessDownMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0);
+				break;
+
+			default:
+			{
+				const v128u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_load_si128((v128u8 *)enableColorEffectPtr) : _mm_set1_epi8(0xFF);
+				const v128u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((v128u8 *)sprAlphaPtr) : _mm_setzero_si128();
+				const v128u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((v128u8 *)sprModePtr) : _mm_setzero_si128();
+
+				this->_unknownEffectMask16(compInfo,
+				                           passMask8,
+				                           evy16,
+				                           srcLayerID,
+				                           src1, src0,
+				                           srcEffectEnableMask,
+				                           dstBlendEnableMaskLUT,
+				                           enableColorEffectMask,
+				                           spriteAlpha,
+				                           spriteMode);
+				break;
+			}
+		}
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void PixelOperation_SSE2::Composite32(GPUEngineCompositorInfo &compInfo,
+                                                  const bool didAllPixelsPass,
+                                                  const v128u8 &passMask8,
+                                                  const v128u16 &evy16,
+                                                  const v128u8 &srcLayerID,
+                                                  const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+                                                  const v128u8 &srcEffectEnableMask,
+                                                  const v128u8 &dstBlendEnableMaskLUT,
+                                                  const u8 *__restrict enableColorEffectPtr,
+                                                  const u8 *__restrict sprAlphaPtr,
+                                                  const u8 *__restrict sprModePtr) const
+{
+	if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass)
+	{
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_Copy:
+				this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightUp:
+				this->_brightnessUp32(compInfo, evy16, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightDown:
+				this->_brightnessDown32(compInfo, evy16, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			default:
+				break;
+		}
+	}
+	else
+	{
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_Copy:
+				this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightUp:
+				this->_brightnessUpMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			case GPUCompositorMode_BrightDown:
+				this->_brightnessDownMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0);
+				break;
+
+			default:
+			{
+				const v128u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_load_si128((v128u8 *)enableColorEffectPtr) : _mm_set1_epi8(0xFF);
+				const v128u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((v128u8 *)sprAlphaPtr) : _mm_setzero_si128();
+				const v128u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((v128u8 *)sprModePtr) : _mm_setzero_si128();
+
+				this->_unknownEffectMask32(compInfo,
+				                           passMask8,
+				                           evy16,
+				                           srcLayerID,
+				                           src3, src2, src1, src0,
+				                           srcEffectEnableMask,
+				                           dstBlendEnableMaskLUT,
+				                           enableColorEffectMask,
+				                           spriteAlpha,
+				                           spriteMode);
+				break;
+			}
+		}
+	}
+}
+
+template <bool ISFIRSTLINE>
+void GPUEngineBase::_MosaicLine(GPUEngineCompositorInfo &compInfo)
+{
+	u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID];
+
+	for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=sizeof(v128u16))
+	{
+		const v128u16 dstColor16[2] = {
+			_mm_load_si128((v128u16 *)(this->_deferredColorNative + x) + 0),
+			_mm_load_si128((v128u16 *)(this->_deferredColorNative + x) + 1)
+		};
+
+		if (ISFIRSTLINE)
+		{
+			const v128u8 indexVec = _mm_load_si128((v128u8 *)(this->_deferredIndexNative + x));
+			const v128u8 idxMask8 = _mm_cmpeq_epi8(indexVec, _mm_setzero_si128());
+			const v128u16 idxMask16[2] = {
+				_mm_unpacklo_epi8(idxMask8, idxMask8),
+				_mm_unpackhi_epi8(idxMask8, idxMask8)
+			};
+
+			const v128u16 mosaicColor16[2] = {
+				_mm_blendv_epi8(_mm_and_si128(dstColor16[0], _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask16[0]),
+				_mm_blendv_epi8(_mm_and_si128(dstColor16[1], _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask16[1])
+			};
+
+			const v128u8 mosaicSetColorMask8 = _mm_cmpeq_epi8( _mm_loadu_si128((v128u8 *)(compInfo.renderState.mosaicWidthBG->begin + x)), _mm_setzero_si128() );
+			const v128u16 mosaicSetColorMask16[2] = {
+				_mm_unpacklo_epi8(mosaicSetColorMask8, mosaicSetColorMask8),
+				_mm_unpackhi_epi8(mosaicSetColorMask8, mosaicSetColorMask8)
+			};
+
+			_mm_storeu_si128( (v128u16 *)(mosaicColorBG + x) + 0, _mm_blendv_epi8(mosaicColor16[0], _mm_loadu_si128((v128u16 *)(mosaicColorBG + x) + 0), mosaicSetColorMask16[0]) );
+			_mm_storeu_si128( (v128u16 *)(mosaicColorBG + x) + 1, _mm_blendv_epi8(mosaicColor16[1], _mm_loadu_si128((v128u16 *)(mosaicColorBG + x) + 1), mosaicSetColorMask16[1]) );
+		}
+
+		const v128u16 outColor16[2] = {
+			_mm_setr_epi16(mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+0]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+1]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+2]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+3]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+4]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+5]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+6]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+7]]),
+
+			_mm_setr_epi16(mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+8]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+9]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+10]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+11]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+12]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+13]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+14]],
+			               mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+15]])
+		};
+
+		const v128u16 writeColorMask16[2] = {
+			_mm_cmpeq_epi16(outColor16[0], _mm_set1_epi16(0xFFFF)),
+			_mm_cmpeq_epi16(outColor16[1], _mm_set1_epi16(0xFFFF))
+		};
+
+		_mm_store_si128( (v128u16 *)(this->_deferredColorNative + x) + 0, _mm_blendv_epi8(outColor16[0], dstColor16[0], writeColorMask16[0]) );
+		_mm_store_si128( (v128u16 *)(this->_deferredColorNative + x) + 1, _mm_blendv_epi8(outColor16[1], dstColor16[1], writeColorMask16[1]) );
+	}
+}
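+
+// The 16 table lookups packed through _mm_setr_epi16 above have no SSE2
+// gather equivalent. Per pixel, the logic is simply (reference sketch):
+//
+//    out[x] = mosaicColorBG[ mosaicWidthBG->trunc32[x] ];
+//
+// i.e. every pixel inside a mosaic block re-reads the color that was captured
+// at the block's left edge, with 0xFFFF acting as the "keep original" marker.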
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32)
+{
+	static const size_t step = sizeof(v128u8);
+
+	const bool isUsingSrc32 = (srcColorNative32 != NULL);
+	const v128u16 evy16 = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	const v128u8 srcLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+	const v128u8 srcEffectEnableMask = _mm_set1_epi8(compInfo.renderState.srcEffectEnable[GPULayerID_OBJ]);
+	const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm_load_si128((v128u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm_setzero_si128();
+
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=step, srcColorNative16+=step, srcColorNative32+=step, compInfo.target.xNative+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step)
+	{
+		v128u8 passMask8;
+		int passMaskValue;
+		bool didAllPixelsPass;
+
+		if (WILLPERFORMWINDOWTEST)
+		{
+			// Do the window test.
+			passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestNative[GPULayerID_OBJ] + i));
+
+			// If none of the pixels within the vector pass, then reject them all at once.
+			passMaskValue = _mm_movemask_epi8(passMask8);
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+
+			didAllPixelsPass = (passMaskValue == 0xFFFF);
+		}
+		else
+		{
+			passMask8 = _mm_set1_epi8(0xFF);
+			passMaskValue = 0xFFFF;
+			didAllPixelsPass = true;
+		}
+
+		if (isUsingSrc32)
+		{
+			const v128u32 src[4] = {
+				_mm_load_si128((v128u32 *)srcColorNative32 + 0),
+				_mm_load_si128((v128u32 *)srcColorNative32 + 1),
+				_mm_load_si128((v128u32 *)srcColorNative32 + 2),
+				_mm_load_si128((v128u32 *)srcColorNative32 + 3)
+			};
+
+			pixelop_vec.Composite32(compInfo,
+			                        didAllPixelsPass,
+			                        passMask8, evy16,
+			                        srcLayerID,
+			                        src[3], src[2], src[1], src[0],
+			                        srcEffectEnableMask,
+			                        dstBlendEnableMaskLUT,
+			                        this->_enableColorEffectNative[GPULayerID_OBJ] + i,
+			                        this->_sprAlpha[compInfo.line.indexNative] + i,
+			                        this->_sprType[compInfo.line.indexNative] + i);
+		}
+		else
+		{
+			const v128u16 src[2] = {
+				_mm_load_si128((v128u16 *)srcColorNative16 + 0),
+				_mm_load_si128((v128u16 *)srcColorNative16 + 1)
+			};
+
+			pixelop_vec.Composite16(compInfo,
+			                        didAllPixelsPass,
+			                        passMask8, evy16,
+			                        srcLayerID,
+			                        src[1], src[0],
+			                        srcEffectEnableMask,
+			                        dstBlendEnableMaskLUT,
+			                        this->_enableColorEffectNative[GPULayerID_OBJ] + i,
+			                        this->_sprAlpha[compInfo.line.indexNative] + i,
+			                        this->_sprType[compInfo.line.indexNative] + i);
+		}
+	}
+}
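+
+// A note on the early-out used by all of these _LoopOp methods:
+// _mm_movemask_epi8 packs the top bit of each byte lane into an ordinary
+// 16-bit integer, so one scalar compare classifies the whole vector
+// (reference sketch):
+//
+//    passMaskValue == 0x0000  -> no pixel passed; skip all 16 pixels at once
+//    passMaskValue == 0xFFFF  -> every pixel passed; take the unmasked path
+//    anything else            -> fall back to the per-pixel masked path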
_mm_load_si128((v128u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm_setzero_si128(); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v128u8 passMask8; + int passMaskValue; + bool didAllPixelsPass; + + if (WILLPERFORMWINDOWTEST || (LAYERTYPE == GPULayerType_BG)) + { + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + } + + if (LAYERTYPE == GPULayerType_BG) + { + // Do the index test. Pixels with an index value of 0 are rejected. + const v128u8 idxPassMask8 = _mm_cmpeq_epi8(_mm_load_si128((v128u8 *)(srcIndexCustom + compInfo.target.xCustom)), _mm_setzero_si128()); + + if (WILLPERFORMWINDOWTEST) + { + passMask8 = _mm_andnot_si128(idxPassMask8, passMask8); + } + else + { + passMask8 = _mm_xor_si128(idxPassMask8, _mm_set1_epi32(0xFFFFFFFF)); + } + } + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = _mm_movemask_epi8(passMask8); + if (passMaskValue == 0) + { + continue; + } + + didAllPixelsPass = (passMaskValue == 0xFFFF); + } + else + { + passMask8 = _mm_set1_epi8(0xFF); + passMaskValue = 0xFFFF; + didAllPixelsPass = true; + } + + const v128u16 src[2] = { + _mm_load_si128((v128u16 *)(srcColorCustom16 + compInfo.target.xCustom) + 0), + _mm_load_si128((v128u16 *)(srcColorCustom16 + compInfo.target.xCustom) + 1) + }; + + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src[1], src[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + } + + return i; +} + +template +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +{ + static const size_t step = sizeof(v128u8); + + const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v128u16 evy16 = _mm_set1_epi16(compInfo.renderState.blendEVY); + const v128u8 srcLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID); + const v128u8 srcEffectEnableMask = _mm_set1_epi8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]); + const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm_load_si128((v128u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm_setzero_si128(); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v128u8 passMask8; + int passMaskValue; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + + // If none of the pixels within the vector pass, then reject them all at once. 
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
+size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr)
+{
+	static const size_t step = sizeof(v128u8);
+
+	const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step));
+	const v128u16 evy16 = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	const v128u8 srcLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+	const v128u8 srcEffectEnableMask = _mm_set1_epi8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]);
+	const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm_load_si128((v128u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm_setzero_si128();
+
+	size_t i = 0;
+	for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step)
+	{
+		if (compInfo.target.xCustom >= compInfo.line.widthCustom)
+		{
+			compInfo.target.xCustom -= compInfo.line.widthCustom;
+		}
+
+		v128u8 passMask8;
+		int passMaskValue;
+
+		if (WILLPERFORMWINDOWTEST)
+		{
+			// Do the window test.
+			passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom));
+
+			// If none of the pixels within the vector pass, then reject them all at once.
+			passMaskValue = _mm_movemask_epi8(passMask8);
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+		}
+		else
+		{
+			passMask8 = _mm_set1_epi8(0xFF);
+			passMaskValue = 0xFFFF;
+		}
+
+		switch (OUTPUTFORMAT)
+		{
+			case NDSColorFormat_BGR555_Rev:
+			case NDSColorFormat_BGR666_Rev:
+			{
+				const v128u16 src16[2] = {
+					_mm_load_si128((v128u16 *)((u16 *)vramColorPtr + i) + 0),
+					_mm_load_si128((v128u16 *)((u16 *)vramColorPtr + i) + 1)
+				};
+
+				if (LAYERTYPE != GPULayerType_OBJ)
+				{
+					v128u8 tempPassMask = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) );
+					tempPassMask = _mm_cmpeq_epi8(tempPassMask, _mm_set1_epi8(1));
+
+					passMask8 = _mm_and_si128(tempPassMask, passMask8);
+					passMaskValue = _mm_movemask_epi8(passMask8);
+				}
+
+				// If none of the pixels within the vector pass, then reject them all at once.
+				if (passMaskValue == 0)
+				{
+					continue;
+				}
+
+				// Write out the pixels.
+				const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+				pixelop_vec.Composite16(compInfo,
+				                        didAllPixelsPass,
+				                        passMask8, evy16,
+				                        srcLayerID,
+				                        src16[1], src16[0],
+				                        srcEffectEnableMask,
+				                        dstBlendEnableMaskLUT,
+				                        this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom,
+				                        this->_sprAlphaCustom + compInfo.target.xCustom,
+				                        this->_sprTypeCustom + compInfo.target.xCustom);
+				break;
+			}
+
+			case NDSColorFormat_BGR888_Rev:
+			{
+				const v128u32 src32[4] = {
+					_mm_load_si128((v128u32 *)((FragmentColor *)vramColorPtr + i) + 0),
+					_mm_load_si128((v128u32 *)((FragmentColor *)vramColorPtr + i) + 1),
+					_mm_load_si128((v128u32 *)((FragmentColor *)vramColorPtr + i) + 2),
+					_mm_load_si128((v128u32 *)((FragmentColor *)vramColorPtr + i) + 3)
+				};
+
+				if (LAYERTYPE != GPULayerType_OBJ)
+				{
+					v128u8 tempPassMask = _mm_packus_epi16( _mm_packs_epi32(_mm_srli_epi32(src32[0], 24), _mm_srli_epi32(src32[1], 24)), _mm_packs_epi32(_mm_srli_epi32(src32[2], 24), _mm_srli_epi32(src32[3], 24)) );
+					tempPassMask = _mm_cmpeq_epi8(tempPassMask, _mm_setzero_si128());
+
+					passMask8 = _mm_andnot_si128(tempPassMask, passMask8);
+					passMaskValue = _mm_movemask_epi8(passMask8);
+				}
+
+				// If none of the pixels within the vector pass, then reject them all at once.
+				if (passMaskValue == 0)
+				{
+					continue;
+				}
+
+				// Write out the pixels.
+				const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+				pixelop_vec.Composite32(compInfo,
+				                        didAllPixelsPass,
+				                        passMask8, evy16,
+				                        srcLayerID,
+				                        src32[3], src32[2], src32[1], src32[0],
+				                        srcEffectEnableMask,
+				                        dstBlendEnableMaskLUT,
+				                        this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom,
+				                        this->_sprAlphaCustom + compInfo.target.xCustom,
+				                        this->_sprTypeCustom + compInfo.target.xCustom);
+				break;
+			}
+		}
+	}
+
+	return i;
+}
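The 15-bit-color branch above derives its opacity mask from bit 15 of each pixel. The same operation in isolation, for sixteen BGR555 pixels split across two vectors (an illustrative helper, not DeSmuME API):

```cpp
#include <emmintrin.h>

// Shift the alpha bit of each 16-bit pixel down to bit 0, pack the two
// vectors to bytes, then compare against 1 to get 0x00/0xFF per pixel.
static __m128i AlphaBitPassMask555(__m128i px0, __m128i px1)
{
	const __m128i alphaBits = _mm_packus_epi16(_mm_srli_epi16(px0, 15),
	                                           _mm_srli_epi16(px1, 15));
	return _mm_cmpeq_epi8(alphaBits, _mm_set1_epi8(1));
}
```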
+
+template <bool ISDEBUGRENDER>
+size_t GPUEngineBase::_RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer,
+                                              size_t &frameX, size_t &spriteX,
+                                              u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
+{
+	size_t i = 0;
+
+	static const size_t step = sizeof(v128u16);
+	const v128u8 prioVec8 = _mm_set1_epi8(prio);
+
+	const size_t ssePixCount = length - (length % step);
+	for (; i < ssePixCount; i+=step, spriteX+=step, frameX+=step)
+	{
+		const v128u8 prioTabVec8 = _mm_loadu_si128((v128u8 *)(prioTab + frameX));
+		const v128u16 color16Lo = _mm_loadu_si128((v128u16 *)(vramBuffer + spriteX) + 0);
+		const v128u16 color16Hi = _mm_loadu_si128((v128u16 *)(vramBuffer + spriteX) + 1);
+
+		const v128u8 alphaCompare = _mm_cmpeq_epi8( _mm_packus_epi16(_mm_srli_epi16(color16Lo, 15), _mm_srli_epi16(color16Hi, 15)), _mm_set1_epi8(0x01) );
+		const v128u8 prioCompare = _mm_cmpgt_epi8(prioTabVec8, prioVec8);
+
+		const v128u8 combinedCompare = _mm_and_si128(prioCompare, alphaCompare);
+		const v128u16 combinedLoCompare = _mm_unpacklo_epi8(combinedCompare, combinedCompare);
+		const v128u16 combinedHiCompare = _mm_unpackhi_epi8(combinedCompare, combinedCompare);
+
+		// Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because
+		// maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite
+		// to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs.
+
+		_mm_storeu_si128( (v128u16 *)(dst + frameX) + 0, _mm_blendv_epi8(_mm_loadu_si128((v128u16 *)(dst + frameX) + 0), color16Lo, combinedLoCompare) );
+		_mm_storeu_si128( (v128u16 *)(dst + frameX) + 1, _mm_blendv_epi8(_mm_loadu_si128((v128u16 *)(dst + frameX) + 1), color16Hi, combinedHiCompare) );
+		_mm_storeu_si128( (v128u8 *)(prioTab + frameX), _mm_blendv_epi8(prioTabVec8, prioVec8, combinedCompare) );
+
+		if (!ISDEBUGRENDER)
+		{
+			_mm_storeu_si128( (v128u8 *)(dst_alpha + frameX), _mm_blendv_epi8(_mm_loadu_si128((v128u8 *)(dst_alpha + frameX)), _mm_set1_epi8(spriteAlpha + 1), combinedCompare) );
+			_mm_storeu_si128( (v128u8 *)(typeTab + frameX), _mm_blendv_epi8(_mm_loadu_si128((v128u8 *)(typeTab + frameX)), _mm_set1_epi8(OBJMode_Bitmap), combinedCompare) );
+			_mm_storeu_si128( (v128u8 *)(this->_sprNum + frameX), _mm_blendv_epi8(_mm_loadu_si128((v128u8 *)(this->_sprNum + frameX)), _mm_set1_epi8(spriteNum), combinedCompare) );
+		}
+	}
+
+	return i;
+}
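The comment above explains the choice of movdqu+pblendvb+movdqu over maskmovdqu. Note that pblendvb itself is SSE4.1; with 0x00/0xFF masks the same cached masked write can be expressed in pure SSE2, roughly like this (a sketch, not the patch's actual fallback path):

```cpp
#include <emmintrin.h>
#include <stdint.h>

// Read-modify-write masked store that keeps the destination line in cache,
// unlike _mm_maskmoveu_si128 (maskmovdqu), which bypasses it.
static void MaskedStoreSSE2(uint16_t *dst, __m128i newPixels, __m128i mask)
{
	const __m128i oldPixels = _mm_loadu_si128((const __m128i *)dst);
	const __m128i blended = _mm_or_si128(_mm_and_si128(mask, newPixels),
	                                     _mm_andnot_si128(mask, oldPixels));
	_mm_storeu_si128((__m128i *)dst, blended);
}
```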
+
+void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative)
+{
+	const v128u8 *__restrict win0Ptr = (const v128u8 *__restrict)win0;
+	const v128u8 *__restrict win1Ptr = (const v128u8 *__restrict)win1;
+	const v128u8 *__restrict winObjPtr = (const v128u8 *__restrict)winObj;
+
+	v128u8 *__restrict didPassWindowTestNativePtr = (v128u8 *__restrict)didPassWindowTestNative;
+	v128u8 *__restrict enableColorEffectNativePtr = (v128u8 *__restrict)enableColorEffectNative;
+
+	__m128i didPassWindowTest;
+	__m128i enableColorEffect;
+
+	__m128i win0HandledMask;
+	__m128i win1HandledMask;
+	__m128i winOBJHandledMask;
+	__m128i winOUTHandledMask;
+
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH/sizeof(v128u8); i++)
+	{
+		didPassWindowTest = _mm_setzero_si128();
+		enableColorEffect = _mm_setzero_si128();
+
+		win0HandledMask = _mm_setzero_si128();
+		win1HandledMask = _mm_setzero_si128();
+		winOBJHandledMask = _mm_setzero_si128();
+
+		// Window 0 has the highest priority, so always check this first.
+		if (win0Ptr != NULL)
+		{
+			const v128u8 win0Enable = _mm_set1_epi8(compInfo.renderState.WIN0_enable[layerID]);
+			const v128u8 win0Effect = _mm_set1_epi8(compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG]);
+
+			win0HandledMask = _mm_cmpeq_epi8(_mm_load_si128(win0Ptr + i), _mm_set1_epi8(1));
+			didPassWindowTest = _mm_and_si128(win0HandledMask, win0Enable);
+			enableColorEffect = _mm_and_si128(win0HandledMask, win0Effect);
+		}
+
+		// Window 1 has medium priority, and is checked after Window 0.
+		if (win1Ptr != NULL)
+		{
+			const v128u8 win1Enable = _mm_set1_epi8(compInfo.renderState.WIN1_enable[layerID]);
+			const v128u8 win1Effect = _mm_set1_epi8(compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG]);
+
+			win1HandledMask = _mm_andnot_si128(win0HandledMask, _mm_cmpeq_epi8(_mm_load_si128(win1Ptr + i), _mm_set1_epi8(1)));
+			didPassWindowTest = _mm_blendv_epi8(didPassWindowTest, win1Enable, win1HandledMask);
+			enableColorEffect = _mm_blendv_epi8(enableColorEffect, win1Effect, win1HandledMask);
+		}
+
+		// Window OBJ has low priority, and is checked after both Window 0 and Window 1.
+		if (winObjPtr != NULL)
+		{
+			const v128u8 winObjEnable = _mm_set1_epi8(compInfo.renderState.WINOBJ_enable[layerID]);
+			const v128u8 winObjEffect = _mm_set1_epi8(compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]);
+
+			winOBJHandledMask = _mm_andnot_si128( _mm_or_si128(win0HandledMask, win1HandledMask), _mm_cmpeq_epi8(_mm_load_si128(winObjPtr + i), _mm_set1_epi8(1)) );
+			didPassWindowTest = _mm_blendv_epi8(didPassWindowTest, winObjEnable, winOBJHandledMask);
+			enableColorEffect = _mm_blendv_epi8(enableColorEffect, winObjEffect, winOBJHandledMask);
+		}
+
+		// If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags.
+		// This has the lowest priority, and is always checked last.
+		const v128u8 winOutEnable = _mm_set1_epi8(compInfo.renderState.WINOUT_enable[layerID]);
+		const v128u8 winOutEffect = _mm_set1_epi8(compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]);
+
+		winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF) );
+		didPassWindowTest = _mm_blendv_epi8(didPassWindowTest, winOutEnable, winOUTHandledMask);
+		enableColorEffect = _mm_blendv_epi8(enableColorEffect, winOutEffect, winOUTHandledMask);
+
+		_mm_store_si128(didPassWindowTestNativePtr + i, didPassWindowTest);
+		_mm_store_si128(enableColorEffectNativePtr + i, enableColorEffect);
+	}
+}
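A scalar restatement of the priority cascade that `_PerformWindowTestingNative` vectorizes: WIN0 wins over WIN1, WIN1 over WINOBJ, and WINOUT covers any pixel inside no window at all. Types and names here are illustrative only.

```cpp
#include <stdint.h>

struct WinFlags { uint8_t enable; uint8_t effect; };

static void WindowTestPixel(int inWin0, int inWin1, int inWinObj,
                            WinFlags w0, WinFlags w1, WinFlags wobj, WinFlags wout,
                            uint8_t *didPass, uint8_t *colorEffect)
{
	// The first window that contains the pixel decides; otherwise WINOUT applies.
	const WinFlags w = inWin0 ? w0 : (inWin1 ? w1 : (inWinObj ? wobj : wout));
	*didPass = w.enable;
	*colorEffect = w.effect;
}
```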
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineBase::_ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? pixCount * sizeof(u32) / sizeof(v128u32) : pixCount * sizeof(u16) / sizeof(v128u16);
+	for (; i < vecCount; i++)
+	{
+		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+		{
+			v128u16 dstColor = _mm_load_si128((v128u16 *)dst + i);
+			dstColor = colorop_vec.increase(dstColor, _mm_set1_epi16(intensityClamped));
+			dstColor = _mm_or_si128(dstColor, _mm_set1_epi16(0x8000));
+			_mm_store_si128((v128u16 *)dst + i, dstColor);
+		}
+		else
+		{
+			v128u32 dstColor = _mm_load_si128((v128u32 *)dst + i);
+			dstColor = colorop_vec.increase(dstColor, _mm_set1_epi16(intensityClamped));
+			dstColor = _mm_or_si128(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm_set1_epi32(0x1F000000) : _mm_set1_epi32(0xFF000000));
+			_mm_store_si128((v128u32 *)dst + i, dstColor);
+		}
+	}
+
+	return (i * sizeof(__m128i));
+}
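A scalar model of what `colorop_vec.increase()` does per BGR555 pixel in the loop above, assuming the standard NDS master-brightness formula (each 5-bit channel moves toward white by `(31 - c) * intensity / 16`); the helper name is illustrative:

```cpp
#include <stdint.h>

static uint16_t MasterBrightnessUp555(uint16_t color, uint8_t intensityClamped /* 0..16 */)
{
	uint16_t out = 0x8000; // the vector loop also forces the alpha bit on
	for (int shift = 0; shift <= 10; shift += 5)
	{
		uint16_t c = (color >> shift) & 0x1F;
		c = (uint16_t)(c + (((31 - c) * intensityClamped) >> 4));
		out |= (uint16_t)(c << shift);
	}
	return out;
}
```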
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineBase::_ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? pixCount * sizeof(u32) / sizeof(v128u32) : pixCount * sizeof(u16) / sizeof(v128u16);
+	for (; i < vecCount; i++)
+	{
+		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+		{
+			v128u16 dstColor = _mm_load_si128((v128u16 *)dst + i);
+			dstColor = colorop_vec.decrease(dstColor, _mm_set1_epi16(intensityClamped));
+			dstColor = _mm_or_si128(dstColor, _mm_set1_epi16(0x8000));
+			_mm_store_si128((v128u16 *)dst + i, dstColor);
+		}
+		else
+		{
+			v128u32 dstColor = _mm_load_si128((v128u32 *)dst + i);
+			dstColor = colorop_vec.decrease(dstColor, _mm_set1_epi16(intensityClamped));
+			dstColor = _mm_or_si128(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? _mm_set1_epi32(0x1F000000) : _mm_set1_epi32(0xFF000000));
+			_mm_store_si128((v128u32 *)dst + i, dstColor);
+		}
+	}
+
+	return (i * sizeof(__m128i));
+}
+
+template <GPUCompositorMode COMPOSITORMODE, bool WILLPERFORMWINDOWTEST>
+size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr)
+{
+	static const size_t step = sizeof(v128u8);
+
+	const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step));
+	const v128u16 evy16 = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	const v128u8 srcLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+	const v128u8 srcEffectEnableMask = _mm_set1_epi8(compInfo.renderState.srcEffectEnable[GPULayerID_BG0]);
+	const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? _mm_load_si128((v128u8 *)compInfo.renderState.dstBlendEnableVecLookup) : _mm_setzero_si128();
+
+	size_t i = 0;
+	for (; i < ssePixCount; i+=step, srcLinePtr+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step)
+	{
+		if (compInfo.target.xCustom >= compInfo.line.widthCustom)
+		{
+			compInfo.target.xCustom -= compInfo.line.widthCustom;
+		}
+
+		// Determine which pixels pass by doing the window test and the alpha test.
+		v128u8 passMask8;
+		int passMaskValue;
+
+		if (WILLPERFORMWINDOWTEST)
+		{
+			// Do the window test.
+			passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[GPULayerID_BG0] + compInfo.target.xCustom));
+
+			// If none of the pixels within the vector pass, then reject them all at once.
+			passMaskValue = _mm_movemask_epi8(passMask8);
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+		}
+		else
+		{
+			passMask8 = _mm_set1_epi8(0xFF);
+			passMaskValue = 0xFFFF;
+		}
+
+		const v128u32 src[4] = {
+			_mm_load_si128((v128u32 *)srcLinePtr + 0),
+			_mm_load_si128((v128u32 *)srcLinePtr + 1),
+			_mm_load_si128((v128u32 *)srcLinePtr + 2),
+			_mm_load_si128((v128u32 *)srcLinePtr + 3)
+		};
+
+		// Do the alpha test. Pixels with an alpha value of 0 are rejected.
+		const v128u32 srcAlpha = _mm_packs_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)),
+		                                          _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) );
+
+		passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(srcAlpha, _mm_setzero_si128()), passMask8);
+
+		// If none of the pixels within the vector pass, then reject them all at once.
+		passMaskValue = _mm_movemask_epi8(passMask8);
+		if (passMaskValue == 0)
+		{
+			continue;
+		}
+
+		// Write out the pixels.
+		const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+		pixelop_vec.Composite32(compInfo,
+		                        didAllPixelsPass,
+		                        passMask8, evy16,
+		                        srcLayerID,
+		                        src[3], src[2], src[1], src[0],
+		                        srcEffectEnableMask,
+		                        dstBlendEnableMaskLUT,
+		                        this->_enableColorEffectCustom[GPULayerID_BG0] + compInfo.target.xCustom,
+		                        NULL,
+		                        NULL);
+	}
+
+	return i;
+}
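The 3D-layer alpha test above packs the alpha bytes of sixteen RGBA pixels into a single vector before comparing against zero. Isolated below; the signed saturation of the packs instructions is harmless here because the test only distinguishes zero from nonzero:

```cpp
#include <emmintrin.h>

static __m128i PackAlpha32To8(__m128i p0, __m128i p1, __m128i p2, __m128i p3)
{
	const __m128i a01 = _mm_packs_epi32(_mm_srli_epi32(p0, 24), _mm_srli_epi32(p1, 24));
	const __m128i a23 = _mm_packs_epi32(_mm_srli_epi32(p2, 24), _mm_srli_epi32(p3, 24));
	return _mm_packs_epi16(a01, a23); // alpha of pixel n lands in byte lane n
}
```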
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineA::_RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length)
+{
+	const v128u16 blendEVA_vec = _mm_set1_epi16(blendEVA);
+	const v128u16 blendEVB_vec = _mm_set1_epi16(blendEVB);
+
+	__m128i srcA_vec;
+	__m128i srcB_vec;
+	__m128i dstColor;
+
+#ifdef ENABLE_SSSE3
+	const v128u8 blendAB = _mm_or_si128( blendEVA_vec, _mm_slli_epi16(blendEVB_vec, 8) );
+#endif
+
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? length * sizeof(u32) / sizeof(__m128i) : length * sizeof(u16) / sizeof(__m128i);
+	for (; i < vecCount; i++)
+	{
+		srcA_vec = _mm_load_si128((__m128i *)srcA + i);
+		srcB_vec = _mm_load_si128((__m128i *)srcB + i);
+
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			// Get color masks based on whether the alpha value is 0. Colors with an alpha value
+			// equal to 0 are rejected.
+			v128u32 srcA_alpha = _mm_and_si128(srcA_vec, _mm_set1_epi32(0xFF000000));
+			v128u32 srcB_alpha = _mm_and_si128(srcB_vec, _mm_set1_epi32(0xFF000000));
+			v128u32 srcA_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcA_alpha, _mm_setzero_si128()), srcA_vec);
+			v128u32 srcB_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcB_alpha, _mm_setzero_si128()), srcB_vec);
+
+			v128u16 outColorLo;
+			v128u16 outColorHi;
+
+			// Temporarily convert the color component values from 8-bit to 16-bit, and then
+			// do the blend calculation.
+#ifdef ENABLE_SSSE3
+			outColorLo = _mm_unpacklo_epi8(srcA_masked, srcB_masked);
+			outColorHi = _mm_unpackhi_epi8(srcA_masked, srcB_masked);
+
+			outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
+			outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
+#else
+			v128u16 srcA_maskedLo = _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128());
+			v128u16 srcA_maskedHi = _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128());
+			v128u16 srcB_maskedLo = _mm_unpacklo_epi8(srcB_masked, _mm_setzero_si128());
+			v128u16 srcB_maskedHi = _mm_unpackhi_epi8(srcB_masked, _mm_setzero_si128());
+
+			outColorLo = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedLo, blendEVA_vec), _mm_mullo_epi16(srcB_maskedLo, blendEVB_vec) );
+			outColorHi = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedHi, blendEVA_vec), _mm_mullo_epi16(srcB_maskedHi, blendEVB_vec) );
+#endif
+
+			outColorLo = _mm_srli_epi16(outColorLo, 4);
+			outColorHi = _mm_srli_epi16(outColorHi, 4);
+
+			// Convert the color components back from 16-bit to 8-bit using a saturated pack.
+			dstColor = _mm_packus_epi16(outColorLo, outColorHi);
+
+			// Add the alpha components back in.
+			dstColor = _mm_and_si128(dstColor, _mm_set1_epi32(0x00FFFFFF));
+			dstColor = _mm_or_si128(dstColor, srcA_alpha);
+			dstColor = _mm_or_si128(dstColor, srcB_alpha);
+		}
+		else
+		{
+			v128u16 srcA_alpha = _mm_and_si128(srcA_vec, _mm_set1_epi16(0x8000));
+			v128u16 srcB_alpha = _mm_and_si128(srcB_vec, _mm_set1_epi16(0x8000));
+			v128u16 srcA_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcA_alpha, _mm_setzero_si128()), srcA_vec );
+			v128u16 srcB_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcB_alpha, _mm_setzero_si128()), srcB_vec );
+			v128u16 colorBitMask = _mm_set1_epi16(0x001F);
+
+			v128u16 ra;
+			v128u16 ga;
+			v128u16 ba;
+
+#ifdef ENABLE_SSSE3
+			ra = _mm_or_si128( _mm_and_si128( srcA_masked, colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 8), _mm_set1_epi16(0x1F00)) );
+			ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 3), _mm_set1_epi16(0x1F00)) );
+			ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(srcB_masked, 2), _mm_set1_epi16(0x1F00)) );
+
+			ra = _mm_maddubs_epi16(ra, blendAB);
+			ga = _mm_maddubs_epi16(ga, blendAB);
+			ba = _mm_maddubs_epi16(ba, blendAB);
+#else
+			ra = _mm_and_si128( srcA_masked, colorBitMask);
+			ga = _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask);
+			ba = _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask);
+
+			v128u16 rb = _mm_and_si128( srcB_masked, colorBitMask);
+			v128u16 gb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 5), colorBitMask);
+			v128u16 bb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 10), colorBitMask);
+
+			ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA_vec), _mm_mullo_epi16(rb, blendEVB_vec) );
+			ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA_vec), _mm_mullo_epi16(gb, blendEVB_vec) );
+			ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA_vec), _mm_mullo_epi16(bb, blendEVB_vec) );
+#endif
+
+			ra = _mm_srli_epi16(ra, 4);
+			ga = _mm_srli_epi16(ga, 4);
+			ba = _mm_srli_epi16(ba, 4);
+
+			ra = _mm_min_epi16(ra, colorBitMask);
+			ga = _mm_min_epi16(ga, colorBitMask);
+			ba = _mm_min_epi16(ba, colorBitMask);
+
+			dstColor = _mm_or_si128( _mm_or_si128(_mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10)), _mm_or_si128(srcA_alpha, srcB_alpha) );
+		}
+
+		_mm_store_si128((__m128i *)dst + i, dstColor);
+	}
+
+	return (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? i * sizeof(v128u32) / sizeof(u32) : i * sizeof(v128u16) / sizeof(u16);
+}
+
+#endif // ENABLE_SSE2
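The SSSE3 path of the capture blend interleaves the A and B pixels so that a single `_mm_maddubs_epi16` computes A*EVA + B*EVB per 16-bit lane. The core of that trick, reduced to a sketch (EVA/EVB are at most 16, so the signed 16-bit saturation of maddubs can never trigger; the helper name is illustrative):

```cpp
#include <tmmintrin.h>
#include <stdint.h>

static __m128i BlendBytesEVAB(__m128i srcA, __m128i srcB, uint8_t eva, uint8_t evb)
{
	// Low byte of each weight pair multiplies A, high byte multiplies B.
	const __m128i blendAB = _mm_set1_epi16((int16_t)(eva | (evb << 8)));
	__m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(srcA, srcB), blendAB);
	__m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(srcA, srcB), blendAB);
	// Divide by 16 and repack with unsigned saturation, as the loop above does.
	return _mm_packus_epi16(_mm_srli_epi16(lo, 4), _mm_srli_epi16(hi, 4));
}
```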
diff --git a/desmume/src/GPU_Operations_SSE2.h b/desmume/src/GPU_Operations_SSE2.h
new file mode 100644
index 000000000..5e8134657
--- /dev/null
+++ b/desmume/src/GPU_Operations_SSE2.h
@@ -0,0 +1,122 @@
+/*
+	Copyright (C) 2021 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef GPU_OPERATIONS_SSE2_H
+#define GPU_OPERATIONS_SSE2_H
+
+#include "GPU_Operations.h"
+
+#ifndef ENABLE_SSE2
+	#warning This header requires SSE2 support.
+#else
+
+class ColorOperation_SSE2
+{
+public:
+	ColorOperation_SSE2() {};
+
+	FORCEINLINE v128u16 blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
+
+	FORCEINLINE v128u16 blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 blend3D(const v128u32 &colA, const v128u32 &colB) const;
+
+	FORCEINLINE v128u16 increase(const v128u16 &col, const v128u16 &blendEVY) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 increase(const v128u32 &col, const v128u16 &blendEVY) const;
+
+	FORCEINLINE v128u16 decrease(const v128u16 &col, const v128u16 &blendEVY) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 decrease(const v128u32 &col, const v128u16 &blendEVY) const;
+};
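For reference, the scalar arithmetic behind `ColorOperation_SSE2::blend()` on one BGR555 pixel, assuming the usual NDS blend equation min(31, (a*EVA + b*EVB) / 16) per channel; the header only declares the operation, so this is a model, not the patch's implementation:

```cpp
#include <stdint.h>

static uint16_t Blend555(uint16_t a, uint16_t b, uint8_t eva, uint8_t evb)
{
	uint16_t out = 0;
	for (int shift = 0; shift <= 10; shift += 5)
	{
		unsigned c = (((a >> shift) & 0x1F) * eva + ((b >> shift) & 0x1F) * evb) >> 4;
		out |= (uint16_t)((c > 31 ? 31 : c) << shift); // clamp, like _mm_min_epi16 above
	}
	return out;
}
```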
+
+class PixelOperation_SSE2
+{
+protected:
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
+	                                      const v128u8 &passMask8,
+	                                      const v128u16 &evy16,
+	                                      const v128u8 &srcLayerID,
+	                                      const v128u16 &src1, const v128u16 &src0,
+	                                      const v128u8 &srcEffectEnableMask,
+	                                      const v128u8 &dstBlendEnableMaskLUT,
+	                                      const v128u8 &enableColorEffectMask,
+	                                      const v128u8 &spriteAlpha,
+	                                      const v128u8 &spriteMode) const;
+
+	template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
+	                                      const v128u8 &passMask8,
+	                                      const v128u16 &evy16,
+	                                      const v128u8 &srcLayerID,
+	                                      const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+	                                      const v128u8 &srcEffectEnableMask,
+	                                      const v128u8 &dstBlendEnableMaskLUT,
+	                                      const v128u8 &enableColorEffectMask,
+	                                      const v128u8 &spriteAlpha,
+	                                      const v128u8 &spriteMode) const;
+
+public:
+	PixelOperation_SSE2() {};
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v128u8 &passMask8,
+	                             const v128u16 &evy16,
+	                             const v128u8 &srcLayerID,
+	                             const v128u16 &src1, const v128u16 &src0,
+	                             const v128u8 &srcEffectEnableMask,
+	                             const v128u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v128u8 &passMask8,
+	                             const v128u16 &evy16,
+	                             const v128u8 &srcLayerID,
+	                             const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+	                             const v128u8 &srcEffectEnableMask,
+	                             const v128u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+};
+
+#endif // ENABLE_SSE2
+
+#endif // GPU_OPERATIONS_SSE2_H
diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index f4604f84b..c02622210 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -46,7 +46,7 @@
 #include "driver.h"
 #include "emufile.h"
 #include "matrix.h"
-#include "GPU.h"
+#include "GPU_Operations.h"
 #include "MMU.h"
 #include "render3D.h"
 #include "mem.h"
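Everything in `PixelOperation_SSE2` is parameterized on compile-time constants, so each compositor mode and color format gets its own fully specialized loop body with the mode checks folded away. The pattern in miniature, with purely illustrative names:

```cpp
#include <stdint.h>

enum class Mode { Copy, Brighten };

template <Mode MODE>
uint16_t CompositePixel(uint16_t src, uint8_t evy)
{
	if (MODE == Mode::Copy)
		return src; // this branch is resolved when the template is instantiated
	// Brighten: move each 5-bit channel toward white by evy/16.
	uint16_t out = 0;
	for (int shift = 0; shift <= 10; shift += 5)
	{
		uint16_t c = (src >> shift) & 0x1F;
		c = (uint16_t)(c + (((31 - c) * evy) >> 4));
		out |= (uint16_t)(c << shift);
	}
	return out;
}

// Each instantiation compiles to a distinct, branch-free function:
template uint16_t CompositePixel<Mode::Copy>(uint16_t, uint8_t);
template uint16_t CompositePixel<Mode::Brighten>(uint16_t, uint8_t);
```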