diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
index 6a720e019..94eab8ac3 100644
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -2,7 +2,7 @@
 	Copyright (C) 2006 yopyop
 	Copyright (C) 2006-2007 Theo Berkau
 	Copyright (C) 2007 shash
-	Copyright (C) 2008-2024 DeSmuME team
+	Copyright (C) 2008-2025 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -56,6 +56,9 @@
 #elif defined(ENABLE_SSE2)
 	#define USEVECTORSIZE_128
 	#define VECTORSIZE 16
+#elif defined(ENABLE_NEON_A64)
+	#define USEVECTORSIZE_128
+	#define VECTORSIZE 16
 #endif
 
 #if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
diff --git a/desmume/src/GPU_Operations.cpp b/desmume/src/GPU_Operations.cpp
index fef0a68cb..3307e76fe 100644
--- a/desmume/src/GPU_Operations.cpp
+++ b/desmume/src/GPU_Operations.cpp
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2021-2023 DeSmuME team
+	Copyright (C) 2021-2025 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -758,6 +758,8 @@ static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead,
 	#include "GPU_Operations_AVX2.cpp"
 #elif defined(ENABLE_SSE2)
 	#include "GPU_Operations_SSE2.cpp"
+#elif defined(ENABLE_NEON_A64)
+	#include "GPU_Operations_NEON.cpp"
 #else
 
 template
diff --git a/desmume/src/GPU_Operations_NEON.cpp b/desmume/src/GPU_Operations_NEON.cpp
new file mode 100644
index 000000000..8f2775bd3
--- /dev/null
+++ b/desmume/src/GPU_Operations_NEON.cpp
@@ -0,0 +1,2902 @@
+/*
+	Copyright (C) 2025 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef ENABLE_NEON_A64
+	#error This code requires ARM64 NEON support.
+	#warning This error might occur if this file is compiled directly. Do not compile this file directly, as it is already included in GPU_Operations.cpp.
+#else + +#include "GPU_Operations_NEON.h" +#include "./utils/colorspacehandler/colorspacehandler_NEON.h" + + +static const ColorOperation_NEON colorop_vec; +static const PixelOperation_NEON pixelop_vec; + +template +static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstWidth, size_t dstLineCount) +{ + if (INTEGERSCALEHINT == 0) + { + memcpy(dst, src, dstWidth * ELEMENTSIZE); + } + else if (INTEGERSCALEHINT == 1) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(uint8x16_t) / ELEMENTSIZE), vst1q_u8((u8 *)((uint8x16_t *)dst + (X)), vld1q_u8((u8 *)((uint8x16_t *)src + (X)))) ); + } + else if (INTEGERSCALEHINT == 2) + { + if (ELEMENTSIZE == 1) + { + v128u8 srcPix; + uint8x16x2_t dstPix; + + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE), \ + srcPix = vld1q_u8((u8 *)((v128u8 *)src + (X))); \ + dstPix.val[0] = vzip1q_u8(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u8(srcPix, srcPix); \ + vst1q_u8_x2((u8 *)((v128u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u8) / ELEMENTSIZE)) * 0)), dstPix); \ + vst1q_u8_x2((u8 *)((v128u8 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u8) / ELEMENTSIZE)) * 1)), dstPix); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE), \ + srcPix = vld1q_u8((u8 *)((v128u8 *)src + (X))); \ + dstPix.val[0] = vzip1q_u8(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u8(srcPix, srcPix); \ + vst1q_u8_x2((u8 *)((v128u8 *)dst + ((X) * 2)), dstPix); \ + ); + } + } + else if (ELEMENTSIZE == 2) + { + v128u16 srcPix; + uint16x8x2_t dstPix; + + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u16) / ELEMENTSIZE), \ + srcPix = vld1q_u16((u16 *)((v128u16 *)src + (X))); \ + dstPix.val[0] = vzip1q_u16(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u16(srcPix, srcPix); \ + vst1q_u16_x2((u16 *)((v128u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u16) / ELEMENTSIZE)) * 0)), dstPix); \ + vst1q_u16_x2((u16 *)((v128u16 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u16) / ELEMENTSIZE)) * 1)), dstPix); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u16) / ELEMENTSIZE), \ + srcPix = vld1q_u16((u16 *)((v128u16 *)src + (X))); \ + dstPix.val[0] = vzip1q_u16(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u16(srcPix, srcPix); \ + vst1q_u16_x2((u16 *)((v128u16 *)dst + ((X) * 2)), dstPix); \ + ); + } + } + else if (ELEMENTSIZE == 4) + { + v128u32 srcPix; + uint32x4x2_t dstPix; + + if (SCALEVERTICAL) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u32) / ELEMENTSIZE), \ + srcPix = vld1q_u32((u32 *)((v128u32 *)src + (X))); \ + dstPix.val[0] = vzip1q_u32(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u32(srcPix, srcPix); \ + vst1q_u32_x2((u32 *)((v128u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u32) / ELEMENTSIZE)) * 0)), dstPix); \ + vst1q_u32_x2((u32 *)((v128u32 *)dst + ((X) * 2) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 / (sizeof(v128u32) / ELEMENTSIZE)) * 1)), dstPix); \ + ); + } + else + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u32) / ELEMENTSIZE), \ + srcPix = vld1q_u32((u32 *)((v128u32 *)src + (X))); \ + dstPix.val[0] = vzip1q_u32(srcPix, srcPix); \ + dstPix.val[1] = vzip2q_u32(srcPix, srcPix); \ + vst1q_u32_x2((u32 *)((v128u32 *)dst + ((X) * 2)), dstPix); \ + ); + } + } + } + else if (INTEGERSCALEHINT == 3) + { + uint8x16x3_t dstPix; + + for (size_t srcX = 0, dstX 
= 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5})); + dstPix.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,10,10})); + dstPix.val[2] = vqtbl1q_u8(srcPix, ((v128u8){10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5})); + dstPix.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9,10,11})); + dstPix.val[2] = vqtbl1q_u8(srcPix, ((v128u8){10,11,10,11,12,13,12,13,12,13,14,15,14,15,14,15})); + } + else if (ELEMENTSIZE == 4) + { + dstPix.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7})); + dstPix.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9,10,11, 8, 9,10,11})); + dstPix.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9,10,11,12,13,14,15,12,13,14,15,12,13,14,15})); + } + + vst1q_u8_x3( (u8 *)((v128u8 *)dst + dstX), dstPix ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x3( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly)), dstPix ); + } + } + } + } + else if (INTEGERSCALEHINT == 4) + { + uint8x16x4_t dstPix00; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9, 8, 9, 8, 9, 8, 9,10,11,10,11,10,11,10,11})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15})); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix00.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix00.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX), dstPix00 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly)), dstPix00 ); + } + } + } + } + else if (INTEGERSCALEHINT == 5) + { + uint8x16x4_t dstPix00; + uint8x16_t dstPix04; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); 
srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 9, 9,10,10,10,10,10,11,11,11,11,11,12,12,12,12})); + dstPix04 = vqtbl1q_u8(srcPix, ((v128u8){12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9,10,11,10,11,10,11,10,11,10,11,12,13,12,13})); + dstPix04 = vqtbl1q_u8(srcPix, ((v128u8){12,13,12,13,12,13,14,15,14,15,14,15,14,15,14,15})); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9,10,11, 8, 9,10,11})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9,10,11, 8, 9,10,11, 8, 9,10,11,12,13,14,15})); + dstPix04 = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 0), dstPix00 ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX + 4), dstPix04 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 0), dstPix00 ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 4), dstPix04 ); + } + } + } + } + else if (INTEGERSCALEHINT == 6) + { + uint8x16x4_t dstPix00; + uint8x16x2_t dstPix04; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,10,10,10,10})); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13})); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 3, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 4, 5})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9, 8, 9, 
8, 9, 8, 9, 8, 9, 8, 9,10,11,10,11})); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){10,11,10,11,10,11,10,11,12,13,12,13,12,13,12,13})); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){12,13,12,13,14,15,14,15,14,15,14,15,14,15,14,15})); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7})); + dstPix00.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix00.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9,10,11, 8, 9,10,11,12,13,14,15,12,13,14,15})); + dstPix04.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 0), dstPix00 ); + vst1q_u8_x2( (u8 *)((v128u8 *)dst + dstX + 4), dstPix04 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 0), dstPix00 ); + vst1q_u8_x2( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 4), dstPix04 ); + } + } + } + } + else if (INTEGERSCALEHINT == 7) + { + uint8x16x4_t dstPix00; + uint8x16x3_t dstPix04; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9})); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 9, 9, 9, 9, 9, 9,10,10,10,10,10,10,10,11,11,11})); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13})); + dstPix04.val[2] = vqtbl1q_u8(srcPix, ((v128u8){13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9})); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9, 8, 9, 8, 9,10,11,10,11,10,11,10,11,10,11})); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){10,11,10,11,12,13,12,13,12,13,12,13,12,13,12,13})); + dstPix04.val[2] = vqtbl1q_u8(srcPix, ((v128u8){12,13,14,15,14,15,14,15,14,15,14,15,14,15,14,15})); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7})); + dstPix00.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 5, 
6, 7, 4, 5, 6, 7, 8, 9,10,11, 8, 9,10,11})); + dstPix04.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 9,10,11,12,13,14,15,12,13,14,15,12,13,14,15})); + dstPix04.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 0), dstPix00 ); + vst1q_u8_x3( (u8 *)((v128u8 *)dst + dstX + 4), dstPix04 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 0), dstPix00 ); + vst1q_u8_x3( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 4), dstPix04 ); + } + } + } + } + else if (INTEGERSCALEHINT == 8) + { + uint8x16x4_t dstPix00; + uint8x16x4_t dstPix04; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1})); + dstPix00.val[1] = vqtbl1q_u8(srcPix, ((v128u8){ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3})); + dstPix00.val[2] = vqtbl1q_u8(srcPix, ((v128u8){ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5})); + dstPix00.val[3] = vqtbl1q_u8(srcPix, ((v128u8){ 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7})); + dstPix04.val[0] = vqtbl1q_u8(srcPix, ((v128u8){ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9})); + dstPix04.val[1] = vqtbl1q_u8(srcPix, ((v128u8){10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11})); + dstPix04.val[2] = vqtbl1q_u8(srcPix, ((v128u8){12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13})); + dstPix04.val[3] = vqtbl1q_u8(srcPix, ((v128u8){14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15})); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 0) ); + dstPix00.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 1) ); + dstPix00.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 2) ); + dstPix00.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 3) ); + dstPix04.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 4) ); + dstPix04.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 5) ); + dstPix04.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 6) ); + dstPix04.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 7) ); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix00.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix04.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix04.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix04.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + dstPix04.val[3] = vreinterpretq_u8_u32( 
vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 0), dstPix00 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 4), dstPix04 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 0), dstPix00 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 4), dstPix04 ); + } + } + } + } + else if (INTEGERSCALEHINT == 16) + { + uint8x16x4_t dstPix00; + uint8x16x4_t dstPix04; + uint8x16x4_t dstPix08; + uint8x16x4_t dstPix12; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=INTEGERSCALEHINT) + { + const v128u8 srcPix = vld1q_u8( (u8 *)((v128u8 *)src + srcX) ); + + if (ELEMENTSIZE == 1) + { + dstPix00.val[0] = vdupq_laneq_u8(srcPix, 0); + dstPix00.val[1] = vdupq_laneq_u8(srcPix, 1); + dstPix00.val[2] = vdupq_laneq_u8(srcPix, 2); + dstPix00.val[3] = vdupq_laneq_u8(srcPix, 3); + dstPix04.val[0] = vdupq_laneq_u8(srcPix, 4); + dstPix04.val[1] = vdupq_laneq_u8(srcPix, 5); + dstPix04.val[2] = vdupq_laneq_u8(srcPix, 6); + dstPix04.val[3] = vdupq_laneq_u8(srcPix, 7); + dstPix08.val[0] = vdupq_laneq_u8(srcPix, 8); + dstPix08.val[1] = vdupq_laneq_u8(srcPix, 9); + dstPix08.val[2] = vdupq_laneq_u8(srcPix,10); + dstPix08.val[3] = vdupq_laneq_u8(srcPix,11); + dstPix12.val[0] = vdupq_laneq_u8(srcPix,12); + dstPix12.val[1] = vdupq_laneq_u8(srcPix,13); + dstPix12.val[2] = vdupq_laneq_u8(srcPix,14); + dstPix12.val[3] = vdupq_laneq_u8(srcPix,15); + } + else if (ELEMENTSIZE == 2) + { + dstPix00.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 0) ); + dstPix00.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 0) ); + dstPix00.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 1) ); + dstPix00.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 1) ); + dstPix04.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 2) ); + dstPix04.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 2) ); + dstPix04.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 3) ); + dstPix04.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 3) ); + dstPix08.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 4) ); + dstPix08.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 4) ); + dstPix08.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 5) ); + dstPix08.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 5) ); + dstPix12.val[0] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 6) ); + dstPix12.val[1] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 6) ); + dstPix12.val[2] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 7) ); + dstPix12.val[3] = vreinterpretq_u8_u16( vdupq_laneq_u16(vreinterpretq_u16_u8(srcPix), 7) ); + } + else if (ELEMENTSIZE == 4) + { + dstPix00.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[2] = vreinterpretq_u8_u32( 
vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix00.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 0) ); + dstPix04.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix04.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix04.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix04.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 1) ); + dstPix08.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix08.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix08.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix08.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 2) ); + dstPix12.val[0] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + dstPix12.val[1] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + dstPix12.val[2] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + dstPix12.val[3] = vreinterpretq_u8_u32( vdupq_laneq_u32(vreinterpretq_u32_u8(srcPix), 3) ); + } + + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 0), dstPix00 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 4), dstPix04 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + 8), dstPix08 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX +12), dstPix12 ); + + if (SCALEVERTICAL) + { + for (size_t ly = 1; ly < (size_t)INTEGERSCALEHINT; ly++) + { + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 0), dstPix00 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 4), dstPix04 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) + 8), dstPix08 ); + vst1q_u8_x4( (u8 *)((v128u8 *)dst + dstX + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * INTEGERSCALEHINT) * ly) +12), dstPix12 ); + } + } + } + } + else if (INTEGERSCALEHINT > 1) + { + const size_t scale = dstWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; + + for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE); srcX++, dstX+=scale) + { + const v128u8 srcVec = vld1q_u8((u8 *)((v128u8 *)src + srcX)); + v128u8 ssse3idx; + + for (size_t lx = 0; lx < scale; lx++) + { + if (ELEMENTSIZE == 1) + { + ssse3idx = vld1q_u8((u8 *)((v128u8 *)(_gpuDstToSrcSSSE3_u8_16e + (lx * sizeof(v128u8))))); + } + else if (ELEMENTSIZE == 2) + { + ssse3idx = vld1q_u8((u8 *)((v128u8 *)(_gpuDstToSrcSSSE3_u16_8e + (lx * sizeof(v128u8))))); + } + else if (ELEMENTSIZE == 4) + { + ssse3idx = vld1q_u8((u8 *)((v128u8 *)(_gpuDstToSrcSSSE3_u32_4e + (lx * sizeof(v128u8))))); + } + + vst1q_u8( (u8 *)((v128u8 *)dst + dstX + lx), vqtbl1q_u8(srcVec, ssse3idx) ); + } + } + + if (SCALEVERTICAL) + { + CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); + } + } + else + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 
*)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x]; + } + } + } + + if (SCALEVERTICAL) + { + CopyLinesForVerticalCount(dst, dstWidth, dstLineCount); + } + } +} + +template +static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src, size_t srcWidth) +{ + if (INTEGERSCALEHINT == 0) + { + memcpy(dst, src, srcWidth * ELEMENTSIZE); + } + else if (INTEGERSCALEHINT == 1) + { + MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE), vst1q_u8((u8 *)((v128u8 *)dst + (X)), vld1q_u8((u8 *)((v128u8 *)src + (X)))) ); + } + else if (INTEGERSCALEHINT == 2) + { + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE)); dstX++) + { + if (ELEMENTSIZE == 1) + { + const uint8x16x2_t srcPix = vld1q_u8_x2( (u8 *)((v128u8 *)src + (dstX * 2)) ); + vst1q_u8((u8 *)((v128u8 *)dst + dstX), vuzp1q_u8(srcPix.val[0], srcPix.val[1])); + } + else if (ELEMENTSIZE == 2) + { + const uint16x8x2_t srcPix = vld1q_u16_x2( (u16 *)((v128u16 *)src + (dstX * 2)) ); + vst1q_u16((u16 *)((v128u16 *)dst + dstX), vuzp1q_u16(srcPix.val[0], srcPix.val[1])); + } + else if (ELEMENTSIZE == 4) + { + const uint32x4x2_t srcPix = vld1q_u32_x2( (u32 *)((v128u32 *)src + (dstX * 2)) ); + vst1q_u32((u32 *)((v128u32 *)dst + dstX), vuzp1q_u32(srcPix.val[0], srcPix.val[1])); + /* + // Pixel minification algorithm that takes the average value of each color component of the 2x2 pixel group. + uint8x16x4_t srcPix0 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 2) * 0)) ); + const uint8x16x4_t srcPix1 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 2) * 1)) ); + srcPix0.val[0] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[0]) ); + srcPix0.val[1] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[1]) ); + srcPix0.val[2] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[2]) ); + srcPix0.val[3] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[3]) ); + + srcPix0.val[0] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[0]), srcPix1.val[0] ); + srcPix0.val[1] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[1]), srcPix1.val[1] ); + srcPix0.val[2] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[2]), srcPix1.val[2] ); + srcPix0.val[3] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[3]), srcPix1.val[3] ); + + srcPix0.val[0] = vshrq_n_u16( vreinterpretq_u16_u8(srcPix0.val[0]), 2 ); + srcPix0.val[1] = vshrq_n_u16( vreinterpretq_u16_u8(srcPix0.val[1]), 2 ); + srcPix0.val[2] = vshrq_n_u16( vreinterpretq_u16_u8(srcPix0.val[2]), 2 ); + srcPix0.val[3] = vshrq_n_u16( vreinterpretq_u16_u8(srcPix0.val[3]), 2 ); + + const uint8x16x2_t dstPix = { + vqtbl4q_u8( srcPix0, ((v128u8){ 0,16,32,48, 2,18,34,50, 4,20,36,52, 6,22,38,54}) ), + vqtbl4q_u8( srcPix0, ((v128u8){ 8,24,40,56,10,26,42,58,12,28,44,60,14,30,46,62}) ) + }; + vst1q_u8_x2( (u8 *)((v128u8 *)dst + (dstX * 2)), dstPix); + */ + } + } + } + else if (INTEGERSCALEHINT == 3) + { + uint8x16x3_t srcPix; + + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE)); dstX++) + { + srcPix = vld1q_u8_x3( (u8 *)((v128u8 *)src + (dstX * 3)) ); + + if (ELEMENTSIZE == 1) + { + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl3q_u8( srcPix, ((v128u8){ 0, 3, 6, 9,12,15,18,21,24,27,30,33,36,39,42,45}) ) ); + } + else if (ELEMENTSIZE == 2) + { + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl3q_u8( srcPix, ((v128u8){ 0, 1, 6, 7,12,13,18,19,24,25,30,31,36,37,42,43}) ) ); + } + else if (ELEMENTSIZE == 4) + { + 
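+				// The byte indices 0-3, 12-15, 24-27, and 36-39 select u32 elements 0, 3, 6,
+				// and 9 from the 48-byte three-register table, keeping every third pixel.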
vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl3q_u8( srcPix, ((v128u8){ 0, 1, 2, 3,12,13,14,15,24,25,26,27,36,37,38,39}) ) ); + } + } + } + else if (INTEGERSCALEHINT == 4) + { + uint8x16x4_t srcPix0; + + for (size_t dstX = 0; dstX < (GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE)); dstX++) + { + if (ELEMENTSIZE == 1) + { + srcPix0 = vld1q_u8_x4( (u8 *)((v128u8 *)src + (dstX * 4)) ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl4q_u8( srcPix0, ((v128u8){ 0, 4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}) ) ); + } + else if (ELEMENTSIZE == 2) + { + srcPix0 = vld1q_u8_x4( (u8 *)((v128u8 *)src + (dstX * 4)) ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl4q_u8( srcPix0, ((v128u8){ 0, 1, 8, 9,16,17,24,25,32,33,40,41,48,49,56,57}) ) ); + } + else if (ELEMENTSIZE == 4) + { + srcPix0 = vld1q_u8_x4( (u8 *)((v128u8 *)src + (dstX * 4)) ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), vqtbl4q_u8( srcPix0, ((v128u8){ 0, 1, 2, 3,16,17,18,19,32,33,34,35,48,49,50,51}) ) ); + /* + // Pixel minification algorithm that takes the average value of each color component of the 4x4 pixel group. + srcPix0 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 4) * 0)) ); + const uint8x16x4_t srcPix1 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 4) * 1)) ); + const uint8x16x4_t srcPix2 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 4) * 2)) ); + const uint8x16x4_t srcPix3 = vld4q_u8( (u8 *)((v128u8 *)src + (dstX * 4) + ((GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(v128u8) / ELEMENTSIZE) * 4) * 3)) ); + + srcPix0.val[0] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[0]) ); + srcPix0.val[1] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[1]) ); + srcPix0.val[2] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[2]) ); + srcPix0.val[3] = vreinterpretq_u8_u16( vpaddlq_u8(srcPix0.val[3]) ); + + srcPix0.val[0] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[0]), srcPix1.val[0] ); + srcPix0.val[1] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[1]), srcPix1.val[1] ); + srcPix0.val[2] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[2]), srcPix1.val[2] ); + srcPix0.val[3] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[3]), srcPix1.val[3] ); + + srcPix0.val[0] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[0]), srcPix2.val[0] ); + srcPix0.val[1] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[1]), srcPix2.val[1] ); + srcPix0.val[2] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[2]), srcPix2.val[2] ); + srcPix0.val[3] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[3]), srcPix2.val[3] ); + + srcPix0.val[0] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[0]), srcPix3.val[0] ); + srcPix0.val[1] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[1]), srcPix3.val[1] ); + srcPix0.val[2] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[2]), srcPix3.val[2] ); + srcPix0.val[3] = vpadalq_u8( vreinterpretq_u16_u8(srcPix0.val[3]), srcPix3.val[3] ); + + srcPix0.val[0] = vreinterpretq_u8_u32( vpaddlq_u16(vreinterpretq_u16_u8(srcPix0.val[0])) ); + srcPix0.val[1] = vreinterpretq_u8_u32( vpaddlq_u16(vreinterpretq_u16_u8(srcPix0.val[1])) ); + srcPix0.val[2] = vreinterpretq_u8_u32( vpaddlq_u16(vreinterpretq_u16_u8(srcPix0.val[2])) ); + srcPix0.val[3] = vreinterpretq_u8_u32( vpaddlq_u16(vreinterpretq_u16_u8(srcPix0.val[3])) ); + + srcPix0.val[0] = vshrq_n_u32( vreinterpretq_u32_u8(srcPix0.val[0]), 4 ); + srcPix0.val[1] = vshrq_n_u32( 
vreinterpretq_u32_u8(srcPix0.val[1]), 4 ); + srcPix0.val[2] = vshrq_n_u32( vreinterpretq_u32_u8(srcPix0.val[2]), 4 ); + srcPix0.val[3] = vshrq_n_u32( vreinterpretq_u32_u8(srcPix0.val[3]), 4 ); + + const v128u8 dstPix = vqtbl4q_u8( srcPix0, ((v128u8){ 0,16,32,48, 4,20,36,52, 8,24,40,56,12,28,44,60}) ); + vst1q_u8( (u8 *)((v128u8 *)dst + dstX), dstPix); + */ + } + } + } + else if (INTEGERSCALEHINT > 1) + { + const size_t scale = srcWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; + + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + if (ELEMENTSIZE == 1) + { + ((u8 *)dst)[x] = ((u8 *)src)[x * scale]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[x] = ((u16 *)src)[x * scale]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[x] = ((u32 *)src)[x * scale]; + } + } + } + else + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) + { + if (ELEMENTSIZE == 1) + { + ( (u8 *)dst)[i] = ( (u8 *)src)[_gpuDstPitchIndex[i]]; + } + else if (ELEMENTSIZE == 2) + { + ((u16 *)dst)[i] = ((u16 *)src)[_gpuDstPitchIndex[i]]; + } + else if (ELEMENTSIZE == 4) + { + ((u32 *)dst)[i] = ((u32 *)src)[_gpuDstPitchIndex[i]]; + } + } + } +} + +FORCEINLINE v128u16 ColorOperation_NEON::blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const +{ + v128u16 ra; + v128u16 ga; + v128u16 ba; + static const v128u16 colorBitMask = vdupq_n_u16(0x001F); + + ra = vandq_u16( colA, colorBitMask); + ga = vandq_u16(vshrq_n_u16(colA, 5), colorBitMask); + ba = vandq_u16(vshrq_n_u16(colA, 10), colorBitMask); + + v128u16 rb = vandq_u16( colB, colorBitMask); + v128u16 gb = vandq_u16(vshrq_n_u16(colB, 5), colorBitMask); + v128u16 bb = vandq_u16(vshrq_n_u16(colB, 10), colorBitMask); + + ra = vmlaq_u16( vmulq_u16(ra, blendEVA), rb, blendEVB ); + ga = vmlaq_u16( vmulq_u16(ga, blendEVA), gb, blendEVB ); + ba = vmlaq_u16( vmulq_u16(ba, blendEVA), bb, blendEVB ); + + ra = vshrq_n_u16(ra, 4); + ga = vshrq_n_u16(ga, 4); + ba = vshrq_n_u16(ba, 4); + + ra = vminq_u16(ra, colorBitMask); + ga = vminq_u16(ga, colorBitMask); + ba = vminq_u16(ba, colorBitMask); + + return vorrq_u16(ra, vorrq_u16( vshlq_n_u16(ga, 5), vshlq_n_u16(ba, 10)) ); +} + +// Note that if USECONSTANTBLENDVALUESHINT is true, then this method will assume that blendEVA contains identical values +// for each 16-bit vector element, and also that blendEVB contains identical values for each 16-bit vector element. If +// this assumption is broken, then the resulting color will be undefined. 
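+// For example, blendEVA = { 8, 8, 8, 8, 8, 8, 8, 8 } and blendEVB = { 8, 8, 8, 8, 8, 8, 8, 8 }
+// satisfy this hint, allowing blend() to broadcast byte lane 0 with a single
+// vdupq_laneq_u8() instead of performing the full vqtbl1q_u8() byte shuffle.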
+template <NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT>
+FORCEINLINE v128u32 ColorOperation_NEON::blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const
+{
+	v128u16 outColorLo;
+	v128u16 outColorHi;
+	v128u32 outColor;
+	
+	if (USECONSTANTBLENDVALUESHINT)
+	{
+		const v128u8 blendA = vdupq_laneq_u8(vreinterpretq_u8_u16(blendEVA), 0);
+		const v128u8 blendB = vdupq_laneq_u8(vreinterpretq_u8_u16(blendEVB), 0);
+		outColorLo = vmlal_u8( vmull_u8(vget_low_u8( vreinterpretq_u8_u32(colA)), vget_low_u8(blendA)), vget_low_u8( vreinterpretq_u8_u32(colB)), vget_low_u8(blendB) );
+		outColorHi = vmlal_high_u8( vmull_high_u8(vreinterpretq_u8_u32(colA), blendA), vreinterpretq_u8_u32(colB), blendB );
+	}
+	else
+	{
+		const v128u8 blendA = vqtbl1q_u8(vreinterpretq_u8_u16(blendEVA), ((v128u8){ 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8,12,12,12,12}));
+		const v128u8 blendB = vqtbl1q_u8(vreinterpretq_u8_u16(blendEVB), ((v128u8){ 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8,12,12,12,12}));
+		outColorLo = vmlal_u8( vmull_u8(vget_low_u8( vreinterpretq_u8_u32(colA)), vget_low_u8(blendA)), vget_low_u8( vreinterpretq_u8_u32(colB)), vget_low_u8(blendB) );
+		outColorHi = vmlal_high_u8( vmull_high_u8(vreinterpretq_u8_u32(colA), blendA), vreinterpretq_u8_u32(colB), blendB );
+	}
+	
+	outColorLo = vshrq_n_u16(outColorLo, 4);
+	outColorHi = vshrq_n_u16(outColorHi, 4);
+	
+	if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		outColorLo = vminq_u16(outColorLo, vdupq_n_u16(255));
+		outColorHi = vminq_u16(outColorHi, vdupq_n_u16(255));
+	}
+	
+	outColor = vreinterpretq_u32_u8( vuzp1q_u8( vreinterpretq_u8_u16(outColorLo), vreinterpretq_u8_u16(outColorHi) ) );
+	
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		outColor = vreinterpretq_u32_u8( vminq_u8(vreinterpretq_u8_u32(outColor), vdupq_n_u8(63)) );
+	}
+	
+	outColor = vandq_u32(outColor, vdupq_n_u32(0x00FFFFFF));
+	
+	return outColor;
+}
+
+FORCEINLINE v128u16 ColorOperation_NEON::blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const
+{
+	static const u8 X = 0x80;
+	const uint8x16x2_t col = { vreinterpretq_u8_u32(colA_Lo), vreinterpretq_u8_u32(colA_Hi) };
+	
+	// If the color format of B is 555, then the colA_Hi parameter is required.
+	// The color format of A is assumed to be RGB666.
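+	// The 0x80 indices fall outside the 32-byte { colA_Lo, colA_Hi } table, so
+	// vqtbl2q_u8() returns zero for those bytes, zero-extending each 8-bit color
+	// component into its own 16-bit lane below.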
+ v128u16 ra = vreinterpretq_u16_u8( vqtbl2q_u8( col, ((v128u8){ 0, X, 4, X, 8, X,12, X,16, X,20, X,24, X,28, X}) ) ); + v128u16 ga = vreinterpretq_u16_u8( vqtbl2q_u8( col, ((v128u8){ 1, X, 5, X, 9, X,13, X,17, X,21, X,25, X,29, X}) ) ); + v128u16 ba = vreinterpretq_u16_u8( vqtbl2q_u8( col, ((v128u8){ 2, X, 6, X,10, X,14, X,18, X,22, X,26, X,30, X}) ) ); + v128u16 aa = vreinterpretq_u16_u8( vqtbl2q_u8( col, ((v128u8){ 3, X, 7, X,11, X,15, X,19, X,23, X,27, X,31, X}) ) ); + aa = vqaddq_u16(aa, vdupq_n_u16(1)); + + const v128u16 rb = vandq_u16( vshlq_n_u16(colB, 1), vdupq_n_u16(0x003E) ); + const v128u16 gb = vandq_u16( vshrq_n_u16(colB, 4), vdupq_n_u16(0x003E) ); + const v128u16 bb = vandq_u16( vshrq_n_u16(colB, 9), vdupq_n_u16(0x003E) ); + const v128u16 ab = vqsubq_u16( vdupq_n_u16(32), aa ); + + ra = vmlaq_u16( vmulq_u16(ra, aa), rb, ab ); + ga = vmlaq_u16( vmulq_u16(ga, aa), gb, ab ); + ba = vmlaq_u16( vmulq_u16(ba, aa), bb, ab ); + + ra = vshrq_n_u16(ra, 6); + ga = vshrq_n_u16(ga, 6); + ba = vshrq_n_u16(ba, 6); + + return vorrq_u16( vorrq_u16(ra, vshlq_n_u16(ga, 5)), vshlq_n_u16(ba, 10) ); +} + +template +FORCEINLINE v128u32 ColorOperation_NEON::blend3D(const v128u32 &colA, const v128u32 &colB) const +{ + static const u8 X = 0x80; + + // If the color format of B is 666 or 888, then the colA_Hi parameter is ignored. + // The color format of A is assumed to match the color format of B. + v128u16 rgbALo; + v128u16 rgbAHi; + + v128u8 alpha = vqtbl1q_u8( vreinterpretq_u8_u32(colA), ((v128u8){ 3, 3, 3, X, 7, 7, 7, X,11,11,11, X,15,15,15, X}) ); + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + alpha = vaddq_u8(alpha, vdupq_n_u8(1)); + const v128u8 invAlpha = vsubq_u8(vdupq_n_u8(32), alpha); + rgbALo = vmlal_u8( vmull_u8( vget_low_u8(vreinterpretq_u8_u32(colA)), vget_low_u8(alpha) ), vget_low_u8(vreinterpretq_u8_u32(colB)), vget_low_u8(invAlpha) ); + rgbAHi = vmlal_high_u8( vmull_high_u8(vreinterpretq_u8_u32(colA), alpha), vreinterpretq_u8_u32(colB), invAlpha ); + } + else + { + const v128u8 zeroVec = vdupq_n_u8(0); + rgbALo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(colA), zeroVec) ); + rgbAHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(colA), zeroVec) ); + v128u16 rgbBLo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(colB), zeroVec) ); + v128u16 rgbBHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(colB), zeroVec) ); + + v128u16 alphaLo = vreinterpretq_u16_u8( vzip1q_u8(alpha, zeroVec) ); + v128u16 alphaHi = vreinterpretq_u16_u8( vzip2q_u8(alpha, zeroVec) ); + alphaLo = vaddq_u16(alphaLo, vdupq_n_u16(1)); + alphaHi = vaddq_u16(alphaHi, vdupq_n_u16(1)); + const v128u16 invAlphaLo = vsubq_u16(vdupq_n_u16(256), alphaLo); + const v128u16 invAlphaHi = vsubq_u16(vdupq_n_u16(256), alphaHi); + + rgbALo = vmlaq_u16( vmulq_u16(rgbALo, alphaLo), rgbBLo, invAlphaLo ); + rgbAHi = vmlaq_u16( vmulq_u16(rgbAHi, alphaHi), rgbBHi, invAlphaHi ); + } + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + rgbALo = vshrq_n_u16(rgbALo, 5); + rgbAHi = vshrq_n_u16(rgbAHi, 5); + } + else + { + rgbALo = vshrq_n_u16(rgbALo, 8); + rgbAHi = vshrq_n_u16(rgbAHi, 8); + } + + return vandq_u32( vreinterpretq_u32_u8( vuzp1q_u8(vreinterpretq_u8_u16(rgbALo), vreinterpretq_u8_u16(rgbAHi)) ), vdupq_n_u32(0x00FFFFFF) ); +} + +FORCEINLINE v128u16 ColorOperation_NEON::increase(const v128u16 &col, const v128u16 &blendEVY) const +{ + v128u16 r = vandq_u16( col, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(col, 5), vdupq_n_u16(0x001F) ); + v128u16 b = vandq_u16( 
vshrq_n_u16(col, 10), vdupq_n_u16(0x001F) ); + + r = vsraq_n_u16( r, vmulq_u16(vsubq_u16(vdupq_n_u16(31), r), blendEVY), 4 ); + g = vsraq_n_u16( g, vmulq_u16(vsubq_u16(vdupq_n_u16(31), g), blendEVY), 4 ); + b = vsraq_n_u16( b, vmulq_u16(vsubq_u16(vdupq_n_u16(31), b), blendEVY), 4 ); + + return vorrq_u16(r, vorrq_u16( vshlq_n_u16(g, 5), vshlq_n_u16(b, 10)) ); +} + +template +FORCEINLINE v128u32 ColorOperation_NEON::increase(const v128u32 &col, const v128u16 &blendEVY) const +{ + static const v128u8 zeroVec = vdupq_n_u8(0); + v128u16 rgbLo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(col), zeroVec) ); + v128u16 rgbHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(col), zeroVec) ); + + rgbLo = vsraq_n_u16( rgbLo, vmulq_u16(vsubq_u16(vdupq_n_u16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbLo), blendEVY), 4 ); + rgbHi = vsraq_n_u16( rgbHi, vmulq_u16(vsubq_u16(vdupq_n_u16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbHi), blendEVY), 4 ); + + return vandq_u32( vreinterpretq_u32_u8( vuzp1q_u8(vreinterpretq_u8_u16(rgbLo), vreinterpretq_u8_u16(rgbHi)) ), vdupq_n_u32(0x00FFFFFF) ); +} + +FORCEINLINE v128u16 ColorOperation_NEON::decrease(const v128u16 &col, const v128u16 &blendEVY) const +{ + v128u16 r = vandq_u16( col, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(col, 5), vdupq_n_u16(0x001F) ); + v128u16 b = vandq_u16( vshrq_n_u16(col, 10), vdupq_n_u16(0x001F) ); + + r = vsubq_u16( r, vshrq_n_u16(vmulq_u16(r, blendEVY), 4) ); + g = vsubq_u16( g, vshrq_n_u16(vmulq_u16(g, blendEVY), 4) ); + b = vsubq_u16( b, vshrq_n_u16(vmulq_u16(b, blendEVY), 4) ); + + return vorrq_u16(r, vorrq_u16( vshlq_n_u16(g, 5), vshlq_n_u16(b, 10)) ); +} + +template +FORCEINLINE v128u32 ColorOperation_NEON::decrease(const v128u32 &col, const v128u16 &blendEVY) const +{ + static const v128u8 zeroVec = vdupq_n_u8(0); + v128u16 rgbLo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(col), zeroVec) ); + v128u16 rgbHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(col), zeroVec) ); + + rgbLo = vsubq_u16( rgbLo, vshrq_n_u16(vmulq_u16(rgbLo, blendEVY), 4) ); + rgbHi = vsubq_u16( rgbHi, vshrq_n_u16(vmulq_u16(rgbHi, blendEVY), 4) ); + + return vandq_u32( vreinterpretq_u32_u8( vuzp1q_u8(vreinterpretq_u8_u16(rgbLo), vreinterpretq_u8_u16(rgbHi)) ), vdupq_n_u32(0x00FFFFFF) ); +} + +template +FORCEINLINE void PixelOperation_NEON::_copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t dst16 = { + vorrq_u16(src0, alphaBits), + vorrq_u16(src1, alphaBits) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t dst32; + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo6665Opaque_NEON(src0, dst32.val[0], dst32.val[1]); + ColorspaceConvert555xTo6665Opaque_NEON(src1, dst32.val[2], dst32.val[3]); + } + else + { + ColorspaceConvert555xTo8888Opaque_NEON(src0, dst32.val[0], dst32.val[1]); + ColorspaceConvert555xTo8888Opaque_NEON(src1, dst32.val[2], dst32.val[3]); + } + + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + if (!ISDEBUGRENDER) + { + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); + } +} + +template +FORCEINLINE void PixelOperation_NEON::_copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + if 
(OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t dst16 = { + vorrq_u16( ColorspaceConvert6665To5551_NEON(src0, src1), alphaBits), + vorrq_u16( ColorspaceConvert6665To5551_NEON(src2, src3), alphaBits) + }; + vst1q_u16_x2((u16 *)compInfo.target.lineColor16, dst16); + } + else + { + const v128u32 alphaBits = vdupq_n_u32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); + const uint32x4x4_t dst32 = { + vorrq_u32(src0, alphaBits), + vorrq_u32(src1, alphaBits), + vorrq_u32(src2, alphaBits), + vorrq_u32(src3, alphaBits) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + if (!ISDEBUGRENDER) + { + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); + } +} + +template +FORCEINLINE void PixelOperation_NEON::_copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + const uint16x8x2_t passMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) ) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + const uint16x8x2_t dst16 = { + vbslq_u16(passMask16.val[0], vorrq_u16(src0, alphaBits), oldColor16.val[0]), + vbslq_u16(passMask16.val[1], vorrq_u16(src1, alphaBits), oldColor16.val[1]) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t src32; + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo6665Opaque_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo6665Opaque_NEON(src1, src32.val[2], src32.val[3]); + } + else + { + ColorspaceConvert555xTo8888Opaque_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo8888Opaque_NEON(src1, src32.val[2], src32.val[3]); + } + + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) ) + }; + + const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32); + const uint32x4x4_t dst32 = { + vbslq_u32(passMask32.val[0], src32.val[0], oldColor32.val[0]), + vbslq_u32(passMask32.val[1], src32.val[1], oldColor32.val[1]), + vbslq_u32(passMask32.val[2], src32.val[2], oldColor32.val[2]), + vbslq_u32(passMask32.val[3], src32.val[3], oldColor32.val[3]) + }; + + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + if (!ISDEBUGRENDER) + { + const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID); + vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) ); + } +} + +template +FORCEINLINE void PixelOperation_NEON::_copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + const uint16x8x2_t passMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) ) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t src16 = { + ColorspaceConvert6665To5551_NEON(src0, src1), + ColorspaceConvert6665To5551_NEON(src2, src3) + }; + + const v128u16 
alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + const uint16x8x2_t dst16 = { + vbslq_u16(passMask16.val[0], vorrq_u16(src16.val[0], alphaBits), oldColor16.val[0]), + vbslq_u16(passMask16.val[1], vorrq_u16(src16.val[1], alphaBits), oldColor16.val[1]) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) ) + }; + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32); + const uint32x4x4_t dst32 = { + vbslq_u32(passMask32.val[0], vorrq_u32(src0, alphaBits), oldColor32.val[0]), + vbslq_u32(passMask32.val[1], vorrq_u32(src1, alphaBits), oldColor32.val[1]), + vbslq_u32(passMask32.val[2], vorrq_u32(src2, alphaBits), oldColor32.val[2]), + vbslq_u32(passMask32.val[3], vorrq_u32(src3, alphaBits), oldColor32.val[3]) + }; + + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + if (!ISDEBUGRENDER) + { + const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID); + vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) ); + } +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t dst16 = { + vorrq_u16(colorop_vec.increase(src0, evy16), alphaBits), + vorrq_u16(colorop_vec.increase(src1, evy16), alphaBits) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t src32; + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo666x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo666x_NEON(src1, src32.val[2], src32.val[3]); + } + else + { + ColorspaceConvert555xTo888x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo888x_NEON(src1, src32.val[2], src32.val[3]); + } + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + + const uint32x4x4_t dst32 = { + vorrq_u32( colorop_vec.increase(src32.val[0], evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src32.val[1], evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src32.val[2], evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src32.val[3], evy16), alphaBits ) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t src16 = { + ColorspaceConvert6665To5551_NEON(src0, src1), + ColorspaceConvert6665To5551_NEON(src2, src3) + }; + const uint16x8x2_t dst16 = { + vorrq_u16( colorop_vec.increase(src16.val[0], evy16), alphaBits ), + vorrq_u16( colorop_vec.increase(src16.val[1], evy16), alphaBits ) + }; + + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + + const uint32x4x4_t dst32 = { + vorrq_u32( colorop_vec.increase(src0, evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src1, evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src2, evy16), alphaBits ), + vorrq_u32( colorop_vec.increase(src3, evy16), alphaBits ) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + const uint16x8x2_t passMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) ) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + const uint16x8x2_t dst16 = { + vbslq_u16(passMask16.val[0], vorrq_u16(colorop_vec.increase(src0, evy16), alphaBits), oldColor16.val[0]), + vbslq_u16(passMask16.val[1], vorrq_u16(colorop_vec.increase(src1, evy16), alphaBits), oldColor16.val[1]) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t src32; + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo666x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo666x_NEON(src1, src32.val[2], src32.val[3]); + } + else + { + ColorspaceConvert555xTo888x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo888x_NEON(src1, src32.val[2], src32.val[3]); + } + + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) ) + }; + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32); + const uint32x4x4_t dst32 = { + vbslq_u32(passMask32.val[0], vorrq_u32(colorop_vec.increase(src32.val[0], evy16), alphaBits), oldColor32.val[0]), + vbslq_u32(passMask32.val[1], vorrq_u32(colorop_vec.increase(src32.val[1], evy16), alphaBits), oldColor32.val[1]), + vbslq_u32(passMask32.val[2], vorrq_u32(colorop_vec.increase(src32.val[2], evy16), alphaBits), oldColor32.val[2]), + vbslq_u32(passMask32.val[3], vorrq_u32(colorop_vec.increase(src32.val[3], evy16), alphaBits), oldColor32.val[3]) + }; + + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID); + vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) ); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + const uint16x8x2_t passMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) ) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t src16 = { + ColorspaceConvert6665To5551_NEON(src0, src1), + ColorspaceConvert6665To5551_NEON(src2, src3) + }; + + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + const uint16x8x2_t dst16 = { + vbslq_u16(passMask16.val[0], vorrq_u16(colorop_vec.increase(src16.val[0], evy16), alphaBits), oldColor16.val[0]), + vbslq_u16(passMask16.val[1], vorrq_u16(colorop_vec.increase(src16.val[1], evy16), alphaBits), oldColor16.val[1]) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) ) + }; + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32); + const uint32x4x4_t dst32 = { + vbslq_u32(passMask32.val[0], vorrq_u32(colorop_vec.increase(src0, evy16), alphaBits), oldColor32.val[0]), + vbslq_u32(passMask32.val[1], vorrq_u32(colorop_vec.increase(src1, evy16), alphaBits), oldColor32.val[1]), + vbslq_u32(passMask32.val[2], vorrq_u32(colorop_vec.increase(src2, evy16), alphaBits), oldColor32.val[2]), + vbslq_u32(passMask32.val[3], vorrq_u32(colorop_vec.increase(src3, evy16), alphaBits), oldColor32.val[3]) + }; + + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID); + vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) ); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t dst16 = { + vorrq_u16(colorop_vec.decrease(src0, evy16), alphaBits), + vorrq_u16(colorop_vec.decrease(src1, evy16), alphaBits) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t src32; + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo666x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo666x_NEON(src1, src32.val[2], src32.val[3]); + } + else + { + ColorspaceConvert555xTo888x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo888x_NEON(src1, src32.val[2], src32.val[3]); + } + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + + const uint32x4x4_t dst32 = { + vorrq_u32( colorop_vec.decrease(src32.val[0], evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src32.val[1], evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src32.val[2], evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src32.val[3], evy16), alphaBits ) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const +{ + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t src16 = { + ColorspaceConvert6665To5551_NEON(src0, src1), + ColorspaceConvert6665To5551_NEON(src2, src3) + }; + const uint16x8x2_t dst16 = { + vorrq_u16( colorop_vec.decrease(src16.val[0], evy16), alphaBits ), + vorrq_u16( colorop_vec.decrease(src16.val[1], evy16), alphaBits ) + }; + + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000); + + const uint32x4x4_t dst32 = { + vorrq_u32( colorop_vec.decrease(src0, evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src1, evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src2, evy16), alphaBits ), + vorrq_u32( colorop_vec.decrease(src3, evy16), alphaBits ) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } + + vst1q_u8(compInfo.target.lineLayerID, srcLayerID); +} + +template +FORCEINLINE void PixelOperation_NEON::_brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const +{ + const uint16x8x2_t passMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) ) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const v128u16 alphaBits = vdupq_n_u16(0x8000); + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + const uint16x8x2_t dst16 = { + vbslq_u16(passMask16.val[0], vorrq_u16(colorop_vec.decrease(src0, evy16), alphaBits), oldColor16.val[0]), + vbslq_u16(passMask16.val[1], vorrq_u16(colorop_vec.decrease(src1, evy16), alphaBits), oldColor16.val[1]) + }; + vst1q_u16_x2(compInfo.target.lineColor16, dst16); + } + else + { + uint32x4x4_t src32; + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo666x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo666x_NEON(src1, src32.val[2], src32.val[3]); + } + else + { + ColorspaceConvert555xTo888x_NEON(src0, src32.val[0], src32.val[1]); + ColorspaceConvert555xTo888x_NEON(src1, src32.val[2], src32.val[3]); + } + + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) ) + }; + + const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000);
+		const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32);
+		const uint32x4x4_t dst32 = {
+			vbslq_u32(passMask32.val[0], vorrq_u32(colorop_vec.decrease(src32.val[0], evy16), alphaBits), oldColor32.val[0]),
+			vbslq_u32(passMask32.val[1], vorrq_u32(colorop_vec.decrease(src32.val[1], evy16), alphaBits), oldColor32.val[1]),
+			vbslq_u32(passMask32.val[2], vorrq_u32(colorop_vec.decrease(src32.val[2], evy16), alphaBits), oldColor32.val[2]),
+			vbslq_u32(passMask32.val[3], vorrq_u32(colorop_vec.decrease(src32.val[3], evy16), alphaBits), oldColor32.val[3])
+		};
+
+		vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32);
+	}
+
+	const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID);
+	vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void PixelOperation_NEON::_brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const
+{
+	const uint16x8x2_t passMask16 = {
+		vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ),
+		vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) )
+	};
+
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const uint16x8x2_t src16 = {
+			ColorspaceConvert6665To5551_NEON(src0, src1),
+			ColorspaceConvert6665To5551_NEON(src2, src3)
+		};
+
+		const v128u16 alphaBits = vdupq_n_u16(0x8000);
+		const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16);
+		const uint16x8x2_t dst16 = {
+			vbslq_u16(passMask16.val[0], vorrq_u16(colorop_vec.decrease(src16.val[0], evy16), alphaBits), oldColor16.val[0]),
+			vbslq_u16(passMask16.val[1], vorrq_u16(colorop_vec.decrease(src16.val[1], evy16), alphaBits), oldColor16.val[1])
+		};
+		vst1q_u16_x2(compInfo.target.lineColor16, dst16);
+	}
+	else
+	{
+		const uint32x4x4_t passMask32 = {
+			vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[0], passMask16.val[0]) ),
+			vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[0], passMask16.val[0]) ),
+			vreinterpretq_u32_u16( vzip1q_u16(passMask16.val[1], passMask16.val[1]) ),
+			vreinterpretq_u32_u16( vzip2q_u16(passMask16.val[1], passMask16.val[1]) )
+		};
+
+		const v128u32 alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ?
vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000);
+		const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32);
+		const uint32x4x4_t dst32 = {
+			vbslq_u32(passMask32.val[0], vorrq_u32(colorop_vec.decrease(src0, evy16), alphaBits), oldColor32.val[0]),
+			vbslq_u32(passMask32.val[1], vorrq_u32(colorop_vec.decrease(src1, evy16), alphaBits), oldColor32.val[1]),
+			vbslq_u32(passMask32.val[2], vorrq_u32(colorop_vec.decrease(src2, evy16), alphaBits), oldColor32.val[2]),
+			vbslq_u32(passMask32.val[3], vorrq_u32(colorop_vec.decrease(src3, evy16), alphaBits), oldColor32.val[3])
+		};
+
+		vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32);
+	}
+
+	const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID);
+	vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) );
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation_NEON::_unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
+                                                           const v128u8 &passMask8,
+                                                           const v128u16 &evy16,
+                                                           const v128u8 &srcLayerID,
+                                                           const v128u16 &src1, const v128u16 &src0,
+                                                           const v128u8 &srcEffectEnableMask,
+                                                           const v128u8 &dstBlendEnableMaskLUT,
+                                                           const v128u8 &enableColorEffectMask,
+                                                           const v128u8 &spriteAlpha,
+                                                           const v128u8 &spriteMode) const
+{
+	static const v128u8 zeroVec = ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
+	const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID);
+	vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) );
+
+	v128u8 dstTargetBlendEnableMask = vqtbl1q_u8(dstBlendEnableMaskLUT, dstLayerID);
+	dstTargetBlendEnableMask = vandq_u8( vmvnq_u8(vceqq_u8(dstLayerID, srcLayerID)), dstTargetBlendEnableMask );
+
+	v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : zeroVec;
+
+	// Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers.
+	// Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then
+	// convert EVA and EVB into vectors of uint16 right before we use them.
+	v128u8 eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? vdupq_n_u8(compInfo.renderState.blendEVA) : vreinterpretq_u8_u16( vdupq_n_u16(compInfo.renderState.blendEVA) );
+	v128u8 evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? vdupq_n_u8(compInfo.renderState.blendEVB) : vreinterpretq_u8_u16( vdupq_n_u16(compInfo.renderState.blendEVB) );
+
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		const v128u8 isObjTranslucentMask = vandq_u8( dstTargetBlendEnableMask, vorrq_u8(vceqq_u8(spriteMode, vdupq_n_u8(OBJMode_Transparent)), vceqq_u8(spriteMode, vdupq_n_u8(OBJMode_Bitmap))) );
+		forceDstTargetBlendMask = isObjTranslucentMask;
+
+		const v128u8 spriteAlphaMask = vandq_u8( vmvnq_u8(vceqq_u8(spriteAlpha, vdupq_n_u8(0xFF))), isObjTranslucentMask );
+		eva_vec128 = vbslq_u8(spriteAlphaMask, spriteAlpha, eva_vec128);
+		evb_vec128 = vbslq_u8(spriteAlphaMask, vsubq_u8(vdupq_n_u8(16), spriteAlpha), evb_vec128);
+	}
+
+	// Select the color effect based on the BLDCNT target flags.
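+	// As a rough scalar sketch of the selection below (illustration only, not the
+	// original code):
+	//
+	//     colorEffect[i] = enableColorEffect[i] ? compInfo.renderState.colorEffect
+	//                                           : ColorEffect_Disable;
+	//
+	// vbslq_u8(mask, a, b) takes bits from a where mask bits are set and from b
+	// elsewhere, which is exactly this per-byte ternary when every mask byte is
+	// 0x00 or 0xFF.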
+ const v128u8 colorEffect_vec128 = vbslq_u8(enableColorEffectMask, vdupq_n_u8(compInfo.renderState.colorEffect), vdupq_n_u8(ColorEffect_Disable)); + + // ---------- + + uint16x8x2_t tmpSrc16; + uint32x4x4_t tmpSrc32; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc16.val[0] = src0; + tmpSrc16.val[1] = src1; + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555xTo666x_NEON(src0, tmpSrc32.val[0], tmpSrc32.val[1]); + ColorspaceConvert555xTo666x_NEON(src1, tmpSrc32.val[2], tmpSrc32.val[3]); + } + else + { + ColorspaceConvert555xTo888x_NEON(src0, tmpSrc32.val[0], tmpSrc32.val[1]); + ColorspaceConvert555xTo888x_NEON(src1, tmpSrc32.val[2], tmpSrc32.val[3]); + } + + switch (compInfo.renderState.colorEffect) + { + case ColorEffect_IncreaseBrightness: + { + const v128u8 brightnessMask8 = vandq_u8( vmvnq_u8(forceDstTargetBlendMask), vandq_u8(srcEffectEnableMask, vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_IncreaseBrightness))) ); + const u8 brightnessUpMaskValue = vmaxvq_u8(brightnessMask8); + + if (brightnessUpMaskValue != 0) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t brightnessMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(brightnessMask8, brightnessMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(brightnessMask8, brightnessMask8) ) + }; + + tmpSrc16.val[0] = vbslq_u16( brightnessMask16.val[0], colorop_vec.increase(tmpSrc16.val[0], evy16), tmpSrc16.val[0] ); + tmpSrc16.val[1] = vbslq_u16( brightnessMask16.val[1], colorop_vec.increase(tmpSrc16.val[1], evy16), tmpSrc16.val[1] ); + } + else + { + const uint32x4x4_t brightnessMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + tmpSrc32.val[0] = vbslq_u32( brightnessMask32.val[0], colorop_vec.increase(tmpSrc32.val[0], evy16), tmpSrc32.val[0] ); + tmpSrc32.val[1] = vbslq_u32( brightnessMask32.val[1], colorop_vec.increase(tmpSrc32.val[1], evy16), tmpSrc32.val[1] ); + tmpSrc32.val[2] = vbslq_u32( brightnessMask32.val[2], colorop_vec.increase(tmpSrc32.val[2], evy16), tmpSrc32.val[2] ); + tmpSrc32.val[3] = vbslq_u32( brightnessMask32.val[3], colorop_vec.increase(tmpSrc32.val[3], evy16), tmpSrc32.val[3] ); + } + } + break; + } + + case ColorEffect_DecreaseBrightness: + { + const v128u8 brightnessMask8 = vandq_u8( vmvnq_u8(forceDstTargetBlendMask), vandq_u8(srcEffectEnableMask, vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_DecreaseBrightness))) ); + const u8 brightnessDownMaskValue = vmaxvq_u8(brightnessMask8); + + if (brightnessDownMaskValue != 0) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t brightnessMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(brightnessMask8, brightnessMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(brightnessMask8, brightnessMask8) ) + }; + + tmpSrc16.val[0] = vbslq_u16( brightnessMask16.val[0], colorop_vec.decrease(tmpSrc16.val[0], evy16), tmpSrc16.val[0] ); + tmpSrc16.val[1] = vbslq_u16( brightnessMask16.val[1], colorop_vec.decrease(tmpSrc16.val[1], evy16), tmpSrc16.val[1] ); + } + else + { + const uint32x4x4_t brightnessMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 0, 
0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + tmpSrc32.val[0] = vbslq_u32( brightnessMask32.val[0], colorop_vec.decrease(tmpSrc32.val[0], evy16), tmpSrc32.val[0] ); + tmpSrc32.val[1] = vbslq_u32( brightnessMask32.val[1], colorop_vec.decrease(tmpSrc32.val[1], evy16), tmpSrc32.val[1] ); + tmpSrc32.val[2] = vbslq_u32( brightnessMask32.val[2], colorop_vec.decrease(tmpSrc32.val[2], evy16), tmpSrc32.val[2] ); + tmpSrc32.val[3] = vbslq_u32( brightnessMask32.val[3], colorop_vec.decrease(tmpSrc32.val[3], evy16), tmpSrc32.val[3] ); + } + } + break; + } + + default: + break; + } + + // Render the pixel using the selected color effect. + const v128u8 blendMask8 = vorrq_u8( forceDstTargetBlendMask, vandq_u8(vandq_u8(srcEffectEnableMask, dstTargetBlendEnableMask), vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_Blend))) ); + const u8 blendMaskValue = vmaxvq_u8(blendMask8); + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + + if (blendMaskValue != 0) + { + const uint16x8x2_t blendMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(blendMask8, blendMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(blendMask8, blendMask8) ) + }; + + uint16x8x2_t blendSrc16; + + switch (LAYERTYPE) + { + case GPULayerType_3D: + //blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); + //blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); + printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); + assert(false); + break; + + case GPULayerType_BG: + blendSrc16.val[0] = colorop_vec.blend(tmpSrc16.val[0], oldColor16.val[0], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128)); + blendSrc16.val[1] = colorop_vec.blend(tmpSrc16.val[1], oldColor16.val[1], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128)); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + const uint16x8x2_t tempEVA = { + vreinterpretq_u16_u8( vzip1q_u8(eva_vec128, zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(eva_vec128, zeroVec) ) + }; + const uint16x8x2_t tempEVB = { + vreinterpretq_u16_u8( vzip1q_u8(evb_vec128, zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(evb_vec128, zeroVec) ) + }; + + blendSrc16.val[0] = colorop_vec.blend(tmpSrc16.val[0], oldColor16.val[0], tempEVA.val[0], tempEVB.val[0]); + blendSrc16.val[1] = colorop_vec.blend(tmpSrc16.val[1], oldColor16.val[1], tempEVA.val[1], tempEVB.val[1]); + break; + } + } + + tmpSrc16.val[0] = vbslq_u16(blendMask16.val[0], blendSrc16.val[0], tmpSrc16.val[0]); + tmpSrc16.val[1] = vbslq_u16(blendMask16.val[1], blendSrc16.val[1], tmpSrc16.val[1]); + } + + // Store the final colors. 
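+		// Widening the byte mask to 16-bit lanes: zipping passMask8 with itself
+		// duplicates every 0x00/0xFF byte, i.e. roughly
+		//
+		//     passMask16[i] = passMask8[i] ? 0xFFFF : 0x0000;
+		//
+		// so the same per-pixel mask can drive the 16-bit selects below.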
+		const uint16x8x2_t passMask16 = {
+			vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ),
+			vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) )
+		};
+
+		const v128u16 alphaBits = vdupq_n_u16(0x8000);
+		const uint16x8x2_t dst16 = {
+			vbslq_u16(passMask16.val[0], vorrq_u16(tmpSrc16.val[0], alphaBits), oldColor16.val[0]),
+			vbslq_u16(passMask16.val[1], vorrq_u16(tmpSrc16.val[1], alphaBits), oldColor16.val[1])
+		};
+		vst1q_u16_x2(compInfo.target.lineColor16, dst16);
+	}
+	else
+	{
+		const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32);
+
+		if (blendMaskValue != 0)
+		{
+			uint32x4x4_t blendSrc32;
+
+			switch (LAYERTYPE)
+			{
+				case GPULayerType_3D:
+					//blendSrc32[0] = colorop_vec.blend3D(src0, dst32[0]);
+					//blendSrc32[1] = colorop_vec.blend3D(src1, dst32[1]);
+					//blendSrc32[2] = colorop_vec.blend3D(src2, dst32[2]);
+					//blendSrc32[3] = colorop_vec.blend3D(src3, dst32[3]);
+					printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n");
+					assert(false);
+					break;
+
+				case GPULayerType_BG:
+					blendSrc32.val[0] = colorop_vec.blend(tmpSrc32.val[0], oldColor32.val[0], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[1] = colorop_vec.blend(tmpSrc32.val[1], oldColor32.val[1], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[2] = colorop_vec.blend(tmpSrc32.val[2], oldColor32.val[2], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[3] = colorop_vec.blend(tmpSrc32.val[3], oldColor32.val[3], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					break;
+
+				case GPULayerType_OBJ:
+				{
+					// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+					//
+					// Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only
+					// going to send the 4 corresponding EVA/EVB vectors as well. In this case, each individual
+					// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
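+					// Lane layout sketch (illustrative): with per-pixel weights
+					// eva = [e0 e1 ... e15], the self-zip below yields
+					// [e0 e0 e1 e1 e2 e2 e3 e3 ...], and zipping that with zeroVec
+					// widens each byte to a u16 lane, so tempEVA.val[0] becomes
+					// [e0 e0 e1 e1 e2 e2 e3 e3] and lines up with the 4 pixels held
+					// in tmpSrc32.val[0].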
+ uint8x16x2_t tempBlend8 = { + vzip1q_u8(eva_vec128, eva_vec128), + vzip2q_u8(eva_vec128, eva_vec128) + }; + + const uint16x8x4_t tempEVA = { + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[1], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[1], zeroVec) ) + }; + + tempBlend8.val[0] = vzip1q_u8(evb_vec128, evb_vec128); + tempBlend8.val[1] = vzip2q_u8(evb_vec128, evb_vec128); + + const uint16x8x4_t tempEVB = { + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[1], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[1], zeroVec) ) + }; + + blendSrc32.val[0] = colorop_vec.blend(tmpSrc32.val[0], oldColor32.val[0], tempEVA.val[0], tempEVB.val[0]); + blendSrc32.val[1] = colorop_vec.blend(tmpSrc32.val[1], oldColor32.val[1], tempEVA.val[1], tempEVB.val[1]); + blendSrc32.val[2] = colorop_vec.blend(tmpSrc32.val[2], oldColor32.val[2], tempEVA.val[2], tempEVB.val[2]); + blendSrc32.val[3] = colorop_vec.blend(tmpSrc32.val[3], oldColor32.val[3], tempEVA.val[3], tempEVB.val[3]); + break; + } + } + + const uint16x8x2_t blendMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(blendMask8, blendMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(blendMask8, blendMask8) ) + }; + + const uint32x4x4_t blendMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(blendMask16.val[0], blendMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(blendMask16.val[0], blendMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(blendMask16.val[1], blendMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(blendMask16.val[1], blendMask16.val[1]) ) + }; + + tmpSrc32.val[0] = vbslq_u32(blendMask32.val[0], blendSrc32.val[0], tmpSrc32.val[0]); + tmpSrc32.val[1] = vbslq_u32(blendMask32.val[1], blendSrc32.val[1], tmpSrc32.val[1]); + tmpSrc32.val[2] = vbslq_u32(blendMask32.val[2], blendSrc32.val[2], tmpSrc32.val[2]); + tmpSrc32.val[3] = vbslq_u32(blendMask32.val[3], blendSrc32.val[3], tmpSrc32.val[3]); + } + + // Store the final colors. + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + const v128u32 alphaBits = vdupq_n_u32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000);
+		const uint32x4x4_t dst32 = {
+			vbslq_u32(passMask32.val[0], vorrq_u32(tmpSrc32.val[0], alphaBits), oldColor32.val[0]),
+			vbslq_u32(passMask32.val[1], vorrq_u32(tmpSrc32.val[1], alphaBits), oldColor32.val[1]),
+			vbslq_u32(passMask32.val[2], vorrq_u32(tmpSrc32.val[2], alphaBits), oldColor32.val[2]),
+			vbslq_u32(passMask32.val[3], vorrq_u32(tmpSrc32.val[3], alphaBits), oldColor32.val[3])
+		};
+		vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32);
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void PixelOperation_NEON::_unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
+                                                           const v128u8 &passMask8,
+                                                           const v128u16 &evy16,
+                                                           const v128u8 &srcLayerID,
+                                                           const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+                                                           const v128u8 &srcEffectEnableMask,
+                                                           const v128u8 &dstBlendEnableMaskLUT,
+                                                           const v128u8 &enableColorEffectMask,
+                                                           const v128u8 &spriteAlpha,
+                                                           const v128u8 &spriteMode) const
+{
+	static const v128u8 zeroVec = ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
+	const v128u8 dstLayerID = vld1q_u8(compInfo.target.lineLayerID);
+	vst1q_u8( compInfo.target.lineLayerID, vbslq_u8(passMask8, srcLayerID, dstLayerID) );
+
+	v128u8 dstTargetBlendEnableMask = vqtbl1q_u8(dstBlendEnableMaskLUT, dstLayerID);
+	dstTargetBlendEnableMask = vandq_u8( vmvnq_u8(vceqq_u8(dstLayerID, srcLayerID)), dstTargetBlendEnableMask );
+
+	v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : zeroVec;
+
+	// Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers.
+	// Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then
+	// convert EVA and EVB into vectors of uint16 right before we use them.
+	v128u8 eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? vdupq_n_u8(compInfo.renderState.blendEVA) : vreinterpretq_u8_u16( vdupq_n_u16(compInfo.renderState.blendEVA) );
+	v128u8 evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? vdupq_n_u8(compInfo.renderState.blendEVB) : vreinterpretq_u8_u16( vdupq_n_u16(compInfo.renderState.blendEVB) );
+
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		const v128u8 isObjTranslucentMask = vandq_u8( dstTargetBlendEnableMask, vorrq_u8(vceqq_u8(spriteMode, vdupq_n_u8(OBJMode_Transparent)), vceqq_u8(spriteMode, vdupq_n_u8(OBJMode_Bitmap))) );
+		forceDstTargetBlendMask = isObjTranslucentMask;
+
+		const v128u8 spriteAlphaMask = vandq_u8( vmvnq_u8(vceqq_u8(spriteAlpha, vdupq_n_u8(0xFF))), isObjTranslucentMask );
+		eva_vec128 = vbslq_u8(spriteAlphaMask, spriteAlpha, eva_vec128);
+		evb_vec128 = vbslq_u8(spriteAlphaMask, vsubq_u8(vdupq_n_u8(16), spriteAlpha), evb_vec128);
+	}
+
+	// Select the color effect based on the BLDCNT target flags.
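+	// (For the translucent and bitmap OBJ pixels handled above, the blend weights
+	// were already swapped for the sprite's own alpha, roughly eva[i] = spriteAlpha[i]
+	// and evb[i] = 16 - spriteAlpha[i], so those sprites blend with their own alpha
+	// rather than the global EVA/EVB.)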
+ const v128u8 colorEffect_vec128 = vbslq_u8(enableColorEffectMask, vdupq_n_u8(compInfo.renderState.colorEffect), vdupq_n_u8(ColorEffect_Disable)); + + // ---------- + + uint16x8x2_t tmpSrc16; + uint32x4x4_t tmpSrc32; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc16.val[0] = ColorspaceConvert6665To5551_NEON(src0, src1); + tmpSrc16.val[1] = ColorspaceConvert6665To5551_NEON(src2, src3); + } + else + { + tmpSrc32.val[0] = src0; + tmpSrc32.val[1] = src1; + tmpSrc32.val[2] = src2; + tmpSrc32.val[3] = src3; + } + + switch (compInfo.renderState.colorEffect) + { + case ColorEffect_IncreaseBrightness: + { + const v128u8 brightnessMask8 = vandq_u8( vmvnq_u8(forceDstTargetBlendMask), vandq_u8(srcEffectEnableMask, vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_IncreaseBrightness))) ); + const u8 brightnessUpMaskValue = vmaxvq_u8(brightnessMask8); + + if (brightnessUpMaskValue != 0) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t brightnessMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(brightnessMask8, brightnessMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(brightnessMask8, brightnessMask8) ) + }; + + tmpSrc16.val[0] = vbslq_u16( brightnessMask16.val[0], colorop_vec.increase(tmpSrc16.val[0], evy16), tmpSrc16.val[0] ); + tmpSrc16.val[1] = vbslq_u16( brightnessMask16.val[1], colorop_vec.increase(tmpSrc16.val[1], evy16), tmpSrc16.val[1] ); + } + else + { + const uint32x4x4_t brightnessMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + tmpSrc32.val[0] = vbslq_u32( brightnessMask32.val[0], colorop_vec.increase(tmpSrc32.val[0], evy16), tmpSrc32.val[0] ); + tmpSrc32.val[1] = vbslq_u32( brightnessMask32.val[1], colorop_vec.increase(tmpSrc32.val[1], evy16), tmpSrc32.val[1] ); + tmpSrc32.val[2] = vbslq_u32( brightnessMask32.val[2], colorop_vec.increase(tmpSrc32.val[2], evy16), tmpSrc32.val[2] ); + tmpSrc32.val[3] = vbslq_u32( brightnessMask32.val[3], colorop_vec.increase(tmpSrc32.val[3], evy16), tmpSrc32.val[3] ); + } + } + break; + } + + case ColorEffect_DecreaseBrightness: + { + const v128u8 brightnessMask8 = vandq_u8( vmvnq_u8(forceDstTargetBlendMask), vandq_u8(srcEffectEnableMask, vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_DecreaseBrightness))) ); + const u8 brightnessDownMaskValue = vmaxvq_u8(brightnessMask8); + + if (brightnessDownMaskValue != 0) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t brightnessMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(brightnessMask8, brightnessMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(brightnessMask8, brightnessMask8) ) + }; + + tmpSrc16.val[0] = vbslq_u16( brightnessMask16.val[0], colorop_vec.decrease(tmpSrc16.val[0], evy16), tmpSrc16.val[0] ); + tmpSrc16.val[1] = vbslq_u16( brightnessMask16.val[1], colorop_vec.decrease(tmpSrc16.val[1], evy16), tmpSrc16.val[1] ); + } + else + { + const uint32x4x4_t brightnessMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + 
vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(brightnessMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + tmpSrc32.val[0] = vbslq_u32( brightnessMask32.val[0], colorop_vec.decrease(tmpSrc32.val[0], evy16), tmpSrc32.val[0] ); + tmpSrc32.val[1] = vbslq_u32( brightnessMask32.val[1], colorop_vec.decrease(tmpSrc32.val[1], evy16), tmpSrc32.val[1] ); + tmpSrc32.val[2] = vbslq_u32( brightnessMask32.val[2], colorop_vec.decrease(tmpSrc32.val[2], evy16), tmpSrc32.val[2] ); + tmpSrc32.val[3] = vbslq_u32( brightnessMask32.val[3], colorop_vec.decrease(tmpSrc32.val[3], evy16), tmpSrc32.val[3] ); + } + } + break; + } + + default: + break; + } + + // Render the pixel using the selected color effect. + const v128u8 blendMask8 = vorrq_u8( forceDstTargetBlendMask, vandq_u8(vandq_u8(srcEffectEnableMask, dstTargetBlendEnableMask), vceqq_u8(colorEffect_vec128, vdupq_n_u8(ColorEffect_Blend))) ); + const u8 blendMaskValue = vmaxvq_u8(blendMask8); + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + const uint16x8x2_t oldColor16 = vld1q_u16_x2(compInfo.target.lineColor16); + + if (blendMaskValue != 0) + { + const uint16x8x2_t blendMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(blendMask8, blendMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(blendMask8, blendMask8) ) + }; + + uint16x8x2_t blendSrc16; + + switch (LAYERTYPE) + { + case GPULayerType_3D: + blendSrc16.val[0] = colorop_vec.blend3D(src0, src1, oldColor16.val[0]); + blendSrc16.val[1] = colorop_vec.blend3D(src2, src3, oldColor16.val[1]); + break; + + case GPULayerType_BG: + blendSrc16.val[0] = colorop_vec.blend(tmpSrc16.val[0], oldColor16.val[0], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128)); + blendSrc16.val[1] = colorop_vec.blend(tmpSrc16.val[1], oldColor16.val[1], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128)); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + const uint16x8x2_t tempEVA = { + vreinterpretq_u16_u8( vzip1q_u8(eva_vec128, zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(eva_vec128, zeroVec) ) + }; + const uint16x8x2_t tempEVB = { + vreinterpretq_u16_u8( vzip1q_u8(evb_vec128, zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(evb_vec128, zeroVec) ) + }; + + blendSrc16.val[0] = colorop_vec.blend(tmpSrc16.val[0], oldColor16.val[0], tempEVA.val[0], tempEVB.val[0]); + blendSrc16.val[1] = colorop_vec.blend(tmpSrc16.val[1], oldColor16.val[1], tempEVA.val[1], tempEVB.val[1]); + break; + } + } + + tmpSrc16.val[0] = vbslq_u16(blendMask16.val[0], blendSrc16.val[0], tmpSrc16.val[0]); + tmpSrc16.val[1] = vbslq_u16(blendMask16.val[1], blendSrc16.val[1], tmpSrc16.val[1]); + } + + // Store the final colors. 
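+		// Per pixel, the masked store below amounts to:
+		//
+		//     out[i] = passMask[i] ? (tmpSrc[i] | 0x8000) : oldColor[i];
+		//
+		// so failing pixels keep the existing framebuffer value, while passing
+		// pixels are forced opaque via the 5551 alpha bit before the writeback.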
+		const uint16x8x2_t passMask16 = {
+			vreinterpretq_u16_u8( vzip1q_u8(passMask8, passMask8) ),
+			vreinterpretq_u16_u8( vzip2q_u8(passMask8, passMask8) )
+		};
+
+		const v128u16 alphaBits = vdupq_n_u16(0x8000);
+		const uint16x8x2_t dst16 = {
+			vbslq_u16(passMask16.val[0], vorrq_u16(tmpSrc16.val[0], alphaBits), oldColor16.val[0]),
+			vbslq_u16(passMask16.val[1], vorrq_u16(tmpSrc16.val[1], alphaBits), oldColor16.val[1])
+		};
+		vst1q_u16_x2(compInfo.target.lineColor16, dst16);
+	}
+	else
+	{
+		const uint32x4x4_t oldColor32 = vld1q_u32_x4((u32 *)compInfo.target.lineColor32);
+
+		if (blendMaskValue != 0)
+		{
+			uint32x4x4_t blendSrc32;
+
+			switch (LAYERTYPE)
+			{
+				case GPULayerType_3D:
+					blendSrc32.val[0] = colorop_vec.blend3D(tmpSrc32.val[0], oldColor32.val[0]);
+					blendSrc32.val[1] = colorop_vec.blend3D(tmpSrc32.val[1], oldColor32.val[1]);
+					blendSrc32.val[2] = colorop_vec.blend3D(tmpSrc32.val[2], oldColor32.val[2]);
+					blendSrc32.val[3] = colorop_vec.blend3D(tmpSrc32.val[3], oldColor32.val[3]);
+					break;
+
+				case GPULayerType_BG:
+					blendSrc32.val[0] = colorop_vec.blend(tmpSrc32.val[0], oldColor32.val[0], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[1] = colorop_vec.blend(tmpSrc32.val[1], oldColor32.val[1], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[2] = colorop_vec.blend(tmpSrc32.val[2], oldColor32.val[2], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					blendSrc32.val[3] = colorop_vec.blend(tmpSrc32.val[3], oldColor32.val[3], vreinterpretq_u16_u8(eva_vec128), vreinterpretq_u16_u8(evb_vec128));
+					break;
+
+				case GPULayerType_OBJ:
+				{
+					// For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16.
+					//
+					// Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only
+					// going to send the 4 corresponding EVA/EVB vectors as well. In this case, each individual
+					// EVA/EVB value is mirrored for each adjacent 16-bit boundary.
+ uint8x16x2_t tempBlend8 = { + vzip1q_u8(eva_vec128, eva_vec128), + vzip2q_u8(eva_vec128, eva_vec128) + }; + + const uint16x8x4_t tempEVA = { + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[1], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[1], zeroVec) ) + }; + + tempBlend8.val[0] = vzip1q_u8(evb_vec128, evb_vec128); + tempBlend8.val[1] = vzip2q_u8(evb_vec128, evb_vec128); + + const uint16x8x4_t tempEVB = { + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[0], zeroVec) ), + vreinterpretq_u16_u8( vzip1q_u8(tempBlend8.val[1], zeroVec) ), + vreinterpretq_u16_u8( vzip2q_u8(tempBlend8.val[1], zeroVec) ) + }; + + blendSrc32.val[0] = colorop_vec.blend(tmpSrc32.val[0], oldColor32.val[0], tempEVA.val[0], tempEVB.val[0]); + blendSrc32.val[1] = colorop_vec.blend(tmpSrc32.val[1], oldColor32.val[1], tempEVA.val[1], tempEVB.val[1]); + blendSrc32.val[2] = colorop_vec.blend(tmpSrc32.val[2], oldColor32.val[2], tempEVA.val[2], tempEVB.val[2]); + blendSrc32.val[3] = colorop_vec.blend(tmpSrc32.val[3], oldColor32.val[3], tempEVA.val[3], tempEVB.val[3]); + break; + } + } + + const uint16x8x2_t blendMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(blendMask8, blendMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(blendMask8, blendMask8) ) + }; + + const uint32x4x4_t blendMask32 = { + vreinterpretq_u32_u16( vzip1q_u16(blendMask16.val[0], blendMask16.val[0]) ), + vreinterpretq_u32_u16( vzip2q_u16(blendMask16.val[0], blendMask16.val[0]) ), + vreinterpretq_u32_u16( vzip1q_u16(blendMask16.val[1], blendMask16.val[1]) ), + vreinterpretq_u32_u16( vzip2q_u16(blendMask16.val[1], blendMask16.val[1]) ) + }; + + tmpSrc32.val[0] = vbslq_u32(blendMask32.val[0], blendSrc32.val[0], tmpSrc32.val[0]); + tmpSrc32.val[1] = vbslq_u32(blendMask32.val[1], blendSrc32.val[1], tmpSrc32.val[1]); + tmpSrc32.val[2] = vbslq_u32(blendMask32.val[2], blendSrc32.val[2], tmpSrc32.val[2]); + tmpSrc32.val[3] = vbslq_u32(blendMask32.val[3], blendSrc32.val[3], tmpSrc32.val[3]); + } + + // Store the final colors. + const uint32x4x4_t passMask32 = { + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){ 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11})) ), + vreinterpretq_u32_u8( vqtbl1q_u8(passMask8, ((v128u8){12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15})) ) + }; + + const v128u32 alphaBits = vdupq_n_u32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 
0x1F000000 : 0xFF000000); + const uint32x4x4_t dst32 = { + vbslq_u32(passMask32.val[0], vorrq_u32(tmpSrc32.val[0], alphaBits), oldColor32.val[0]), + vbslq_u32(passMask32.val[1], vorrq_u32(tmpSrc32.val[1], alphaBits), oldColor32.val[1]), + vbslq_u32(passMask32.val[2], vorrq_u32(tmpSrc32.val[2], alphaBits), oldColor32.val[2]), + vbslq_u32(passMask32.val[3], vorrq_u32(tmpSrc32.val[3], alphaBits), oldColor32.val[3]) + }; + vst1q_u32_x4((u32 *)compInfo.target.lineColor32, dst32); + } +} + +template +FORCEINLINE void PixelOperation_NEON::Composite16(GPUEngineCompositorInfo &compInfo, + const bool didAllPixelsPass, + const v128u8 &passMask8, + const v128u16 &evy16, + const v128u8 &srcLayerID, + const v128u16 &src1, const v128u16 &src0, + const v128u8 &srcEffectEnableMask, + const v128u8 &dstBlendEnableMaskLUT, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr) const +{ + if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass) + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copy16(compInfo, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copy16(compInfo, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUp16(compInfo, evy16, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDown16(compInfo, evy16, srcLayerID, src1, src0); + break; + + default: + break; + } + } + else + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copyMask16(compInfo, passMask8, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUpMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDownMask16(compInfo, passMask8, evy16, srcLayerID, src1, src0); + break; + + default: + { + const v128u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? vld1q_u8(enableColorEffectPtr) : vdupq_n_u8(0xFF); + const v128u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? vld1q_u8(sprAlphaPtr) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + const v128u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? 
vld1q_u8(sprModePtr) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + this->_unknownEffectMask16(compInfo, + passMask8, + evy16, + srcLayerID, + src1, src0, + srcEffectEnableMask, + dstBlendEnableMaskLUT, + enableColorEffectMask, + spriteAlpha, + spriteMode); + break; + } + } + } +} + +template +FORCEINLINE void PixelOperation_NEON::Composite32(GPUEngineCompositorInfo &compInfo, + const bool didAllPixelsPass, + const v128u8 &passMask8, + const v128u16 &evy16, + const v128u8 &srcLayerID, + const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0, + const v128u8 &srcEffectEnableMask, + const v128u8 &dstBlendEnableMaskLUT, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr) const +{ + if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass) + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copy32(compInfo, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUp32(compInfo, evy16, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDown32(compInfo, evy16, srcLayerID, src3, src2, src1, src0); + break; + + default: + break; + } + } + else + { + switch (COMPOSITORMODE) + { + case GPUCompositorMode_Debug: + this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_Copy: + this->_copyMask32(compInfo, passMask8, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightUp: + this->_brightnessUpMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0); + break; + + case GPUCompositorMode_BrightDown: + this->_brightnessDownMask32(compInfo, passMask8, evy16, srcLayerID, src3, src2, src1, src0); + break; + + default: + { + const v128u8 enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? vld1q_u8(enableColorEffectPtr): vdupq_n_u8(0xFF); + const v128u8 spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? vld1q_u8(sprAlphaPtr) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + const v128u8 spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? 
vld1q_u8(sprModePtr) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + this->_unknownEffectMask32(compInfo, + passMask8, + evy16, + srcLayerID, + src3, src2, src1, src0, + srcEffectEnableMask, + dstBlendEnableMaskLUT, + enableColorEffectMask, + spriteAlpha, + spriteMode); + break; + } + } + } +} + +template +void GPUEngineBase::_MosaicLine(GPUEngineCompositorInfo &compInfo) +{ + const u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID]; + + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=sizeof(v128u16)) + { + const uint16x8x2_t oldColor16 = vld1q_u16_x2(this->_deferredColorNative + x); + + if (ISFIRSTLINE) + { + const v128u8 indexVec = vld1q_u8(this->_deferredIndexNative + x); + const v128u8 idxMask8 = vceqzq_u8(indexVec); + const uint16x8x2_t idxMask16 = { + vreinterpretq_u16_u8( vzip1q_u8(idxMask8, idxMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(idxMask8, idxMask8) ) + }; + + const v128u16 mosaicColor16[2] = { + vbslq_u16( idxMask16.val[0], vdupq_n_u16(0xFFFF), vandq_u16(oldColor16.val[0], vdupq_n_u16(0x7FFF)) ), + vbslq_u16( idxMask16.val[1], vdupq_n_u16(0xFFFF), vandq_u16(oldColor16.val[1], vdupq_n_u16(0x7FFF)) ) + }; + + const v128u8 mosaicSetColorMask8 = vceqzq_u8( vld1q_u8(compInfo.renderState.mosaicWidthBG->begin + x) ); + const v128u16 mosaicSetColorMask16[2] = { + vreinterpretq_u16_u8( vzip1q_u8(mosaicSetColorMask8, mosaicSetColorMask8) ), + vreinterpretq_u16_u8( vzip2q_u8(mosaicSetColorMask8, mosaicSetColorMask8) ) + }; + + const uint16x8x2_t newColor16 = vld1q_u16_x2(mosaicColorBG + x); + const uint16x8x2_t dst16 = { + vbslq_u16( mosaicSetColorMask16[0], newColor16.val[0], mosaicColor16[0] ), + vbslq_u16( mosaicSetColorMask16[1], newColor16.val[1], mosaicColor16[1] ) + }; + vst1q_u16_x2((u16 *)mosaicColorBG + x, dst16); + } + + const v128u16 outColor16[2] = { + ((v128u16){ + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+0]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+1]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+2]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+3]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+4]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+5]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+6]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+7]] + }), + + ((v128u16){ + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+8]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+9]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+10]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+11]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+12]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+13]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+14]], + mosaicColorBG[compInfo.renderState.mosaicWidthBG->trunc32[x+15]] + }) + }; + + const v128u16 writeColorMask16[2] = { + vceqq_u16(outColor16[0], vdupq_n_u16(0xFFFF)), + vceqq_u16(outColor16[1], vdupq_n_u16(0xFFFF)) + }; + + const uint16x8x2_t dst16 = { + vbslq_u16( writeColorMask16[0], oldColor16.val[0], outColor16[0] ), + vbslq_u16( writeColorMask16[1], oldColor16.val[1], outColor16[1] ) + }; + vst1q_u16_x2(this->_deferredColorNative + x, dst16); + } +} + +template +void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const Color4u8 *__restrict srcColorNative32) +{ + static const size_t 
step = sizeof(v128u8); + + const bool isUsingSrc32 = (srcColorNative32 != NULL); + const v128u16 evy16 = vdupq_n_u16(compInfo.renderState.blendEVY); + const v128u8 srcLayerID = vdupq_n_u8(compInfo.renderState.selectedLayerID); + const v128u8 srcEffectEnableMask = vdupq_n_u8(compInfo.renderState.srcEffectEnable[GPULayerID_OBJ]); + const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? vld1q_u8(compInfo.renderState.dstBlendEnableVecLookup) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=step, srcColorNative16+=step, srcColorNative32+=step, compInfo.target.xNative+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + v128u8 passMask8; + u16 passMaskValue; + bool didAllPixelsPass; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = vld1q_u8(this->_didPassWindowTestNative[GPULayerID_OBJ] + i); + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = vaddlvq_u8(passMask8); + if (passMaskValue == 0) + { + continue; + } + + didAllPixelsPass = (passMaskValue == 0xFF0); + } + else + { + passMask8 = vdupq_n_u8(0xFF); + passMaskValue = 0xFF0; + didAllPixelsPass = true; + } + + if (isUsingSrc32) + { + const uint32x4x4_t src32 = vld1q_u32_x4((u32 *)srcColorNative32); + + pixelop_vec.Composite32(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src32.val[3], src32.val[2], src32.val[1], src32.val[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectNative[GPULayerID_OBJ] + i, + this->_sprAlpha[compInfo.line.indexNative] + i, + this->_sprType[compInfo.line.indexNative] + i); + } + else + { + const uint16x8x2_t src16 = vld1q_u16_x2(srcColorNative16); + + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src16.val[1], src16.val[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + this->_enableColorEffectNative[GPULayerID_OBJ] + i, + this->_sprAlpha[compInfo.line.indexNative] + i, + this->_sprType[compInfo.line.indexNative] + i); + } + } +} + +template +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +{ + static const size_t step = sizeof(v128u8); + + const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v128u16 evy16 = vdupq_n_u16(compInfo.renderState.blendEVY); + const v128u8 srcLayerID = vdupq_n_u8(compInfo.renderState.selectedLayerID); + const v128u8 srcEffectEnableMask = vdupq_n_u8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]); + const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? vld1q_u8(compInfo.renderState.dstBlendEnableVecLookup) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v128u8 passMask8; + u16 passMaskValue; + bool didAllPixelsPass; + + if (WILLPERFORMWINDOWTEST || (LAYERTYPE == GPULayerType_BG)) + { + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. 
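+			// (The window-test buffer is assumed to hold one byte per pixel, 0xFF
+			// for pass and 0x00 for fail, so the loaded vector can be used directly
+			// as a full-byte lane mask.)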
+ passMask8 = vld1q_u8(windowTestPtr + compInfo.target.xCustom); + } + + if (LAYERTYPE == GPULayerType_BG) + { + // Do the index test. Pixels with an index value of 0 are rejected. + const v128u8 idxPassMask8 = vmvnq_u8( vceqzq_u8( vld1q_u8(srcIndexCustom + compInfo.target.xCustom) ) ); + + if (WILLPERFORMWINDOWTEST) + { + passMask8 = vandq_u8(idxPassMask8, passMask8 ); + } + else + { + passMask8 = idxPassMask8; + } + } + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = vaddlvq_u8(passMask8); + if (passMaskValue == 0) + { + continue; + } + + didAllPixelsPass = (passMaskValue == 0xFF0); + } + else + { + passMask8 = vdupq_n_u8(0xFF); + passMaskValue = 0xFF0; + didAllPixelsPass = true; + } + + const uint16x8x2_t src16 = vld1q_u16_x2(srcColorCustom16 + compInfo.target.xCustom); + + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src16.val[1], src16.val[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + colorEffectEnablePtr + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + } + + return i; +} + +template +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const void *__restrict vramColorPtr) +{ + static const size_t step = sizeof(v128u8); + + const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step)); + const v128u16 evy16 = vdupq_n_u16(compInfo.renderState.blendEVY); + const v128u8 srcLayerID = vdupq_n_u8(compInfo.renderState.selectedLayerID); + const v128u8 srcEffectEnableMask = vdupq_n_u8(compInfo.renderState.srcEffectEnable[compInfo.renderState.selectedLayerID]); + const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? vld1q_u8(compInfo.renderState.dstBlendEnableVecLookup) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + size_t i = 0; + for (; i < ssePixCount; i+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step) + { + if (compInfo.target.xCustom >= compInfo.line.widthCustom) + { + compInfo.target.xCustom -= compInfo.line.widthCustom; + } + + v128u8 passMask8; + u16 passMaskValue; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. + passMask8 = vld1q_u8(windowTestPtr + compInfo.target.xCustom); + + // If none of the pixels within the vector pass, then reject them all at once. + passMaskValue = vaddlvq_u8(passMask8); + if (passMaskValue == 0) + { + continue; + } + } + else + { + passMask8 = vdupq_n_u8(0xFF); + passMaskValue = 0xFF0; + } + + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + case NDSColorFormat_BGR666_Rev: + { + const uint16x8x2_t src16 = vld1q_u16_x2((u16 *)vramColorPtr + i); + + if (LAYERTYPE != GPULayerType_OBJ) + { + v128u8 tempPassMask = vuzp1q_u8( vreinterpretq_u8_u16(vshrq_n_u16(src16.val[0], 15)), vreinterpretq_u8_u16(vshrq_n_u16(src16.val[1], 15)) ); + tempPassMask = vceqq_u8(tempPassMask, vdupq_n_u8(1)); + + passMask8 = vandq_u8(tempPassMask, passMask8); + passMaskValue = vaddlvq_u8(passMask8); + } + + // If none of the pixels within the vector pass, then reject them all at once. + if (passMaskValue == 0) + { + continue; + } + + // Write out the pixels. 
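+				// (vaddlvq_u8 summed all 16 mask bytes into passMaskValue, so a sum
+				// of 16 * 0xFF == 0xFF0 can only mean that every pixel passed, which
+				// selects the faster unmasked composite path below.)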
+ const bool didAllPixelsPass = (passMaskValue == 0xFF0); + pixelop_vec.Composite16(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src16.val[1], src16.val[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + colorEffectEnablePtr + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + break; + } + + case NDSColorFormat_BGR888_Rev: + { + const uint32x4x4_t src32 = vld1q_u32_x4((u32 *)vramColorPtr + i); + + if (LAYERTYPE != GPULayerType_OBJ) + { + v128u8 tempPassMask = vuzp1q_u8( vreinterpretq_u8_u16(vuzp1q_u16(vreinterpretq_u16_u32(vshrq_n_u32(src32.val[0], 24)), vreinterpretq_u16_u32(vshrq_n_u32(src32.val[1], 24)))), vreinterpretq_u8_u16(vuzp1q_u16(vreinterpretq_u16_u32(vshrq_n_u32(src32.val[2], 24)), vreinterpretq_u16_u32(vshrq_n_u32(src32.val[3], 24)))) ); + tempPassMask = vmvnq_u8( vceqzq_u8(tempPassMask) ); + + passMask8 = vandq_u8(tempPassMask, passMask8); + passMaskValue = vaddlvq_u8(passMask8); + } + + // If none of the pixels within the vector pass, then reject them all at once. + if (passMaskValue == 0) + { + continue; + } + + // Write out the pixels. + const bool didAllPixelsPass = (passMaskValue == 0xFF0); + pixelop_vec.Composite32(compInfo, + didAllPixelsPass, + passMask8, evy16, + srcLayerID, + src32.val[3], src32.val[2], src32.val[1], src32.val[0], + srcEffectEnableMask, + dstBlendEnableMaskLUT, + colorEffectEnablePtr + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); + break; + } + } + } + + return i; +} + +template +size_t GPUEngineBase::_RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer, + size_t &frameX, size_t &spriteX, + u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab) +{ + size_t i = 0; + + static const size_t step = sizeof(v128u16); + const v128u8 prioVec8 = vdupq_n_u8(prio); + + const size_t ssePixCount = length - (length % step); + for (; i < ssePixCount; i+=step, spriteX+=step, frameX+=step) + { + const v128u8 prioTabVec8 = vld1q_u8(prioTab + frameX); + const uint16x8x2_t color16 = vld1q_u16_x2(vramBuffer + spriteX); + + const v128u8 alphaCompare = vceqq_u8( vuzp1q_u8(vreinterpretq_u8_u16(vshrq_n_u16(color16.val[0], 15)), vreinterpretq_u8_u16(vshrq_n_u16(color16.val[1], 15))), vdupq_n_u8(0x01) ); + const v128u8 prioCompare = vcgtq_u8(prioTabVec8, prioVec8); + + const v128u8 combinedCompare8 = vandq_u8(prioCompare, alphaCompare); + const uint16x8x2_t combinedCompare16 = { + vreinterpretq_u16_u8( vzip1q_u8(combinedCompare8, combinedCompare8) ), + vreinterpretq_u16_u8( vzip2q_u8(combinedCompare8, combinedCompare8) ) + }; + + const uint16x8x2_t dst16 = vld1q_u16_x2(dst + frameX); + const uint16x8x2_t out16 = { + vbslq_u16(combinedCompare16.val[0], color16.val[0], dst16.val[0]), + vbslq_u16(combinedCompare16.val[1], color16.val[1], dst16.val[1]) + }; + + vst1q_u16_x2(dst + frameX, out16); + vst1q_u8( prioTab + frameX, vbslq_u8(combinedCompare8, prioVec8, prioTabVec8) ); + + if (!ISDEBUGRENDER) + { + vst1q_u8( dst_alpha + frameX, vbslq_u8(combinedCompare8, vdupq_n_u8(spriteAlpha + 1), vld1q_u8(dst_alpha + frameX)) ); + vst1q_u8( typeTab + frameX, vbslq_u8(combinedCompare8, vdupq_n_u8(OBJMode_Bitmap), vld1q_u8(typeTab + frameX)) ); + vst1q_u8( this->_sprNum + frameX, vbslq_u8(combinedCompare8, vdupq_n_u8(spriteNum), vld1q_u8(this->_sprNum + frameX)) ); + } + } + + return i; 
+} + +void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative) +{ + static const v128u8 zeroVec = ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + const v128u8 *__restrict win0Ptr = (const v128u8 *__restrict)win0; + const v128u8 *__restrict win1Ptr = (const v128u8 *__restrict)win1; + const v128u8 *__restrict winObjPtr = (const v128u8 *__restrict)winObj; + + v128u8 *__restrict didPassWindowTestNativePtr = (v128u8 *__restrict)didPassWindowTestNative; + v128u8 *__restrict enableColorEffectNativePtr = (v128u8 *__restrict)enableColorEffectNative; + + v128u8 didPassWindowTest; + v128u8 enableColorEffect; + + v128u8 win0HandledMask; + v128u8 win1HandledMask; + v128u8 winOBJHandledMask; + v128u8 winOUTHandledMask; + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH/sizeof(v128u8); i++) + { + didPassWindowTest = zeroVec; + enableColorEffect = zeroVec; + + win0HandledMask = zeroVec; + win1HandledMask = zeroVec; + winOBJHandledMask = zeroVec; + + // Window 0 has the highest priority, so always check this first. + if (win0Ptr != NULL) + { + const v128u8 win0Enable = vdupq_n_u8(compInfo.renderState.WIN0_enable[layerID]); + const v128u8 win0Effect = vdupq_n_u8(compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG]); + + win0HandledMask = vceqq_u8(vld1q_u8((u8 *)(win0Ptr + i)), vdupq_n_u8(1)); + didPassWindowTest = vandq_u8(win0HandledMask, win0Enable); + enableColorEffect = vandq_u8(win0HandledMask, win0Effect); + } + + // Window 1 has medium priority, and is checked after Window 0. + if (win1Ptr != NULL) + { + const v128u8 win1Enable = vdupq_n_u8(compInfo.renderState.WIN1_enable[layerID]); + const v128u8 win1Effect = vdupq_n_u8(compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG]); + + win1HandledMask = vandq_u8( vmvnq_u8(win0HandledMask), vceqq_u8(vld1q_u8((u8 *)(win1Ptr + i)), vdupq_n_u8(1)) ); + didPassWindowTest = vbslq_u8(win1HandledMask, win1Enable, didPassWindowTest); + enableColorEffect = vbslq_u8(win1HandledMask, win1Effect, enableColorEffect); + } + + // Window OBJ has low priority, and is checked after both Window 0 and Window 1. + if (winObjPtr != NULL) + { + const v128u8 winObjEnable = vdupq_n_u8(compInfo.renderState.WINOBJ_enable[layerID]); + const v128u8 winObjEffect = vdupq_n_u8(compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]); + + winOBJHandledMask = vandq_u8( vmvnq_u8(vorrq_u8(win0HandledMask, win1HandledMask)), vceqq_u8(vld1q_u8((u8 *)(winObjPtr + i)), vdupq_n_u8(1)) ); + didPassWindowTest = vbslq_u8(winOBJHandledMask, winObjEnable, didPassWindowTest); + enableColorEffect = vbslq_u8(winOBJHandledMask, winObjEffect, enableColorEffect); + } + + // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. + // This has the lowest priority, and is always checked last. 
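+		// Scalar sketch of the full priority chain, per pixel (illustration only):
+		//
+		//     if      (inWin0[i])   { pass = WIN0_enable[layerID];   effect = WIN0_enable[WINDOWCONTROL_EFFECTFLAG];   }
+		//     else if (inWin1[i])   { pass = WIN1_enable[layerID];   effect = WIN1_enable[WINDOWCONTROL_EFFECTFLAG];   }
+		//     else if (inWinObj[i]) { pass = WINOBJ_enable[layerID]; effect = WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG]; }
+		//     else                  { pass = WINOUT_enable[layerID]; effect = WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]; }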
+		const v128u8 winOutEnable = vdupq_n_u8(compInfo.renderState.WINOUT_enable[layerID]);
+		const v128u8 winOutEffect = vdupq_n_u8(compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG]);
+
+		winOUTHandledMask = vmvnq_u8( vorrq_u8(win0HandledMask, vorrq_u8(win1HandledMask, winOBJHandledMask)) );
+		didPassWindowTest = vbslq_u8(winOUTHandledMask, winOutEnable, didPassWindowTest);
+		enableColorEffect = vbslq_u8(winOUTHandledMask, winOutEffect, enableColorEffect);
+
+		vst1q_u8((u8 *)(didPassWindowTestNativePtr + i), didPassWindowTest);
+		vst1q_u8((u8 *)(enableColorEffectNativePtr + i), enableColorEffect);
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const Color4u8 *__restrict srcLinePtr)
+{
+	static const size_t step = sizeof(v128u8);
+
+	const size_t vecPixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % step));
+	const v128u16 evy16 = vdupq_n_u16(compInfo.renderState.blendEVY);
+	const v128u8 srcLayerID = vdupq_n_u8(compInfo.renderState.selectedLayerID);
+	const v128u8 srcEffectEnableMask = vdupq_n_u8(compInfo.renderState.srcEffectEnable[GPULayerID_BG0]);
+	const v128u8 dstBlendEnableMaskLUT = (COMPOSITORMODE == GPUCompositorMode_Unknown) ? vld1q_u8(compInfo.renderState.dstBlendEnableVecLookup) : ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
+
+	size_t i = 0;
+	for (; i < vecPixCount; i+=step, srcLinePtr+=step, compInfo.target.xCustom+=step, compInfo.target.lineColor16+=step, compInfo.target.lineColor32+=step, compInfo.target.lineLayerID+=step)
+	{
+		if (compInfo.target.xCustom >= compInfo.line.widthCustom)
+		{
+			compInfo.target.xCustom -= compInfo.line.widthCustom;
+		}
+
+		// Determine which pixels pass by doing the window test and the alpha test.
+		v128u8 passMask8;
+		u16 passMaskValue;
+
+		if (WILLPERFORMWINDOWTEST)
+		{
+			// Do the window test.
+			passMask8 = vld1q_u8(windowTestPtr + compInfo.target.xCustom);
+
+			// If none of the pixels within the vector pass, then reject them all at once.
+			passMaskValue = vaddlvq_u8(passMask8);
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+		}
+		else
+		{
+			passMask8 = vdupq_n_u8(0xFF);
+			passMaskValue = 0xFF0;
+		}
+
+		const uint32x4x4_t src32 = vld1q_u32_x4((u32 *)srcLinePtr);
+
+		// Do the alpha test. Pixels with an alpha value of 0 are rejected.
+		const v128u8 srcAlpha = vuzp1q_u8( vreinterpretq_u8_u16( vuzp1q_u16( vreinterpretq_u16_u32(vshrq_n_u32(src32.val[0], 24)), vreinterpretq_u16_u32(vshrq_n_u32(src32.val[1], 24)) ) ),
+		                                   vreinterpretq_u8_u16( vuzp1q_u16( vreinterpretq_u16_u32(vshrq_n_u32(src32.val[2], 24)), vreinterpretq_u16_u32(vshrq_n_u32(src32.val[3], 24)) ) ) );
+
+		passMask8 = vandq_u8( vmvnq_u8(vceqzq_u8(srcAlpha)), passMask8 );
+
+		// If none of the pixels within the vector pass, then reject them all at once.
+		passMaskValue = vaddlvq_u8(passMask8);
+		if (passMaskValue == 0)
+		{
+			continue;
+		}
+
+		// Write out the pixels.
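+		// vaddlvq_u8 sums all 16 lanes of the mask; 16 lanes of 0xFF add up to
+		// 16 * 255 = 0xFF0, so the comparison below is true only if every pixel
+		// in the vector passed.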
+		const bool didAllPixelsPass = (passMaskValue == 0xFF0);
+		pixelop_vec.Composite32<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D>(compInfo,
+		                                                                       didAllPixelsPass,
+		                                                                       passMask8, evy16,
+		                                                                       srcLayerID,
+		                                                                       src32.val[3], src32.val[2], src32.val[1], src32.val[0],
+		                                                                       srcEffectEnableMask,
+		                                                                       dstBlendEnableMaskLUT,
+		                                                                       colorEffectEnablePtr + compInfo.target.xCustom,
+		                                                                       NULL,
+		                                                                       NULL);
+	}
+
+	return i;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t GPUEngineA::_RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length)
+{
+	static const v128u8 zeroVec = ((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? length * sizeof(u32) / sizeof(v128u32) : length * sizeof(u16) / sizeof(v128u16);
+	for (; i < vecCount; i++)
+	{
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			const v128u32 srcA32 = vld1q_u32((u32 *)((v128u32 *)srcA + i));
+			const v128u32 srcB32 = vld1q_u32((u32 *)((v128u32 *)srcB + i));
+
+			// Get color masks based on if the alpha value is 0. Colors with an alpha value
+			// equal to 0 are rejected.
+			v128u32 srcA_alpha = vandq_u32(srcA32, vdupq_n_u32(0xFF000000));
+			v128u32 srcB_alpha = vandq_u32(srcB32, vdupq_n_u32(0xFF000000));
+			v128u32 srcA_masked = vandq_u32( vmvnq_u32(vceqzq_u32(srcA_alpha)), srcA32 );
+			v128u32 srcB_masked = vandq_u32( vmvnq_u32(vceqzq_u32(srcB_alpha)), srcB32 );
+
+			v128u16 dstLo;
+			v128u16 dstHi;
+			v128u32 out32;
+
+			// Temporarily convert the color component values from 8-bit to 16-bit, and then
+			// do the blend calculation.
+			v128u16 srcA_maskedLo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(srcA_masked), zeroVec) );
+			v128u16 srcA_maskedHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(srcA_masked), zeroVec) );
+			v128u16 srcB_maskedLo = vreinterpretq_u16_u8( vzip1q_u8(vreinterpretq_u8_u32(srcB_masked), zeroVec) );
+			v128u16 srcB_maskedHi = vreinterpretq_u16_u8( vzip2q_u8(vreinterpretq_u8_u32(srcB_masked), zeroVec) );
+
+			dstLo = vmlaq_n_u16( vmulq_n_u16(srcA_maskedLo, blendEVA), srcB_maskedLo, blendEVB );
+			dstHi = vmlaq_n_u16( vmulq_n_u16(srcA_maskedHi, blendEVA), srcB_maskedHi, blendEVB );
+
+			dstLo = vshrq_n_u16(dstLo, 4);
+			dstHi = vshrq_n_u16(dstHi, 4);
+
+			// Convert the color components back from 16-bit to 8-bit using a saturating
+			// narrow, so blend results above 255 clamp instead of wrapping around.
+			out32 = vreinterpretq_u32_u8( vcombine_u8(vqmovn_u16(dstLo), vqmovn_u16(dstHi)) );
+
+			// Add the alpha components back in.
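+			// Only the RGB bytes of the blend result are kept; the output alpha is
+			// the bitwise OR of both source alpha channels.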
+			out32 = vandq_u32(out32, vdupq_n_u32(0x00FFFFFF));
+			out32 = vorrq_u32(out32, srcA_alpha);
+			out32 = vorrq_u32(out32, srcB_alpha);
+			vst1q_u32((u32 *)((v128u32 *)dst + i), out32);
+		}
+		else
+		{
+			const v128u16 srcA16 = vld1q_u16((u16 *)((v128u16 *)srcA + i));
+			const v128u16 srcB16 = vld1q_u16((u16 *)((v128u16 *)srcB + i));
+			v128u16 srcA_alpha = vandq_u16(srcA16, vdupq_n_u16(0x8000));
+			v128u16 srcB_alpha = vandq_u16(srcB16, vdupq_n_u16(0x8000));
+			v128u16 srcA_masked = vandq_u16( vmvnq_u16(vceqzq_u16(srcA_alpha)), srcA16 );
+			v128u16 srcB_masked = vandq_u16( vmvnq_u16(vceqzq_u16(srcB_alpha)), srcB16 );
+			v128u16 colorBitMask = vdupq_n_u16(0x001F);
+
+			v128u16 ra;
+			v128u16 ga;
+			v128u16 ba;
+			v128u16 dst16;
+
+			ra = vandq_u16(            srcA_masked,      colorBitMask);
+			ga = vandq_u16(vshrq_n_u16(srcA_masked,  5), colorBitMask);
+			ba = vandq_u16(vshrq_n_u16(srcA_masked, 10), colorBitMask);
+
+			v128u16 rb = vandq_u16(            srcB_masked,      colorBitMask);
+			v128u16 gb = vandq_u16(vshrq_n_u16(srcB_masked,  5), colorBitMask);
+			v128u16 bb = vandq_u16(vshrq_n_u16(srcB_masked, 10), colorBitMask);
+
+			ra = vmlaq_n_u16( vmulq_n_u16(ra, blendEVA), rb, blendEVB );
+			ga = vmlaq_n_u16( vmulq_n_u16(ga, blendEVA), gb, blendEVB );
+			ba = vmlaq_n_u16( vmulq_n_u16(ba, blendEVA), bb, blendEVB );
+
+			ra = vshrq_n_u16(ra, 4);
+			ga = vshrq_n_u16(ga, 4);
+			ba = vshrq_n_u16(ba, 4);
+
+			ra = vminq_u16(ra, colorBitMask);
+			ga = vminq_u16(ga, colorBitMask);
+			ba = vminq_u16(ba, colorBitMask);
+
+			dst16 = vorrq_u16( vorrq_u16(vorrq_u16(ra, vshlq_n_u16(ga, 5)), vshlq_n_u16(ba, 10)), vorrq_u16(srcA_alpha, srcB_alpha) );
+			vst1q_u16((u16 *)((v128u16 *)dst + i), dst16);
+		}
+	}
+
+	return (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? i * sizeof(v128u32) / sizeof(u32) : i * sizeof(v128u16) / sizeof(u16);
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+size_t NDSDisplay::_ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? pixCount * sizeof(u32) / sizeof(v128u32) : pixCount * sizeof(u16) / sizeof(v128u16);
+	for (; i < vecCount; i++)
+	{
+		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+		{
+			v128u16 dstColor = vld1q_u16((u16 *)((v128u16 *)dst + i));
+			dstColor = colorop_vec.increase(dstColor, vdupq_n_u16(intensityClamped));
+			dstColor = vorrq_u16(dstColor, vdupq_n_u16(0x8000));
+			vst1q_u16( (u16 *)((v128u16 *)dst + i), dstColor);
+		}
+		else
+		{
+			v128u32 dstColor = vld1q_u32((u32 *)((v128u32 *)dst + i));
+			dstColor = colorop_vec.increase<OUTPUTFORMAT>(dstColor, vdupq_n_u16(intensityClamped));
+			dstColor = vorrq_u32(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000));
+			vst1q_u32( (u32 *)((v128u32 *)dst + i), dstColor);
+		}
+	}
+
+	return (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? i * sizeof(v128u32) / sizeof(u32) : i * sizeof(v128u16) / sizeof(u16);
+}
+
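+// The brightness-down loop below mirrors _ApplyMasterBrightnessUp_LoopOp above,
+// swapping colorop_vec.increase for colorop_vec.decrease.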
+template <NDSColorFormat OUTPUTFORMAT>
+size_t NDSDisplay::_ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped)
+{
+	size_t i = 0;
+
+	const size_t vecCount = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? pixCount * sizeof(u32) / sizeof(v128u32) : pixCount * sizeof(u16) / sizeof(v128u16);
+	for (; i < vecCount; i++)
+	{
+		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+		{
+			v128u16 dstColor = vld1q_u16((u16 *)((v128u16 *)dst + i));
+			dstColor = colorop_vec.decrease(dstColor, vdupq_n_u16(intensityClamped));
+			dstColor = vorrq_u16(dstColor, vdupq_n_u16(0x8000));
+			vst1q_u16( (u16 *)((v128u16 *)dst + i), dstColor);
+		}
+		else
+		{
+			v128u32 dstColor = vld1q_u32((u32 *)((v128u32 *)dst + i));
+			dstColor = colorop_vec.decrease<OUTPUTFORMAT>(dstColor, vdupq_n_u16(intensityClamped));
+			dstColor = vorrq_u32(dstColor, (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? vdupq_n_u32(0x1F000000) : vdupq_n_u32(0xFF000000));
+			vst1q_u32( (u32 *)((v128u32 *)dst + i), dstColor);
+		}
+	}
+
+	return (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? i * sizeof(v128u32) / sizeof(u32) : i * sizeof(v128u16) / sizeof(u16);
+}
+
+#endif // ENABLE_NEON_A64
diff --git a/desmume/src/GPU_Operations_NEON.h b/desmume/src/GPU_Operations_NEON.h
new file mode 100644
index 000000000..c2117fe78
--- /dev/null
+++ b/desmume/src/GPU_Operations_NEON.h
@@ -0,0 +1,122 @@
+/*
+	Copyright (C) 2025 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with the this software. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef GPU_OPERATIONS_NEON_H
+#define GPU_OPERATIONS_NEON_H
+
+#include "GPU_Operations.h"
+
+#ifndef ENABLE_NEON_A64
+	#warning This header requires ARM64 NEON support.
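+	// Without ENABLE_NEON_A64, the declarations below are skipped and
+	// GPU_Operations.cpp falls back to its generic implementations.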
+#else
+
+class ColorOperation_NEON
+{
+public:
+	ColorOperation_NEON() {};
+
+	FORCEINLINE v128u16 blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
+
+	FORCEINLINE v128u16 blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 blend3D(const v128u32 &colA, const v128u32 &colB) const;
+
+	FORCEINLINE v128u16 increase(const v128u16 &col, const v128u16 &blendEVY) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 increase(const v128u32 &col, const v128u16 &blendEVY) const;
+
+	FORCEINLINE v128u16 decrease(const v128u16 &col, const v128u16 &blendEVY) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE v128u32 decrease(const v128u32 &col, const v128u16 &blendEVY) const;
+};
+
+class PixelOperation_NEON
+{
+protected:
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
+	template <NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
+
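+	// The _unknownEffectMask variants handle GPUCompositorMode_Unknown, where the
+	// color effect cannot be fixed at compile time and must be selected per pixel
+	// from the enable masks, sprite alpha, and sprite mode inputs.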
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
+	                                      const v128u8 &passMask8,
+	                                      const v128u16 &evy16,
+	                                      const v128u8 &srcLayerID,
+	                                      const v128u16 &src1, const v128u16 &src0,
+	                                      const v128u8 &srcEffectEnableMask,
+	                                      const v128u8 &dstBlendEnableMaskLUT,
+	                                      const v128u8 &enableColorEffectMask,
+	                                      const v128u8 &spriteAlpha,
+	                                      const v128u8 &spriteMode) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
+	                                      const v128u8 &passMask8,
+	                                      const v128u16 &evy16,
+	                                      const v128u8 &srcLayerID,
+	                                      const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+	                                      const v128u8 &srcEffectEnableMask,
+	                                      const v128u8 &dstBlendEnableMaskLUT,
+	                                      const v128u8 &enableColorEffectMask,
+	                                      const v128u8 &spriteAlpha,
+	                                      const v128u8 &spriteMode) const;
+
+public:
+	PixelOperation_NEON() {};
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v128u8 &passMask8,
+	                             const v128u16 &evy16,
+	                             const v128u8 &srcLayerID,
+	                             const v128u16 &src1, const v128u16 &src0,
+	                             const v128u8 &srcEffectEnableMask,
+	                             const v128u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+
+	template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+	FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
+	                             const bool didAllPixelsPass,
+	                             const v128u8 &passMask8,
+	                             const v128u16 &evy16,
+	                             const v128u8 &srcLayerID,
+	                             const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
+	                             const v128u8 &srcEffectEnableMask,
+	                             const v128u8 &dstBlendEnableMaskLUT,
+	                             const u8 *__restrict enableColorEffectPtr,
+	                             const u8 *__restrict sprAlphaPtr,
+	                             const u8 *__restrict sprModePtr) const;
+};
+
+#endif // ENABLE_NEON_A64
+
+#endif // GPU_OPERATIONS_NEON_H
diff --git a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj
index b674e6082..479f2b8db 100755
--- a/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj
+++ b/desmume/src/frontend/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj
@@ -4416,6 +4416,8 @@
 		ABD2CE4426E05CB000FB15F7 /* DeSmuME (x86_64h).app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "DeSmuME (x86_64h).app"; sourceTree = BUILT_PRODUCTS_DIR; };
 		ABD42045172319D1006A9B46 /* FileMigrationDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FileMigrationDelegate.h; sourceTree = "<group>"; };
 		ABD42046172319D1006A9B46 /* FileMigrationDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = FileMigrationDelegate.mm; sourceTree = "<group>"; };
+		ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GPU_Operations_NEON.h; sourceTree = "<group>"; };
+		ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = GPU_Operations_NEON.cpp; sourceTree = "<group>"; };
 		ABDD89EF2C30BE97003482B7 /* OGLRender_ES3.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = OGLRender_ES3.h; sourceTree = "<group>"; };
 		ABDD89F02C30BE97003482B7 /* OGLRender_ES3.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = OGLRender_ES3.cpp; sourceTree = "<group>"; };
 		ABDDF7C41898F024007583C1 /* Icon_DisplayToggle_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_DisplayToggle_420x420.png; path = images/Icon_DisplayToggle_420x420.png; sourceTree = "<group>"; };
@@ -5685,6 +5687,7 @@
 				AB1D4BB126E6F8D700A9AE42 /* GPU_Operations.cpp */,
 				AB1D4BB426E6F8D700A9AE42 /* GPU_Operations_SSE2.cpp */,
 				AB1D4BAF26E6F8D700A9AE42 /* GPU_Operations_AVX2.cpp */,
+				ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */,
 				ABD1FEB81345AC8400AF11D1 /* lua-engine.cpp */,
 				ABD1FEB91345AC8400AF11D1 /* matrix.cpp */,
 				ABD1FEBA1345AC8400AF11D1 /* mc.cpp */,
@@ -5729,6 +5732,7 @@
 				AB1D4BB326E6F8D700A9AE42 /* GPU_Operations.h */,
 				AB1D4BB226E6F8D700A9AE42 /* GPU_Operations_SSE2.h */,
 				AB1D4BB026E6F8D700A9AE42 /* GPU_Operations_AVX2.h */,
+				ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */,
 				AB796CA215CDCB6B00C59155 /* instruction_attributes.h */,
 				AB796CA315CDCB6B00C59155 /* instructions.h */,
 				ABD1FE841345AC8400AF11D1 /* lua-engine.h */,
diff --git a/desmume/src/frontend/posix/codeblocks/desmume.cbp b/desmume/src/frontend/posix/codeblocks/desmume.cbp
index 84d041471..984cdd5fa 100644
--- a/desmume/src/frontend/posix/codeblocks/desmume.cbp
+++ b/desmume/src/frontend/posix/codeblocks/desmume.cbp
@@ -347,6 +347,12 @@
+		<Unit filename="../../GPU_Operations_NEON.cpp">
+			<Option compile="0" />
+			<Option link="0" />
+		</Unit>