From 6a1d9e4848e7f469bfe2e3776e74f99f94ddd7c7 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 1 Dec 2018 15:46:23 -0800 Subject: [PATCH] GPU: Rendering complete OBJ layer lines is now SSE2-accelerated at the native resolution. This change is less of a performance enhancement and more of improving the code consistency. As of now, ALL complete OBJ layer lines, whether internally generated or read from VRAM, whether rendering at native resolution or custom resolution, should now be SSE2-accelerated. This commit finalizes this concept. (Related to commit 8e9e7c4 and commit 60c01bd.) --- desmume/src/GPU.cpp | 264 +++++++++++++++++++++++++++++++++----------- desmume/src/GPU.h | 25 ++++- 2 files changed, 224 insertions(+), 65 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 8fd915ffb..e8bd108ed 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -3323,9 +3323,10 @@ template FORCEINLINE void GPUEngineBase::_PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &spriteAlpha, const __m128i &srcEffectEnableMask, const __m128i &enableColorEffectMask, + const __m128i &spriteAlpha, + const __m128i &spriteMode, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID) { @@ -3361,14 +3362,13 @@ FORCEINLINE void GPUEngineBase::_PixelUnknownEffectWithMask16_SSE2(GPUEngineComp // Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers. // Therefore, we're going to treat EVA and EVB as vectors of uint8 so that the OBJ layer can modify them, and then - // convert EVA to EVB into vectors of uint16 right before we use them. + // convert EVA and EVB into vectors of uint16 right before we use them. __m128i eva_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? 
_mm_set1_epi8(compInfo.renderState.blendEVA) : _mm_set1_epi16(compInfo.renderState.blendEVA); __m128i evb_vec128 = (LAYERTYPE == GPULayerType_OBJ) ? _mm_set1_epi8(compInfo.renderState.blendEVB) : _mm_set1_epi16(compInfo.renderState.blendEVB); if (LAYERTYPE == GPULayerType_OBJ) { - const __m128i objMode_vec128 = _mm_load_si128((__m128i *)(this->_sprTypeCustom + compInfo.target.xCustom)); - const __m128i isObjTranslucentMask = _mm_and_si128( dstTargetBlendEnableMask, _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) ); + const __m128i isObjTranslucentMask = _mm_and_si128( dstTargetBlendEnableMask, _mm_or_si128(_mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(spriteMode, _mm_set1_epi8(OBJMode_Bitmap))) ); forceDstTargetBlendMask = isObjTranslucentMask; const __m128i spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask); @@ -3584,7 +3584,10 @@ FORCEINLINE void GPUEngineBase::_PixelComposite16_SSE2(GPUEngineCompositorInfo & const bool didAllPixelsPass, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - const __m128i &srcEffectEnableMask) + const __m128i &srcEffectEnableMask, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr) { const bool is555and3D = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) && (LAYERTYPE == GPULayerType_3D); __m128i dst[4]; @@ -3687,15 +3690,17 @@ FORCEINLINE void GPUEngineBase::_PixelComposite16_SSE2(GPUEngineCompositorInfo & default: { - const __m128i spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((__m128i *)(this->_sprAlphaCustom + compInfo.target.xCustom)) : _mm_setzero_si128(); - const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? 
_mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF); + const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_cmpeq_epi8( _mm_load_si128((__m128i *)enableColorEffectPtr), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF); + const __m128i spriteAlpha = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((__m128i *)sprAlphaPtr) : _mm_setzero_si128(); + const __m128i spriteMode = (LAYERTYPE == GPULayerType_OBJ) ? _mm_load_si128((__m128i *)sprModePtr) : _mm_setzero_si128(); this->_PixelUnknownEffectWithMask16_SSE2(compInfo, passMask8, src3, src2, src1, src0, - spriteAlpha, srcEffectEnableMask, enableColorEffectMask, + spriteAlpha, + spriteMode, dst[3], dst[2], dst[1], dst[0], dstLayerID_vec128); break; @@ -3920,8 +3925,119 @@ FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo this->_PixelComposite(compInfo, srcColor16, 0, enableColorEffect); } +template +void GPUEngineBase::_CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32) +{ + compInfo.target.xNative = 0; + compInfo.target.xCustom = 0; + compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead; + compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead; + compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead; + +#ifdef ENABLE_SSE2 + const __m128i srcEffectEnableMask = compInfo.renderState.srcEffectEnable_SSE2[GPULayerID_OBJ]; + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=16, compInfo.target.xNative++, compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) + { + __m128i passMask8; + + if (WILLPERFORMWINDOWTEST) + { + // Do the window test. 
+ passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestNative[GPULayerID_OBJ] + i)), _mm_set1_epi8(1) ); + } + else + { + passMask8 = _mm_set1_epi8(0xFF); + } + + const int passMaskValue = _mm_movemask_epi8(passMask8); + + // If none of the pixels within the vector pass, then reject them all at once. + if (passMaskValue == 0) + { + continue; + } + + __m128i src[4]; + + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + src[0] = _mm_load_si128((__m128i *)(srcColorNative16 + i + 0)); + src[1] = _mm_load_si128((__m128i *)(srcColorNative16 + i + 8)); + } + else + { + if (srcColorNative32 != NULL) + { + src[0] = _mm_load_si128((__m128i *)(srcColorNative32 + i + 0)); + src[1] = _mm_load_si128((__m128i *)(srcColorNative32 + i + 4)); + src[2] = _mm_load_si128((__m128i *)(srcColorNative32 + i + 8)); + src[3] = _mm_load_si128((__m128i *)(srcColorNative32 + i + 12)); + } + else + { + const __m128i src16[2] = { + _mm_load_si128((__m128i *)(srcColorNative16 + i + 0)), + _mm_load_si128((__m128i *)(srcColorNative16 + i + 8)) + }; + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); + } + } + } + + // Write out the pixels. 
+ const bool didAllPixelsPass = (passMaskValue == 0xFFFF); + this->_PixelComposite16_SSE2(compInfo, + didAllPixelsPass, + passMask8, + src[3], src[2], src[1], src[0], + srcEffectEnableMask, + this->_enableColorEffectNative[GPULayerID_OBJ] + i, + this->_sprAlpha + i, + this->_sprType + i); + } +#else + if (srcColorNative32 != NULL) + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++, compInfo.target.xNative++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) + { + if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[GPULayerID_OBJ][i] == 0) ) + { + continue; + } + + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[GPULayerID_OBJ][i] != 0) : true; + this->_PixelComposite(compInfo, srcColorNative32[i], this->_sprAlpha[i], enableColorEffect); + } + } + else + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++, compInfo.target.xNative++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) + { + if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[GPULayerID_OBJ][i] == 0) ) + { + continue; + } + + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectNative[GPULayerID_OBJ][i] != 0) : true; + this->_PixelComposite(compInfo, srcColorNative16[i], this->_sprAlpha[i], enableColorEffect); + } + } +#endif +} + template -void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u16 *__restrict srcColorCustom, u8 *__restrict srcIndexCustom) +void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u16 *__restrict srcColorCustom16, u8 *__restrict srcIndexCustom) { if (LAYERTYPE == GPULayerType_BG) { @@ -3985,7 +4101,7 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u1 #endif } - CopyLineExpand<0xFFFF, false, false, 2>(srcColorCustom, this->_deferredColorNative, compInfo.line.widthCustom, 1); + CopyLineExpand<0xFFFF, false, false, 2>(srcColorCustom16, this->_deferredColorNative, compInfo.line.widthCustom, 1); CopyLineExpand<0xFFFF, false, false, 1>(srcIndexCustom, this->_deferredIndexNative, compInfo.line.widthCustom, 1); } @@ -4039,14 +4155,14 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u1 if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - src[0] = _mm_load_si128((__m128i *)(srcColorCustom + compInfo.target.xCustom + 0)); - src[1] = _mm_load_si128((__m128i *)(srcColorCustom + compInfo.target.xCustom + 8)); + src[0] = _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 0)); + src[1] = _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 8)); } else { const __m128i src16[2] = { - _mm_load_si128((__m128i *)(srcColorCustom + compInfo.target.xCustom + 0)), - _mm_load_si128((__m128i *)(srcColorCustom + compInfo.target.xCustom + 8)) + _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 0)), + _mm_load_si128((__m128i *)(srcColorCustom16 + compInfo.target.xCustom + 8)) }; if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) @@ -4067,7 +4183,10 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u1 didAllPixelsPass, 
passMask8, src[3], src[2], src[1], src[0], - srcEffectEnableMask); + srcEffectEnableMask, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); } #endif @@ -4093,7 +4212,7 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u1 } const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, srcColorCustom[compInfo.target.xCustom], 0, enableColorEffect); + this->_PixelComposite(compInfo, srcColorCustom16[compInfo.target.xCustom], 0, enableColorEffect); } } @@ -4193,7 +4312,10 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo didAllPixelsPass, passMask8, src[3], src[2], src[1], src[0], - srcEffectEnableMask); + srcEffectEnableMask, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + this->_sprAlphaCustom + compInfo.target.xCustom, + this->_sprTypeCustom + compInfo.target.xCustom); } #endif @@ -5316,56 +5438,21 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item this->_TransitionLineNativeToCustom(compInfo); } - if (this->isLineRenderNative[compInfo.line.indexNative]) + if (item->nbPixelsX == GPU_FRAMEBUFFER_NATIVE_WIDTH) { - if (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)) + if (this->isLineRenderNative[compInfo.line.indexNative]) { - const FragmentColor *__restrict vramColorPtr = (FragmentColor *)GPU->GetCustomVRAMAddressUsingMappedAddress(this->vramBlockOBJAddress, 0); - - for (size_t i = 0; i < item->nbPixelsX; i++) + if ((OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) && useCustomVRAM) { - const size_t srcX = item->PixelsX[i]; - - if ( WILLPERFORMWINDOWTEST && 
(this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) - { - continue; - } - - compInfo.target.xNative = srcX; - compInfo.target.xCustom = _gpuDstPitchIndex[srcX]; - compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX; - compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; - compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; - - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX], enableColorEffect); + const FragmentColor *__restrict vramColorPtr = (FragmentColor *)GPU->GetCustomVRAMAddressUsingMappedAddress(this->vramBlockOBJAddress, 0); + this->_CompositeNativeLineOBJ(compInfo, NULL, vramColorPtr); + } + else + { + this->_CompositeNativeLineOBJ(compInfo, this->_sprColor, NULL); } } else - { - for (size_t i = 0; i < item->nbPixelsX; i++) - { - const size_t srcX = item->PixelsX[i]; - - if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) - { - continue; - } - - compInfo.target.xNative = srcX; - compInfo.target.xCustom = _gpuDstPitchIndex[srcX]; - compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX; - compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; - compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; - - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - this->_PixelComposite(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); - } - } - } - else - { - if (item->nbPixelsX == GPU_FRAMEBUFFER_NATIVE_WIDTH) { if (useCustomVRAM) { @@ -5385,6 +5472,56 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item this->_CompositeLineDeferred(compInfo, this->_sprColorCustom, NULL); } } + } + else + { + if (this->isLineRenderNative[compInfo.line.indexNative]) + { + if (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)) + { + const FragmentColor *__restrict vramColorPtr = (FragmentColor *)GPU->GetCustomVRAMAddressUsingMappedAddress(this->vramBlockOBJAddress, 0); + + for (size_t i = 0; i < item->nbPixelsX; i++) + { + const size_t srcX = item->PixelsX[i]; + + if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) + { + continue; + } + + compInfo.target.xNative = srcX; + compInfo.target.xCustom = _gpuDstPitchIndex[srcX]; + compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX; + compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; + compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; + + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? 
(this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelComposite(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX], enableColorEffect); + } + } + else + { + for (size_t i = 0; i < item->nbPixelsX; i++) + { + const size_t srcX = item->PixelsX[i]; + + if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) ) + { + continue; + } + + compInfo.target.xNative = srcX; + compInfo.target.xCustom = _gpuDstPitchIndex[srcX]; + compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX; + compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; + compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; + + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelComposite(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); + } + } + } else { void *__restrict dstColorPtr = compInfo.target.lineColorHead; @@ -6956,7 +7093,10 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) didAllPixelsPass, passMask8, src[3], src[2], src[1], src[0], - srcEffectEnableMask); + srcEffectEnableMask, + this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + NULL, + NULL); } #endif diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 70865b2ba..cfbed2f54 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1450,7 +1450,8 @@ protected: TILEENTRY _GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask); template FORCEINLINE void _CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool opaque); - template void _CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u16 *__restrict srcColorCustom, u8 *__restrict srcIndexCustom); + 
template void _CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32); + template void _CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, u16 *__restrict srcColorCustom16, u8 *__restrict srcIndexCustom); template void _CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); template void _RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG); @@ -1516,9 +1517,27 @@ protected: template FORCEINLINE void _PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); template FORCEINLINE void _PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); template FORCEINLINE void _PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &spriteAlpha, const __m128i &srcEffectEnableMask, const __m128i &enableColorEffectMask, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, const bool didAllPixelsPass, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, 
const __m128i &srcEffectEnableMask); + template + FORCEINLINE void _PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, + const __m128i &passMask8, + const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, + const __m128i &srcEffectEnableMask, + const __m128i &enableColorEffectMask, + const __m128i &spriteAlpha, + const __m128i &spriteMode, + __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, + __m128i &dstLayerID); + + template + FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, + const bool didAllPixelsPass, + const __m128i &passMask8, + const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, + const __m128i &srcEffectEnableMask, + const u8 *__restrict enableColorEffectPtr, + const u8 *__restrict sprAlphaPtr, + const u8 *__restrict sprModePtr); #endif template FORCEINLINE void _RenderSpriteUpdatePixel(size_t frameX, const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);