From 0d9d59455f94b55265c25662eda019c344196543 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 1 Jul 2016 00:13:32 +0000 Subject: [PATCH] =?UTF-8?q?GPU:=20-=20Remove=20GPUEngineBase::=5FRenderPix?= =?UTF-8?q?el=5FCheckWindows8=5FSSE2()=20and=20GPUEngineBase::=5FRenderPix?= =?UTF-8?q?el8=5FSSE2().=20I=20don=E2=80=99t=20see=20us=20ever=20needing?= =?UTF-8?q?=20to=20use=20these=20methods=20in=20the=20future.=20-=20Replac?= =?UTF-8?q?e=20patterns=20of=20por(pand,pandn)=20with=20pblendvb=20where?= =?UTF-8?q?=20appropriate.=20(Requires=20SSE4.1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desmume/src/GPU.cpp | 417 +++++----------------------------------- desmume/src/GPU.h | 6 +- desmume/src/types.h | 10 +- desmume/src/version.cpp | 8 +- 4 files changed, 63 insertions(+), 378 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 124f56741..e6200e6e3 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -40,6 +40,14 @@ #include "matrix.h" #include "emufile.h" +// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to +// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit +// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it +// should work fine for both SSE4.1 and SSE2. +#if !defined(_SMMINTRIN_H) && defined(__EMMINTRIN_H) + #define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a))) +#endif + #ifdef FASTBUILD #undef FORCEINLINE #define FORCEINLINE @@ -1862,180 +1870,6 @@ FORCEINLINE void GPUEngineBase::_RenderPixel_CheckWindows16_SSE2(const size_t ds enableColorEffect = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1)); } -template -FORCEINLINE void GPUEngineBase::_RenderPixel_CheckWindows8_SSE2(const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const -{ - // If no windows are enabled, then we don't need to perform any window tests. - // In this case, the pixel always passes and the color effect is always processed. - if (!this->_isAnyWindowEnabled) - { - didPassWindowTest = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF); - enableColorEffect = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF); - return; - } - - u8 didPassValue; - __m128i win_vec128; - - __m128i win0HandledMask = _mm_setzero_si128(); - __m128i win1HandledMask = _mm_setzero_si128(); - __m128i winOBJHandledMask = _mm_setzero_si128(); - __m128i winOUTHandledMask = _mm_setzero_si128(); - - const __m128i dstLoc = _mm_adds_epu16( _mm_set1_epi16(dstX), _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0) ); - - // Window 0 has the highest priority, so always check this first. - if (this->_WIN0_ENABLED) - { - if (this->_isWindowInsideVerticalRange[0]) - { - switch (LAYERID) - { - case GPULayerID_BG0: didPassValue = this->_IORegisterMap->WIN0IN.BG0_Enable; break; - case GPULayerID_BG1: didPassValue = this->_IORegisterMap->WIN0IN.BG1_Enable; break; - case GPULayerID_BG2: didPassValue = this->_IORegisterMap->WIN0IN.BG2_Enable; break; - case GPULayerID_BG3: didPassValue = this->_IORegisterMap->WIN0IN.BG3_Enable; break; - case GPULayerID_OBJ: didPassValue = this->_IORegisterMap->WIN0IN.OBJ_Enable; break; - - default: - didPassValue = 1; - break; - } - - if (ISCUSTOMRENDERINGNEEDED) - { - if (this->_windowLeftCustom[0] > this->_windowRightCustom[0]) - { - win_vec128 = _mm_packs_epi16( _mm_and_si128(_mm_cmplt_epi16(dstLoc, this->_windowLeftCustom_SSE2[0]), _mm_cmpgt_epi16(dstLoc, this->_windowRightCustom_SSE2[0])), - _mm_set1_epi16(0xFFFF) ); - win_vec128 = _mm_xor_si128(win_vec128, _mm_set1_epi32(0xFFFFFFFF)); - } - else - { - win_vec128 = _mm_packs_epi16( _mm_andnot_si128(_mm_cmplt_epi16(dstLoc, this->_windowLeftCustom_SSE2[0]), _mm_cmplt_epi16(dstLoc, this->_windowRightCustom_SSE2[0])), - _mm_setzero_si128() ); - } - - win0HandledMask = win_vec128; - } - else - { - win_vec128 = _mm_loadl_epi64((__m128i *)(this->_curr_win[0] + dstX)); - win0HandledMask = _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)); - } - - didPassWindowTest = _mm_and_si128(win0HandledMask, _mm_set1_epi8(didPassValue)); - enableColorEffect = _mm_and_si128(win0HandledMask, _mm_set1_epi8(this->_IORegisterMap->WIN0IN.Effect_Enable)); - } - else - { - didPassWindowTest = _mm_setzero_si128(); - enableColorEffect = _mm_setzero_si128(); - } - } - - // Window 1 has medium priority, and is checked after Window 0. - if (this->_WIN1_ENABLED && this->_isWindowInsideVerticalRange[1]) - { - switch (LAYERID) - { - case GPULayerID_BG0: didPassValue = this->_IORegisterMap->WIN1IN.BG0_Enable; break; - case GPULayerID_BG1: didPassValue = this->_IORegisterMap->WIN1IN.BG1_Enable; break; - case GPULayerID_BG2: didPassValue = this->_IORegisterMap->WIN1IN.BG2_Enable; break; - case GPULayerID_BG3: didPassValue = this->_IORegisterMap->WIN1IN.BG3_Enable; break; - case GPULayerID_OBJ: didPassValue = this->_IORegisterMap->WIN1IN.OBJ_Enable; break; - - default: - didPassValue = 1; - break; - } - - if (ISCUSTOMRENDERINGNEEDED) - { - if (this->_windowLeftCustom[1] > this->_windowRightCustom[1]) - { - win_vec128 = _mm_packs_epi16( _mm_and_si128(_mm_cmplt_epi16(dstLoc, this->_windowLeftCustom_SSE2[1]), _mm_cmpgt_epi16(dstLoc, this->_windowRightCustom_SSE2[1])), - _mm_set1_epi16(0xFFFF) ); - win_vec128 = _mm_xor_si128(win_vec128, _mm_set1_epi32(0xFFFFFFFF)); - } - else - { - win_vec128 = _mm_packs_epi16( _mm_andnot_si128(_mm_cmplt_epi16(dstLoc, this->_windowLeftCustom_SSE2[1]), _mm_cmplt_epi16(dstLoc, this->_windowRightCustom_SSE2[1])), - _mm_setzero_si128() ); - } - - win1HandledMask = _mm_andnot_si128(win0HandledMask, win_vec128); - } - else - { - win_vec128 = _mm_loadl_epi64((__m128i *)(this->_curr_win[1] + dstX)); - win1HandledMask = _mm_andnot_si128( win0HandledMask, _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)) ); - } - - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(win1HandledMask, _mm_set1_epi8(didPassValue)) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(win1HandledMask, _mm_set1_epi8(this->_IORegisterMap->WIN1IN.Effect_Enable)) ); - } - - // Window OBJ has low priority, and is checked after both Window 0 and Window 1. - if (this->_WINOBJ_ENABLED) - { - switch (LAYERID) - { - case GPULayerID_BG0: didPassValue = this->_IORegisterMap->WINOBJ.BG0_Enable; break; - case GPULayerID_BG1: didPassValue = this->_IORegisterMap->WINOBJ.BG1_Enable; break; - case GPULayerID_BG2: didPassValue = this->_IORegisterMap->WINOBJ.BG2_Enable; break; - case GPULayerID_BG3: didPassValue = this->_IORegisterMap->WINOBJ.BG3_Enable; break; - case GPULayerID_OBJ: didPassValue = this->_IORegisterMap->WINOBJ.OBJ_Enable; break; - - default: - didPassValue = 1; - break; - } - - if (ISCUSTOMRENDERINGNEEDED) - { - win_vec128 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - this->_sprWin[_gpuDstToSrcIndex[dstX+7]], - this->_sprWin[_gpuDstToSrcIndex[dstX+6]], - this->_sprWin[_gpuDstToSrcIndex[dstX+5]], - this->_sprWin[_gpuDstToSrcIndex[dstX+4]], - this->_sprWin[_gpuDstToSrcIndex[dstX+3]], - this->_sprWin[_gpuDstToSrcIndex[dstX+2]], - this->_sprWin[_gpuDstToSrcIndex[dstX+1]], - this->_sprWin[_gpuDstToSrcIndex[dstX+0]]); - } - else - { - win_vec128 = _mm_loadl_epi64((__m128i *)(this->_sprWin + dstX)); - } - - winOBJHandledMask = _mm_andnot_si128( _mm_or_si128(win0HandledMask, win1HandledMask), _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)) ); - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOBJHandledMask, _mm_set1_epi8(didPassValue)) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOBJHandledMask, _mm_set1_epi8(this->_IORegisterMap->WINOBJ.Effect_Enable)) ); - } - - // If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags. - // This has the lowest priority, and is always checked last. - switch (LAYERID) - { - case GPULayerID_BG0: didPassValue = this->_IORegisterMap->WINOUT.BG0_Enable; break; - case GPULayerID_BG1: didPassValue = this->_IORegisterMap->WINOUT.BG1_Enable; break; - case GPULayerID_BG2: didPassValue = this->_IORegisterMap->WINOUT.BG2_Enable; break; - case GPULayerID_BG3: didPassValue = this->_IORegisterMap->WINOUT.BG3_Enable; break; - case GPULayerID_OBJ: didPassValue = this->_IORegisterMap->WINOUT.OBJ_Enable; break; - - default: - break; - } - - winOUTHandledMask = _mm_and_si128( _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), _mm_xor_si128(_mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF)) ); - didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(didPassValue)) ); - enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(this->_IORegisterMap->WINOUT.Effect_Enable)) ); - - didPassWindowTest = _mm_cmpeq_epi8(didPassWindowTest, _mm_set1_epi8(1)); - enableColorEffect = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1)); -} - #endif /*****************************************************************************/ @@ -2321,9 +2155,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX, { // If we're rendering pixels to a debugging context, then assume that the pixel // always passes the window test and that the color effect is always disabled. - _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_or_si128(_mm_and_si128(passMask16_0, _mm_or_si128(srcColorLo_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16_0, dstColorLo_vec128)) ); - _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_or_si128(_mm_and_si128(passMask16_1, _mm_or_si128(srcColorHi_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16_1, dstColorHi_vec128)) ); - _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); + _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_blendv_epi8(dstColorLo_vec128, _mm_or_si128(srcColorLo_vec128, _mm_set1_epi16(0x8000)), passMask16_0) ); + _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_blendv_epi8(dstColorHi_vec128, _mm_or_si128(srcColorHi_vec128, _mm_set1_epi16(0x8000)), passMask16_1) ); + _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_blendv_epi8(dstLayerID_vec128, _mm_set1_epi8(LAYERID), passMask8) ); return; } @@ -2342,9 +2176,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX, if ((LAYERID != GPULayerID_OBJ) && COLOREFFECTDISABLEDHINT) { - _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_or_si128(_mm_and_si128(passMask16_0, _mm_or_si128(srcColorLo_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16_0, dstColorLo_vec128)) ); - _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_or_si128(_mm_and_si128(passMask16_1, _mm_or_si128(srcColorHi_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16_1, dstColorHi_vec128)) ); - _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); + _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_blendv_epi8(dstColorLo_vec128, _mm_or_si128(srcColorLo_vec128, _mm_set1_epi16(0x8000)), passMask16_0) ); + _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_blendv_epi8(dstColorHi_vec128, _mm_or_si128(srcColorHi_vec128, _mm_set1_epi16(0x8000)), passMask16_1) ); + _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_blendv_epi8(dstLayerID_vec128, _mm_set1_epi8(LAYERID), passMask8) ); return; } @@ -2405,7 +2239,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX, } else { - colorEffect_vec128 = _mm_or_si128( _mm_and_si128(enableColorEffectMask, _mm_set1_epi8(BLDCNT.ColorEffect)), _mm_andnot_si128(enableColorEffectMask, _mm_set1_epi8(ColorEffect_Disable)) ); + colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(BLDCNT.ColorEffect), enableColorEffectMask); } __m128i eva_vec128 = _mm_set1_epi16(this->_BLDALPHA_EVA); @@ -2421,8 +2255,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX, const __m128i srcAlpha_vec128 = _mm_loadu_si128((__m128i *)(srcAlpha + dstX)); const __m128i srcAlphaMask = _mm_and_si128( isObjTranslucentMask, _mm_xor_si128(_mm_cmpeq_epi8(srcAlpha_vec128, _mm_set1_epi8(0xFF)), _mm_set1_epi32(0xFFFFFFFF)) ); - eva_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, srcAlpha_vec128), _mm_andnot_si128(srcAlphaMask, eva_vec128) ); - evb_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, _mm_sub_epi8(_mm_set1_epi8(16), srcAlpha_vec128)), _mm_andnot_si128(srcAlphaMask, evb_vec128) ); + eva_vec128 = _mm_blendv_epi8(eva_vec128, srcAlpha_vec128, srcAlphaMask); + evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), srcAlpha_vec128), srcAlphaMask); } __m128i brightnessMask = _mm_setzero_si128(); @@ -2460,159 +2294,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX, const __m128i combinedSrcColorLo_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsLo, blendPixelsLo), disablePixelsLo), _mm_set1_epi16(0x8000) ); const __m128i combinedSrcColorHi_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsHi, blendPixelsHi), disablePixelsHi), _mm_set1_epi16(0x8000) ); - _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_or_si128(_mm_and_si128(passMask16_0, combinedSrcColorLo_vec128), _mm_andnot_si128(passMask16_0, dstColorLo_vec128)) ); - _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_or_si128(_mm_and_si128(passMask16_1, combinedSrcColorHi_vec128), _mm_andnot_si128(passMask16_1, dstColorHi_vec128)) ); - _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); -} - -template -FORCEINLINE void GPUEngineBase::_RenderPixel8_SSE2(const size_t dstX, - const __m128i &srcColor_vec128, - const u8 *__restrict srcAlpha, - void *__restrict dstColorLine, - u8 *__restrict dstLayerIDLine, - __m128i &passMask8) -{ - const __m128i dstColor_vec128 = _mm_loadu_si128((__m128i *)dstColorLine); - const __m128i dstLayerID_vec128 = _mm_loadl_epi64((__m128i *)dstLayerIDLine); - __m128i passMask16 = _mm_unpacklo_epi8(passMask8, passMask8); - - if (ISDEBUGRENDER) - { - // If we're rendering pixels to a debugging context, then assume that the pixel - // always passes the window test and that the color effect is always disabled. - _mm_storeu_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(passMask16, _mm_or_si128(srcColor_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16, dstColor_vec128)) ); - _mm_storel_epi64( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); - return; - } - - __m128i enableColorEffectMask = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF); - - if (!NOWINDOWSENABLEDHINT) - { - // Do the window test. - __m128i didPassWindowTest = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF); - this->_RenderPixel_CheckWindows8_SSE2(dstX, didPassWindowTest, enableColorEffectMask); - - passMask8 = _mm_and_si128(passMask8, didPassWindowTest); - passMask16 = _mm_unpacklo_epi8(passMask8, passMask8); - } - - if ((LAYERID != GPULayerID_OBJ) && COLOREFFECTDISABLEDHINT) - { - _mm_storeu_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(passMask16, _mm_or_si128(srcColor_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(passMask16, dstColor_vec128)) ); - _mm_storel_epi64( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); - return; - } - - const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT; - u8 srcEffectEnableValue; - - switch (LAYERID) - { - case GPULayerID_BG0: - srcEffectEnableValue = BLDCNT.BG0_Target1; - break; - - case GPULayerID_BG1: - srcEffectEnableValue = BLDCNT.BG1_Target1; - break; - - case GPULayerID_BG2: - srcEffectEnableValue = BLDCNT.BG2_Target1; - break; - - case GPULayerID_BG3: - srcEffectEnableValue = BLDCNT.BG3_Target1; - break; - - case GPULayerID_OBJ: - srcEffectEnableValue = BLDCNT.OBJ_Target1; - break; - - default: - srcEffectEnableValue = 0; - break; - } - const __m128i srcEffectEnableMask = _mm_cmpeq_epi16(_mm_set1_epi16(srcEffectEnableValue), _mm_set1_epi16(1)); - -#ifdef ENABLE_SSSE3 - __m128i dstEffectEnableMask = _mm_unpacklo_epi8( _mm_shuffle_epi8(this->_blend2_SSSE3, dstLayerID_vec128), _mm_setzero_si128() ); -#else - __m128i dstEffectEnableMask = _mm_set_epi16(this->_blend2[dstLayerIDLine[7]], - this->_blend2[dstLayerIDLine[6]], - this->_blend2[dstLayerIDLine[5]], - this->_blend2[dstLayerIDLine[4]], - this->_blend2[dstLayerIDLine[3]], - this->_blend2[dstLayerIDLine[2]], - this->_blend2[dstLayerIDLine[1]], - this->_blend2[dstLayerIDLine[0]]); -#endif - - dstEffectEnableMask = _mm_and_si128( _mm_xor_si128(_mm_cmpeq_epi16(_mm_unpacklo_epi8(dstLayerID_vec128, _mm_setzero_si128()), _mm_set1_epi16(LAYERID)), _mm_set1_epi32(0xFFFFFFFF)), - _mm_xor_si128(_mm_cmpeq_epi16(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF)) ); - - // Select the color effect based on the BLDCNT target flags. - __m128i forceBlendEffectMask = _mm_setzero_si128(); - __m128i colorEffect_vec128; - - if (NOWINDOWSENABLEDHINT) - { - colorEffect_vec128 = _mm_set1_epi16(BLDCNT.ColorEffect); - } - else - { - colorEffect_vec128 = _mm_or_si128( _mm_and_si128(enableColorEffectMask, _mm_set1_epi16(BLDCNT.ColorEffect)), _mm_andnot_si128(enableColorEffectMask, _mm_set1_epi16(ColorEffect_Disable)) ); - } - - __m128i eva_vec128 = _mm_set1_epi16(this->_BLDALPHA_EVA); - __m128i evb_vec128 = _mm_set1_epi16(this->_BLDALPHA_EVB); - const __m128i evy_vec128 = _mm_set1_epi16(this->_BLDALPHA_EVY); - - if (LAYERID == GPULayerID_OBJ) - { - const __m128i objMode_vec128 = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(this->_sprType + dstX)), _mm_setzero_si128() ); - const __m128i isObjTranslucentMask = _mm_and_si128( dstEffectEnableMask, _mm_or_si128(_mm_cmpeq_epi16(objMode_vec128, _mm_set1_epi16(OBJMode_Transparent)), _mm_cmpeq_epi16(objMode_vec128, _mm_set1_epi16(OBJMode_Bitmap))) ); - forceBlendEffectMask = isObjTranslucentMask; - - const __m128i srcAlpha_vec128 = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(srcAlpha + dstX)), _mm_setzero_si128() ); - const __m128i srcAlphaMask = _mm_and_si128( isObjTranslucentMask, _mm_xor_si128(_mm_cmpeq_epi16(srcAlpha_vec128, _mm_set1_epi16(0x00FF)), _mm_set1_epi32(0xFFFFFFFF)) ); - - eva_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, srcAlpha_vec128), _mm_andnot_si128(srcAlphaMask, eva_vec128) ); - evb_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, _mm_sub_epi16(_mm_set1_epi16(16), srcAlpha_vec128)), _mm_andnot_si128(srcAlphaMask, evb_vec128) ); - } - - __m128i brightnessMask = _mm_setzero_si128(); - __m128i brightnessPixels = _mm_setzero_si128(); - - switch (BLDCNT.ColorEffect) - { - case ColorEffect_IncreaseBrightness: - brightnessMask = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi16(colorEffect_vec128, _mm_set1_epi16(ColorEffect_IncreaseBrightness))) ); - brightnessPixels = _mm_and_si128( brightnessMask, this->_ColorEffectIncreaseBrightness(srcColor_vec128, evy_vec128) ); - break; - - case ColorEffect_DecreaseBrightness: - brightnessMask = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi16(colorEffect_vec128, _mm_set1_epi16(ColorEffect_DecreaseBrightness))) ); - brightnessPixels = _mm_and_si128( brightnessMask, this->_ColorEffectDecreaseBrightness(srcColor_vec128, evy_vec128) ); - break; - - default: - break; - } - - // Render the pixel using the selected color effect. - const __m128i blendMask = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi16(colorEffect_vec128, _mm_set1_epi16(ColorEffect_Blend))) ); - const __m128i blendPixels = _mm_and_si128( blendMask, this->_ColorEffectBlend(srcColor_vec128, dstColor_vec128, eva_vec128, evb_vec128) ); - - const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi32(0xFFFFFFFF) ); - const __m128i disablePixels = _mm_and_si128(disableMask, srcColor_vec128); - - // Combine the final colors. - const __m128i combinedSrcColorLo_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixels, blendPixels), disablePixels), _mm_set1_epi16(0x8000) ); - - _mm_storeu_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(passMask16, combinedSrcColorLo_vec128), _mm_andnot_si128(passMask16, dstColor_vec128)) ); - _mm_storel_epi64( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passMask8, dstLayerID_vec128)) ); + _mm_store_si128( (__m128i *)dstColorLine + 0, _mm_blendv_epi8(dstColorLo_vec128, combinedSrcColorLo_vec128, passMask16_0) ); + _mm_store_si128( (__m128i *)dstColorLine + 1, _mm_blendv_epi8(dstColorHi_vec128, combinedSrcColorHi_vec128, passMask16_1) ); + _mm_store_si128( (__m128i *)dstLayerIDLine, _mm_blendv_epi8(dstLayerID_vec128, _mm_set1_epi8(LAYERID), passMask8) ); } #endif @@ -2810,7 +2494,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const NDSColorFormat srcForm dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask ); // Select the color effect based on the BLDCNT target flags. - const __m128i colorEffect_vec128 = _mm_or_si128( _mm_and_si128(enableColorEffectMask, _mm_set1_epi8(BLDCNT.ColorEffect)), _mm_andnot_si128(enableColorEffectMask, _mm_set1_epi8(ColorEffect_Disable)) ); + const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(BLDCNT.ColorEffect), enableColorEffectMask); const __m128i forceBlendEffectMask = _mm_and_si128(enableColorEffectMask, dstEffectEnableMask); const __m128i evy_vec128 = _mm_set1_epi16(this->_BLDALPHA_EVY); @@ -2854,9 +2538,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const NDSColorFormat srcForm srcColorLo_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsLo, blendPixelsLo), disablePixelsLo), _mm_set1_epi16(0x8000) ); srcColorHi_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsHi, blendPixelsHi), disablePixelsHi), _mm_set1_epi16(0x8000) ); - dst0 = _mm_or_si128(_mm_and_si128(passMask16_0, srcColorLo_vec128), _mm_andnot_si128(passMask16_0, dst0)); - dst1 = _mm_or_si128(_mm_and_si128(passMask16_1, srcColorHi_vec128), _mm_andnot_si128(passMask16_1, dst1)); - dstLayerID = _mm_or_si128(_mm_and_si128(passMask8, _mm_set1_epi8(GPULayerID_BG0)), _mm_andnot_si128(passMask8, dstLayerID)); + dst0 = _mm_blendv_epi8(dst0, srcColorLo_vec128, passMask16_0); + dst1 = _mm_blendv_epi8(dst1, srcColorHi_vec128, passMask16_1); + dstLayerID = _mm_blendv_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0), passMask8); } #endif @@ -3062,7 +2746,7 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res const __m128i col_vec128 = _mm_load_si128((__m128i *)(this->_bgLayerColor + x)); const __m128i idxMask = _mm_cmpeq_epi16(_mm_unpacklo_epi8(index_vec128, _mm_setzero_si128()), _mm_setzero_si128()); - const __m128i tmpColor_vec128 = _mm_or_si128( _mm_and_si128(idxMask, _mm_set1_epi16(0xFFFF)), _mm_andnot_si128(idxMask, _mm_and_si128(_mm_set1_epi16(0x7FFF), col_vec128)) ); + const __m128i tmpColor_vec128 = _mm_blendv_epi8(_mm_and_si128(col_vec128, _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask); const __m128i mosaicWidthMask = _mm_cmpeq_epi16( _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_loadu_si128((__m128i *)(this->_mosaicWidthBG + x))), _mm_setzero_si128() ); const __m128i mosaicHeightMask = _mm_cmpeq_epi16(_mm_set1_epi16(this->_mosaicHeightBG[lineIndex].begin), _mm_setzero_si128()); @@ -3080,7 +2764,7 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res const __m128i mosaicColor_vec128 = _mm_loadu_si128((__m128i *)(this->_mosaicColors.bg[LAYERID] + x)); const __m128i mosaicColorMask = _mm_cmpeq_epi16(mosaicColor_vec128, _mm_set1_epi16(0xFFFF)); _mm_storel_epi64( (__m128i *)(this->_bgLayerIndex + x), _mm_andnot_si128(_mm_packs_epi16(mosaicColorMask, _mm_setzero_si128()), index_vec128) ); - _mm_store_si128( (__m128i *)(this->_bgLayerColor + x), _mm_or_si128(_mm_and_si128(mosaicColorMask, col_vec128), _mm_andnot_si128(mosaicColorMask, mosaicColor_vec128)) ); + _mm_store_si128( (__m128i *)(this->_bgLayerColor + x), _mm_blendv_epi8(mosaicColor_vec128, col_vec128, mosaicColorMask) ); } #ifdef ENABLE_SSSE3 @@ -3650,12 +3334,9 @@ void GPUEngineBase::_RenderSpriteBMP(const u8 spriteNum, const u16 l, u16 *__res const size_t ssePixCount = lg - (lg % 8); for (; i < ssePixCount; i += 8, x += 8, sprX += 8) { - __m128i color_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); - const __m128i colorAlpha_vec128 = _mm_and_si128(color_vec128, _mm_set1_epi16(0x8000)); - const __m128i colorAlphaCompare = _mm_cmpeq_epi16(colorAlpha_vec128, _mm_set1_epi16(0x8000)); - color_vec128 = _mm_or_si128( _mm_and_si128(colorAlphaCompare, color_vec128), _mm_andnot_si128(colorAlphaCompare, _mm_loadu_si128((__m128i *)(dst + sprX))) ); - - _mm_storeu_si128((__m128i *)(dst + sprX), color_vec128); + const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); + const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) ); + _mm_storeu_si128( (__m128i *)(dst + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX)), color_vec128, alphaCompare) ); } } else @@ -3665,36 +3346,26 @@ void GPUEngineBase::_RenderSpriteBMP(const u8 spriteNum, const u16 l, u16 *__res const size_t ssePixCount = lg - (lg % 16); for (; i < ssePixCount; i += 16, x += 16, sprX += 16) { - __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + sprX)); + const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + sprX)); + const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); + const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x + 8)); + const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128); + const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) ); - __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); - __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x + 8)); - - const __m128i colorAlphaLo_vec128 = _mm_and_si128(colorLo_vec128, _mm_set1_epi16(0x8000)); - const __m128i colorAlphaHi_vec128 = _mm_and_si128(colorHi_vec128, _mm_set1_epi16(0x8000)); - - const __m128i colorAlphaLoCompare = _mm_cmpeq_epi16(colorAlphaLo_vec128, _mm_set1_epi16(0x8000)); - const __m128i colorAlphaHiCompare = _mm_cmpeq_epi16(colorAlphaHi_vec128, _mm_set1_epi16(0x8000)); - const __m128i colorAlphaPackedCompare = _mm_cmpeq_epi8( _mm_packs_epi16(colorAlphaLoCompare, colorAlphaHiCompare), _mm_set1_epi8(0xFF) ); - - const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, colorAlphaPackedCompare); + const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, alphaCompare); const __m128i combinedLoCompare = _mm_unpacklo_epi8(combinedPackedCompare, combinedPackedCompare); const __m128i combinedHiCompare = _mm_unpackhi_epi8(combinedPackedCompare, combinedPackedCompare); - colorLo_vec128 = _mm_or_si128( _mm_and_si128(combinedLoCompare, colorLo_vec128), _mm_andnot_si128(combinedLoCompare, _mm_loadu_si128((__m128i *)(dst + sprX))) ); - colorHi_vec128 = _mm_or_si128( _mm_and_si128(combinedHiCompare, colorHi_vec128), _mm_andnot_si128(combinedHiCompare, _mm_loadu_si128((__m128i *)(dst + sprX + 8))) ); - const __m128i dstAlpha_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(alpha + 1)), _mm_andnot_si128(combinedPackedCompare, _mm_loadu_si128((__m128i *)(dst_alpha + sprX))) ); - const __m128i dstTypeTab_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(OBJMode_Bitmap)), _mm_andnot_si128(combinedPackedCompare, _mm_loadu_si128((__m128i *)(typeTab + sprX))) ); - prioTab_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, prio_vec128), _mm_andnot_si128(combinedPackedCompare, prioTab_vec128) ); - const __m128i sprNum_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(spriteNum)), _mm_andnot_si128(combinedPackedCompare, _mm_loadu_si128((__m128i *)(this->_sprNum + sprX))) ); - - _mm_storeu_si128((__m128i *)(dst + sprX), colorLo_vec128); - _mm_storeu_si128((__m128i *)(dst + sprX + 8), colorHi_vec128); - _mm_storeu_si128((__m128i *)(dst_alpha + sprX), dstAlpha_vec128); - _mm_storeu_si128((__m128i *)(typeTab + sprX), dstTypeTab_vec128); - _mm_storeu_si128((__m128i *)(prioTab + sprX), prioTab_vec128); - _mm_storeu_si128((__m128i *)(this->_sprNum + sprX), sprNum_vec128); + // Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because + // maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite + // to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs. + _mm_storeu_si128( (__m128i *)(dst + sprX + 0), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 0)), colorLo_vec128, combinedLoCompare) ); + _mm_storeu_si128( (__m128i *)(dst + sprX + 8), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 8)), colorHi_vec128, combinedHiCompare) ); + _mm_storeu_si128( (__m128i *)(dst_alpha + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + sprX)), _mm_set1_epi8(alpha + 1), combinedPackedCompare) ); + _mm_storeu_si128( (__m128i *)(typeTab + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + sprX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) ); + _mm_storeu_si128( (__m128i *)(prioTab + sprX), _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) ); + _mm_storeu_si128( (__m128i *)(this->_sprNum + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + sprX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) ); } } } diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 8ab57b1ab..8f49ab225 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -34,6 +34,10 @@ #include #endif +#ifdef ENABLE_SSE4_1 +#include +#endif + class GPUEngineBase; class EMUFILE; struct MMU_struct; @@ -1341,9 +1345,7 @@ protected: template FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY); template FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY); template FORCEINLINE void _RenderPixel_CheckWindows16_SSE2(const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const; - template FORCEINLINE void _RenderPixel_CheckWindows8_SSE2(const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const; template FORCEINLINE void _RenderPixel16_SSE2(const size_t dstX, const __m128i &srcColorHi_vec128, const __m128i &srcColorLo_vec128, const u8 *__restrict srcAlpha, void *__restrict dstColorLine, u8 *__restrict dstLayerIDLine, __m128i &passMask8); - template FORCEINLINE void _RenderPixel8_SSE2(const size_t dstX, const __m128i &srcColor_vec128, const u8 *__restrict srcAlpha, void *__restrict dstColorLine, u8 *__restrict dstLayerIDLine, __m128i &passMask8); template FORCEINLINE void _RenderPixel3D_SSE2(const NDSColorFormat srcFormat, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID, __m128i &passMask8, __m128i &enableColorEffectMask); #endif diff --git a/desmume/src/types.h b/desmume/src/types.h index fd4a8381f..ce794d5a7 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -68,6 +68,14 @@ #ifdef __SSSE3__ #define ENABLE_SSSE3 #endif + + #ifdef __SSE4_1__ + #define ENABLE_SSE4_1 + #endif + + #ifdef __SSE4_2__ + #define ENABLE_SSE4_2 + #endif #endif #ifdef _MSC_VER @@ -164,7 +172,7 @@ #ifndef LOOPVECTORIZE_DISABLE #if defined(_MSC_VER) - #if _MSC_VER >= 1700 + #if _MSC_VER >= 1700 #define LOOPVECTORIZE_DISABLE loop(no_vector) #else #define LOOPVECTORIZE_DISABLE diff --git a/desmume/src/version.cpp b/desmume/src/version.cpp index 80b5de0f5..2051e59ae 100644 --- a/desmume/src/version.cpp +++ b/desmume/src/version.cpp @@ -78,9 +78,13 @@ #undef DESMUME_SSE_STRING #define DESMUME_SSE_STRING " SSSE3" #endif -#ifdef ENABLE_SSE4 +#ifdef ENABLE_SSE4_1 #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE4" + #define DESMUME_SSE_STRING " SSE4.1" +#endif +#ifdef ENABLE_SSE4_2 + #undef DESMUME_SSE_STRING + #define DESMUME_SSE_STRING " SSE4.2" #endif #ifdef ENABLE_AVX #undef DESMUME_AVX_STRING