From 5b1eb553516c125e0859e7b32e5991aab84664da Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 18 Sep 2021 13:57:07 -0700 Subject: [PATCH] GPU Operations (SSE2): Apply the same code optimizations to the PixelOperation_SSE2::_unknownEffectMask16() and PixelOperation_SSE2::_unknownEffectMask32() methods as their corresponding AVX2 versions. - Also fixes a bug in PixelOperation_SSE2::_unknownEffectMask32() that would cause 3D layers to appear black if the user was running 15-bit color mode. (Regression from commit 0db9872.) --- desmume/src/GPU_Operations_SSE2.cpp | 632 +++++++++++++++------------- 1 file changed, 336 insertions(+), 296 deletions(-) diff --git a/desmume/src/GPU_Operations_SSE2.cpp b/desmume/src/GPU_Operations_SSE2.cpp index 9f452ce4d..9c2c17297 100644 --- a/desmume/src/GPU_Operations_SSE2.cpp +++ b/desmume/src/GPU_Operations_SSE2.cpp @@ -859,15 +859,15 @@ FORCEINLINE v128u32 ColorOperation_SSE2::blend3D(const v128u32 &colA, const v128 FORCEINLINE v128u16 ColorOperation_SSE2::increase(const v128u16 &col, const v128u16 &blendEVY) const { - v128u16 r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); - v128u16 g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); - v128u16 b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); + v128u16 r = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); + v128u16 b = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); - r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) ); - g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) ); - b_vec128 = _mm_add_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), b_vec128), blendEVY), 4) ); + r = _mm_add_epi16( r, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r), blendEVY), 4) ); + g = _mm_add_epi16( g, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g), blendEVY), 4) ); + b = _mm_add_epi16( b, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), b), blendEVY), 4) ); - return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); + return _mm_or_si128(r, _mm_or_si128( _mm_slli_epi16(g, 5), _mm_slli_epi16(b, 10)) ); } template @@ -884,15 +884,15 @@ FORCEINLINE v128u32 ColorOperation_SSE2::increase(const v128u32 &col, const v128 FORCEINLINE v128u16 ColorOperation_SSE2::decrease(const v128u16 &col, const v128u16 &blendEVY) const { - v128u16 r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); - v128u16 g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); - v128u16 b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); + v128u16 r = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) ); + v128u16 b = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) ); - r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) ); - g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) ); - b_vec128 = _mm_sub_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(b_vec128, blendEVY), 4) ); + r = _mm_sub_epi16( r, _mm_srli_epi16(_mm_mullo_epi16(r, blendEVY), 4) ); + g = _mm_sub_epi16( g, _mm_srli_epi16(_mm_mullo_epi16(g, blendEVY), 4) ); + b = _mm_sub_epi16( b, _mm_srli_epi16(_mm_mullo_epi16(b, blendEVY), 4) ); - return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) ); + return _mm_or_si128(r, _mm_or_si128( _mm_slli_epi16(g, 5), _mm_slli_epi16(b, 10)) ); } template @@ -1460,9 +1460,6 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn #endif dstTargetBlendEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID), dstTargetBlendEnableMask ); - - // Select the color effect based on the BLDCNT target flags. - const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm_setzero_si128(); // Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers. @@ -1481,6 +1478,9 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask); } + // Select the color effect based on the BLDCNT target flags. + const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); + // ---------- __m128i tmpSrc[4]; @@ -1508,30 +1508,34 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn case ColorEffect_IncreaseBrightness: { const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) ); + const int brightnessUpMaskValue = _mm_movemask_epi8(brightnessMask8); - const v128u16 brightnessMask16[2] = { - _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), - _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + if (brightnessUpMaskValue != 0x00000000) { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); - } - else - { - const v128u32 brightnessMask32[4] = { - _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + const v128u16 brightnessMask16[2] = { + _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), + _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v128u32 brightnessMask32[4] = { + _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), + _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + } } break; } @@ -1539,30 +1543,34 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn case ColorEffect_DecreaseBrightness: { const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) ); + const int brightnessDownMaskValue = _mm_movemask_epi8(brightnessMask8); - const v128u16 brightnessMask16[2] = { - _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), - _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + if (brightnessDownMaskValue != 0x00000000) { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); - } - else - { - const v128u32 brightnessMask32[4] = { - _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + const v128u16 brightnessMask16[2] = { + _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), + _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v128u32 brightnessMask32[4] = { + _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), + _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + } } break; } @@ -1573,11 +1581,7 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn // Render the pixel using the selected color effect. const v128u8 blendMask8 = _mm_or_si128( forceDstTargetBlendMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstTargetBlendEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) ); - - const v128u16 blendMask16[2] = { - _mm_unpacklo_epi8(blendMask8, blendMask8), - _mm_unpackhi_epi8(blendMask8, blendMask8) - }; + const int blendMaskValue = _mm_movemask_epi8(blendMask8); if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { @@ -1586,43 +1590,51 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1) }; - v128u16 blendSrc16[2]; - - switch (LAYERTYPE) + if (blendMaskValue != 0x00000000) { - case GPULayerType_3D: - //blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); - //blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); - printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); - assert(false); - break; - - case GPULayerType_BG: - blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128); - blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: + const v128u16 blendMask16[2] = { + _mm_unpacklo_epi8(blendMask8, blendMask8), + _mm_unpackhi_epi8(blendMask8, blendMask8) + }; + + v128u16 blendSrc16[2]; + + switch (LAYERTYPE) { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - const v128u16 tempEVA[2] = { - _mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128()) - }; - const v128u16 tempEVB[2] = { - _mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128()) - }; - - blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]); - blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]); - break; + case GPULayerType_3D: + //blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); + //blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); + printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); + assert(false); + break; + + case GPULayerType_BG: + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + const v128u16 tempEVA[2] = { + _mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()), + _mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128()) + }; + const v128u16 tempEVB[2] = { + _mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()), + _mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128()) + }; + + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]); + break; + } } + + tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); + tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); } - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); - // Store the final colors. const v128u16 passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), @@ -1635,6 +1647,11 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn } else { + const v128u16 blendMask16[2] = { + _mm_unpacklo_epi8(blendMask8, blendMask8), + _mm_unpackhi_epi8(blendMask8, blendMask8) + }; + const v128u32 dst32[4] = { _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 0), _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 1), @@ -1642,73 +1659,76 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask16(GPUEngineCompositorIn _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3), }; - v128u32 blendSrc32[4]; - - switch (LAYERTYPE) + if (blendMaskValue != 0x00000000) { - case GPULayerType_3D: - //blendSrc32[0] = colorop_vec.blend3D(src0, dst32[0]); - //blendSrc32[1] = colorop_vec.blend3D(src1, dst32[1]); - //blendSrc32[2] = colorop_vec.blend3D(src2, dst32[2]); - //blendSrc32[3] = colorop_vec.blend3D(src3, dst32[3]); - printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); - assert(false); - break; - - case GPULayerType_BG: - blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128); - blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128); - blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128); - blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: + v128u32 blendSrc32[4]; + + switch (LAYERTYPE) { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - // - // Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only - // going to send the 4 correspending EVA/EVB vectors as well. In this case, each individual - // EVA/EVB value is mirrored for each adjacent 16-bit boundary. - v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128); - v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128); - - const v128u16 tempEVA[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128); - tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128); - - const v128u16 tempEVB[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); - blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); - blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); - blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); - break; + case GPULayerType_3D: + //blendSrc32[0] = colorop_vec.blend3D(src0, dst32[0]); + //blendSrc32[1] = colorop_vec.blend3D(src1, dst32[1]); + //blendSrc32[2] = colorop_vec.blend3D(src2, dst32[2]); + //blendSrc32[3] = colorop_vec.blend3D(src3, dst32[3]); + printf("GPU: 3D layers cannot be in RGBA5551 format. To composite a 3D layer, use the _unknownEffectMask32() method instead.\n"); + assert(false); + break; + + case GPULayerType_BG: + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + // + // Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only + // going to send the 4 correspending EVA/EVB vectors as well. In this case, each individual + // EVA/EVB value is mirrored for each adjacent 16-bit boundary. + v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128); + v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128); + + const v128u16 tempEVA[4] = { + _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) + }; + + tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128); + tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128); + + const v128u16 tempEVB[4] = { + _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) + }; + + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); + break; + } } + + const v128u32 blendMask32[4] = { + _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), + _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), + _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), + _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); + tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); + tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); + tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); } - const v128u32 blendMask32[4] = { - _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), - _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), - _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), - _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) - }; - - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); - tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); - tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); - // Store the final colors. const v128u16 passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), @@ -1759,9 +1779,6 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn #endif dstTargetBlendEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID), dstTargetBlendEnableMask ); - - // Select the color effect based on the BLDCNT target flags. - const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); v128u8 forceDstTargetBlendMask = (LAYERTYPE == GPULayerType_3D) ? dstTargetBlendEnableMask : _mm_setzero_si128(); // Do note that OBJ layers can modify EVA or EVB, meaning that these blend values may not be constant for OBJ layers. @@ -1780,6 +1797,9 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask); } + // Select the color effect based on the BLDCNT target flags. + const v128u8 colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); + // ---------- __m128i tmpSrc[4]; @@ -1788,8 +1808,8 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn { tmpSrc[0] = ColorspaceConvert6665To5551_SSE2(src0, src1); tmpSrc[1] = ColorspaceConvert6665To5551_SSE2(src2, src3); - tmpSrc[0] = _mm_setzero_si128(); - tmpSrc[1] = _mm_setzero_si128(); + tmpSrc[2] = _mm_setzero_si128(); + tmpSrc[3] = _mm_setzero_si128(); } else { @@ -1804,30 +1824,34 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn case ColorEffect_IncreaseBrightness: { const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) ); + const int brightnessUpMaskValue = _mm_movemask_epi8(brightnessMask8); - const v128u16 brightnessMask16[2] = { - _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), - _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + if (brightnessUpMaskValue != 0x00000000) { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); - } - else - { - const v128u32 brightnessMask32[4] = { - _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + const v128u16 brightnessMask16[2] = { + _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), + _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v128u32 brightnessMask32[4] = { + _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), + _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.increase(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.increase(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.increase(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.increase(tmpSrc[3], evy16), brightnessMask32[3] ); + } } break; } @@ -1835,30 +1859,34 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn case ColorEffect_DecreaseBrightness: { const v128u8 brightnessMask8 = _mm_andnot_si128( forceDstTargetBlendMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) ); + const int brightnessDownMaskValue = _mm_movemask_epi8(brightnessMask8); - const v128u16 brightnessMask16[2] = { - _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), - _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) - }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + if (brightnessDownMaskValue != 0x00000000) { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); - } - else - { - const v128u32 brightnessMask32[4] = { - _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + const v128u16 brightnessMask16[2] = { + _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), + _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask16[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask16[1] ); + } + else + { + const v128u32 brightnessMask32[4] = { + _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), + _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), + _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], colorop_vec.decrease(tmpSrc[0], evy16), brightnessMask32[0] ); + tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], colorop_vec.decrease(tmpSrc[1], evy16), brightnessMask32[1] ); + tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], colorop_vec.decrease(tmpSrc[2], evy16), brightnessMask32[2] ); + tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], colorop_vec.decrease(tmpSrc[3], evy16), brightnessMask32[3] ); + } } break; } @@ -1869,11 +1897,7 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn // Render the pixel using the selected color effect. const v128u8 blendMask8 = _mm_or_si128( forceDstTargetBlendMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstTargetBlendEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) ); - - const v128u16 blendMask16[2] = { - _mm_unpacklo_epi8(blendMask8, blendMask8), - _mm_unpackhi_epi8(blendMask8, blendMask8) - }; + const int blendMaskValue = _mm_movemask_epi8(blendMask8); if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { @@ -1882,41 +1906,49 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn _mm_load_si128((v128u16 *)compInfo.target.lineColor16 + 1) }; - v128u16 blendSrc16[2]; - - switch (LAYERTYPE) + if (blendMaskValue != 0x00000000) { - case GPULayerType_3D: - blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); - blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); - break; - - case GPULayerType_BG: - blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128); - blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: + const v128u16 blendMask16[2] = { + _mm_unpacklo_epi8(blendMask8, blendMask8), + _mm_unpackhi_epi8(blendMask8, blendMask8) + }; + + v128u16 blendSrc16[2]; + + switch (LAYERTYPE) { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - const v128u16 tempEVA[2] = { - _mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128()) - }; - const v128u16 tempEVB[2] = { - _mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()), - _mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128()) - }; - - blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]); - blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]); - break; + case GPULayerType_3D: + blendSrc16[0] = colorop_vec.blend3D(src0, src1, dst16[0]); + blendSrc16[1] = colorop_vec.blend3D(src2, src3, dst16[1]); + break; + + case GPULayerType_BG: + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], eva_vec128, evb_vec128); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], eva_vec128, evb_vec128); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + const v128u16 tempEVA[2] = { + _mm_unpacklo_epi8(eva_vec128, _mm_setzero_si128()), + _mm_unpackhi_epi8(eva_vec128, _mm_setzero_si128()) + }; + const v128u16 tempEVB[2] = { + _mm_unpacklo_epi8(evb_vec128, _mm_setzero_si128()), + _mm_unpackhi_epi8(evb_vec128, _mm_setzero_si128()) + }; + + blendSrc16[0] = colorop_vec.blend(tmpSrc[0], dst16[0], tempEVA[0], tempEVB[0]); + blendSrc16[1] = colorop_vec.blend(tmpSrc[1], dst16[1], tempEVA[1], tempEVB[1]); + break; + } } + + tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); + tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); } - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); - // Store the final colors. const v128u16 passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), @@ -1936,71 +1968,79 @@ FORCEINLINE void PixelOperation_SSE2::_unknownEffectMask32(GPUEngineCompositorIn _mm_load_si128((v128u32 *)compInfo.target.lineColor32 + 3), }; - v128u32 blendSrc32[4]; - - switch (LAYERTYPE) + if (blendMaskValue != 0x00000000) { - case GPULayerType_3D: - blendSrc32[0] = colorop_vec.blend3D(tmpSrc[0], dst32[0]); - blendSrc32[1] = colorop_vec.blend3D(tmpSrc[1], dst32[1]); - blendSrc32[2] = colorop_vec.blend3D(tmpSrc[2], dst32[2]); - blendSrc32[3] = colorop_vec.blend3D(tmpSrc[3], dst32[3]); - break; - - case GPULayerType_BG: - blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128); - blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128); - blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128); - blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128); - break; - - case GPULayerType_OBJ: + const v128u16 blendMask16[2] = { + _mm_unpacklo_epi8(blendMask8, blendMask8), + _mm_unpackhi_epi8(blendMask8, blendMask8) + }; + + v128u32 blendSrc32[4]; + + switch (LAYERTYPE) { - // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. - // - // Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only - // going to send the 4 correspending EVA/EVB vectors as well. In this case, each individual - // EVA/EVB value is mirrored for each adjacent 16-bit boundary. - v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128); - v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128); - - const v128u16 tempEVA[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128); - tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128); - - const v128u16 tempEVB[4] = { - _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), - _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), - _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) - }; - - blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); - blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); - blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); - blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); - break; + case GPULayerType_3D: + blendSrc32[0] = colorop_vec.blend3D(tmpSrc[0], dst32[0]); + blendSrc32[1] = colorop_vec.blend3D(tmpSrc[1], dst32[1]); + blendSrc32[2] = colorop_vec.blend3D(tmpSrc[2], dst32[2]); + blendSrc32[3] = colorop_vec.blend3D(tmpSrc[3], dst32[3]); + break; + + case GPULayerType_BG: + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], eva_vec128, evb_vec128); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], eva_vec128, evb_vec128); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], eva_vec128, evb_vec128); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], eva_vec128, evb_vec128); + break; + + case GPULayerType_OBJ: + { + // For OBJ layers, we need to convert EVA and EVB from vectors of uint8 into vectors of uint16. + // + // Note that we are sending only 4 colors for each colorop_vec.blend() call, and so we are only + // going to send the 4 correspending EVA/EVB vectors as well. In this case, each individual + // EVA/EVB value is mirrored for each adjacent 16-bit boundary. + v128u16 tempBlendLo = _mm_unpacklo_epi8(eva_vec128, eva_vec128); + v128u16 tempBlendHi = _mm_unpackhi_epi8(eva_vec128, eva_vec128); + + const v128u16 tempEVA[4] = { + _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) + }; + + tempBlendLo = _mm_unpacklo_epi8(evb_vec128, evb_vec128); + tempBlendHi = _mm_unpackhi_epi8(evb_vec128, evb_vec128); + + const v128u16 tempEVB[4] = { + _mm_unpacklo_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendLo, _mm_setzero_si128()), + _mm_unpacklo_epi8(tempBlendHi, _mm_setzero_si128()), + _mm_unpackhi_epi8(tempBlendHi, _mm_setzero_si128()) + }; + + blendSrc32[0] = colorop_vec.blend(tmpSrc[0], dst32[0], tempEVA[0], tempEVB[0]); + blendSrc32[1] = colorop_vec.blend(tmpSrc[1], dst32[1], tempEVA[1], tempEVB[1]); + blendSrc32[2] = colorop_vec.blend(tmpSrc[2], dst32[2], tempEVA[2], tempEVB[2]); + blendSrc32[3] = colorop_vec.blend(tmpSrc[3], dst32[3], tempEVA[3], tempEVB[3]); + break; + } } + + const v128u32 blendMask32[4] = { + _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), + _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), + _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), + _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) + }; + + tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); + tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); + tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); + tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); } - const v128u32 blendMask32[4] = { - _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), - _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), - _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), - _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) - }; - - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); - tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); - tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); - // Store the final colors. const v128u16 passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8),