From 250de64ee55982b2ad9a8a97d6a9cd6990a85b45 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 21 Jul 2017 15:49:01 -0700 Subject: [PATCH] GPU: Unify the GPUEngineBase::_PixelEffect*() functions. --- desmume/src/GPU.cpp | 509 ++++++++++++++------------------------------ desmume/src/GPU.h | 8 +- 2 files changed, 167 insertions(+), 350 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 3fd6804ea..a587a9e17 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -1791,8 +1791,8 @@ FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, co } } -template -FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha) +template +FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect) { u16 &dstColor16 = *compInfo.target.lineColor16; FragmentColor &dstColor32 = *compInfo.target.lineColor32; @@ -1805,9 +1805,7 @@ FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; bool forceBlendEffect = false; - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - - if (ISSRCLAYEROBJ && enableColorEffect) + if ((LAYERTYPE == GPULayerType_OBJ) && enableColorEffect) { //translucent-capable OBJ are forcing the function to blend when the second target is satisfied const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative]; @@ -1953,14 +1951,10 @@ FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, dstLayerID = compInfo.renderState.selectedLayerID; } -template -FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha) +template +FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect) { - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - return; - } - + u16 &dstColor16 = *compInfo.target.lineColor16; FragmentColor &dstColor32 = *compInfo.target.lineColor32; u8 &dstLayerID = *compInfo.target.lineLayerID; @@ -1968,11 +1962,17 @@ FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, u8 blendEVB = compInfo.renderState.blendEVB; const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; - bool forceBlendEffect = false; - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + // 3D rendering has a special override: If the destination pixel is set to blend, then always blend. + // Test case: When starting a stage in Super Princess Peach, the screen will be solid black unless + // blending is forced here. + // + // This behavior must take priority over checking for the window color effect enable flag. + // Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless + // blend forcing takes priority. + bool forceBlendEffect = (LAYERTYPE == GPULayerType_3D) ? dstEffectEnable : false; - if (ISSRCLAYEROBJ && enableColorEffect) + if ((LAYERTYPE == GPULayerType_OBJ) && enableColorEffect) { //translucent-capable OBJ are forcing the function to blend when the second target is satisfied const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative]; @@ -2021,26 +2021,55 @@ FORCEINLINE void GPUEngineBase::_PixelEffect(GPUEngineCompositorInfo &compInfo, } // Render the pixel using the selected color effect. - switch (selectedEffect) + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - case ColorEffect_Disable: - dstColor32 = srcColor32; - break; - - case ColorEffect_IncreaseBrightness: - dstColor32 = this->_ColorEffectIncreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_DecreaseBrightness: - dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_Blend: - dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); - break; + const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); + + switch (selectedEffect) + { + case ColorEffect_Disable: + dstColor16 = srcColor16; + break; + + case ColorEffect_IncreaseBrightness: + dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; + break; + + case ColorEffect_DecreaseBrightness: + dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; + break; + + case ColorEffect_Blend: + dstColor16 = this->_ColorEffectBlend3D(srcColor32, dstColor16); + break; + } + + dstColor16 |= 0x8000; + } + else + { + switch (selectedEffect) + { + case ColorEffect_Disable: + dstColor32 = srcColor32; + break; + + case ColorEffect_IncreaseBrightness: + dstColor32 = this->_ColorEffectIncreaseBrightness(srcColor32, compInfo.renderState.blendEVY); + break; + + case ColorEffect_DecreaseBrightness: + dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY); + break; + + case ColorEffect_Blend: + dstColor32 = (LAYERTYPE == GPULayerType_3D) ? this->_ColorEffectBlend3D(srcColor32, dstColor32) : this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); + break; + } + + dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; } - dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; dstLayerID = compInfo.renderState.selectedLayerID; } @@ -2111,7 +2140,7 @@ FORCEINLINE void GPUEngineBase::_PixelCopyWithMask16_SSE2(GPUEngineCompositorInf } } -template +template FORCEINLINE void GPUEngineBase::_PixelEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, @@ -2148,26 +2177,40 @@ FORCEINLINE void GPUEngineBase::_PixelEffectWithMask16_SSE2(GPUEngineCompositorI dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID_vec128), dstEffectEnableMask ); // Select the color effect based on the BLDCNT target flags. - __m128i forceBlendEffectMask = _mm_setzero_si128(); - const __m128i colorEffect_vec128 = (WILLPERFORMWINDOWTEST) ? _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask) : _mm_set1_epi8(compInfo.renderState.colorEffect); - + const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); + const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); __m128i eva_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVA); __m128i evb_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVB); - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); + __m128i forceBlendEffectMask = (LAYERTYPE == GPULayerType_3D) ? dstEffectEnableMask : _mm_setzero_si128(); - if (ISSRCLAYEROBJ) + if (LAYERTYPE == GPULayerType_OBJ) { const __m128i objMode_vec128 = _mm_loadu_si128((__m128i *)(this->_sprType + compInfo.target.xNative)); const __m128i isObjTranslucentMask = _mm_and_si128( _mm_and_si128(enableColorEffectMask, dstEffectEnableMask), _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) ); forceBlendEffectMask = isObjTranslucentMask; const __m128i spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask); - eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask); evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask); } - __m128i tmpSrc[4] = {src0, src1, src2, src3}; + __m128i tmpSrc[4]; + + if ( (LAYERTYPE == GPULayerType_3D) && (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ) + { + // 3D layer blending requires that all src colors are preserved as 32-bit values. + // Since dst2 and dst3 are currently unused for RGB555 output, we used these variables + // to store the converted 16-bit src colors in a previous step. + tmpSrc[0] = dst2; + tmpSrc[1] = dst3; + } + else + { + tmpSrc[0] = src0; + tmpSrc[1] = src1; + tmpSrc[2] = src2; + tmpSrc[3] = src3; + } switch (compInfo.renderState.colorEffect) { @@ -2231,7 +2274,19 @@ FORCEINLINE void GPUEngineBase::_PixelEffectWithMask16_SSE2(GPUEngineCompositorI if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - const __m128i blendSrc16[2] = { this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128), this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128) }; + __m128i blendSrc16[2]; + + if (LAYERTYPE == GPULayerType_3D) + { + blendSrc16[0] = this->_ColorEffectBlend3D(src0, src1, dst0); + blendSrc16[1] = this->_ColorEffectBlend3D(src2, src3, dst1); + } + else + { + blendSrc16[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128); + blendSrc16[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128); + } + tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); @@ -2244,10 +2299,22 @@ FORCEINLINE void GPUEngineBase::_PixelEffectWithMask16_SSE2(GPUEngineCompositorI } else { - const __m128i blendSrc32[4] = { this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128), - this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128), - this->_ColorEffectBlend(tmpSrc[2], dst2, eva_vec128, evb_vec128), - this->_ColorEffectBlend(tmpSrc[3], dst3, eva_vec128, evb_vec128) }; + __m128i blendSrc32[4]; + + if (LAYERTYPE == GPULayerType_3D) + { + blendSrc32[0] = this->_ColorEffectBlend3D(src0, src0, dst0); + blendSrc32[1] = this->_ColorEffectBlend3D(src1, src1, dst1); + blendSrc32[2] = this->_ColorEffectBlend3D(src2, src2, dst2); + blendSrc32[3] = this->_ColorEffectBlend3D(src3, src3, dst3); + } + else + { + blendSrc32[0] = this->_ColorEffectBlend(tmpSrc[0], dst0, eva_vec128, evb_vec128); + blendSrc32[1] = this->_ColorEffectBlend(tmpSrc[1], dst1, eva_vec128, evb_vec128); + blendSrc32[2] = this->_ColorEffectBlend(tmpSrc[2], dst2, eva_vec128, evb_vec128); + blendSrc32[3] = this->_ColorEffectBlend(tmpSrc[3], dst3, eva_vec128, evb_vec128); + } const __m128i blendMask32[4] = { _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), @@ -2277,269 +2344,6 @@ FORCEINLINE void GPUEngineBase::_PixelEffectWithMask16_SSE2(GPUEngineCompositorI #endif -// TODO: Unify this method with GPUEngineBase::_PixelEffect(). -// We can't unify this yet because the output framebuffer is in RGBA5551, but the 3D source pixels are in RGBA6665. -// However, GPUEngineBase::_PixelEffect() takes source pixels in RGB555. In order to unify the methods, all pixels -// must be processed in RGBA6665. -template -FORCEINLINE void GPUEngineBase::_PixelEffect3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32) -{ - u16 &dstColor16 = *compInfo.target.lineColor16; - FragmentColor &dstColor32 = *compInfo.target.lineColor32; - u8 &dstLayerID = *compInfo.target.lineLayerID; - - const bool dstEffectEnable = (dstLayerID != GPULayerID_BG0) && compInfo.renderState.dstBlendEnable[dstLayerID]; - - // 3D rendering has a special override: If the destination pixel is set to blend, then always blend. - // Test case: When starting a stage in Super Princess Peach, the screen will be solid black unless - // blending is forced here. - // - // This behavior must take priority over checking for the window color effect enable flag. - // Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless - // blend forcing takes priority. - bool forceBlendEffect = dstEffectEnable; - ColorEffect selectedEffect = (forceBlendEffect) ? ColorEffect_Blend : ColorEffect_Disable; - - // If we're not forcing blending, then select the color effect based on the BLDCNT target flags. - if (!forceBlendEffect && enableColorEffect && compInfo.renderState.srcBlendEnable[GPULayerID_BG0]) - { - switch (compInfo.renderState.colorEffect) - { - // For the Blend effect, both first and second target flags must be checked. - case ColorEffect_Blend: - { - if (dstEffectEnable) selectedEffect = compInfo.renderState.colorEffect; - break; - } - - // For the Increase/Decrease Brightness effects, only the first target flag needs to be checked. - // Test case: Bomberman Land Touch! dialog boxes will render too dark without this check. - case ColorEffect_IncreaseBrightness: - case ColorEffect_DecreaseBrightness: - selectedEffect = compInfo.renderState.colorEffect; - break; - - default: - break; - } - } - - // Render the pixel using the selected color effect. - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); - - switch (selectedEffect) - { - case ColorEffect_Disable: - dstColor16 = srcColor16; - break; - - case ColorEffect_IncreaseBrightness: - dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF]; - break; - - case ColorEffect_DecreaseBrightness: - dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF]; - break; - - case ColorEffect_Blend: - dstColor16 = this->_ColorEffectBlend3D(srcColor32, dstColor16); - break; - } - - dstColor16 |= 0x8000; - } - else - { - switch (selectedEffect) - { - case ColorEffect_Disable: - dstColor32 = srcColor32; - break; - - case ColorEffect_IncreaseBrightness: - dstColor32 = this->_ColorEffectIncreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_DecreaseBrightness: - dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY); - break; - - case ColorEffect_Blend: - dstColor32 = this->_ColorEffectBlend3D(srcColor32, dstColor32); - break; - } - - dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; - } - - dstLayerID = GPULayerID_BG0; -} - -#ifdef ENABLE_SSE2 - -template -FORCEINLINE void GPUEngineBase::_Pixel3DEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, - const __m128i &passMask8, - const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, - __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, - __m128i &dstLayerID) -{ - const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8), _mm_unpackhi_epi8(passMask8, passMask8) }; - - const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]), - _mm_unpackhi_epi16(passMask16[0], passMask16[0]), - _mm_unpacklo_epi16(passMask16[1], passMask16[1]), - _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; - - const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF); - - __m128i dstEffectEnableMask; - -#ifdef ENABLE_SSSE3 - dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID); - dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) ); -#else - dstEffectEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) ); -#endif - - dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask ); - const __m128i forceBlendEffectMask = dstEffectEnableMask; - - __m128i tmpSrc[4]; - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - // Rename our converted 16-bit src colors to reflect what they actually are. - tmpSrc[0] = dst2; - tmpSrc[1] = dst3; - } - else - { - tmpSrc[0] = src0; - tmpSrc[1] = src1; - tmpSrc[2] = src2; - tmpSrc[3] = src3; - } - - // Select the color effect based on the BLDCNT target flags. - const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[GPULayerID_BG0]; - const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask); - const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY); - - switch (compInfo.renderState.colorEffect) - { - case ColorEffect_IncreaseBrightness: - { - const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) ); - const __m128i brightnessMask16[2] = { _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask16[1] ); - } - else - { - const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) }; - - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectIncreaseBrightness(tmpSrc[2], evy_vec128), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectIncreaseBrightness(tmpSrc[3], evy_vec128), brightnessMask32[3] ); - } - break; - } - - case ColorEffect_DecreaseBrightness: - { - const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) ); - const __m128i brightnessMask16[2] = { _mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8) }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask16[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask16[1] ); - } - else - { - const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]), - _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]), - _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) }; - - tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness(tmpSrc[0], evy_vec128), brightnessMask32[0] ); - tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness(tmpSrc[1], evy_vec128), brightnessMask32[1] ); - tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectDecreaseBrightness(tmpSrc[2], evy_vec128), brightnessMask32[2] ); - tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectDecreaseBrightness(tmpSrc[3], evy_vec128), brightnessMask32[3] ); - } - break; - } - - default: - break; - } - - // Render the pixel using the selected color effect. - const __m128i blendMask8 = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) ); - const __m128i blendMask16[2] = { _mm_unpacklo_epi8(blendMask8, blendMask8), _mm_unpackhi_epi8(blendMask8, blendMask8) }; - - if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) - { - const __m128i blendSrc16[2] = { this->_ColorEffectBlend3D(src0, src1, dst0), this->_ColorEffectBlend3D(src2, src3, dst1) }; - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]); - - // Combine the final colors. - tmpSrc[0] = _mm_or_si128(tmpSrc[0], _mm_set1_epi16(0x8000)); - tmpSrc[1] = _mm_or_si128(tmpSrc[1], _mm_set1_epi16(0x8000)); - - dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask16[0]); - dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask16[1]); - } - else - { - const __m128i blendSrc32[4] = { this->_ColorEffectBlend3D(src0, src0, dst0), - this->_ColorEffectBlend3D(src1, src1, dst1), - this->_ColorEffectBlend3D(src2, src2, dst2), - this->_ColorEffectBlend3D(src3, src3, dst3) }; - - const __m128i blendMask32[4] = { _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]), - _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]), - _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]), - _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) }; - - const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000); - - tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]); - tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]); - tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]); - tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]); - - tmpSrc[0] = _mm_or_si128(tmpSrc[0], alphaBits); - tmpSrc[1] = _mm_or_si128(tmpSrc[1], alphaBits); - tmpSrc[2] = _mm_or_si128(tmpSrc[2], alphaBits); - tmpSrc[3] = _mm_or_si128(tmpSrc[3], alphaBits); - - dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask32[0]); - dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask32[1]); - dst2 = _mm_blendv_epi8(dst2, tmpSrc[2], passMask32[2]); - dst3 = _mm_blendv_epi8(dst3, tmpSrc[3], passMask32[3]); - } - - dstLayerID = _mm_blendv_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0), passMask8); -} - -#endif - //this is fantastically inaccurate. //we do the early return even though it reduces the resulting accuracy //because we need the speed, and because it is inaccurate anyway @@ -2743,7 +2547,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixelSingle(GPUEngineCompositorInfo &comp } else { - this->_PixelEffect(compInfo, srcColor16, 0); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, srcColor16, 0, enableColorEffect); } } @@ -2955,13 +2760,13 @@ void GPUEngineBase::_RenderPixelsCustom(GPUEngineCompositorInfo &compInfo) { const __m128i spriteAlpha = _mm_setzero_si128(); - this->_PixelEffectWithMask16_SSE2(compInfo, - passMask8, - src[3], src[2], src[1], src[0], - spriteAlpha, - srcEffectEnableMask, - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); + this->_PixelEffectWithMask16_SSE2(compInfo, + passMask8, + src[3], src[2], src[1], src[0], + spriteAlpha, + srcEffectEnableMask, + dst[3], dst[2], dst[1], dst[0], + dstLayerID_vec128); } _mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]); @@ -2998,7 +2803,8 @@ void GPUEngineBase::_RenderPixelsCustom(GPUEngineCompositorInfo &compInfo) } else { - this->_PixelEffect(compInfo, this->_bgLayerColorCustom[compInfo.target.xCustom], 0); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, this->_bgLayerColorCustom[compInfo.target.xCustom], 0, enableColorEffect); } } } @@ -3115,13 +2921,13 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(GPUEngineCompositorInfo &compInfo) { const __m128i spriteAlpha = _mm_setzero_si128(); - this->_PixelEffectWithMask16_SSE2(compInfo, - passMask8, - src[3], src[2], src[1], src[0], - spriteAlpha, - srcEffectEnableMask, - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); + this->_PixelEffectWithMask16_SSE2(compInfo, + passMask8, + src[3], src[2], src[1], src[0], + spriteAlpha, + srcEffectEnableMask, + dst[3], dst[2], dst[1], dst[0], + dstLayerID_vec128); } _mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]); @@ -3160,7 +2966,8 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(GPUEngineCompositorInfo &compInfo) } else { - this->_PixelEffect(compInfo, ((u32 *)vramColorPtr)[i], 0); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, ((u32 *)vramColorPtr)[i], 0, enableColorEffect); } } else @@ -3176,7 +2983,8 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(GPUEngineCompositorInfo &compInfo) } else { - this->_PixelEffect(compInfo, ((u16 *)vramColorPtr)[i], 0); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, ((u16 *)vramColorPtr)[i], 0, enableColorEffect); } } } @@ -4347,7 +4155,8 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } else { - this->_PixelEffect(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX]); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX], enableColorEffect); } } } @@ -4374,7 +4183,8 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } else { - this->_PixelEffect(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX]); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); } } } @@ -4422,7 +4232,8 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } else { - this->_PixelEffect(compInfo, ((FragmentColor *)vramColorPtr)[dstX], this->_sprAlpha[srcX]); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, ((FragmentColor *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect); } } else @@ -4433,7 +4244,8 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } else { - this->_PixelEffect(compInfo, ((u16 *)vramColorPtr)[dstX], this->_sprAlpha[srcX]); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, ((u16 *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect); } } } @@ -4478,7 +4290,8 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } else { - this->_PixelEffect(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX]); + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + this->_PixelEffect(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); } } } @@ -5906,13 +5719,17 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) if (hofs == 0) { +#ifdef ENABLE_SSE2 + const size_t ssePixCount = (compInfo.line.widthCustom - (compInfo.line.widthCustom % 16)); + const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID]; +#endif + for (size_t line = 0; line < compInfo.line.renderCount; line++) { compInfo.target.xNative = 0; compInfo.target.xCustom = 0; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = compInfo.line.widthCustom - (compInfo.line.widthCustom % 16); +#ifdef ENABLE_SSE2 for (; compInfo.target.xCustom < ssePixCount; srcLinePtr+=16, compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16) { const __m128i src[4] = { _mm_load_si128((__m128i *)srcLinePtr + 0), @@ -5952,9 +5769,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - // Instead of letting these vectors go to waste, let's convert the src colors to 16-bit now and - // then pack the converted 16-bit colors into these vectors. Note that these are 16-bit source - // pixels now. + // 3D layer blending requires that all src colors are preserved as 32-bit values. + // Since dst2 and dst3 are currently unused for RGB555 output, we using these variables + // to store the converted 16-bit src colors. dst[2] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src[0], _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src[0], _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src[0], _mm_set1_epi32(0x003E0000)), 7)), _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src[1], _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src[1], _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src[1], _mm_set1_epi32(0x003E0000)), 7)) ); dst[3] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src[2], _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src[2], _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src[2], _mm_set1_epi32(0x003E0000)), 7)), @@ -6018,11 +5835,15 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) } else { - this->_Pixel3DEffectWithMask16_SSE2(compInfo, - passMask8, - src[3], src[2], src[1], src[0], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); + const __m128i spriteAlpha = _mm_setzero_si128(); + + this->_PixelEffectWithMask16_SSE2(compInfo, + passMask8, + src[3], src[2], src[1], src[0], + spriteAlpha, + srcEffectEnableMask, + dst[3], dst[2], dst[1], dst[0], + dstLayerID_vec128); } _mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]); @@ -6055,8 +5876,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) else { const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - - this->_PixelEffect3D(compInfo, enableColorEffect, *srcLinePtr); + this->_PixelEffect(compInfo, *srcLinePtr, 0, enableColorEffect); } } } @@ -6092,8 +5912,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) else { const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - - this->_PixelEffect3D(compInfo, enableColorEffect, srcLinePtr[srcX]); + this->_PixelEffect(compInfo, srcLinePtr[srcX], 0, enableColorEffect); } } diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index a6688e750..afc4311dc 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1388,9 +1388,8 @@ protected: template FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16); template FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32); - template FORCEINLINE void _PixelEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha); - template FORCEINLINE void _PixelEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha); - template FORCEINLINE void _PixelEffect3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32); + template FORCEINLINE void _PixelEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect); + template FORCEINLINE void _PixelEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect); FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB); FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable); @@ -1414,8 +1413,7 @@ protected: template void _PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); template void _PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _PixelEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &spriteAlpha, const __m128i &srcEffectEnableMask, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); - template FORCEINLINE void _Pixel3DEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); + template FORCEINLINE void _PixelEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &spriteAlpha, const __m128i &srcEffectEnableMask, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); #endif template void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);