diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 234bebc4f..ac7347c74 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -516,6 +516,7 @@ void GPUEngineBase::_Reset_Base() renderState.dstBlendEnable[GPULayerID_BG3] = false; renderState.dstBlendEnable[GPULayerID_OBJ] = false; renderState.dstBlendEnable[GPULayerID_Backdrop] = false; + renderState.dstAnyBlendEnable = false; #ifdef ENABLE_SSE2 renderState.srcBlendEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128(); @@ -1532,27 +1533,128 @@ void GPUEngineBase::_LineColorCopy(void *__restrict dstBuffer, const void *__res u8 *dst = dstLinePtr; const void *src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * PIXELBYTES) : (u8 *)srcBuffer; - for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) +#if defined(ENABLE_SSE2) + if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2) && (lineCount == 2)) { - for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; ) { if (PIXELBYTES == 2) { - ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; + const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + x)); + const __m128i src16out[2] = { _mm_unpacklo_epi16(src16, src16), _mm_unpackhi_epi16(src16, src16) }; + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src16out[1]); + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src16out[1]); + + x += 8; } else if (PIXELBYTES == 4) { - ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; + const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + x)); + const __m128i src32out[2] = { _mm_unpacklo_epi32(src32, src32), _mm_unpackhi_epi32(src32, src32) }; + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src32out[1]); + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src32out[1]); + + x += 4; } } } - - dst = dstLinePtr + (lineWidth * PIXELBYTES); - - for (size_t line = 1; line < lineCount; line++) + else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) && (lineCount == 4)) { - memcpy(dst, dstLinePtr, lineWidth * PIXELBYTES); - dst += (lineWidth * PIXELBYTES); + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; ) + { + if (PIXELBYTES == 2) + { + const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + x)); + const __m128i src16_lo = _mm_unpacklo_epi16(src16, src16); + const __m128i src16_hi = _mm_unpackhi_epi16(src16, src16); + const __m128i src16out[4] = { _mm_unpacklo_epi16(src16_lo, src16_lo), _mm_unpackhi_epi16(src16_lo, src16_lo), _mm_unpacklo_epi16(src16_hi, src16_hi), _mm_unpackhi_epi16(src16_hi, src16_hi) }; + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[1]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[2]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[3]); + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[1]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[2]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[3]); + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[1]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[2]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[3]); + + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[0]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[1]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[2]); + _mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[3]); + + x += 8; + } + else if (PIXELBYTES == 4) + { + const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + x)); + const __m128i src32_lo = _mm_unpacklo_epi32(src32, src32); + const __m128i src32_hi = _mm_unpackhi_epi32(src32, src32); + const __m128i src32out[4] = { _mm_unpacklo_epi32(src32_lo, src32_lo), _mm_unpackhi_epi32(src32_lo, src32_lo), _mm_unpacklo_epi32(src32_hi, src32_hi), _mm_unpackhi_epi32(src32_hi, src32_hi) }; + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[1]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[2]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[3]); + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[1]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[2]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[3]); + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[1]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[2]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[3]); + + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[0]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[1]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[2]); + _mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[3]); + + x += 4; + } + } + } + else +#endif + { + for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++) + { + for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) + { + if (PIXELBYTES == 2) + { + ((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; + } + else if (PIXELBYTES == 4) + { + ((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; + } + } + } + + dst = dstLinePtr + (lineWidth * PIXELBYTES); + + for (size_t line = 1; line < lineCount; line++) + { + memcpy(dst, dstLinePtr, lineWidth * PIXELBYTES); + dst += (lineWidth * PIXELBYTES); + } } } } @@ -1598,6 +1700,32 @@ void GPUEngineBase::_LineLayerIDCopy(u8 *__restrict dstBuffer, const u8 *__restr } } +template +void GPUEngineBase::_TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo) +{ + if (this->isLineRenderNative[compInfo.line.indexNative]) + { + switch (OUTPUTFORMAT) + { + case NDSColorFormat_BGR555_Rev: + this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); + break; + + case NDSColorFormat_BGR666_Rev: + case NDSColorFormat_BGR888_Rev: + this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); + break; + } + + this->_LineLayerIDCopy(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative); + + compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom; + compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom; + this->isLineRenderNative[compInfo.line.indexNative] = false; + this->nativeLineRenderCount--; + } +} + /*****************************************************************************/ // PIXEL RENDERING /*****************************************************************************/ @@ -1635,9 +1763,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, return; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - - if (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT) + if (COLOREFFECTDISABLEDHINT) { switch (OUTPUTFORMAT) { @@ -1665,6 +1791,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; bool forceBlendEffect = false; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + if (ISSRCLAYEROBJ && enableColorEffect) { //translucent-capable OBJ are forcing the function to blend when the second target is satisfied @@ -1833,9 +1961,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, return; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; - - if (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT) + if (COLOREFFECTDISABLEDHINT) { dstColor32 = srcColor32; dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; @@ -1849,6 +1975,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID]; bool forceBlendEffect = false; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true; + if (ISSRCLAYEROBJ && enableColorEffect) { //translucent-capable OBJ are forcing the function to blend when the second target is satisfied @@ -1981,7 +2109,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(GPUEngineCompositorInfo &com enableColorEffectMask = _mm_set1_epi8(0xFF); } - if ( (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT) || (_mm_movemask_epi8(srcEffectEnableMask) == 0) ) + if (COLOREFFECTDISABLEDHINT) { if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { @@ -2152,13 +2280,31 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(GPUEngineCompositorInfo &com // We can't unify this yet because the output framebuffer is in RGBA5551, but the 3D source pixels are in RGBA6665. // However, GPUEngineBase::_RenderPixel() takes source pixels in RGB555. In order to unify the methods, all pixels // must be processed in RGBA6665. -template +template FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32) { u16 &dstColor16 = *compInfo.target.lineColor16; FragmentColor &dstColor32 = *compInfo.target.lineColor32; u8 &dstLayerID = *compInfo.target.lineLayerID; + if (COLOREFFECTDISABLEDHINT) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + dstColor16 = ColorspaceConvert6665To5551(srcColor32); + dstColor16 |= 0x8000; + } + else + { + dstColor32 = srcColor32; + dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F; + } + + dstLayerID = GPULayerID_BG0; + + return; + } + const bool dstEffectEnable = (dstLayerID != GPULayerID_BG0) && compInfo.renderState.dstBlendEnable[dstLayerID]; // 3D rendering has a special override: If the destination pixel is set to blend, then always blend. @@ -2250,7 +2396,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo #ifdef ENABLE_SSE2 -template +template FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &enableColorEffectMask, @@ -2266,23 +2412,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &com _mm_unpackhi_epi16(passMask16[1], passMask16[1]) }; __m128i dstEffectEnableMask; + __m128i forceBlendEffectMask; -#ifdef ENABLE_SSSE3 - dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID); - dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) ); -#else - dstEffectEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) ); - dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) ); -#endif - - dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask ); - const __m128i forceBlendEffectMask = dstEffectEnableMask; - - if (_mm_movemask_epi8( _mm_or_si128(forceBlendEffectMask, enableColorEffectMask) ) == 0) + if (COLOREFFECTDISABLEDHINT) { if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { @@ -2301,6 +2433,21 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &com return; } +#ifdef ENABLE_SSSE3 + dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID); + dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) ); +#else + dstEffectEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]); + dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) ); + dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) ); + dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) ); + dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) ); + dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) ); +#endif + + dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask ); + forceBlendEffectMask = dstEffectEnableMask; + __m128i tmpSrc[4]; if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { @@ -3214,26 +3361,9 @@ void GPUEngineBase::_RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, co } else { - if (this->isLineRenderNative[compInfo.line.indexNative] && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)) + if ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested) { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - } - - this->_LineLayerIDCopy(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative); - - compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom; - compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom; - this->isLineRenderNative[compInfo.line.indexNative] = false; - this->nativeLineRenderCount--; + this->_TransitionLineNativeToCustom(compInfo); } } break; @@ -4007,7 +4137,16 @@ void GPUEngineBase::_RenderLine_Layers(const size_t l) { if ( (layerID == GPULayerID_BG0) && GPU->GetEngineMain()->WillRender3DLayer() ) { - GPU->GetEngineMain()->RenderLine_Layer3D(compInfo); + if ( !compInfo.renderState.dstAnyBlendEnable && ( (compInfo.renderState.colorEffect == ColorEffect_Disable) || + !compInfo.renderState.srcBlendEnable[GPULayerID_BG0] || + (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) ) + { + GPU->GetEngineMain()->RenderLine_Layer3D(compInfo); + } + else + { + GPU->GetEngineMain()->RenderLine_Layer3D(compInfo); + } continue; } } @@ -4029,7 +4168,17 @@ void GPUEngineBase::_RenderLine_Layers(const size_t l) { compInfo.renderState.selectedLayerID = GPULayerID_OBJ; compInfo.renderState.selectedBGLayer = NULL; - this->_RenderLine_LayerOBJ(compInfo, item); + + if ( !compInfo.renderState.dstAnyBlendEnable && ( (compInfo.renderState.colorEffect == ColorEffect_Disable) || + !compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] || + (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) ) + { + this->_RenderLine_LayerOBJ(compInfo, item); + } + else + { + this->_RenderLine_LayerOBJ(compInfo, item); + } } } } @@ -4062,7 +4211,7 @@ void GPUEngineBase::_RenderLine_SetupSprites(GPUEngineCompositorInfo &compInfo) } } -template +template void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item) { bool useCustomVRAM = false; @@ -4084,26 +4233,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item } } - if (this->isLineRenderNative[compInfo.line.indexNative] && useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)) + if (useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)) { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - } - - this->_LineLayerIDCopy(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative); - - compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom; - compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom; - this->isLineRenderNative[compInfo.line.indexNative] = false; - this->nativeLineRenderCount--; + this->_TransitionLineNativeToCustom(compInfo); } if (this->isLineRenderNative[compInfo.line.indexNative]) @@ -4122,9 +4254,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; - this->_RenderPixel(compInfo, - vramColorPtr[srcX], - this->_sprAlpha[srcX]); + this->_RenderPixel(compInfo, + vramColorPtr[srcX], + this->_sprAlpha[srcX]); } } else @@ -4139,9 +4271,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX; compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX; - this->_RenderPixel(compInfo, - this->_sprColor[srcX], - this->_sprAlpha[srcX]); + this->_RenderPixel(compInfo, + this->_sprColor[srcX], + this->_sprAlpha[srcX]); } } } @@ -4177,16 +4309,16 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { FragmentColor *vramColor = (FragmentColor *)vramColorPtr; - this->_RenderPixel(compInfo, - vramColor[dstX], - this->_sprAlpha[srcX]); + this->_RenderPixel(compInfo, + vramColor[dstX], + this->_sprAlpha[srcX]); } else { u16 *vramColor = (u16 *)vramColorPtr; - this->_RenderPixel(compInfo, - vramColor[dstX], - this->_sprAlpha[srcX]); + this->_RenderPixel(compInfo, + vramColor[dstX], + this->_sprAlpha[srcX]); } } } @@ -4218,9 +4350,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX; compInfo.target.lineLayerID = dstLayerIDPtr + dstX; - this->_RenderPixel(compInfo, - this->_sprColor[srcX], - this->_sprAlpha[srcX]); + this->_RenderPixel(compInfo, + this->_sprColor[srcX], + this->_sprAlpha[srcX]); } } @@ -4745,7 +4877,10 @@ template _RenderLine_LayerBG_ApplyColorEffectDisabledHint(compInfo); } @@ -4968,6 +5103,13 @@ void GPUEngineBase::ParseReg_BLDCNT() renderState.dstBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target2 != 0); renderState.dstBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target2 != 0); + renderState.dstAnyBlendEnable = renderState.dstBlendEnable[GPULayerID_BG0] || + renderState.dstBlendEnable[GPULayerID_BG1] || + renderState.dstBlendEnable[GPULayerID_BG2] || + renderState.dstBlendEnable[GPULayerID_BG3] || + renderState.dstBlendEnable[GPULayerID_OBJ] || + renderState.dstBlendEnable[GPULayerID_Backdrop]; + #ifdef ENABLE_SSE2 const __m128i one_vec128 = _mm_set1_epi8(1); @@ -5609,7 +5751,7 @@ void GPUEngineA::RenderLine(const size_t l) } } -template +template void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) { const FragmentColor *__restrict framebuffer3D = CurrentRenderer->GetFramebuffer(); @@ -5618,26 +5760,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) return; } - if (this->isLineRenderNative[compInfo.line.indexNative] && !CurrentRenderer->IsFramebufferNativeSize()) + if (!CurrentRenderer->IsFramebufferNativeSize()) { - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - - case NDSColorFormat_BGR666_Rev: - case NDSColorFormat_BGR888_Rev: - this->_LineColorCopy(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative); - break; - } - - this->_LineLayerIDCopy(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative); - - compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom; - compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom; - this->isLineRenderNative[compInfo.line.indexNative] = false; - this->nativeLineRenderCount--; + this->_TransitionLineNativeToCustom(compInfo); } const float customWidthScale = (float)compInfo.line.widthCustom / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; @@ -5716,12 +5841,12 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) dst[3] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 3); } - this->_RenderPixel3D_SSE2(compInfo, - passMask8, - enableColorEffectMask, - src[3], src[2], src[1], src[0], - dst[3], dst[2], dst[1], dst[0], - dstLayerID_vec128); + this->_RenderPixel3D_SSE2(compInfo, + passMask8, + enableColorEffectMask, + src[3], src[2], src[1], src[0], + dst[3], dst[2], dst[1], dst[0], + dstLayerID_vec128); _mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]); _mm_store_si128((__m128i *)*compInfo.target.lineColor + 1, dst[1]); @@ -5748,9 +5873,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - this->_RenderPixel3D(compInfo, - enableColorEffect, - *srcLinePtr); + this->_RenderPixel3D(compInfo, + enableColorEffect, + *srcLinePtr); } } } @@ -5779,9 +5904,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom]; const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; - this->_RenderPixel3D(compInfo, - enableColorEffect, - srcLinePtr[srcX]); + this->_RenderPixel3D(compInfo, + enableColorEffect, + srcLinePtr[srcX]); } srcLinePtr += compInfo.line.widthCustom; diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index f181cd62d..cc054518f 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1194,6 +1194,7 @@ typedef struct __m128i dstBlendEnable_SSE2[6]; #endif #endif // ENABLE_SSE2 + bool dstAnyBlendEnable; u8 WIN0_enable[6]; u8 WIN1_enable[6]; @@ -1345,6 +1346,7 @@ protected: template void _LineColorCopy(void *__restrict dstBuffer, const void *__restrict srcBuffer, const size_t l); template void _LineLayerIDCopy(u8 *__restrict dstBuffer, const u8 *__restrict srcBuffer, const size_t l); + template void _TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo); void _MosaicSpriteLinePixel(GPUEngineCompositorInfo &compInfo, const size_t x, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); void _MosaicSpriteLine(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); @@ -1382,11 +1384,11 @@ protected: template FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); + template void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); template FORCEINLINE void _RenderPixel(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 srcAlpha); template FORCEINLINE void _RenderPixel(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 srcAlpha); - template FORCEINLINE void _RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32); + template FORCEINLINE void _RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32); FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB); FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable); @@ -1408,7 +1410,7 @@ protected: template FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY); template FORCEINLINE void _RenderPixel_CheckWindows16_SSE2(GPUEngineCompositorInfo &compInfo, const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const; template FORCEINLINE void _RenderPixel16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &srcAlpha, const __m128i &srcEffectEnableMask, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID, __m128i &passMask8); - template FORCEINLINE void _RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &enableColorEffectMask, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); + template FORCEINLINE void _RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &enableColorEffectMask, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID); #endif template void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha); @@ -1582,7 +1584,7 @@ public: void ResetCaptureLineStates(); template void RenderLine(const size_t l); - template void RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo); + template void RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo); }; class GPUEngineB : public GPUEngineBase