GPU: Significantly improve GPU compositing performance in many games.

- Adds more possible conditions for compositing to take the fast path
(simple pixel copying).
- Adds SSE2 optimizations for the 2x and 4x scaling cases in
GPUEngineBase::_LineColorCopy().
This commit is contained in:
rogerman 2017-07-19 23:41:43 -07:00
parent c3bce87556
commit 5c6fcb0f47
2 changed files with 255 additions and 128 deletions

View File

@ -516,6 +516,7 @@ void GPUEngineBase::_Reset_Base()
renderState.dstBlendEnable[GPULayerID_BG3] = false;
renderState.dstBlendEnable[GPULayerID_OBJ] = false;
renderState.dstBlendEnable[GPULayerID_Backdrop] = false;
renderState.dstAnyBlendEnable = false;
#ifdef ENABLE_SSE2
renderState.srcBlendEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
@ -1532,27 +1533,128 @@ void GPUEngineBase::_LineColorCopy(void *__restrict dstBuffer, const void *__res
u8 *dst = dstLinePtr;
const void *src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * PIXELBYTES) : (u8 *)srcBuffer;
for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
#if defined(ENABLE_SSE2)
if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2) && (lineCount == 2))
{
for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
{
if (PIXELBYTES == 2)
{
((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x];
const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + x));
const __m128i src16out[2] = { _mm_unpacklo_epi16(src16, src16), _mm_unpackhi_epi16(src16, src16) };
_mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src16out[1]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 2) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src16out[1]);
x += 8;
}
else if (PIXELBYTES == 4)
{
((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x];
const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + x));
const __m128i src32out[2] = { _mm_unpacklo_epi32(src32, src32), _mm_unpackhi_epi32(src32, src32) };
_mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 0)), src32out[1]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 2) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2 * 1)), src32out[1]);
x += 4;
}
}
}
dst = dstLinePtr + (lineWidth * PIXELBYTES);
for (size_t line = 1; line < lineCount; line++)
else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4) && (lineCount == 4))
{
memcpy(dst, dstLinePtr, lineWidth * PIXELBYTES);
dst += (lineWidth * PIXELBYTES);
for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
{
if (PIXELBYTES == 2)
{
const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + x));
const __m128i src16_lo = _mm_unpacklo_epi16(src16, src16);
const __m128i src16_hi = _mm_unpackhi_epi16(src16, src16);
const __m128i src16out[4] = { _mm_unpacklo_epi16(src16_lo, src16_lo), _mm_unpackhi_epi16(src16_lo, src16_lo), _mm_unpacklo_epi16(src16_hi, src16_hi), _mm_unpackhi_epi16(src16_hi, src16_hi) };
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[1]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[2]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src16out[3]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[1]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[2]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src16out[3]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[1]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[2]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src16out[3]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[0]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[1]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 16 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[2]);
_mm_store_si128((__m128i *)((u16 *)dst + (x * 4) + 24 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src16out[3]);
x += 8;
}
else if (PIXELBYTES == 4)
{
const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + x));
const __m128i src32_lo = _mm_unpacklo_epi32(src32, src32);
const __m128i src32_hi = _mm_unpackhi_epi32(src32, src32);
const __m128i src32out[4] = { _mm_unpacklo_epi32(src32_lo, src32_lo), _mm_unpackhi_epi32(src32_lo, src32_lo), _mm_unpacklo_epi32(src32_hi, src32_hi), _mm_unpackhi_epi32(src32_hi, src32_hi) };
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[1]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[2]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 0)), src32out[3]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[1]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[2]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 1)), src32out[3]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[1]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[2]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 2)), src32out[3]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 0 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[0]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 4 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[1]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 8 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[2]);
_mm_store_si128((__m128i *)((u32 *)dst + (x * 4) + 12 + (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * 3)), src32out[3]);
x += 4;
}
}
}
else
#endif
{
for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
{
for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
{
if (PIXELBYTES == 2)
{
((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x];
}
else if (PIXELBYTES == 4)
{
((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x];
}
}
}
dst = dstLinePtr + (lineWidth * PIXELBYTES);
for (size_t line = 1; line < lineCount; line++)
{
memcpy(dst, dstLinePtr, lineWidth * PIXELBYTES);
dst += (lineWidth * PIXELBYTES);
}
}
}
}
@ -1598,6 +1700,32 @@ void GPUEngineBase::_LineLayerIDCopy(u8 *__restrict dstBuffer, const u8 *__restr
}
}
template <NDSColorFormat OUTPUTFORMAT>
void GPUEngineBase::_TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo)
{
if (this->isLineRenderNative[compInfo.line.indexNative])
{
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
this->_LineColorCopy<false, true, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
case NDSColorFormat_BGR666_Rev:
case NDSColorFormat_BGR888_Rev:
this->_LineColorCopy<false, true, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
}
this->_LineLayerIDCopy<false, true>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative);
compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
this->isLineRenderNative[compInfo.line.indexNative] = false;
this->nativeLineRenderCount--;
}
}
/*****************************************************************************/
// PIXEL RENDERING
/*****************************************************************************/
@ -1635,9 +1763,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo,
return;
}
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
if (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT)
if (COLOREFFECTDISABLEDHINT)
{
switch (OUTPUTFORMAT)
{
@ -1665,6 +1791,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo,
const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID];
bool forceBlendEffect = false;
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
if (ISSRCLAYEROBJ && enableColorEffect)
{
//translucent-capable OBJ are forcing the function to blend when the second target is satisfied
@ -1833,9 +1961,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo,
return;
}
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
if (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT)
if (COLOREFFECTDISABLEDHINT)
{
dstColor32 = srcColor32;
dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
@ -1849,6 +1975,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo,
const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID];
bool forceBlendEffect = false;
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
if (ISSRCLAYEROBJ && enableColorEffect)
{
//translucent-capable OBJ are forcing the function to blend when the second target is satisfied
@ -1981,7 +2109,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(GPUEngineCompositorInfo &com
enableColorEffectMask = _mm_set1_epi8(0xFF);
}
if ( (!ISSRCLAYEROBJ && COLOREFFECTDISABLEDHINT) || (_mm_movemask_epi8(srcEffectEnableMask) == 0) )
if (COLOREFFECTDISABLEDHINT)
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
@ -2152,13 +2280,31 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(GPUEngineCompositorInfo &com
// We can't unify this yet because the output framebuffer is in RGBA5551, but the 3D source pixels are in RGBA6665.
// However, GPUEngineBase::_RenderPixel() takes source pixels in RGB555. In order to unify the methods, all pixels
// must be processed in RGBA6665.
template<NDSColorFormat OUTPUTFORMAT>
template<NDSColorFormat OUTPUTFORMAT, bool COLOREFFECTDISABLEDHINT>
FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32)
{
u16 &dstColor16 = *compInfo.target.lineColor16;
FragmentColor &dstColor32 = *compInfo.target.lineColor32;
u8 &dstLayerID = *compInfo.target.lineLayerID;
if (COLOREFFECTDISABLEDHINT)
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
dstColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
dstColor16 |= 0x8000;
}
else
{
dstColor32 = srcColor32;
dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
}
dstLayerID = GPULayerID_BG0;
return;
}
const bool dstEffectEnable = (dstLayerID != GPULayerID_BG0) && compInfo.renderState.dstBlendEnable[dstLayerID];
// 3D rendering has a special override: If the destination pixel is set to blend, then always blend.
@ -2250,7 +2396,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo
#ifdef ENABLE_SSE2
template <NDSColorFormat OUTPUTFORMAT>
template <NDSColorFormat OUTPUTFORMAT, bool COLOREFFECTDISABLEDHINT>
FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo,
const __m128i &passMask8,
const __m128i &enableColorEffectMask,
@ -2266,23 +2412,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &com
_mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
__m128i dstEffectEnableMask;
__m128i forceBlendEffectMask;
#ifdef ENABLE_SSSE3
dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID);
dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) );
#else
dstEffectEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]);
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) );
#endif
dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask );
const __m128i forceBlendEffectMask = dstEffectEnableMask;
if (_mm_movemask_epi8( _mm_or_si128(forceBlendEffectMask, enableColorEffectMask) ) == 0)
if (COLOREFFECTDISABLEDHINT)
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
@ -2301,6 +2433,21 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(GPUEngineCompositorInfo &com
return;
}
#ifdef ENABLE_SSSE3
dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID);
dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) );
#else
dstEffectEnableMask = _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]);
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) );
dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) );
#endif
dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), dstEffectEnableMask );
forceBlendEffectMask = dstEffectEnableMask;
__m128i tmpSrc[4];
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
@ -3214,26 +3361,9 @@ void GPUEngineBase::_RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, co
}
else
{
if (this->isLineRenderNative[compInfo.line.indexNative] && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested))
if ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)
{
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
this->_LineColorCopy<false, true, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
case NDSColorFormat_BGR666_Rev:
case NDSColorFormat_BGR888_Rev:
this->_LineColorCopy<false, true, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
}
this->_LineLayerIDCopy<false, true>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative);
compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
this->isLineRenderNative[compInfo.line.indexNative] = false;
this->nativeLineRenderCount--;
this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
}
}
break;
@ -4007,7 +4137,16 @@ void GPUEngineBase::_RenderLine_Layers(const size_t l)
{
if ( (layerID == GPULayerID_BG0) && GPU->GetEngineMain()->WillRender3DLayer() )
{
GPU->GetEngineMain()->RenderLine_Layer3D<OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
if ( !compInfo.renderState.dstAnyBlendEnable && ( (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
!compInfo.renderState.srcBlendEnable[GPULayerID_BG0] ||
(((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
{
GPU->GetEngineMain()->RenderLine_Layer3D<OUTPUTFORMAT, WILLPERFORMWINDOWTEST, true>(compInfo);
}
else
{
GPU->GetEngineMain()->RenderLine_Layer3D<OUTPUTFORMAT, WILLPERFORMWINDOWTEST, false>(compInfo);
}
continue;
}
}
@ -4029,7 +4168,17 @@ void GPUEngineBase::_RenderLine_Layers(const size_t l)
{
compInfo.renderState.selectedLayerID = GPULayerID_OBJ;
compInfo.renderState.selectedBGLayer = NULL;
this->_RenderLine_LayerOBJ<OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
if ( !compInfo.renderState.dstAnyBlendEnable && ( (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
!compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] ||
(((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
{
this->_RenderLine_LayerOBJ<OUTPUTFORMAT, WILLPERFORMWINDOWTEST, true>(compInfo, item);
}
else
{
this->_RenderLine_LayerOBJ<OUTPUTFORMAT, WILLPERFORMWINDOWTEST, false>(compInfo, item);
}
}
}
}
@ -4062,7 +4211,7 @@ void GPUEngineBase::_RenderLine_SetupSprites(GPUEngineCompositorInfo &compInfo)
}
}
template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT>
void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item)
{
bool useCustomVRAM = false;
@ -4084,26 +4233,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item
}
}
if (this->isLineRenderNative[compInfo.line.indexNative] && useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested))
if (useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested))
{
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
this->_LineColorCopy<false, true, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
case NDSColorFormat_BGR666_Rev:
case NDSColorFormat_BGR888_Rev:
this->_LineColorCopy<false, true, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
}
this->_LineLayerIDCopy<false, true>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative);
compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
this->isLineRenderNative[compInfo.line.indexNative] = false;
this->nativeLineRenderCount--;
this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
}
if (this->isLineRenderNative[compInfo.line.indexNative])
@ -4122,9 +4254,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item
compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, false>(compInfo,
vramColorPtr[srcX],
this->_sprAlpha[srcX]);
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, COLOREFFECTDISABLEDHINT>(compInfo,
vramColorPtr[srcX],
this->_sprAlpha[srcX]);
}
}
else
@ -4139,9 +4271,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item
compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, false>(compInfo,
this->_sprColor[srcX],
this->_sprAlpha[srcX]);
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, COLOREFFECTDISABLEDHINT>(compInfo,
this->_sprColor[srcX],
this->_sprAlpha[srcX]);
}
}
}
@ -4177,16 +4309,16 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item
if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
FragmentColor *vramColor = (FragmentColor *)vramColorPtr;
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, false>(compInfo,
vramColor[dstX],
this->_sprAlpha[srcX]);
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, COLOREFFECTDISABLEDHINT>(compInfo,
vramColor[dstX],
this->_sprAlpha[srcX]);
}
else
{
u16 *vramColor = (u16 *)vramColorPtr;
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, false>(compInfo,
vramColor[dstX],
this->_sprAlpha[srcX]);
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, COLOREFFECTDISABLEDHINT>(compInfo,
vramColor[dstX],
this->_sprAlpha[srcX]);
}
}
}
@ -4218,9 +4350,9 @@ void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, item
compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX;
compInfo.target.lineLayerID = dstLayerIDPtr + dstX;
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, false>(compInfo,
this->_sprColor[srcX],
this->_sprAlpha[srcX]);
this->_RenderPixel<OUTPUTFORMAT, true, false, WILLPERFORMWINDOWTEST, COLOREFFECTDISABLEDHINT>(compInfo,
this->_sprColor[srcX],
this->_sprAlpha[srcX]);
}
}
@ -4745,7 +4877,10 @@ template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER, bool MOSAIC, bool WIL
FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo)
{
#ifndef DISABLE_COLOREFFECTDISABLEHINT
if (compInfo.renderState.colorEffect == ColorEffect_Disable)
if ( (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
!compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] ||
((compInfo.renderState.colorEffect == ColorEffect_Blend) && !compInfo.renderState.dstAnyBlendEnable) ||
(((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) )
{
this->_RenderLine_LayerBG_ApplyColorEffectDisabledHint<OUTPUTFORMAT, ISDEBUGRENDER, MOSAIC, WILLPERFORMWINDOWTEST, true, ISCUSTOMRENDERINGNEEDED>(compInfo);
}
@ -4968,6 +5103,13 @@ void GPUEngineBase::ParseReg_BLDCNT()
renderState.dstBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target2 != 0);
renderState.dstBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target2 != 0);
renderState.dstAnyBlendEnable = renderState.dstBlendEnable[GPULayerID_BG0] ||
renderState.dstBlendEnable[GPULayerID_BG1] ||
renderState.dstBlendEnable[GPULayerID_BG2] ||
renderState.dstBlendEnable[GPULayerID_BG3] ||
renderState.dstBlendEnable[GPULayerID_OBJ] ||
renderState.dstBlendEnable[GPULayerID_Backdrop];
#ifdef ENABLE_SSE2
const __m128i one_vec128 = _mm_set1_epi8(1);
@ -5609,7 +5751,7 @@ void GPUEngineA::RenderLine(const size_t l)
}
}
template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT>
void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
{
const FragmentColor *__restrict framebuffer3D = CurrentRenderer->GetFramebuffer();
@ -5618,26 +5760,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
return;
}
if (this->isLineRenderNative[compInfo.line.indexNative] && !CurrentRenderer->IsFramebufferNativeSize())
if (!CurrentRenderer->IsFramebufferNativeSize())
{
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
this->_LineColorCopy<false, true, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
case NDSColorFormat_BGR666_Rev:
case NDSColorFormat_BGR888_Rev:
this->_LineColorCopy<false, true, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, compInfo.line.indexNative);
break;
}
this->_LineLayerIDCopy<false, true>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, compInfo.line.indexNative);
compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
this->isLineRenderNative[compInfo.line.indexNative] = false;
this->nativeLineRenderCount--;
this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
}
const float customWidthScale = (float)compInfo.line.widthCustom / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH;
@ -5716,12 +5841,12 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
dst[3] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 3);
}
this->_RenderPixel3D_SSE2<OUTPUTFORMAT>(compInfo,
passMask8,
enableColorEffectMask,
src[3], src[2], src[1], src[0],
dst[3], dst[2], dst[1], dst[0],
dstLayerID_vec128);
this->_RenderPixel3D_SSE2<OUTPUTFORMAT, COLOREFFECTDISABLEDHINT>(compInfo,
passMask8,
enableColorEffectMask,
src[3], src[2], src[1], src[0],
dst[3], dst[2], dst[1], dst[0],
dstLayerID_vec128);
_mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]);
_mm_store_si128((__m128i *)*compInfo.target.lineColor + 1, dst[1]);
@ -5748,9 +5873,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
this->_RenderPixel3D<OUTPUTFORMAT>(compInfo,
enableColorEffect,
*srcLinePtr);
this->_RenderPixel3D<OUTPUTFORMAT, COLOREFFECTDISABLEDHINT>(compInfo,
enableColorEffect,
*srcLinePtr);
}
}
}
@ -5779,9 +5904,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom];
const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
this->_RenderPixel3D<OUTPUTFORMAT>(compInfo,
enableColorEffect,
srcLinePtr[srcX]);
this->_RenderPixel3D<OUTPUTFORMAT, COLOREFFECTDISABLEDHINT>(compInfo,
enableColorEffect,
srcLinePtr[srcX]);
}
srcLinePtr += compInfo.line.widthCustom;

View File

@ -1194,6 +1194,7 @@ typedef struct
__m128i dstBlendEnable_SSE2[6];
#endif
#endif // ENABLE_SSE2
bool dstAnyBlendEnable;
u8 WIN0_enable[6];
u8 WIN1_enable[6];
@ -1345,6 +1346,7 @@ protected:
template<bool NATIVEDST, bool NATIVESRC, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t PIXELBYTES> void _LineColorCopy(void *__restrict dstBuffer, const void *__restrict srcBuffer, const size_t l);
template<bool NATIVEDST, bool NATIVESRC> void _LineLayerIDCopy(u8 *__restrict dstBuffer, const u8 *__restrict srcBuffer, const size_t l);
template<NDSColorFormat OUTPUTFORMAT> void _TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo);
void _MosaicSpriteLinePixel(GPUEngineCompositorInfo &compInfo, const size_t x, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
void _MosaicSpriteLine(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
@ -1382,11 +1384,11 @@ protected:
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo);
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER, bool WILLPERFORMWINDOWTEST, bool ISCUSTOMRENDERINGNEEDED> void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo);
template<NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item);
template<NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT> void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item);
template<NDSColorFormat OUTPUTFORMAT, bool ISSRCLAYEROBJ, bool ISDEBUGRENDER, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT> FORCEINLINE void _RenderPixel(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 srcAlpha);
template<NDSColorFormat OUTPUTFORMAT, bool ISSRCLAYEROBJ, bool ISDEBUGRENDER, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT> FORCEINLINE void _RenderPixel(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 srcAlpha);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32);
template<NDSColorFormat OUTPUTFORMAT, bool COLOREFFECTDISABLEDHINT> FORCEINLINE void _RenderPixel3D(GPUEngineCompositorInfo &compInfo, const bool enableColorEffect, const FragmentColor srcColor32);
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB);
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable);
@ -1408,7 +1410,7 @@ protected:
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY);
template<bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel_CheckWindows16_SSE2(GPUEngineCompositorInfo &compInfo, const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISSRCLAYEROBJ, bool ISDEBUGRENDER, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &srcAlpha, const __m128i &srcEffectEnableMask, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID, __m128i &passMask8);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &enableColorEffectMask, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT, bool COLOREFFECTDISABLEDHINT> FORCEINLINE void _RenderPixel3D_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &enableColorEffectMask, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
#endif
template<bool ISDEBUGRENDER> void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
@ -1582,7 +1584,7 @@ public:
void ResetCaptureLineStates();
template<NDSColorFormat OUTPUTFORMAT> void RenderLine(const size_t l);
template<NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo);
template<NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST, bool COLOREFFECTDISABLEDHINT> void RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo);
};
class GPUEngineB : public GPUEngineBase