- Replace _mm_set1_epi64x() with _mm_set1_epi32() where appropriate.
- Complete GPUEngineBase::_RenderPixel_SSE2() method.
- Fix potential bug with window checks in GPUEngineBase::_RenderPixel3D_SSE2().
- Do some minor code cleanup.
This commit is contained in:
rogerman 2016-01-09 23:12:41 +00:00
parent a1280789db
commit 0ff9773eaf
2 changed files with 93 additions and 88 deletions

View File

@ -1241,7 +1241,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel_CheckWindows_SSE2(const size_t dstX
break; break;
} }
winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF) ); winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF) );
didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(didPassValue)) ); didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(didPassValue)) );
enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(this->_IORegisterMap->WINOUT.Effect_Enable)) ); enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, _mm_set1_epi8(this->_IORegisterMap->WINOUT.Effect_Enable)) );
} }
@ -1389,33 +1389,39 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(const size_t srcX, const u16 src, c
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
template <GPULayerID LAYERID, bool ISDEBUGRENDER, bool ISCUSTOMRENDERINGNEEDED> template <GPULayerID LAYERID, bool ISDEBUGRENDER, bool ISCUSTOMRENDERINGNEEDED>
FORCEINLINE void GPUEngineBase::_RenderPixel_SSE2(const size_t srcX, const u16 *__restrict src, const u8 *__restrict srcAlpha, u16 *__restrict dstColorLine, u8 *__restrict dstLayerIDLine) FORCEINLINE void GPUEngineBase::_RenderPixel_SSE2(const size_t dstX,
const __m128i &srcColorHi_vec128,
const __m128i &srcColorLo_vec128,
const __m128i &srcOpaqueMask,
const u8 *__restrict srcAlpha,
u16 *__restrict dstColorLine,
u8 *__restrict dstLayerIDLine)
{ {
const __m128i dstColorLo_vec128 = _mm_loadu_si128((__m128i *)dstColorLine);
const __m128i dstColorHi_vec128 = _mm_loadu_si128((__m128i *)(dstColorLine + 8));
const __m128i dstLayerID_vec128 = _mm_loadu_si128((__m128i *)dstLayerIDLine);
const __m128i srcOpaqueMaskLo = _mm_cmpeq_epi16( _mm_unpacklo_epi8(srcOpaqueMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF));
const __m128i srcOpaqueMaskHi = _mm_cmpeq_epi16( _mm_unpackhi_epi8(srcOpaqueMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF));
if (ISDEBUGRENDER) if (ISDEBUGRENDER)
{ {
// If we're rendering pixels to a debugging context, then assume that the pixel // If we're rendering pixels to a debugging context, then assume that the pixel
// always passes the window test and that the color effect is always disabled. // always passes the window test and that the color effect is always disabled.
_mm_store_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_load_si128((__m128i *)src), _mm_set1_epi16(0x8000)) ); _mm_storeu_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(srcOpaqueMaskLo, _mm_or_si128(srcColorLo_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(srcOpaqueMaskLo, dstColorLo_vec128)) );
_mm_store_si128( (__m128i *)(dstColorLine + 8), _mm_or_si128(_mm_load_si128((__m128i *)(src + 8)), _mm_set1_epi16(0x8000)) ); _mm_storeu_si128( (__m128i *)(dstColorLine + 8), _mm_or_si128(_mm_and_si128(srcOpaqueMaskHi, _mm_or_si128(srcColorHi_vec128, _mm_set1_epi16(0x8000))), _mm_andnot_si128(srcOpaqueMaskHi, dstColorHi_vec128)) );
_mm_store_si128( (__m128i *)dstLayerIDLine, _mm_set1_epi8(LAYERID) ); _mm_storeu_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(srcOpaqueMask, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(srcOpaqueMask, dstLayerID_vec128)) );
return; return;
} }
__m128i srcColorLo_vec128 = _mm_load_si128((__m128i *)src);
__m128i srcColorHi_vec128 = _mm_load_si128((__m128i *)(src + 8));
const __m128i dstColorLo_vec128 = _mm_load_si128((__m128i *)dstColorLine);
const __m128i dstColorHi_vec128 = _mm_load_si128((__m128i *)(dstColorLine + 8));
const __m128i dstLayerID_vec128 = _mm_load_si128((__m128i *)dstLayerIDLine);
// Do the window test. // Do the window test.
__m128i didPassWindowTest = _mm_set1_epi8(1); __m128i didPassWindowTest = _mm_set1_epi8(1);
__m128i enableColorEffect = _mm_set1_epi8(1); __m128i enableColorEffect = _mm_set1_epi8(1);
this->_RenderPixel_CheckWindows_SSE2<LAYERID, ISCUSTOMRENDERINGNEEDED>(srcX, didPassWindowTest, enableColorEffect); this->_RenderPixel_CheckWindows_SSE2<LAYERID, ISCUSTOMRENDERINGNEEDED>(dstX, didPassWindowTest, enableColorEffect);
const __m128i passedWindowTestMaskLo = _mm_cmpeq_epi16( _mm_unpacklo_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1) ); const __m128i passedWindowTestMaskLo = _mm_and_si128( srcOpaqueMaskLo, _mm_cmpeq_epi16(_mm_unpacklo_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) );
const __m128i passedWindowTestMaskHi = _mm_cmpeq_epi16( _mm_unpackhi_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1) ); const __m128i passedWindowTestMaskHi = _mm_and_si128( srcOpaqueMaskHi, _mm_cmpeq_epi16(_mm_unpackhi_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) );
const __m128i passedWindowTestLayerID = _mm_packus_epi16(passedWindowTestMaskLo, passedWindowTestMaskHi); const __m128i passedWindowTestLayerID = _mm_and_si128( srcOpaqueMask, _mm_packs_epi16(passedWindowTestMaskLo, passedWindowTestMaskHi) );
const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT; const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT;
u8 srcEffectEnableValue; u8 srcEffectEnableValue;
@ -1464,8 +1470,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel_SSE2(const size_t srcX, const u16 *
this->_blend2[dstLayerIDLine[ 1]], this->_blend2[dstLayerIDLine[ 1]],
this->_blend2[dstLayerIDLine[ 0]]); this->_blend2[dstLayerIDLine[ 0]]);
dstEffectEnableMask = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi8(dstLayerID_vec128, _mm_set1_epi8(LAYERID)), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF)), dstEffectEnableMask = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi8(dstLayerID_vec128, _mm_set1_epi8(LAYERID)), _mm_set1_epi32(0xFFFFFFFF)),
_mm_xor_si128(_mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF)) ); _mm_xor_si128(_mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF)) );
// Select the color effect based on the BLDCNT target flags. // Select the color effect based on the BLDCNT target flags.
const __m128i enableColorEffectMask = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1)); const __m128i enableColorEffectMask = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1));
@ -1478,12 +1484,12 @@ FORCEINLINE void GPUEngineBase::_RenderPixel_SSE2(const size_t srcX, const u16 *
if (LAYERID == GPULayerID_OBJ) if (LAYERID == GPULayerID_OBJ)
{ {
const __m128i objMode_vec128 = _mm_load_si128((__m128i *)(this->_sprType + srcX)); const __m128i objMode_vec128 = _mm_loadu_si128((__m128i *)(this->_sprType + dstX));
const __m128i isObjTranslucentMask = _mm_and_si128( dstEffectEnableMask, _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) ); const __m128i isObjTranslucentMask = _mm_and_si128( dstEffectEnableMask, _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) );
forceBlendEffectMask = isObjTranslucentMask; forceBlendEffectMask = isObjTranslucentMask;
const __m128i srcAlpha_vec128 = _mm_load_si128((__m128i *)(srcAlpha + srcX)); const __m128i srcAlpha_vec128 = _mm_loadu_si128((__m128i *)(srcAlpha + dstX));
const __m128i srcAlphaMask = _mm_and_si128( isObjTranslucentMask, _mm_xor_si128(_mm_cmpeq_epi8(srcAlpha_vec128, _mm_set1_epi8(0xFF)), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF)) ); const __m128i srcAlphaMask = _mm_and_si128( isObjTranslucentMask, _mm_xor_si128(_mm_cmpeq_epi8(srcAlpha_vec128, _mm_set1_epi8(0xFF)), _mm_set1_epi32(0xFFFFFFFF)) );
eva_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, srcAlpha_vec128), _mm_andnot_si128(srcAlphaMask, eva_vec128) ); eva_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, srcAlpha_vec128), _mm_andnot_si128(srcAlphaMask, eva_vec128) );
evb_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, _mm_sub_epi8(_mm_set1_epi8(16), srcAlpha_vec128)), _mm_andnot_si128(srcAlphaMask, evb_vec128) ); evb_vec128 = _mm_or_si128( _mm_and_si128(srcAlphaMask, _mm_sub_epi8(_mm_set1_epi8(16), srcAlpha_vec128)), _mm_andnot_si128(srcAlphaMask, evb_vec128) );
@ -1516,17 +1522,17 @@ FORCEINLINE void GPUEngineBase::_RenderPixel_SSE2(const size_t srcX, const u16 *
const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend(srcColorLo_vec128, dstColorLo_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend(srcColorLo_vec128, dstColorLo_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend(srcColorHi_vec128, dstColorHi_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend(srcColorHi_vec128, dstColorHi_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF) ); const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi32(0xFFFFFFFF) );
const __m128i disablePixelsLo = _mm_and_si128( srcColorLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i disablePixelsLo = _mm_and_si128( srcColorLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i disablePixelsHi = _mm_and_si128( srcColorHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i disablePixelsHi = _mm_and_si128( srcColorHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
// Combine the final colors. // Combine the final colors.
srcColorLo_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsLo, blendPixelsLo), disablePixelsLo), _mm_set1_epi16(0x8000) ); const __m128i combinedSrcColorLo_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsLo, blendPixelsLo), disablePixelsLo), _mm_set1_epi16(0x8000) );
srcColorHi_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsHi, blendPixelsHi), disablePixelsHi), _mm_set1_epi16(0x8000) ); const __m128i combinedSrcColorHi_vec128 = _mm_or_si128( _mm_or_si128(_mm_or_si128(brightnessPixelsHi, blendPixelsHi), disablePixelsHi), _mm_set1_epi16(0x8000) );
_mm_store_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(passedWindowTestMaskLo, srcColorLo_vec128), _mm_andnot_si128(passedWindowTestMaskLo, dstColorLo_vec128)) ); _mm_storeu_si128( (__m128i *)dstColorLine, _mm_or_si128(_mm_and_si128(passedWindowTestMaskLo, combinedSrcColorLo_vec128), _mm_andnot_si128(passedWindowTestMaskLo, dstColorLo_vec128)) );
_mm_store_si128( (__m128i *)(dstColorLine + 8), _mm_or_si128(_mm_and_si128(passedWindowTestMaskHi, srcColorHi_vec128), _mm_andnot_si128(passedWindowTestMaskHi, dstColorHi_vec128)) ); _mm_storeu_si128( (__m128i *)(dstColorLine + 8), _mm_or_si128(_mm_and_si128(passedWindowTestMaskHi, combinedSrcColorHi_vec128), _mm_andnot_si128(passedWindowTestMaskHi, dstColorHi_vec128)) );
_mm_store_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passedWindowTestLayerID, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passedWindowTestLayerID, dstLayerID_vec128)) ); _mm_storeu_si128( (__m128i *)dstLayerIDLine, _mm_or_si128(_mm_and_si128(passedWindowTestLayerID, _mm_set1_epi8(LAYERID)), _mm_andnot_si128(passedWindowTestLayerID, dstLayerID_vec128)) );
} }
#endif #endif
@ -1644,7 +1650,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const size_t dstX,
const __m128i srcAlphaLo_vec128 = _mm_cmpgt_epi16( _mm_packs_epi32( _mm_srli_epi32(_mm_and_si128(srcColor0, _mm_set1_epi32(0xFF000000)), 24), _mm_srli_epi32(_mm_and_si128(srcColor1, _mm_set1_epi32(0xFF000000)), 24) ), _mm_setzero_si128() ); const __m128i srcAlphaLo_vec128 = _mm_cmpgt_epi16( _mm_packs_epi32( _mm_srli_epi32(_mm_and_si128(srcColor0, _mm_set1_epi32(0xFF000000)), 24), _mm_srli_epi32(_mm_and_si128(srcColor1, _mm_set1_epi32(0xFF000000)), 24) ), _mm_setzero_si128() );
const __m128i srcAlphaHi_vec128 = _mm_cmpgt_epi16( _mm_packs_epi32( _mm_srli_epi32(_mm_and_si128(srcColor2, _mm_set1_epi32(0xFF000000)), 24), _mm_srli_epi32(_mm_and_si128(srcColor3, _mm_set1_epi32(0xFF000000)), 24) ), _mm_setzero_si128() ); const __m128i srcAlphaHi_vec128 = _mm_cmpgt_epi16( _mm_packs_epi32( _mm_srli_epi32(_mm_and_si128(srcColor2, _mm_set1_epi32(0xFF000000)), 24), _mm_srli_epi32(_mm_and_si128(srcColor3, _mm_set1_epi32(0xFF000000)), 24) ), _mm_setzero_si128() );
const __m128i srcAlphaLayerID_vec128 = _mm_packus_epi16(srcAlphaLo_vec128, srcAlphaHi_vec128); const __m128i srcAlphaLayerID_vec128 = _mm_packs_epi16(srcAlphaLo_vec128, srcAlphaHi_vec128);
const __m128i dstColorLo_vec128 = _mm_load_si128((__m128i *)dstColorLine); const __m128i dstColorLo_vec128 = _mm_load_si128((__m128i *)dstColorLine);
const __m128i dstColorHi_vec128 = _mm_load_si128((__m128i *)(dstColorLine + 8)); const __m128i dstColorHi_vec128 = _mm_load_si128((__m128i *)(dstColorLine + 8));
@ -1657,7 +1663,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const size_t dstX,
const __m128i passedWindowTestMaskLo = _mm_and_si128( srcAlphaLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) ); const __m128i passedWindowTestMaskLo = _mm_and_si128( srcAlphaLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) );
const __m128i passedWindowTestMaskHi = _mm_and_si128( srcAlphaHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) ); const __m128i passedWindowTestMaskHi = _mm_and_si128( srcAlphaHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(didPassWindowTest, _mm_setzero_si128()), _mm_set1_epi16(1)) );
const __m128i passedWindowTestLayerID = _mm_and_si128( srcAlphaLayerID_vec128, _mm_packus_epi16(passedWindowTestMaskLo, passedWindowTestMaskHi) ); const __m128i passedWindowTestLayerID = _mm_and_si128( srcAlphaLayerID_vec128, _mm_packs_epi16(passedWindowTestMaskLo, passedWindowTestMaskHi) );
const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT; const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT;
const __m128i srcEffectEnableMask = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target1), _mm_set1_epi8(1)); const __m128i srcEffectEnableMask = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target1), _mm_set1_epi8(1));
@ -1678,8 +1684,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const size_t dstX,
this->_blend2[dstLayerIDLine[ 1]], this->_blend2[dstLayerIDLine[ 1]],
this->_blend2[dstLayerIDLine[ 0]]); this->_blend2[dstLayerIDLine[ 0]]);
dstEffectEnableMask = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi8(dstLayerID_vec128, _mm_set1_epi8(GPULayerID_BG0)), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF)), dstEffectEnableMask = _mm_and_si128(_mm_xor_si128(_mm_cmpeq_epi8(dstLayerID_vec128, _mm_set1_epi8(GPULayerID_BG0)), _mm_set1_epi32(0xFFFFFFFF)),
_mm_xor_si128(_mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF)) ); _mm_xor_si128(_mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF)) );
// Select the color effect based on the BLDCNT target flags. // Select the color effect based on the BLDCNT target flags.
const __m128i enableColorEffectMask = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1)); const __m128i enableColorEffectMask = _mm_cmpeq_epi8(enableColorEffect, _mm_set1_epi8(1));
@ -1714,7 +1720,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D_SSE2(const size_t dstX,
const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend3D(srcColor0, srcColor1, dstColorLo_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend3D(srcColor0, srcColor1, dstColorLo_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend3D(srcColor2, srcColor3, dstColorHi_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend3D(srcColor2, srcColor3, dstColorHi_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF) ); const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi32(0xFFFFFFFF) );
const __m128i disablePixelsLo = _mm_and_si128( srcColorLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i disablePixelsLo = _mm_and_si128( srcColorLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i disablePixelsHi = _mm_and_si128( srcColorHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) ); const __m128i disablePixelsHi = _mm_and_si128( srcColorHi_vec128, _mm_cmpeq_epi16(_mm_unpackhi_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
@ -1866,13 +1872,15 @@ void GPUEngineBase::_rot_scale_op(u16 *__restrict dstColorLine, const u16 lineIn
IOREG_BGnY y = param.BGnY; IOREG_BGnY y = param.BGnY;
const s32 dx = (s32)param.BGnPA.value; const s32 dx = (s32)param.BGnPA.value;
const s32 dy = (s32)param.BGnPC.value; const s32 dy = (s32)param.BGnPC.value;
const s32 wmask = wh - 1;
const s32 hmask = ht - 1;
// as an optimization, specially handle the fairly common case of // as an optimization, specially handle the fairly common case of
// "unrotated + unscaled + no boundary checking required" // "unrotated + unscaled + no boundary checking required"
if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0) if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0)
{ {
s32 auxX = (WRAP) ? x.Integer & (wh-1) : x.Integer; s32 auxX = (WRAP) ? (x.Integer & wmask) : x.Integer;
const s32 auxY = (WRAP) ? y.Integer & (ht-1) : y.Integer; const s32 auxY = (WRAP) ? (y.Integer & hmask) : y.Integer;
if (WRAP || (auxX + LG < wh && auxX >= 0 && auxY < ht && auxY >= 0)) if (WRAP || (auxX + LG < wh && auxX >= 0 && auxY < ht && auxY >= 0))
{ {
@ -1882,7 +1890,7 @@ void GPUEngineBase::_rot_scale_op(u16 *__restrict dstColorLine, const u16 lineIn
auxX++; auxX++;
if (WRAP) if (WRAP)
auxX = auxX & (wh-1); auxX = auxX & wmask;
} }
return; return;
@ -1891,8 +1899,8 @@ void GPUEngineBase::_rot_scale_op(u16 *__restrict dstColorLine, const u16 lineIn
for (size_t i = 0; i < LG; i++, x.value += dx, y.value += dy) for (size_t i = 0; i < LG; i++, x.value += dx, y.value += dy)
{ {
const s32 auxX = (WRAP) ? x.Integer & (wh-1) : x.Integer; const s32 auxX = (WRAP) ? (x.Integer & wmask) : x.Integer;
const s32 auxY = (WRAP) ? y.Integer & (ht-1) : y.Integer; const s32 auxY = (WRAP) ? (y.Integer & hmask) : y.Integer;
if (WRAP || ((auxX >= 0) && (auxX < wh) && (auxY >= 0) && (auxY < ht))) if (WRAP || ((auxX >= 0) && (auxX < wh) && (auxY >= 0) && (auxY < ht)))
fun(this, dstColorLine, lineIndex, auxX, auxY, wh, map, tile, pal, i); fun(this, dstColorLine, lineIndex, auxX, auxY, wh, map, tile, pal, i);
@ -1911,58 +1919,58 @@ void GPUEngineBase::_apply_rot_fun(u16 *__restrict dstColorLine, const u16 lineI
this->_rot_scale_op<fun, false>(dstColorLine, lineIndex, param, LG, wh, ht, map, tile, pal); this->_rot_scale_op<fun, false>(dstColorLine, lineIndex, param, LG, wh, ht, map, tile, pal);
} }
// Reads the 16-bit tile map entry for the 8-pixel-wide tile column that
// contains xOffset.
//
// tileMapAddress  - base address of the tile map row to read from.
// xOffset         - horizontal pixel offset; it is masked with layerWidthMask
//                   before being converted to a tile column.
// layerWidthMask  - mask applied to xOffset (callers in this file pass the
//                   layer width minus one).
//
// Returns the entry with its raw value passed through LOCAL_TO_LE_16
// (presumably a byte-order conversion -- confirm against its definition).
TILEENTRY GPUEngineBase::_GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask)
{
	// Each tile column covers 8 pixels, hence the shift by 3.
	const u16 tileColumn = (xOffset & layerWidthMask) >> 3;
	
	// Each entry is 2 bytes, and only the low 5 bits of the column index into
	// a 32-entry row. Columns 32 and above are read from 32*32*2 bytes further on.
	const u32 extraBlockOffset = (tileColumn > 31) ? (32*32*2) : 0;
	const u32 entryAddress = tileMapAddress + ((tileColumn & 0x1F) * 2) + extraBlockOffset;
	
	TILEENTRY entry;
	entry.val = LOCAL_TO_LE_16( *(u16 *)MMU_gpu_map(entryAddress) );
	
	return entry;
}
/*****************************************************************************/ /*****************************************************************************/
// BACKGROUND RENDERING -TEXT- // BACKGROUND RENDERING -TEXT-
/*****************************************************************************/ /*****************************************************************************/
// render a text background to the combined pixelbuffer // render a text background to the combined pixelbuffer
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED>
void GPUEngineBase::_RenderLine_TextBG(u16 *__restrict dstColorLine, const u16 lineIndex, u16 XBG, u16 YBG, u16 LG) void GPUEngineBase::_RenderLine_TextBG(u16 *__restrict dstColorLine, const u16 lineIndex, const u16 XBG, const u16 YBG, const u16 LG)
{ {
const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT; const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
const u16 lg = this->_BGLayer[LAYERID].size.width; const u16 lg = this->_BGLayer[LAYERID].size.width;
const u16 ht = this->_BGLayer[LAYERID].size.height; const u16 ht = this->_BGLayer[LAYERID].size.height;
const u16 wmask = (lg-1); const u32 tile = this->_BGLayer[LAYERID].tileEntryAddress;
const u16 hmask = (ht-1); const u16 wmask = lg - 1;
u16 tmp = ((YBG & hmask) >> 3); const u16 hmask = ht - 1;
u32 map;
u32 tile; u16 xoff = XBG;
u16 xoff;
u16 yoff; u16 yoff;
u32 xfin;
const u16 tmp = (YBG & hmask) >> 3;
s8 line_dir = 1; u32 map = this->_BGLayer[LAYERID].tileMapAddress + (tmp & 31) * 64;
u32 mapinfo;
TILEENTRY tileentry;
u32 tmp_map = this->_BGLayer[LAYERID].tileMapAddress + (tmp&31) * 64;
if (tmp > 31) if (tmp > 31)
tmp_map+= ADDRESS_STEP_512B << this->_BGLayer[LAYERID].BGnCNT.ScreenSize; map += ADDRESS_STEP_512B << this->_BGLayer[LAYERID].BGnCNT.ScreenSize;
map = tmp_map; u32 xfin;
tile = this->_BGLayer[LAYERID].tileEntryAddress; s8 line_dir = 1;
xoff = XBG;
if (this->_BGLayer[LAYERID].BGnCNT.PaletteMode == PaletteMode_16x16) // color: 16 palette entries if (this->_BGLayer[LAYERID].BGnCNT.PaletteMode == PaletteMode_16x16) // color: 16 palette entries
{ {
const u16 *__restrict pal = this->_paletteBG; const u16 *__restrict pal = this->_paletteBG;
yoff = (YBG & 0x0007) << 2;
yoff = ((YBG&7)<<2);
xfin = 8 - (xoff&7); xfin = 8 - (xoff&7);
for (size_t x = 0; x < LG; xfin = std::min<u16>(x+8, LG)) for (size_t x = 0; x < LG; xfin = std::min<u16>(x+8, LG))
{ {
u16 tilePalette = 0; const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
tmp = ((xoff&wmask)>>3); const u16 tilePalette = tileEntry.bits.Palette * 16;
mapinfo = map + (tmp&0x1F) * 2;
if(tmp>31) mapinfo += 32*32*2;
tileentry.val = LOCAL_TO_LE_16( *(u16 *)MMU_gpu_map(mapinfo) );
tilePalette = (tileentry.bits.Palette*16); u8 *__restrict line = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x20) + ((tileEntry.bits.VFlip) ? (7*4)-yoff : yoff));
u8 *__restrict line = (u8 *)MMU_gpu_map(tile + (tileentry.bits.TileNum * 0x20) + ((tileentry.bits.VFlip) ? (7*4)-yoff : yoff));
u8 offset = 0; u8 offset = 0;
if (tileentry.bits.HFlip) if (tileEntry.bits.HFlip)
{ {
line += ( 3 - ((xoff & 7) >> 1) ); line += ( 3 - ((xoff & 7) >> 1) );
for (; x < xfin; line--) for (; x < xfin; line--)
@ -2011,22 +2019,17 @@ void GPUEngineBase::_RenderLine_TextBG(u16 *__restrict dstColorLine, const u16 l
else //256-color BG else //256-color BG
{ {
const u16 *__restrict pal = (DISPCNT.ExBGxPalette_Enable) ? *(this->_BGLayer[LAYERID].extPalette) : this->_paletteBG; const u16 *__restrict pal = (DISPCNT.ExBGxPalette_Enable) ? *(this->_BGLayer[LAYERID].extPalette) : this->_paletteBG;
yoff = ((YBG&7)<<3);
xfin = 8 - (xoff&7);
const u32 extPalMask = -DISPCNT.ExBGxPalette_Enable; const u32 extPalMask = -DISPCNT.ExBGxPalette_Enable;
yoff = (YBG & 0x0007) << 3;
xfin = 8 - (xoff&7);
for (size_t x = 0; x < LG; xfin = std::min<u16>(x+8, LG)) for (size_t x = 0; x < LG; xfin = std::min<u16>(x+8, LG))
{ {
tmp = (xoff & (lg-1))>>3; const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
mapinfo = map + (tmp & 31) * 2; const u16 *__restrict tilePal = (u16 *)((u8 *)pal + ((tileEntry.bits.Palette<<9) & extPalMask));
if(tmp > 31) mapinfo += 32*32*2; u8 *__restrict line = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x40) + ((tileEntry.bits.VFlip) ? (7*8)-yoff : yoff));
tileentry.val = LOCAL_TO_LE_16( *(u16 *)MMU_gpu_map(mapinfo) );
const u16 *__restrict tilePal = (u16 *)((u8 *)pal + ((tileentry.bits.Palette<<9) & extPalMask)); if (tileEntry.bits.HFlip)
u8 *__restrict line = (u8 *)MMU_gpu_map(tile + (tileentry.bits.TileNum * 0x40) + ((tileentry.bits.VFlip) ? (7*8)-yoff : yoff));
if (tileentry.bits.HFlip)
{ {
line += (7 - (xoff&7)); line += (7 - (xoff&7));
line_dir = -1; line_dir = -1;
@ -2815,12 +2818,12 @@ void GPUEngineBase::ApplyMasterBrightness()
size_t i = 0; size_t i = 0;
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
__m128i intensity_vec128 = _mm_set1_epi16(intensity); const __m128i intensity_vec128 = _mm_set1_epi16(intensity);
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dst + i));
_mm_store_si128( (__m128i *)(dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)(dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif #endif
@ -2844,12 +2847,12 @@ void GPUEngineBase::ApplyMasterBrightness()
size_t i = 0; size_t i = 0;
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
__m128i intensity_vec128 = _mm_set1_epi16(intensity); const __m128i intensity_vec128 = _mm_set1_epi16(intensity);
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dst + i));
_mm_store_si128( (__m128i *)(dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)(dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif #endif

View File

@ -1198,7 +1198,9 @@ protected:
template<rot_fun fun, bool WRAP> void _rot_scale_op(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *__restrict pal); template<rot_fun fun, bool WRAP> void _rot_scale_op(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *__restrict pal);
template<GPULayerID LAYERID, rot_fun fun> void _apply_rot_fun(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG, const u32 map, const u32 tile, const u16 *__restrict pal); template<GPULayerID LAYERID, rot_fun fun> void _apply_rot_fun(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG, const u32 map, const u32 tile, const u16 *__restrict pal);
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _RenderLine_TextBG(u16 *__restrict dstColorLine, const u16 lineIndex, u16 XBG, u16 YBG, u16 LG); TILEENTRY _GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask);
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _RenderLine_TextBG(u16 *__restrict dstColorLine, const u16 lineIndex, const u16 XBG, const u16 YBG, const u16 LG);
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _RotBG2(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG); template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _RotBG2(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG);
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _ExtRotBG2(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG); template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool MOSAIC, bool ISCUSTOMRENDERINGNEEDED> void _ExtRotBG2(u16 *__restrict dstColorLine, const u16 lineIndex, const IOREG_BGnParameter &param, const u16 LG);
@ -1240,7 +1242,7 @@ protected:
FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY); FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY);
FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY); FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY);
template<GPULayerID LAYERID, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel_CheckWindows_SSE2(const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const; template<GPULayerID LAYERID, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel_CheckWindows_SSE2(const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const;
template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel_SSE2(const size_t srcX, const u16 *__restrict src, const u8 *__restrict srcAlpha, u16 *__restrict dstColorLine, u8 *__restrict dstLayerIDLine); template<GPULayerID LAYERID, bool ISDEBUGRENDER, bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel_SSE2(const size_t dstX, const __m128i &srcColorHi_vec128, const __m128i &srcColorLo_vec128, const __m128i &srcOpaqueMask, const u8 *__restrict srcAlpha, u16 *__restrict dstColorLine, u8 *__restrict dstLayerIDLine);
template<bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel3D_SSE2(const size_t srcX, const FragmentColor *__restrict src, u16 *__restrict dstColorLine, u8 *__restrict dstLayerIDLine); template<bool ISCUSTOMRENDERINGNEEDED> FORCEINLINE void _RenderPixel3D_SSE2(const size_t srcX, const FragmentColor *__restrict src, u16 *__restrict dstColorLine, u8 *__restrict dstLayerIDLine);
#endif #endif