- Reduce overall register contention in some color blending methods.
This commit is contained in:
rogerman 2016-06-21 20:30:52 +00:00
parent 03d8ee62aa
commit 4ae207fb03
1 changed file with 41 additions and 41 deletions

View File

@@ -591,12 +591,12 @@ void GPUEngineBase::_ResortBGLayers()
FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB) FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB)
{ {
u16 ra = (colA & 0x001F); u16 ra = colA & 0x001F;
u16 ga = (colA & 0x03E0) >> 5; u16 ga = (colA >> 5) & 0x001F;
u16 ba = (colA & 0x7C00) >> 10; u16 ba = (colA >> 10) & 0x001F;
u16 rb = (colB & 0x001F); u16 rb = colB & 0x001F;
u16 gb = (colB & 0x03E0) >> 5; u16 gb = (colB >> 5) & 0x001F;
u16 bb = (colB & 0x7C00) >> 10; u16 bb = (colB >> 10) & 0x001F;
ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16; ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16;
if (ra > 31) ra = 31; if (ra > 31) ra = 31;
@@ -717,9 +717,9 @@ FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col)
FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY) FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY)
{ {
u16 r = (col & 0x001F); u16 r = col & 0x001F;
u16 g = (col & 0x03E0) >> 5; u16 g = (col >> 5) & 0x001F;
u16 b = (col & 0x7C00) >> 10; u16 b = (col >> 10) & 0x001F;
r = (r + ((31 - r) * blendEVY / 16)); r = (r + ((31 - r) * blendEVY / 16));
g = (g + ((31 - g) * blendEVY / 16)); g = (g + ((31 - g) * blendEVY / 16));
@@ -757,9 +757,9 @@ FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectIncreaseBrightness(const u1
FragmentColor newColor; FragmentColor newColor;
newColor.color = 0; newColor.color = 0;
u16 r = (col & 0x001F); u16 r = col & 0x001F;
u16 g = (col & 0x03E0) >> 5; u16 g = (col >> 5) & 0x001F;
u16 b = (col & 0x7C00) >> 10; u16 b = (col >> 10) & 0x001F;
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{ {
@@ -834,9 +834,9 @@ FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col)
FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY) FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY)
{ {
u16 r = (col & 0x001F); u16 r = col & 0x001F;
u16 g = (col & 0x03E0) >> 5; u16 g = (col >> 5) & 0x001F;
u16 b = (col & 0x7C00) >> 10; u16 b = (col >> 10) & 0x001F;
r = (r - (r * blendEVY / 16)); r = (r - (r * blendEVY / 16));
g = (g - (g * blendEVY / 16)); g = (g - (g * blendEVY / 16));
@@ -874,9 +874,9 @@ FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const u1
FragmentColor newColor; FragmentColor newColor;
newColor.color = 0; newColor.color = 0;
u16 r = (col & 0x001F); u16 r = col & 0x001F;
u16 g = (col & 0x03E0) >> 5; u16 g = (col >> 5) & 0x001F;
u16 b = (col & 0x7C00) >> 10; u16 b = (col >> 10) & 0x001F;
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{ {
@@ -944,8 +944,8 @@ FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const Fr
FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY) FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY)
{ {
__m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); __m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) );
__m128i g_vec128 = _mm_srli_epi16( _mm_and_si128(col, _mm_set1_epi16(0x03E0)), 5 ); __m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) );
__m128i b_vec128 = _mm_srli_epi16( _mm_and_si128(col, _mm_set1_epi16(0x7C00)), 10 ); __m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) );
r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) ); r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) );
g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) ); g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) );
@@ -957,8 +957,8 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i
FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY) FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY)
{ {
__m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) ); __m128i r_vec128 = _mm_and_si128( col, _mm_set1_epi16(0x001F) );
__m128i g_vec128 = _mm_srli_epi16( _mm_and_si128(col, _mm_set1_epi16(0x03E0)), 5 ); __m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col, 5), _mm_set1_epi16(0x001F) );
__m128i b_vec128 = _mm_srli_epi16( _mm_and_si128(col, _mm_set1_epi16(0x7C00)), 10 ); __m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) );
r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) ); r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) );
g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) ); g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) );
@@ -970,11 +970,11 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i
FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB) FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB)
{ {
__m128i ra_vec128 = _mm_and_si128( colA, _mm_set1_epi16(0x001F) ); __m128i ra_vec128 = _mm_and_si128( colA, _mm_set1_epi16(0x001F) );
__m128i ga_vec128 = _mm_srli_epi16( _mm_and_si128(colA, _mm_set1_epi16(0x03E0)), 5 ); __m128i ga_vec128 = _mm_and_si128( _mm_srli_epi16(colA, 5), _mm_set1_epi16(0x001F) );
__m128i ba_vec128 = _mm_srli_epi16( _mm_and_si128(colA, _mm_set1_epi16(0x7C00)), 10 ); __m128i ba_vec128 = _mm_and_si128( _mm_srli_epi16(colA, 10), _mm_set1_epi16(0x001F) );
__m128i rb_vec128 = _mm_and_si128( colB, _mm_set1_epi16(0x001F) ); __m128i rb_vec128 = _mm_and_si128( colB, _mm_set1_epi16(0x001F) );
__m128i gb_vec128 = _mm_srli_epi16( _mm_and_si128(colB, _mm_set1_epi16(0x03E0)), 5 ); __m128i gb_vec128 = _mm_and_si128( _mm_srli_epi16(colB, 5), _mm_set1_epi16(0x001F) );
__m128i bb_vec128 = _mm_srli_epi16( _mm_and_si128(colB, _mm_set1_epi16(0x7C00)), 10 ); __m128i bb_vec128 = _mm_and_si128( _mm_srli_epi16(colB, 10), _mm_set1_epi16(0x001F) );
ra_vec128 = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ra_vec128, blendEVA), _mm_mullo_epi16(rb_vec128, blendEVB)), 4 ); ra_vec128 = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ra_vec128, blendEVA), _mm_mullo_epi16(rb_vec128, blendEVB)), 4 );
ra_vec128 = _mm_min_epi16(ra_vec128, _mm_set1_epi16(31)); ra_vec128 = _mm_min_epi16(ra_vec128, _mm_set1_epi16(31));
@@ -990,19 +990,19 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const
FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB) FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB)
{ {
__m128i rb = _mm_slli_epi16( _mm_and_si128(_mm_set1_epi16(0x001F), colB), 1); __m128i rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) );
__m128i gb = _mm_srli_epi16( _mm_and_si128(_mm_set1_epi16(0x03E0), colB), 4); __m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) );
__m128i bb = _mm_srli_epi16( _mm_and_si128(_mm_set1_epi16(0x7C00), colB), 9); __m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) );
__m128i ra_lo = _mm_and_si128(_mm_set1_epi32(0x000000FF), colA_Lo); __m128i ra_lo = _mm_and_si128( colA_Lo, _mm_set1_epi32(0x000000FF) );
__m128i ga_lo = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0x0000FF00), colA_Lo), 8 ); __m128i ga_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 8), _mm_set1_epi32(0x000000FF) );
__m128i ba_lo = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0x00FF0000), colA_Lo), 16 ); __m128i ba_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 16), _mm_set1_epi32(0x000000FF) );
__m128i aa_lo = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0xFF000000), colA_Lo), 24 ); __m128i aa_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x000000FF) );
__m128i ra_hi = _mm_and_si128(_mm_set1_epi32(0x000000FF), colA_Hi); __m128i ra_hi = _mm_and_si128( colA_Hi, _mm_set1_epi32(0x000000FF) );
__m128i ga_hi = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0x0000FF00), colA_Hi), 8 ); __m128i ga_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 8), _mm_set1_epi32(0x000000FF) );
__m128i ba_hi = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0x00FF0000), colA_Hi), 16 ); __m128i ba_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 16), _mm_set1_epi32(0x000000FF) );
__m128i aa_hi = _mm_srli_epi32( _mm_and_si128(_mm_set1_epi32(0xFF000000), colA_Hi), 24 ); __m128i aa_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 24), _mm_set1_epi32(0x000000FF) );
__m128i ra = _mm_packs_epi32(ra_lo, ra_hi); __m128i ra = _mm_packs_epi32(ra_lo, ra_hi);
__m128i ga = _mm_packs_epi32(ga_lo, ga_hi); __m128i ga = _mm_packs_epi32(ga_lo, ga_hi);