- GPUEngineBase::_ColorEffectBlend() now supports RGB666 and RGB888 color formats.
- Use some SSSE3-specific optimizations in GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2().
- Do some minor cleanup.
This commit is contained in:
rogerman 2016-06-27 06:11:39 +00:00
parent edf1d305d4
commit 425643bf92
2 changed files with 212 additions and 162 deletions

View File

@ -599,51 +599,36 @@ FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB,
u16 bb = (colB >> 10) & 0x001F;
ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16;
if (ra > 31) ra = 31;
ga = ( (ga * blendEVA) + (gb * blendEVB) ) / 16;
if (ga > 31) ga = 31;
ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16;
if (ba > 31) ba = 31;
ra = (ra > 31) ? 31 : ra;
ga = (ga > 31) ? 31 : ga;
ba = (ba > 31) ? 31 : ba;
return ra | (ga << 5) | (ba << 10);
}
template <NDSColorFormat COLORFORMATB>
FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend(const u16 colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB)
template <NDSColorFormat COLORFORMAT>
FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB)
{
FragmentColor outColor;
u16 rb = colB.r;
u16 gb = colB.g;
u16 bb = colB.b;
u16 r16 = ( (colA.r * blendEVA) + (colB.r * blendEVB) ) / 16;
u16 g16 = ( (colA.g * blendEVA) + (colB.g * blendEVB) ) / 16;
u16 b16 = ( (colA.b * blendEVA) + (colB.b * blendEVB) ) / 16;
if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
{
u16 ra = material_5bit_to_6bit[ colA & 0x001F];
u16 ga = material_5bit_to_6bit[(colA >> 5) & 0x001F];
u16 ba = material_5bit_to_6bit[(colA >> 10) & 0x001F];
ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16;
ga = ( (ga * blendEVA) + (gb * blendEVB) ) / 16;
ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16;
outColor.r = (ra > 63) ? 63 : ra;
outColor.g = (ga > 63) ? 63 : ga;
outColor.b = (ba > 63) ? 63 : ba;
outColor.r = (r16 > 63) ? 63 : r16;
outColor.g = (g16 > 63) ? 63 : g16;
outColor.b = (b16 > 63) ? 63 : b16;
}
else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
{
u16 ra = material_5bit_to_8bit[ colA & 0x001F];
u16 ga = material_5bit_to_8bit[(colA >> 5) & 0x001F];
u16 ba = material_5bit_to_8bit[(colA >> 10) & 0x001F];
ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16;
ga = ( (ga * blendEVA) + (gb * blendEVB) ) / 16;
ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16;
outColor.r = (ra > 255) ? 255 : ra;
outColor.g = (ga > 255) ? 255 : ga;
outColor.b = (ba > 255) ? 255 : ba;
outColor.r = (r16 > 255) ? 255 : r16;
outColor.g = (g16 > 255) ? 255 : g16;
outColor.b = (b16 > 255) ? 255 : b16;
}
outColor.a = 0;
@ -855,36 +840,91 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i
}
}
template <NDSColorFormat COLORFORMAT>
FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB)
{
#ifdef ENABLE_SSSE3
__m128i ra = _mm_or_si128( _mm_and_si128( colA, _mm_set1_epi16(0x001F)), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) );
__m128i ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 5), _mm_set1_epi16(0x001F)), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) );
__m128i ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), _mm_set1_epi16(0x001F)), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) );
__m128i blend = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8));
ra = _mm_srli_epi16( _mm_maddubs_epi16(ra, blend), 4 );
ga = _mm_srli_epi16( _mm_maddubs_epi16(ga, blend), 4 );
ba = _mm_srli_epi16( _mm_maddubs_epi16(ba, blend), 4 );
#else
__m128i ra = _mm_and_si128( colA, _mm_set1_epi16(0x001F) );
__m128i ga = _mm_and_si128( _mm_srli_epi16(colA, 5), _mm_set1_epi16(0x001F) );
__m128i ba = _mm_and_si128( _mm_srli_epi16(colA, 10), _mm_set1_epi16(0x001F) );
__m128i rb = _mm_and_si128( colB, _mm_set1_epi16(0x001F) );
__m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 5), _mm_set1_epi16(0x001F) );
__m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 10), _mm_set1_epi16(0x001F) );
ra = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB)), 4 );
ga = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB)), 4 );
ba = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB)), 4 );
__m128i blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8));
#endif
ra = _mm_min_epi16(ra, _mm_set1_epi16(31));
ga = _mm_min_epi16(ga, _mm_set1_epi16(31));
ba = _mm_min_epi16(ba, _mm_set1_epi16(31));
return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) );
if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
{
__m128i ra;
__m128i ga;
__m128i ba;
__m128i colorBitMask = _mm_set1_epi16(0x001F);
#ifdef ENABLE_SSSE3
ra = _mm_or_si128( _mm_and_si128( colA, colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) );
ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) );
ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) );
ra = _mm_maddubs_epi16(ra, blendAB);
ga = _mm_maddubs_epi16(ga, blendAB);
ba = _mm_maddubs_epi16(ba, blendAB);
#else
ra = _mm_and_si128( colA, colorBitMask);
ga = _mm_and_si128(_mm_srli_epi16(colA, 5), colorBitMask);
ba = _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask);
__m128i rb = _mm_and_si128( colB, colorBitMask);
__m128i gb = _mm_and_si128(_mm_srli_epi16(colB, 5), colorBitMask);
__m128i bb = _mm_and_si128(_mm_srli_epi16(colB, 10), colorBitMask);
ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
#endif
ra = _mm_srli_epi16(ra, 4);
ga = _mm_srli_epi16(ga, 4);
ba = _mm_srli_epi16(ba, 4);
ra = _mm_min_epi16(ra, colorBitMask);
ga = _mm_min_epi16(ga, colorBitMask);
ba = _mm_min_epi16(ba, colorBitMask);
return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) );
}
else
{
__m128i outColorLo;
__m128i outColorHi;
__m128i outColor;
#ifdef ENABLE_SSSE3
outColorLo = _mm_or_si128( _mm_unpacklo_epi8(colA, _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_setzero_si128(), colB) );
outColorHi = _mm_or_si128( _mm_unpackhi_epi8(colA, _mm_setzero_si128()), _mm_unpackhi_epi8(_mm_setzero_si128(), colB) );
outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
#else
__m128i colALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128());
__m128i colAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128());
__m128i colBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
__m128i colBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendEVA), _mm_mullo_epi16(colBLo, blendEVB) );
outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendEVA), _mm_mullo_epi16(colBHi, blendEVB) );
#endif
outColorLo = _mm_srli_epi16(outColorLo, 4);
outColorHi = _mm_srli_epi16(outColorHi, 4);
outColor = _mm_packus_epi16(outColorLo, outColorHi);
// When the color format is 888, the packuswb instruction will naturally clamp
// the color component values to 255. However, when the color format is 666, the
// color component values must be clamped to 63. In this case, we must call pminub
// to do the clamp.
if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
{
outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
}
outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
return outColor;
}
}
template <NDSColorFormat COLORFORMATA, NDSColorFormat COLORFORMATB>
@ -909,49 +949,43 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
__m128i aa = _mm_packs_epi32(aa_lo, aa_hi);
#ifdef ENABLE_SSSE3
aa = _mm_adds_epu8(aa, _mm_set1_epi16(1)); // Note the value limit of 255 in the case of an 8-bit alpha.
ra = _mm_or_si128( ra, _mm_and_si128(_mm_slli_epi16(colB, 9), _mm_set1_epi16(0x3E00)) );
ga = _mm_or_si128( ga, _mm_and_si128(_mm_slli_epi16(colB, 4), _mm_set1_epi16(0x3E00)) );
ba = _mm_or_si128( ba, _mm_and_si128(_mm_srli_epi16(colB, 1), _mm_set1_epi16(0x3E00)) );
if (COLORFORMATA == NDSColorFormat_BGR666_Rev)
{
aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(32), aa), 8) );
ra = _mm_srli_epi16( _mm_maddubs_epi16(ra, aa), 6 );
ga = _mm_srli_epi16( _mm_maddubs_epi16(ga, aa), 6 );
ba = _mm_srli_epi16( _mm_maddubs_epi16(ba, aa), 6 );
}
else if (COLORFORMATA == NDSColorFormat_BGR888_Rev)
{
// Note that we're not subtracting the color B alpha from 256 here since we're limited to a
// value range of [0:255]. This change shouldn't really make a huge difference in the grand
// scheme of things.
aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(255), aa), 8) );
ra = _mm_srli_epi16( _mm_maddubs_epi16(ra, aa), 9 );
ga = _mm_srli_epi16( _mm_maddubs_epi16(ga, aa), 9 );
ba = _mm_srli_epi16( _mm_maddubs_epi16(ba, aa), 9 );
}
// Note that in the case of an 8-bit alpha, we're not subtracting the color B alpha from 256 here
// since we're limited to a value range of [0:255]. This change shouldn't really make a huge
// difference in the grand scheme of things.
aa = _mm_adds_epu8(aa, _mm_set1_epi16(1)); // Note the value limit of 255 in the case of an 8-bit alpha.
aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8((COLORFORMATA == NDSColorFormat_BGR888_Rev) ? 255 : 32), aa), 8) );
ra = _mm_maddubs_epi16(ra, aa);
ga = _mm_maddubs_epi16(ga, aa);
ba = _mm_maddubs_epi16(ba, aa);
#else
aa = _mm_adds_epu16(aa, _mm_set1_epi16(1));
__m128i rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) );
__m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) );
__m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) );
__m128i ab = _mm_subs_epu16( _mm_set1_epi16((COLORFORMATA == NDSColorFormat_BGR888_Rev) ? 256 : 32), aa );
ra = _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) );
ga = _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) );
ba = _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) );
#endif
if (COLORFORMATA == NDSColorFormat_BGR666_Rev)
{
__m128i ab = _mm_subs_epu16(_mm_set1_epi16(32), aa);
ra = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) ), 6 );
ga = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) ), 6 );
ba = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) ), 6 );
ra = _mm_srli_epi16(ra, 6);
ga = _mm_srli_epi16(ga, 6);
ba = _mm_srli_epi16(ba, 6);
}
else if (COLORFORMATA == NDSColorFormat_BGR888_Rev)
{
__m128i ab = _mm_subs_epu16(_mm_set1_epi16(256), aa);
ra = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) ), 9 );
ga = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) ), 9 );
ba = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) ), 9 );
ra = _mm_srli_epi16(ra, 9);
ga = _mm_srli_epi16(ga, 9);
ba = _mm_srli_epi16(ba, 9);
}
#endif
return _mm_or_si128( _mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10) );
}
@ -959,24 +993,24 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
{
// If the second color format is 666 or 888, then the colA_Hi parameter is ignored.
#ifdef ENABLE_SSSE3
__m128i rgbLo;
__m128i rgbHi;
__m128i rgbALo;
__m128i rgbAHi;
if (COLORFORMATA == COLORFORMATB)
{
rgbLo = _mm_or_si128( _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_setzero_si128(), colB) );
rgbHi = _mm_or_si128( _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128()), _mm_unpackhi_epi8(_mm_setzero_si128(), colB) );
rgbALo = _mm_or_si128( _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_setzero_si128(), colB) );
rgbAHi = _mm_or_si128( _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128()), _mm_unpackhi_epi8(_mm_setzero_si128(), colB) );
}
else if ( (COLORFORMATA == NDSColorFormat_BGR666_Rev) && (COLORFORMATB == NDSColorFormat_BGR888_Rev) )
{
__m128i rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128());
__m128i rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128());
rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128());
rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128());
rgbALo = _mm_or_si128( _mm_slli_epi16(rgbALo, 2), _mm_srli_epi16(rgbALo, 4) );
rgbAHi = _mm_or_si128( _mm_slli_epi16(rgbAHi, 2), _mm_srli_epi16(rgbAHi, 4) );
rgbLo = _mm_or_si128( rgbALo, _mm_unpacklo_epi8(_mm_setzero_si128(), colB) );
rgbHi = _mm_or_si128( rgbAHi, _mm_unpackhi_epi8(_mm_setzero_si128(), colB) );
rgbALo = _mm_or_si128( rgbALo, _mm_unpacklo_epi8(_mm_setzero_si128(), colB) );
rgbAHi = _mm_or_si128( rgbAHi, _mm_unpackhi_epi8(_mm_setzero_si128(), colB) );
}
else if ( (COLORFORMATA == NDSColorFormat_BGR888_Rev) && (COLORFORMATB == NDSColorFormat_BGR666_Rev) )
{
@ -986,8 +1020,8 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
rgbBLo = _mm_or_si128( _mm_slli_epi16(rgbBLo, 2), _mm_srli_epi16(rgbBLo, 4) );
rgbBHi = _mm_or_si128( _mm_slli_epi16(rgbBHi, 2), _mm_srli_epi16(rgbBHi, 4) );
rgbLo = _mm_or_si128( _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128()), _mm_slli_epi16(rgbBLo, 8) );
rgbHi = _mm_or_si128( _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128()), _mm_slli_epi16(rgbBHi, 8) );
rgbALo = _mm_or_si128( _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128()), _mm_slli_epi16(rgbBLo, 8) );
rgbAHi = _mm_or_si128( _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128()), _mm_slli_epi16(rgbBHi, 8) );
}
__m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x000000FF) );
@ -1001,9 +1035,6 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
{
alphaLo = _mm_or_si128(alphaLo, _mm_slli_epi16(_mm_sub_epi16(alphaLo, _mm_set1_epi16(32)), 8));
alphaHi = _mm_or_si128(alphaHi, _mm_slli_epi16(_mm_sub_epi16(alphaHi, _mm_set1_epi16(32)), 8));
rgbLo = _mm_srli_epi16( _mm_maddubs_epi16(rgbLo, alphaLo), 5 );
rgbHi = _mm_srli_epi16( _mm_maddubs_epi16(rgbHi, alphaHi), 5 );
}
else if (COLORFORMATA == NDSColorFormat_BGR888_Rev)
{
@ -1012,12 +1043,10 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
// scheme of things.
alphaLo = _mm_or_si128(alphaLo, _mm_slli_epi16(_mm_sub_epi16(alphaLo, _mm_set1_epi16(255)), 8));
alphaHi = _mm_or_si128(alphaHi, _mm_slli_epi16(_mm_sub_epi16(alphaHi, _mm_set1_epi16(255)), 8));
rgbLo = _mm_srli_epi16( _mm_maddubs_epi16(rgbLo, alphaLo), 8 );
rgbHi = _mm_srli_epi16( _mm_maddubs_epi16(rgbHi, alphaHi), 8 );
}
return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) );
rgbALo = _mm_maddubs_epi16(rgbALo, alphaLo);
rgbAHi = _mm_maddubs_epi16(rgbAHi, alphaHi);
#else
__m128i rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128());
__m128i rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128());
@ -1045,17 +1074,28 @@ FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, c
if (COLORFORMATA == NDSColorFormat_BGR666_Rev)
{
rgbALo = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) ), 5 );
rgbAHi = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) ), 5 );
rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) );
rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) );
}
else if (COLORFORMATA == NDSColorFormat_BGR888_Rev)
{
rgbALo = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) ), 8 );
rgbAHi = _mm_srli_epi16( _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) ), 8 );
rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) );
rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) );
}
#endif
if (COLORFORMATA == NDSColorFormat_BGR666_Rev)
{
rgbALo = _mm_srli_epi16(rgbALo, 5);
rgbAHi = _mm_srli_epi16(rgbAHi, 5);
}
else if (COLORFORMATA == NDSColorFormat_BGR888_Rev)
{
rgbALo = _mm_srli_epi16(rgbALo, 8);
rgbAHi = _mm_srli_epi16(rgbAHi, 8);
}
return _mm_and_si128( _mm_packus_epi16(rgbALo, rgbAHi), _mm_set1_epi32(0x00FFFFFF) );
#endif
}
}
@ -2192,12 +2232,14 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(const size_t srcX, const u16 src, c
break;
case NDSColorFormat_BGR666_Rev:
finalDstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(src, *(FragmentColor *)dstColorLine, blendEVA, blendEVB);
finalDstColor32.color = ConvertColor555To6665Opaque<false>(src);
finalDstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(finalDstColor32, *(FragmentColor *)dstColorLine, blendEVA, blendEVB);
finalDstColor32.a = 0x1F;
break;
case NDSColorFormat_BGR888_Rev:
finalDstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(src, *(FragmentColor *)dstColorLine, blendEVA, blendEVB);
finalDstColor32.color = ConvertColor555To8888Opaque<false>(src);
finalDstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(finalDstColor32, *(FragmentColor *)dstColorLine, blendEVA, blendEVB);
finalDstColor32.a = 0xFF;
break;
}
@ -2382,8 +2424,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel16_SSE2(const size_t dstX,
// Render the pixel using the selected color effect.
const __m128i blendMask = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) );
const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend(srcColorLo_vec128, dstColorLo_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend(srcColorHi_vec128, dstColorHi_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i blendPixelsLo = _mm_and_si128( this->_ColorEffectBlend<NDSColorFormat_BGR555_Rev>(srcColorLo_vec128, dstColorLo_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpacklo_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i blendPixelsHi = _mm_and_si128( this->_ColorEffectBlend<NDSColorFormat_BGR555_Rev>(srcColorHi_vec128, dstColorHi_vec128, eva_vec128, evb_vec128), _mm_cmpeq_epi16(_mm_unpackhi_epi8(blendMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi32(0xFFFFFFFF) );
const __m128i disablePixelsLo = _mm_and_si128( srcColorLo_vec128, _mm_cmpeq_epi16(_mm_unpacklo_epi8(disableMask, _mm_setzero_si128()), _mm_set1_epi16(0x00FF)) );
@ -2540,7 +2582,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel8_SSE2(const size_t dstX,
// Render the pixel using the selected color effect.
const __m128i blendMask = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi16(colorEffect_vec128, _mm_set1_epi16(ColorEffect_Blend))) );
const __m128i blendPixels = _mm_and_si128( blendMask, this->_ColorEffectBlend(srcColor_vec128, dstColor_vec128, eva_vec128, evb_vec128) );
const __m128i blendPixels = _mm_and_si128( blendMask, this->_ColorEffectBlend<NDSColorFormat_BGR555_Rev>(srcColor_vec128, dstColor_vec128, eva_vec128, evb_vec128) );
const __m128i disableMask = _mm_xor_si128( _mm_or_si128(brightnessMask, blendMask), _mm_set1_epi32(0xFFFFFFFF) );
const __m128i disablePixels = _mm_and_si128(disableMask, srcColor_vec128);
@ -6488,6 +6530,10 @@ FragmentColor GPUEngineA::_RenderLine_DispCapture_BlendFunc(const FragmentColor
template<NDSColorFormat COLORFORMAT>
__m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB)
{
#ifdef ENABLE_SSSE3
__m128i blendAB = _mm_or_si128( blendEVA, _mm_slli_epi16(blendEVB, 8) );
#endif
switch (COLORFORMAT)
{
case NDSColorFormat_BGR555_Rev:
@ -6496,46 +6542,43 @@ __m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA,
__m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi16(0x8000));
__m128i srcA_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcA_alpha, _mm_setzero_si128()), srcA );
__m128i srcB_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcB_alpha, _mm_setzero_si128()), srcB );
__m128i colorBitMask = _mm_set1_epi16(0x001F);
__m128i r;
__m128i g;
__m128i b;
__m128i srcB_r = _mm_and_si128(srcB_masked, colorBitMask);
srcB_r = _mm_mullo_epi16(srcB_r, blendEVB);
__m128i ra;
__m128i ga;
__m128i ba;
__m128i srcB_g = _mm_srli_epi16(srcB_masked, 5);
srcB_g = _mm_and_si128(srcB_g, colorBitMask);
srcB_g = _mm_mullo_epi16(srcB_g, blendEVB);
#ifdef ENABLE_SSSE3
ra = _mm_or_si128( _mm_and_si128( srcA_masked, colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 8), _mm_set1_epi16(0x1F00)) );
ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 3), _mm_set1_epi16(0x1F00)) );
ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(srcB_masked, 2), _mm_set1_epi16(0x1F00)) );
__m128i srcB_b = _mm_srli_epi16(srcB_masked, 10);
srcB_b = _mm_and_si128(srcB_b, colorBitMask);
srcB_b = _mm_mullo_epi16(srcB_b, blendEVB);
ra = _mm_maddubs_epi16(ra, blendAB);
ga = _mm_maddubs_epi16(ga, blendAB);
ba = _mm_maddubs_epi16(ba, blendAB);
#else
ra = _mm_and_si128( srcA_masked, colorBitMask);
ga = _mm_and_si128(_mm_srli_epi16(srcA_masked, 5), colorBitMask);
ba = _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask);
r = _mm_and_si128(srcA_masked, colorBitMask);
r = _mm_mullo_epi16(r, blendEVA);
r = _mm_add_epi16(r, srcB_r);
r = _mm_srli_epi16(r, 4);
r = _mm_min_epi16(r, colorBitMask);
__m128i rb = _mm_and_si128( srcB_masked, colorBitMask);
__m128i gb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 5), colorBitMask);
__m128i bb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 10), colorBitMask);
g = _mm_srli_epi16(srcA_masked, 5);
g = _mm_and_si128(g, colorBitMask);
g = _mm_mullo_epi16(g, blendEVA);
g = _mm_add_epi16(g, srcB_g);
g = _mm_srli_epi16(g, 4);
g = _mm_min_epi16(g, colorBitMask);
g = _mm_slli_epi16(g, 5);
ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
#endif
b = _mm_srli_epi16(srcA_masked, 10);
b = _mm_and_si128(b, colorBitMask);
b = _mm_mullo_epi16(b, blendEVA);
b = _mm_add_epi16(b, srcB_b);
b = _mm_srli_epi16(b, 4);
b = _mm_min_epi16(b, colorBitMask);
b = _mm_slli_epi16(b, 10);
ra = _mm_srli_epi16(ra, 4);
ga = _mm_srli_epi16(ga, 4);
ba = _mm_srli_epi16(ba, 4);
return _mm_or_si128( _mm_or_si128(_mm_or_si128(r, g), b), _mm_or_si128(srcA_alpha, srcB_alpha) );
ra = _mm_min_epi16(ra, colorBitMask);
ga = _mm_min_epi16(ga, colorBitMask);
ba = _mm_min_epi16(ba, colorBitMask);
return _mm_or_si128( _mm_or_si128(_mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10)), _mm_or_si128(srcA_alpha, srcB_alpha) );
}
case NDSColorFormat_BGR666_Rev:
@ -6548,26 +6591,33 @@ __m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA,
__m128i srcA_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcA_alpha, _mm_setzero_si128()), srcA);
__m128i srcB_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcB_alpha, _mm_setzero_si128()), srcB);
// Temporarily convert the color component values from 8-bit to 16-bit.
__m128i outColorLo;
__m128i outColorHi;
__m128i outColor;
// Temporarily convert the color component values from 8-bit to 16-bit, and then
// do the blend calculation.
#ifdef ENABLE_SSSE3
outColorLo = _mm_or_si128( _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_setzero_si128(), srcB_masked) );
outColorHi = _mm_or_si128( _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128()), _mm_unpackhi_epi8(_mm_setzero_si128(), srcB_masked) );
outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
#else
__m128i srcA_maskedLo = _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128());
__m128i srcA_maskedHi = _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128());
__m128i srcB_maskedLo = _mm_unpacklo_epi8(srcB_masked, _mm_setzero_si128());
__m128i srcB_maskedHi = _mm_unpackhi_epi8(srcB_masked, _mm_setzero_si128());
// Do the blend calculation. We'll use the srcA variables as our working variables.
srcA_maskedLo = _mm_mullo_epi16(srcA_maskedLo, blendEVA);
srcA_maskedHi = _mm_mullo_epi16(srcA_maskedHi, blendEVA);
srcB_maskedLo = _mm_mullo_epi16(srcB_maskedLo, blendEVB);
srcB_maskedHi = _mm_mullo_epi16(srcB_maskedHi, blendEVB);
outColorLo = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedLo, blendEVA), _mm_mullo_epi16(srcB_maskedLo, blendEVB) );
outColorHi = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedHi, blendEVA), _mm_mullo_epi16(srcB_maskedHi, blendEVB) );
#endif
srcA_maskedLo = _mm_add_epi16(srcA_maskedLo, srcB_maskedLo);
srcA_maskedLo = _mm_srli_epi16(srcA_maskedLo, 4);
srcA_maskedHi = _mm_add_epi16(srcA_maskedHi, srcB_maskedHi);
srcA_maskedHi = _mm_srli_epi16(srcA_maskedHi, 4);
outColorLo = _mm_srli_epi16(outColorLo, 4);
outColorHi = _mm_srli_epi16(outColorHi, 4);
// Convert the color components back from 16-bit to 8-bit using a saturated pack.
__m128i outColor_vec128 = _mm_packus_epi16(srcA_maskedLo, srcA_maskedHi);
outColor = _mm_packus_epi16(outColorLo, outColorHi);
// When the color format is 8888, the packuswb instruction will naturally clamp
// the color component values to 255. However, when the color format is 6665, the
@ -6575,15 +6625,15 @@ __m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA,
// to do the clamp.
if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
{
outColor_vec128 = _mm_min_epu8(outColor_vec128, _mm_set1_epi8(63));
outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
}
// Add the alpha components back in.
outColor_vec128 = _mm_and_si128(outColor_vec128, _mm_set1_epi32(0x00FFFFFF));
outColor_vec128 = _mm_or_si128(outColor_vec128, srcA_alpha);
outColor_vec128 = _mm_or_si128(outColor_vec128, srcB_alpha);
outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
outColor = _mm_or_si128(outColor, srcA_alpha);
outColor = _mm_or_si128(outColor, srcB_alpha);
return outColor_vec128;
return outColor;
}
}
}

View File

@ -1310,7 +1310,7 @@ protected:
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB);
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable);
template<NDSColorFormat COLORFORMATB> FORCEINLINE FragmentColor _ColorEffectBlend(const u16 colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB);
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor _ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB);
template<NDSColorFormat COLORFORMATA> FORCEINLINE u16 _ColorEffectBlend3D(const FragmentColor colA, const u16 colB);
template<NDSColorFormat COLORFORMATA, NDSColorFormat COLORFORMATB> FORCEINLINE FragmentColor _ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB);
@ -1324,7 +1324,7 @@ protected:
FORCEINLINE FragmentColor _ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY);
#ifdef ENABLE_SSE2
FORCEINLINE __m128i _ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB);
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB);
template<NDSColorFormat COLORFORMATA, NDSColorFormat COLORFORMATB> FORCEINLINE __m128i _ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB);
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY);
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY);