New SSSE3 implementation of RGB5A3. About a 40% improvement (fewer cycles) over the plain C version and 17% over the SSE2 version.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6779 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
bedc889a56
commit
f667c03d55
|
@ -1444,12 +1444,98 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
// This is the hard-coded 0xFF alpha constant that is ORed in place after the RGB are calculated
|
||||
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
|
||||
const __m128i aVxff00 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
|
||||
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 4)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
{
|
||||
u32 *newdst = dst+(y+iy)*width+x;
|
||||
#if _M_SSE >= 0x301
|
||||
// Produces a ~40% speed improvement over reference C implementation
|
||||
if (cpu_info.bSSE3)
|
||||
{
|
||||
const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
|
||||
const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)src),mask);
|
||||
int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
|
||||
if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
|
||||
{
|
||||
// Swizzle bits: 00012345 -> 12345123
|
||||
|
||||
//r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2);
|
||||
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
|
||||
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
|
||||
|
||||
//g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
|
||||
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
|
||||
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
|
||||
|
||||
//b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
|
||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
|
||||
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
|
||||
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
|
||||
_mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
|
||||
_mm_storeu_si128( (__m128i*)newdst, final );
|
||||
}
|
||||
else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443.
|
||||
{
|
||||
// Swizzle bits: 00001234 -> 12341234
|
||||
|
||||
//r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf);
|
||||
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
|
||||
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
|
||||
|
||||
//g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
|
||||
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
|
||||
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
|
||||
|
||||
//b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
|
||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
||||
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
|
||||
//a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
|
||||
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
||||
const __m128i aV = _mm_or_si128(
|
||||
_mm_slli_epi16(tmpaV, 5),
|
||||
_mm_or_si128(
|
||||
_mm_slli_epi16(tmpaV, 2),
|
||||
_mm_srli_epi16(tmpaV, 1)
|
||||
)
|
||||
);
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
|
||||
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
|
||||
_mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
|
||||
_mm_storeu_si128( (__m128i*)newdst, final );
|
||||
}
|
||||
else
|
||||
{
|
||||
// TODO: Vectorise (Either 4-way branch or do both and select is better than this)
|
||||
unsigned __int32 *vals = (unsigned __int32*) &valV;
|
||||
int r,g,b,a;
|
||||
for (int i=0; i < 4; ++i)
|
||||
{
|
||||
if (vals[i] & 0x8000)
|
||||
{
|
||||
// Swizzle bits: 00012345 -> 12345123
|
||||
r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2);
|
||||
g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2);
|
||||
b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2);
|
||||
a = 0xFF;
|
||||
}
|
||||
else
|
||||
{
|
||||
a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1);
|
||||
// Swizzle bits: 00001234 -> 12341234
|
||||
r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf);
|
||||
g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf);
|
||||
b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf);
|
||||
}
|
||||
newdst[i] = r | (g << 8) | (b << 16) | (a << 24);
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
const u16 *newsrc = (const u16*)src;
|
||||
|
||||
// TODO: weak point
|
||||
|
@ -1471,37 +1557,17 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
|
||||
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
|
||||
|
||||
//newdst[0] = r0 | (_______) | (________) | (________);
|
||||
__m128i final = rV;
|
||||
|
||||
//g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
|
||||
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
|
||||
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (________) | (________);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
_mm_slli_epi32(gV, 8)
|
||||
);
|
||||
|
||||
//b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
|
||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
|
||||
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (________);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
_mm_slli_epi32(bV, 16)
|
||||
);
|
||||
|
||||
// Alphas are ORed in as a constant __m128i.
|
||||
//a0 = 0xFF;
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
aVxff00
|
||||
);
|
||||
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
|
||||
_mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
|
||||
|
||||
// write the final result:
|
||||
_mm_storeu_si128( (__m128i*)newdst, final );
|
||||
|
@ -1518,29 +1584,14 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
|
||||
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
|
||||
|
||||
//newdst[0] = r0 | (_______) | (________) | (________);
|
||||
__m128i final = rV;
|
||||
|
||||
//g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
|
||||
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
|
||||
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (________) | (________);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
_mm_slli_epi32(gV, 8)
|
||||
);
|
||||
|
||||
//b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
|
||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
||||
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (________);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
_mm_slli_epi32(bV, 16)
|
||||
);
|
||||
|
||||
//a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
|
||||
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
||||
const __m128i aV = _mm_or_si128(
|
||||
|
@ -1552,10 +1603,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
);
|
||||
|
||||
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
|
||||
final = _mm_or_si128(
|
||||
final,
|
||||
_mm_slli_epi32(aV, 24)
|
||||
);
|
||||
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
|
||||
_mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
|
||||
|
||||
// write the final result:
|
||||
_mm_storeu_si128( (__m128i*)newdst, final );
|
||||
|
@ -1641,6 +1690,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
newdst[3] = r3 | (g3 << 8) | (b3 << 16) | (a3 << 24);
|
||||
}
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
// Reference C implementation:
|
||||
for (int y = 0; y < height; y += 4)
|
||||
|
|
Loading…
Reference in New Issue