Texture Handler:

- Include SSSE3 versions for unpacking the following texture types: I2, I4, and A5I3.
- As a side-effect of working on these optimizations, the SSE2 versions of ConvertColor555To6665Opaque() and ConvertColor555To8888Opaque() are now a little faster.
This commit is contained in:
rogerman 2016-07-01 10:15:57 +00:00
parent 0d9d59455f
commit a05ddab710
4 changed files with 367 additions and 132 deletions

View File

@ -40,14 +40,6 @@
#include "matrix.h"
#include "emufile.h"
// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit
// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it
// should work fine for both SSE4.1 and SSE2.
#if !defined(_SMMINTRIN_H) && defined(__EMMINTRIN_H)
#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
#endif
#ifdef FASTBUILD
#undef FORCEINLINE
#define FORCEINLINE

View File

@ -38,6 +38,14 @@
#include <smmintrin.h>
#endif
// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit
// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it
// should work fine for both SSE4.1 and SSE2.
#if !defined(_SMMINTRIN_H) && defined(__EMMINTRIN_H)
#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
#endif
class GPUEngineBase;
class EMUFILE;
struct MMU_struct;
@ -1777,71 +1785,75 @@ FORCEINLINE u16 ConvertColor6665To5551(u32 srcColor)
template <bool SWAP_RB>
FORCEINLINE void ConvertColor555To8888Opaque(const __m128i &src, __m128i &dstLo, __m128i &dstHi)
{
__m128i src32;
// Conversion algorithm:
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07)
if (SWAP_RB)
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000));
src32 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src32, 14), _mm_set1_epi32(0x00070000)));
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x00000700))) );
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src32, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src32, 12), _mm_set1_epi32(0x00000007))) );
dstLo = _mm_or_si128( dstLo, _mm_set1_epi32(0xFF000000) );
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000));
src32 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src32, 14), _mm_set1_epi32(0x00070000)));
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x00000700))) );
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src32, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src32, 12), _mm_set1_epi32(0x00000007))) );
dstHi = _mm_or_si128( dstHi, _mm_set1_epi32(0xFF000000) );
}
else
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000));
src32 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src32, 2), _mm_set1_epi32(0x00000007)));
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x00000700))) );
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00070000))) );
dstLo = _mm_or_si128( dstLo, _mm_set1_epi32(0xFF000000) );
dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000));
src32 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src32, 2), _mm_set1_epi32(0x00000007)));
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x00000700))) );
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00070000))) );
dstHi = _mm_or_si128( dstHi, _mm_set1_epi32(0xFF000000) );
}
__m128i tmpDstLo = dstLo;
dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
}
template <bool SWAP_RB>
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i &src, __m128i &dstLo, __m128i &dstHi)
{
__m128i src32;
// Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
if (SWAP_RB)
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00000001))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000));
src32 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src32, 12), _mm_set1_epi32(0x00010000)));
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src32, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src32, 9), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src32, 14), _mm_set1_epi32(0x00000001))) );
dstLo = _mm_or_si128( dstLo, _mm_set1_epi32(0x1F000000) );
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00010000)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 25), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 30), _mm_set1_epi32(0x00000001))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000));
src32 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src32, 12), _mm_set1_epi32(0x00010000)));
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src32, 1), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src32, 9), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src32, 14), _mm_set1_epi32(0x00000001))) );
dstHi = _mm_or_si128( dstHi, _mm_set1_epi32(0x1F000000) );
}
else
{
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00000001)));
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00010000))) );
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0x1F000000));
src32 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src32, 4), _mm_set1_epi32(0x00000001)));
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src32, 1), _mm_set1_epi32(0x00000100))) );
dstLo = _mm_or_si128( dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src32, 2), _mm_set1_epi32(0x00010000))) );
dstLo = _mm_or_si128( dstLo, _mm_set1_epi32(0x1F000000) );
dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src, 20), _mm_set1_epi32(0x00000001)));
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src, 17), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 9), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_srli_epi32(src, 14), _mm_set1_epi32(0x00010000))) );
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0x1F000000));
src32 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(src32, 4), _mm_set1_epi32(0x00000001)));
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(src32, 1), _mm_set1_epi32(0x00000100))) );
dstHi = _mm_or_si128( dstHi, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src32, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src32, 2), _mm_set1_epi32(0x00010000))) );
dstHi = _mm_or_si128( dstHi, _mm_set1_epi32(0x1F000000) );
}
__m128i tmpDstLo = dstLo;
dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
}
template <bool SWAP_RB>

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2015 DeSmuME team
Copyright (C) 2008-2016 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -243,7 +243,7 @@ public:
static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8};
//used to hold a copy of the palette specified for this texture
u16 pal[256];
CACHE_ALIGN u16 pal[256];
u32 textureMode = (unsigned short)((format>>26)&0x07);
u32 sizeX=(8 << ((format>>20)&0x07));
@ -374,7 +374,7 @@ public:
newitem->invSizeY=1.0f/((float)(sizeY));
newitem->decode_len = sizeX*sizeY*4;
newitem->mode = textureMode;
newitem->decoded = new u8[newitem->decode_len];
newitem->decoded = (u8 *)malloc_alignedCacheLine(newitem->decode_len);
list_push_front(newitem);
//printf("allocating: up to %d with %d items\n",cache_size,index.size());
@ -409,18 +409,14 @@ public:
{
case TEXMODE_A3I5:
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++)
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
u16 c = pal[*adr&31] & 0x7FFF;
u8 alpha = *adr>>5;
if(TEXFORMAT == TexFormat_15bpp)
*dwdst++ = COLOR555TO6665(c,material_3bit_to_5bit[alpha]);
else
*dwdst++ = COLOR555TO8888(c,material_3bit_to_8bit[alpha]);
adr++;
const u16 c = pal[*adr & 31] & 0x7FFF;
const u8 alpha = *adr >> 5;
*dwdst++ = (TEXFORMAT == TexFormat_15bpp) ? COLOR555TO6665(c, material_3bit_to_5bit[alpha]) : COLOR555TO8888(c, material_3bit_to_8bit[alpha]);
}
}
break;
@ -428,59 +424,122 @@ public:
case TEXMODE_I2:
{
#ifdef ENABLE_SSSE3
const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)pal);
#endif
if (isPalZeroTransparent)
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++)
#ifdef ENABLE_SSSE3
for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dwdst+=16)
{
u8 bits;
__m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) );
idx = _mm_slli_epi16(idx, 1);
bits = (*adr)&0x3;
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
__m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) );
__m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) );
bits = ((*adr)>>2)&0x3;
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0);
const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1);
bits = ((*adr)>>4)&0x3;
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
__m128i convertedColor[4];
bits = ((*adr)>>6)&0x3;
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
if (TEXFORMAT == TexFormat_15bpp)
{
ConvertColor555To6665Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To6665Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
else
{
ConvertColor555To8888Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To8888Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
adr++;
// Set converted colors to 0 if the palette index is 0.
idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100));
idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100));
convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]);
convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]);
convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]);
convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]);
_mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]);
_mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]);
_mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]);
_mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]);
}
#else
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
u8 idx;
idx = *adr & 0x03;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
idx = (*adr >> 2) & 0x03;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
idx = (*adr >> 4) & 0x03;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
idx = (*adr >> 6) & 0x03;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
}
#endif
}
}
else
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++)
#ifdef ENABLE_SSSE3
for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dwdst+=16)
{
u8 bits;
u16 c;
__m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) );
idx = _mm_slli_epi16(idx, 1);
bits = (*adr)&0x3;
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) );
const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) );
bits = ((*adr)>>2)&0x3;
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0);
const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1);
bits = ((*adr)>>4)&0x3;
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
__m128i convertedColor[4];
bits = ((*adr)>>6)&0x3;
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
if (TEXFORMAT == TexFormat_15bpp)
{
ConvertColor555To6665Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To6665Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
else
{
ConvertColor555To8888Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To8888Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
adr++;
_mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]);
_mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]);
_mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]);
_mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]);
}
#else
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
*dwdst++ = CONVERT(pal[ *adr & 0x03] & 0x7FFF);
*dwdst++ = CONVERT(pal[(*adr >> 2) & 0x03] & 0x7FFF);
*dwdst++ = CONVERT(pal[(*adr >> 4) & 0x03] & 0x7FFF);
*dwdst++ = CONVERT(pal[(*adr >> 6) & 0x03] & 0x7FFF);
}
#endif
}
}
break;
@ -488,43 +547,126 @@ public:
case TEXMODE_I4:
{
#ifdef ENABLE_SSSE3
const __m128i palLo = _mm_load_si128((__m128i *)pal + 0);
const __m128i palHi = _mm_load_si128((__m128i *)pal + 1);
#endif
if (isPalZeroTransparent)
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++)
#ifdef ENABLE_SSSE3
for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dwdst+=16)
{
u8 bits;
__m128i idx = _mm_loadl_epi64((__m128i *)adr);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) );
idx = _mm_slli_epi16(idx, 1);
bits = (*adr)&0xF;
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
__m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) );
__m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) );
bits = ((*adr)>>4);
*dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
adr++;
const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() );
const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0);
const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0);
const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1);
const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1);
const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) );
const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) );
__m128i convertedColor[4];
if (TEXFORMAT == TexFormat_15bpp)
{
ConvertColor555To6665Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To6665Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
else
{
ConvertColor555To8888Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To8888Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
// Set converted colors to 0 if the palette index is 0.
idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100));
idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100));
convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]);
convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]);
convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]);
convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]);
_mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]);
_mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]);
_mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]);
_mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]);
}
#else
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
u8 idx;
idx = *adr & 0xF;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
idx = *adr >> 4;
*dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF);
}
#endif
}
}
else
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++)
#ifdef ENABLE_SSSE3
for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dwdst+=16)
{
u8 bits;
u16 c;
__m128i idx = _mm_loadl_epi64((__m128i *)adr);
idx = _mm_unpacklo_epi8(idx, idx);
idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) );
idx = _mm_slli_epi16(idx, 1);
bits = (*adr)&0xF;
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) );
const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) );
bits = ((*adr)>>4);
c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c);
adr++;
const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() );
const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0);
const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0);
const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1);
const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1);
const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) );
const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) );
__m128i convertedColor[4];
if (TEXFORMAT == TexFormat_15bpp)
{
ConvertColor555To6665Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To6665Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
else
{
ConvertColor555To8888Opaque<false>(palColor0, convertedColor[0], convertedColor[1]);
ConvertColor555To8888Opaque<false>(palColor1, convertedColor[2], convertedColor[3]);
}
_mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]);
_mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]);
_mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]);
_mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]);
}
#else
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
*dwdst++ = CONVERT(pal[*adr & 0x0F] & 0x7FFF);
*dwdst++ = CONVERT(pal[*adr >> 4] & 0x7FFF);
}
#endif
}
}
break;
@ -534,26 +676,23 @@ public:
{
if (isPalZeroTransparent)
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; ++x)
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
*dwdst++ = (*adr == 0) ? 0 : CONVERT(pal[*adr] & 0x7FFF);
adr++;
}
}
}
else
{
for(int j=0;j<ms.numItems;j++)
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; ++x)
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
const u16 c = pal[*adr] & 0x7FFF;
*dwdst++ = CONVERT(c);
adr++;
*dwdst++ = CONVERT(pal[*adr] & 0x7FFF);
}
}
}
@ -588,11 +727,11 @@ public:
//i am guessing we just generate black in that case
bool dead = false;
for (int y = 0; y < yTmpSize; y ++)
for (size_t y = 0; y < yTmpSize; y++)
{
u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX,
((y<<2)+2)*sizeX,((y<<2)+3)*sizeX};
for (int x = 0; x < xTmpSize; x ++, d++)
for (size_t x = 0; x < xTmpSize; x++, d++)
{
if (d >= limit)
dead = true;
@ -713,34 +852,125 @@ public:
case TEXMODE_A5I3:
{
for (int j = 0; j < ms.numItems; j++)
#ifdef ENABLE_SSSE3
const __m128i pal_vec128 = _mm_load_si128((__m128i *)pal);
#endif
for (size_t j = 0; j < ms.numItems; j++)
{
adr = ms.items[j].ptr;
for (u32 x = 0; x < ms.items[j].len; ++x)
#ifdef ENABLE_SSSE3
for (size_t x = 0; x < ms.items[j].len; x+=16, adr+=16, dwdst+=16)
{
const __m128i bits = _mm_loadu_si128((__m128i *)adr);
const __m128i idx = _mm_slli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0x07)), 1 );
const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) );
const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) );
const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0);
const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1);
__m128i tmpColor;
__m128i tmpAlpha;
__m128i convertedColor[4];
if (TEXFORMAT == TexFormat_15bpp)
{
__m128i alpha = _mm_srli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), 3 );
__m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha);
__m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha);
tmpColor = _mm_unpacklo_epi16(palColor0, _mm_setzero_si128());
tmpAlpha = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo);
convertedColor[0] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(tmpColor, 4), _mm_set1_epi32(0x00000001)));
convertedColor[0] = _mm_or_si128( convertedColor[0], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000100))) );
convertedColor[0] = _mm_or_si128( convertedColor[0], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 2), _mm_set1_epi32(0x00010000))) );
convertedColor[0] = _mm_or_si128( convertedColor[0], tmpAlpha);
tmpColor = _mm_unpackhi_epi16(palColor0, _mm_setzero_si128());
tmpAlpha = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo);
convertedColor[1] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(tmpColor, 4), _mm_set1_epi32(0x00000001)));
convertedColor[1] = _mm_or_si128( convertedColor[1], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000100))) );
convertedColor[1] = _mm_or_si128( convertedColor[1], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 2), _mm_set1_epi32(0x00010000))) );
convertedColor[1] = _mm_or_si128( convertedColor[1], tmpAlpha);
tmpColor = _mm_unpacklo_epi16(palColor1, _mm_setzero_si128());
tmpAlpha = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi);
convertedColor[2] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(tmpColor, 4), _mm_set1_epi32(0x00000001)));
convertedColor[2] = _mm_or_si128( convertedColor[2], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000100))) );
convertedColor[2] = _mm_or_si128( convertedColor[2], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 2), _mm_set1_epi32(0x00010000))) );
convertedColor[2] = _mm_or_si128( convertedColor[2], tmpAlpha);
tmpColor = _mm_unpackhi_epi16(palColor1, _mm_setzero_si128());
tmpAlpha = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi);
convertedColor[3] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x0000003E)), _mm_and_si128(_mm_srli_epi32(tmpColor, 4), _mm_set1_epi32(0x00000001)));
convertedColor[3] = _mm_or_si128( convertedColor[3], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00003E00)), _mm_and_si128(_mm_srli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000100))) );
convertedColor[3] = _mm_or_si128( convertedColor[3], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 7), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 2), _mm_set1_epi32(0x00010000))) );
convertedColor[3] = _mm_or_si128( convertedColor[3], tmpAlpha);
}
else
{
__m128i alpha = _mm_or_si128( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), _mm_srli_epi16(_mm_and_si128(bits, _mm_set1_epi8(0xE0)), 5) );
__m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha);
__m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha);
tmpColor = _mm_unpacklo_epi16(palColor0, _mm_setzero_si128());
tmpAlpha = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo);
convertedColor[0] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(tmpColor, 2), _mm_set1_epi32(0x00000007)));
convertedColor[0] = _mm_or_si128( convertedColor[0], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000700))) );
convertedColor[0] = _mm_or_si128( convertedColor[0], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00070000))) );
convertedColor[0] = _mm_or_si128( convertedColor[0], tmpAlpha);
tmpColor = _mm_unpackhi_epi16(palColor0, _mm_setzero_si128());
tmpAlpha = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo);
convertedColor[1] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(tmpColor, 2), _mm_set1_epi32(0x00000007)));
convertedColor[1] = _mm_or_si128( convertedColor[1], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000700))) );
convertedColor[1] = _mm_or_si128( convertedColor[1], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00070000))) );
convertedColor[1] = _mm_or_si128( convertedColor[1], tmpAlpha);
tmpColor = _mm_unpacklo_epi16(palColor1, _mm_setzero_si128());
tmpAlpha = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi);
convertedColor[2] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(tmpColor, 2), _mm_set1_epi32(0x00000007)));
convertedColor[2] = _mm_or_si128( convertedColor[2], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000700))) );
convertedColor[2] = _mm_or_si128( convertedColor[2], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00070000))) );
convertedColor[2] = _mm_or_si128( convertedColor[2], tmpAlpha);
tmpColor = _mm_unpackhi_epi16(palColor1, _mm_setzero_si128());
tmpAlpha = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi);
convertedColor[3] = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(tmpColor, 2), _mm_set1_epi32(0x00000007)));
convertedColor[3] = _mm_or_si128( convertedColor[3], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(tmpColor, 1), _mm_set1_epi32(0x00000700))) );
convertedColor[3] = _mm_or_si128( convertedColor[3], _mm_or_si128(_mm_and_si128(_mm_slli_epi32(tmpColor, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(tmpColor, 4), _mm_set1_epi32(0x00070000))) );
convertedColor[3] = _mm_or_si128( convertedColor[3], tmpAlpha);
}
_mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]);
_mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]);
_mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]);
_mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]);
}
#else
for (size_t x = 0; x < ms.items[j].len; x++, adr++)
{
const u16 c = pal[*adr&0x07] & 0x7FFF;
const u8 alpha = (*adr>>3);
if (TEXFORMAT == TexFormat_15bpp)
*dwdst++ = COLOR555TO6665(c,alpha);
else
*dwdst++ = COLOR555TO8888(c,material_5bit_to_8bit[alpha]);
adr++;
*dwdst++ = (TEXFORMAT == TexFormat_15bpp) ? COLOR555TO6665(c, alpha) : COLOR555TO8888(c, material_5bit_to_8bit[alpha]);
}
#endif
}
break;
}
case TEXMODE_16BPP:
{
for (int j = 0; j < ms.numItems; j++)
for (size_t j = 0; j < ms.numItems; j++)
{
const u16 *map = (u16*)ms.items[j].ptr;
const int len = ms.items[j].len>>1;
const size_t len = ms.items[j].len >> 1;
for (int x = 0; x < len; ++x)
for (size_t x = 0; x < len; x++)
{
const u16 c = map[x];
*dwdst++ = (c & 0x8000) ? CONVERT(c&0x7FFF) : 0;
*dwdst++ = (c & 0x8000) ? CONVERT(c & 0x7FFF) : 0;
}
}
break;

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2015 DeSmuME team
Copyright (C) 2008-2016 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -23,6 +23,7 @@
#include <map>
#include "types.h"
#include "common.h"
enum TexCache_TexFormat
{
@ -57,8 +58,8 @@ public:
~TexCacheItem()
{
delete[] decoded;
if(_deleteCallback != NULL) _deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2);
free_aligned(this->decoded);
if (this->_deleteCallback != NULL) this->_deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2);
}
u32 decode_len;
u32 mode;