Windows Port:
- Fix compilation on Windows, which was broken by the new color conversion code. (Regression from r5455.) GPU: - The SSE2 version of ConvertColor555To8888Opaque() now uses lookup-table memory reads instead of computing the conversion with shifts and masks.
This commit is contained in:
parent
29ff68cda9
commit
9e07cc95b4
|
@ -52,6 +52,7 @@ u32 Render3DFramesPerSecond;
|
|||
CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
|
||||
CACHE_ALIGN u32 color_555_to_666[32768];
|
||||
CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
|
||||
CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
|
||||
CACHE_ALIGN u32 color_555_to_888[32768];
|
||||
|
||||
//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX
|
||||
|
@ -6324,6 +6325,7 @@ GPUSubsystem::GPUSubsystem()
|
|||
{
|
||||
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
|
||||
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
|
||||
#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) )
|
||||
|
||||
for (size_t i = 0; i < 32768; i++)
|
||||
{
|
||||
|
@ -6331,6 +6333,7 @@ GPUSubsystem::GPUSubsystem()
|
|||
color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 );
|
||||
color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) );
|
||||
color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 );
|
||||
color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 );
|
||||
}
|
||||
|
||||
needInitTables = false;
|
||||
|
@ -7088,11 +7091,11 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *dst, size
|
|||
for (; i < ssePixCount; i += 8)
|
||||
{
|
||||
__m128i src_vec128 = _mm_load_si128((__m128i *)(src + i));
|
||||
__m128i dstConverted0, dstConverted1;
|
||||
ConvertColor555To8888Opaque<SWAP_RB>(src_vec128, dstConverted0, dstConverted1);
|
||||
__m128i dstConvertedLo, dstConvertedHi;
|
||||
ConvertColor555To8888Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi);
|
||||
|
||||
_mm_store_si128((__m128i *)(dst + i + 0), dstConverted0);
|
||||
_mm_store_si128((__m128i *)(dst + i + 4), dstConverted1);
|
||||
_mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo);
|
||||
_mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1640,6 +1640,7 @@ extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
|
|||
extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_666[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
|
||||
extern CACHE_ALIGN u32 color_555_to_888[32768];
|
||||
|
||||
#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
|
||||
|
@ -1652,6 +1653,7 @@ extern CACHE_ALIGN u32 color_555_to_888[32768];
|
|||
#endif
|
||||
|
||||
#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color
|
||||
#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped
|
||||
#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color
|
||||
|
||||
#ifdef LOCAL_LE
|
||||
|
@ -1676,13 +1678,7 @@ inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const
|
|||
template <bool SWAP_RB>
|
||||
FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src)
|
||||
{
|
||||
FragmentColor outColor;
|
||||
outColor.r = material_5bit_to_8bit[((SWAP_RB) ? ((src >> 10) & 0x001F) : ((src >> 0) & 0x001F))];
|
||||
outColor.g = material_5bit_to_8bit[((src >> 5) & 0x001F)];
|
||||
outColor.b = material_5bit_to_8bit[((SWAP_RB) ? ((src >> 0) & 0x001F) : ((src >> 10) & 0x001F))];
|
||||
outColor.a = 0xFF;
|
||||
|
||||
return outColor.color;
|
||||
return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF);
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
@ -1760,34 +1756,71 @@ FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor)
|
|||
#ifdef ENABLE_SSE2
|
||||
|
||||
template <bool SWAP_RB>
|
||||
FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dst0, __m128i &dst1)
|
||||
FORCEINLINE void ConvertColor555To8888Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
|
||||
{
|
||||
#if 0
|
||||
// I'm shelving this code until the time when I figure out how to do this conversion faster in SSE2
|
||||
// without using any memory lookups. This code does work, albeit slowly. -- rogerman, 2016-06-17
|
||||
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB8 << 3) | ((srcRGB8 >> 2) & 0x07)
|
||||
if (SWAP_RB)
|
||||
{
|
||||
dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000)));
|
||||
dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
|
||||
dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) );
|
||||
dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000));
|
||||
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 19), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x00070000)));
|
||||
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
|
||||
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00000007))) );
|
||||
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000));
|
||||
|
||||
dst1 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000)));
|
||||
dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
|
||||
dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) );
|
||||
dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000));
|
||||
dstHi = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00070000)));
|
||||
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
|
||||
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 23), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 28), _mm_set1_epi32(0x00000007))) );
|
||||
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000));
|
||||
}
|
||||
else
|
||||
{
|
||||
dst0 = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007)));
|
||||
dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
|
||||
dst0 = _mm_or_si128(dst0, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) );
|
||||
dst0 = _mm_or_si128(dst0, _mm_set1_epi32(0xFF000000));
|
||||
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00000007)));
|
||||
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 6), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_slli_epi32(src, 1), _mm_set1_epi32(0x00000700))) );
|
||||
dstLo = _mm_or_si128(dstLo, _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 9), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_slli_epi32(src, 4), _mm_set1_epi32(0x00070000))) );
|
||||
dstLo = _mm_or_si128(dstLo, _mm_set1_epi32(0xFF000000));
|
||||
|
||||
dst1 = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007)));
|
||||
dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
|
||||
dst1 = _mm_or_si128(dst1, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) );
|
||||
dst1 = _mm_or_si128(dst1, _mm_set1_epi32(0xFF000000));
|
||||
dstHi = _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 13), _mm_set1_epi32(0x000000F8)), _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x00000007)));
|
||||
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 10), _mm_set1_epi32(0x0000F800)), _mm_and_si128(_mm_srli_epi32(src, 15), _mm_set1_epi32(0x00000700))) );
|
||||
dstHi = _mm_or_si128(dstHi, _mm_or_si128(_mm_and_si128(_mm_srli_epi32(src, 7), _mm_set1_epi32(0x00F80000)), _mm_and_si128(_mm_srli_epi32(src, 12), _mm_set1_epi32(0x00070000))) );
|
||||
dstHi = _mm_or_si128(dstHi, _mm_set1_epi32(0xFF000000));
|
||||
}
|
||||
|
||||
__m128i tmpDstLo = dstLo;
|
||||
dstLo = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0xD8), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0x72), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
|
||||
dstHi = _mm_or_si128( _mm_and_si128(_mm_shuffle_epi32(tmpDstLo, 0x72), _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)), _mm_and_si128(_mm_shuffle_epi32(dstHi, 0xD8), _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)) );
|
||||
#else
|
||||
// This code does the same thing as the above, but with memory lookups. It's faster, but kinda
|
||||
// defeats the purpose of using SSE2 due to the memory lookups. -- rogerman, 2016-06-17
|
||||
|
||||
__m128i srcMasked = _mm_and_si128(src, _mm_set1_epi16(0x7FFF));
|
||||
|
||||
if (SWAP_RB)
|
||||
{
|
||||
dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 7)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 6)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 5)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 4)));
|
||||
dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 3)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 2)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 1)),
|
||||
COLOR555TO8888_OPAQUE_SWAP_RB(_mm_extract_epi16(srcMasked, 0)));
|
||||
}
|
||||
else
|
||||
{
|
||||
dstHi = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 7)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 6)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 5)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 4)));
|
||||
dstLo = _mm_set_epi32(COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 3)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 2)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 1)),
|
||||
COLOR555TO8888_OPAQUE(_mm_extract_epi16(srcMasked, 0)));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool SWAP_RB>
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
#include "ImageOut.h"
|
||||
#include "formats/rpng.h"
|
||||
#include "formats/rbmp.h"
|
||||
#include "gfx3d.h"
|
||||
#include "GPU.h"
|
||||
|
||||
static u8* Convert15To24(const u16* src, int width, int height)
|
||||
{
|
||||
|
@ -33,11 +33,9 @@ static u8* Convert15To24(const u16* src, int width, int height)
|
|||
{
|
||||
for(int x=0;x<width;x++)
|
||||
{
|
||||
u16 pixel = *src++;
|
||||
u32 expanded = RGB15TO32_NOALPHA(pixel);
|
||||
*tmp_inc++ = (expanded>>16)&0xFF;
|
||||
*tmp_inc++ = (expanded>>8)&0xFF;
|
||||
*tmp_inc++ = expanded&0xFF;
|
||||
u32 dst = ConvertColor555To8888Opaque<true>(*src++);
|
||||
*(u32 *)tmp_inc[i] = (dst & 0x00FFFFFF) | (*(u32 *)tmp_inc & 0xFF000000);
|
||||
tmp_inc += 3;
|
||||
}
|
||||
}
|
||||
return tmp_buffer;
|
||||
|
|
|
@ -320,12 +320,9 @@ static void do_video_conversion(AVIFile* avi, const u16* buffer)
|
|||
{
|
||||
for(int x=0;x<width;x++)
|
||||
{
|
||||
u16 col16 = *buffer++;
|
||||
col16 &=0x7FFF;
|
||||
u32 col24 = color_15bit_to_24bit[col16];
|
||||
*outbuf++ = (col24>>16)&0xFF;
|
||||
*outbuf++ = (col24>>8)&0xFF;
|
||||
*outbuf++ = col24&0xFF;
|
||||
u32 dst = ConvertColor555To8888Opaque<true>(*buffer++);
|
||||
*(u32 *)outbuf = (dst & 0x00FFFFFF) | (*(u32 *)outbuf & 0xFF000000);
|
||||
outbuf += 3;
|
||||
}
|
||||
|
||||
outbuf -= width*3*2;
|
||||
|
|
|
@ -1919,11 +1919,8 @@ static void DoDisplay(bool firstTime)
|
|||
|
||||
//convert pixel format to 32bpp for compositing
|
||||
//why do we do this over and over? well, we are compositing to
|
||||
//filteredbuffer32bpp, and it needs to get refreshed each frame..
|
||||
const int size = video.srcBufferSize/2;
|
||||
u16* src = (u16*)video.srcBuffer;
|
||||
for(int i=0;i<size;i++)
|
||||
video.buffer[i] = RGB15TO24_REVERSE(src[i]);
|
||||
//filteredbuffer32bpp, and it needs to get refreshed each frame.
|
||||
ConvertColorBuffer555To8888Opaque<true>((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16));
|
||||
|
||||
if(firstTime)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue