First: revert my changes to VertexLoader.cpp. I don't own the games that get the error, so I am reverting the changes until I can test them myself.

Second: an experiment. I implemented parallelization of texture decoding using OpenMP. This is mostly an experiment to test the performance on different OSes/platforms. On my system (Windows x64, AMD 1055T) it gives a speedup on large textures, but on an Intel dual core it gives a slowdown, so I limited its use to large textures and CPUs with more than 3 cores.
Please test and let me know whether it improves or degrades speed.
Note for Linux and OS X users: to test this code you will have to enable OpenMP support in your compiler.
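A minimal sketch of the two ideas at work, with hypothetical names (SetDecoderThreads and DecodeI8 are illustrative, not the commit's code): gate the thread count on texture size and core count, and compute each block's source offset from the loop counters instead of advancing src, so loop iterations become independent and safe under #pragma omp parallel for. Note that the diff's own expression, cpu_info.num_cores + 2 / 3, evaluates as num_cores + 0 under C++ precedence; (num_cores * 2) / 3 is used below as one plausible reading of the intent.

#include <omp.h>
#include <cstring>

typedef unsigned char u8;

// Only parallelize large textures, and cap the thread count so the decoder
// does not starve the rest of the emulator (assumed intent, see note above).
static void SetDecoderThreads(int width, int height, int num_cores)
{
	if (width > 127 && height > 127 && num_cores > 3)
		omp_set_num_threads((num_cores * 2) / 3);
	else
		omp_set_num_threads(1); // small texture: threading overhead dominates
}

// GX_TF_I8-style copy, restructured like the diff: yStep numbers the 8x4
// blocks in memory order (32 bytes each), xStep numbers their 8-byte rows,
// so any thread can decode any row of blocks in any order.
static void DecodeI8(u8 *dst, const u8 *src, int width, int height)
{
	const int Wsteps8 = (width + 7) / 8;
	#pragma omp parallel for
	for (int y = 0; y < height; y += 4)
		for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
			for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
				std::memcpy(dst + (y + iy) * width + x, src + 8 * xStep, 8);
}

The Wsteps4/Wsteps8 bookkeeping throughout the diff below is this same offset computation applied to each texture format's block size.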

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7284 8ced0084-cf51-0410-be5f-012b33b47a6e
Rodolfo Osvaldo Bogado 2011-03-04 22:48:54 +00:00
parent 756c40163d
commit c569b33829
4 changed files with 401 additions and 246 deletions

View File

@@ -278,9 +278,9 @@ namespace this_thread
inline void yield() inline void yield()
{ {
#ifdef _WIN32 #ifdef _WIN32
Sleep(1); Sleep(0);
#else #else
usleep(1000 * 1); sleep(0);
#endif #endif
} }

View File

@@ -27,7 +27,7 @@
#include "LookUpTables.h" #include "LookUpTables.h"
#include <cmath> #include <cmath>
#include <omp.h>
#if _M_SSE >= 0x401 #if _M_SSE >= 0x401
#include <smmintrin.h> #include <smmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
@@ -685,33 +685,48 @@ PC_TexFormat GetPC_TexFormat(int texformat, int tlutfmt)
//need to add DXT support too //need to add DXT support too
PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{ {
//Don't use multithreading on small textures
if(width > 127 && height > 127)
{
//don't spawn too many threads, they will kill the rest of the emu :)
omp_set_num_threads(cpu_info.num_cores + 2 / 3);
}
else
{
omp_set_num_threads(1);
}
int Wsteps4 = (width + 3) / 4;
int Wsteps8 = (width + 7) / 8;
switch (texformat) switch (texformat)
{ {
case GX_TF_C4: case GX_TF_C4:
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
} }
else else
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
} }
return GetPCFormatFromTLUTFormat(tlutfmt); return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_I4: case GX_TF_I4:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,yStep++)
for (int ix = 0; ix < 4; ix++) for (int ix = 0; ix < 4; ix++)
{ {
int val = src[ix]; int val = src[4 * yStep + ix];
dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4); dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4);
dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF); dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF);
} }
@@ -719,20 +734,24 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_I4_AS_I8; return PC_TEX_FMT_I4_AS_I8;
case GX_TF_I8: // speed critical case GX_TF_I8: // speed critical
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
memcpy(dst + (y + iy)*width+x, src, 8); {
((u64*)dst + (y + iy)*width+x)[0] = ((u64*)(src + 8 * xStep))[0];
}
} }
return PC_TEX_FMT_I8; return PC_TEX_FMT_I8;
case GX_TF_C8: case GX_TF_C8:
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} }
else else
{ {
@@ -740,36 +759,40 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#if _M_SSE >= 0x301 #if _M_SSE >= 0x301
if (cpu_info.bSSSE3) { if (cpu_info.bSSSE3) {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} else } else
#endif #endif
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} }
} }
return GetPCFormatFromTLUTFormat(tlutfmt); return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_IA4: case GX_TF_IA4:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesIA4((u16*)dst + (y + iy) * width + x, src); decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep);
} }
return PC_TEX_FMT_IA4_AS_IA8; return PC_TEX_FMT_IA4_AS_IA8;
case GX_TF_IA8: case GX_TF_IA8:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++)
{ {
u16 *ptr = (u16 *)dst + (y + iy) * width + x; u16 *ptr = (u16 *)dst + (y + iy) * width + x;
u16 *s = (u16 *)src; u16 *s = (u16 *)(src + 8 * xStep);
for(int j = 0; j < 4; j++) for(int j = 0; j < 4; j++)
*ptr++ = Common::swap16(*s++); *ptr++ = Common::swap16(*s++);
} }
@@ -780,27 +803,30 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr); decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
} }
else else
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr); decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr);
} }
return GetPCFormatFromTLUTFormat(tlutfmt); return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_RGB565: case GX_TF_RGB565:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
u16 *ptr = (u16 *)dst + (y + iy) * width + x; u16 *ptr = (u16 *)dst + (y + iy) * width + x;
u16 *s = (u16 *)src; u16 *s = (u16 *)(src + 8 * xStep);
for(int j = 0; j < 4; j++) for(int j = 0; j < 4; j++)
*ptr++ = Common::swap16(*s++); *ptr++ = Common::swap16(*s++);
} }
@@ -808,11 +834,12 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_RGB565; return PC_TEX_FMT_RGB565;
case GX_TF_RGB5A3: case GX_TF_RGB5A3:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
//decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src); decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep));
} }
return PC_TEX_FMT_BGRA32; return PC_TEX_FMT_BGRA32;
case GX_TF_RGBA8: // speed critical case GX_TF_RGBA8: // speed critical
@@ -821,6 +848,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#if _M_SSE >= 0x301 #if _M_SSE >= 0x301
if (cpu_info.bSSSE3) { if (cpu_info.bSSSE3) {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) { for (int y = 0; y < height; y += 4) {
__m128i* p = (__m128i*)(src + y * width * 4); __m128i* p = (__m128i*)(src + y * width * 4);
for (int x = 0; x < width; x += 4) { for (int x = 0; x < width; x += 4) {
@@ -862,12 +890,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#endif #endif
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{ {
const u8* src2 = src + 64 * yStep;
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src2 + 4 * iy + 16);
src += 64;
} }
} }
} }
@@ -894,18 +923,19 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
} }
return PC_TEX_FMT_DXT1; return PC_TEX_FMT_DXT1;
#else #else
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
{ {
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
{ {
decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width); const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep;
src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width);
decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width); src2 += sizeof(DXTBlock);
src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width);
decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width); src2 += sizeof(DXTBlock);
src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width);
decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width); src2 += sizeof(DXTBlock);
src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width);
} }
} }
#endif #endif
@@ -929,30 +959,45 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{ {
if(width > 127 && height > 127)
{
//don't spawn too many threads, they will kill the rest of the emu :)
omp_set_num_threads(cpu_info.num_cores + 2 / 3);
}
else
{
omp_set_num_threads(1);
}
int Wsteps4 = (width + 3) / 4;
int Wsteps8 = (width + 7) / 8;
switch (texformat) switch (texformat)
{ {
case GX_TF_C4: case GX_TF_C4:
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src, tlutaddr); decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
} }
else if(tlutfmt == 0) else if(tlutfmt == 0)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr); decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
} }
else else
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0; iy < 8; iy++, src += 4) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr); decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
} }
break; break;
case GX_TF_I4: case GX_TF_I4:
@@ -967,11 +1012,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2); const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4); const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6); const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0; iy < 8; iy += 2, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++)
{ {
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src); const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// We want the hi 4 bits of each 8-bit word replicated to 32-bit words: // We want the hi 4 bits of each 8-bit word replicated to 32-bit words:
// (00000000 00000000 HhGgFfEe DdCcBbAa) -> (00000000 00000000 HHGGFFEE DDCCBBAA) // (00000000 00000000 HhGgFfEe DdCcBbAa) -> (00000000 00000000 HHGGFFEE DDCCBBAA)
const __m128i i1 = _mm_and_si128(r0, kMask_xf0); const __m128i i1 = _mm_and_si128(r0, kMask_xf0);
@@ -1001,11 +1047,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics. // JSD optimized with SSE2 intrinsics.
// Produces a ~76% speed improvement over reference C implementation. // Produces a ~76% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
for (int iy = 0; iy < 8; iy += 2, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
{ {
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src); const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa) // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r1 = _mm_unpacklo_epi8(r0, r0); const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@@ -1086,17 +1133,17 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation // Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
{ for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
for (int iy = 0; iy < 4; ++iy, src+=8)
{ {
const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4); const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
__m128i *quaddst, r, rgba0, rgba1; __m128i *quaddst, r, rgba0, rgba1;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
r = _mm_loadl_epi64((const __m128i *)src); r = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to: // Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa) rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
rgba1 = _mm_shuffle_epi8(r, mask7654); // (hhhh gggg ffff eeee) rgba1 = _mm_shuffle_epi8(r, mask7654); // (hhhh gggg ffff eeee)
@@ -1105,17 +1152,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst, rgba0); _mm_storeu_si128(quaddst, rgba0);
_mm_storeu_si128(quaddst+1, rgba1); _mm_storeu_si128(quaddst+1, rgba1);
} }
}
} else } else
#endif #endif
// JSD optimized with SSE2 intrinsics. // JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation. // Produces an ~86% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
{ {
// Each loop iteration processes 4 rows from 4 64-bit reads. // Each loop iteration processes 4 rows from 4 64-bit reads.
const u8* src2 = src + 32 * yStep;
// TODO: is it more efficient to group the loads together sequentially and also the stores at the end? // TODO: is it more efficient to group the loads together sequentially and also the stores at the end?
// _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly WORSE, so I // _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly WORSE, so I
// went with _mm_stores. Perhaps there is some edge case here creating the terrible performance or we're // went with _mm_stores. Perhaps there is some edge case here creating the terrible performance or we're
@@ -1123,7 +1171,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
__m128i *quaddst; __m128i *quaddst;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src); const __m128i r0 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa) // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r1 = _mm_unpacklo_epi8(r0, r0); const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@@ -1139,8 +1187,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba1); _mm_storeu_si128(quaddst+1, rgba1);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8; src2 += 8;
const __m128i r2 = _mm_loadl_epi64((const __m128i *)src); const __m128i r2 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa) // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r3 = _mm_unpacklo_epi8(r2, r2); const __m128i r3 = _mm_unpacklo_epi8(r2, r2);
@@ -1156,8 +1204,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba3); _mm_storeu_si128(quaddst+1, rgba3);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8; src2 += 8;
const __m128i r4 = _mm_loadl_epi64((const __m128i *)src); const __m128i r4 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa) // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r5 = _mm_unpacklo_epi8(r4, r4); const __m128i r5 = _mm_unpacklo_epi8(r4, r4);
@@ -1173,8 +1221,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba5); _mm_storeu_si128(quaddst+1, rgba5);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8; src2 += 8;
const __m128i r6 = _mm_loadl_epi64((const __m128i *)src); const __m128i r6 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa) // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r7 = _mm_unpacklo_epi8(r6, r6); const __m128i r7 = _mm_unpacklo_epi8(r6, r6);
@@ -1189,7 +1237,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Store (hhhh gggg ffff eeee) out: // Store (hhhh gggg ffff eeee) out:
_mm_storeu_si128(quaddst+1, rgba7); _mm_storeu_si128(quaddst+1, rgba7);
src += 8;
} }
} }
#if 0 #if 0
@@ -1218,35 +1265,38 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} }
else if(tlutfmt == 0) else if(tlutfmt == 0)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} }
else else
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr); decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} }
break; break;
case GX_TF_IA4: case GX_TF_IA4:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesIA4RGBA(dst + (y + iy) * width + x, src); decodebytesIA4RGBA(dst + (y + iy) * width + x, src + 8 * xStep);
} }
break; break;
case GX_TF_IA8: case GX_TF_IA8:
@@ -1256,13 +1306,14 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces an ~50% speed improvement over SSE2 implementation. // Produces an ~50% speed improvement over SSE2 implementation.
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1); const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
// Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src); const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle to (ghhh efff cddd abbb) // Shuffle to (ghhh efff cddd abbb)
const __m128i r1 = _mm_shuffle_epi8(r0, mask); const __m128i r1 = _mm_shuffle_epi8(r0, mask);
_mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 ); _mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 );
@@ -1276,15 +1327,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL); const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L); const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL); const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
// Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value. // Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
// Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba) // Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src); const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src+ 8 * xStep));
// Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000 0h0f 0d0b) // Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000 0h0f 0d0b)
// This gets us only the I components. // This gets us only the I components.
@@ -1340,24 +1391,27 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
if (tlutfmt == 2) if (tlutfmt == 2)
{ {
// Special decoding is required for TLUT format 5A3 // Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)src, tlutaddr); decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
} }
else if (tlutfmt == 0) else if (tlutfmt == 0)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr); decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
} }
else else
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr); decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
} }
break; break;
case GX_TF_RGB565: case GX_TF_RGB565:
@@ -1369,12 +1423,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i kMaskG1 = _mm_set1_epi32(0x00000300); const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000); const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
const __m128i kAlpha = _mm_set1_epi32(0xFF000000); const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
__m128i *dxtsrc = (__m128i *)src; __m128i *dxtsrc = (__m128i *)(src + 8 * xStep);
// Load 4x 16-bit colors: (0000 0000 hgfe dcba) // Load 4x 16-bit colors: (0000 0000 hgfe dcba)
// where hg, fe, ba, and dc are 16-bit colors in big-endian order // where hg, fe, ba, and dc are 16-bit colors in big-endian order
const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc); const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
@@ -1458,13 +1512,14 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation // Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
u32 *newdst = dst+(y+iy)*width+x; u32 *newdst = dst+(y+iy)*width+x;
const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1); const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)src),mask); const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)),mask);
int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3 int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF. if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
{ {
@@ -1549,12 +1604,13 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics (2 in 4 cases) // JSD optimized with SSE2 intrinsics (2 in 4 cases)
// Produces a ~25% speed improvement over reference C implementation. // Produces a ~25% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0; iy < 4; iy++, src += 8) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{ {
u32 *newdst = dst+(y+iy)*width+x; u32 *newdst = dst+(y+iy)*width+x;
const u16 *newsrc = (const u16*)src; const u16 *newsrc = (const u16*)(src + 8 * xStep);
// TODO: weak point // TODO: weak point
const u16 val0 = Common::swap16(newsrc[0]); const u16 val0 = Common::swap16(newsrc[0]);
@@ -1669,14 +1725,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~30% speed improvement over SSE2 implementation // Produces a ~30% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4, src += 64) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{ {
const u8* src2 = src + 64 * yStep;
const __m128i mask0312 = _mm_set_epi8(12,15,13,14,8,11,9,10,4,7,5,6,0,3,1,2); const __m128i mask0312 = _mm_set_epi8(12,15,13,14,8,11,9,10,4,7,5,6,0,3,1,2);
const __m128i ar0 = _mm_loadu_si128((__m128i*)src); const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1); const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2); const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3); const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
const __m128i rgba00 = _mm_shuffle_epi8(_mm_unpacklo_epi8(ar0,gb0),mask0312); const __m128i rgba00 = _mm_shuffle_epi8(_mm_unpacklo_epi8(ar0,gb0),mask0312);
@@ -1698,8 +1756,9 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics // JSD optimized with SSE2 intrinsics
// Produces a ~68% speed improvement over reference C implementation. // Produces a ~68% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4, src += 64) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{ {
// Input is divided up into 16-bit words. The texels are split up into AR and GB components where all // Input is divided up into 16-bit words. The texels are split up into AR and GB components where all
// AR components come grouped up first in 32 bytes followed by the GB components in 32 bytes. We are // AR components come grouped up first in 32 bytes followed by the GB components in 32 bytes. We are
@@ -1718,15 +1777,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// (RGBA7 RGBA6 RGBA5 RGBA4) // (RGBA7 RGBA6 RGBA5 RGBA4)
// (RGBAb RGBAa RGBA9 RGBA8) // (RGBAb RGBAa RGBA9 RGBA8)
// (RGBAf RGBAe RGBAd RGBAc) // (RGBAf RGBAe RGBAd RGBAc)
const u8* src2 = src + 64 * yStep;
// Loads the 1st half of AR components ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0]) // Loads the 1st half of AR components ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
const __m128i ar0 = _mm_loadu_si128((__m128i*)src); const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
// Loads the 2nd half of AR components ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8]) // Loads the 2nd half of AR components ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1); const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
// Loads the 1st half of GB components ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0]) // Loads the 1st half of GB components ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2); const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
// Loads the 2nd half of GB components ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8]) // Loads the 2nd half of GB components ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3); const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
__m128i rgba00, rgba01, rgba10, rgba11; __m128i rgba00, rgba01, rgba10, rgba11;
const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL); const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L); const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
@@ -1811,15 +1870,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation. // Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is // The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
// faster than both. // faster than both.
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
{ {
for (int x = 0; x < width; x += 8) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
{ {
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers. // We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit. // This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit.
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable // Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
// at this level, so we do. // at this level, so we do.
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2) for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
{ {
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they // JSD NOTE: You may see many strange patterns of behavior in the below code, but they
// are for performance reasons. Sometimes, calculating what should be obvious hard-coded // are for performance reasons. Sometimes, calculating what should be obvious hard-coded
@@ -1833,7 +1893,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()); const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
// Load 128 bits, i.e. two DXTBlocks (64-bits each) // Load 128 bits, i.e. two DXTBlocks (64-bits each)
const __m128i dxt = _mm_loadu_si128((__m128i *)src); const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 2 * xStep));
// Copy the 2-bit indices from each DXT block: // Copy the 2-bit indices from each DXT block:
GC_ALIGNED16( u32 dxttmp[4] ); GC_ALIGNED16( u32 dxttmp[4] );
@@ -2450,8 +2510,8 @@ void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth
const char* texfmt[] = { const char* texfmt[] = {
// pixel // pixel
"I4", "I8", "IA4", "IA8", "I4", "I8", "IA4", "IA8",
"RGB565", "RGB5A3", "RGBA8", "C4", "RGB565", "RGB5A3", "RGBA8", "0x07",
"C8", "C14X2", "0x0A", "0x0B", "C4", "C8", "C14X2", "0x0B",
"0x0C", "0x0D", "CMPR", "0x0F", "0x0C", "0x0D", "CMPR", "0x0F",
// Z-buffer // Z-buffer
"0x10", "Z8", "0x12", "Z16", "0x10", "Z8", "0x12", "Z16",

View File

@@ -514,20 +514,18 @@ void VertexLoader::WriteSetVariable(int bits, void *address, OpArg value)
void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count) void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
{ {
if(count == 0)
return;
m_numLoadedVertices += count; m_numLoadedVertices += count;
INCSTAT(stats.thisFrame.numDrawCalls);
// Flush if our vertex format is different from the currently set. // Flush if our vertex format is different from the currently set.
if (g_nativeVertexFmt != m_NativeFmt) if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
{ {
// We really must flush here. It's possible that the native representations // We really must flush here. It's possible that the native representations
// of the two vtx formats are the same, but we have no way to easily check that // of the two vtx formats are the same, but we have no way to easily check that
// now. // now.
VertexManager::Flush(); VertexManager::Flush();
g_nativeVertexFmt = m_NativeFmt; // Also move the Set() here?
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
} }
g_nativeVertexFmt = m_NativeFmt;
if (bpmem.genMode.cullmode == 3 && primitive < 5) if (bpmem.genMode.cullmode == 3 && primitive < 5)
{ {
@@ -536,6 +534,8 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
return; return;
} }
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
// Load position and texcoord scale factors. // Load position and texcoord scale factors.
m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac; m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac; m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
@@ -555,18 +555,86 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
for (int i = 0; i < 2; i++) for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements; colElements[i] = m_VtxAttr.color[i].Elements;
if(VertexManager::GetRemainingSize() < native_stride * count) // if strips or fans, make sure all vertices can fit in buffer, otherwise flush
int granularity = 1;
switch (primitive) {
case 3: // strip .. hm, weird
case 4: // fan
if (VertexManager::GetRemainingSize() < 3 * native_stride)
VertexManager::Flush(); VertexManager::Flush();
break;
case 6: // line strip
if (VertexManager::GetRemainingSize() < 2 * native_stride)
VertexManager::Flush();
break;
case 0: granularity = 4; break; // quads
case 2: granularity = 3; break; // tris
case 5: granularity = 2; break; // lines
}
VertexManager::AddVertices(primitive,count); int startv = 0, extraverts = 0;
int v = 0;
//int remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
while (v < count)
{
int remainingVerts = VertexManager::GetRemainingSize() / native_stride;
//if (remainingVerts2 - v + startv < remainingVerts)
//remainingVerts = remainingVerts2 - v + startv;
if (remainingVerts < granularity) {
INCSTAT(stats.thisFrame.numBufferSplits);
// This buffer full - break current primitive and flush, to switch to the next buffer.
u8* plastptr = VertexManager::s_pCurBufferPointer;
if (v - startv > 0)
VertexManager::AddVertices(primitive, v - startv + extraverts);
VertexManager::Flush();
//remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
// Why does this need to be so complicated?
switch (primitive) {
case 3: // triangle strip, copy last two vertices
// a little trick since we have to keep track of signs
if (v & 1) {
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-2*native_stride, native_stride);
memcpy_gc(VertexManager::s_pCurBufferPointer+native_stride, plastptr-native_stride*2, 2*native_stride);
VertexManager::s_pCurBufferPointer += native_stride*3;
extraverts = 3;
}
else {
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*2, native_stride*2);
VertexManager::s_pCurBufferPointer += native_stride*2;
extraverts = 2;
}
break;
case 4: // tri fan, copy first and last vert
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*(v-startv+extraverts), native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
extraverts = 2;
break;
case 6: // line strip
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
extraverts = 1;
break;
default:
extraverts = 0;
break;
}
startv = v;
}
int remainingPrims = remainingVerts / granularity;
remainingVerts = remainingPrims * granularity;
if (count - v < remainingVerts)
remainingVerts = count - v;
#ifdef USE_JIT #ifdef USE_JIT
if (count > 0) { if (remainingVerts > 0) {
loop_counter = count; loop_counter = remainingVerts;
((void (*)())(void*)m_compiledCode)(); ((void (*)())(void*)m_compiledCode)();
} }
#else #else
for (int s = 0; s < count; s++) for (int s = 0; s < remainingVerts; s++)
{ {
tcIndex = 0; tcIndex = 0;
colIndex = 0; colIndex = 0;
@@ -576,6 +644,11 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
PRIM_LOG("\n"); PRIM_LOG("\n");
} }
#endif #endif
v += remainingVerts;
}
if (startv < count)
VertexManager::AddVertices(primitive, count - startv + extraverts);
} }
@@ -584,17 +657,17 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data) void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data)
{ {
m_numLoadedVertices += count; m_numLoadedVertices += count;
INCSTAT(stats.thisFrame.numDrawCalls);
// Flush if our vertex format is different from the currently set. // Flush if our vertex format is different from the currently set.
if (g_nativeVertexFmt != m_NativeFmt) if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
{ {
// We really must flush here. It's possible that the native representations // We really must flush here. It's possible that the native representations
// of the two vtx formats are the same, but we have no way to easily check that // of the two vtx formats are the same, but we have no way to easily check that
// now. // now.
VertexManager::Flush(); VertexManager::Flush();
g_nativeVertexFmt = m_NativeFmt; // Also move the Set() here?
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
} }
g_nativeVertexFmt = m_NativeFmt;
if (bpmem.genMode.cullmode == 3 && primitive < 5) if (bpmem.genMode.cullmode == 3 && primitive < 5)
{ {
@@ -603,6 +676,27 @@ void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int co
return; return;
} }
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
// Load position and texcoord scale factors.
m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
m_VtxAttr.texCoord[1].Frac = g_VtxAttr[vtx_attr_group].g1.Tex1Frac;
m_VtxAttr.texCoord[2].Frac = g_VtxAttr[vtx_attr_group].g1.Tex2Frac;
m_VtxAttr.texCoord[3].Frac = g_VtxAttr[vtx_attr_group].g1.Tex3Frac;
m_VtxAttr.texCoord[4].Frac = g_VtxAttr[vtx_attr_group].g2.Tex4Frac;
m_VtxAttr.texCoord[5].Frac = g_VtxAttr[vtx_attr_group].g2.Tex5Frac;
m_VtxAttr.texCoord[6].Frac = g_VtxAttr[vtx_attr_group].g2.Tex6Frac;
m_VtxAttr.texCoord[7].Frac = g_VtxAttr[vtx_attr_group].g2.Tex7Frac;
pVtxAttr = &m_VtxAttr;
posScale = fractionTable[m_VtxAttr.PosFrac];
if (m_NativeFmt->m_components & VB_HAS_UVALL)
for (int i = 0; i < 8; i++)
tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements;
if(VertexManager::GetRemainingSize() < native_stride * count) if(VertexManager::GetRemainingSize() < native_stride * count)
VertexManager::Flush(); VertexManager::Flush();
memcpy_gc(VertexManager::s_pCurBufferPointer, Data, native_stride * count); memcpy_gc(VertexManager::s_pCurBufferPointer, Data, native_stride * count);

View File

@@ -152,6 +152,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile> <ClCompile>
<AdditionalIncludeDirectories>..\Common\Src;..\Core\Src;..\..\..\Externals\SOIL;..\..\..\Externals\CLRun\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <AdditionalIncludeDirectories>..\Common\Src;..\Core\Src;..\..\..\Externals\SOIL;..\..\..\Externals\CLRun\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile> </ClCompile>
<Link> <Link>
<GenerateDebugInformation>true</GenerateDebugInformation> <GenerateDebugInformation>true</GenerateDebugInformation>
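For Linux and OS X (per the note in the message above), OpenMP support is a compiler switch: -fopenmp for GCC, /openmp for MSVC (which is what the <OpenMPSupport>true</OpenMPSupport> entry above enables for the Release|x64 build). A small standalone check, assuming nothing beyond the standard _OPENMP macro, to confirm a build actually has it enabled:

#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
	// _OPENMP expands to the spec date, e.g. 200805 for OpenMP 3.0.
	std::printf("OpenMP %d, max threads %d\n", _OPENMP, omp_get_max_threads());
#else
	std::printf("Compiled without OpenMP; pass -fopenmp (GCC) or /openmp (MSVC).\n");
#endif
	return 0;
}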