First: revert my changes to VertexLoader.cpp. I don't own the games that get errors, so I'm reverting the changes until I can test them myself.

Second:
An experiment: I implemented parallelization of texture decoding using OpenMP. This is mostly an experiment to test the performance on different OSes/platforms. On my system (Windows x64, AMD 1055T) it gives a speedup on large textures, but on an Intel dual core it gives a slowdown, so I limited its use to large textures and CPUs with more than 3 cores.
Please test and let me know whether it improves or degrades speed.
Note for Linux and OS X users: to test this code you will have to enable OpenMP support in your compiler.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7284 8ced0084-cf51-0410-be5f-012b33b47a6e
Author: Rodolfo Osvaldo Bogado, 2011-03-04 22:48:54 +00:00
Parent: 756c40163d
Commit: c569b33829
4 changed files with 401 additions and 246 deletions
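For reference, the thread-count gating added in TextureDecoder.cpp boils down to the sketch below. It is illustrative only: SetDecoderThreadCount is a made-up name, and the stubbed cpu_info stands in for Dolphin's CPU detection struct.

#include <omp.h>

// Stand-in for Dolphin's CPU detection (cpu_info.num_cores in the real code).
static struct { int num_cores; } cpu_info = { 6 };

// Only textures of at least 128x128 texels are decoded in parallel, and the
// thread count is roughly a third of the cores so the rest of the emulator
// keeps some CPU time; small textures stay single-threaded because the
// fork/join overhead outweighs the work.
static void SetDecoderThreadCount(int width, int height)
{
	if (width > 127 && height > 127)
		omp_set_num_threads((cpu_info.num_cores + 2) / 3); // e.g. 6 cores -> 2 threads
	else
		omp_set_num_threads(1);
}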

View File

@ -278,9 +278,9 @@ namespace this_thread
inline void yield()
{
#ifdef _WIN32
Sleep(1);
Sleep(0);
#else
usleep(1000 * 1);
sleep(0);
#endif
}

View File

@ -27,7 +27,7 @@
#include "LookUpTables.h"
#include <cmath>
#include <omp.h>
#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
@ -685,33 +685,48 @@ PC_TexFormat GetPC_TexFormat(int texformat, int tlutfmt)
//need to add DXT support too
PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{
//Don't use multithreading for small textures
if (width > 127 && height > 127)
{
//don't spawn too many threads; they will kill the rest of the emu :)
omp_set_num_threads((cpu_info.num_cores + 2) / 3);
}
else
{
omp_set_num_threads(1);
}
int Wsteps4 = (width + 3) / 4;
int Wsteps8 = (width + 7) / 8;
switch (texformat)
{
case GX_TF_C4:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
}
else
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_I4:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
for (int ix = 0; ix < 4; ix++)
{
int val = src[ix];
int val = src[4 * xStep + ix];
dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4);
dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF);
}
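The change repeated throughout this file is visible in the hunks above: the old loops advanced src as a running pointer (src += 4 per texel row), which makes each iteration depend on the previous one, while the new loops compute every read position from the loop indices so that #pragma omp parallel for can split the y-loop across threads. Below is a minimal sketch of the transformation for an 8x8-texel, 4bpp tile format; decode_row is a hypothetical stand-in for the per-row decoders (decodebytesC4_* and friends), and dst is assumed to be one byte per texel.

// Hypothetical per-row decoder standing in for decodebytesC4_*, etc.
void decode_row(unsigned char *dst_row, const unsigned char *src_row);

void decode_tiles(unsigned char *dst, const unsigned char *src, int width, int height)
{
	int Wsteps8 = (width + 7) / 8; // number of 8-texel tile columns
	// Old form (serial): for (y) for (x) for (iy; src += 4) decode_row(..., src);
	// New form: (y, x, iy) maps to the absolute offset 4 * xStep, where
	// yStep is the tile index in raster order and xStep the row index
	// within the stream, so y-iterations are independent.
	#pragma omp parallel for
	for (int y = 0; y < height; y += 8)
		for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
			for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
				decode_row(dst + (y + iy) * width + x, src + 4 * xStep);
}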
@ -719,20 +734,24 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_I4_AS_I8;
case GX_TF_I8: // speed critical
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
memcpy(dst + (y + iy)*width+x, src, 8);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
((u64*)(dst + (y + iy)*width+x))[0] = ((u64*)(src + 8 * xStep))[0];
}
}
return PC_TEX_FMT_I8;
case GX_TF_C8:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
}
else
{
@ -740,36 +759,40 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#if _M_SSE >= 0x301
if (cpu_info.bSSSE3) {
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
} else
#endif
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
}
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_IA4:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesIA4((u16*)dst + (y + iy) * width + x, src);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep);
}
return PC_TEX_FMT_IA4_AS_IA8;
case GX_TF_IA8:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++)
{
u16 *ptr = (u16 *)dst + (y + iy) * width + x;
u16 *s = (u16 *)src;
u16 *s = (u16 *)(src + 8 * xStep);
for(int j = 0; j < 4; j++)
*ptr++ = Common::swap16(*s++);
}
@ -780,27 +803,30 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
}
else
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr);
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_RGB565:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
u16 *ptr = (u16 *)dst + (y + iy) * width + x;
u16 *s = (u16 *)src;
u16 *s = (u16 *)(src + 8 * xStep);
for(int j = 0; j < 4; j++)
*ptr++ = Common::swap16(*s++);
}
@ -808,11 +834,12 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_RGB565;
case GX_TF_RGB5A3:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
//decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src);
decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep));
}
return PC_TEX_FMT_BGRA32;
case GX_TF_RGBA8: // speed critical
@ -821,6 +848,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#if _M_SSE >= 0x301
if (cpu_info.bSSSE3) {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) {
__m128i* p = (__m128i*)(src + y * width * 4);
for (int x = 0; x < width; x += 4) {
@ -862,12 +890,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
#endif
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
const u8* src2 = src + 64 * yStep;
for (int iy = 0; iy < 4; iy++)
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
src += 64;
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16);
}
}
}
@ -894,18 +923,19 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
}
return PC_TEX_FMT_DXT1;
#else
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
{
for (int x = 0; x < width; x += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
{
decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep;
decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width);
src2 += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width);
src2 += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width);
src2 += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width);
}
}
#endif
@ -929,30 +959,45 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{
if (width > 127 && height > 127)
{
//don't spawn too many threads; they will kill the rest of the emu :)
omp_set_num_threads((cpu_info.num_cores + 2) / 3);
}
else
{
omp_set_num_threads(1);
}
int Wsteps4 = (width + 3) / 4;
int Wsteps8 = (width + 7) / 8;
switch (texformat)
{
case GX_TF_C4:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
}
else if(tlutfmt == 0)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
}
else
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
}
break;
case GX_TF_I4:
@ -967,11 +1012,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy += 2, src += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++)
{
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// We want the hi 4 bits of each 8-bit word replicated to 32-bit words:
// (00000000 00000000 HhGgFfEe DdCcBbAa) -> (00000000 00000000 HHGGFFEE DDCCBBAA)
const __m128i i1 = _mm_and_si128(r0, kMask_xf0);
@ -1001,11 +1047,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics.
// Produces a ~76% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy += 2, src += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
{
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@ -1086,17 +1133,17 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
{
for (int iy = 0; iy < 4; ++iy, src+=8)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
{
const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
__m128i *quaddst, r, rgba0, rgba1;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
r = _mm_loadl_epi64((const __m128i *)src);
r = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
rgba1 = _mm_shuffle_epi8(r, mask7654); // (hhhh gggg ffff eeee)
@ -1105,17 +1152,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst, rgba0);
_mm_storeu_si128(quaddst+1, rgba1);
}
}
} else
#endif
// JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
{
// Each loop iteration processes 4 rows from 4 64-bit reads.
const u8* src2 = src + 32 * yStep;
// TODO: is it more efficient to group the loads together sequentially and also the stores at the end?
// _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly WORSE, so I
// went with _mm_stores. Perhaps there is some edge case here creating the terrible performance or we're
@ -1123,7 +1171,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
__m128i *quaddst;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@ -1139,8 +1187,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba1);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8;
const __m128i r2 = _mm_loadl_epi64((const __m128i *)src);
src2 += 8;
const __m128i r2 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r3 = _mm_unpacklo_epi8(r2, r2);
@ -1156,8 +1204,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba3);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8;
const __m128i r4 = _mm_loadl_epi64((const __m128i *)src);
src2 += 8;
const __m128i r4 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r5 = _mm_unpacklo_epi8(r4, r4);
@ -1173,8 +1221,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst+1, rgba5);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src += 8;
const __m128i r6 = _mm_loadl_epi64((const __m128i *)src);
src2 += 8;
const __m128i r6 = _mm_loadl_epi64((const __m128i *)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
const __m128i r7 = _mm_unpacklo_epi8(r6, r6);
@ -1188,8 +1236,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(quaddst, rgba6);
// Store (hhhh gggg ffff eeee) out:
_mm_storeu_si128(quaddst+1, rgba7);
src += 8;
}
}
#if 0
@ -1218,35 +1265,38 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
}
else if(tlutfmt == 0)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
}
else
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
}
break;
case GX_TF_IA4:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesIA4RGBA(dst + (y + iy) * width + x, src);
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesIA4RGBA(dst + (y + iy) * width + x, src + 8 * xStep);
}
break;
case GX_TF_IA8:
@ -1256,13 +1306,14 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces an ~50% speed improvement over SSE2 implementation.
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
// Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
// Shuffle to (ghhh efff cddd abbb)
const __m128i r1 = _mm_shuffle_epi8(r0, mask);
_mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 );
@ -1276,15 +1327,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
// Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
// Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src+ 8 * xStep));
// Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000 0h0f 0d0b)
// This gets us only the I components.
@ -1340,24 +1391,27 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
}
else if (tlutfmt == 0)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
}
else
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
}
break;
case GX_TF_RGB565:
@ -1369,12 +1423,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
__m128i *dxtsrc = (__m128i *)src;
__m128i *dxtsrc = (__m128i *)(src + 8 * xStep);
// Load 4x 16-bit colors: (0000 0000 hgfe dcba)
// where hg, fe, ba, and dc are 16-bit colors in big-endian order
const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
@ -1458,103 +1512,105 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
{
u32 *newdst = dst+(y+iy)*width+x;
const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)src),mask);
int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
{
// Swizzle bits: 00012345 -> 12345123
//r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2);
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
//g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
//b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
_mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
_mm_storeu_si128( (__m128i*)newdst, final );
}
else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443.
{
// Swizzle bits: 00001234 -> 12341234
//r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf);
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
//g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
//b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
//a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
const __m128i aV = _mm_or_si128(
_mm_slli_epi16(tmpaV, 5),
_mm_or_si128(
_mm_slli_epi16(tmpaV, 2),
_mm_srli_epi16(tmpaV, 1)
)
);
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
_mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
_mm_storeu_si128( (__m128i*)newdst, final );
}
else
{
// TODO: Vectorise (Either 4-way branch or do both and select is better than this)
u32 *vals = (u32*) &valV;
int r,g,b,a;
for (int i=0; i < 4; ++i)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
u32 *newdst = dst+(y+iy)*width+x;
const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)),mask);
int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
{
if (vals[i] & 0x8000)
{
// Swizzle bits: 00012345 -> 12345123
r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2);
g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2);
b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2);
a = 0xFF;
}
else
{
a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1);
// Swizzle bits: 00001234 -> 12341234
r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf);
g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf);
b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf);
}
newdst[i] = r | (g << 8) | (b << 16) | (a << 24);
// Swizzle bits: 00012345 -> 12345123
//r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2);
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
//g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
//b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
_mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
_mm_storeu_si128( (__m128i*)newdst, final );
}
}
}
else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443.
{
// Swizzle bits: 00001234 -> 12341234
//r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf);
const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
//g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
//b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
//a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
const __m128i aV = _mm_or_si128(
_mm_slli_epi16(tmpaV, 5),
_mm_or_si128(
_mm_slli_epi16(tmpaV, 2),
_mm_srli_epi16(tmpaV, 1)
)
);
//newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
_mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
_mm_storeu_si128( (__m128i*)newdst, final );
}
else
{
// TODO: Vectorise (Either 4-way branch or do both and select is better than this)
u32 *vals = (u32*) &valV;
int r,g,b,a;
for (int i=0; i < 4; ++i)
{
if (vals[i] & 0x8000)
{
// Swizzle bits: 00012345 -> 12345123
r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2);
g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2);
b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2);
a = 0xFF;
}
else
{
a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1);
// Swizzle bits: 00001234 -> 12341234
r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf);
g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf);
b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf);
}
newdst[i] = r | (g << 8) | (b << 16) | (a << 24);
}
}
}
} else
#endif
// JSD optimized with SSE2 intrinsics (2 in 4 cases)
// Produces a ~25% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
u32 *newdst = dst+(y+iy)*width+x;
const u16 *newsrc = (const u16*)src;
const u16 *newsrc = (const u16*)(src + 8 * xStep);
// TODO: weak point
const u16 val0 = Common::swap16(newsrc[0]);
@ -1669,14 +1725,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~30% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4, src += 64)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
const u8* src2 = src + 64 * yStep;
const __m128i mask0312 = _mm_set_epi8(12,15,13,14,8,11,9,10,4,7,5,6,0,3,1,2);
const __m128i ar0 = _mm_loadu_si128((__m128i*)src);
const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1);
const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2);
const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3);
const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
const __m128i rgba00 = _mm_shuffle_epi8(_mm_unpacklo_epi8(ar0,gb0),mask0312);
@ -1698,8 +1756,9 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics
// Produces a ~68% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4, src += 64)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
// Input is divided up into 16-bit words. The texels are split up into AR and GB components where all
// AR components come grouped up first in 32 bytes followed by the GB components in 32 bytes. We are
@ -1718,15 +1777,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// (RGBA7 RGBA6 RGBA5 RGBA4)
// (RGBAb RGBAa RGBA9 RGBA8)
// (RGBAf RGBAe RGBAd RGBAc)
const u8* src2 = src + 64 * yStep;
// Loads the 1st half of AR components ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
const __m128i ar0 = _mm_loadu_si128((__m128i*)src);
const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
// Loads the 2nd half of AR components ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1);
const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
// Loads the 1st half of GB components ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2);
const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
// Loads the 2nd half of GB components ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3);
const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
__m128i rgba00, rgba01, rgba10, rgba11;
const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
@ -1790,7 +1849,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_storeu_si128(dst128, rgba10);
dst128 = (__m128i*)( dst + (y + 3) * width + x );
_mm_storeu_si128(dst128, rgba11);
}
}
}
#if 0
// Reference C implementation.
@ -1811,15 +1870,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
// faster than both.
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
{
for (int x = 0; x < width; x += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
{
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit.
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
// at this level, so we do.
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2)
for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
{
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
// are for performance reasons. Sometimes, calculating what should be obvious hard-coded
@ -1833,7 +1893,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
// Load 128 bits, i.e. two DXTBlocks (64-bits each)
const __m128i dxt = _mm_loadu_si128((__m128i *)src);
const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 2 * xStep));
// Copy the 2-bit indices from each DXT block:
GC_ALIGNED16( u32 dxttmp[4] );
@ -2032,7 +2092,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
assert( memcmp(&(tmp0[3]), &dst32[(width * 3)], 16) == 0 );
assert( memcmp(&(tmp1[3]), &dst32[(width * 3) + 4], 16) == 0 );
#endif
}
}
}
}
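For context on the CMPR path above: each decodeDXTBlock call and each SSE2 load consumes 8-byte DXT1 sub-blocks, four of which make up one 8x8 CMPR tile (hence xStep advancing two blocks per z iteration). The layout, as I understand it from TextureDecoder.h (u16/u8 are Dolphin's fixed-width typedefs):

// One CMPR/DXT1 sub-block (8 bytes); an 8x8 CMPR tile holds four of these,
// one per 4x4 quadrant.
struct DXTBlock
{
	u16 color1;   // endpoint color 0, RGB565 (big-endian in GameCube memory)
	u16 color2;   // endpoint color 1, RGB565
	u8 lines[4];  // four rows of four 2-bit indices into the derived 4-color palette
};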
#if 0
@ -2450,8 +2510,8 @@ void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth
const char* texfmt[] = {
// pixel
"I4", "I8", "IA4", "IA8",
"RGB565", "RGB5A3", "RGBA8", "C4",
"C8", "C14X2", "0x0A", "0x0B",
"RGB565", "RGB5A3", "RGBA8", "0x07",
"C4", "C8", "C14X2", "0x0B",
"0x0C", "0x0D", "CMPR", "0x0F",
// Z-buffer
"0x10", "Z8", "0x12", "Z16",

View File

@ -514,27 +514,27 @@ void VertexLoader::WriteSetVariable(int bits, void *address, OpArg value)
void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
{
if(count == 0)
return;
m_numLoadedVertices += count;
INCSTAT(stats.thisFrame.numDrawCalls);
// Flush if our vertex format is different from the currently set.
if (g_nativeVertexFmt != m_NativeFmt)
if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
{
// We really must flush here. It's possible that the native representations
// of the two vtx formats are the same, but we have no way to easily check that
// now.
VertexManager::Flush();
g_nativeVertexFmt = m_NativeFmt;
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
}
// Also move the Set() here?
}
g_nativeVertexFmt = m_NativeFmt;
if (bpmem.genMode.cullmode == 3 && primitive < 5)
{
// if cull mode is none, ignore triangles and quads
DataSkip(count * m_VertexSize);
return;
}
}
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
// Load position and texcoord scale factors.
m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
@ -554,19 +554,87 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements;
if(VertexManager::GetRemainingSize() < native_stride * count)
VertexManager::Flush();
VertexManager::AddVertices(primitive,count);
// if strips or fans, make sure all vertices can fit in buffer, otherwise flush
int granularity = 1;
switch (primitive) {
case 3: // strip .. hm, weird
case 4: // fan
if (VertexManager::GetRemainingSize() < 3 * native_stride)
VertexManager::Flush();
break;
case 6: // line strip
if (VertexManager::GetRemainingSize() < 2 * native_stride)
VertexManager::Flush();
break;
case 0: granularity = 4; break; // quads
case 2: granularity = 3; break; // tris
case 5: granularity = 2; break; // lines
}
int startv = 0, extraverts = 0;
int v = 0;
//int remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
while (v < count)
{
int remainingVerts = VertexManager::GetRemainingSize() / native_stride;
//if (remainingVerts2 - v + startv < remainingVerts)
//remainingVerts = remainingVerts2 - v + startv;
if (remainingVerts < granularity) {
INCSTAT(stats.thisFrame.numBufferSplits);
// This buffer full - break current primitive and flush, to switch to the next buffer.
u8* plastptr = VertexManager::s_pCurBufferPointer;
if (v - startv > 0)
VertexManager::AddVertices(primitive, v - startv + extraverts);
VertexManager::Flush();
//remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
// Why does this need to be so complicated?
switch (primitive) {
case 3: // triangle strip, copy last two vertices
// a little trick since we have to keep track of signs
if (v & 1) {
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-2*native_stride, native_stride);
memcpy_gc(VertexManager::s_pCurBufferPointer+native_stride, plastptr-native_stride*2, 2*native_stride);
VertexManager::s_pCurBufferPointer += native_stride*3;
extraverts = 3;
}
else {
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*2, native_stride*2);
VertexManager::s_pCurBufferPointer += native_stride*2;
extraverts = 2;
}
break;
case 4: // tri fan, copy first and last vert
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*(v-startv+extraverts), native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
extraverts = 2;
break;
case 6: // line strip
memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
VertexManager::s_pCurBufferPointer += native_stride;
extraverts = 1;
break;
default:
extraverts = 0;
break;
}
startv = v;
}
int remainingPrims = remainingVerts / granularity;
remainingVerts = remainingPrims * granularity;
if (count - v < remainingVerts)
remainingVerts = count - v;
#ifdef USE_JIT
if (count > 0) {
loop_counter = count;
if (remainingVerts > 0) {
loop_counter = remainingVerts;
((void (*)())(void*)m_compiledCode)();
}
#else
for (int s = 0; s < count; s++)
for (int s = 0; s < remainingVerts; s++)
{
tcIndex = 0;
colIndex = 0;
@ -575,7 +643,12 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
m_PipelineStages[i]();
PRIM_LOG("\n");
}
#endif
#endif
v += remainingVerts;
}
if (startv < count)
VertexManager::AddVertices(primitive, count - startv + extraverts);
}
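The restored splitting logic above answers its own "Why does this need to be so complicated?": a chunk may never end mid-primitive, and strips/fans must re-seed the next buffer so the primitive continues correctly across the flush. A condensed sketch of the two ideas; granularity_for is a hypothetical helper mirroring the switch above.

// Vertices per whole primitive: a chunk boundary must fall on a multiple of
// this so no quad/triangle/line is torn across two buffers.
int granularity_for(int primitive)
{
	switch (primitive)
	{
	case 0: return 4;  // quads
	case 2: return 3;  // triangles
	case 5: return 2;  // lines
	default: return 1; // strips/fans: restart is handled by copying vertices
	}
}

// Triangle strips also track winding: triangle i of a strip uses vertices
// (i, i+1, i+2) and flips its facing when i is odd. Restarting after an odd
// split with just the last two vertices would begin the new buffer with the
// wrong facing, so the code copies three vertices (one duplicated, forming a
// degenerate triangle) to flip the parity back; after an even split, two
// copies suffice. Fans re-emit the center vertex and the most recent one.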
@ -584,18 +657,18 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data)
{
m_numLoadedVertices += count;
INCSTAT(stats.thisFrame.numDrawCalls);
// Flush if our vertex format is different from the currently set.
if (g_nativeVertexFmt != m_NativeFmt)
if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
{
// We really must flush here. It's possible that the native representations
// of the two vtx formats are the same, but we have no way to easily check that
// now.
VertexManager::Flush();
g_nativeVertexFmt = m_NativeFmt;
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
}
// Also move the Set() here?
}
g_nativeVertexFmt = m_NativeFmt;
if (bpmem.genMode.cullmode == 3 && primitive < 5)
{
// if cull mode is none, ignore triangles and quads
@ -603,6 +676,27 @@ void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int co
return;
}
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
// Load position and texcoord scale factors.
m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
m_VtxAttr.texCoord[1].Frac = g_VtxAttr[vtx_attr_group].g1.Tex1Frac;
m_VtxAttr.texCoord[2].Frac = g_VtxAttr[vtx_attr_group].g1.Tex2Frac;
m_VtxAttr.texCoord[3].Frac = g_VtxAttr[vtx_attr_group].g1.Tex3Frac;
m_VtxAttr.texCoord[4].Frac = g_VtxAttr[vtx_attr_group].g2.Tex4Frac;
m_VtxAttr.texCoord[5].Frac = g_VtxAttr[vtx_attr_group].g2.Tex5Frac;
m_VtxAttr.texCoord[6].Frac = g_VtxAttr[vtx_attr_group].g2.Tex6Frac;
m_VtxAttr.texCoord[7].Frac = g_VtxAttr[vtx_attr_group].g2.Tex7Frac;
pVtxAttr = &m_VtxAttr;
posScale = fractionTable[m_VtxAttr.PosFrac];
if (m_NativeFmt->m_components & VB_HAS_UVALL)
for (int i = 0; i < 8; i++)
tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements;
if(VertexManager::GetRemainingSize() < native_stride * count)
VertexManager::Flush();
memcpy_gc(VertexManager::s_pCurBufferPointer, Data, native_stride * count);

View File

@ -152,6 +152,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<AdditionalIncludeDirectories>..\Common\Src;..\Core\Src;..\..\..\Externals\SOIL;..\..\..\Externals\CLRun\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>