diff --git a/Source/Core/Common/Src/StdThread.h b/Source/Core/Common/Src/StdThread.h
index 41b856f831..4f3bd68c79 100644
--- a/Source/Core/Common/Src/StdThread.h
+++ b/Source/Core/Common/Src/StdThread.h
@@ -278,9 +278,9 @@ namespace this_thread
 inline void yield()
 {
 #ifdef _WIN32
-    Sleep(1);
+    Sleep(0);
 #else
-    usleep(1000 * 1);
+    sleep(0);
 #endif
 }
diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
index 5f75242496..99cf31106c 100644
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@@ -27,7 +27,7 @@
 #include "LookUpTables.h"
 
 #include <cmath>
-
+#include <omp.h>
 #if _M_SSE >= 0x401
 #include <smmintrin.h>
 #include <emmintrin.h>
@@ -685,33 +685,48 @@ PC_TexFormat GetPC_TexFormat(int texformat, int tlutfmt)
 //need to add DXT support too
 PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
+    // Don't use multithreading for small textures
+    if(width > 127 && height > 127)
+    {
+        // don't spawn too many threads, they will kill the rest of the emu :)
+        omp_set_num_threads((cpu_info.num_cores + 2) / 3);
+    }
+    else
+    {
+        omp_set_num_threads(1);
+    }
+    int Wsteps4 = (width + 3) / 4;
+    int Wsteps8 = (width + 7) / 8;
     switch (texformat)
     {
     case GX_TF_C4:
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
+                        decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
         }
         else
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
+                        decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
         }
         return GetPCFormatFromTLUTFormat(tlutfmt);
     case GX_TF_I4:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
                         for (int ix = 0; ix < 4; ix++)
                         {
-                            int val = src[ix];
+                            int val = src[4 * xStep + ix];
                             dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4);
                             dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF);
                         }
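Annotation: `#pragma omp parallel for` is only safe here because the patch also rewrites every loop to compute its source offset from the loop indices (`yStep`/`xStep`) instead of mutating the shared `src` cursor; with the old `src += 4`, each iteration depended on all previous ones. The thread-count heuristic `(cpu_info.num_cores + 2) / 3` is a ceiling division, roughly one decoder thread per three cores (2 threads on a quad core, 1 on a dual core), leaving headroom for the CPU/DSP/GPU threads. A minimal stand-alone sketch of the same transformation, with illustrative names rather than the patch's helpers, and assuming 8-pixel-aligned dimensions as the real decoder does:

```cpp
#include <cstdint>

// Sketch: turn a running source cursor into pure index math so OpenMP
// can run tile rows in parallel. Tiles are 8x8 pixels, 4 source bytes
// per tile row, two 4-bit pixels per byte (the I4/C4 shape).
void decode_tiles(uint8_t* dst, const uint8_t* src, int width, int height)
{
    const int Wsteps8 = (width + 7) / 8;   // tiles per row of tiles
    #pragma omp parallel for
    for (int y = 0; y < height; y += 8)
        for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
            for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
            {
                const uint8_t* line = src + 4 * xStep;   // was: src += 4
                for (int ix = 0; ix < 4; ix++)
                {
                    dst[(y + iy) * width + x + ix * 2]     = line[ix] >> 4;
                    dst[(y + iy) * width + x + ix * 2 + 1] = line[ix] & 0xF;
                }
            }
}
```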
@@ -719,20 +734,24 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         return PC_TEX_FMT_I4_AS_I8;
     case GX_TF_I8: // speed critical
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        memcpy(dst + (y + iy)*width+x, src, 8);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                    {
+                        ((u64*)(dst + (y + iy)*width+x))[0] = ((u64*)(src + 8 * xStep))[0];
+                    }
         }
         return PC_TEX_FMT_I8;
     case GX_TF_C8:
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
         }
         else
         {
@@ -740,36 +759,40 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 #if _M_SSE >= 0x301
             if (cpu_info.bSSSE3)
             {
+                #pragma omp parallel for
                 for (int y = 0; y < height; y += 4)
-                    for (int x = 0; x < width; x += 8)
-                        for (int iy = 0; iy < 4; iy++, src += 8)
-                            decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+                    for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                            decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
             }
             else
 #endif
             {
+                #pragma omp parallel for
                 for (int y = 0; y < height; y += 4)
-                    for (int x = 0; x < width; x += 8)
-                        for (int iy = 0; iy < 4; iy++, src += 8)
-                            decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+                    for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                            decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
             }
         }
         return GetPCFormatFromTLUTFormat(tlutfmt);
     case GX_TF_IA4:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesIA4((u16*)dst + (y + iy) * width + x, src);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep);
         }
         return PC_TEX_FMT_IA4_AS_IA8;
     case GX_TF_IA8:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++)
                     {
                         u16 *ptr = (u16 *)dst + (y + iy) * width + x;
-                        u16 *s = (u16 *)src;
+                        u16 *s = (u16 *)(src + 8 * xStep);
                         for(int j = 0; j < 4; j++)
                             *ptr++ = Common::swap16(*s++);
                     }
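Annotation: GameCube/Wii texture data is big-endian, so every 16-bit texel (IA8, RGB565, C14X2) gets byte-swapped on load; `Common::swap16` behaves like this hedged, portable sketch:

```cpp
#include <cstdint>

// Byte-swap one 16-bit big-endian texel into host (little-endian) order.
static inline uint16_t swap16(uint16_t v)
{
    return (uint16_t)((v >> 8) | (v << 8));
}

// Usage mirroring the IA8/RGB565 inner loop: 4 texels per 8-byte tile row.
static inline void swap_row(uint16_t* dst, const uint16_t* src)
{
    for (int j = 0; j < 4; j++)
        dst[j] = swap16(src[j]);
}
```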
@@ -780,27 +803,30 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
         }
         else
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
         }
         return GetPCFormatFromTLUTFormat(tlutfmt);
     case GX_TF_RGB565:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                     {
                         u16 *ptr = (u16 *)dst + (y + iy) * width + x;
-                        u16 *s = (u16 *)src;
+                        u16 *s = (u16 *)(src + 8 * xStep);
                         for(int j = 0; j < 4; j++)
                             *ptr++ = Common::swap16(*s++);
                     }
@@ -808,11 +834,12 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         return PC_TEX_FMT_RGB565;
     case GX_TF_RGB5A3:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                         //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
-                        decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src);
+                        decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep));
         }
         return PC_TEX_FMT_BGRA32;
     case GX_TF_RGBA8: // speed critical
@@ -821,6 +848,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 #if _M_SSE >= 0x301
         if (cpu_info.bSSSE3)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4) {
                 __m128i* p = (__m128i*)(src + y * width * 4);
                 for (int x = 0; x < width; x += 4) {
@@ -862,12 +890,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 #endif
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
                 {
+                    const u8* src2 = src + 64 * yStep;
                     for (int iy = 0; iy < 4; iy++)
-                        decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
-                    src += 64;
+                        decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16);
                 }
         }
     }
@@ -894,18 +923,19 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         }
         return PC_TEX_FMT_DXT1;
 #else
+        #pragma omp parallel for
         for (int y = 0; y < height; y += 8)
         {
-            for (int x = 0; x < width; x += 8)
+            for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
             {
-                decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width);
-                src += sizeof(DXTBlock);
-                decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width);
-                src += sizeof(DXTBlock);
-                decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width);
-                src += sizeof(DXTBlock);
-                decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width);
-                src += sizeof(DXTBlock);
+                const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep;
+                decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width);
+                src2 += sizeof(DXTBlock);
+                decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width);
+                src2 += sizeof(DXTBlock);
+                decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width);
+                src2 += sizeof(DXTBlock);
+                decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width);
             }
         }
 #endif
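Annotation: an 8x8 CMPR tile stores four 8-byte DXT blocks in the order top-left, top-right, bottom-left, bottom-right, which is why the rewritten loop seeds a private `src2` at `4 * sizeof(DXTBlock) * yStep` and then steps one block at a time. A sketch of that offset math (block layout assumed from how the decoder uses it):

```cpp
#include <cstddef>
#include <cstdint>

struct DXTBlock            // 8 bytes, as the decoder's pointer math implies
{
    uint16_t color1, color2;   // two RGB565 endpoints
    uint8_t  lines[4];         // 2-bit selectors, one byte per row
};

// With yStep numbering 8x8 tiles in row-major order, each tile's first
// DXT block sits at a fixed, iteration-independent offset (32 bytes/tile).
static const uint8_t* cmpr_tile_base(const uint8_t* src, int yStep)
{
    return src + 4 * sizeof(DXTBlock) * yStep;
}
```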
@@ -929,30 +959,45 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 
 PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
+    if(width > 127 && height > 127)
+    {
+        // don't spawn too many threads, they will kill the rest of the emu :)
+        omp_set_num_threads((cpu_info.num_cores + 2) / 3);
+    }
+    else
+    {
+        omp_set_num_threads(1);
+    }
+    int Wsteps4 = (width + 3) / 4;
+    int Wsteps8 = (width + 7) / 8;
     switch (texformat)
     {
     case GX_TF_C4:
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
+                    for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
+                        decodebytesC4_5A3_To_rgba32(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
         }
         else if(tlutfmt == 0)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
+                    for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
+                        decodebytesC4IA8_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
+
         }
         else
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
+                    for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
+                        decodebytesC4RGB565_To_RGBA(dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
         }
         break;
     case GX_TF_I4:
@@ -967,11 +1012,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
             const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
             const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
             const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy += 2, src += 8)
+                for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++)
                     {
-                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
+                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
                         // We want the hi 4 bits of each 8-bit word replicated to 32-bit words:
                         // (00000000 00000000 HhGgFfEe DdCcBbAa) -> (00000000 00000000 HHGGFFEE DDCCBBAA)
                         const __m128i i1 = _mm_and_si128(r0, kMask_xf0);
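Annotation: both the scalar and SSE I4 paths widen 4-bit texels by nibble replication, so the 0..15 range maps exactly onto 0..255 (the `kMask_xf0` shift dance above computes the same thing lane-wise). Scalar equivalent, as a sketch:

```cpp
#include <cstdint>

// Expand a 4-bit value to 8 bits with correct endpoint scaling:
// 0x0 -> 0x00 and 0xF -> 0xFF (equivalent to v * 255 / 15 = v * 17).
static inline uint8_t convert4To8(uint8_t v)
{
    return (uint8_t)((v << 4) | v);
}

// One I4 byte carries two horizontally adjacent pixels.
static inline void expand_i4_byte(uint8_t byte, uint8_t* out2)
{
    out2[0] = convert4To8(byte >> 4);    // left pixel, high nibble
    out2[1] = convert4To8(byte & 0xF);   // right pixel, low nibble
}
```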
@@ -1001,11 +1047,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // JSD optimized with SSE2 intrinsics.
         // Produces a ~76% speed improvement over reference C implementation.
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy += 2, src += 8)
+                for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
                     {
-                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
+                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
                         // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
                         const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@@ -1086,17 +1133,17 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // Produces a ~10% speed improvement over SSE2 implementation
         if (cpu_info.bSSSE3)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                {
-                    for (int iy = 0; iy < 4; ++iy, src+=8)
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
                     {
                         const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
                         const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
                         __m128i *quaddst, r, rgba0, rgba1;
                         // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                        r = _mm_loadl_epi64((const __m128i *)src);
+                        r = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
                         // Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
                         rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
                         rgba1 = _mm_shuffle_epi8(r, mask7654); // (hhhh gggg ffff eeee)
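Annotation: the SSSE3 path turns four I8 texels into four grey RGBA words with a single `pshufb`; the mask simply names, per destination byte, which source byte to copy. A hedged sketch of the same shuffle:

```cpp
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8

// Broadcast each of the low 4 bytes of 'r' into a full 32-bit lane,
// i.e. (.... .... .... dcba) -> (dddd cccc bbbb aaaa).
static inline __m128i replicate_bytes_4x(__m128i r)
{
    const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2,
                                          1, 1, 1, 1, 0, 0, 0, 0);
    return _mm_shuffle_epi8(r, mask3210);
}
```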
@@ -1105,17 +1152,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                         _mm_storeu_si128(quaddst, rgba0);
                         _mm_storeu_si128(quaddst+1, rgba1);
                     }
-            }
+        }
         else
 #endif
         // JSD optimized with SSE2 intrinsics.
         // Produces an ~86% speed improvement over reference C implementation.
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
                 {
                     // Each loop iteration processes 4 rows from 4 64-bit reads.
-
+                    const u8* src2 = src + 32 * yStep;
                     // TODO: is it more efficient to group the loads together sequentially and also the stores at the end?
                     // _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly WORSE, so I
                     // went with _mm_stores. Perhaps there is some edge case here creating the terrible performance or we're
@@ -1123,7 +1171,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     __m128i *quaddst;
 
                     // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                    const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
+                    const __m128i r0 = _mm_loadl_epi64((const __m128i *)src2);
                     // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
                     const __m128i r1 = _mm_unpacklo_epi8(r0, r0);
@@ -1139,8 +1187,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     _mm_storeu_si128(quaddst+1, rgba1);
 
                     // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                    src += 8;
-                    const __m128i r2 = _mm_loadl_epi64((const __m128i *)src);
+                    src2 += 8;
+                    const __m128i r2 = _mm_loadl_epi64((const __m128i *)src2);
                     // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
                     const __m128i r3 = _mm_unpacklo_epi8(r2, r2);
@@ -1156,8 +1204,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     _mm_storeu_si128(quaddst+1, rgba3);
 
                     // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                    src += 8;
-                    const __m128i r4 = _mm_loadl_epi64((const __m128i *)src);
+                    src2 += 8;
+                    const __m128i r4 = _mm_loadl_epi64((const __m128i *)src2);
                     // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
                     const __m128i r5 = _mm_unpacklo_epi8(r4, r4);
@@ -1173,8 +1221,8 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     _mm_storeu_si128(quaddst+1, rgba5);
 
                     // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                    src += 8;
-                    const __m128i r6 = _mm_loadl_epi64((const __m128i *)src);
+                    src2 += 8;
+                    const __m128i r6 = _mm_loadl_epi64((const __m128i *)src2);
                     // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc bbaa)
                     const __m128i r7 = _mm_unpacklo_epi8(r6, r6);
@@ -1188,8 +1236,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     _mm_storeu_si128(quaddst, rgba6);
                     // Store (hhhh gggg ffff eeee) out:
                     _mm_storeu_si128(quaddst+1, rgba7);
-
-                    src += 8;
+                }
             }
 #if 0
             // Reference C implementation.
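Annotation: the SSE2 fallback gets the same byte replication without `pshufb` by unpacking a register with itself twice: an 8-bit self-unpack doubles each byte, and a following 16-bit self-unpack doubles each pair. Sketch:

```cpp
#include <emmintrin.h>  // SSE2

// Replicate each of the low 4 bytes of 'r0' into a 32-bit lane using
// only SSE2 instructions.
static inline __m128i replicate_bytes_4x_sse2(__m128i r0)
{
    const __m128i r1 = _mm_unpacklo_epi8(r0, r0);    // a a b b c c d d ...
    return _mm_unpacklo_epi16(r1, r1);               // a a a a b b b b ...
}
```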
@@ -1218,35 +1265,38 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC8_5A3_To_RGBA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
         }
         else if(tlutfmt == 0)
         {
-
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC8IA8_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
         }
         else
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC8RGB565_To_RGBA(dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
         }
         break;
     case GX_TF_IA4:
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesIA4RGBA(dst + (y + iy) * width + x, src);
+                for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesIA4RGBA(dst + (y + iy) * width + x, src + 8 * xStep);
         }
         break;
     case GX_TF_IA8:
@@ -1256,13 +1306,14 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // Produces an ~50% speed improvement over SSE2 implementation.
         if (cpu_info.bSSSE3)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                     {
                         const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
                         // Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
+                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
                         // Shuffle to (ghhh efff cddd abbb)
                         const __m128i r1 = _mm_shuffle_epi8(r0, mask);
                         _mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 );
@@ -1276,15 +1327,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
             const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
             const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
             const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
-
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                     {
                         // Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
 
                         // Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
-                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
+                        const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 8 * xStep));
                         // Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000 0h0f 0d0b)
                         // This gets us only the I components.
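Annotation: the IA8 masks above assemble an "AIII" word per texel: alpha is the first stored byte, intensity the second, and intensity fans out to R, G and B. A scalar sketch of one texel, with the byte order the shuffle mask implies:

```cpp
#include <cstdint>

// One GX IA8 texel is stored big-endian as [A][I]; expand it to a
// little-endian RGBA8 word with R = G = B = I, matching the vector path.
static inline uint32_t decode_ia8(const uint8_t* texel)
{
    const uint8_t a = texel[0];
    const uint8_t i = texel[1];
    return (uint32_t)i | ((uint32_t)i << 8) | ((uint32_t)i << 16)
         | ((uint32_t)a << 24);
}
```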
@@ -1340,24 +1391,27 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         if (tlutfmt == 2)
         {
             // Special decoding is required for TLUT format 5A3
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC14X2_5A3_To_BGRA32(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
         }
         else if (tlutfmt == 0)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC14X2IA8_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
         }
         else
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)src, tlutaddr);
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                        decodebytesC14X2rgb565_To_RGBA(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
         }
         break;
     case GX_TF_RGB565:
@@ -1369,12 +1423,12 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
             const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
             const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
             const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
-
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                     {
-                        __m128i *dxtsrc = (__m128i *)src;
+                        __m128i *dxtsrc = (__m128i *)(src + 8 * xStep);
                         // Load 4x 16-bit colors: (0000 0000 hgfe dcba)
                         // where hg, fe, ba, and dc are 16-bit colors in big-endian order
                         const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
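Annotation: RGB565 expansion replicates each channel's top bits into the freed low bits so that full-scale 5- and 6-bit values become exactly 0xFF; it is the same `(v << 3) | (v >> 2)` pattern the commented-out scalar lines in the RGB5A3 block below spell out. Sketch:

```cpp
#include <cstdint>

// Expand RGB565 channels to 8 bits by bit replication; 'c' is assumed
// to be already byte-swapped into host order.
static inline uint32_t decode_rgb565(uint16_t c)
{
    const uint32_t r5 = (c >> 11) & 0x1F;
    const uint32_t g6 = (c >> 5)  & 0x3F;
    const uint32_t b5 =  c        & 0x1F;
    const uint32_t r = (r5 << 3) | (r5 >> 2);
    const uint32_t g = (g6 << 2) | (g6 >> 4);
    const uint32_t b = (b5 << 3) | (b5 >> 2);
    return r | (g << 8) | (b << 16) | 0xFF000000u;  // opaque alpha
}
```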
@@ -1458,103 +1512,105 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // Produces a ~10% speed improvement over SSE2 implementation
         if (cpu_info.bSSSE3)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                    {
-                        u32 *newdst = dst+(y+iy)*width+x;
-                        const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
-                        const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)src),mask);
-                        int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
-                        if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
-                        {
-                            // Swizzle bits: 00012345 -> 12345123
-
-                            //r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2);
-                            const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
-                            const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
-
-                            //g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
-                            const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
-                            const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
-
-                            //b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
-                            const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
-                            const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
-
-                            //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
-                            const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
-                                _mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
-                            _mm_storeu_si128( (__m128i*)newdst, final );
-                        }
-                        else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443.
-                        {
-                            // Swizzle bits: 00001234 -> 12341234
-
-                            //r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf);
-                            const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
-                            const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
-
-                            //g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
-                            const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
-                            const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
-
-                            //b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
-                            const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
-                            const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
-                            //a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
-                            const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
-                            const __m128i aV = _mm_or_si128(
-                                _mm_slli_epi16(tmpaV, 5),
-                                _mm_or_si128(
-                                    _mm_slli_epi16(tmpaV, 2),
-                                    _mm_srli_epi16(tmpaV, 1)
-                                )
-                            );
-
-                            //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
-                            const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
-                                _mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
-                            _mm_storeu_si128( (__m128i*)newdst, final );
-                        }
-                        else
-                        {
-                            // TODO: Vectorise (Either 4-way branch or do both and select is better than this)
-                            u32 *vals = (u32*) &valV;
-                            int r,g,b,a;
-                            for (int i=0; i < 4; ++i)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
+                    {
+                        u32 *newdst = dst+(y+iy)*width+x;
+                        const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1);
+                        const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)),mask);
+                        int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
+                        if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
                         {
-                                if (vals[i] & 0x8000)
-                                {
-                                    // Swizzle bits: 00012345 -> 12345123
-                                    r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2);
-                                    g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2);
-                                    b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2);
-                                    a = 0xFF;
-                                }
-                                else
-                                {
-                                    a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1);
-                                    // Swizzle bits: 00001234 -> 12341234
-                                    r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf);
-                                    g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf);
-                                    b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf);
-                                }
-                                newdst[i] = r | (g << 8) | (b << 16) | (a << 24);
+                            // Swizzle bits: 00012345 -> 12345123
+
+                            //r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2);
+                            const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f);
+                            const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) );
+
+                            //g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2);
+                            const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f);
+                            const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) );
+
+                            //b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2);
+                            const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f);
+                            const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) );
+
+                            //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
+                            const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
+                                _mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00));
+                            _mm_storeu_si128( (__m128i*)newdst, final );
+                        }
+                        else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443.
+                        {
+                            // Swizzle bits: 00001234 -> 12341234
+
+                            //r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf);
+                            const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f);
+                            const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV );
+
+                            //g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf);
+                            const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f);
+                            const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV );
+
+                            //b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
+                            const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
+                            const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV );
+                            //a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
+                            const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
+                            const __m128i aV = _mm_or_si128(
+                                _mm_slli_epi16(tmpaV, 5),
+                                _mm_or_si128(
+                                    _mm_slli_epi16(tmpaV, 2),
+                                    _mm_srli_epi16(tmpaV, 1)
+                                )
+                            );
+
+                            //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24);
+                            const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)),
+                                _mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24)));
+                            _mm_storeu_si128( (__m128i*)newdst, final );
+                        }
+                        else
+                        {
+                            // TODO: Vectorise (Either 4-way branch or do both and select is better than this)
+                            u32 *vals = (u32*) &valV;
+                            int r,g,b,a;
+                            for (int i=0; i < 4; ++i)
+                            {
+                                if (vals[i] & 0x8000)
+                                {
+                                    // Swizzle bits: 00012345 -> 12345123
+                                    r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2);
+                                    g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2);
+                                    b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2);
+                                    a = 0xFF;
+                                }
+                                else
+                                {
+                                    a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1);
+                                    // Swizzle bits: 00001234 -> 12341234
+                                    r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf);
+                                    g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf);
+                                    b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf);
+                                }
+                                newdst[i] = r | (g << 8) | (b << 16) | (a << 24);
+                            }
+                        }
+                    }
             }
         else
 #endif
         // JSD optimized with SSE2 intrinsics (2 in 4 cases)
         // Produces a ~25% speed improvement over reference C implementation.
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
+                    for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
                     {
                         u32 *newdst = dst+(y+iy)*width+x;
-                        const u16 *newsrc = (const u16*)src;
+                        const u16 *newsrc = (const u16*)(src + 8 * xStep);
 
                         // TODO: weak point
                         const u16 val0 = Common::swap16(newsrc[0]);
@@ -1669,14 +1725,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // Produces a ~30% speed improvement over SSE2 implementation
         if (cpu_info.bSSSE3)
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4, src += 64)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
                 {
+                    const u8* src2 = src + 64 * yStep;
                     const __m128i mask0312 = _mm_set_epi8(12,15,13,14,8,11,9,10,4,7,5,6,0,3,1,2);
-                    const __m128i ar0 = _mm_loadu_si128((__m128i*)src);
-                    const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1);
-                    const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2);
-                    const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3);
+                    const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
+                    const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
+                    const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
+                    const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
 
                     const __m128i rgba00 = _mm_shuffle_epi8(_mm_unpacklo_epi8(ar0,gb0),mask0312);
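Annotation: a GX RGBA8 4x4 tile is planar: 32 bytes of big-endian A,R pairs followed by 32 bytes of G,B pairs, which is why `src2` jumps 64 bytes per tile and the loads come in ar/gb pairs. Scalar deswizzle sketch for one texel:

```cpp
#include <cstdint>

// 't' indexes the 16 texels of a 4x4 RGBA8 tile in row-major order.
static inline uint32_t decode_rgba8_texel(const uint8_t* tile, int t)
{
    const uint8_t a = tile[2 * t + 0];        // AR plane, first 32 bytes
    const uint8_t r = tile[2 * t + 1];
    const uint8_t g = tile[32 + 2 * t + 0];   // GB plane, second 32 bytes
    const uint8_t b = tile[32 + 2 * t + 1];
    return (uint32_t)r | ((uint32_t)g << 8) | ((uint32_t)b << 16)
         | ((uint32_t)a << 24);
}
```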
@@ -1698,8 +1756,9 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // JSD optimized with SSE2 intrinsics
         // Produces a ~68% speed improvement over reference C implementation.
         {
+            #pragma omp parallel for
             for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4, src += 64)
+                for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
                 {
                     // Input is divided up into 16-bit words. The texels are split up into AR and GB components where all
                     // AR components come grouped up first in 32 bytes followed by the GB components in 32 bytes. We are
@@ -1718,15 +1777,15 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     // (RGBA7 RGBA6 RGBA5 RGBA4)
                     // (RGBAb RGBAa RGBA9 RGBA8)
                     // (RGBAf RGBAe RGBAd RGBAc)
-
+                    const u8* src2 = src + 64 * yStep;
                     // Loads the 1st half of AR components ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
-                    const __m128i ar0 = _mm_loadu_si128((__m128i*)src);
+                    const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
                     // Loads the 2nd half of AR components ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
-                    const __m128i ar1 = _mm_loadu_si128((__m128i*)src+1);
+                    const __m128i ar1 = _mm_loadu_si128((__m128i*)src2+1);
                     // Loads the 1st half of GB components ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
-                    const __m128i gb0 = _mm_loadu_si128((__m128i*)src+2);
+                    const __m128i gb0 = _mm_loadu_si128((__m128i*)src2+2);
                     // Loads the 2nd half of GB components ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
-                    const __m128i gb1 = _mm_loadu_si128((__m128i*)src+3);
+                    const __m128i gb1 = _mm_loadu_si128((__m128i*)src2+3);
                     __m128i rgba00, rgba01, rgba10, rgba11;
                     const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
                     const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
@@ -1790,7 +1849,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     _mm_storeu_si128(dst128, rgba10);
                     dst128 = (__m128i*)( dst + (y + 3) * width + x );
                     _mm_storeu_si128(dst128, rgba11);
-            }
+                }
         }
 #if 0
         // Reference C implementation.
@@ -1811,15 +1870,16 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
         // Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
         // The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
         // faster than both.
+        #pragma omp parallel for
         for (int y = 0; y < height; y += 8)
         {
-            for (int x = 0; x < width; x += 8)
+            for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
             {
                 // We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
                 // This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit.
                 // Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
                 // at this level, so we do.
-                for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2)
+                for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
                 {
                     // JSD NOTE: You may see many strange patterns of behavior in the below code, but they
                     // are for performance reasons. Sometimes, calculating what should be obvious hard-coded
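Annotation: each DXT block decodes through a 4-entry palette derived from its two RGB565 endpoints, and CMPR follows the classic DXT1 rule. The sketch below reuses `decode_rgb565` from the earlier sketch; the exact blend rounding inside `decodeDXTBlock` may differ:

```cpp
#include <cstdint>

uint32_t decode_rgb565(uint16_t c);   // as in the earlier RGB565 sketch

// Per-channel weighted blend of two RGBA words.
static uint32_t blend(uint32_t x, uint32_t y, int wx, int wy)
{
    uint32_t out = 0;
    for (int shift = 0; shift < 32; shift += 8)
    {
        const uint32_t cx = (x >> shift) & 0xFF;
        const uint32_t cy = (y >> shift) & 0xFF;
        out |= ((cx * wx + cy * wy) / (wx + wy)) << shift;
    }
    return out;
}

// Build one DXT1/CMPR block's palette: c1 > c2 selects two interpolants,
// otherwise an average plus a transparent-black entry.
static void dxt1_palette(uint16_t c1, uint16_t c2, uint32_t pal[4])
{
    pal[0] = decode_rgb565(c1);
    pal[1] = decode_rgb565(c2);
    if (c1 > c2)
    {
        pal[2] = blend(pal[0], pal[1], 2, 1);   // 2/3 c1 + 1/3 c2
        pal[3] = blend(pal[0], pal[1], 1, 2);   // 1/3 c1 + 2/3 c2
    }
    else
    {
        pal[2] = blend(pal[0], pal[1], 1, 1);   // average
        pal[3] = 0;                             // transparent black
    }
}
```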
@@ -1833,7 +1893,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
 
                     // Load 128 bits, i.e. two DXTBlocks (64-bits each)
-                    const __m128i dxt = _mm_loadu_si128((__m128i *)src);
+                    const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 2 * xStep));
 
                     // Copy the 2-bit indices from each DXT block:
                     GC_ALIGNED16( u32 dxttmp[4] );
@@ -2032,7 +2092,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
                     assert( memcmp(&(tmp0[3]), &dst32[(width * 3)], 16) == 0 );
                     assert( memcmp(&(tmp1[3]), &dst32[(width * 3) + 4], 16) == 0 );
 #endif
-            }
+                }
             }
         }
 #if 0
@@ -2450,8 +2510,8 @@ void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth
 const char* texfmt[] = {
     // pixel
     "I4", "I8", "IA4", "IA8",
-    "RGB565", "RGB5A3", "RGBA8", "C4",
-    "C8", "C14X2", "0x0A", "0x0B",
+    "RGB565", "RGB5A3", "RGBA8", "0x07",
+    "C4", "C8", "C14X2", "0x0B",
     "0x0C", "0x0D", "CMPR", "0x0F",
     // Z-buffer
     "0x10", "Z8", "0x12", "Z16",
diff --git a/Source/Core/VideoCommon/Src/VertexLoader.cpp b/Source/Core/VideoCommon/Src/VertexLoader.cpp
index ca2308627a..aa7ff2ec4c 100644
--- a/Source/Core/VideoCommon/Src/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader.cpp
@@ -514,27 +514,27 @@ void VertexLoader::WriteSetVariable(int bits, void *address, OpArg value)
 
 void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
 {
-    if(count == 0)
-        return;
     m_numLoadedVertices += count;
-    INCSTAT(stats.thisFrame.numDrawCalls);
+
     // Flush if our vertex format is different from the currently set.
-    if (g_nativeVertexFmt != m_NativeFmt)
+    if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
     {
         // We really must flush here. It's possible that the native representations
         // of the two vtx formats are the same, but we have no way to easily check that
         // now.
         VertexManager::Flush();
-        g_nativeVertexFmt = m_NativeFmt;
-        m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
-    }
-
+        // Also move the Set() here?
+    }
+    g_nativeVertexFmt = m_NativeFmt;
+
     if (bpmem.genMode.cullmode == 3 && primitive < 5)
     {
         // if cull mode is none, ignore triangles and quads
         DataSkip(count * m_VertexSize);
         return;
-    }
+    }
+
+    m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
 
     // Load position and texcoord scale factors.
     m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
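Annotation: RunVertices now records the native vertex format unconditionally and only flushes when a different format is already bound; the added NULL check keeps the very first draw from flushing an empty pipeline, and `EnableComponents` is deferred past the cull-mode early-out so skipped draws do no redundant state work. The pattern, as a sketch with stand-in names rather than the actual Dolphin interfaces:

```cpp
struct VertexFormat;          // stands in for NativeVertexFormat
void FlushPipeline();         // stands in for VertexManager::Flush()

// Flush only when a *different* format is already bound; a null current
// format means nothing has been drawn yet, so there is nothing to flush.
void bind_vertex_format(VertexFormat*& current, VertexFormat* next)
{
    if (current != nullptr && current != next)
        FlushPipeline();      // pending draws still use 'current'
    current = next;           // recorded unconditionally, as in the patch
}
```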
@@ -554,19 +554,87 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
             tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
     for (int i = 0; i < 2; i++)
         colElements[i] = m_VtxAttr.color[i].Elements;
-
-    if(VertexManager::GetRemainingSize() < native_stride * count)
-        VertexManager::Flush();
-
-    VertexManager::AddVertices(primitive,count);
+
+    // if strips or fans, make sure all vertices can fit in the buffer, otherwise flush
+    int granularity = 1;
+    switch (primitive) {
+        case 3: // strip .. hm, weird
+        case 4: // fan
+            if (VertexManager::GetRemainingSize() < 3 * native_stride)
+                VertexManager::Flush();
+            break;
+        case 6: // line strip
+            if (VertexManager::GetRemainingSize() < 2 * native_stride)
+                VertexManager::Flush();
+            break;
+        case 0: granularity = 4; break; // quads
+        case 2: granularity = 3; break; // tris
+        case 5: granularity = 2; break; // lines
+    }
+
+    int startv = 0, extraverts = 0;
+    int v = 0;
+
+    //int remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
+    while (v < count)
+    {
+        int remainingVerts = VertexManager::GetRemainingSize() / native_stride;
+        //if (remainingVerts2 - v + startv < remainingVerts)
+        //    remainingVerts = remainingVerts2 - v + startv;
+        if (remainingVerts < granularity) {
+            INCSTAT(stats.thisFrame.numBufferSplits);
+            // This buffer is full, so break the current primitive and flush to switch to the next buffer.
+            u8* plastptr = VertexManager::s_pCurBufferPointer;
+            if (v - startv > 0)
+                VertexManager::AddVertices(primitive, v - startv + extraverts);
+            VertexManager::Flush();
+            //remainingVerts2 = VertexManager::GetRemainingVertices(primitive);
+            // Why does this need to be so complicated?
+            switch (primitive) {
+                case 3: // triangle strip, copy last two vertices
+                    // a little trick since we have to keep track of signs
+                    if (v & 1) {
+                        memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-2*native_stride, native_stride);
+                        memcpy_gc(VertexManager::s_pCurBufferPointer+native_stride, plastptr-native_stride*2, 2*native_stride);
+                        VertexManager::s_pCurBufferPointer += native_stride*3;
+                        extraverts = 3;
+                    }
+                    else {
+                        memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*2, native_stride*2);
+                        VertexManager::s_pCurBufferPointer += native_stride*2;
+                        extraverts = 2;
+                    }
+                    break;
+                case 4: // tri fan, copy first and last vert
+                    memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*(v-startv+extraverts), native_stride);
+                    VertexManager::s_pCurBufferPointer += native_stride;
+                    memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
+                    VertexManager::s_pCurBufferPointer += native_stride;
+                    extraverts = 2;
+                    break;
+                case 6: // line strip
+                    memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
+                    VertexManager::s_pCurBufferPointer += native_stride;
+                    extraverts = 1;
+                    break;
+                default:
+                    extraverts = 0;
+                    break;
+            }
+            startv = v;
+        }
+        int remainingPrims = remainingVerts / granularity;
+        remainingVerts = remainingPrims * granularity;
+        if (count - v < remainingVerts)
+            remainingVerts = count - v;
 
 #ifdef USE_JIT
-    if (count > 0) {
-        loop_counter = count;
+        if (remainingVerts > 0) {
+            loop_counter = remainingVerts;
         ((void (*)())(void*)m_compiledCode)();
     }
 #else
-    for (int s = 0; s < count; s++)
+        for (int s = 0; s < remainingVerts; s++)
     {
         tcIndex = 0;
         colIndex = 0;
@@ -575,7 +643,12 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
             m_PipelineStages[i]();
         PRIM_LOG("\n");
     }
-    #endif
+    #endif
+        v += remainingVerts;
+    }
+
+    if (startv < count)
+        VertexManager::AddVertices(primitive, count - startv + extraverts);
 
 }
 
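Annotation: when a strip or fan must be split across buffers, the tail vertices are re-copied into the new buffer so the primitive restarts seamlessly; for triangle strips the `v & 1` parity decides between carrying two or three vertices so the winding order survives the split. A stand-alone sketch of the strip case, mirroring the memcpy dance above:

```cpp
#include <cstring>
#include <cstdint>
#include <cstddef>

// Re-emit trailing strip vertices into a fresh buffer; returns how many
// extra vertices were carried over. 'end' points one past the last
// vertex written to the old buffer, 'v' is the count emitted so far.
static int restart_tri_strip(uint8_t* dst, const uint8_t* end,
                             size_t stride, int v)
{
    if (v & 1)
    {
        // odd break: duplicate vertex v-2 once so the parity (and thus
        // the facing of the next triangle) comes out unchanged
        std::memcpy(dst, end - 2 * stride, stride);
        std::memcpy(dst + stride, end - 2 * stride, 2 * stride);
        return 3;
    }
    std::memcpy(dst, end - 2 * stride, 2 * stride);   // copy v-2 and v-1
    return 2;
}
```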
@@ -584,18 +657,18 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
 
 void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data)
 {
     m_numLoadedVertices += count;
-    INCSTAT(stats.thisFrame.numDrawCalls);
+
     // Flush if our vertex format is different from the currently set.
-    if (g_nativeVertexFmt != m_NativeFmt)
+    if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
     {
         // We really must flush here. It's possible that the native representations
         // of the two vtx formats are the same, but we have no way to easily check that
         // now.
         VertexManager::Flush();
-        g_nativeVertexFmt = m_NativeFmt;
-        m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
-    }
-
+        // Also move the Set() here?
+    }
+    g_nativeVertexFmt = m_NativeFmt;
+
     if (bpmem.genMode.cullmode == 3 && primitive < 5)
     {
         // if cull mode is none, ignore triangles and quads
@@ -603,6 +676,27 @@ void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int co
         return;
     }
 
+    m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
+
+    // Load position and texcoord scale factors.
+    m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
+    m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
+    m_VtxAttr.texCoord[1].Frac = g_VtxAttr[vtx_attr_group].g1.Tex1Frac;
+    m_VtxAttr.texCoord[2].Frac = g_VtxAttr[vtx_attr_group].g1.Tex2Frac;
+    m_VtxAttr.texCoord[3].Frac = g_VtxAttr[vtx_attr_group].g1.Tex3Frac;
+    m_VtxAttr.texCoord[4].Frac = g_VtxAttr[vtx_attr_group].g2.Tex4Frac;
+    m_VtxAttr.texCoord[5].Frac = g_VtxAttr[vtx_attr_group].g2.Tex5Frac;
+    m_VtxAttr.texCoord[6].Frac = g_VtxAttr[vtx_attr_group].g2.Tex6Frac;
+    m_VtxAttr.texCoord[7].Frac = g_VtxAttr[vtx_attr_group].g2.Tex7Frac;
+
+    pVtxAttr = &m_VtxAttr;
+    posScale = fractionTable[m_VtxAttr.PosFrac];
+    if (m_NativeFmt->m_components & VB_HAS_UVALL)
+        for (int i = 0; i < 8; i++)
+            tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
+    for (int i = 0; i < 2; i++)
+        colElements[i] = m_VtxAttr.color[i].Elements;
+
     if(VertexManager::GetRemainingSize() < native_stride * count)
         VertexManager::Flush();
     memcpy_gc(VertexManager::s_pCurBufferPointer, Data, native_stride * count);
diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj
index 04c082c1f2..349b430b12 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcxproj
+++ b/Source/Core/VideoCommon/VideoCommon.vcxproj
@@ -152,6 +152,7 @@
       <AdditionalIncludeDirectories>..\Common\Src;..\Core\Src;..\..\..\Externals\SOIL;..\..\..\Externals\CLRun\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <OpenMPSupport>true</OpenMPSupport>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
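Annotation: the project change turns on OpenMP code generation; `OpenMPSupport` is the MSBuild property behind MSVC's `/openmp` switch, and GCC builds need `-fopenmp`, otherwise every `#pragma omp` in the decoder is silently ignored and it stays single-threaded. A quick sanity check, as a hedged stand-alone program:

```cpp
#include <cstdio>

int main()
{
#ifdef _OPENMP
    // _OPENMP is defined by the compiler only when OpenMP is enabled.
    std::printf("OpenMP enabled: _OPENMP = %d\n", _OPENMP);
#else
    std::printf("OpenMP disabled: #pragma omp is being ignored\n");
#endif
    return 0;
}
```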