From 48f51b3ce3da5ce7dc64ea87dc720ec62936f68f Mon Sep 17 00:00:00 2001
From: Gauvain 'GovanifY' Roussel-Tarbouriech <gauvain@govanify.com>
Date: Fri, 26 Mar 2021 23:36:23 +0100
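Subject: [PATCH] gs: purge sse2/sse3

Remove the GSdx code paths that were only compiled when _M_SSE was below
0x401 (SSE2/SSE3/SSSE3 fallbacks) together with the preprocessor guards
that selected them. SSE4.1 becomes the minimum instruction set checked at
runtime and targeted by the non-AVX2 build configurations.
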
---
 plugins/GSdx/GSBlock.h                        | 354 +-----------------
 plugins/GSdx/GSState.cpp                      |   4 -
 plugins/GSdx/GSUtil.cpp                       |  10 -
 plugins/GSdx/GSVector4.h                      |  97 -----
 plugins/GSdx/GSVector4i.h                     | 275 +-------------
 plugins/GSdx/GSdx.vcxproj                     |  19 +-
 .../GSdx/Renderers/Common/GSVertexTrace.cpp   |  61 ---
 .../GSdx/Renderers/DX11/GSRendererDX11.cpp    |   4 -
 .../GSdx/Renderers/OpenGL/GSRendererOGL.cpp   |   4 -
 .../SW/GSDrawScanlineCodeGenerator.cpp        |  36 +-
 .../SW/GSDrawScanlineCodeGenerator.x86.cpp    | 279 ++------------
 plugins/GSdx/Renderers/SW/GSRendererSW.cpp    |  37 --
 plugins/GSdx/stdafx.h                         |  59 +--
 13 files changed, 60 insertions(+), 1179 deletions(-)
diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h
index 1c38fcb529..99b255bcbb 100644
--- a/plugins/GSdx/GSBlock.h
+++ b/plugins/GSdx/GSBlock.h
@@ -158,8 +158,6 @@ public:
 		{
 			GSVector4i v4((int)mask);
 
-			#if _M_SSE >= 0x401
-
 			if(mask == 0xff000000 || mask == 0x00ffffff)
 			{
 				((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, v4);
@@ -169,19 +167,11 @@ public:
 			}
 			else
 			{
-
-			#endif
-
-			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4);
-			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4);
-			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4);
-			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4);
-
-			#if _M_SSE >= 0x401
-
+				((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4);
+				((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4);
+				((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4);
+				((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4);
 			}
-
-			#endif
 		}
 
 		#endif
@@ -524,40 +514,18 @@ public:
 		GSVector4i::store<true>(&d1[0], v1);
 		GSVector4i::store<true>(&d1[1], v3);
 
-		#else
-
-		const GSVector4i* s = (const GSVector4i*)src;
-
-		GSVector4i v0 = s[i * 4 + 0];
-		GSVector4i v1 = s[i * 4 + 1];
-		GSVector4i v2 = s[i * 4 + 2];
-		GSVector4i v3 = s[i * 4 + 3];
-
-		//for(int16 i = 0; i < 8; i++) {v0.i16[i] = i; v1.i16[i] = i + 8; v2.i16[i] = i + 16; v3.i16[i] = i + 24;}
-
-		GSVector4i::sw16(v0, v1, v2, v3);
-		GSVector4i::sw32(v0, v1, v2, v3);
-		GSVector4i::sw16(v0, v2, v1, v3);
-
-		GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
-		GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
-
-		GSVector4i::store<true>(&d0[0], v0);
-		GSVector4i::store<true>(&d0[1], v1);
-		GSVector4i::store<true>(&d1[0], v2);
-		GSVector4i::store<true>(&d1[1], v3);
-
 		#endif
 	}
 
 	template<int i> __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
 	{
+
 		//for(int j = 0; j < 64; j++) ((uint8*)src)[j] = (uint8)j;
 
-		#if 0//_M_SSE >= 0x501
+		#if 0 //_M_SSE >= 0x501
 
 		const GSVector8i* s = (const GSVector8i*)src;
-		
+
 		GSVector8i v0 = s[i * 2 + 0];
 		GSVector8i v1 = s[i * 2 + 1];
 
@@ -578,7 +546,7 @@ public:
 
 		// TODO: not sure if this is worth it, not in this form, there should be a shorter path
 
-		#elif _M_SSE >= 0x301
+		#else
 
 		const GSVector4i* s = (const GSVector4i*)src;
 
@@ -612,36 +580,6 @@ public:
 		GSVector4i::store<true>(&dst[dstpitch * 2], v1);
 		GSVector4i::store<true>(&dst[dstpitch * 3], v2);
 
-		#else
-
-		const GSVector4i* s = (const GSVector4i*)src;
-
-		GSVector4i v0 = s[i * 4 + 0];
-		GSVector4i v1 = s[i * 4 + 1];
-		GSVector4i v2 = s[i * 4 + 2];
-		GSVector4i v3 = s[i * 4 + 3];
-
-		GSVector4i::sw8(v0, v1, v2, v3);
-		GSVector4i::sw16(v0, v1, v2, v3);
-		GSVector4i::sw8(v0, v2, v1, v3);
-		GSVector4i::sw64(v0, v1, v2, v3);
-
-		if((i & 1) == 0)
-		{
-			v2 = v2.yxwz();
-			v3 = v3.yxwz();
-		}
-		else
-		{
-			v0 = v0.yxwz();
-			v1 = v1.yxwz();
-		}
-
-		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
-		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
-		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
-		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
-
 		#endif
 	}
 
@@ -649,8 +587,6 @@ public:
 	{
 		//printf("ReadColumn4\n");
 
-		#if _M_SSE >= 0x301
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0 = s[i * 4 + 0].xzyw();
@@ -680,46 +616,6 @@ public:
 		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
 		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
 		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
-
-		#else
-
-		const GSVector4i* s = (const GSVector4i*)src;
-
-		GSVector4i v0 = s[i * 4 + 0];
-		GSVector4i v1 = s[i * 4 + 1];
-		GSVector4i v2 = s[i * 4 + 2];
-		GSVector4i v3 = s[i * 4 + 3];
-
-		GSVector4i::sw32(v0, v1, v2, v3);
-		GSVector4i::sw32(v0, v1, v2, v3);
-		GSVector4i::sw4(v0, v2, v1, v3);
-		GSVector4i::sw8(v0, v1, v2, v3);
-		GSVector4i::sw16(v0, v2, v1, v3);
-
-		v0 = v0.xzyw();
-		v1 = v1.xzyw();
-		v2 = v2.xzyw();
-		v3 = v3.xzyw();
-
-		GSVector4i::sw64(v0, v1, v2, v3);
-
-		if((i & 1) == 0)
-		{
-			v2 = v2.yxwzlh();
-			v3 = v3.yxwzlh();
-		}
-		else
-		{
-			v0 = v0.yxwzlh();
-			v1 = v1.yxwzlh();
-		}
-
-		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
-		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
-		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
-		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
-
-		#endif
 	}
 
 	static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
@@ -1238,7 +1134,6 @@ public:
 	{
 		for(int j = 0; j < 8; j++, dst += dstpitch)
 		{
-			#if _M_SSE >= 0x401
 
 			const GSVector4i* s = (const GSVector4i*)src;
 
@@ -1246,15 +1141,6 @@ public:
 			GSVector4i v1 = (s[j * 2 + 1] >> 24).gather32_32<>(pal);
 
 			((GSVector4i*)dst)[0] = v0.pu32(v1);
-
-			#else
-
-			for(int i = 0; i < 8; i++)
-			{
-				((uint16*)dst)[i] = (uint16)pal[src[j * 8 + i] >> 24];
-			}
-
-			#endif
 		}
 	}
 
@@ -1273,23 +1159,12 @@ public:
 	{
 		for(int j = 0; j < 8; j++, dst += dstpitch)
 		{
-			#if _M_SSE >= 0x401
-
 			const GSVector4i* s = (const GSVector4i*)src;
 
 			GSVector4i v0 = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
 			GSVector4i v1 = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
 
 			((GSVector4i*)dst)[0] = v0.pu32(v1);
-
-			#else
-
-			for(int i = 0; i < 8; i++)
-			{
-				((uint16*)dst)[i] = (uint16)pal[(src[j * 8 + i] >> 24) & 0xf];
-			}
-
-			#endif
 		}
 	}
 
@@ -1308,23 +1183,12 @@ public:
 	{
 		for(int j = 0; j < 8; j++, dst += dstpitch)
 		{
-			#if _M_SSE >= 0x401
-
 			const GSVector4i* s = (const GSVector4i*)src;
 
 			GSVector4i v0 = (s[j * 2 + 0] >> 28).gather32_32<>(pal);
 			GSVector4i v1 = (s[j * 2 + 1] >> 28).gather32_32<>(pal);
 
 			((GSVector4i*)dst)[0] = v0.pu32(v1);
-
-			#else
-
-			for(int i = 0; i < 8; i++)
-			{
-				((uint16*)dst)[i] = (uint16)pal[src[j * 8 + i] >> 28];
-			}
-
-			#endif
 		}
 	}
 
@@ -1486,32 +1350,6 @@ public:
 			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
 		}
 
-		#else
-
-		GSVector4i v0, v1, v2, v3;
-		GSVector4i mask = GSVector4i::xff000000();
-
-		for(int i = 0; i < 4; i++, src += srcpitch * 2)
-		{
-			v4 = GSVector4i::loadl(&src[srcpitch * 0]);
-			v5 = GSVector4i::loadl(&src[srcpitch * 1]);
-
-			v6 = v4.upl16(v5);
-
-			v4 = v6.upl8(v6);
-			v5 = v6.uph8(v6);
-
-			v0 = v4.upl16(v4);
-			v1 = v4.uph16(v4);
-			v2 = v5.upl16(v5);
-			v3 = v5.uph16(v5);
-			
-			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask);
-			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask);
-			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask);
-			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
-		}
-
 		#endif
 	}
 
@@ -1608,47 +1446,6 @@ public:
 			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
 		}
 
-		#else
-
-		GSVector4i v0, v1, v2, v3;
-		GSVector4i mask = GSVector4i(0x0f000000);
-
-		for(int i = 0; i < 2; i++, src += srcpitch * 4)
-		{
-			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
-
-			v4 = v.upl8(v >> 4);
-			v5 = v.uph8(v >> 4);
-
-			v6 = v4.upl16(v5);
-			v7 = v4.uph16(v5);
-
-			v4 = v6.upl8(v6);
-			v5 = v6.uph8(v6);
-			v6 = v7.upl8(v7);
-			v7 = v7.uph8(v7);
-
-			v0 = v4.upl16(v4);
-			v1 = v4.uph16(v4);
-			v2 = v5.upl16(v5);
-			v3 = v5.uph16(v5);
-
-			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
-			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
-			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
-			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
-
-			v0 = v6.upl16(v6);
-			v1 = v6.uph16(v6);
-			v2 = v7.upl16(v7);
-			v3 = v7.uph16(v7);
-
-			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
-			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
-			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
-			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
-		}
-
 		#endif
 	}
 
@@ -1736,47 +1533,6 @@ public:
 			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
 		}
 
-		#else
-
-		GSVector4i v0, v1, v2, v3;
-		GSVector4i mask = GSVector4i::xf0000000();
-
-		for(int i = 0; i < 2; i++, src += srcpitch * 4)
-		{
-			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
-
-			v4 = (v << 4).upl8(v);
-			v5 = (v << 4).uph8(v);
-
-			v6 = v4.upl16(v5);
-			v7 = v4.uph16(v5);
-
-			v4 = v6.upl8(v6);
-			v5 = v6.uph8(v6);
-			v6 = v7.upl8(v7);
-			v7 = v7.uph8(v7);
-
-			v0 = v4.upl16(v4);
-			v1 = v4.uph16(v4);
-			v2 = v5.upl16(v5);
-			v3 = v5.uph16(v5);
-
-			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
-			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
-			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
-			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
-
-			v0 = v6.upl16(v6);
-			v1 = v6.uph16(v6);
-			v2 = v7.upl16(v7);
-			v3 = v7.uph16(v7);
-
-			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
-			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
-			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
-			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
-		}
-
 		#endif
 	}
 
@@ -1882,39 +1638,6 @@ public:
 			d1[1] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
 		}
 
-		#elif 0 // not faster
-		
-		const GSVector4i* s = (const GSVector4i*)src;
-
-		GSVector4i TA0(TEXA.TA0 << 24);
-		GSVector4i TA1(TEXA.TA1 << 24);
-
-		for(int i = 0; i < 4; i++, dst += dstpitch * 2)
-		{
-			GSVector4i v0 = s[i * 4 + 0];
-			GSVector4i v1 = s[i * 4 + 1];
-			GSVector4i v2 = s[i * 4 + 2];
-			GSVector4i v3 = s[i * 4 + 3];
-
-			GSVector4i::sw16(v0, v1, v2, v3);
-			GSVector4i::sw32(v0, v1, v2, v3);
-			GSVector4i::sw16(v0, v2, v1, v3);
-
-			GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
-
-			d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
-			d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
-			d0[2] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
-			d0[3] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
-			
-			GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
-
-			d1[0] = Expand16to32<AEM>(v2.upl16(v2), TA0, TA1);
-			d1[1] = Expand16to32<AEM>(v2.uph16(v2), TA0, TA1);
-			d1[2] = Expand16to32<AEM>(v3.upl16(v3), TA0, TA1);
-			d1[3] = Expand16to32<AEM>(v3.uph16(v3), TA0, TA1);
-		}
-
 		#else
 		
 		alignas(32) uint16 block[16 * 8];
@@ -1930,8 +1653,6 @@ public:
 	{
 		//printf("ReadAndExpandBlock8_32\n");
 
-		#if _M_SSE >= 0x401
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0, v1, v2, v3;
@@ -1973,16 +1694,6 @@ public:
 			v2.gather32_8<>(pal, (GSVector4i*)dst);
 			dst += dstpitch;
 		}
-
-		#else
-
-		alignas(32) uint8 block[16 * 16];
-
-		ReadBlock8(src, (uint8*)block, sizeof(block) / 16);
-
-		ExpandBlock8_32(block, dst, dstpitch, pal);
-
-		#endif
 	}
 
 	// TODO: ReadAndExpandBlock8_16
@@ -1991,8 +1702,6 @@ public:
 	{
 		//printf("ReadAndExpandBlock4_32\n");
 
-		#if _M_SSE >= 0x401
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0, v1, v2, v3;
@@ -2050,16 +1759,6 @@ public:
 			v3.gather64_8<>(pal, (GSVector4i*)dst);
 			dst += dstpitch;
 		}
-
-		#else
-
-		alignas(32) uint8 block[(32 / 2) * 16];
-
-		ReadBlock4(src, (uint8*)block, sizeof(block) / 16);
-
-		ExpandBlock4_32(block, dst, dstpitch, pal);
-
-		#endif
 	}
 
 	// TODO: ReadAndExpandBlock4_16
@@ -2068,8 +1767,6 @@ public:
 	{
 		//printf("ReadAndExpandBlock8H_32\n");
 
-		#if _M_SSE >= 0x401
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0, v1, v2, v3;
@@ -2093,16 +1790,6 @@ public:
 
 			dst += dstpitch;
 		}
-
-		#else
-
-		alignas(32) uint32 block[8 * 8];
-
-		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
-
-		ExpandBlock8H_32(block, dst, dstpitch, pal);
-
-		#endif
 	}
 
 	// TODO: ReadAndExpandBlock8H_16
@@ -2110,9 +1797,6 @@ public:
 	__forceinline static void ReadAndExpandBlock4HL_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
 	{
 		//printf("ReadAndExpandBlock4HL_32\n");
-
-		#if _M_SSE >= 0x401
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0, v1, v2, v3;
@@ -2136,16 +1820,6 @@ public:
 
 			dst += dstpitch;
 		}
-
-		#else
-
-		alignas(32) uint32 block[8 * 8];
-
-		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
-
-		ExpandBlock4HL_32(block, dst, dstpitch, pal);
-
-		#endif
 	}
 
 	// TODO: ReadAndExpandBlock4HL_16
@@ -2154,8 +1828,6 @@ public:
 	{
 		//printf("ReadAndExpandBlock4HH_32\n");
 
-		#if _M_SSE >= 0x401
-
 		const GSVector4i* s = (const GSVector4i*)src;
 
 		GSVector4i v0, v1, v2, v3;
@@ -2179,16 +1851,6 @@ public:
 
 			dst += dstpitch;
 		}
-
-		#else
-
-		alignas(32) uint32 block[8 * 8];
-
-		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
-
-		ExpandBlock4HH_32(block, dst, dstpitch, pal);
-
-		#endif
 	}
 
 	// TODO: ReadAndExpandBlock4HH_16
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index ccea31e96b..cdaee1f341 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -2734,11 +2734,7 @@ __forceinline void GSState::VertexKick(uint32 skip)
 
 	GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy);
 
-#if _M_SSE >= 0x401
 	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
-#else
-	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32());
-#endif
 
 	m_vertex.tail = ++tail;
 	m_vertex.xy_tail = ++xy_tail;
diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp
index b5f0f06d2c..08f8a352cd 100644
--- a/plugins/GSdx/GSUtil.cpp
+++ b/plugins/GSdx/GSUtil.cpp
@@ -85,10 +85,6 @@ const char* GSUtil::GetLibName()
 		"AVX", sw_sse
 #elif _M_SSE >= 0x401
 		"SSE4.1", sw_sse
-#elif _M_SSE >= 0x301
-		"SSSE3", sw_sse
-#elif _M_SSE >= 0x200
-		"SSE2", sw_sse
 #endif
 	);
 
@@ -221,13 +217,7 @@ bool GSUtil::CheckSSE()
 	};
 
 	ISA checks[] = {
-		{Xbyak::util::Cpu::tSSE2, "SSE2"},
-#if _M_SSE >= 0x301
-		{Xbyak::util::Cpu::tSSSE3, "SSSE3"},
-#endif
-#if _M_SSE >= 0x401
 		{Xbyak::util::Cpu::tSSE41, "SSE41"},
-#endif
 #if _M_SSE >= 0x500
 		{Xbyak::util::Cpu::tAVX, "AVX1"},
 #endif
diff --git a/plugins/GSdx/GSVector4.h b/plugins/GSdx/GSVector4.h
index f97a8db817..77422cb351 100644
--- a/plugins/GSdx/GSVector4.h
+++ b/plugins/GSdx/GSVector4.h
@@ -250,33 +250,7 @@ public:
 
 	template<int mode> __forceinline GSVector4 round() const
 	{
-		#if _M_SSE >= 0x401
-
 		return GSVector4(_mm_round_ps(m, mode));
-
-		#else
-
-		GSVector4 a = *this;
-
-		GSVector4 b = (a & cast(GSVector4i::x80000000())) | m_x4b000000;
-
-		b = a + b - b;
-
-		if((mode & 7) == (Round_NegInf & 7))
-		{
-			return b - ((a < b) & m_one);
-		}
-
-		if((mode & 7) == (Round_PosInf & 7))
-		{
-			return b + ((a > b) & m_one);
-		}
-
-		ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented
-
-		return b;
-
-		#endif
 	}
 
 	__forceinline GSVector4 floor() const
@@ -404,65 +378,29 @@ public:
 
 	__forceinline GSVector4 hadd() const
 	{
-		#if _M_SSE >= 0x300
-		
 		return GSVector4(_mm_hadd_ps(m, m));
-		
-		#else
-		
-		return xzxz() + ywyw();
-		
-		#endif
 	}
 
 	__forceinline GSVector4 hadd(const GSVector4& v) const
 	{
-		#if _M_SSE >= 0x300
-		
 		return GSVector4(_mm_hadd_ps(m, v.m));
-		
-		#else
-		
-		return xzxz(v) + ywyw(v);
-		
-		#endif
 	}
 
 	__forceinline GSVector4 hsub() const
 	{
-		#if _M_SSE >= 0x300
-		
 		return GSVector4(_mm_hsub_ps(m, m));
-		
-		#else
-		
-		return xzxz() - ywyw();
-		
-		#endif
 	}
 
 	__forceinline GSVector4 hsub(const GSVector4& v) const
 	{
-		#if _M_SSE >= 0x300
-		
 		return GSVector4(_mm_hsub_ps(m, v.m));
-		
-		#else
-		
-		return xzxz(v) - ywyw(v);
-
-		#endif
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int i> __forceinline GSVector4 dp(const GSVector4& v) const
 	{
 		return GSVector4(_mm_dp_ps(m, v.m, i));
 	}
 
-	#endif
-
 	__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
 	{
 		return GSVector4(_mm_min_ps(_mm_max_ps(m, a), b));
@@ -493,26 +431,14 @@ public:
 		return GSVector4(_mm_max_ps(m, a));
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int mask> __forceinline GSVector4 blend32(const GSVector4& a)  const
 	{
 		return GSVector4(_mm_blend_ps(m, a, mask));
 	}
 
-	#endif
-
 	__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask)  const
 	{
-		#if _M_SSE >= 0x401
-
 		return GSVector4(_mm_blendv_ps(m, a, mask));
-
-		#else
-
-		return GSVector4(_mm_or_ps(_mm_andnot_ps(mask, m), _mm_and_ps(mask, a)));
-
-		#endif
 	}
 
 	__forceinline GSVector4 upl(const GSVector4& a) const
@@ -566,16 +492,12 @@ public:
 
 		return _mm_testz_ps(m, m) != 0;
 
-		#elif _M_SSE >= 0x401
+		#else
 
 		__m128i a = _mm_castps_si128(m);
 
 		return _mm_testz_si128(a, a) != 0;
 
-		#else
-
-		return mask() == 0;
-
 		#endif
 	}
 
@@ -643,7 +563,6 @@ public:
 		}
 
 		#endif
-
 	}
 
 #ifdef __linux__
@@ -663,28 +582,12 @@ GSVector.h:2973:15: error:  shadows template parm 'int i'
 
 	template<int index> __forceinline int extract32() const
 	{
-		#if _M_SSE >= 0x401
-
 		return _mm_extract_ps(m, index);
-
-		#else
-
-		return i32[index];
-
-		#endif
 	}
 #else
 	template<int i> __forceinline int extract32() const
 	{
-		#if _M_SSE >= 0x401
-
 		return _mm_extract_ps(m, i);
-
-		#else
-
-		return i32[i];
-
-		#endif
 	}
 #endif
 
diff --git a/plugins/GSdx/GSVector4i.h b/plugins/GSdx/GSVector4i.h
index db376cdd57..e3c6dc5808 100644
--- a/plugins/GSdx/GSVector4i.h
+++ b/plugins/GSdx/GSVector4i.h
@@ -229,15 +229,7 @@ public:
 
 	__forceinline GSVector4i runion_ordered(const GSVector4i& a) const
 	{
-		#if _M_SSE >= 0x401
-
 		return min_i32(a).upl64(max_i32(a).srl<8>());
-
-		#else
-
-		return GSVector4i(std::min(x, a.x), std::min(y, a.y), std::max(z, a.z), std::max(w, a.w));
-
-		#endif
 	}
 
 	__forceinline GSVector4i rintersect(const GSVector4i& a) const
@@ -295,8 +287,6 @@ public:
 		return (uint32)store(v);
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i8(a).min_i8(b);
@@ -307,8 +297,6 @@ public:
 		return max_i8(a.xyxy()).min_i8(a.zwzw());
 	}
 
-	#endif
-
 	__forceinline GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i16(a).min_i16(b);
@@ -319,8 +307,6 @@ public:
 		return max_i16(a.xyxy()).min_i16(a.zwzw());
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i32(a).min_i32(b);
@@ -331,34 +317,6 @@ public:
 		return max_i32(a.xyxy()).min_i32(a.zwzw());
 	}
 
-	#else
-
-	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
-	{
-		GSVector4i v;
-
-		v.x = std::min(std::max(x, a.x), b.x);
-		v.y = std::min(std::max(y, a.y), b.y);
-		v.z = std::min(std::max(z, a.z), b.z);
-		v.w = std::min(std::max(w, a.w), b.w);
-
-		return v;
-	}
-
-	__forceinline GSVector4i sat_i32(const GSVector4i& a) const
-	{
-		GSVector4i v;
-
-		v.x = std::min(std::max(x, a.x), a.z);
-		v.y = std::min(std::max(y, a.y), a.w);
-		v.z = std::min(std::max(z, a.x), a.z);
-		v.w = std::min(std::max(w, a.y), a.w);
-
-		return v;
-	}
-
-	#endif
-
 	__forceinline GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u8(a).min_u8(b);
@@ -369,8 +327,6 @@ public:
 		return max_u8(a.xyxy()).min_u8(a.zwzw());
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u16(a).min_u16(b);
@@ -381,10 +337,6 @@ public:
 		return max_u16(a.xyxy()).min_u16(a.zwzw());
 	}
 
-	#endif
-
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u32(a).min_u32(b);
@@ -395,10 +347,6 @@ public:
 		return max_u32(a.xyxy()).min_u32(a.zwzw());
 	}
 
-	#endif
-
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i min_i8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi8(m, a));
@@ -409,8 +357,6 @@ public:
 		return GSVector4i(_mm_max_epi8(m, a));
 	}
 
-	#endif
-
 	__forceinline GSVector4i min_i16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi16(m, a));
@@ -421,8 +367,6 @@ public:
 		return GSVector4i(_mm_max_epi16(m, a));
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i min_i32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi32(m, a));
@@ -433,8 +377,6 @@ public:
 		return GSVector4i(_mm_max_epi32(m, a));
 	}
 
-	#endif
-
 	__forceinline GSVector4i min_u8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epu8(m, a));
@@ -445,8 +387,6 @@ public:
 		return GSVector4i(_mm_max_epu8(m, a));
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i min_u16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epu16(m, a));
@@ -467,8 +407,6 @@ public:
 		return GSVector4i(_mm_max_epu32(m, a));
 	}
 
-	#endif
-
 	__forceinline static int min_i16(int a, int b)
 	{
 		 return store(load(a).min_i16(load(b)));
@@ -481,26 +419,14 @@ public:
 
 	__forceinline GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
 	{
-		#if _M_SSE >= 0x401
-
 		return GSVector4i(_mm_blendv_epi8(m, a, mask));
-
-		#else
-
-		return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
-
-		#endif
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int mask> __forceinline GSVector4i blend16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_blend_epi16(m, a, mask));
 	}
 
-	#endif
-
 	#if _M_SSE >= 0x501
 
 	template<int mask> __forceinline GSVector4i blend32(const GSVector4i& v) const
@@ -517,26 +443,14 @@ public:
 
 	__forceinline GSVector4i mix16(const GSVector4i& a) const
 	{
-		#if _M_SSE >= 0x401
-
 		return blend16<0xaa>(a);
-
-		#else
-
-		return blend8(a, GSVector4i::xffff0000());
-
-		#endif
 	}
 
-	#if _M_SSE >= 0x301
-
 	__forceinline GSVector4i shuffle8(const GSVector4i& mask) const
 	{
 		return GSVector4i(_mm_shuffle_epi8(m, mask));
 	}
 
-	#endif
-
 	__forceinline GSVector4i ps16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packs_epi16(m, a));
@@ -567,8 +481,6 @@ public:
 		return GSVector4i(_mm_packs_epi32(m, m));
 	}
 
-	#if _M_SSE >= 0x401
-
 	__forceinline GSVector4i pu32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packus_epi32(m, a));
@@ -579,8 +491,6 @@ public:
 		return GSVector4i(_mm_packus_epi32(m, m));
 	}
 
-	#endif
-
 	__forceinline GSVector4i upl8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpacklo_epi8(m, a));
@@ -685,8 +595,6 @@ public:
 		return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128()));
 	}
 
-	#if _M_SSE >= 0x401
-
 	// WARNING!!!
 	//
 	// MSVC (2008, 2010 ctp) believes that there is a "mem, reg" form of the pmovz/sx* instructions,
@@ -752,50 +660,6 @@ public:
 		return GSVector4i(_mm_cvtepu32_epi64(m));
 	}
 
-	#else
-
-	__forceinline GSVector4i u8to16() const
-	{
-		return upl8();
-	}
-
-	__forceinline GSVector4i u8to32() const
-	{
-		return upl8().upl16();
-	}
-
-	__forceinline GSVector4i u8to64() const
-	{
-		return upl8().upl16().upl32();
-	}
-
-	__forceinline GSVector4i u16to32() const
-	{
-		return upl16();
-	}
-
-	__forceinline GSVector4i u16to64() const
-	{
-		return upl16().upl32();
-	}
-
-	__forceinline GSVector4i u32to64() const
-	{
-		return upl32();
-	}
-
-	__forceinline GSVector4i i8to16() const
-	{
-		return zero().upl8(*this).sra16(8);
-	}
-
-	__forceinline GSVector4i i16to32() const
-	{
-		return zero().upl16(*this).sra32(16);
-	}
-
-	#endif
-
 	template<int i> __forceinline GSVector4i srl() const
 	{
 		return GSVector4i(_mm_srli_si128(m, i));
@@ -803,20 +667,7 @@ public:
 
 	template<int i> __forceinline GSVector4i srl(const GSVector4i& v)
 	{
-		#if _M_SSE >= 0x301
-
 		return GSVector4i(_mm_alignr_epi8(v.m, m, i));
-
-		#else
-
-		// The `& 0xF` keeps the compiler happy on cases that won't actually be hit
-		if(i == 0) return *this;
-		else if(i < 16) return srl<i & 0xF>() | v.sll<(16 - i) & 0xF>();
-		else if(i == 16) return v;
-		else if(i < 32) return v.srl<(i - 16) & 0xF>();
-		else return zero();
-
-		#endif
 	}
 
 	template<int i> __forceinline GSVector4i sll() const
@@ -1013,15 +864,11 @@ public:
 		return GSVector4i(_mm_mullo_epi16(m, v.m));
 	}
 
-	#if _M_SSE >= 0x301
-
 	__forceinline GSVector4i mul16hrs(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_mulhrs_epi16(m, v.m));
 	}
 
-	#endif
-
 	GSVector4i madd(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_madd_epi16(m, v.m));
@@ -1073,21 +920,11 @@ public:
 
 	__forceinline bool eq(const GSVector4i& v) const
 	{
-		#if _M_SSE >= 0x401
-		
 		// pxor, ptest, je
 		
 		GSVector4i t = *this ^ v;
 		
 		return _mm_testz_si128(t, t) != 0;
-
-		#else
-
-		// pcmpeqd, pmovmskb, cmp, je
-
-		return eq32(v).alltrue();
-
-		#endif
 	}
 
 	__forceinline GSVector4i eq8(const GSVector4i& v) const
@@ -1167,37 +1004,17 @@ public:
 
 	__forceinline bool allfalse() const
 	{
-		#if _M_SSE >= 0x401
-
 		return _mm_testz_si128(m, m) != 0;
-
-		#else
-
-		return mask() == 0;
-
-		#endif
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int i> __forceinline GSVector4i insert8(int a) const
 	{
 		return GSVector4i(_mm_insert_epi8(m, a, i));
 	}
 
-	#endif
-
 	template<int i> __forceinline int extract8() const
 	{
-		#if _M_SSE >= 0x401
-
 		return _mm_extract_epi8(m, i);
-
-		#else
-
-		return (int)u8[i];
-
-		#endif
 	}
 
 	template<int i> __forceinline GSVector4i insert16(int a) const
@@ -1210,59 +1027,34 @@ public:
 		return _mm_extract_epi16(m, i);
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int i> __forceinline GSVector4i insert32(int a) const
 	{
 		return GSVector4i(_mm_insert_epi32(m, a, i));
 	}
 
-	#endif
-
 	template<int i> __forceinline int extract32() const
 	{
 		if(i == 0) return GSVector4i::store(*this);
 
-		#if _M_SSE >= 0x401
-
 		return _mm_extract_epi32(m, i);
-
-		#else
-
-		return i32[i];
-
-		#endif
 	}
 
 	#ifdef _M_AMD64
 
-	#if _M_SSE >= 0x401
-
 	template<int i> __forceinline GSVector4i insert64(int64 a) const
 	{
 		return GSVector4i(_mm_insert_epi64(m, a, i));
 	}
 
-	#endif
-
 	template<int i> __forceinline int64 extract64() const
 	{
 		if(i == 0) return GSVector4i::storeq(*this);
 
-		#if _M_SSE >= 0x401
-
 		return _mm_extract_epi64(m, i);
-
-		#else
-
-		return i64[i];
-
-		#endif
 	}
 
 	#endif
 
-	#if _M_SSE >= 0x401
 
 	template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
 	{
@@ -1340,8 +1132,6 @@ public:
 		return v;
 	}
 
-	#endif
-
 	template<int src, class T> __forceinline GSVector4i gather16_4(const T* ptr) const
 	{
 		GSVector4i v;
@@ -1418,8 +1208,6 @@ public:
 		return v;
 	}
 
-	#if _M_SSE >= 0x401
-
 	template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
 	{
 		GSVector4i v;
@@ -1479,56 +1267,7 @@ public:
 		return v;
 	}
 
-	#else
-
-	template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
-	{
-		return GSVector4i(
-			(int)ptr[extract8<src + 0>() & 0xf],
-			(int)ptr[extract8<src + 0>() >> 4],
-			(int)ptr[extract8<src + 1>() & 0xf],
-			(int)ptr[extract8<src + 1>() >> 4]);
-	}
-
-	template<int src, class T> __forceinline GSVector4i gather32_8(const T* ptr) const
-	{
-		return GSVector4i(
-			(int)ptr[extract8<src + 0>()],
-			(int)ptr[extract8<src + 1>()],
-			(int)ptr[extract8<src + 2>()],
-			(int)ptr[extract8<src + 3>()]);
-	}
-
-	template<int src, class T> __forceinline GSVector4i gather32_16(const T* ptr) const
-	{
-		return GSVector4i(
-			(int)ptr[extract16<src + 0>()],
-			(int)ptr[extract16<src + 1>()],
-			(int)ptr[extract16<src + 2>()],
-			(int)ptr[extract16<src + 3>()]);
-	}
-
-	template<class T> __forceinline GSVector4i gather32_32(const T* ptr) const
-	{
-		return GSVector4i(
-			(int)ptr[extract32<0>()],
-			(int)ptr[extract32<1>()],
-			(int)ptr[extract32<2>()],
-			(int)ptr[extract32<3>()]);
-	}
-
-	template<class T1, class T2> __forceinline GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const
-	{
-		return GSVector4i(
-			(int)ptr2[ptr1[extract32<0>()]],
-			(int)ptr2[ptr1[extract32<1>()]],
-			(int)ptr2[ptr1[extract32<2>()]],
-			(int)ptr2[ptr1[extract32<3>()]]);
-	}
-
-	#endif
-
-	#if defined(_M_AMD64) && _M_SSE >= 0x401
+	#if defined(_M_AMD64)
 
 	template<int src, class T> __forceinline GSVector4i gather64_4(const T* ptr) const
 	{
@@ -1620,8 +1359,6 @@ public:
 
 	#endif
 
-	#if _M_SSE >= 0x401
-
 	template<class T> __forceinline void gather8_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
 	{
 		dst[0] = gather8_4<0>(ptr);
@@ -1633,8 +1370,6 @@ public:
 		dst[0] = gather8_8<>(ptr);
 	}
 
-	#endif
-
 	template<class T> __forceinline void gather16_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
 	{
 		dst[0] = gather16_4<0>(ptr);
@@ -1742,15 +1477,7 @@ public:
 
 	__forceinline static GSVector4i loadnt(const void* p)
 	{
-		#if _M_SSE >= 0x401
-
 		return GSVector4i(_mm_stream_load_si128((__m128i*)p));
-
-		#else
-
-		return GSVector4i(_mm_load_si128((__m128i*)p));
-
-		#endif
 	}
 
 	__forceinline static GSVector4i loadl(const void* p)
diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj
index a65963c9c5..19e9af6a1f 100644
--- a/plugins/GSdx/GSdx.vcxproj
+++ b/plugins/GSdx/GSdx.vcxproj
@@ -17,14 +17,6 @@
       <Configuration>Debug</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug SSE4|Win32">
-      <Configuration>Debug SSE4</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug SSE4|x64">
-      <Configuration>Debug SSE4</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
     <ProjectConfiguration Include="Release AVX2|Win32">
       <Configuration>Release AVX2</Configuration>
       <Platform>Win32</Platform>
@@ -41,14 +33,6 @@
       <Configuration>Release</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
-    <ProjectConfiguration Include="Release SSE4|Win32">
-      <Configuration>Release SSE4</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release SSE4|x64">
-      <Configuration>Release SSE4</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
   </ItemGroup>
   <Import Project="$(SolutionDir)common\vsprops\WinSDK.props" />
   <PropertyGroup Label="Globals">
@@ -67,8 +51,7 @@
   <ImportGroup Label="ExtensionSettings" />
   <ImportGroup Label="PropertySheets">
     <Import Project="vsprops\ProjectRootDir.props" />
-    <Import Condition="'$(Configuration)'=='Release' Or '$(Configuration)'=='Debug'" Project="vsprops\sse2.props" />
-    <Import Condition="'$(Configuration)'=='Release SSE4' Or '$(Configuration)'=='Debug SSE4'" Project="vsprops\sse4.props" />
+    <Import Project="vsprops\sse4.props" />
     <Import Condition="'$(Configuration)'=='Release AVX2' Or '$(Configuration)'=='Debug AVX2'" Project="vsprops\avx2.props" />
     <Import Project="vsprops\common.props" />
     <Import Condition="$(Configuration.Contains(Debug))" Project="vsprops\debug.props" />
diff --git a/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp b/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp
index 4df7c0e32b..b093811c7e 100644
--- a/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp
+++ b/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp
@@ -175,18 +175,9 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 	GSVector4i cmin = GSVector4i::xffffffff();
 	GSVector4i cmax = GSVector4i::zero();
 
-	#if _M_SSE >= 0x401
-
 	GSVector4i pmin = GSVector4i::xffffffff();
 	GSVector4i pmax = GSVector4i::zero();
 
-	#else
-
-	GSVector4 pmin = s_minmax.xxxx();
-	GSVector4 pmax = s_minmax.yyyy();
-	
-	#endif
-
 	const GSVertex* RESTRICT v = (GSVertex*)vertex;
 
 	for(int i = 0; i < count; i += n)
@@ -233,21 +224,10 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 			GSVector4i xy = xyzf.upl16();
 			GSVector4i z = xyzf.yyyy();
 
-			#if _M_SSE >= 0x401
-
 			GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf));
 
 			pmin = pmin.min_u32(p);
 			pmax = pmax.max_u32(p);
-
-			#else
-
-			GSVector4 p = GSVector4(xy.upl64(z.srl32(1).upl32(xyzf.wwww())));
-
-			pmin = pmin.min(p);
-			pmax = pmax.max(p);
-
-			#endif
 		}
 		else if(primclass == GS_LINE_CLASS)
 		{
@@ -314,23 +294,11 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 			GSVector4i xy1 = xyzf1.upl16();
 			GSVector4i z1 = xyzf1.yyyy();
 
-			#if _M_SSE >= 0x401
-
 			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
 			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
 
 			pmin = pmin.min_u32(p0.min_u32(p1));
 			pmax = pmax.max_u32(p0.max_u32(p1));
-
-			#else
-
-			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
-			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
-
-			pmin = pmin.min(p0.min(p1));
-			pmax = pmax.max(p0.max(p1));
-
-			#endif
 		}
 		else if(primclass == GS_TRIANGLE_CLASS)
 		{
@@ -406,25 +374,12 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 			GSVector4i xy2 = xyzf2.upl16();
 			GSVector4i z2 = xyzf2.yyyy();
 
-			#if _M_SSE >= 0x401
-
 			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
 			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
 			GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2));
 
 			pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1));
 			pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1));
-
-			#else
-
-			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
-			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
-			GSVector4 p2 = GSVector4(xy2.upl64(z2.srl32(1).upl32(xyzf2.wwww())));
-
-			pmin = pmin.min(p2).min(p0.min(p1));
-			pmax = pmax.max(p2).max(p0.max(p1));
-
-			#endif
 		}
 		else if(primclass == GS_SPRITE_CLASS)
 		{
@@ -491,23 +446,11 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 			GSVector4i xy1 = xyzf1.upl16();
 			GSVector4i z1 = xyzf1.yyyy();
 
-			#if _M_SSE >= 0x401
-
 			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1));
 			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
 
 			pmin = pmin.min_u32(p0.min_u32(p1));
 			pmax = pmax.max_u32(p0.max_u32(p1));
-
-			#else
-
-			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf1.wwww())));
-			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
-
-			pmin = pmin.min(p0.min(p1));
-			pmax = pmax.max(p0.max(p1));
-
-			#endif
 		}
 	}
 
@@ -516,13 +459,9 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 	// be true if depth isn't constant but close enough. It also imply that
 	// pmin.z & 1 == 0 and pax.z & 1 == 0
 
-	#if _M_SSE >= 0x401
-
 	pmin = pmin.blend16<0x30>(pmin.srl32(1));
 	pmax = pmax.blend16<0x30>(pmax.srl32(1));
 
-	#endif
-
 	GSVector4 o(context->XYOFFSET);
 	GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f);
 
diff --git a/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp b/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp
index 91726a8490..842e4e329f 100644
--- a/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp
+++ b/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp
@@ -944,12 +944,8 @@ void GSRendererDX11::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sou
 		m_ps_sel.fog = 1;
 
 		GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]);
-#if _M_SSE >= 0x401
 		// Blend AREF to avoid to load a random value for alpha (dirty cache)
 		ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF);
-#else
-		ps_cb.FogColor_AREF = fc;
-#endif
 	}
 
 	// Warning must be done after EmulateZbuffer
diff --git a/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp b/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp
index 6173e291d8..57657f6b1a 100644
--- a/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp
+++ b/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp
@@ -1180,12 +1180,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
 		m_ps_sel.fog = 1;
 
 		GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]);
-#if _M_SSE >= 0x401
 		// Blend AREF to avoid to load a random value for alpha (dirty cache)
 		ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF);
-#else
-		ps_cb.FogColor_AREF = fc;
-#endif
 	}
 
 	// Warning must be done after EmulateZbuffer
diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
index df6391f10b..1508978081 100644
--- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
+++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
@@ -121,18 +121,7 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& t
 	}
 	else
 	{
-		if(m_cpu.has(util::Cpu::tSSE41))
-		{
-			pblendw(a, b, 0xaa);
-		}
-		else
-		{
-			pcmpeqd(temp, temp);
-			psrld(temp, 16);
-			pand(a, temp);
-			pandn(temp, b);
-			por(a, temp);
-		}
+		pblendw(a, b, 0xaa);
 	}
 }
 
@@ -154,17 +143,8 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
 	}
 	else
 	{
-		if(m_cpu.has(util::Cpu::tSSE41))
-		{
-			packuswb(a, a);
-			pmovzxbw(a, a);
-		}
-		else
-		{
-			packuswb(a, a);
-			pxor(temp, temp);
-			punpcklbw(a, temp);
-		}
+		packuswb(a, a);
+		pmovzxbw(a, a);
 	}
 }
 
@@ -223,10 +203,8 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
 {
 	if(m_cpu.has(util::Cpu::tAVX))
 		vpblendvb(a, a, b, xmm0);
-	else if(m_cpu.has(util::Cpu::tSSE41))
+	else
 		pblendvb(a, b);
-	else
-		blend(a, b, xmm0);
 }
 
 void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
@@ -235,15 +213,11 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
 	{
 		vpblendvb(b, a, b, xmm0);
 	}
-	else if(m_cpu.has(util::Cpu::tSSE41))
+	else
 	{
 		pblendvb(a, b);
 		movdqa(b, a);
 	}
-	else
-	{
-		blendr(b, a, xmm0);
-	}
 }
 
 void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
index 78711e48a8..ea59e11cf8 100644
--- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
+++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
@@ -644,20 +644,9 @@ void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
 		// Clamp Z to ZPSM_FMT_MAX
 		if (m_sel.zclamp)
 		{
-#if _M_SSE >= 0x401
 			pcmpeqd(temp1, temp1);
 			psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
 			pminsd(xmm0, temp1);
-#else
-			pcmpeqd(temp1, temp1);
-			psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
-			pcmpgtd(temp1, xmm0);
-			pand(xmm0, temp1);
-			pcmpeqd(temp2, temp2);
-			pxor(temp1, temp2);
-			psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
-			por(xmm0, temp1);
-#endif
 		}
 
 		if(m_sel.zwrite)
@@ -1089,15 +1078,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
 		movdqa(xmm4, ptr[&m_local.gd->t.min]);
 		movdqa(xmm5, ptr[&m_local.gd->t.max]);
 
-		if(m_cpu.has(util::Cpu::tSSE41))
-		{
-			movdqa(xmm0, ptr[&m_local.gd->t.mask]);
-		}
-		else
-		{
-			movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
-			movdqa(xmm6, xmm0);
-		}
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]);
 
 		// uv0
 
@@ -1118,11 +1099,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
 		pminsw(uv0, xmm5);
 
 		// clamp.blend8(repeat, m_local.gd->t.mask);
-
-		if(m_cpu.has(util::Cpu::tSSE41))
-			pblendvb(uv0, xmm1);
-		else
-			blendr(uv0, xmm1, xmm0);
+		pblendvb(uv0, xmm1);
 
 		// uv1
 
@@ -1143,11 +1120,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
 		pminsw(uv1, xmm5);
 
 		// clamp.blend8(repeat, m_local.gd->t.mask);
-
-		if(m_cpu.has(util::Cpu::tSSE41))
-			pblendvb(uv1, xmm1);
-		else
-			blendr(uv1, xmm1, xmm6);
+		pblendvb(uv1, xmm1);
 	}
 }
 
@@ -1908,15 +1881,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1)
 	}
 	else
 	{
-		if(m_cpu.has(util::Cpu::tSSE41))
-		{
-			movdqa(xmm0, ptr[&m_local.gd->t.mask]);
-		}
-		else
-		{
-			movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
-			movdqa(xmm4, xmm0);
-		}
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]);
 
 		// uv0
 
@@ -1937,11 +1902,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1)
 		pminsw(uv0, xmm6);
 
 		// clamp.blend8(repeat, m_local.gd->t.mask);
-
-		if(m_cpu.has(util::Cpu::tSSE41))
-			pblendvb(uv0, xmm1);
-		else
-			blendr(uv0, xmm1, xmm0);
+		pblendvb(uv0, xmm1);
 
 		// uv1
 
@@ -1963,10 +1924,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1)
 
 		// clamp.blend8(repeat, m_local.gd->t.mask);
 
-		if(m_cpu.has(util::Cpu::tSSE41))
-			pblendvb(uv1, xmm1);
-		else
-			blendr(uv1, xmm1, xmm4);
+		pblendvb(uv1, xmm1);
 	}
 }
 
@@ -2435,20 +2393,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
 	// Clamp Z to ZPSM_FMT_MAX
 	if (m_sel.zclamp)
 	{
-#if _M_SSE >= 0x401
 		pcmpeqd(xmm7, xmm7);
 		psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
 		pminsd(xmm1, xmm7);
-#else
-		static GSVector4i all_1s = GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
-		pcmpeqd(xmm7, xmm7);
-		psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
-		pcmpgtd(xmm7, xmm1);
-		pand(xmm1, xmm7);
-		pxor(xmm7, ptr[&all_1s]);
-		psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
-		por(xmm1, xmm7);
-#endif
 	}
 
 	bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
@@ -2669,15 +2616,6 @@ void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
 
 	if(m_sel.pabe)
 	{
-		if(!m_cpu.has(util::Cpu::tSSE41))
-		{
-			// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
-			movdqa(xmm0, xmm4);
-			pslld(xmm0, 8);
-			psrad(xmm0, 31);
-
-		}
-
 		psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
 
 		// ga = c[1].blend8(ga, mask).mix16(c[1]);
@@ -2862,24 +2800,13 @@ void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& ad
 	case 0:
 		if(i == 0) movd(dst, src);
 		else {
-			if(m_cpu.has(util::Cpu::tSSE41)) {
-				pextrd(dst, src, i);
-			} else {
-				pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
-				movd(dst, xmm0);
-			}
-
+			pextrd(dst, src, i);
 		}
 		break;
 	case 1:
 		if(i == 0) movd(eax, src);
 		else {
-			if(m_cpu.has(util::Cpu::tSSE41)) {
-				pextrd(eax, src, i);
-			} else {
-				pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
-				movd(eax, xmm0);
-			}
+			pextrd(eax, src, i);
 		}
 		xor(eax, dst);
 		and(eax, 0xffffff);
@@ -2918,153 +2845,28 @@ void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
 
 	if(m_sel.mmin && !m_sel.lcm)
 	{
-		if(m_cpu.has(util::Cpu::tSSE41))
+		const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
+
+		if(pixels == 4)
 		{
+			movdqa(ptr[&m_local.temp.test], xmm7);
+		}
 
-			const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
+		for(uint8 j = 0; j < 4; j++)
+		{
+			mov(ebx, ptr[&lod_i->u32[j]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
 
-			if(pixels == 4)
+			for(int i = 0; i < pixels; i++)
 			{
-				movdqa(ptr[&m_local.temp.test], xmm7);
-			}
-
-			for(uint8 j = 0; j < 4; j++)
-			{
-				mov(ebx, ptr[&lod_i->u32[j]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				for(int i = 0; i < pixels; i++)
-				{
-					ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
-				}
-			}
-
-			if(pixels == 4)
-			{
-				movdqa(xmm5, xmm7);
-				movdqa(xmm7, ptr[&m_local.temp.test]);
+				ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
 			}
 		}
-		else
+
+		if(pixels == 4)
 		{
-
-			if(pixels == 4)
-			{
-				movdqa(ptr[&m_local.temp.test], xmm7);
-
-				mov(ebx, ptr[&lod_i->u32[0]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm6, xmm5, 0);
-				psrldq(xmm5, 4);
-				ReadTexel_SSE(xmm4, xmm2, 0);
-				psrldq(xmm2, 4);
-
-				mov(ebx, ptr[&lod_i->u32[1]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm1, xmm5, 0);
-				psrldq(xmm5, 4);
-				ReadTexel_SSE(xmm7, xmm2, 0);
-				psrldq(xmm2, 4);
-
-				punpckldq(xmm6, xmm1);
-				punpckldq(xmm4, xmm7);
-
-				mov(ebx, ptr[&lod_i->u32[2]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm1, xmm5, 0);
-				psrldq(xmm5, 4);
-				ReadTexel_SSE(xmm7, xmm2, 0);
-				psrldq(xmm2, 4);
-
-				mov(ebx, ptr[&lod_i->u32[3]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm5, xmm5, 0);
-				ReadTexel_SSE(xmm2, xmm2, 0);
-
-				punpckldq(xmm1, xmm5);
-				punpckldq(xmm7, xmm2);
-
-				punpcklqdq(xmm6, xmm1);
-				punpcklqdq(xmm4, xmm7);
-
-				mov(ebx, ptr[&lod_i->u32[0]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm1, xmm0, 0);
-				psrldq(xmm0, 4);
-				ReadTexel_SSE(xmm5, xmm3, 0);
-				psrldq(xmm3, 4);
-
-				mov(ebx, ptr[&lod_i->u32[1]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm2, xmm0, 0);
-				psrldq(xmm0, 4);
-				ReadTexel_SSE(xmm7, xmm3, 0);
-				psrldq(xmm3, 4);
-
-				punpckldq(xmm1, xmm2);
-				punpckldq(xmm5, xmm7);
-
-				mov(ebx, ptr[&lod_i->u32[2]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm2, xmm0, 0);
-				psrldq(xmm0, 4);
-				ReadTexel_SSE(xmm7, xmm3, 0);
-				psrldq(xmm3, 4);
-
-				mov(ebx, ptr[&lod_i->u32[3]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm0, xmm0, 0);
-				ReadTexel_SSE(xmm3, xmm3, 0);
-
-				punpckldq(xmm2, xmm0);
-				punpckldq(xmm7, xmm3);
-
-				punpcklqdq(xmm1, xmm2);
-				punpcklqdq(xmm5, xmm7);
-
-				movdqa(xmm7, ptr[&m_local.temp.test]);
-			}
-			else
-			{
-				mov(ebx, ptr[&lod_i->u32[0]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm6, xmm5, 0);
-				psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
-
-				mov(ebx, ptr[&lod_i->u32[1]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm1, xmm5, 0);
-				psrldq(xmm5, 4);
-
-				punpckldq(xmm6, xmm1);
-
-				mov(ebx, ptr[&lod_i->u32[2]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm1, xmm5, 0);
-				psrldq(xmm5, 4);
-
-				mov(ebx, ptr[&lod_i->u32[3]]);
-				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
-
-				ReadTexel_SSE(xmm4, xmm5, 0);
-				// psrldq(xmm5, 4);
-
-				punpckldq(xmm1, xmm4);
-
-				punpcklqdq(xmm6, xmm1);
-			}
-
+			movdqa(xmm5, xmm7);
+			movdqa(xmm7, ptr[&m_local.temp.test]);
 		}
 	}
 	else
@@ -3077,41 +2879,12 @@ void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
 
 		const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
 
-		if(m_cpu.has(util::Cpu::tSSE41))
+		for(int i = 0; i < pixels; i++)
 		{
-			for(int i = 0; i < pixels; i++)
+			for(uint8 j = 0; j < 4; j++)
 			{
-				for(uint8 j = 0; j < 4; j++)
-				{
-					ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
-				}
+				ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
 			}
-
-		} else {
-			const int t[] = {1, 4, 1, 5, 2, 5, 2, 0};
-
-			for(int i = 0; i < pixels; i++)
-			{
-				const Xmm& addr = Xmm(r[i * 2 + 0]);
-				const Xmm& dst = Xmm(r[i * 2 + 1]);
-				const Xmm& temp1 = Xmm(t[i * 2 + 0]);
-				const Xmm& temp2 = Xmm(t[i * 2 + 1]);
-
-				ReadTexel_SSE(dst, addr, 0);
-				psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
-				ReadTexel_SSE(temp1, addr, 0);
-				psrldq(addr, 4);
-				punpckldq(dst, temp1);
-
-				ReadTexel_SSE(temp1, addr, 0);
-				psrldq(addr, 4);
-				ReadTexel_SSE(temp2, addr, 0);
-				// psrldq(addr, 4);
-				punpckldq(temp1, temp2);
-
-				punpcklqdq(dst, temp1);
-			}
-
 		}
 	}
 }
diff --git a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp
index fd424400c1..cb98275428 100644
--- a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp
+++ b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp
@@ -277,37 +277,17 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 	
 	GSVector4i off = (GSVector4i)m_context->XYOFFSET;
 	GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
-
-	#if _M_SSE >= 0x401
-
 	GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
 
-	#else
-
-	uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
-
-	#endif
-
 	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
 	{
 		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
 
-		#if _M_SSE >= 0x401
-
 		GSVector4i xyzuvf(src->m[1]);
 
 		GSVector4i xy = xyzuvf.upl16() - off;
 		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
 
-		#else
-
-		uint32 z = src->XYZ.Z;
-
-		GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - off;
-		GSVector4i zf = GSVector4i((int)std::min<uint32>(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later
-
-		#endif
-
 		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale;
 		dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
 
@@ -317,15 +297,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 		{
 			if(fst)
 			{
-				#if _M_SSE >= 0x401
-
 				t = GSVector4(xyzuvf.uph16() << (16 - 4));
-
-				#else
-
-				t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4));
-
-				#endif
 			}
 			else if(q_div)
 			{
@@ -350,17 +322,8 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 
 		if(primclass == GS_SPRITE_CLASS)
 		{
-			#if _M_SSE >= 0x401
-
 			xyzuvf = xyzuvf.min_u32(z_max);
 			t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
-
-			#else
-
-			z = std::min(z, z_max);
-			t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
-
-			#endif
 		}
 
 		dst->t = t;
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index cfcb833b55..e2e9fff37f 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -241,60 +241,39 @@ typedef int64 sint64;
 	#define _M_SSE 0x500
 #elif defined(__SSE4_1__)
 	#define _M_SSE 0x401
-#elif defined(__SSSE3__)
-	#define _M_SSE 0x301
-#elif defined(__SSE2__)
-	#define _M_SSE 0x200
 #endif
 
 #endif
 
 #if !defined(_M_SSE) && (!defined(_WIN32) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2)
 
-	#define _M_SSE 0x200
+	#define _M_SSE 0x401
 
 #endif
 
-#if _M_SSE >= 0x200
-
-	#include <xmmintrin.h>
-	#include <emmintrin.h>
-
-	#ifndef _MM_DENORMALS_ARE_ZERO
-	#define _MM_DENORMALS_ARE_ZERO 0x0040
-	#endif
-
-	#define MXCSR (_MM_DENORMALS_ARE_ZERO | _MM_MASK_MASK | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON)
-
-	#define _MM_TRANSPOSE4_SI128(row0, row1, row2, row3) \
-	{ \
-		__m128 tmp0 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0x44); \
-		__m128 tmp2 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0xEE); \
-		__m128 tmp1 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0x44); \
-		__m128 tmp3 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0xEE); \
-		(row0) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0x88)); \
-		(row1) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0xDD)); \
-		(row2) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0x88)); \
-		(row3) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); \
-	}
-
-#else
-
-#error TODO: GSVector4 and GSRasterizer needs SSE2
+#include <xmmintrin.h>
+#include <emmintrin.h>
 
+#ifndef _MM_DENORMALS_ARE_ZERO
+#define _MM_DENORMALS_ARE_ZERO 0x0040
 #endif
 
-#if _M_SSE >= 0x301
+#define MXCSR (_MM_DENORMALS_ARE_ZERO | _MM_MASK_MASK | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON)
 
-	#include <tmmintrin.h>
+#define _MM_TRANSPOSE4_SI128(row0, row1, row2, row3) \
+{ \
+	__m128 tmp0 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0x44); \
+	__m128 tmp2 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0xEE); \
+	__m128 tmp1 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0x44); \
+	__m128 tmp3 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0xEE); \
+	(row0) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0x88)); \
+	(row1) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0xDD)); \
+	(row2) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0x88)); \
+	(row3) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); \
+}
 
-#endif
-
-#if _M_SSE >= 0x401
-
-	#include <smmintrin.h>
-
-#endif
+#include <tmmintrin.h>
+#include <smmintrin.h>
 
 #if _M_SSE >= 0x500